#!/usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.
// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did; there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.
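//
// Background note (not in the upstream header): bit-sliced AES processes
// eight blocks at a time. After an input transposition, each of v0-v7
// roughly holds one bit position gathered from all eight blocks, so the
// S-box is evaluated as a fixed network of AND/OR/XOR operations rather
// than data-dependent table lookups. The illustrative C sketch below is
// not part of this file (the name bit_planes is made up); it shows the
// per-bit extraction that the cmtst instructions in _bsaes_key_convert
// perform on each round key:
//
//     #include <stdint.h>
//
//     // plane[b][i] is 0xff where bit b of key byte i is set, 0x00
//     // otherwise -- mirroring "cmtst vN.16b, v0.16b, <bit mask>".
//     static void bit_planes(const uint8_t key[16], uint8_t plane[8][16])
//     {
//         for (int b = 0; b < 8; b++)
//             for (int i = 0; i < 16; i++)
//                 plane[b][i] = (key[i] & (1u << b)) ? 0xff : 0x00;
//     }
//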
#include "crypto/arm_arch.h" .text .extern AES_cbc_encrypt .extern AES_encrypt .extern AES_decrypt .type _bsaes_decrypt8,%function .align 4 // On entry: // x9 -> key (previously expanded using _bsaes_key_convert) // x10 = number of rounds // v0-v7 input data // On exit: // x9-x11 corrupted // other general-purpose registers preserved // v0-v7 output data // v11-v15 preserved // other SIMD registers corrupted _bsaes_decrypt8: ldr q8, [x9], #16 adr x11, .LM0ISR movi v9.16b, #0x55 ldr q10, [x11], #16 movi v16.16b, #0x33 movi v17.16b, #0x0f sub x10, x10, #1 eor v0.16b, v0.16b, v8.16b eor v1.16b, v1.16b, v8.16b eor v2.16b, v2.16b, v8.16b eor v4.16b, v4.16b, v8.16b eor v3.16b, v3.16b, v8.16b eor v5.16b, v5.16b, v8.16b tbl v0.16b, {v0.16b}, v10.16b tbl v1.16b, {v1.16b}, v10.16b tbl v2.16b, {v2.16b}, v10.16b tbl v4.16b, {v4.16b}, v10.16b eor v6.16b, v6.16b, v8.16b eor v7.16b, v7.16b, v8.16b tbl v3.16b, {v3.16b}, v10.16b tbl v5.16b, {v5.16b}, v10.16b tbl v6.16b, {v6.16b}, v10.16b ushr v8.2d, v0.2d, #1 tbl v7.16b, {v7.16b}, v10.16b ushr v10.2d, v4.2d, #1 ushr v18.2d, v2.2d, #1 eor v8.16b, v8.16b, v1.16b ushr v19.2d, v6.2d, #1 eor v10.16b, v10.16b, v5.16b eor v18.16b, v18.16b, v3.16b and v8.16b, v8.16b, v9.16b eor v19.16b, v19.16b, v7.16b and v10.16b, v10.16b, v9.16b and v18.16b, v18.16b, v9.16b eor v1.16b, v1.16b, v8.16b shl v8.2d, v8.2d, #1 and v9.16b, v19.16b, v9.16b eor v5.16b, v5.16b, v10.16b shl v10.2d, v10.2d, #1 eor v3.16b, v3.16b, v18.16b shl v18.2d, v18.2d, #1 eor v0.16b, v0.16b, v8.16b shl v8.2d, v9.2d, #1 eor v7.16b, v7.16b, v9.16b eor v4.16b, v4.16b, v10.16b eor v2.16b, v2.16b, v18.16b ushr v9.2d, v1.2d, #2 eor v6.16b, v6.16b, v8.16b ushr v8.2d, v0.2d, #2 ushr v10.2d, v5.2d, #2 ushr v18.2d, v4.2d, #2 eor v9.16b, v9.16b, v3.16b eor v8.16b, v8.16b, v2.16b eor v10.16b, v10.16b, v7.16b eor v18.16b, v18.16b, v6.16b and v9.16b, v9.16b, v16.16b and v8.16b, v8.16b, v16.16b and v10.16b, v10.16b, v16.16b and v16.16b, v18.16b, v16.16b eor v3.16b, v3.16b, v9.16b shl v9.2d, v9.2d, #2 eor v2.16b, v2.16b, v8.16b shl v8.2d, v8.2d, #2 eor v7.16b, v7.16b, v10.16b shl v10.2d, v10.2d, #2 eor v6.16b, v6.16b, v16.16b shl v16.2d, v16.2d, #2 eor v1.16b, v1.16b, v9.16b eor v0.16b, v0.16b, v8.16b eor v5.16b, v5.16b, v10.16b eor v4.16b, v4.16b, v16.16b ushr v8.2d, v3.2d, #4 ushr v9.2d, v2.2d, #4 ushr v10.2d, v1.2d, #4 ushr v16.2d, v0.2d, #4 eor v8.16b, v8.16b, v7.16b eor v9.16b, v9.16b, v6.16b eor v10.16b, v10.16b, v5.16b eor v16.16b, v16.16b, v4.16b and v8.16b, v8.16b, v17.16b and v9.16b, v9.16b, v17.16b and v10.16b, v10.16b, v17.16b and v16.16b, v16.16b, v17.16b eor v7.16b, v7.16b, v8.16b shl v8.2d, v8.2d, #4 eor v6.16b, v6.16b, v9.16b shl v9.2d, v9.2d, #4 eor v5.16b, v5.16b, v10.16b shl v10.2d, v10.2d, #4 eor v4.16b, v4.16b, v16.16b shl v16.2d, v16.2d, #4 eor v3.16b, v3.16b, v8.16b eor v2.16b, v2.16b, v9.16b eor v1.16b, v1.16b, v10.16b eor v0.16b, v0.16b, v16.16b b .Ldec_sbox .align 4 .Ldec_loop: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 ldp q8, q9, [x9], #32 eor v0.16b, v16.16b, v0.16b ldr q10, [x9], #16 eor v1.16b, v17.16b, v1.16b ldr q16, [x9], #16 eor v2.16b, v18.16b, v2.16b eor v3.16b, v19.16b, v3.16b eor v4.16b, v8.16b, v4.16b eor v5.16b, v9.16b, v5.16b eor v6.16b, v10.16b, v6.16b eor v7.16b, v16.16b, v7.16b tbl v0.16b, {v0.16b}, v28.16b tbl v1.16b, {v1.16b}, v28.16b tbl v2.16b, {v2.16b}, v28.16b tbl v3.16b, {v3.16b}, v28.16b tbl v4.16b, {v4.16b}, v28.16b tbl v5.16b, {v5.16b}, v28.16b tbl v6.16b, {v6.16b}, v28.16b tbl v7.16b, {v7.16b}, v28.16b .Ldec_sbox: eor v1.16b, v1.16b, v4.16b eor v3.16b, 
v3.16b, v4.16b subs x10, x10, #1 eor v4.16b, v4.16b, v7.16b eor v2.16b, v2.16b, v7.16b eor v1.16b, v1.16b, v6.16b eor v6.16b, v6.16b, v4.16b eor v2.16b, v2.16b, v5.16b eor v0.16b, v0.16b, v1.16b eor v7.16b, v7.16b, v6.16b eor v8.16b, v6.16b, v2.16b and v9.16b, v4.16b, v6.16b eor v10.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v0.16b eor v5.16b, v5.16b, v0.16b eor v16.16b, v7.16b, v4.16b eor v17.16b, v4.16b, v0.16b and v18.16b, v0.16b, v2.16b eor v19.16b, v7.16b, v4.16b eor v1.16b, v1.16b, v3.16b eor v20.16b, v3.16b, v0.16b eor v21.16b, v5.16b, v2.16b eor v22.16b, v3.16b, v7.16b and v8.16b, v17.16b, v8.16b orr v17.16b, v3.16b, v5.16b eor v23.16b, v1.16b, v6.16b eor v24.16b, v20.16b, v16.16b eor v25.16b, v1.16b, v5.16b orr v26.16b, v20.16b, v21.16b and v20.16b, v20.16b, v21.16b and v27.16b, v7.16b, v1.16b eor v21.16b, v21.16b, v23.16b orr v28.16b, v16.16b, v23.16b orr v29.16b, v22.16b, v25.16b eor v26.16b, v26.16b, v8.16b and v16.16b, v16.16b, v23.16b and v22.16b, v22.16b, v25.16b and v21.16b, v24.16b, v21.16b eor v8.16b, v28.16b, v8.16b eor v23.16b, v5.16b, v2.16b eor v24.16b, v1.16b, v6.16b eor v16.16b, v16.16b, v22.16b eor v22.16b, v3.16b, v0.16b eor v25.16b, v29.16b, v21.16b eor v21.16b, v26.16b, v21.16b eor v8.16b, v8.16b, v20.16b eor v26.16b, v23.16b, v24.16b eor v16.16b, v16.16b, v20.16b eor v28.16b, v22.16b, v19.16b eor v20.16b, v25.16b, v20.16b eor v9.16b, v21.16b, v9.16b eor v8.16b, v8.16b, v18.16b eor v18.16b, v5.16b, v1.16b eor v21.16b, v16.16b, v17.16b eor v16.16b, v16.16b, v17.16b eor v17.16b, v20.16b, v27.16b eor v20.16b, v3.16b, v7.16b eor v25.16b, v9.16b, v8.16b eor v27.16b, v0.16b, v4.16b and v29.16b, v9.16b, v17.16b eor v30.16b, v8.16b, v29.16b eor v31.16b, v21.16b, v29.16b eor v29.16b, v21.16b, v29.16b bsl v30.16b, v17.16b, v21.16b bsl v31.16b, v9.16b, v8.16b bsl v16.16b, v30.16b, v29.16b bsl v21.16b, v29.16b, v30.16b eor v8.16b, v31.16b, v30.16b and v1.16b, v1.16b, v31.16b and v9.16b, v16.16b, v31.16b and v6.16b, v6.16b, v30.16b eor v16.16b, v17.16b, v21.16b and v4.16b, v4.16b, v30.16b eor v17.16b, v8.16b, v30.16b and v21.16b, v24.16b, v8.16b eor v9.16b, v9.16b, v25.16b and v19.16b, v19.16b, v8.16b eor v24.16b, v30.16b, v16.16b eor v25.16b, v30.16b, v16.16b and v7.16b, v7.16b, v17.16b and v10.16b, v10.16b, v16.16b eor v29.16b, v9.16b, v16.16b eor v30.16b, v31.16b, v9.16b and v0.16b, v24.16b, v0.16b and v9.16b, v18.16b, v9.16b and v2.16b, v25.16b, v2.16b eor v10.16b, v10.16b, v6.16b eor v18.16b, v29.16b, v16.16b and v5.16b, v30.16b, v5.16b eor v24.16b, v8.16b, v29.16b and v25.16b, v26.16b, v29.16b and v26.16b, v28.16b, v29.16b eor v8.16b, v8.16b, v29.16b eor v17.16b, v17.16b, v18.16b eor v5.16b, v1.16b, v5.16b and v23.16b, v24.16b, v23.16b eor v21.16b, v21.16b, v25.16b eor v19.16b, v19.16b, v26.16b eor v0.16b, v4.16b, v0.16b and v3.16b, v17.16b, v3.16b eor v1.16b, v9.16b, v1.16b eor v9.16b, v25.16b, v23.16b eor v5.16b, v5.16b, v21.16b eor v2.16b, v6.16b, v2.16b and v6.16b, v8.16b, v22.16b eor v3.16b, v7.16b, v3.16b and v8.16b, v20.16b, v18.16b eor v10.16b, v10.16b, v9.16b eor v0.16b, v0.16b, v19.16b eor v9.16b, v1.16b, v9.16b eor v1.16b, v2.16b, v21.16b eor v3.16b, v3.16b, v19.16b and v16.16b, v27.16b, v16.16b eor v17.16b, v26.16b, v6.16b eor v6.16b, v8.16b, v7.16b eor v7.16b, v1.16b, v9.16b eor v1.16b, v5.16b, v3.16b eor v2.16b, v10.16b, v3.16b eor v4.16b, v16.16b, v4.16b eor v8.16b, v6.16b, v17.16b eor v5.16b, v9.16b, v3.16b eor v9.16b, v0.16b, v1.16b eor v6.16b, v7.16b, v1.16b eor v0.16b, v4.16b, v17.16b eor v4.16b, v8.16b, v7.16b eor v7.16b, v9.16b, v2.16b eor v8.16b, 
v3.16b, v0.16b eor v7.16b, v7.16b, v5.16b eor v3.16b, v4.16b, v7.16b eor v4.16b, v7.16b, v0.16b eor v7.16b, v8.16b, v3.16b bcc .Ldec_done ext v8.16b, v0.16b, v0.16b, #8 ext v9.16b, v1.16b, v1.16b, #8 ldr q28, [x11] // load from .LISR in common case (x10 > 0) ext v10.16b, v6.16b, v6.16b, #8 ext v16.16b, v3.16b, v3.16b, #8 ext v17.16b, v5.16b, v5.16b, #8 ext v18.16b, v4.16b, v4.16b, #8 eor v8.16b, v8.16b, v0.16b eor v9.16b, v9.16b, v1.16b eor v10.16b, v10.16b, v6.16b eor v16.16b, v16.16b, v3.16b eor v17.16b, v17.16b, v5.16b ext v19.16b, v2.16b, v2.16b, #8 ext v20.16b, v7.16b, v7.16b, #8 eor v18.16b, v18.16b, v4.16b eor v6.16b, v6.16b, v8.16b eor v8.16b, v2.16b, v10.16b eor v4.16b, v4.16b, v9.16b eor v2.16b, v19.16b, v2.16b eor v9.16b, v20.16b, v7.16b eor v0.16b, v0.16b, v16.16b eor v1.16b, v1.16b, v16.16b eor v6.16b, v6.16b, v17.16b eor v8.16b, v8.16b, v16.16b eor v7.16b, v7.16b, v18.16b eor v4.16b, v4.16b, v16.16b eor v2.16b, v3.16b, v2.16b eor v1.16b, v1.16b, v17.16b eor v3.16b, v5.16b, v9.16b eor v5.16b, v8.16b, v17.16b eor v7.16b, v7.16b, v17.16b ext v8.16b, v0.16b, v0.16b, #12 ext v9.16b, v6.16b, v6.16b, #12 ext v10.16b, v4.16b, v4.16b, #12 ext v16.16b, v1.16b, v1.16b, #12 ext v17.16b, v5.16b, v5.16b, #12 ext v18.16b, v7.16b, v7.16b, #12 eor v0.16b, v0.16b, v8.16b eor v6.16b, v6.16b, v9.16b eor v4.16b, v4.16b, v10.16b ext v19.16b, v2.16b, v2.16b, #12 ext v20.16b, v3.16b, v3.16b, #12 eor v1.16b, v1.16b, v16.16b eor v5.16b, v5.16b, v17.16b eor v7.16b, v7.16b, v18.16b eor v2.16b, v2.16b, v19.16b eor v16.16b, v16.16b, v0.16b eor v3.16b, v3.16b, v20.16b eor v17.16b, v17.16b, v4.16b eor v10.16b, v10.16b, v6.16b ext v0.16b, v0.16b, v0.16b, #8 eor v9.16b, v9.16b, v1.16b ext v1.16b, v1.16b, v1.16b, #8 eor v8.16b, v8.16b, v3.16b eor v16.16b, v16.16b, v3.16b eor v18.16b, v18.16b, v5.16b eor v19.16b, v19.16b, v7.16b ext v21.16b, v5.16b, v5.16b, #8 ext v5.16b, v7.16b, v7.16b, #8 eor v7.16b, v20.16b, v2.16b ext v4.16b, v4.16b, v4.16b, #8 ext v20.16b, v3.16b, v3.16b, #8 eor v17.16b, v17.16b, v3.16b ext v2.16b, v2.16b, v2.16b, #8 eor v3.16b, v10.16b, v3.16b ext v10.16b, v6.16b, v6.16b, #8 eor v0.16b, v0.16b, v8.16b eor v1.16b, v1.16b, v16.16b eor v5.16b, v5.16b, v18.16b eor v3.16b, v3.16b, v4.16b eor v7.16b, v20.16b, v7.16b eor v6.16b, v2.16b, v19.16b eor v4.16b, v21.16b, v17.16b eor v2.16b, v10.16b, v9.16b bne .Ldec_loop ldr q28, [x11, #16]! 
// load from .LISRM0 on last round (x10 == 0) b .Ldec_loop .align 4 .Ldec_done: ushr v8.2d, v0.2d, #1 movi v9.16b, #0x55 ldr q10, [x9] ushr v16.2d, v2.2d, #1 movi v17.16b, #0x33 ushr v18.2d, v6.2d, #1 movi v19.16b, #0x0f eor v8.16b, v8.16b, v1.16b ushr v20.2d, v3.2d, #1 eor v16.16b, v16.16b, v7.16b eor v18.16b, v18.16b, v4.16b and v8.16b, v8.16b, v9.16b eor v20.16b, v20.16b, v5.16b and v16.16b, v16.16b, v9.16b and v18.16b, v18.16b, v9.16b shl v21.2d, v8.2d, #1 eor v1.16b, v1.16b, v8.16b and v8.16b, v20.16b, v9.16b eor v7.16b, v7.16b, v16.16b shl v9.2d, v16.2d, #1 eor v4.16b, v4.16b, v18.16b shl v16.2d, v18.2d, #1 eor v0.16b, v0.16b, v21.16b shl v18.2d, v8.2d, #1 eor v5.16b, v5.16b, v8.16b eor v2.16b, v2.16b, v9.16b eor v6.16b, v6.16b, v16.16b ushr v8.2d, v1.2d, #2 eor v3.16b, v3.16b, v18.16b ushr v9.2d, v0.2d, #2 ushr v16.2d, v7.2d, #2 ushr v18.2d, v2.2d, #2 eor v8.16b, v8.16b, v4.16b eor v9.16b, v9.16b, v6.16b eor v16.16b, v16.16b, v5.16b eor v18.16b, v18.16b, v3.16b and v8.16b, v8.16b, v17.16b and v9.16b, v9.16b, v17.16b and v16.16b, v16.16b, v17.16b and v17.16b, v18.16b, v17.16b eor v4.16b, v4.16b, v8.16b shl v8.2d, v8.2d, #2 eor v6.16b, v6.16b, v9.16b shl v9.2d, v9.2d, #2 eor v5.16b, v5.16b, v16.16b shl v16.2d, v16.2d, #2 eor v3.16b, v3.16b, v17.16b shl v17.2d, v17.2d, #2 eor v1.16b, v1.16b, v8.16b eor v0.16b, v0.16b, v9.16b eor v7.16b, v7.16b, v16.16b eor v2.16b, v2.16b, v17.16b ushr v8.2d, v4.2d, #4 ushr v9.2d, v6.2d, #4 ushr v16.2d, v1.2d, #4 ushr v17.2d, v0.2d, #4 eor v8.16b, v8.16b, v5.16b eor v9.16b, v9.16b, v3.16b eor v16.16b, v16.16b, v7.16b eor v17.16b, v17.16b, v2.16b and v8.16b, v8.16b, v19.16b and v9.16b, v9.16b, v19.16b and v16.16b, v16.16b, v19.16b and v17.16b, v17.16b, v19.16b eor v5.16b, v5.16b, v8.16b shl v8.2d, v8.2d, #4 eor v3.16b, v3.16b, v9.16b shl v9.2d, v9.2d, #4 eor v7.16b, v7.16b, v16.16b shl v16.2d, v16.2d, #4 eor v2.16b, v2.16b, v17.16b shl v17.2d, v17.2d, #4 eor v4.16b, v4.16b, v8.16b eor v6.16b, v6.16b, v9.16b eor v7.16b, v7.16b, v10.16b eor v1.16b, v1.16b, v16.16b eor v2.16b, v2.16b, v10.16b eor v0.16b, v0.16b, v17.16b eor v4.16b, v4.16b, v10.16b eor v6.16b, v6.16b, v10.16b eor v3.16b, v3.16b, v10.16b eor v5.16b, v5.16b, v10.16b eor v1.16b, v1.16b, v10.16b eor v0.16b, v0.16b, v10.16b ret .size _bsaes_decrypt8,.-_bsaes_decrypt8 .type _bsaes_const,%object .align 6 _bsaes_const: // InvShiftRows constants // Used in _bsaes_decrypt8, which assumes contiguity // .LM0ISR used with round 0 key // .LISR used with middle round keys // .LISRM0 used with final round key .LM0ISR: .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 .LISR: .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 .LISRM0: .quad 0x01040b0e0205080f, 0x0306090c00070a0d // ShiftRows constants // Used in _bsaes_encrypt8, which assumes contiguity // .LM0SR used with round 0 key // .LSR used with middle round keys // .LSRM0 used with final round key .LM0SR: .quad 0x0a0e02060f03070b, 0x0004080c05090d01 .LSR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b .LSRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d .LM0_bigendian: .quad 0x02060a0e03070b0f, 0x0004080c0105090d .LM0_littleendian: .quad 0x0105090d0004080c, 0x03070b0f02060a0e // Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into // _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR .LREVM0SR: .quad 0x090d01050c000408, 0x03070b0f060a0e02 .align 6 .size _bsaes_const,.-_bsaes_const .type _bsaes_encrypt8,%function .align 4 // On entry: // x9 -> key (previously expanded using _bsaes_key_convert) // x10 = number of rounds // v0-v7 input data // On 
exit: // x9-x11 corrupted // other general-purpose registers preserved // v0-v7 output data // v11-v15 preserved // other SIMD registers corrupted _bsaes_encrypt8: ldr q8, [x9], #16 adr x11, .LM0SR ldr q9, [x11], #16 _bsaes_encrypt8_alt: eor v0.16b, v0.16b, v8.16b eor v1.16b, v1.16b, v8.16b sub x10, x10, #1 eor v2.16b, v2.16b, v8.16b eor v4.16b, v4.16b, v8.16b eor v3.16b, v3.16b, v8.16b eor v5.16b, v5.16b, v8.16b tbl v0.16b, {v0.16b}, v9.16b tbl v1.16b, {v1.16b}, v9.16b tbl v2.16b, {v2.16b}, v9.16b tbl v4.16b, {v4.16b}, v9.16b eor v6.16b, v6.16b, v8.16b eor v7.16b, v7.16b, v8.16b tbl v3.16b, {v3.16b}, v9.16b tbl v5.16b, {v5.16b}, v9.16b tbl v6.16b, {v6.16b}, v9.16b ushr v8.2d, v0.2d, #1 movi v10.16b, #0x55 tbl v7.16b, {v7.16b}, v9.16b ushr v9.2d, v4.2d, #1 movi v16.16b, #0x33 ushr v17.2d, v2.2d, #1 eor v8.16b, v8.16b, v1.16b movi v18.16b, #0x0f ushr v19.2d, v6.2d, #1 eor v9.16b, v9.16b, v5.16b eor v17.16b, v17.16b, v3.16b and v8.16b, v8.16b, v10.16b eor v19.16b, v19.16b, v7.16b and v9.16b, v9.16b, v10.16b and v17.16b, v17.16b, v10.16b eor v1.16b, v1.16b, v8.16b shl v8.2d, v8.2d, #1 and v10.16b, v19.16b, v10.16b eor v5.16b, v5.16b, v9.16b shl v9.2d, v9.2d, #1 eor v3.16b, v3.16b, v17.16b shl v17.2d, v17.2d, #1 eor v0.16b, v0.16b, v8.16b shl v8.2d, v10.2d, #1 eor v7.16b, v7.16b, v10.16b eor v4.16b, v4.16b, v9.16b eor v2.16b, v2.16b, v17.16b ushr v9.2d, v1.2d, #2 eor v6.16b, v6.16b, v8.16b ushr v8.2d, v0.2d, #2 ushr v10.2d, v5.2d, #2 ushr v17.2d, v4.2d, #2 eor v9.16b, v9.16b, v3.16b eor v8.16b, v8.16b, v2.16b eor v10.16b, v10.16b, v7.16b eor v17.16b, v17.16b, v6.16b and v9.16b, v9.16b, v16.16b and v8.16b, v8.16b, v16.16b and v10.16b, v10.16b, v16.16b and v16.16b, v17.16b, v16.16b eor v3.16b, v3.16b, v9.16b shl v9.2d, v9.2d, #2 eor v2.16b, v2.16b, v8.16b shl v8.2d, v8.2d, #2 eor v7.16b, v7.16b, v10.16b shl v10.2d, v10.2d, #2 eor v6.16b, v6.16b, v16.16b shl v16.2d, v16.2d, #2 eor v1.16b, v1.16b, v9.16b eor v0.16b, v0.16b, v8.16b eor v5.16b, v5.16b, v10.16b eor v4.16b, v4.16b, v16.16b ushr v8.2d, v3.2d, #4 ushr v9.2d, v2.2d, #4 ushr v10.2d, v1.2d, #4 ushr v16.2d, v0.2d, #4 eor v8.16b, v8.16b, v7.16b eor v9.16b, v9.16b, v6.16b eor v10.16b, v10.16b, v5.16b eor v16.16b, v16.16b, v4.16b and v8.16b, v8.16b, v18.16b and v9.16b, v9.16b, v18.16b and v10.16b, v10.16b, v18.16b and v16.16b, v16.16b, v18.16b eor v7.16b, v7.16b, v8.16b shl v8.2d, v8.2d, #4 eor v6.16b, v6.16b, v9.16b shl v9.2d, v9.2d, #4 eor v5.16b, v5.16b, v10.16b shl v10.2d, v10.2d, #4 eor v4.16b, v4.16b, v16.16b shl v16.2d, v16.2d, #4 eor v3.16b, v3.16b, v8.16b eor v2.16b, v2.16b, v9.16b eor v1.16b, v1.16b, v10.16b eor v0.16b, v0.16b, v16.16b b .Lenc_sbox .align 4 .Lenc_loop: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 ldp q8, q9, [x9], #32 eor v0.16b, v16.16b, v0.16b ldr q10, [x9], #16 eor v1.16b, v17.16b, v1.16b ldr q16, [x9], #16 eor v2.16b, v18.16b, v2.16b eor v3.16b, v19.16b, v3.16b eor v4.16b, v8.16b, v4.16b eor v5.16b, v9.16b, v5.16b eor v6.16b, v10.16b, v6.16b eor v7.16b, v16.16b, v7.16b tbl v0.16b, {v0.16b}, v28.16b tbl v1.16b, {v1.16b}, v28.16b tbl v2.16b, {v2.16b}, v28.16b tbl v3.16b, {v3.16b}, v28.16b tbl v4.16b, {v4.16b}, v28.16b tbl v5.16b, {v5.16b}, v28.16b tbl v6.16b, {v6.16b}, v28.16b tbl v7.16b, {v7.16b}, v28.16b .Lenc_sbox: eor v5.16b, v5.16b, v6.16b eor v3.16b, v3.16b, v0.16b subs x10, x10, #1 eor v2.16b, v2.16b, v1.16b eor v5.16b, v5.16b, v0.16b eor v8.16b, v3.16b, v7.16b eor v6.16b, v6.16b, v2.16b eor v7.16b, v7.16b, v5.16b eor v8.16b, v8.16b, v4.16b eor v3.16b, v6.16b, v3.16b eor v4.16b, v4.16b, v5.16b eor 
v6.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v7.16b eor v1.16b, v8.16b, v1.16b eor v8.16b, v7.16b, v4.16b eor v9.16b, v3.16b, v0.16b eor v10.16b, v7.16b, v6.16b eor v16.16b, v5.16b, v3.16b eor v17.16b, v6.16b, v2.16b eor v18.16b, v5.16b, v1.16b eor v19.16b, v2.16b, v4.16b eor v20.16b, v1.16b, v0.16b orr v21.16b, v8.16b, v9.16b orr v22.16b, v10.16b, v16.16b eor v23.16b, v8.16b, v17.16b eor v24.16b, v9.16b, v18.16b and v19.16b, v19.16b, v20.16b orr v20.16b, v17.16b, v18.16b and v8.16b, v8.16b, v9.16b and v9.16b, v17.16b, v18.16b and v17.16b, v23.16b, v24.16b and v10.16b, v10.16b, v16.16b eor v16.16b, v21.16b, v19.16b eor v18.16b, v20.16b, v19.16b and v19.16b, v2.16b, v1.16b and v20.16b, v6.16b, v5.16b eor v21.16b, v22.16b, v17.16b eor v9.16b, v9.16b, v10.16b eor v10.16b, v16.16b, v17.16b eor v16.16b, v18.16b, v8.16b and v17.16b, v4.16b, v0.16b orr v18.16b, v7.16b, v3.16b eor v21.16b, v21.16b, v8.16b eor v8.16b, v9.16b, v8.16b eor v9.16b, v10.16b, v19.16b eor v10.16b, v3.16b, v0.16b eor v16.16b, v16.16b, v17.16b eor v17.16b, v5.16b, v1.16b eor v19.16b, v21.16b, v20.16b eor v20.16b, v8.16b, v18.16b eor v8.16b, v8.16b, v18.16b eor v18.16b, v7.16b, v4.16b eor v21.16b, v9.16b, v16.16b eor v22.16b, v6.16b, v2.16b and v23.16b, v9.16b, v19.16b eor v24.16b, v10.16b, v17.16b eor v25.16b, v0.16b, v1.16b eor v26.16b, v7.16b, v6.16b eor v27.16b, v18.16b, v22.16b eor v28.16b, v3.16b, v5.16b eor v29.16b, v16.16b, v23.16b eor v30.16b, v20.16b, v23.16b eor v23.16b, v20.16b, v23.16b eor v31.16b, v4.16b, v2.16b bsl v29.16b, v19.16b, v20.16b bsl v30.16b, v9.16b, v16.16b bsl v8.16b, v29.16b, v23.16b bsl v20.16b, v23.16b, v29.16b eor v9.16b, v30.16b, v29.16b and v5.16b, v5.16b, v30.16b and v8.16b, v8.16b, v30.16b and v1.16b, v1.16b, v29.16b eor v16.16b, v19.16b, v20.16b and v2.16b, v2.16b, v29.16b eor v19.16b, v9.16b, v29.16b and v17.16b, v17.16b, v9.16b eor v8.16b, v8.16b, v21.16b and v20.16b, v22.16b, v9.16b eor v21.16b, v29.16b, v16.16b eor v22.16b, v29.16b, v16.16b and v23.16b, v25.16b, v16.16b and v6.16b, v6.16b, v19.16b eor v25.16b, v8.16b, v16.16b eor v29.16b, v30.16b, v8.16b and v4.16b, v21.16b, v4.16b and v8.16b, v28.16b, v8.16b and v0.16b, v22.16b, v0.16b eor v21.16b, v23.16b, v1.16b eor v22.16b, v9.16b, v25.16b eor v9.16b, v9.16b, v25.16b eor v23.16b, v25.16b, v16.16b and v3.16b, v29.16b, v3.16b and v24.16b, v24.16b, v25.16b and v25.16b, v27.16b, v25.16b and v10.16b, v22.16b, v10.16b and v9.16b, v9.16b, v18.16b eor v18.16b, v19.16b, v23.16b and v19.16b, v26.16b, v23.16b eor v3.16b, v5.16b, v3.16b eor v17.16b, v17.16b, v24.16b eor v10.16b, v24.16b, v10.16b and v16.16b, v31.16b, v16.16b eor v20.16b, v20.16b, v25.16b eor v9.16b, v25.16b, v9.16b eor v4.16b, v2.16b, v4.16b and v7.16b, v18.16b, v7.16b eor v18.16b, v19.16b, v6.16b eor v5.16b, v8.16b, v5.16b eor v0.16b, v1.16b, v0.16b eor v1.16b, v21.16b, v10.16b eor v8.16b, v3.16b, v17.16b eor v2.16b, v16.16b, v2.16b eor v3.16b, v6.16b, v7.16b eor v6.16b, v18.16b, v9.16b eor v4.16b, v4.16b, v20.16b eor v10.16b, v5.16b, v10.16b eor v0.16b, v0.16b, v17.16b eor v9.16b, v2.16b, v9.16b eor v3.16b, v3.16b, v20.16b eor v7.16b, v6.16b, v1.16b eor v5.16b, v8.16b, v4.16b eor v6.16b, v10.16b, v1.16b eor v2.16b, v4.16b, v0.16b eor v4.16b, v3.16b, v10.16b eor v9.16b, v9.16b, v7.16b eor v3.16b, v0.16b, v5.16b eor v0.16b, v1.16b, v4.16b eor v1.16b, v4.16b, v8.16b eor v4.16b, v9.16b, v5.16b eor v6.16b, v6.16b, v3.16b bcc .Lenc_done ext v8.16b, v0.16b, v0.16b, #12 ext v9.16b, v4.16b, v4.16b, #12 ldr q28, [x11] ext v10.16b, v6.16b, v6.16b, #12 ext v16.16b, v1.16b, v1.16b, #12 
ext v17.16b, v3.16b, v3.16b, #12 ext v18.16b, v7.16b, v7.16b, #12 eor v0.16b, v0.16b, v8.16b eor v4.16b, v4.16b, v9.16b eor v6.16b, v6.16b, v10.16b ext v19.16b, v2.16b, v2.16b, #12 ext v20.16b, v5.16b, v5.16b, #12 eor v1.16b, v1.16b, v16.16b eor v3.16b, v3.16b, v17.16b eor v7.16b, v7.16b, v18.16b eor v2.16b, v2.16b, v19.16b eor v16.16b, v16.16b, v0.16b eor v5.16b, v5.16b, v20.16b eor v17.16b, v17.16b, v6.16b eor v10.16b, v10.16b, v4.16b ext v0.16b, v0.16b, v0.16b, #8 eor v9.16b, v9.16b, v1.16b ext v1.16b, v1.16b, v1.16b, #8 eor v8.16b, v8.16b, v5.16b eor v16.16b, v16.16b, v5.16b eor v18.16b, v18.16b, v3.16b eor v19.16b, v19.16b, v7.16b ext v3.16b, v3.16b, v3.16b, #8 ext v7.16b, v7.16b, v7.16b, #8 eor v20.16b, v20.16b, v2.16b ext v6.16b, v6.16b, v6.16b, #8 ext v21.16b, v5.16b, v5.16b, #8 eor v17.16b, v17.16b, v5.16b ext v2.16b, v2.16b, v2.16b, #8 eor v10.16b, v10.16b, v5.16b ext v22.16b, v4.16b, v4.16b, #8 eor v0.16b, v0.16b, v8.16b eor v1.16b, v1.16b, v16.16b eor v5.16b, v7.16b, v18.16b eor v4.16b, v3.16b, v17.16b eor v3.16b, v6.16b, v10.16b eor v7.16b, v21.16b, v20.16b eor v6.16b, v2.16b, v19.16b eor v2.16b, v22.16b, v9.16b bne .Lenc_loop ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0) b .Lenc_loop .align 4 .Lenc_done: ushr v8.2d, v0.2d, #1 movi v9.16b, #0x55 ldr q10, [x9] ushr v16.2d, v3.2d, #1 movi v17.16b, #0x33 ushr v18.2d, v4.2d, #1 movi v19.16b, #0x0f eor v8.16b, v8.16b, v1.16b ushr v20.2d, v2.2d, #1 eor v16.16b, v16.16b, v7.16b eor v18.16b, v18.16b, v6.16b and v8.16b, v8.16b, v9.16b eor v20.16b, v20.16b, v5.16b and v16.16b, v16.16b, v9.16b and v18.16b, v18.16b, v9.16b shl v21.2d, v8.2d, #1 eor v1.16b, v1.16b, v8.16b and v8.16b, v20.16b, v9.16b eor v7.16b, v7.16b, v16.16b shl v9.2d, v16.2d, #1 eor v6.16b, v6.16b, v18.16b shl v16.2d, v18.2d, #1 eor v0.16b, v0.16b, v21.16b shl v18.2d, v8.2d, #1 eor v5.16b, v5.16b, v8.16b eor v3.16b, v3.16b, v9.16b eor v4.16b, v4.16b, v16.16b ushr v8.2d, v1.2d, #2 eor v2.16b, v2.16b, v18.16b ushr v9.2d, v0.2d, #2 ushr v16.2d, v7.2d, #2 ushr v18.2d, v3.2d, #2 eor v8.16b, v8.16b, v6.16b eor v9.16b, v9.16b, v4.16b eor v16.16b, v16.16b, v5.16b eor v18.16b, v18.16b, v2.16b and v8.16b, v8.16b, v17.16b and v9.16b, v9.16b, v17.16b and v16.16b, v16.16b, v17.16b and v17.16b, v18.16b, v17.16b eor v6.16b, v6.16b, v8.16b shl v8.2d, v8.2d, #2 eor v4.16b, v4.16b, v9.16b shl v9.2d, v9.2d, #2 eor v5.16b, v5.16b, v16.16b shl v16.2d, v16.2d, #2 eor v2.16b, v2.16b, v17.16b shl v17.2d, v17.2d, #2 eor v1.16b, v1.16b, v8.16b eor v0.16b, v0.16b, v9.16b eor v7.16b, v7.16b, v16.16b eor v3.16b, v3.16b, v17.16b ushr v8.2d, v6.2d, #4 ushr v9.2d, v4.2d, #4 ushr v16.2d, v1.2d, #4 ushr v17.2d, v0.2d, #4 eor v8.16b, v8.16b, v5.16b eor v9.16b, v9.16b, v2.16b eor v16.16b, v16.16b, v7.16b eor v17.16b, v17.16b, v3.16b and v8.16b, v8.16b, v19.16b and v9.16b, v9.16b, v19.16b and v16.16b, v16.16b, v19.16b and v17.16b, v17.16b, v19.16b eor v5.16b, v5.16b, v8.16b shl v8.2d, v8.2d, #4 eor v2.16b, v2.16b, v9.16b shl v9.2d, v9.2d, #4 eor v7.16b, v7.16b, v16.16b shl v16.2d, v16.2d, #4 eor v3.16b, v3.16b, v17.16b shl v17.2d, v17.2d, #4 eor v6.16b, v6.16b, v8.16b eor v4.16b, v4.16b, v9.16b eor v7.16b, v7.16b, v10.16b eor v1.16b, v1.16b, v16.16b eor v3.16b, v3.16b, v10.16b eor v0.16b, v0.16b, v17.16b eor v6.16b, v6.16b, v10.16b eor v4.16b, v4.16b, v10.16b eor v2.16b, v2.16b, v10.16b eor v5.16b, v5.16b, v10.16b eor v1.16b, v1.16b, v10.16b eor v0.16b, v0.16b, v10.16b ret .size _bsaes_encrypt8,.-_bsaes_encrypt8 .type _bsaes_key_convert,%function .align 4 // On entry: // x9 -> input key 
(big-endian) // x10 = number of rounds // x17 -> output key (native endianness) // On exit: // x9, x10 corrupted // x11 -> .LM0_bigendian // x17 -> last quadword of output key // other general-purpose registers preserved // v2-v6 preserved // v7.16b[] = 0x63 // v8-v14 preserved // v15 = last round key (converted to native endianness) // other SIMD registers corrupted _bsaes_key_convert: #ifdef __AARCH64EL__ adr x11, .LM0_littleendian #else adr x11, .LM0_bigendian #endif ldr q0, [x9], #16 // load round 0 key ldr q1, [x11] // .LM0 ldr q15, [x9], #16 // load round 1 key movi v7.16b, #0x63 // compose .L63 movi v16.16b, #0x01 // bit masks movi v17.16b, #0x02 movi v18.16b, #0x04 movi v19.16b, #0x08 movi v20.16b, #0x10 movi v21.16b, #0x20 movi v22.16b, #0x40 movi v23.16b, #0x80 #ifdef __AARCH64EL__ rev32 v0.16b, v0.16b #endif sub x10, x10, #1 str q0, [x17], #16 // save round 0 key .align 4 .Lkey_loop: tbl v0.16b, {v15.16b}, v1.16b ldr q15, [x9], #16 // load next round key eor v0.16b, v0.16b, v7.16b cmtst v24.16b, v0.16b, v16.16b cmtst v25.16b, v0.16b, v17.16b cmtst v26.16b, v0.16b, v18.16b cmtst v27.16b, v0.16b, v19.16b cmtst v28.16b, v0.16b, v20.16b cmtst v29.16b, v0.16b, v21.16b cmtst v30.16b, v0.16b, v22.16b cmtst v31.16b, v0.16b, v23.16b sub x10, x10, #1 st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key st1 {v28.16b-v31.16b}, [x17], #64 cbnz x10, .Lkey_loop // don't save last round key #ifdef __AARCH64EL__ rev32 v15.16b, v15.16b adr x11, .LM0_bigendian #endif ret .size _bsaes_key_convert,.-_bsaes_key_convert .globl ossl_bsaes_cbc_encrypt .type ossl_bsaes_cbc_encrypt,%function .align 4 // On entry: // x0 -> input ciphertext // x1 -> output plaintext // x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) // x3 -> key // x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) // w5 must be == 0 // On exit: // Output plaintext filled in // Initialisation vector overwritten with last quadword of ciphertext // No output registers, usual AAPCS64 register preservation ossl_bsaes_cbc_encrypt: AARCH64_VALID_CALL_TARGET cmp x2, #128 bhs .Lcbc_do_bsaes b AES_cbc_encrypt .Lcbc_do_bsaes: // it is up to the caller to make sure we are called with enc == 0 stp x29, x30, [sp, #-48]! 
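// The interface comment above describes CBC decryption (enc == 0): each
// output block is the block decryption of a ciphertext block XORed with
// the previous ciphertext block (or the IV for the first block), and the
// last ciphertext block is written back through x4 so a later call can
// continue the chain. A minimal C sketch of that dataflow, illustrative
// only (block_decrypt is a placeholder, not a function in this file):
//
//     #include <stddef.h>
//     #include <stdint.h>
//     #include <string.h>
//
//     // P[i] = D_K(C[i]) ^ C[i-1], with C[-1] = IV
//     void cbc_decrypt(const uint8_t *in, uint8_t *out, size_t blocks,
//                      uint8_t iv[16])
//     {
//         uint8_t prev[16], cur[16], tmp[16];
//         memcpy(prev, iv, 16);
//         for (size_t i = 0; i < blocks; i++) {
//             memcpy(cur, in + 16 * i, 16);
//             block_decrypt(cur, tmp);              // AES_decrypt-style call
//             for (int j = 0; j < 16; j++)
//                 out[16 * i + j] = tmp[j] ^ prev[j];
//             memcpy(prev, cur, 16);
//         }
//         memcpy(iv, prev, 16);                     // return last ciphertext block
//     }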
stp d8, d9, [sp, #16] stp d10, d15, [sp, #32] lsr x2, x2, #4 // len in 16 byte blocks ldr w15, [x3, #240] // get # of rounds mov x14, sp // allocate the key schedule on the stack add x17, sp, #96 sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes // populate the key schedule mov x9, x3 // pass key mov x10, x15 // pass # of rounds mov sp, x17 // sp is sp bl _bsaes_key_convert ldr q6, [sp] str q15, [x17] // save last round key eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) str q6, [sp] ldr q15, [x4] // load IV b .Lcbc_dec_loop .align 4 .Lcbc_dec_loop: subs x2, x2, #0x8 bmi .Lcbc_dec_loop_finish ldr q0, [x0], #16 // load input mov x9, sp // pass the key ldr q1, [x0], #16 mov x10, x15 ldr q2, [x0], #16 ldr q3, [x0], #16 ldr q4, [x0], #16 ldr q5, [x0], #16 ldr q6, [x0], #16 ldr q7, [x0], #-7*16 bl _bsaes_decrypt8 ldr q16, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV eor v1.16b, v1.16b, v16.16b str q0, [x1], #16 // write output ldr q0, [x0], #16 str q1, [x1], #16 ldr q1, [x0], #16 eor v1.16b, v4.16b, v1.16b ldr q4, [x0], #16 eor v2.16b, v2.16b, v4.16b eor v0.16b, v6.16b, v0.16b ldr q4, [x0], #16 str q0, [x1], #16 str q1, [x1], #16 eor v0.16b, v7.16b, v4.16b ldr q1, [x0], #16 str q2, [x1], #16 ldr q2, [x0], #16 ldr q15, [x0], #16 str q0, [x1], #16 eor v0.16b, v5.16b, v2.16b eor v1.16b, v3.16b, v1.16b str q1, [x1], #16 str q0, [x1], #16 b .Lcbc_dec_loop .Lcbc_dec_loop_finish: adds x2, x2, #8 beq .Lcbc_dec_done ldr q0, [x0], #16 // load input cmp x2, #2 blo .Lcbc_dec_one ldr q1, [x0], #16 mov x9, sp // pass the key mov x10, x15 beq .Lcbc_dec_two ldr q2, [x0], #16 cmp x2, #4 blo .Lcbc_dec_three ldr q3, [x0], #16 beq .Lcbc_dec_four ldr q4, [x0], #16 cmp x2, #6 blo .Lcbc_dec_five ldr q5, [x0], #16 beq .Lcbc_dec_six ldr q6, [x0], #-6*16 bl _bsaes_decrypt8 ldr q5, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV ldr q8, [x0], #16 ldr q9, [x0], #16 ldr q10, [x0], #16 str q0, [x1], #16 // write output ldr q0, [x0], #16 eor v1.16b, v1.16b, v5.16b ldr q5, [x0], #16 eor v6.16b, v6.16b, v8.16b ldr q15, [x0] eor v4.16b, v4.16b, v9.16b eor v2.16b, v2.16b, v10.16b str q1, [x1], #16 eor v0.16b, v7.16b, v0.16b str q6, [x1], #16 eor v1.16b, v3.16b, v5.16b str q4, [x1], #16 str q2, [x1], #16 str q0, [x1], #16 str q1, [x1] b .Lcbc_dec_done .align 4 .Lcbc_dec_six: sub x0, x0, #0x60 bl _bsaes_decrypt8 ldr q3, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV ldr q5, [x0], #16 ldr q8, [x0], #16 ldr q9, [x0], #16 str q0, [x1], #16 // write output ldr q0, [x0], #16 eor v1.16b, v1.16b, v3.16b ldr q15, [x0] eor v3.16b, v6.16b, v5.16b eor v4.16b, v4.16b, v8.16b eor v2.16b, v2.16b, v9.16b str q1, [x1], #16 eor v0.16b, v7.16b, v0.16b str q3, [x1], #16 str q4, [x1], #16 str q2, [x1], #16 str q0, [x1] b .Lcbc_dec_done .align 4 .Lcbc_dec_five: sub x0, x0, #0x50 bl _bsaes_decrypt8 ldr q3, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV ldr q5, [x0], #16 ldr q7, [x0], #16 ldr q8, [x0], #16 str q0, [x1], #16 // write output ldr q15, [x0] eor v0.16b, v1.16b, v3.16b eor v1.16b, v6.16b, v5.16b eor v3.16b, v4.16b, v7.16b str q0, [x1], #16 eor v0.16b, v2.16b, v8.16b str q1, [x1], #16 str q3, [x1], #16 str q0, [x1] b .Lcbc_dec_done .align 4 .Lcbc_dec_four: sub x0, x0, #0x40 bl _bsaes_decrypt8 ldr q2, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV ldr q3, [x0], #16 ldr q5, [x0], #16 str q0, [x1], #16 // write output ldr q15, [x0] eor v0.16b, v1.16b, v2.16b eor v1.16b, v6.16b, v3.16b eor v2.16b, v4.16b, v5.16b str q0, 
[x1], #16 str q1, [x1], #16 str q2, [x1] b .Lcbc_dec_done .align 4 .Lcbc_dec_three: sub x0, x0, #0x30 bl _bsaes_decrypt8 ldr q2, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV ldr q3, [x0], #16 ldr q15, [x0] str q0, [x1], #16 // write output eor v0.16b, v1.16b, v2.16b eor v1.16b, v6.16b, v3.16b str q0, [x1], #16 str q1, [x1] b .Lcbc_dec_done .align 4 .Lcbc_dec_two: sub x0, x0, #0x20 bl _bsaes_decrypt8 ldr q2, [x0], #16 // reload input eor v0.16b, v0.16b, v15.16b // ^= IV ldr q15, [x0] str q0, [x1], #16 // write output eor v0.16b, v1.16b, v2.16b str q0, [x1] b .Lcbc_dec_done .align 4 .Lcbc_dec_one: sub x0, x0, #0x10 stp x1, x4, [sp, #-32]! str x14, [sp, #16] mov v8.16b, v15.16b mov v15.16b, v0.16b mov x2, x3 bl AES_decrypt ldr x14, [sp, #16] ldp x1, x4, [sp], #32 ldr q0, [x1] // load result eor v0.16b, v0.16b, v8.16b // ^= IV str q0, [x1] // write output .align 4 .Lcbc_dec_done: movi v0.16b, #0 movi v1.16b, #0 .Lcbc_dec_bzero:// wipe key schedule [if any] stp q0, q1, [sp], #32 cmp sp, x14 bne .Lcbc_dec_bzero str q15, [x4] // return IV ldp d8, d9, [sp, #16] ldp d10, d15, [sp, #32] ldp x29, x30, [sp], #48 ret .size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt .globl ossl_bsaes_ctr32_encrypt_blocks .type ossl_bsaes_ctr32_encrypt_blocks,%function .align 4 // On entry: // x0 -> input text (whole 16-byte blocks) // x1 -> output text (whole 16-byte blocks) // x2 = number of 16-byte blocks to encrypt/decrypt (> 0) // x3 -> key // x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block // On exit: // Output text filled in // No output registers, usual AAPCS64 register preservation ossl_bsaes_ctr32_encrypt_blocks: AARCH64_VALID_CALL_TARGET cmp x2, #8 // use plain AES for blo .Lctr_enc_short // small sizes stp x29, x30, [sp, #-80]! 
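// As the interface comment above states, only the low 32 bits of the
// big-endian counter are incremented (modulo 2^32) per block; the upper
// 96 bits stay fixed. Illustrative C sketch of that update (assumption:
// the helper name ctr32_inc is invented for this comment):
//
//     #include <stdint.h>
//
//     // Bump the big-endian 32-bit word in the last four bytes of the
//     // 16-byte counter block, wrapping modulo 2^32.
//     static void ctr32_inc(uint8_t ctr[16])
//     {
//         uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
//                      ((uint32_t)ctr[14] << 8)  |  (uint32_t)ctr[15];
//         c += 1;
//         ctr[12] = (uint8_t)(c >> 24); ctr[13] = (uint8_t)(c >> 16);
//         ctr[14] = (uint8_t)(c >> 8);  ctr[15] = (uint8_t)c;
//     }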
stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] ldr w15, [x3, #240] // get # of rounds mov x14, sp // allocate the key schedule on the stack add x17, sp, #96 sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes // populate the key schedule mov x9, x3 // pass key mov x10, x15 // pass # of rounds mov sp, x17 // sp is sp bl _bsaes_key_convert eor v7.16b, v7.16b, v15.16b // fix up last round key str q7, [x17] // save last round key ldr q0, [x4] // load counter add x13, x11, #.LREVM0SR-.LM0_bigendian ldr q4, [sp] // load round0 key movi v8.4s, #1 // compose 1<<96 movi v9.16b, #0 rev32 v15.16b, v0.16b rev32 v0.16b, v0.16b ext v11.16b, v9.16b, v8.16b, #4 rev32 v4.16b, v4.16b add v12.4s, v11.4s, v11.4s // compose 2<<96 str q4, [sp] // save adjusted round0 key add v13.4s, v11.4s, v12.4s // compose 3<<96 add v14.4s, v12.4s, v12.4s // compose 4<<96 b .Lctr_enc_loop .align 4 .Lctr_enc_loop: // Intermix prologue from _bsaes_encrypt8 to use the opportunity // to flip byte order in 32-bit counter add v1.4s, v15.4s, v11.4s // +1 add x9, sp, #0x10 // pass next round key add v2.4s, v15.4s, v12.4s // +2 ldr q9, [x13] // .LREVM0SR ldr q8, [sp] // load round0 key add v3.4s, v15.4s, v13.4s // +3 mov x10, x15 // pass rounds sub x11, x13, #.LREVM0SR-.LSR // pass constants add v6.4s, v2.4s, v14.4s add v4.4s, v15.4s, v14.4s // +4 add v7.4s, v3.4s, v14.4s add v15.4s, v4.4s, v14.4s // next counter add v5.4s, v1.4s, v14.4s bl _bsaes_encrypt8_alt subs x2, x2, #8 blo .Lctr_enc_loop_done ldr q16, [x0], #16 ldr q17, [x0], #16 eor v1.16b, v1.16b, v17.16b ldr q17, [x0], #16 eor v0.16b, v0.16b, v16.16b eor v4.16b, v4.16b, v17.16b str q0, [x1], #16 ldr q16, [x0], #16 str q1, [x1], #16 mov v0.16b, v15.16b str q4, [x1], #16 ldr q1, [x0], #16 eor v4.16b, v6.16b, v16.16b eor v1.16b, v3.16b, v1.16b ldr q3, [x0], #16 eor v3.16b, v7.16b, v3.16b ldr q6, [x0], #16 eor v2.16b, v2.16b, v6.16b ldr q6, [x0], #16 eor v5.16b, v5.16b, v6.16b str q4, [x1], #16 str q1, [x1], #16 str q3, [x1], #16 str q2, [x1], #16 str q5, [x1], #16 bne .Lctr_enc_loop b .Lctr_enc_done .align 4 .Lctr_enc_loop_done: add x2, x2, #8 ldr q16, [x0], #16 // load input eor v0.16b, v0.16b, v16.16b str q0, [x1], #16 // write output cmp x2, #2 blo .Lctr_enc_done ldr q17, [x0], #16 eor v1.16b, v1.16b, v17.16b str q1, [x1], #16 beq .Lctr_enc_done ldr q18, [x0], #16 eor v4.16b, v4.16b, v18.16b str q4, [x1], #16 cmp x2, #4 blo .Lctr_enc_done ldr q19, [x0], #16 eor v6.16b, v6.16b, v19.16b str q6, [x1], #16 beq .Lctr_enc_done ldr q20, [x0], #16 eor v3.16b, v3.16b, v20.16b str q3, [x1], #16 cmp x2, #6 blo .Lctr_enc_done ldr q21, [x0], #16 eor v7.16b, v7.16b, v21.16b str q7, [x1], #16 beq .Lctr_enc_done ldr q22, [x0] eor v2.16b, v2.16b, v22.16b str q2, [x1], #16 .Lctr_enc_done: movi v0.16b, #0 movi v1.16b, #0 .Lctr_enc_bzero: // wipe key schedule [if any] stp q0, q1, [sp], #32 cmp sp, x14 bne .Lctr_enc_bzero ldp d8, d9, [sp, #16] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #48] ldp d14, d15, [sp, #64] ldp x29, x30, [sp], #80 ret .Lctr_enc_short: stp x29, x30, [sp, #-96]! 
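// The .Lctr_enc_short path handles fewer than eight blocks by falling
// back to one AES_encrypt call per block: encrypt the counter, XOR the
// resulting keystream into the input block, then bump the 32-bit counter
// word. Equivalent C sketch (illustrative; block_encrypt stands in for
// AES_encrypt, ctr32_inc is the helper sketched above):
//
//     for (size_t i = 0; i < nblocks; i++) {
//         uint8_t ks[16];
//         block_encrypt(ctr, ks);                  // E_K(counter)
//         for (int j = 0; j < 16; j++)
//             out[16 * i + j] = in[16 * i + j] ^ ks[j];
//         ctr32_inc(ctr);
//     }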
stp x19, x20, [sp, #16] stp x21, x22, [sp, #32] str x23, [sp, #48] mov x19, x0 // copy arguments mov x20, x1 mov x21, x2 mov x22, x3 ldr w23, [x4, #12] // load counter .LSW ldr q1, [x4] // load whole counter value #ifdef __AARCH64EL__ rev w23, w23 #endif str q1, [sp, #80] // copy counter value .Lctr_enc_short_loop: add x0, sp, #80 // input counter value add x1, sp, #64 // output on the stack mov x2, x22 // key bl AES_encrypt ldr q0, [x19], #16 // load input ldr q1, [sp, #64] // load encrypted counter add x23, x23, #1 #ifdef __AARCH64EL__ rev w0, w23 str w0, [sp, #80+12] // next counter value #else str w23, [sp, #80+12] // next counter value #endif eor v0.16b, v0.16b, v1.16b str q0, [x20], #16 // store output subs x21, x21, #1 bne .Lctr_enc_short_loop movi v0.16b, #0 movi v1.16b, #0 stp q0, q1, [sp, #64] ldr x23, [sp, #48] ldp x21, x22, [sp, #32] ldp x19, x20, [sp, #16] ldp x29, x30, [sp], #96 ret .size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks .globl ossl_bsaes_xts_encrypt .type ossl_bsaes_xts_encrypt,%function .align 4 // On entry: // x0 -> input plaintext // x1 -> output ciphertext // x2 -> length of text in bytes (must be at least 16) // x3 -> key1 (used to encrypt the XORed plaintext blocks) // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) // x5 -> 16-byte initial vector (typically, sector number) // On exit: // Output ciphertext filled in // No output registers, usual AAPCS64 register preservation ossl_bsaes_xts_encrypt: AARCH64_VALID_CALL_TARGET // Stack layout: // sp -> // nrounds*128-96 bytes: key schedule // x19 -> // 16 bytes: frame record // 4*16 bytes: tweak storage across _bsaes_encrypt8 // 6*8 bytes: storage for 5 callee-saved general-purpose registers // 8*8 bytes: storage for 8 callee-saved SIMD registers stp x29, x30, [sp, #-192]! 
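// XTS mode, as implemented below: the initial tweak is the encryption of
// the 16-byte IV under key2; each plaintext block is XORed with the
// current tweak both before and after encryption under key1; and the
// tweak is multiplied by x in GF(2^128) between blocks. Minimal C sketch
// of one block (illustrative only; block_encrypt_key1 and xts_double are
// placeholder names, the latter sketched further down):
//
//     // C[i] = E_K1(P[i] ^ T[i]) ^ T[i];  T[i+1] = T[i] * x in GF(2^128)
//     void xts_encrypt_block(const uint8_t p[16], uint8_t c[16],
//                            uint8_t tweak[16])
//     {
//         uint8_t t[16];
//         for (int j = 0; j < 16; j++) t[j] = p[j] ^ tweak[j];
//         block_encrypt_key1(t, t);                // key1 block cipher
//         for (int j = 0; j < 16; j++) c[j] = t[j] ^ tweak[j];
//         xts_double(tweak);                       // advance the tweak
//     }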
stp x19, x20, [sp, #80] stp x21, x22, [sp, #96] str x23, [sp, #112] stp d8, d9, [sp, #128] stp d10, d11, [sp, #144] stp d12, d13, [sp, #160] stp d14, d15, [sp, #176] mov x19, sp mov x20, x0 mov x21, x1 mov x22, x2 mov x23, x3 // generate initial tweak sub sp, sp, #16 mov x0, x5 // iv[] mov x1, sp mov x2, x4 // key2 bl AES_encrypt ldr q11, [sp], #16 ldr w1, [x23, #240] // get # of rounds // allocate the key schedule on the stack add x17, sp, #96 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes // populate the key schedule mov x9, x23 // pass key mov x10, x1 // pass # of rounds mov sp, x17 bl _bsaes_key_convert eor v15.16b, v15.16b, v7.16b // fix up last round key str q15, [x17] // save last round key subs x22, x22, #0x80 blo .Lxts_enc_short b .Lxts_enc_loop .align 4 .Lxts_enc_loop: ldr q8, .Lxts_magic mov x10, x1 // pass rounds add x2, x19, #16 ldr q0, [x20], #16 sshr v1.2d, v11.2d, #63 mov x9, sp // pass key schedule ldr q6, .Lxts_magic+16 add v2.2d, v11.2d, v11.2d cmtst v3.2d, v11.2d, v6.2d and v1.16b, v1.16b, v8.16b ext v1.16b, v1.16b, v1.16b, #8 and v3.16b, v3.16b, v8.16b ldr q4, [x20], #16 eor v12.16b, v2.16b, v1.16b eor v1.16b, v4.16b, v12.16b eor v0.16b, v0.16b, v11.16b cmtst v2.2d, v12.2d, v6.2d add v4.2d, v12.2d, v12.2d add x0, x19, #16 ext v3.16b, v3.16b, v3.16b, #8 and v2.16b, v2.16b, v8.16b eor v13.16b, v4.16b, v3.16b ldr q3, [x20], #16 ext v4.16b, v2.16b, v2.16b, #8 eor v2.16b, v3.16b, v13.16b ldr q3, [x20], #16 add v5.2d, v13.2d, v13.2d cmtst v7.2d, v13.2d, v6.2d and v7.16b, v7.16b, v8.16b ldr q9, [x20], #16 ext v7.16b, v7.16b, v7.16b, #8 ldr q10, [x20], #16 eor v14.16b, v5.16b, v4.16b ldr q16, [x20], #16 add v4.2d, v14.2d, v14.2d eor v3.16b, v3.16b, v14.16b eor v15.16b, v4.16b, v7.16b add v5.2d, v15.2d, v15.2d ldr q7, [x20], #16 cmtst v4.2d, v14.2d, v6.2d and v17.16b, v4.16b, v8.16b cmtst v18.2d, v15.2d, v6.2d eor v4.16b, v9.16b, v15.16b ext v9.16b, v17.16b, v17.16b, #8 eor v9.16b, v5.16b, v9.16b add v17.2d, v9.2d, v9.2d and v18.16b, v18.16b, v8.16b eor v5.16b, v10.16b, v9.16b str q9, [x2], #16 ext v10.16b, v18.16b, v18.16b, #8 cmtst v9.2d, v9.2d, v6.2d and v9.16b, v9.16b, v8.16b eor v10.16b, v17.16b, v10.16b cmtst v17.2d, v10.2d, v6.2d eor v6.16b, v16.16b, v10.16b str q10, [x2], #16 ext v9.16b, v9.16b, v9.16b, #8 add v10.2d, v10.2d, v10.2d eor v9.16b, v10.16b, v9.16b str q9, [x2], #16 eor v7.16b, v7.16b, v9.16b add v9.2d, v9.2d, v9.2d and v8.16b, v17.16b, v8.16b ext v8.16b, v8.16b, v8.16b, #8 eor v8.16b, v9.16b, v8.16b str q8, [x2] // next round tweak bl _bsaes_encrypt8 ldr q8, [x0], #16 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b ldr q9, [x0], #16 eor v4.16b, v4.16b, v13.16b eor v6.16b, v6.16b, v14.16b ldr q10, [x0], #16 eor v3.16b, v3.16b, v15.16b subs x22, x22, #0x80 str q0, [x21], #16 ldr q11, [x0] // next round tweak str q1, [x21], #16 eor v0.16b, v7.16b, v8.16b eor v1.16b, v2.16b, v9.16b str q4, [x21], #16 eor v2.16b, v5.16b, v10.16b str q6, [x21], #16 str q3, [x21], #16 str q0, [x21], #16 str q1, [x21], #16 str q2, [x21], #16 bpl .Lxts_enc_loop .Lxts_enc_short: adds x22, x22, #0x70 bmi .Lxts_enc_done ldr q8, .Lxts_magic sshr v1.2d, v11.2d, #63 add v2.2d, v11.2d, v11.2d ldr q9, .Lxts_magic+16 subs x22, x22, #0x10 ldr q0, [x20], #16 and v1.16b, v1.16b, v8.16b cmtst v3.2d, v11.2d, v9.2d ext v1.16b, v1.16b, v1.16b, #8 and v3.16b, v3.16b, v8.16b eor v12.16b, v2.16b, v1.16b ext v1.16b, v3.16b, v3.16b, #8 add v2.2d, v12.2d, v12.2d cmtst v3.2d, v12.2d, v9.2d eor v13.16b, v2.16b, v1.16b and v22.16b, v3.16b, v8.16b bmi .Lxts_enc_1 ext v2.16b, 
v22.16b, v22.16b, #8 add v3.2d, v13.2d, v13.2d ldr q1, [x20], #16 cmtst v4.2d, v13.2d, v9.2d subs x22, x22, #0x10 eor v14.16b, v3.16b, v2.16b and v23.16b, v4.16b, v8.16b bmi .Lxts_enc_2 ext v3.16b, v23.16b, v23.16b, #8 add v4.2d, v14.2d, v14.2d ldr q2, [x20], #16 cmtst v5.2d, v14.2d, v9.2d eor v0.16b, v0.16b, v11.16b subs x22, x22, #0x10 eor v15.16b, v4.16b, v3.16b and v24.16b, v5.16b, v8.16b bmi .Lxts_enc_3 ext v4.16b, v24.16b, v24.16b, #8 add v5.2d, v15.2d, v15.2d ldr q3, [x20], #16 cmtst v6.2d, v15.2d, v9.2d eor v1.16b, v1.16b, v12.16b subs x22, x22, #0x10 eor v16.16b, v5.16b, v4.16b and v25.16b, v6.16b, v8.16b bmi .Lxts_enc_4 ext v5.16b, v25.16b, v25.16b, #8 add v6.2d, v16.2d, v16.2d add x0, x19, #16 cmtst v7.2d, v16.2d, v9.2d ldr q4, [x20], #16 eor v2.16b, v2.16b, v13.16b str q16, [x0], #16 subs x22, x22, #0x10 eor v17.16b, v6.16b, v5.16b and v26.16b, v7.16b, v8.16b bmi .Lxts_enc_5 ext v7.16b, v26.16b, v26.16b, #8 add v18.2d, v17.2d, v17.2d ldr q5, [x20], #16 eor v3.16b, v3.16b, v14.16b str q17, [x0], #16 subs x22, x22, #0x10 eor v18.16b, v18.16b, v7.16b bmi .Lxts_enc_6 ldr q6, [x20], #16 eor v4.16b, v4.16b, v15.16b eor v5.16b, v5.16b, v16.16b str q18, [x0] // next round tweak mov x9, sp // pass key schedule mov x10, x1 add x0, x19, #16 sub x22, x22, #0x10 eor v6.16b, v6.16b, v17.16b bl _bsaes_encrypt8 ldr q16, [x0], #16 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b ldr q17, [x0], #16 eor v4.16b, v4.16b, v13.16b eor v6.16b, v6.16b, v14.16b eor v3.16b, v3.16b, v15.16b ldr q11, [x0] // next round tweak str q0, [x21], #16 str q1, [x21], #16 eor v0.16b, v7.16b, v16.16b eor v1.16b, v2.16b, v17.16b str q4, [x21], #16 str q6, [x21], #16 str q3, [x21], #16 str q0, [x21], #16 str q1, [x21], #16 b .Lxts_enc_done .align 4 .Lxts_enc_6: eor v4.16b, v4.16b, v15.16b eor v5.16b, v5.16b, v16.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_encrypt8 ldr q16, [x0], #16 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b eor v4.16b, v4.16b, v13.16b eor v6.16b, v6.16b, v14.16b ldr q11, [x0] // next round tweak eor v3.16b, v3.16b, v15.16b str q0, [x21], #16 str q1, [x21], #16 eor v0.16b, v7.16b, v16.16b str q4, [x21], #16 str q6, [x21], #16 str q3, [x21], #16 str q0, [x21], #16 b .Lxts_enc_done .align 4 .Lxts_enc_5: eor v3.16b, v3.16b, v14.16b eor v4.16b, v4.16b, v15.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_encrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b ldr q11, [x0] // next round tweak eor v4.16b, v4.16b, v13.16b eor v6.16b, v6.16b, v14.16b eor v3.16b, v3.16b, v15.16b str q0, [x21], #16 str q1, [x21], #16 str q4, [x21], #16 str q6, [x21], #16 str q3, [x21], #16 b .Lxts_enc_done .align 4 .Lxts_enc_4: eor v2.16b, v2.16b, v13.16b eor v3.16b, v3.16b, v14.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_encrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b eor v4.16b, v4.16b, v13.16b eor v6.16b, v6.16b, v14.16b mov v11.16b, v15.16b // next round tweak str q0, [x21], #16 str q1, [x21], #16 str q4, [x21], #16 str q6, [x21], #16 b .Lxts_enc_done .align 4 .Lxts_enc_3: eor v1.16b, v1.16b, v12.16b eor v2.16b, v2.16b, v13.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_encrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b eor v4.16b, v4.16b, v13.16b mov v11.16b, v14.16b // next round tweak str q0, [x21], #16 str q1, [x21], #16 str q4, [x21], #16 b .Lxts_enc_done .align 4 .Lxts_enc_2: eor 
v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_encrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b mov v11.16b, v13.16b // next round tweak str q0, [x21], #16 str q1, [x21], #16 b .Lxts_enc_done .align 4 .Lxts_enc_1: eor v0.16b, v0.16b, v11.16b sub x0, sp, #16 sub x1, sp, #16 mov x2, x23 mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers mov v14.d[0], v12.d[1] str q0, [sp, #-16]! bl AES_encrypt ldr q0, [sp], #16 trn1 v13.2d, v11.2d, v13.2d trn1 v11.2d, v12.2d, v14.2d // next round tweak eor v0.16b, v0.16b, v13.16b str q0, [x21], #16 .Lxts_enc_done: adds x22, x22, #0x10 beq .Lxts_enc_ret sub x6, x21, #0x10 // Penultimate plaintext block produces final ciphertext part-block // plus remaining part of final plaintext block. Move ciphertext part // to final position and reuse penultimate ciphertext block buffer to // construct final plaintext block .Lxts_enc_steal: ldrb w0, [x20], #1 ldrb w1, [x21, #-0x10] strb w0, [x21, #-0x10] strb w1, [x21], #1 subs x22, x22, #1 bhi .Lxts_enc_steal // Finally encrypt the penultimate ciphertext block using the // last tweak ldr q0, [x6] eor v0.16b, v0.16b, v11.16b str q0, [sp, #-16]! mov x0, sp mov x1, sp mov x2, x23 mov x21, x6 mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers bl AES_encrypt trn1 v11.2d, v11.2d, v13.2d ldr q0, [sp], #16 eor v0.16b, v0.16b, v11.16b str q0, [x21] .Lxts_enc_ret: movi v0.16b, #0 movi v1.16b, #0 .Lxts_enc_bzero: // wipe key schedule stp q0, q1, [sp], #32 cmp sp, x19 bne .Lxts_enc_bzero ldp x19, x20, [sp, #80] ldp x21, x22, [sp, #96] ldr x23, [sp, #112] ldp d8, d9, [sp, #128] ldp d10, d11, [sp, #144] ldp d12, d13, [sp, #160] ldp d14, d15, [sp, #176] ldp x29, x30, [sp], #192 ret .size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt // The assembler doesn't seem capable of de-duplicating these when expressed // using `ldr qd,=` syntax, so assign a symbolic address .align 5 .Lxts_magic: .quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 .globl ossl_bsaes_xts_decrypt .type ossl_bsaes_xts_decrypt,%function .align 4 // On entry: // x0 -> input ciphertext // x1 -> output plaintext // x2 -> length of text in bytes (must be at least 16) // x3 -> key1 (used to decrypt the XORed ciphertext blocks) // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) // x5 -> 16-byte initial vector (typically, sector number) // On exit: // Output plaintext filled in // No output registers, usual AAPCS64 register preservation ossl_bsaes_xts_decrypt: AARCH64_VALID_CALL_TARGET // Stack layout: // sp -> // nrounds*128-96 bytes: key schedule // x19 -> // 16 bytes: frame record // 4*16 bytes: tweak storage across _bsaes_decrypt8 // 6*8 bytes: storage for 5 callee-saved general-purpose registers // 8*8 bytes: storage for 8 callee-saved SIMD registers stp x29, x30, [sp, #-192]! 
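// The tweak update used by both XTS routines is multiplication by x in
// GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1; the
// .Lxts_magic constants above supply the 0x87 reduction value and the
// bit-test masks for the branch-free sshr/cmtst/add/eor sequences that
// compute successive tweaks. Single-tweak C sketch (illustrative only,
// assuming a little-endian host):
//
//     #include <stdint.h>
//     #include <string.h>
//
//     // tweak <- tweak * x in GF(2^128), 16-byte little-endian value
//     static void xts_double(uint8_t tweak[16])
//     {
//         uint64_t lo, hi;
//         memcpy(&lo, tweak, 8);
//         memcpy(&hi, tweak + 8, 8);
//         uint64_t carry = hi >> 63;               // bit 127 falls out
//         hi = (hi << 1) | (lo >> 63);
//         lo = (lo << 1) ^ (carry ? 0x87 : 0);     // fold the carry back in
//         memcpy(tweak, &lo, 8);
//         memcpy(tweak + 8, &hi, 8);
//     }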
stp x19, x20, [sp, #80] stp x21, x22, [sp, #96] str x23, [sp, #112] stp d8, d9, [sp, #128] stp d10, d11, [sp, #144] stp d12, d13, [sp, #160] stp d14, d15, [sp, #176] mov x19, sp mov x20, x0 mov x21, x1 mov x22, x2 mov x23, x3 // generate initial tweak sub sp, sp, #16 mov x0, x5 // iv[] mov x1, sp mov x2, x4 // key2 bl AES_encrypt ldr q11, [sp], #16 ldr w1, [x23, #240] // get # of rounds // allocate the key schedule on the stack add x17, sp, #96 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes // populate the key schedule mov x9, x23 // pass key mov x10, x1 // pass # of rounds mov sp, x17 bl _bsaes_key_convert ldr q6, [sp] str q15, [x17] // save last round key eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) str q6, [sp] sub x30, x22, #0x10 tst x22, #0xf // if not multiple of 16 csel x22, x30, x22, ne // subtract another 16 bytes subs x22, x22, #0x80 blo .Lxts_dec_short b .Lxts_dec_loop .align 4 .Lxts_dec_loop: ldr q8, .Lxts_magic mov x10, x1 // pass rounds add x2, x19, #16 ldr q0, [x20], #16 sshr v1.2d, v11.2d, #63 mov x9, sp // pass key schedule ldr q6, .Lxts_magic+16 add v2.2d, v11.2d, v11.2d cmtst v3.2d, v11.2d, v6.2d and v1.16b, v1.16b, v8.16b ext v1.16b, v1.16b, v1.16b, #8 and v3.16b, v3.16b, v8.16b ldr q4, [x20], #16 eor v12.16b, v2.16b, v1.16b eor v1.16b, v4.16b, v12.16b eor v0.16b, v0.16b, v11.16b cmtst v2.2d, v12.2d, v6.2d add v4.2d, v12.2d, v12.2d add x0, x19, #16 ext v3.16b, v3.16b, v3.16b, #8 and v2.16b, v2.16b, v8.16b eor v13.16b, v4.16b, v3.16b ldr q3, [x20], #16 ext v4.16b, v2.16b, v2.16b, #8 eor v2.16b, v3.16b, v13.16b ldr q3, [x20], #16 add v5.2d, v13.2d, v13.2d cmtst v7.2d, v13.2d, v6.2d and v7.16b, v7.16b, v8.16b ldr q9, [x20], #16 ext v7.16b, v7.16b, v7.16b, #8 ldr q10, [x20], #16 eor v14.16b, v5.16b, v4.16b ldr q16, [x20], #16 add v4.2d, v14.2d, v14.2d eor v3.16b, v3.16b, v14.16b eor v15.16b, v4.16b, v7.16b add v5.2d, v15.2d, v15.2d ldr q7, [x20], #16 cmtst v4.2d, v14.2d, v6.2d and v17.16b, v4.16b, v8.16b cmtst v18.2d, v15.2d, v6.2d eor v4.16b, v9.16b, v15.16b ext v9.16b, v17.16b, v17.16b, #8 eor v9.16b, v5.16b, v9.16b add v17.2d, v9.2d, v9.2d and v18.16b, v18.16b, v8.16b eor v5.16b, v10.16b, v9.16b str q9, [x2], #16 ext v10.16b, v18.16b, v18.16b, #8 cmtst v9.2d, v9.2d, v6.2d and v9.16b, v9.16b, v8.16b eor v10.16b, v17.16b, v10.16b cmtst v17.2d, v10.2d, v6.2d eor v6.16b, v16.16b, v10.16b str q10, [x2], #16 ext v9.16b, v9.16b, v9.16b, #8 add v10.2d, v10.2d, v10.2d eor v9.16b, v10.16b, v9.16b str q9, [x2], #16 eor v7.16b, v7.16b, v9.16b add v9.2d, v9.2d, v9.2d and v8.16b, v17.16b, v8.16b ext v8.16b, v8.16b, v8.16b, #8 eor v8.16b, v9.16b, v8.16b str q8, [x2] // next round tweak bl _bsaes_decrypt8 eor v6.16b, v6.16b, v13.16b eor v0.16b, v0.16b, v11.16b ldr q8, [x0], #16 eor v7.16b, v7.16b, v8.16b str q0, [x21], #16 eor v0.16b, v1.16b, v12.16b ldr q1, [x0], #16 eor v1.16b, v3.16b, v1.16b subs x22, x22, #0x80 eor v2.16b, v2.16b, v15.16b eor v3.16b, v4.16b, v14.16b ldr q4, [x0], #16 str q0, [x21], #16 ldr q11, [x0] // next round tweak eor v0.16b, v5.16b, v4.16b str q6, [x21], #16 str q3, [x21], #16 str q2, [x21], #16 str q7, [x21], #16 str q1, [x21], #16 str q0, [x21], #16 bpl .Lxts_dec_loop .Lxts_dec_short: adds x22, x22, #0x70 bmi .Lxts_dec_done ldr q8, .Lxts_magic sshr v1.2d, v11.2d, #63 add v2.2d, v11.2d, v11.2d ldr q9, .Lxts_magic+16 subs x22, x22, #0x10 ldr q0, [x20], #16 and v1.16b, v1.16b, v8.16b cmtst v3.2d, v11.2d, v9.2d ext v1.16b, v1.16b, v1.16b, #8 and v3.16b, v3.16b, v8.16b eor v12.16b, v2.16b, v1.16b ext v1.16b, 
v3.16b, v3.16b, #8 add v2.2d, v12.2d, v12.2d cmtst v3.2d, v12.2d, v9.2d eor v13.16b, v2.16b, v1.16b and v22.16b, v3.16b, v8.16b bmi .Lxts_dec_1 ext v2.16b, v22.16b, v22.16b, #8 add v3.2d, v13.2d, v13.2d ldr q1, [x20], #16 cmtst v4.2d, v13.2d, v9.2d subs x22, x22, #0x10 eor v14.16b, v3.16b, v2.16b and v23.16b, v4.16b, v8.16b bmi .Lxts_dec_2 ext v3.16b, v23.16b, v23.16b, #8 add v4.2d, v14.2d, v14.2d ldr q2, [x20], #16 cmtst v5.2d, v14.2d, v9.2d eor v0.16b, v0.16b, v11.16b subs x22, x22, #0x10 eor v15.16b, v4.16b, v3.16b and v24.16b, v5.16b, v8.16b bmi .Lxts_dec_3 ext v4.16b, v24.16b, v24.16b, #8 add v5.2d, v15.2d, v15.2d ldr q3, [x20], #16 cmtst v6.2d, v15.2d, v9.2d eor v1.16b, v1.16b, v12.16b subs x22, x22, #0x10 eor v16.16b, v5.16b, v4.16b and v25.16b, v6.16b, v8.16b bmi .Lxts_dec_4 ext v5.16b, v25.16b, v25.16b, #8 add v6.2d, v16.2d, v16.2d add x0, x19, #16 cmtst v7.2d, v16.2d, v9.2d ldr q4, [x20], #16 eor v2.16b, v2.16b, v13.16b str q16, [x0], #16 subs x22, x22, #0x10 eor v17.16b, v6.16b, v5.16b and v26.16b, v7.16b, v8.16b bmi .Lxts_dec_5 ext v7.16b, v26.16b, v26.16b, #8 add v18.2d, v17.2d, v17.2d ldr q5, [x20], #16 eor v3.16b, v3.16b, v14.16b str q17, [x0], #16 subs x22, x22, #0x10 eor v18.16b, v18.16b, v7.16b bmi .Lxts_dec_6 ldr q6, [x20], #16 eor v4.16b, v4.16b, v15.16b eor v5.16b, v5.16b, v16.16b str q18, [x0] // next round tweak mov x9, sp // pass key schedule mov x10, x1 add x0, x19, #16 sub x22, x22, #0x10 eor v6.16b, v6.16b, v17.16b bl _bsaes_decrypt8 ldr q16, [x0], #16 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b ldr q17, [x0], #16 eor v6.16b, v6.16b, v13.16b eor v4.16b, v4.16b, v14.16b eor v2.16b, v2.16b, v15.16b ldr q11, [x0] // next round tweak str q0, [x21], #16 str q1, [x21], #16 eor v0.16b, v7.16b, v16.16b eor v1.16b, v3.16b, v17.16b str q6, [x21], #16 str q4, [x21], #16 str q2, [x21], #16 str q0, [x21], #16 str q1, [x21], #16 b .Lxts_dec_done .align 4 .Lxts_dec_6: eor v4.16b, v4.16b, v15.16b eor v5.16b, v5.16b, v16.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_decrypt8 ldr q16, [x0], #16 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v4.16b, v4.16b, v14.16b ldr q11, [x0] // next round tweak eor v2.16b, v2.16b, v15.16b str q0, [x21], #16 str q1, [x21], #16 eor v0.16b, v7.16b, v16.16b str q6, [x21], #16 str q4, [x21], #16 str q2, [x21], #16 str q0, [x21], #16 b .Lxts_dec_done .align 4 .Lxts_dec_5: eor v3.16b, v3.16b, v14.16b eor v4.16b, v4.16b, v15.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_decrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b ldr q11, [x0] // next round tweak eor v6.16b, v6.16b, v13.16b eor v4.16b, v4.16b, v14.16b eor v2.16b, v2.16b, v15.16b str q0, [x21], #16 str q1, [x21], #16 str q6, [x21], #16 str q4, [x21], #16 str q2, [x21], #16 b .Lxts_dec_done .align 4 .Lxts_dec_4: eor v2.16b, v2.16b, v13.16b eor v3.16b, v3.16b, v14.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_decrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b eor v6.16b, v6.16b, v13.16b eor v4.16b, v4.16b, v14.16b mov v11.16b, v15.16b // next round tweak str q0, [x21], #16 str q1, [x21], #16 str q6, [x21], #16 str q4, [x21], #16 b .Lxts_dec_done .align 4 .Lxts_dec_3: eor v1.16b, v1.16b, v12.16b eor v2.16b, v2.16b, v13.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_decrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b eor v6.16b, v6.16b, 
v13.16b mov v11.16b, v14.16b // next round tweak str q0, [x21], #16 str q1, [x21], #16 str q6, [x21], #16 b .Lxts_dec_done .align 4 .Lxts_dec_2: eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b mov x9, sp // pass key schedule mov x10, x1 // pass rounds add x0, x19, #16 bl _bsaes_decrypt8 eor v0.16b, v0.16b, v11.16b eor v1.16b, v1.16b, v12.16b mov v11.16b, v13.16b // next round tweak str q0, [x21], #16 str q1, [x21], #16 b .Lxts_dec_done .align 4 .Lxts_dec_1: eor v0.16b, v0.16b, v11.16b sub x0, sp, #16 sub x1, sp, #16 mov x2, x23 mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers mov v14.d[0], v12.d[1] str q0, [sp, #-16]! bl AES_decrypt ldr q0, [sp], #16 trn1 v13.2d, v11.2d, v13.2d trn1 v11.2d, v12.2d, v14.2d // next round tweak eor v0.16b, v0.16b, v13.16b str q0, [x21], #16 .Lxts_dec_done: adds x22, x22, #0x10 beq .Lxts_dec_ret // calculate one round of extra tweak for the stolen ciphertext ldr q8, .Lxts_magic sshr v6.2d, v11.2d, #63 and v6.16b, v6.16b, v8.16b add v12.2d, v11.2d, v11.2d ext v6.16b, v6.16b, v6.16b, #8 eor v12.16b, v12.16b, v6.16b // perform the final decryption with the last tweak value ldr q0, [x20], #16 eor v0.16b, v0.16b, v12.16b str q0, [sp, #-16]! mov x0, sp mov x1, sp mov x2, x23 mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers mov v14.d[0], v12.d[1] bl AES_decrypt trn1 v12.2d, v12.2d, v14.2d trn1 v11.2d, v11.2d, v13.2d ldr q0, [sp], #16 eor v0.16b, v0.16b, v12.16b str q0, [x21] mov x6, x21 // Penultimate ciphertext block produces final plaintext part-block // plus remaining part of final ciphertext block. Move plaintext part // to final position and reuse penultimate plaintext block buffer to // construct final ciphertext block .Lxts_dec_steal: ldrb w1, [x21] ldrb w0, [x20], #1 strb w1, [x21, #0x10] strb w0, [x21], #1 subs x22, x22, #1 bhi .Lxts_dec_steal // Finally decrypt the penultimate plaintext block using the // penultimate tweak ldr q0, [x6] eor v0.16b, v0.16b, v11.16b str q0, [sp, #-16]! mov x0, sp mov x1, sp mov x2, x23 mov x21, x6 bl AES_decrypt trn1 v11.2d, v11.2d, v13.2d ldr q0, [sp], #16 eor v0.16b, v0.16b, v11.16b str q0, [x21] .Lxts_dec_ret: movi v0.16b, #0 movi v1.16b, #0 .Lxts_dec_bzero: // wipe key schedule stp q0, q1, [sp], #32 cmp sp, x19 bne .Lxts_dec_bzero ldp x19, x20, [sp, #80] ldp x21, x22, [sp, #96] ldr x23, [sp, #112] ldp d8, d9, [sp, #128] ldp d10, d11, [sp, #144] ldp d12, d13, [sp, #160] ldp d14, d15, [sp, #176] ldp x29, x30, [sp], #192 ret .size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
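// For reference, the ciphertext-stealing tail of ossl_bsaes_xts_decrypt
// above follows the usual XTS-CTS pattern. Sketch only, with r trailing
// bytes (0 < r < 16), blocks numbered 0..n where block n is partial:
//
//     // PP     = D_K1(C[n-1] ^ T[n]) ^ T[n]        (the "extra" tweak)
//     // P[n]   = PP[0..r-1]
//     // C'     = C[n][0..r-1] || PP[r..15]
//     // P[n-1] = D_K1(C' ^ T[n-1]) ^ T[n-1]        (the penultimate tweak)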