#!/usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, and it is presented as pure assembly.


#include "crypto/arm_arch.h"

.text

.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt

.type   _bsaes_decrypt8,%function
.align  4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_decrypt8:
        ldr     q8, [x9], #16
        adr     x11, .LM0ISR
        movi    v9.16b, #0x55
        ldr     q10, [x11], #16
        movi    v16.16b, #0x33
        movi    v17.16b, #0x0f
        sub     x10, x10, #1
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v10.16b
        tbl     v1.16b, {v1.16b}, v10.16b
        tbl     v2.16b, {v2.16b}, v10.16b
        tbl     v4.16b, {v4.16b}, v10.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v10.16b
        tbl     v5.16b, {v5.16b}, v10.16b
        tbl     v6.16b, {v6.16b}, v10.16b
        ushr    v8.2d, v0.2d, #1
        tbl     v7.16b, {v7.16b}, v10.16b
        ushr    v10.2d, v4.2d, #1
        ushr    v18.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        ushr    v19.2d, v6.2d, #1
        eor     v10.16b, v10.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v10.16b, v10.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v9.16b, v19.16b, v9.16b
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #1
        eor     v3.16b, v3.16b, v18.16b
        shl     v18.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v9.2d, #1
        eor     v7.16b, v7.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v18.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v18.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v10.16b, v10.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Ldec_sbox
.align  4
.Ldec_loop:
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
        eor     v1.16b, v1.16b, v4.16b
        eor     v3.16b, v3.16b, v4.16b
        subs    x10, x10, #1
        eor     v4.16b, v4.16b, v7.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v1.16b, v6.16b
        eor     v6.16b, v6.16b, v4.16b
        eor     v2.16b, v2.16b, v5.16b
        eor     v0.16b, v0.16b, v1.16b
        eor     v7.16b, v7.16b, v6.16b
        eor     v8.16b, v6.16b, v2.16b
        and     v9.16b, v4.16b, v6.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v16.16b, v7.16b, v4.16b
        eor     v17.16b, v4.16b, v0.16b
        and     v18.16b, v0.16b, v2.16b
        eor     v19.16b, v7.16b, v4.16b
        eor     v1.16b, v1.16b, v3.16b
        eor     v20.16b, v3.16b, v0.16b
        eor     v21.16b, v5.16b, v2.16b
        eor     v22.16b, v3.16b, v7.16b
        and     v8.16b, v17.16b, v8.16b
        orr     v17.16b, v3.16b, v5.16b
        eor     v23.16b, v1.16b, v6.16b
        eor     v24.16b, v20.16b, v16.16b
        eor     v25.16b, v1.16b, v5.16b
        orr     v26.16b, v20.16b, v21.16b
        and     v20.16b, v20.16b, v21.16b
        and     v27.16b, v7.16b, v1.16b
        eor     v21.16b, v21.16b, v23.16b
        orr     v28.16b, v16.16b, v23.16b
        orr     v29.16b, v22.16b, v25.16b
        eor     v26.16b, v26.16b, v8.16b
        and     v16.16b, v16.16b, v23.16b
        and     v22.16b, v22.16b, v25.16b
        and     v21.16b, v24.16b, v21.16b
        eor     v8.16b, v28.16b, v8.16b
        eor     v23.16b, v5.16b, v2.16b
        eor     v24.16b, v1.16b, v6.16b
        eor     v16.16b, v16.16b, v22.16b
        eor     v22.16b, v3.16b, v0.16b
        eor     v25.16b, v29.16b, v21.16b
        eor     v21.16b, v26.16b, v21.16b
        eor     v8.16b, v8.16b, v20.16b
        eor     v26.16b, v23.16b, v24.16b
        eor     v16.16b, v16.16b, v20.16b
        eor     v28.16b, v22.16b, v19.16b
        eor     v20.16b, v25.16b, v20.16b
        eor     v9.16b, v21.16b, v9.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v21.16b, v16.16b, v17.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v20.16b, v27.16b
        eor     v20.16b, v3.16b, v7.16b
        eor     v25.16b, v9.16b, v8.16b
        eor     v27.16b, v0.16b, v4.16b
        and     v29.16b, v9.16b, v17.16b
        eor     v30.16b, v8.16b, v29.16b
        eor     v31.16b, v21.16b, v29.16b
        eor     v29.16b, v21.16b, v29.16b
        bsl     v30.16b, v17.16b, v21.16b
        bsl     v31.16b, v9.16b, v8.16b
        bsl     v16.16b, v30.16b, v29.16b
        bsl     v21.16b, v29.16b, v30.16b
        eor     v8.16b, v31.16b, v30.16b
        and     v1.16b, v1.16b, v31.16b
        and     v9.16b, v16.16b, v31.16b
        and     v6.16b, v6.16b, v30.16b
        eor     v16.16b, v17.16b, v21.16b
        and     v4.16b, v4.16b, v30.16b
        eor     v17.16b, v8.16b, v30.16b
        and     v21.16b, v24.16b, v8.16b
        eor     v9.16b, v9.16b, v25.16b
        and     v19.16b, v19.16b, v8.16b
        eor     v24.16b, v30.16b, v16.16b
        eor     v25.16b, v30.16b, v16.16b
        and     v7.16b, v7.16b, v17.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v29.16b, v9.16b, v16.16b
        eor     v30.16b, v31.16b, v9.16b
        and     v0.16b, v24.16b, v0.16b
        and     v9.16b, v18.16b, v9.16b
        and     v2.16b, v25.16b, v2.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v18.16b, v29.16b, v16.16b
        and     v5.16b, v30.16b, v5.16b
        eor     v24.16b, v8.16b, v29.16b
        and     v25.16b, v26.16b, v29.16b
        and     v26.16b, v28.16b, v29.16b
        eor     v8.16b, v8.16b, v29.16b
        eor     v17.16b, v17.16b, v18.16b
        eor     v5.16b, v1.16b, v5.16b
        and     v23.16b, v24.16b, v23.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v19.16b, v19.16b, v26.16b
        eor     v0.16b, v4.16b, v0.16b
        and     v3.16b, v17.16b, v3.16b
        eor     v1.16b, v9.16b, v1.16b
        eor     v9.16b, v25.16b, v23.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v2.16b, v6.16b, v2.16b
        and     v6.16b, v8.16b, v22.16b
        eor     v3.16b, v7.16b, v3.16b
        and     v8.16b, v20.16b, v18.16b
        eor     v10.16b, v10.16b, v9.16b
        eor     v0.16b, v0.16b, v19.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v1.16b, v2.16b, v21.16b
        eor     v3.16b, v3.16b, v19.16b
        and     v16.16b, v27.16b, v16.16b
        eor     v17.16b, v26.16b, v6.16b
        eor     v6.16b, v8.16b, v7.16b
        eor     v7.16b, v1.16b, v9.16b
        eor     v1.16b, v5.16b, v3.16b
        eor     v2.16b, v10.16b, v3.16b
        eor     v4.16b, v16.16b, v4.16b
        eor     v8.16b, v6.16b, v17.16b
        eor     v5.16b, v9.16b, v3.16b
        eor     v9.16b, v0.16b, v1.16b
        eor     v6.16b, v7.16b, v1.16b
        eor     v0.16b, v4.16b, v17.16b
        eor     v4.16b, v8.16b, v7.16b
        eor     v7.16b, v9.16b, v2.16b
        eor     v8.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        eor     v4.16b, v7.16b, v0.16b
        eor     v7.16b, v8.16b, v3.16b
        bcc     .Ldec_done
        ext     v8.16b, v0.16b, v0.16b, #8
        ext     v9.16b, v1.16b, v1.16b, #8
        ldr     q28, [x11]                  // load from .LISR in common case (x10 > 0)
        ext     v10.16b, v6.16b, v6.16b, #8
        ext     v16.16b, v3.16b, v3.16b, #8
        ext     v17.16b, v5.16b, v5.16b, #8
        ext     v18.16b, v4.16b, v4.16b, #8
        eor     v8.16b, v8.16b, v0.16b
        eor     v9.16b, v9.16b, v1.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v17.16b, v17.16b, v5.16b
        ext     v19.16b, v2.16b, v2.16b, #8
        ext     v20.16b, v7.16b, v7.16b, #8
        eor     v18.16b, v18.16b, v4.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v8.16b, v2.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v19.16b, v2.16b
        eor     v9.16b, v20.16b, v7.16b
        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v6.16b, v6.16b, v17.16b
        eor     v8.16b, v8.16b, v16.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v4.16b, v4.16b, v16.16b
        eor     v2.16b, v3.16b, v2.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v3.16b, v5.16b, v9.16b
        eor     v5.16b, v8.16b, v17.16b
        eor     v7.16b, v7.16b, v17.16b
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v6.16b, v6.16b, #12
        ext     v10.16b, v4.16b, v4.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v5.16b, v5.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v3.16b, v3.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v17.16b, v17.16b, v4.16b
        eor     v10.16b, v10.16b, v6.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v3.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v18.16b, v18.16b, v5.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v21.16b, v5.16b, v5.16b, #8
        ext     v5.16b, v7.16b, v7.16b, #8
        eor     v7.16b, v20.16b, v2.16b
        ext     v4.16b, v4.16b, v4.16b, #8
        ext     v20.16b, v3.16b, v3.16b, #8
        eor     v17.16b, v17.16b, v3.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v3.16b, v10.16b, v3.16b
        ext     v10.16b, v6.16b, v6.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v18.16b
        eor     v3.16b, v3.16b, v4.16b
        eor     v7.16b, v20.16b, v7.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v4.16b, v21.16b, v17.16b
        eor     v2.16b, v10.16b, v9.16b
        bne     .Ldec_loop
        ldr     q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0)
        b       .Ldec_loop
.align  4
.Ldec_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]
        ushr    v16.2d, v2.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v6.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v3.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v4.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v4.16b, v4.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v6.16b, v6.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v3.16b, v3.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v2.2d, #2
        eor     v8.16b, v8.16b, v4.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v4.16b, v4.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v8.2d, v4.2d, #4
        ushr    v9.2d, v6.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v3.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v2.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v4.16b, v4.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size   _bsaes_decrypt8,.-_bsaes_decrypt8

.type   _bsaes_const,%object
.align  6
_bsaes_const:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad   0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad   0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad   0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad   0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad   0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad   0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad   0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad   0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad   0x090d01050c000408, 0x03070b0f060a0e02

.align  6
.size   _bsaes_const,.-_bsaes_const

.type   _bsaes_encrypt8,%function
.align  4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_encrypt8:
        ldr     q8, [x9], #16
        adr     x11, .LM0SR
        ldr     q9, [x11], #16
_bsaes_encrypt8_alt:
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        sub     x10, x10, #1
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v9.16b
        tbl     v1.16b, {v1.16b}, v9.16b
        tbl     v2.16b, {v2.16b}, v9.16b
        tbl     v4.16b, {v4.16b}, v9.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v9.16b
        tbl     v5.16b, {v5.16b}, v9.16b
        tbl     v6.16b, {v6.16b}, v9.16b
        ushr    v8.2d, v0.2d, #1
        movi    v10.16b, #0x55
        tbl     v7.16b, {v7.16b}, v9.16b
        ushr    v9.2d, v4.2d, #1
        movi    v16.16b, #0x33
        ushr    v17.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        movi    v18.16b, #0x0f
        ushr    v19.2d, v6.2d, #1
        eor     v9.16b, v9.16b, v5.16b
        eor     v17.16b, v17.16b, v3.16b
        and     v8.16b, v8.16b, v10.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v9.16b, v9.16b, v10.16b
        and     v17.16b, v17.16b, v10.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v10.16b, v19.16b, v10.16b
        eor     v5.16b, v5.16b, v9.16b
        shl     v9.2d, v9.2d, #1
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v10.2d, #1
        eor     v7.16b, v7.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v17.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v17.16b, v17.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v17.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v18.16b
        and     v9.16b, v9.16b, v18.16b
        and     v10.16b, v10.16b, v18.16b
        and     v16.16b, v16.16b, v18.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Lenc_sbox
.align  4
.Lenc_loop:
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
        eor     v5.16b, v5.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        subs    x10, x10, #1
        eor     v2.16b, v2.16b, v1.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v8.16b, v3.16b, v7.16b
        eor     v6.16b, v6.16b, v2.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v8.16b, v8.16b, v4.16b
        eor     v3.16b, v6.16b, v3.16b
        eor     v4.16b, v4.16b, v5.16b
        eor     v6.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v8.16b, v1.16b
        eor     v8.16b, v7.16b, v4.16b
        eor     v9.16b, v3.16b, v0.16b
        eor     v10.16b, v7.16b, v6.16b
        eor     v16.16b, v5.16b, v3.16b
        eor     v17.16b, v6.16b, v2.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v19.16b, v2.16b, v4.16b
        eor     v20.16b, v1.16b, v0.16b
        orr     v21.16b, v8.16b, v9.16b
        orr     v22.16b, v10.16b, v16.16b
        eor     v23.16b, v8.16b, v17.16b
        eor     v24.16b, v9.16b, v18.16b
        and     v19.16b, v19.16b, v20.16b
        orr     v20.16b, v17.16b, v18.16b
        and     v8.16b, v8.16b, v9.16b
        and     v9.16b, v17.16b, v18.16b
        and     v17.16b, v23.16b, v24.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v16.16b, v21.16b, v19.16b
        eor     v18.16b, v20.16b, v19.16b
        and     v19.16b, v2.16b, v1.16b
        and     v20.16b, v6.16b, v5.16b
        eor     v21.16b, v22.16b, v17.16b
        eor     v9.16b, v9.16b, v10.16b
        eor     v10.16b, v16.16b, v17.16b
        eor     v16.16b, v18.16b, v8.16b
        and     v17.16b, v4.16b, v0.16b
        orr     v18.16b, v7.16b, v3.16b
        eor     v21.16b, v21.16b, v8.16b
        eor     v8.16b, v9.16b, v8.16b
        eor     v9.16b, v10.16b, v19.16b
        eor     v10.16b, v3.16b, v0.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v5.16b, v1.16b
        eor     v19.16b, v21.16b, v20.16b
        eor     v20.16b, v8.16b, v18.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v7.16b, v4.16b
        eor     v21.16b, v9.16b, v16.16b
        eor     v22.16b, v6.16b, v2.16b
        and     v23.16b, v9.16b, v19.16b
        eor     v24.16b, v10.16b, v17.16b
        eor     v25.16b, v0.16b, v1.16b
        eor     v26.16b, v7.16b, v6.16b
        eor     v27.16b, v18.16b, v22.16b
        eor     v28.16b, v3.16b, v5.16b
        eor     v29.16b, v16.16b, v23.16b
        eor     v30.16b, v20.16b, v23.16b
        eor     v23.16b, v20.16b, v23.16b
        eor     v31.16b, v4.16b, v2.16b
        bsl     v29.16b, v19.16b, v20.16b
        bsl     v30.16b, v9.16b, v16.16b
        bsl     v8.16b, v29.16b, v23.16b
        bsl     v20.16b, v23.16b, v29.16b
        eor     v9.16b, v30.16b, v29.16b
        and     v5.16b, v5.16b, v30.16b
        and     v8.16b, v8.16b, v30.16b
        and     v1.16b, v1.16b, v29.16b
        eor     v16.16b, v19.16b, v20.16b
        and     v2.16b, v2.16b, v29.16b
        eor     v19.16b, v9.16b, v29.16b
        and     v17.16b, v17.16b, v9.16b
        eor     v8.16b, v8.16b, v21.16b
        and     v20.16b, v22.16b, v9.16b
        eor     v21.16b, v29.16b, v16.16b
        eor     v22.16b, v29.16b, v16.16b
        and     v23.16b, v25.16b, v16.16b
        and     v6.16b, v6.16b, v19.16b
        eor     v25.16b, v8.16b, v16.16b
        eor     v29.16b, v30.16b, v8.16b
        and     v4.16b, v21.16b, v4.16b
        and     v8.16b, v28.16b, v8.16b
        and     v0.16b, v22.16b, v0.16b
        eor     v21.16b, v23.16b, v1.16b
        eor     v22.16b, v9.16b, v25.16b
        eor     v9.16b, v9.16b, v25.16b
        eor     v23.16b, v25.16b, v16.16b
        and     v3.16b, v29.16b, v3.16b
        and     v24.16b, v24.16b, v25.16b
        and     v25.16b, v27.16b, v25.16b
        and     v10.16b, v22.16b, v10.16b
        and     v9.16b, v9.16b, v18.16b
        eor     v18.16b, v19.16b, v23.16b
        and     v19.16b, v26.16b, v23.16b
        eor     v3.16b, v5.16b, v3.16b
        eor     v17.16b, v17.16b, v24.16b
        eor     v10.16b, v24.16b, v10.16b
        and     v16.16b, v31.16b, v16.16b
        eor     v20.16b, v20.16b, v25.16b
        eor     v9.16b, v25.16b, v9.16b
        eor     v4.16b, v2.16b, v4.16b
        and     v7.16b, v18.16b, v7.16b
        eor     v18.16b, v19.16b, v6.16b
        eor     v5.16b, v8.16b, v5.16b
        eor     v0.16b, v1.16b, v0.16b
        eor     v1.16b, v21.16b, v10.16b
        eor     v8.16b, v3.16b, v17.16b
        eor     v2.16b, v16.16b, v2.16b
        eor     v3.16b, v6.16b, v7.16b
        eor     v6.16b, v18.16b, v9.16b
        eor     v4.16b, v4.16b, v20.16b
        eor     v10.16b, v5.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v9.16b, v2.16b, v9.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v7.16b, v6.16b, v1.16b
        eor     v5.16b, v8.16b, v4.16b
        eor     v6.16b, v10.16b, v1.16b
        eor     v2.16b, v4.16b, v0.16b
        eor     v4.16b, v3.16b, v10.16b
        eor     v9.16b, v9.16b, v7.16b
        eor     v3.16b, v0.16b, v5.16b
        eor     v0.16b, v1.16b, v4.16b
        eor     v1.16b, v4.16b, v8.16b
        eor     v4.16b, v9.16b, v5.16b
        eor     v6.16b, v6.16b, v3.16b
        bcc     .Lenc_done
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v4.16b, v4.16b, #12
        ldr     q28, [x11]
        ext     v10.16b, v6.16b, v6.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v3.16b, v3.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v6.16b, v6.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v5.16b, v5.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v3.16b, v3.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v5.16b, v5.16b, v20.16b
        eor     v17.16b, v17.16b, v6.16b
        eor     v10.16b, v10.16b, v4.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v5.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v3.16b, v3.16b, v3.16b, #8
        ext     v7.16b, v7.16b, v7.16b, #8
        eor     v20.16b, v20.16b, v2.16b
        ext     v6.16b, v6.16b, v6.16b, #8
        ext     v21.16b, v5.16b, v5.16b, #8
        eor     v17.16b, v17.16b, v5.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v10.16b, v10.16b, v5.16b
        ext     v22.16b, v4.16b, v4.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v7.16b, v18.16b
        eor     v4.16b, v3.16b, v17.16b
        eor     v3.16b, v6.16b, v10.16b
        eor     v7.16b, v21.16b, v20.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v2.16b, v22.16b, v9.16b
        bne     .Lenc_loop
        ldr     q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
        b       .Lenc_loop
.align  4
.Lenc_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]
        ushr    v16.2d, v3.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v4.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v2.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v6.16b, v6.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v3.16b, v3.16b, v9.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v3.2d, #2
        eor     v8.16b, v8.16b, v6.16b
        eor     v9.16b, v9.16b, v4.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v2.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v6.16b, v6.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v4.16b, v4.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v3.16b, v3.16b, v17.16b
        ushr    v8.2d, v6.2d, #4
        ushr    v9.2d, v4.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v2.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v3.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v2.16b, v2.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v6.16b, v6.16b, v8.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size   _bsaes_encrypt8,.-_bsaes_encrypt8

.type   _bsaes_key_convert,%function
.align  4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
_bsaes_key_convert:
#ifdef __AARCH64EL__
        adr     x11, .LM0_littleendian
#else
        adr     x11, .LM0_bigendian
#endif
        ldr     q0, [x9], #16               // load round 0 key
        ldr     q1, [x11]                   // .LM0
        ldr     q15, [x9], #16              // load round 1 key

        movi    v7.16b, #0x63               // compose .L63
        movi    v16.16b, #0x01              // bit masks
        movi    v17.16b, #0x02
        movi    v18.16b, #0x04
        movi    v19.16b, #0x08
        movi    v20.16b, #0x10
        movi    v21.16b, #0x20
        movi    v22.16b, #0x40
        movi    v23.16b, #0x80

#ifdef __AARCH64EL__
        rev32   v0.16b, v0.16b
#endif
        sub     x10, x10, #1
        str     q0, [x17], #16              // save round 0 key

.align  4
.Lkey_loop:
        tbl     v0.16b, {v15.16b}, v1.16b
        ldr     q15, [x9], #16              // load next round key

        eor     v0.16b, v0.16b, v7.16b
        cmtst   v24.16b, v0.16b, v16.16b
        cmtst   v25.16b, v0.16b, v17.16b
        cmtst   v26.16b, v0.16b, v18.16b
        cmtst   v27.16b, v0.16b, v19.16b
        cmtst   v28.16b, v0.16b, v20.16b
        cmtst   v29.16b, v0.16b, v21.16b
        cmtst   v30.16b, v0.16b, v22.16b
        cmtst   v31.16b, v0.16b, v23.16b
        sub     x10, x10, #1
        st1     {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
        st1     {v28.16b-v31.16b}, [x17], #64
        cbnz    x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
        rev32   v15.16b, v15.16b
        adr     x11, .LM0_bigendian
#endif
        ret
.size   _bsaes_key_convert,.-_bsaes_key_convert

.globl  ossl_bsaes_cbc_encrypt
.type   ossl_bsaes_cbc_encrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
        cmp     x2, #128
        bhs     .Lcbc_do_bsaes
        b       AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

        stp     x29, x30, [sp, #-48]!
        stp     d8, d9, [sp, #16]
        stp     d10, d15, [sp, #32]
        lsr     x2, x2, #4                  // len in 16 byte blocks

        ldr     w15, [x3, #240]             // get # of rounds
        mov     x14, sp

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                      // pass key
        mov     x10, x15                    // pass # of rounds
        mov     sp, x17                     // sp is sp
        bl      _bsaes_key_convert
        ldr     q6,  [sp]
        str     q15, [x17]                  // save last round key
        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        ldr     q15, [x4]                   // load IV
        b       .Lcbc_dec_loop

.align  4
.Lcbc_dec_loop:
        subs    x2, x2, #0x8
        bmi     .Lcbc_dec_loop_finish

        ldr     q0, [x0], #16               // load input
        mov     x9, sp                      // pass the key
        ldr     q1, [x0], #16
        mov     x10, x15
        ldr     q2, [x0], #16
        ldr     q3, [x0], #16
        ldr     q4, [x0], #16
        ldr     q5, [x0], #16
        ldr     q6, [x0], #16
        ldr     q7, [x0], #-7*16

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16              // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        eor     v1.16b, v1.16b, v16.16b
        str     q0, [x1], #16               // write output
        ldr     q0, [x0], #16
        str     q1, [x1], #16
        ldr     q1, [x0], #16
        eor     v1.16b, v4.16b, v1.16b
        ldr     q4, [x0], #16
        eor     v2.16b, v2.16b, v4.16b
        eor     v0.16b, v6.16b, v0.16b
        ldr     q4, [x0], #16
        str     q0, [x1], #16
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v4.16b
        ldr     q1, [x0], #16
        str     q2, [x1], #16
        ldr     q2, [x0], #16
        ldr     q15, [x0], #16
        str     q0, [x1], #16
        eor     v0.16b, v5.16b, v2.16b
        eor     v1.16b, v3.16b, v1.16b
        str     q1, [x1], #16
        str     q0, [x1], #16

        b       .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
        adds    x2, x2, #8
        beq     .Lcbc_dec_done

        ldr     q0, [x0], #16               // load input
        cmp     x2, #2
        blo     .Lcbc_dec_one
        ldr     q1, [x0], #16
        mov     x9, sp                      // pass the key
        mov     x10, x15
        beq     .Lcbc_dec_two
        ldr     q2, [x0], #16
        cmp     x2, #4
        blo     .Lcbc_dec_three
        ldr     q3, [x0], #16
        beq     .Lcbc_dec_four
        ldr     q4, [x0], #16
        cmp     x2, #6
        blo     .Lcbc_dec_five
        ldr     q5, [x0], #16
        beq     .Lcbc_dec_six
        ldr     q6, [x0], #-6*16

        bl      _bsaes_decrypt8

        ldr     q5, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        ldr     q10, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v5.16b
        ldr     q5, [x0], #16
        eor     v6.16b, v6.16b, v8.16b
        ldr     q15, [x0]
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q6, [x1], #16
        eor     v1.16b, v3.16b, v5.16b
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_six:
        sub     x0, x0, #0x60
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q5, [x0], #16
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v3.16b
        ldr     q15, [x0]
        eor     v3.16b, v6.16b, v5.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q3, [x1], #16
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_five:
        sub     x0, x0, #0x50
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q5, [x0], #16
        ldr     q7, [x0], #16
        ldr     q8, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q15, [x0]
        eor     v0.16b, v1.16b, v3.16b
        eor     v1.16b, v6.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        str     q0, [x1], #16
        eor     v0.16b, v2.16b, v8.16b
        str     q1, [x1], #16
        str     q3, [x1], #16
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_four:
        sub     x0, x0, #0x40
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q3, [x0], #16
        ldr     q5, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q15, [x0]
        eor     v0.16b, v1.16b, v2.16b
        eor     v1.16b, v6.16b, v3.16b
        eor     v2.16b, v4.16b, v5.16b
        str     q0, [x1], #16
        str     q1, [x1], #16
        str     q2, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_three:
        sub     x0, x0, #0x30
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q3, [x0], #16
        ldr     q15, [x0]
        str     q0, [x1], #16               // write output
        eor     v0.16b, v1.16b, v2.16b
        eor     v1.16b, v6.16b, v3.16b
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_two:
        sub     x0, x0, #0x20
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q15, [x0]
        str     q0, [x1], #16               // write output
        eor     v0.16b, v1.16b, v2.16b
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_one:
        sub     x0, x0, #0x10
        stp     x1, x4, [sp, #-32]!
        str     x14, [sp, #16]
        mov     v8.16b, v15.16b
        mov     v15.16b, v0.16b
        mov     x2, x3
        bl      AES_decrypt
        ldr     x14, [sp, #16]
        ldp     x1, x4, [sp], #32
        ldr     q0, [x1]                    // load result
        eor     v0.16b, v0.16b, v8.16b      // ^= IV
        str     q0, [x1]                    // write output

.align  4
.Lcbc_dec_done:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lcbc_dec_bzero:// wipe key schedule [if any]
        stp     q0, q1, [sp], #32
        cmp     sp, x14
        bne     .Lcbc_dec_bzero
        str     q15, [x4]                   // return IV
        ldp     d8, d9, [sp, #16]
        ldp     d10, d15, [sp, #32]
        ldp     x29, x30, [sp], #48
        ret
.size   ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl  ossl_bsaes_ctr32_encrypt_blocks
.type   ossl_bsaes_ctr32_encrypt_blocks,%function
.align  4
// On entry:
//   x0 -> input text (whole 16-byte blocks)
//   x1 -> output text (whole 16-byte blocks)
//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
//   x3 -> key
//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
//   Output text filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:

        cmp     x2, #8                      // use plain AES for
        blo     .Lctr_enc_short             // small sizes

        stp     x29, x30, [sp, #-80]!
        stp     d8, d9, [sp, #16]
        stp     d10, d11, [sp, #32]
        stp     d12, d13, [sp, #48]
        stp     d14, d15, [sp, #64]

        ldr     w15, [x3, #240]             // get # of rounds
        mov     x14, sp

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                      // pass key
        mov     x10, x15                    // pass # of rounds
        mov     sp, x17                     // sp is sp
        bl      _bsaes_key_convert
        eor     v7.16b, v7.16b, v15.16b     // fix up last round key
        str     q7, [x17]                   // save last round key

        ldr     q0, [x4]                    // load counter
        add     x13, x11, #.LREVM0SR-.LM0_bigendian
        ldr     q4, [sp]                    // load round0 key

        movi    v8.4s, #1                   // compose 1<<96
        movi    v9.16b, #0
        rev32   v15.16b, v0.16b
        rev32   v0.16b, v0.16b
        ext     v11.16b, v9.16b, v8.16b, #4
        rev32   v4.16b, v4.16b
        add     v12.4s, v11.4s, v11.4s      // compose 2<<96
        str     q4, [sp]                    // save adjusted round0 key
        add     v13.4s, v11.4s, v12.4s      // compose 3<<96
        add     v14.4s, v12.4s, v12.4s      // compose 4<<96
        b       .Lctr_enc_loop

.align  4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter

        add     v1.4s, v15.4s, v11.4s       // +1
        add     x9, sp, #0x10               // pass next round key
        add     v2.4s, v15.4s, v12.4s       // +2
        ldr     q9, [x13]                   // .LREVM0SR
        ldr     q8, [sp]                    // load round0 key
        add     v3.4s, v15.4s, v13.4s       // +3
        mov     x10, x15                    // pass rounds
        sub     x11, x13, #.LREVM0SR-.LSR   // pass constants
        add     v6.4s, v2.4s, v14.4s
        add     v4.4s, v15.4s, v14.4s       // +4
        add     v7.4s, v3.4s, v14.4s
        add     v15.4s, v4.4s, v14.4s       // next counter
        add     v5.4s, v1.4s, v14.4s

        bl      _bsaes_encrypt8_alt

        subs    x2, x2, #8
        blo     .Lctr_enc_loop_done

        ldr     q16, [x0], #16
        ldr     q17, [x0], #16
        eor     v1.16b, v1.16b, v17.16b
        ldr     q17, [x0], #16
        eor     v0.16b, v0.16b, v16.16b
        eor     v4.16b, v4.16b, v17.16b
        str     q0, [x1], #16
        ldr     q16, [x0], #16
        str     q1, [x1], #16
        mov     v0.16b, v15.16b
        str     q4, [x1], #16
        ldr     q1, [x0], #16
        eor     v4.16b, v6.16b, v16.16b
        eor     v1.16b, v3.16b, v1.16b
        ldr     q3, [x0], #16
        eor     v3.16b, v7.16b, v3.16b
        ldr     q6, [x0], #16
        eor     v2.16b, v2.16b, v6.16b
        ldr     q6, [x0], #16
        eor     v5.16b, v5.16b, v6.16b
        str     q4, [x1], #16
        str     q1, [x1], #16
        str     q3, [x1], #16
        str     q2, [x1], #16
        str     q5, [x1], #16

        bne     .Lctr_enc_loop
        b       .Lctr_enc_done

.align  4
.Lctr_enc_loop_done:
        add     x2, x2, #8
        ldr     q16, [x0], #16              // load input
        eor     v0.16b, v0.16b, v16.16b
        str     q0, [x1], #16               // write output
        cmp     x2, #2
        blo     .Lctr_enc_done
        ldr     q17, [x0], #16
        eor     v1.16b, v1.16b, v17.16b
        str     q1, [x1], #16
        beq     .Lctr_enc_done
        ldr     q18, [x0], #16
        eor     v4.16b, v4.16b, v18.16b
        str     q4, [x1], #16
        cmp     x2, #4
        blo     .Lctr_enc_done
        ldr     q19, [x0], #16
        eor     v6.16b, v6.16b, v19.16b
        str     q6, [x1], #16
        beq     .Lctr_enc_done
        ldr     q20, [x0], #16
        eor     v3.16b, v3.16b, v20.16b
        str     q3, [x1], #16
        cmp     x2, #6
        blo     .Lctr_enc_done
        ldr     q21, [x0], #16
        eor     v7.16b, v7.16b, v21.16b
        str     q7, [x1], #16
        beq     .Lctr_enc_done
        ldr     q22, [x0]
        eor     v2.16b, v2.16b, v22.16b
        str     q2, [x1], #16

.Lctr_enc_done:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
        stp     q0, q1, [sp], #32
        cmp     sp, x14
        bne     .Lctr_enc_bzero

        ldp     d8, d9, [sp, #16]
        ldp     d10, d11, [sp, #32]
        ldp     d12, d13, [sp, #48]
        ldp     d14, d15, [sp, #64]
        ldp     x29, x30, [sp], #80
        ret

.Lctr_enc_short:
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        str     x23, [sp, #48]

        mov     x19, x0                     // copy arguments
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        ldr     w23, [x4, #12]              // load counter .LSW
        ldr     q1, [x4]                    // load whole counter value
#ifdef __AARCH64EL__
        rev     w23, w23
#endif
        str     q1, [sp, #80]               // copy counter value

.Lctr_enc_short_loop:
        add     x0, sp, #80                 // input counter value
        add     x1, sp, #64                 // output on the stack
        mov     x2, x22                     // key

        bl      AES_encrypt

        ldr     q0, [x19], #16              // load input
        ldr     q1, [sp, #64]               // load encrypted counter
        add     x23, x23, #1
#ifdef __AARCH64EL__
        rev     w0, w23
        str     w0, [sp, #80+12]            // next counter value
#else
        str     w23, [sp, #80+12]           // next counter value
#endif
        eor     v0.16b, v0.16b, v1.16b
        str     q0, [x20], #16              // store output
        subs    x21, x21, #1
        bne     .Lctr_enc_short_loop

        movi    v0.16b, #0
        movi    v1.16b, #0
        stp     q0, q1, [sp, #64]

        ldr     x23, [sp, #48]
        ldp     x21, x22, [sp, #32]
        ldp     x19, x20, [sp, #16]
        ldp     x29, x30, [sp], #96
        ret
.size   ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl  ossl_bsaes_xts_encrypt
.type   ossl_bsaes_xts_encrypt,%function
.align  4
// On entry:
//   x0 -> input plaintext
//   x1 -> output ciphertext
//   x2 -> length of text in bytes (must be at least 16)
//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output ciphertext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_encrypt:
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_encrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
        stp     x19, x20, [sp, #80]
        stp     x21, x22, [sp, #96]
        str     x23, [sp, #112]
        stp     d8, d9, [sp, #128]
        stp     d10, d11, [sp, #144]
        stp     d12, d13, [sp, #160]
        stp     d14, d15, [sp, #176]

        mov     x19, sp
        mov     x20, x0
        mov     x21, x1
        mov     x22, x2
        mov     x23, x3

        // generate initial tweak
        sub     sp, sp, #16
        mov     x0, x5                      // iv[]
        mov     x1, sp
        mov     x2, x4                      // key2
        bl      AES_encrypt
        ldr     q11, [sp], #16

        ldr     w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x23                     // pass key
        mov     x10, x1                     // pass # of rounds
        mov     sp, x17
        bl      _bsaes_key_convert
        eor     v15.16b, v15.16b, v7.16b    // fix up last round key
        str     q15, [x17]                  // save last round key

        subs    x22, x22, #0x80
        blo     .Lxts_enc_short
        b       .Lxts_enc_loop

.align  4
.Lxts_enc_loop:
        ldr     q8, .Lxts_magic
        mov     x10, x1                     // pass rounds
        add     x2, x19, #16
        ldr     q0, [x20], #16
        sshr    v1.2d, v11.2d, #63
        mov     x9, sp                      // pass key schedule
        ldr     q6, .Lxts_magic+16
        add     v2.2d, v11.2d, v11.2d
        cmtst   v3.2d, v11.2d, v6.2d
        and     v1.16b, v1.16b, v8.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        ldr     q4, [x20], #16
        eor     v12.16b, v2.16b, v1.16b
        eor     v1.16b, v4.16b, v12.16b
        eor     v0.16b, v0.16b, v11.16b
        cmtst   v2.2d, v12.2d, v6.2d
        add     v4.2d, v12.2d, v12.2d
        add     x0, x19, #16
        ext     v3.16b, v3.16b, v3.16b, #8
        and     v2.16b, v2.16b, v8.16b
        eor     v13.16b, v4.16b, v3.16b
        ldr     q3, [x20], #16
        ext     v4.16b, v2.16b, v2.16b, #8
        eor     v2.16b, v3.16b, v13.16b
        ldr     q3, [x20], #16
        add     v5.2d, v13.2d, v13.2d
        cmtst   v7.2d, v13.2d, v6.2d
        and     v7.16b, v7.16b, v8.16b
        ldr     q9, [x20], #16
        ext     v7.16b, v7.16b, v7.16b, #8
        ldr     q10, [x20], #16
        eor     v14.16b, v5.16b, v4.16b
        ldr     q16, [x20], #16
        add     v4.2d, v14.2d, v14.2d
        eor     v3.16b, v3.16b, v14.16b
        eor     v15.16b, v4.16b, v7.16b
        add     v5.2d, v15.2d, v15.2d
        ldr     q7, [x20], #16
        cmtst   v4.2d, v14.2d, v6.2d
        and     v17.16b, v4.16b, v8.16b
        cmtst   v18.2d, v15.2d, v6.2d
        eor     v4.16b, v9.16b, v15.16b
        ext     v9.16b, v17.16b, v17.16b, #8
        eor     v9.16b, v5.16b, v9.16b
        add     v17.2d, v9.2d, v9.2d
        and     v18.16b, v18.16b, v8.16b
        eor     v5.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        ext     v10.16b, v18.16b, v18.16b, #8
        cmtst   v9.2d, v9.2d, v6.2d
        and     v9.16b, v9.16b, v8.16b
        eor     v10.16b, v17.16b, v10.16b
        cmtst   v17.2d, v10.2d, v6.2d
        eor     v6.16b, v16.16b, v10.16b
        str     q10, [x2], #16
        ext     v9.16b, v9.16b, v9.16b, #8
        add     v10.2d, v10.2d, v10.2d
        eor     v9.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        eor     v7.16b, v7.16b, v9.16b
        add     v9.2d, v9.2d, v9.2d
        and     v8.16b, v17.16b, v8.16b
        ext     v8.16b, v8.16b, v8.16b, #8
        eor     v8.16b, v9.16b, v8.16b
        str     q8, [x2]                    // next round tweak

        bl      _bsaes_encrypt8

        ldr     q8, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q9, [x0], #16
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        ldr     q10, [x0], #16
        eor     v3.16b, v3.16b, v15.16b
        subs    x22, x22, #0x80
        str     q0, [x21], #16
        ldr     q11, [x0]                   // next round tweak
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v8.16b
        eor     v1.16b, v2.16b, v9.16b
        str     q4, [x21], #16
        eor     v2.16b, v5.16b, v10.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q2, [x21], #16
        bpl     .Lxts_enc_loop

.Lxts_enc_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_enc_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_enc_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_enc_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_enc_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_enc_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_enc_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b
        bmi     .Lxts_enc_6

        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]                   // next round tweak
        mov     x9, sp                      // pass key schedule
        mov     x10, x1
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b

        bl      _bsaes_encrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        eor     v3.16b, v3.16b, v15.16b
        ldr     q11, [x0]                   // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v2.16b, v17.16b
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v3.16b, v3.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        eor     v3.16b, v3.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        mov     v11.16b, v15.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        mov     v11.16b, v14.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!

        bl      AES_encrypt

        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d
        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_enc_done:
        adds    x22, x22, #0x10
        beq     .Lxts_enc_ret

        sub     x6, x21, #0x10
        // Penultimate plaintext block produces final ciphertext part-block
        // plus remaining part of final plaintext block. Move ciphertext part
        // to final position and reuse penultimate ciphertext block buffer to
        // construct final plaintext block
.Lxts_enc_steal:
        ldrb    w0, [x20], #1
        ldrb    w1, [x21, #-0x10]
        strb    w0, [x21, #-0x10]
        strb    w1, [x21], #1

        subs    x22, x22, #1
        bhi     .Lxts_enc_steal

        // Finally encrypt the penultimate ciphertext block using the
        // last tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6
        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

        bl      AES_encrypt

        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_enc_ret:

        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_enc_bzero: // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_enc_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size   ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align  5
.Lxts_magic:
.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000

.globl  ossl_bsaes_xts_decrypt
.type   ossl_bsaes_xts_decrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 -> length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_decrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
        stp     x19, x20, [sp, #80]
        stp     x21, x22, [sp, #96]
        str     x23, [sp, #112]
        stp     d8, d9, [sp, #128]
        stp     d10, d11, [sp, #144]
        stp     d12, d13, [sp, #160]
        stp     d14, d15, [sp, #176]

        mov     x19, sp
        mov     x20, x0
        mov     x21, x1
        mov     x22, x2
        mov     x23, x3

        // generate initial tweak
        sub     sp, sp, #16
        mov     x0, x5                      // iv[]
        mov     x1, sp
        mov     x2, x4                      // key2
        bl      AES_encrypt
        ldr     q11, [sp], #16

        ldr     w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x23                     // pass key
        mov     x10, x1                     // pass # of rounds
        mov     sp, x17
        bl      _bsaes_key_convert
        ldr     q6,  [sp]
        str     q15, [x17]                  // save last round key
        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        sub     x30, x22, #0x10
        tst     x22, #0xf                   // if not multiple of 16
        csel    x22, x30, x22, ne           // subtract another 16 bytes
        subs    x22, x22, #0x80

        blo     .Lxts_dec_short
        b       .Lxts_dec_loop

.align  4
.Lxts_dec_loop:
        ldr     q8, .Lxts_magic
        mov     x10, x1                     // pass rounds
        add     x2, x19, #16
        ldr     q0, [x20], #16
        sshr    v1.2d, v11.2d, #63
        mov     x9, sp                      // pass key schedule
        ldr     q6, .Lxts_magic+16
        add     v2.2d, v11.2d, v11.2d
        cmtst   v3.2d, v11.2d, v6.2d
        and     v1.16b, v1.16b, v8.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        ldr     q4, [x20], #16
        eor     v12.16b, v2.16b, v1.16b
        eor     v1.16b, v4.16b, v12.16b
        eor     v0.16b, v0.16b, v11.16b
        cmtst   v2.2d, v12.2d, v6.2d
        add     v4.2d, v12.2d, v12.2d
        add     x0, x19, #16
        ext     v3.16b, v3.16b, v3.16b, #8
        and     v2.16b, v2.16b, v8.16b
        eor     v13.16b, v4.16b, v3.16b
        ldr     q3, [x20], #16
        ext     v4.16b, v2.16b, v2.16b, #8
        eor     v2.16b, v3.16b, v13.16b
        ldr     q3, [x20], #16
        add     v5.2d, v13.2d, v13.2d
        cmtst   v7.2d, v13.2d, v6.2d
        and     v7.16b, v7.16b, v8.16b
        ldr     q9, [x20], #16
        ext     v7.16b, v7.16b, v7.16b, #8
        ldr     q10, [x20], #16
        eor     v14.16b, v5.16b, v4.16b
        ldr     q16, [x20], #16
        add     v4.2d, v14.2d, v14.2d
        eor     v3.16b, v3.16b, v14.16b
        eor     v15.16b, v4.16b, v7.16b
        add     v5.2d, v15.2d, v15.2d
        ldr     q7, [x20], #16
        cmtst   v4.2d, v14.2d, v6.2d
        and     v17.16b, v4.16b, v8.16b
        cmtst   v18.2d, v15.2d, v6.2d
        eor     v4.16b, v9.16b, v15.16b
        ext     v9.16b, v17.16b, v17.16b, #8
        eor     v9.16b, v5.16b, v9.16b
        add     v17.2d, v9.2d, v9.2d
        and     v18.16b, v18.16b, v8.16b
        eor     v5.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        ext     v10.16b, v18.16b, v18.16b, #8
        cmtst   v9.2d, v9.2d, v6.2d
        and     v9.16b, v9.16b, v8.16b
        eor     v10.16b, v17.16b, v10.16b
        cmtst   v17.2d, v10.2d, v6.2d
        eor     v6.16b, v16.16b, v10.16b
        str     q10, [x2], #16
        ext     v9.16b, v9.16b, v9.16b, #8
        add     v10.2d, v10.2d, v10.2d
        eor     v9.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        eor     v7.16b, v7.16b, v9.16b
        add     v9.2d, v9.2d, v9.2d
        and     v8.16b, v17.16b, v8.16b
        ext     v8.16b, v8.16b, v8.16b, #8
        eor     v8.16b, v9.16b, v8.16b
        str     q8, [x2]                    // next round tweak

        bl      _bsaes_decrypt8

        eor     v6.16b, v6.16b, v13.16b
        eor     v0.16b, v0.16b, v11.16b
        ldr     q8, [x0], #16
        eor     v7.16b, v7.16b, v8.16b
        str     q0, [x21], #16
        eor     v0.16b, v1.16b, v12.16b
        ldr     q1, [x0], #16
        eor     v1.16b, v3.16b, v1.16b
        subs    x22, x22, #0x80
        eor     v2.16b, v2.16b, v15.16b
        eor     v3.16b, v4.16b, v14.16b
        ldr     q4, [x0], #16
        str     q0, [x21], #16
        ldr     q11, [x0]                   // next round tweak
        eor     v0.16b, v5.16b, v4.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q2, [x21], #16
        str     q7, [x21], #16
        str     q1, [x21], #16
        str     q0, [x21], #16
        bpl     .Lxts_dec_loop

.Lxts_dec_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_dec_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_dec_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_dec_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_dec_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_dec_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_dec_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b
        bmi     .Lxts_dec_6

        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]                   // next round tweak
        mov     x9, sp                      // pass key schedule
        mov     x10, x1
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        ldr     q11, [x0]                   // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v3.16b, v17.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        mov     v11.16b, v15.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        mov     v11.16b, v14.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!

        bl      AES_decrypt

        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d
        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_dec_done:
        adds    x22, x22, #0x10
        beq     .Lxts_dec_ret

        // calculate one round of extra tweak for the stolen ciphertext
        ldr     q8, .Lxts_magic
        sshr    v6.2d, v11.2d, #63
        and     v6.16b, v6.16b, v8.16b
        add     v12.2d, v11.2d, v11.2d
        ext     v6.16b, v6.16b, v6.16b, #8
        eor     v12.16b, v12.16b, v6.16b

        // perform the final decryption with the last tweak value
        ldr     q0, [x20], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]

        bl      AES_decrypt

        trn1    v12.2d, v12.2d, v14.2d
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [x21]

        mov     x6, x21
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and reuse penultimate plaintext block buffer to
        // construct final ciphertext block
.Lxts_dec_steal:
        ldrb    w1, [x21]
        ldrb    w0, [x20], #1
        strb    w1, [x21, #0x10]
        strb    w0, [x21], #1

        subs    x22, x22, #1
        bhi     .Lxts_dec_steal

        // Finally decrypt the penultimate plaintext block using the
        // penultimate tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6

        bl      AES_decrypt

        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_dec_ret:

        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_dec_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size   ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt