12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878 |
- #!/usr/bin/env perl
- # Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # Keccak-1600 for ARMv8.
- #
- # June 2017.
- #
- # This is a straightforward KECCAK_1X_ALT implementation. It makes no
- # sense to attempt a SIMD/NEON implementation for the following reason.
- # 64-bit lanes of vector registers can't be addressed as easily as in
- # 32-bit mode. This means that 64-bit NEON is bound to be slower than
- # 32-bit NEON, and this implementation is faster than 32-bit NEON on
- # same processor. Even though it takes more scalar xor's and andn's,
- # it gets compensated by availability of rotate. Not to forget that
- # most processors achieve higher issue rate with scalar instructions.
- #
- # February 2018.
- #
- # Add a hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
- # variant with a register permutation/rotation twist that makes it possible
- # to eliminate copies to temporary registers. If you look closely you'll
- # notice that it uses only one lane of vector registers. The new
- # instructions effectively facilitate parallel hashing, which we don't
- # support [yet?]. But lowest-level core procedure is prepared for it.
- # The inner round is 67 [vector] instructions, so it's not actually
- # obvious that it will provide performance improvement [in serial
- # hash] as long as vector instructions issue rate is limited to 1 per
- # cycle...
- #
- ######################################################################
- # Numbers are cycles per processed byte.
- #
- # r=1088(*)
- #
- # Cortex-A53 13
- # Cortex-A57 12
- # X-Gene 14
- # Mongoose 10
- # Kryo 12
- # Denver 7.8
- # Apple A7 7.2
- # ThunderX2 9.7
- #
- # (*) Corresponds to SHA3-256. No improvement coefficients are listed
- # because they vary too much from compiler to compiler. Newer
- # compiler does much better and improvement varies from 5% on
- # Cortex-A57 to 25% on Cortex-A53. While in comparison to older
- # compiler this code is at least 2x faster...
# Command line: [flavour] [output-file]
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0]      !~ m|\.|     ? shift : undef;

# Directory part of this script's path (with trailing separator), or the
# empty string when the script was invoked without any directory component.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

# The translator lives either next to this script or in ../../perlasm.
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The translator path is quoted so that a source tree whose path contains
# spaces still works.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
- my @rhotates = ([ 0, 1, 62, 28, 27 ], # Rho rotation amounts, indexed [row][column] like @A; interpolated below as "#64-$rhotates[i][j]" in ror/xar operands
- [ 36, 44, 6, 55, 20 ],
- [ 3, 10, 43, 25, 39 ],
- [ 41, 45, 15, 21, 8 ],
- [ 18, 2, 61, 56, 14 ]);
- $code.=<<___; # Emit the 24 round constants (iotas): 64 bytes of zero padding + 24*8 bytes of constants follow the .align 8, and KeccakF1600_int ends its loop by testing the low 8 bits of the table pointer (tst ...,#255)
- #include "arm_arch.h"
- .text
- .align 8 // strategic alignment and padding that allows to use
- // address value as loop termination condition...
- .quad 0,0,0,0,0,0,0,0
- .type iotas,%object
- iotas:
- .quad 0x0000000000000001
- .quad 0x0000000000008082
- .quad 0x800000000000808a
- .quad 0x8000000080008000
- .quad 0x000000000000808b
- .quad 0x0000000080000001
- .quad 0x8000000080008081
- .quad 0x8000000000008009
- .quad 0x000000000000008a
- .quad 0x0000000000000088
- .quad 0x0000000080008009
- .quad 0x000000008000000a
- .quad 0x000000008000808b
- .quad 0x800000000000008b
- .quad 0x8000000000008089
- .quad 0x8000000000008003
- .quad 0x8000000000008002
- .quad 0x8000000000000080
- .quad 0x000000000000800a
- .quad 0x800000008000000a
- .quad 0x8000000080008081
- .quad 0x8000000000008080
- .quad 0x0000000080000001
- .quad 0x8000000080008008
- .size iotas,.-iotas
- ___
- {{{ # Scalar implementation: the 5x5 Keccak state is kept in general-purpose registers
- my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ], # lane A[i][j] -> register x(5*i+j)
- (0, 5, 10, 15, 20));
- $A[3][3] = "x25"; # x18 is reserved, so lane A[3][3] (which would be x18) lives in x25
- my @C = map("x$_", (26,27,28,30)); # scratch C[0..3]; x30 is the link register, so KeccakF1600_int stores it at [sp,#24] and reloads it before ret
- $code.=<<___; # KeccakF1600_int: permutation over the register-resident state; each .Loop pass consumes one iotas entry
- .type KeccakF1600_int,%function
- .align 5
- KeccakF1600_int:
- AARCH64_SIGN_LINK_REGISTER
- adr $C[2],iotas
- stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
- b .Loop
- .align 4
- .Loop:
- ////////////////////////////////////////// Theta
- eor $C[0],$A[0][0],$A[1][0]
- stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
- eor $C[1],$A[0][1],$A[1][1]
- eor $C[2],$A[0][2],$A[1][2]
- eor $C[3],$A[0][3],$A[1][3]
- ___
- $C[4]=$A[0][4]; # A[0][4] was just stored to [sp,#0], so its register doubles as C[4] for the rest of Theta
- $C[5]=$A[1][4]; # likewise the register of A[1][4] (stored to [sp,#8]) doubles as C[5]
- $code.=<<___; # remainder of Theta, using the two borrowed registers
- eor $C[4],$A[0][4],$A[1][4]
- eor $C[0],$C[0],$A[2][0]
- eor $C[1],$C[1],$A[2][1]
- eor $C[2],$C[2],$A[2][2]
- eor $C[3],$C[3],$A[2][3]
- eor $C[4],$C[4],$A[2][4]
- eor $C[0],$C[0],$A[3][0]
- eor $C[1],$C[1],$A[3][1]
- eor $C[2],$C[2],$A[3][2]
- eor $C[3],$C[3],$A[3][3]
- eor $C[4],$C[4],$A[3][4]
- eor $C[0],$C[0],$A[4][0]
- eor $C[2],$C[2],$A[4][2]
- eor $C[1],$C[1],$A[4][1]
- eor $C[3],$C[3],$A[4][3]
- eor $C[4],$C[4],$A[4][4]
- eor $C[5],$C[0],$C[2],ror#63
- eor $A[0][1],$A[0][1],$C[5]
- eor $A[1][1],$A[1][1],$C[5]
- eor $A[2][1],$A[2][1],$C[5]
- eor $A[3][1],$A[3][1],$C[5]
- eor $A[4][1],$A[4][1],$C[5]
- eor $C[5],$C[1],$C[3],ror#63
- eor $C[2],$C[2],$C[4],ror#63
- eor $C[3],$C[3],$C[0],ror#63
- eor $C[4],$C[4],$C[1],ror#63
- eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
- eor $A[1][2],$A[1][2],$C[5]
- eor $A[2][2],$A[2][2],$C[5]
- eor $A[3][2],$A[3][2],$C[5]
- eor $A[4][2],$A[4][2],$C[5]
- eor $A[0][0],$A[0][0],$C[4]
- eor $A[1][0],$A[1][0],$C[4]
- eor $A[2][0],$A[2][0],$C[4]
- eor $A[3][0],$A[3][0],$C[4]
- eor $A[4][0],$A[4][0],$C[4]
- ___
- $C[4]=undef; # end of the borrow: the next heredoc re-loads A[0][4]/A[1][4] from the stack
- $C[5]=undef;
- $code.=<<___; # rest of the round (Theta tail, Rho+Pi, Chi+Iota), then KeccakF1600 and the prologue of SHA3_absorb
- ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
- eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
- eor $A[1][3],$A[1][3],$C[2]
- eor $A[2][3],$A[2][3],$C[2]
- eor $A[3][3],$A[3][3],$C[2]
- eor $A[4][3],$A[4][3],$C[2]
- eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
- eor $A[1][4],$A[1][4],$C[3]
- eor $A[2][4],$A[2][4],$C[3]
- eor $A[3][4],$A[3][4],$C[3]
- eor $A[4][4],$A[4][4],$C[3]
- ////////////////////////////////////////// Rho+Pi
- mov $C[3],$A[0][1]
- ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
- //mov $C[1],$A[0][2]
- ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
- //mov $C[0],$A[0][3]
- ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
- //mov $C[2],$A[0][4]
- ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
- ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
- ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
- ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
- ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
- ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
- ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
- ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
- ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
- ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
- ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
- ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
- ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
- ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
- ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
- ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
- ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
- ror $A[1][0],$C[0],#64-$rhotates[0][3]
- ror $A[2][0],$C[3],#64-$rhotates[0][1]
- ror $A[3][0],$C[2],#64-$rhotates[0][4]
- ror $A[4][0],$C[1],#64-$rhotates[0][2]
- ////////////////////////////////////////// Chi+Iota
- bic $C[0],$A[0][2],$A[0][1]
- bic $C[1],$A[0][3],$A[0][2]
- bic $C[2],$A[0][0],$A[0][4]
- bic $C[3],$A[0][1],$A[0][0]
- eor $A[0][0],$A[0][0],$C[0]
- bic $C[0],$A[0][4],$A[0][3]
- eor $A[0][1],$A[0][1],$C[1]
- ldr $C[1],[sp,#16]
- eor $A[0][3],$A[0][3],$C[2]
- eor $A[0][4],$A[0][4],$C[3]
- eor $A[0][2],$A[0][2],$C[0]
- ldr $C[3],[$C[1]],#8 // Iota[i++]
- bic $C[0],$A[1][2],$A[1][1]
- tst $C[1],#255 // are we done?
- str $C[1],[sp,#16]
- bic $C[1],$A[1][3],$A[1][2]
- bic $C[2],$A[1][0],$A[1][4]
- eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
- bic $C[3],$A[1][1],$A[1][0]
- eor $A[1][0],$A[1][0],$C[0]
- bic $C[0],$A[1][4],$A[1][3]
- eor $A[1][1],$A[1][1],$C[1]
- eor $A[1][3],$A[1][3],$C[2]
- eor $A[1][4],$A[1][4],$C[3]
- eor $A[1][2],$A[1][2],$C[0]
- bic $C[0],$A[2][2],$A[2][1]
- bic $C[1],$A[2][3],$A[2][2]
- bic $C[2],$A[2][0],$A[2][4]
- bic $C[3],$A[2][1],$A[2][0]
- eor $A[2][0],$A[2][0],$C[0]
- bic $C[0],$A[2][4],$A[2][3]
- eor $A[2][1],$A[2][1],$C[1]
- eor $A[2][3],$A[2][3],$C[2]
- eor $A[2][4],$A[2][4],$C[3]
- eor $A[2][2],$A[2][2],$C[0]
- bic $C[0],$A[3][2],$A[3][1]
- bic $C[1],$A[3][3],$A[3][2]
- bic $C[2],$A[3][0],$A[3][4]
- bic $C[3],$A[3][1],$A[3][0]
- eor $A[3][0],$A[3][0],$C[0]
- bic $C[0],$A[3][4],$A[3][3]
- eor $A[3][1],$A[3][1],$C[1]
- eor $A[3][3],$A[3][3],$C[2]
- eor $A[3][4],$A[3][4],$C[3]
- eor $A[3][2],$A[3][2],$C[0]
- bic $C[0],$A[4][2],$A[4][1]
- bic $C[1],$A[4][3],$A[4][2]
- bic $C[2],$A[4][0],$A[4][4]
- bic $C[3],$A[4][1],$A[4][0]
- eor $A[4][0],$A[4][0],$C[0]
- bic $C[0],$A[4][4],$A[4][3]
- eor $A[4][1],$A[4][1],$C[1]
- eor $A[4][3],$A[4][3],$C[2]
- eor $A[4][4],$A[4][4],$C[3]
- eor $A[4][2],$A[4][2],$C[0]
- bne .Loop
- ldr x30,[sp,#24]
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size KeccakF1600_int,.-KeccakF1600_int
- .type KeccakF1600,%function
- .align 5
- KeccakF1600:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#48
- str x0,[sp,#32] // offload argument
- mov $C[0],x0
- ldp $A[0][0],$A[0][1],[x0,#16*0]
- ldp $A[0][2],$A[0][3],[$C[0],#16*1]
- ldp $A[0][4],$A[1][0],[$C[0],#16*2]
- ldp $A[1][1],$A[1][2],[$C[0],#16*3]
- ldp $A[1][3],$A[1][4],[$C[0],#16*4]
- ldp $A[2][0],$A[2][1],[$C[0],#16*5]
- ldp $A[2][2],$A[2][3],[$C[0],#16*6]
- ldp $A[2][4],$A[3][0],[$C[0],#16*7]
- ldp $A[3][1],$A[3][2],[$C[0],#16*8]
- ldp $A[3][3],$A[3][4],[$C[0],#16*9]
- ldp $A[4][0],$A[4][1],[$C[0],#16*10]
- ldp $A[4][2],$A[4][3],[$C[0],#16*11]
- ldr $A[4][4],[$C[0],#16*12]
- bl KeccakF1600_int
- ldr $C[0],[sp,#32]
- stp $A[0][0],$A[0][1],[$C[0],#16*0]
- stp $A[0][2],$A[0][3],[$C[0],#16*1]
- stp $A[0][4],$A[1][0],[$C[0],#16*2]
- stp $A[1][1],$A[1][2],[$C[0],#16*3]
- stp $A[1][3],$A[1][4],[$C[0],#16*4]
- stp $A[2][0],$A[2][1],[$C[0],#16*5]
- stp $A[2][2],$A[2][3],[$C[0],#16*6]
- stp $A[2][4],$A[3][0],[$C[0],#16*7]
- stp $A[3][1],$A[3][2],[$C[0],#16*8]
- stp $A[3][3],$A[3][4],[$C[0],#16*9]
- stp $A[4][0],$A[4][1],[$C[0],#16*10]
- stp $A[4][2],$A[4][3],[$C[0],#16*11]
- str $A[4][4],[$C[0],#16*12]
- ldp x19,x20,[x29,#16]
- add sp,sp,#48
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size KeccakF1600,.-KeccakF1600
- .globl SHA3_absorb
- .type SHA3_absorb,%function
- .align 5
- SHA3_absorb:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
- stp x0,x1,[sp,#32] // offload arguments
- stp x2,x3,[sp,#48]
- mov $C[0],x0 // uint64_t A[5][5]
- mov $C[1],x1 // const void *inp
- mov $C[2],x2 // size_t len
- mov $C[3],x3 // size_t bsz
- ldp $A[0][0],$A[0][1],[$C[0],#16*0]
- ldp $A[0][2],$A[0][3],[$C[0],#16*1]
- ldp $A[0][4],$A[1][0],[$C[0],#16*2]
- ldp $A[1][1],$A[1][2],[$C[0],#16*3]
- ldp $A[1][3],$A[1][4],[$C[0],#16*4]
- ldp $A[2][0],$A[2][1],[$C[0],#16*5]
- ldp $A[2][2],$A[2][3],[$C[0],#16*6]
- ldp $A[2][4],$A[3][0],[$C[0],#16*7]
- ldp $A[3][1],$A[3][2],[$C[0],#16*8]
- ldp $A[3][3],$A[3][4],[$C[0],#16*9]
- ldp $A[4][0],$A[4][1],[$C[0],#16*10]
- ldp $A[4][2],$A[4][3],[$C[0],#16*11]
- ldr $A[4][4],[$C[0],#16*12]
- b .Loop_absorb
- .align 4
- .Loop_absorb:
- subs $C[0],$C[2],$C[3] // len - bsz
- blo .Labsorbed
- str $C[0],[sp,#48] // save len - bsz
- ___
- for (my $i=0; $i<24; $i+=2) { # XOR input lanes 0..23 into the state, two per iteration; blo leaves after the even lane if bsz < 8*($i+2), beq (same cmp flags) leaves after the odd lane if bsz == 8*($i+2)
- my $j = $i+1; # index of the odd lane of the pair
- $code.=<<___;
- ldr $C[0],[$C[1]],#8 // *inp++
- #ifdef __AARCH64EB__
- rev $C[0],$C[0]
- #endif
- eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
- cmp $C[3],#8*($i+2)
- blo .Lprocess_block
- ldr $C[0],[$C[1]],#8 // *inp++
- #ifdef __AARCH64EB__
- rev $C[0],$C[0]
- #endif
- eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
- beq .Lprocess_block
- ___
- }
- $code.=<<___; # lane 24 (A[4][4]) is reached only when bsz exceeds 8*24 bytes; then permute, loop, and finally store the state and return the unprocessed length
- ldr $C[0],[$C[1]],#8 // *inp++
- #ifdef __AARCH64EB__
- rev $C[0],$C[0]
- #endif
- eor $A[4][4],$A[4][4],$C[0]
- .Lprocess_block:
- str $C[1],[sp,#40] // save inp
- bl KeccakF1600_int
- ldr $C[1],[sp,#40] // restore arguments
- ldp $C[2],$C[3],[sp,#48]
- b .Loop_absorb
- .align 4
- .Labsorbed:
- ldr $C[1],[sp,#32]
- stp $A[0][0],$A[0][1],[$C[1],#16*0]
- stp $A[0][2],$A[0][3],[$C[1],#16*1]
- stp $A[0][4],$A[1][0],[$C[1],#16*2]
- stp $A[1][1],$A[1][2],[$C[1],#16*3]
- stp $A[1][3],$A[1][4],[$C[1],#16*4]
- stp $A[2][0],$A[2][1],[$C[1],#16*5]
- stp $A[2][2],$A[2][3],[$C[1],#16*6]
- stp $A[2][4],$A[3][0],[$C[1],#16*7]
- stp $A[3][1],$A[3][2],[$C[1],#16*8]
- stp $A[3][3],$A[3][4],[$C[1],#16*9]
- stp $A[4][0],$A[4][1],[$C[1],#16*10]
- stp $A[4][2],$A[4][3],[$C[1],#16*11]
- str $A[4][4],[$C[1],#16*12]
- mov x0,$C[2] // return value
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size SHA3_absorb,.-SHA3_absorb
- ___
- { # SHA3_squeeze: copy the state out to the caller's buffer, calling KeccakF1600 each time a full block has been emitted
- my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22)); # copies of the four arguments kept in x19..x22 (saved/restored by the prologue/epilogue) across the bl KeccakF1600
- $code.=<<___;
- .globl SHA3_squeeze
- .type SHA3_squeeze,%function
- .align 5
- SHA3_squeeze:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-48]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- mov $A_flat,x0 // put aside arguments
- mov $out,x1
- mov $len,x2
- mov $bsz,x3
- .Loop_squeeze:
- ldr x4,[x0],#8
- cmp $len,#8
- blo .Lsqueeze_tail
- #ifdef __AARCH64EB__
- rev x4,x4
- #endif
- str x4,[$out],#8
- subs $len,$len,#8
- beq .Lsqueeze_done
- subs x3,x3,#8
- bhi .Loop_squeeze
- mov x0,$A_flat
- bl KeccakF1600
- mov x0,$A_flat
- mov x3,$bsz
- b .Loop_squeeze
- .align 4
- .Lsqueeze_tail:
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done
- strb w4,[$out],#1
- .Lsqueeze_done:
- ldp x19,x20,[sp,#16]
- ldp x21,x22,[sp,#32]
- ldp x29,x30,[sp],#48
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size SHA3_squeeze,.-SHA3_squeeze
- ___
- } }}}
- {{{ # ARMv8.2 SHA3-extension implementation (eor3/rax1/xar/bcax): the 5x5 state is kept in vector registers
- my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b", # lane A[i][j] -> v(5*i+j).16b, i.e. v0..v24
- "v".($_+3).".16b", "v".($_+4).".16b" ],
- (0, 5, 10, 15, 20));
- my @C = map("v$_.16b", (25..31)); # temporaries C[0..6] = v25..v31
- my @D = @C[4,5,6,2,3]; # D[0..4] alias C[4],C[5],C[6],C[2],C[3] - the registers the rax1 instructions below write D[0]..D[4] into
- $code.=<<___; # KeccakF1600_ce: 24 rounds counted down in x9, round constants fetched through x10; then the prologue of KeccakF1600_cext
- .type KeccakF1600_ce,%function
- .align 5
- KeccakF1600_ce:
- mov x9,#24
- adr x10,iotas
- b .Loop_ce
- .align 4
- .Loop_ce:
- ////////////////////////////////////////////////// Theta
- eor3 $C[0],$A[4][0],$A[3][0],$A[2][0]
- eor3 $C[1],$A[4][1],$A[3][1],$A[2][1]
- eor3 $C[2],$A[4][2],$A[3][2],$A[2][2]
- eor3 $C[3],$A[4][3],$A[3][3],$A[2][3]
- eor3 $C[4],$A[4][4],$A[3][4],$A[2][4]
- eor3 $C[0],$C[0], $A[1][0],$A[0][0]
- eor3 $C[1],$C[1], $A[1][1],$A[0][1]
- eor3 $C[2],$C[2], $A[1][2],$A[0][2]
- eor3 $C[3],$C[3], $A[1][3],$A[0][3]
- eor3 $C[4],$C[4], $A[1][4],$A[0][4]
- rax1 $C[5],$C[0],$C[2] // D[1]
- rax1 $C[6],$C[1],$C[3] // D[2]
- rax1 $C[2],$C[2],$C[4] // D[3]
- rax1 $C[3],$C[3],$C[0] // D[4]
- rax1 $C[4],$C[4],$C[1] // D[0]
- ////////////////////////////////////////////////// Theta+Rho+Pi
- xar $C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
- xar $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
- xar $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
- xar $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
- xar $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
- xar $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
- xar $C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
- xar $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
- xar $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
- xar $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
- xar $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
- xar $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
- xar $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
- xar $D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
- xar $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
- xar $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
- xar $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
- xar $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
- xar $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
- eor $A[0][0],$A[0][0],$D[0]
- xar $D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
- xar $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
- xar $D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
- xar $D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
- xar $D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
- ////////////////////////////////////////////////// Chi+Iota
- bcax $A[4][0],$C[1], $A[4][2],$A[1][3] // A[1][3]=A[4][1]
- bcax $A[4][1],$A[1][3],$A[4][3],$A[4][2] // A[1][3]=A[4][1]
- bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
- bcax $A[4][3],$A[4][3],$C[1], $A[4][4]
- bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1]
- ld1r {$C[1]},[x10],#8
- bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3]
- bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3]
- bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
- bcax $A[3][0],$A[3][0],$D[1], $A[3][1]
- bcax $A[3][1],$A[3][1],$A[0][3],$D[1] // A[0][3]=A[3][3]
- bcax $A[2][0],$C[0], $A[2][2],$D[2]
- bcax $A[2][1],$D[2], $A[2][3],$A[2][2]
- bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
- bcax $A[2][3],$A[2][3],$C[0], $A[2][4]
- bcax $A[2][4],$A[2][4],$D[2], $C[0]
- bcax $A[1][2],$D[0], $A[1][4],$A[0][4] // A[0][4]=A[1][3]
- bcax $A[1][3],$A[0][4],$A[1][0],$A[1][4] // A[0][4]=A[1][3]
- bcax $A[1][4],$A[1][4],$A[1][1],$A[1][0]
- bcax $A[1][0],$A[1][0],$D[0], $A[1][1]
- bcax $A[1][1],$A[1][1],$A[0][4],$D[0] // A[0][4]=A[1][3]
- bcax $A[0][3],$D[3], $A[0][0],$D[4]
- bcax $A[0][4],$D[4], $A[0][1],$A[0][0]
- bcax $A[0][0],$A[0][0],$A[0][2],$A[0][1]
- bcax $A[0][1],$A[0][1],$D[3], $A[0][2]
- bcax $A[0][2],$A[0][2],$D[4], $D[3]
- eor $A[0][0],$A[0][0],$C[1]
- subs x9,x9,#1
- bne .Loop_ce
- ret
- .size KeccakF1600_ce,.-KeccakF1600_ce
- .type KeccakF1600_cext,%function
- .align 5
- KeccakF1600_cext:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
- stp d8,d9,[sp,#16] // per ABI requirement
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
- ___
- for($i=0; $i<24; $i+=2) { # load A[5][5], two lanes per ldp; $i is not declared with my, so it is still visible (== 24) to the ldr emitted after the loop
- my $j=$i+1;
- $code.=<<___;
- ldp d$i,d$j,[x0,#8*$i]
- ___
- }
- $code.=<<___; # $i == 24 here: load the 25th lane, run the permutation, reload x30 clobbered by bl
- ldr d24,[x0,#8*$i]
- bl KeccakF1600_ce
- ldr x30,[sp,#8]
- ___
- for($i=0; $i<24; $i+=2) { # store A[5][5]; again leaves $i == 24 for the str below
- my $j=$i+1;
- $code.=<<___;
- stp d$i,d$j,[x0,#8*$i]
- ___
- }
- $code.=<<___; # store the 25th lane and restore d8..d15; x30 was already reloaded above, so only x29 is popped
- str d24,[x0,#8*$i]
- ldp d8,d9,[sp,#16]
- ldp d10,d11,[sp,#32]
- ldp d12,d13,[sp,#48]
- ldp d14,d15,[sp,#64]
- ldr x29,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size KeccakF1600_cext,.-KeccakF1600_cext
- ___
- { # SHA3_absorb_cext: XOR input blocks into the vector-resident state and permute after each block
- my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3)); # arguments stay in x0..x3; KeccakF1600_ce itself only uses x9/x10 as general-purpose registers
- $code.=<<___;
- .globl SHA3_absorb_cext
- .type SHA3_absorb_cext,%function
- .align 5
- SHA3_absorb_cext:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
- stp d8,d9,[sp,#16] // per ABI requirement
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
- ___
- for($i=0; $i<24; $i+=2) { # load A[5][5]; non-lexical $i is 24 afterwards for the ldr d24 below
- my $j=$i+1;
- $code.=<<___;
- ldp d$i,d$j,[x0,#8*$i]
- ___
- }
- $code.=<<___;
- ldr d24,[x0,#8*$i]
- b .Loop_absorb_ce
- .align 4
- .Loop_absorb_ce:
- subs $len,$len,$bsz // len - bsz
- blo .Labsorbed_ce
- ___
- for (my $i=0; $i<24; $i+=2) { # XOR input lanes 0..23 into the state in pairs; blo/beq on the cmp flags leave the sequence once bsz bytes have been consumed
- my $j = $i+1;
- $code.=<<___;
- ldr d31,[$inp],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
- cmp $bsz,#8*($i+2)
- blo .Lprocess_block_ce
- ldr d31,[$inp],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
- beq .Lprocess_block_ce
- ___
- }
- $code.=<<___; # lane 24 (A[4][4]) is reached only when bsz exceeds 8*24 bytes
- ldr d31,[$inp],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor $A[4][4],$A[4][4],v31.16b
- .Lprocess_block_ce:
- bl KeccakF1600_ce
- b .Loop_absorb_ce
- .align 4
- .Labsorbed_ce:
- ___
- for($i=0; $i<24; $i+=2) { # store A[5][5]; non-lexical $i is 24 afterwards for the str d24 below
- my $j=$i+1;
- $code.=<<___;
- stp d$i,d$j,[x0,#8*$i]
- ___
- }
- $code.=<<___; # the subs at .Loop_absorb_ce already subtracted bsz from len, so bsz is added back to return the unprocessed length
- str d24,[x0,#8*$i]
- add x0,$len,$bsz // return value
- ldp d8,d9,[sp,#16]
- ldp d10,d11,[sp,#32]
- ldp d12,d13,[sp,#48]
- ldp d14,d15,[sp,#64]
- ldp x29,x30,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size SHA3_absorb_cext,.-SHA3_absorb_cext
- ___
- }
- { # SHA3_squeeze_cext: copy the state out to the caller's buffer, calling KeccakF1600_cext each time a full block has been emitted
- my ($ctx,$out,$len,$bsz) = map("x$_",(0..3)); # arguments stay in x0..x3; x9/x10 hold the running state pointer and the bytes left in the current block
- $code.=<<___;
- .globl SHA3_squeeze_cext
- .type SHA3_squeeze_cext,%function
- .align 5
- SHA3_squeeze_cext:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- mov x9,$ctx
- mov x10,$bsz
- .Loop_squeeze_ce:
- ldr x4,[x9],#8
- cmp $len,#8
- blo .Lsqueeze_tail_ce
- #ifdef __AARCH64EB__
- rev x4,x4
- #endif
- str x4,[$out],#8
- beq .Lsqueeze_done_ce
- sub $len,$len,#8
- subs x10,x10,#8
- bhi .Loop_squeeze_ce
- bl KeccakF1600_cext
- ldr x30,[sp,#8]
- mov x9,$ctx
- mov x10,$bsz
- b .Loop_squeeze_ce
- .align 4
- .Lsqueeze_tail_ce:
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done_ce
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done_ce
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done_ce
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done_ce
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done_ce
- strb w4,[$out],#1
- lsr x4,x4,#8
- subs $len,$len,#1
- beq .Lsqueeze_done_ce
- strb w4,[$out],#1
- .Lsqueeze_done_ce:
- ldr x29,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size SHA3_squeeze_cext,.-SHA3_squeeze_cext
- ___
- } }}}
- $code.=<<___; # append a NUL-terminated identification string (.asciz) to the generated assembly
- .asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
- ___
{   # Base encodings of the ARMv8.2 SHA3 instructions. unsha3() ORs the
    # register numbers and the immediate into these.
    my %opcode = (
        "rax1" => 0xce608c00,   "eor3" => 0xce000000,
        "bcax" => 0xce200000,   "xar"  => 0xce800000 );

    # unsha3($mnemonic, $arg)
    #
    # Hand-encode one SHA3-extension instruction so that assemblers without
    # support for it can still build the module.
    #
    #   $mnemonic  one of rax1, eor3, bcax, xar
    #   $arg       operand text as it appears in the generated assembly, e.g.
    #              "v25.16b,v20.16b,v15.16b,v10.16b" or
    #              "v25.16b, v1.16b,v30.16b,#64-1" (a trailing comment is fine)
    #
    # Returns ".inst\t0x<encoding>\t//<mnemonic> <arg>". If the mnemonic is
    # unknown or the operands cannot be parsed, the instruction is returned
    # unchanged ("<mnemonic>\t<arg>") instead of an empty or bogus encoding,
    # so the assembler sees it and can report the problem. Dies if the fourth
    # operand cannot be evaluated.
    sub unsha3 {
        my ($mnemonic,$arg)=@_;

        if (exists $opcode{$mnemonic} &&
            $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/) {
            # Copy the captures at once; later matches would clobber $1..$4.
            my ($rd,$rn,$rm,$ra) = ($1,$2,$3,$4);

            $rm = 0 unless defined $rm;

            if (defined $ra) {
                # Fourth operand is a register number (eor3/bcax) or an
                # immediate such as "64-1" (xar). The pattern restricts it
                # to digits and '-', so the string eval is pure arithmetic.
                my $expr = $ra;
                $ra = eval($expr);
                die "unsha3: can't evaluate '$expr' in '$mnemonic $arg'"
                    unless defined $ra;
            } else {
                $ra = 0;        # three-operand form (rax1)
            }

            return sprintf ".inst\t0x%08x\t//%s %s",
                $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
                $mnemonic,$arg;
        }

        return "$mnemonic\t$arg";
    }
}
- foreach(split("\n",$code)) { # post-process the generated assembly one line at a time ($_ is edited in place)
- s/\`([^\`]*)\`/eval($1)/ge; # replace each `...` span with the result of evaluating it as Perl
- m/\bld1r\b/ and s/\.16b/.2d/g or # ld1r lines: rewrite .16b to .2d; when that substitution happens the right-hand side of "or" is skipped
- s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge; # other lines: replace SHA3-extension instructions with the text returned by unsha3()
- print $_,"\n";
- }
- close STDOUT or die "error closing STDOUT: $!"; # STDOUT is aliased to the pipe into arm-xlate.pl, so write errors surface here
|