|
- #!/usr/bin/env perl
- # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # [ABI- and endian-neutral] Keccak-1600 for C64x.
- #
- # June 2017.
- #
- # This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
- # with bit interleaving. 64-bit values are simply split between the A-
- # and B-files, with the A-file holding the least significant halves. This
- # works out perfectly, because all operations, including cross-communications
- # [in rotate operations], are always complementary. Performance is
- # [incredible for a 32-bit processor] 10.9 cycles per processed byte
- # for r=1088, which corresponds to SHA3-256. This is >15x faster than
- # compiler-generated KECCAK_1X_ALT code, and >10x faster than other variants.
- # On average the processor ends up issuing ~4.5 instructions per cycle...
- # Register allocation. $A[y][x] is the register number assigned to state
- # lane A[y][x]; the generated code prefixes it with "A" or "B" to name the
- # register in either file. Rows start at registers 5, 10, 16, 21 and 26.
- my @A = map { my $first = $_; [ map { $first + $_ } 0 .. 4 ] } (5, 10, 16, 21, 26);
- $A[1][4] = 31; # B14 is reserved, A14 is used as iota[]
- # Exchange the numbers given to A[3][0] and A[4][1], so that A[3][0]:A[4][0]
- # becomes 27:26, the register pair the round code spills with STDW/LDDW.
- {
- my $held = $A[3][0];
- $A[3][0] = $A[4][1];
- $A[4][1] = $held;
- }
- # Scratch registers: 0..4 plus the two registers that get spilled.
- my @C = (0, 1, 2, 3, 4, map { $A[$_][0] } (3, 4));
- # Register used as the pointer into the iotas table.
- my $iotas = 'A14';
- # Rotation counts indexed [y][x]; these are what ROL64 receives as $rot.
- my @rhotates = (
- [  0,  1, 62, 28, 27 ],
- [ 36, 44,  6, 55, 20 ],
- [  3, 10, 43, 25, 39 ],
- [ 41, 45, 15, 21,  8 ],
- [ 18,  2, 61, 56, 14 ],
- );
- sub ROL64 { # append to $code a pair of ROTL instructions rotating the 64-bit value held in A$src/B$src by $rot into A$dst/B$dst
- my ($src,$rot,$dst,$p) = @_; # $src/$dst: register numbers, $rot: rotation count, $p: text placed before the first ROTL (callers pass "||" or omit it)
- if ($rot&1) { # odd count: the halves change register files, B->A rotated by $rot/2+1 and A->B by $rot/2
- $code.=<<___; # only $rot is interpolated; "/2+1" and "/2" reach the assembler as text, which presumably evaluates them with integer division
- $p ROTL B$src,$rot/2+1,A$dst
- || ROTL A$src,$rot/2, B$dst
- ___
- } else { # even count: each half stays in its own register file and is rotated by $rot/2
- $code.=<<___;
- $p ROTL A$src,$rot/2,A$dst
- || ROTL B$src,$rot/2,B$dst
- ___
- }
- }
- ########################################################################
- # Stack frame layout
- #
- # SP--->+------+------+
- # | | |
- # +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int
- # | | |
- # +2--->+------+------+<- -8
- # | | |
- # +3--->+------+------+<- -7
- # | A2 | A3 | A3:A2 are preserved by KeccakF1600_int
- # +4--->+------+------+<- -6
- # | B2 | B3 | B3:B2 are preserved by KeccakF1600_int
- # +5--->+------+------+<- -5 below is ABI-compliant layout
- # | A10 | A11 |
- # +6--->+------+------+<- -4
- # | A12 | A13 |
- # +7--->+------+------+<- -3
- # | A14 | B3 |
- # +8--->+------+------+<- -2
- # | B10 | B11 |
- # +9--->+------+------+<- -1
- # | B12 | B13 |
- # +------+------+<---FP
- # | A15 |
- # +------+--
- $code.=<<___; # assembler directives and register aliases, the _KeccakF1600_int/_KeccakF1600_cheat entry points, and the Theta step of the round loop
- .text
- .if .ASSEMBLER_VERSION<7000000
- .asg 0,__TI_EABI__
- .endif
- .if __TI_EABI__
- .nocmp
- .asg KeccakF1600,_KeccakF1600
- .asg SHA3_absorb,_SHA3_absorb
- .asg SHA3_squeeze,_SHA3_squeeze
- .endif
- .asg B3,RA
- .asg A15,FP
- .asg B15,SP
- .align 32
- _KeccakF1600_int:
- .asmfunc
- STDW A3:A2,*FP[-7]
- || STDW B3:B2,*SP[4]
- _KeccakF1600_cheat:
- .if __TI_EABI__
- ADDKPC _KeccakF1600_int,B0
- || MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
- MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
- .else
- ADDKPC _KeccakF1600_int,B0
- || MVKL (iotas-_KeccakF1600_int),$iotas
- MVKH (iotas-_KeccakF1600_int),$iotas
- .endif
- ADD B0,$iotas,$iotas
- loop?:
- XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
- || XOR B$A[0][2],B$A[1][2],B$C[2]
- || XOR A$A[0][3],A$A[1][3],A$C[3]
- || XOR B$A[0][3],B$A[1][3],B$C[3]
- || XOR A$A[0][0],A$A[1][0],A$C[0]
- || XOR B$A[0][0],B$A[1][0],B$C[0]
- XOR A$A[2][2],A$C[2],A$C[2]
- || XOR B$A[2][2],B$C[2],B$C[2]
- || XOR A$A[2][3],A$C[3],A$C[3]
- || XOR B$A[2][3],B$C[3],B$C[3]
- || XOR A$A[2][0],A$C[0],A$C[0]
- || XOR B$A[2][0],B$C[0],B$C[0]
- XOR A$A[3][2],A$C[2],A$C[2]
- || XOR B$A[3][2],B$C[2],B$C[2]
- || XOR A$A[3][3],A$C[3],A$C[3]
- || XOR B$A[3][3],B$C[3],B$C[3]
- || XOR A$A[3][0],A$C[0],A$C[0]
- || XOR B$A[3][0],B$C[0],B$C[0]
- XOR A$A[4][2],A$C[2],A$C[2]
- || XOR B$A[4][2],B$C[2],B$C[2]
- || XOR A$A[4][3],A$C[3],A$C[3]
- || XOR B$A[4][3],B$C[3],B$C[3]
- || XOR A$A[4][0],A$C[0],A$C[0]
- || XOR B$A[4][0],B$C[0],B$C[0]
- XOR A$A[0][4],A$A[1][4],A$C[4]
- || XOR B$A[0][4],B$A[1][4],B$C[4]
- || XOR A$A[0][1],A$A[1][1],A$C[1]
- || XOR B$A[0][1],B$A[1][1],B$C[1]
- || STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
- STDW B$A[3][0]:B$A[4][0],*SP[2]
- || XOR A$A[2][4],A$C[4],A$C[4]
- || XOR B$A[2][4],B$C[4],B$C[4]
- || XOR A$A[2][1],A$C[1],A$C[1]
- || XOR B$A[2][1],B$C[1],B$C[1]
- || ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
- || ROTL A$C[2],0,B$C[5]
- XOR A$A[3][4],A$C[4],A$C[4]
- || XOR B$A[3][4],B$C[4],B$C[4]
- || XOR A$A[3][1],A$C[1],A$C[1]
- || XOR B$A[3][1],B$C[1],B$C[1]
- || ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
- || ROTL A$C[3],0,B$C[6]
- XOR A$A[4][4],A$C[4],A$C[4]
- || XOR B$A[4][4],B$C[4],B$C[4]
- || XOR A$A[4][1],A$C[1],A$C[1]
- || XOR B$A[4][1],B$C[1],B$C[1]
- || XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
- || XOR B$C[0],B$C[5],B$C[5]
- XOR A$C[5],A$A[0][1],A$A[0][1]
- || XOR B$C[5],B$A[0][1],B$A[0][1]
- || XOR A$C[5],A$A[1][1],A$A[1][1]
- || XOR B$C[5],B$A[1][1],B$A[1][1]
- || XOR A$C[5],A$A[2][1],A$A[2][1]
- || XOR B$C[5],B$A[2][1],B$A[2][1]
- XOR A$C[5],A$A[3][1],A$A[3][1]
- || XOR B$C[5],B$A[3][1],B$A[3][1]
- || XOR A$C[5],A$A[4][1],A$A[4][1]
- || XOR B$C[5],B$A[4][1],B$A[4][1]
- || ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
- || ROTL A$C[4],0,B$C[5]
- || XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
- || XOR B$C[1],B$C[6],B$C[6]
- XOR A$C[6],A$A[0][2],A$A[0][2]
- || XOR B$C[6],B$A[0][2],B$A[0][2]
- || XOR A$C[6],A$A[1][2],A$A[1][2]
- || XOR B$C[6],B$A[1][2],B$A[1][2]
- || XOR A$C[6],A$A[2][2],A$A[2][2]
- || XOR B$C[6],B$A[2][2],B$A[2][2]
- || ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
- || ROTL A$C[1],0,B$C[1]
- XOR A$C[6],A$A[3][2],A$A[3][2]
- || XOR B$C[6],B$A[3][2],B$A[3][2]
- || XOR A$C[6],A$A[4][2],A$A[4][2]
- || XOR B$C[6],B$A[4][2],B$A[4][2]
- || ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
- || ROTL A$C[0],0,B$C[6]
- || XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
- || XOR B$C[5],B$C[2],B$C[2]
- XOR A$C[2],A$A[0][3],A$A[0][3]
- || XOR B$C[2],B$A[0][3],B$A[0][3]
- || XOR A$C[2],A$A[1][3],A$A[1][3]
- || XOR B$C[2],B$A[1][3],B$A[1][3]
- || XOR A$C[2],A$A[2][3],A$A[2][3]
- || XOR B$C[2],B$A[2][3],B$A[2][3]
- XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
- || XOR B$C[6],B$C[3],B$C[3]
- || LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
- || LDDW *SP[2],B$A[3][0]:B$A[4][0]
- || XOR A$C[2],A$A[3][3],A$A[3][3]
- || XOR B$C[2],B$A[3][3],B$A[3][3]
- XOR A$C[2],A$A[4][3],A$A[4][3]
- || XOR B$C[2],B$A[4][3],B$A[4][3]
- || XOR A$C[3],A$A[0][4],A$A[0][4]
- || XOR B$C[3],B$A[0][4],B$A[0][4]
- || XOR A$C[3],A$A[1][4],A$A[1][4]
- || XOR B$C[3],B$A[1][4],B$A[1][4]
- XOR A$C[3],A$A[2][4],A$A[2][4]
- || XOR B$C[3],B$A[2][4],B$A[2][4]
- || XOR A$C[3],A$A[3][4],A$A[3][4]
- || XOR B$C[3],B$A[3][4],B$A[3][4]
- || XOR A$C[3],A$A[4][4],A$A[4][4]
- || XOR B$C[3],B$A[4][4],B$A[4][4]
- XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
- || XOR B$C[1],B$C[4],B$C[4]
- || MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
- || MV B$A[0][1],B$C[1]
- ___
- &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||"); # "||" prefix: this ROTL pair is appended to the instruction group emitted just above
- $code.=<<___; # remaining Theta XORs for column 0, interleaved with saving row-0 lanes into C[] before they are overwritten
- XOR A$C[4],A$A[0][0],A$A[0][0]
- || XOR B$C[4],B$A[0][0],B$A[0][0]
- || XOR A$C[4],A$A[1][0],A$A[1][0]
- || XOR B$C[4],B$A[1][0],B$A[1][0]
- || MV A$A[0][3],A$C[3]
- || MV B$A[0][3],B$C[3]
- ___
- &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
- $code.=<<___;
- XOR A$C[4],A$A[2][0],A$A[2][0]
- || XOR B$C[4],B$A[2][0],B$A[2][0]
- || XOR A$C[4],A$A[3][0],A$A[3][0]
- || XOR B$C[4],B$A[3][0],B$A[3][0]
- || MV A$A[0][2],A$C[2]
- || MV B$A[0][2],B$C[2]
- ___
- &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
- $code.=<<___;
- XOR A$C[4],A$A[4][0],A$A[4][0]
- || XOR B$C[4],B$A[4][0],B$A[4][0]
- || MV A$A[0][4],A$C[4]
- || MV B$A[0][4],B$C[4]
- ___
- &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
- &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]); # no prefix from here on: each call's first ROTL is emitted without a leading "||"
- $code.=<<___; # load the first word of the current iotas entry into A$C[0]; the pointer is post-incremented by two words
- || LDW *${iotas}++[2],A$C[0]
- ___
- &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
- $code.=<<___; # load the second word of the same entry (index -1 from the already advanced pointer) into B$C[0]
- || LDW *${iotas}[-1],B$C[0]
- ___
- &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]); # each call rotates its source lane into a destination that an earlier call has already read
- &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
- &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
- &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
- &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
- &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
- &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
- &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
- &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
- &ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
- &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
- &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
- &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
- &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
- #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
- &ROL64 ($C[1], $rhotates[0][1],$A[2][0]); # sources are now the C[] copies of the row-0 lanes saved by the MV instructions above
- &ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
- &ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
- $code.=<<___; # Chi starts here: these ANDNs carry "||" and so join the ROTL pair emitted by the call just above
- || ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
- || ANDN B$A[0][2],B$A[0][1],B$C[4]
- || ANDN A$A[0][3],A$A[0][2],A$C[1]
- || ANDN B$A[0][3],B$A[0][2],B$C[1]
- || ANDN A$A[0][4],A$A[0][3],A$C[2]
- || ANDN B$A[0][4],B$A[0][3],B$C[2]
- ___
- &ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # the rotation postponed from the list above
- $code.=<<___; # rest of the round (Chi+Iota) with the loop branch and exit, then _KeccakF1600 and the state-loading prologue of _SHA3_absorb
- || ANDN A$A[0][0],A$A[0][4],A$C[3]
- || ANDN B$A[0][0],B$A[0][4],B$C[3]
- || XOR A$C[4],A$A[0][0],A$A[0][0]
- || XOR B$C[4],B$A[0][0],B$A[0][0]
- || ANDN A$A[0][1],A$A[0][0],A$C[4]
- || ANDN B$A[0][1],B$A[0][0],B$C[4]
- XOR A$C[1],A$A[0][1],A$A[0][1]
- || XOR B$C[1],B$A[0][1],B$A[0][1]
- || XOR A$C[2],A$A[0][2],A$A[0][2]
- || XOR B$C[2],B$A[0][2],B$A[0][2]
- || XOR A$C[3],A$A[0][3],A$A[0][3]
- || XOR B$C[3],B$A[0][3],B$A[0][3]
- XOR A$C[4],A$A[0][4],A$A[0][4]
- || XOR B$C[4],B$A[0][4],B$A[0][4]
- || XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
- || XOR B$C[0],B$A[0][0],B$A[0][0]
- || EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done?
- ANDN A$A[1][2],A$A[1][1],A$C[4]
- || ANDN B$A[1][2],B$A[1][1],B$C[4]
- || ANDN A$A[1][3],A$A[1][2],A$C[1]
- || ANDN B$A[1][3],B$A[1][2],B$C[1]
- || ANDN A$A[1][4],A$A[1][3],A$C[2]
- || ANDN B$A[1][4],B$A[1][3],B$C[2]
- ANDN A$A[1][0],A$A[1][4],A$C[3]
- || ANDN B$A[1][0],B$A[1][4],B$C[3]
- || XOR A$C[4],A$A[1][0],A$A[1][0]
- || XOR B$C[4],B$A[1][0],B$A[1][0]
- || ANDN A$A[1][1],A$A[1][0],A$C[4]
- || ANDN B$A[1][1],B$A[1][0],B$C[4]
- XOR A$C[1],A$A[1][1],A$A[1][1]
- || XOR B$C[1],B$A[1][1],B$A[1][1]
- || XOR A$C[2],A$A[1][2],A$A[1][2]
- || XOR B$C[2],B$A[1][2],B$A[1][2]
- || XOR A$C[3],A$A[1][3],A$A[1][3]
- || XOR B$C[3],B$A[1][3],B$A[1][3]
- XOR A$C[4],A$A[1][4],A$A[1][4]
- || XOR B$C[4],B$A[1][4],B$A[1][4]
- || ANDN A$A[2][2],A$A[2][1],A$C[4]
- || ANDN B$A[2][2],B$A[2][1],B$C[4]
- || ANDN A$A[2][3],A$A[2][2],A$C[1]
- || ANDN B$A[2][3],B$A[2][2],B$C[1]
- ANDN A$A[2][4],A$A[2][3],A$C[2]
- || ANDN B$A[2][4],B$A[2][3],B$C[2]
- || ANDN A$A[2][0],A$A[2][4],A$C[3]
- || ANDN B$A[2][0],B$A[2][4],B$C[3]
- || XOR A$C[4],A$A[2][0],A$A[2][0]
- || XOR B$C[4],B$A[2][0],B$A[2][0]
- ANDN A$A[2][1],A$A[2][0],A$C[4]
- || ANDN B$A[2][1],B$A[2][0],B$C[4]
- || XOR A$C[1],A$A[2][1],A$A[2][1]
- || XOR B$C[1],B$A[2][1],B$A[2][1]
- || XOR A$C[2],A$A[2][2],A$A[2][2]
- || XOR B$C[2],B$A[2][2],B$A[2][2]
- XOR A$C[3],A$A[2][3],A$A[2][3]
- || XOR B$C[3],B$A[2][3],B$A[2][3]
- || XOR A$C[4],A$A[2][4],A$A[2][4]
- || XOR B$C[4],B$A[2][4],B$A[2][4]
- ANDN A$A[3][2],A$A[3][1],A$C[4]
- || ANDN B$A[3][2],B$A[3][1],B$C[4]
- || ANDN A$A[3][3],A$A[3][2],A$C[1]
- || ANDN B$A[3][3],B$A[3][2],B$C[1]
- || ANDN A$A[3][4],A$A[3][3],A$C[2]
- || ANDN B$A[3][4],B$A[3][3],B$C[2]
- ANDN A$A[3][0],A$A[3][4],A$C[3]
- || ANDN B$A[3][0],B$A[3][4],B$C[3]
- || XOR A$C[4],A$A[3][0],A$A[3][0]
- || XOR B$C[4],B$A[3][0],B$A[3][0]
- || ANDN A$A[3][1],A$A[3][0],A$C[4]
- || ANDN B$A[3][1],B$A[3][0],B$C[4]
- XOR A$C[1],A$A[3][1],A$A[3][1]
- || XOR B$C[1],B$A[3][1],B$A[3][1]
- || XOR A$C[2],A$A[3][2],A$A[3][2]
- || XOR B$C[2],B$A[3][2],B$A[3][2]
- || XOR A$C[3],A$A[3][3],A$A[3][3]
- ||[A0] BNOP loop?
- XOR B$C[3],B$A[3][3],B$A[3][3]
- || XOR A$C[4],A$A[3][4],A$A[3][4]
- || XOR B$C[4],B$A[3][4],B$A[3][4]
- ||[!A0] LDDW *FP[-7],A3:A2
- ||[!A0] LDDW *SP[4], RA:B2
- ANDN A$A[4][2],A$A[4][1],A$C[4]
- || ANDN B$A[4][2],B$A[4][1],B$C[4]
- || ANDN A$A[4][3],A$A[4][2],A$C[1]
- || ANDN B$A[4][3],B$A[4][2],B$C[1]
- || ANDN A$A[4][4],A$A[4][3],A$C[2]
- || ANDN B$A[4][4],B$A[4][3],B$C[2]
- ANDN A$A[4][0],A$A[4][4],A$C[3]
- || ANDN B$A[4][0],B$A[4][4],B$C[3]
- || XOR A$C[4],A$A[4][0],A$A[4][0]
- || XOR B$C[4],B$A[4][0],B$A[4][0]
- || ANDN A$A[4][1],A$A[4][0],A$C[4]
- || ANDN B$A[4][1],B$A[4][0],B$C[4]
- XOR A$C[1],A$A[4][1],A$A[4][1]
- || XOR B$C[1],B$A[4][1],B$A[4][1]
- || XOR A$C[2],A$A[4][2],A$A[4][2]
- || XOR B$C[2],B$A[4][2],B$A[4][2]
- || XOR A$C[3],A$A[4][3],A$A[4][3]
- || XOR B$C[3],B$A[4][3],B$A[4][3]
- XOR A$C[4],A$A[4][4],A$A[4][4]
- || XOR B$C[4],B$A[4][4],B$A[4][4]
- ;;===== branch to loop? is taken here
- BNOP RA,5
- .endasmfunc
- .newblock
- .global _KeccakF1600
- .align 32
- _KeccakF1600:
- .asmfunc stack_usage(80)
- STW FP,*SP--(80) ; save frame pointer
- || MV SP,FP
- STDW B13:B12,*SP[9]
- || STDW A13:A12,*FP[-4]
- STDW B11:B10,*SP[8]
- || STDW A11:A10,*FP[-5]
- STW RA, *SP[15]
- || STW A14,*FP[-6]
- || MV A4,A2
- || ADD 4,A4,B2
- LDW *A2++[2],A$A[0][0] ; load A[5][5]
- || LDW *B2++[2],B$A[0][0]
- LDW *A2++[2],A$A[0][1]
- || LDW *B2++[2],B$A[0][1]
- LDW *A2++[2],A$A[0][2]
- || LDW *B2++[2],B$A[0][2]
- LDW *A2++[2],A$A[0][3]
- || LDW *B2++[2],B$A[0][3]
- LDW *A2++[2],A$A[0][4]
- || LDW *B2++[2],B$A[0][4]
- LDW *A2++[2],A$A[1][0]
- || LDW *B2++[2],B$A[1][0]
- LDW *A2++[2],A$A[1][1]
- || LDW *B2++[2],B$A[1][1]
- LDW *A2++[2],A$A[1][2]
- || LDW *B2++[2],B$A[1][2]
- LDW *A2++[2],A$A[1][3]
- || LDW *B2++[2],B$A[1][3]
- LDW *A2++[2],A$A[1][4]
- || LDW *B2++[2],B$A[1][4]
- LDW *A2++[2],A$A[2][0]
- || LDW *B2++[2],B$A[2][0]
- LDW *A2++[2],A$A[2][1]
- || LDW *B2++[2],B$A[2][1]
- LDW *A2++[2],A$A[2][2]
- || LDW *B2++[2],B$A[2][2]
- LDW *A2++[2],A$A[2][3]
- || LDW *B2++[2],B$A[2][3]
- LDW *A2++[2],A$A[2][4]
- || LDW *B2++[2],B$A[2][4]
- LDW *A2++[2],A$A[3][0]
- || LDW *B2++[2],B$A[3][0]
- LDW *A2++[2],A$A[3][1]
- || LDW *B2++[2],B$A[3][1]
- LDW *A2++[2],A$A[3][2]
- || LDW *B2++[2],B$A[3][2]
- LDW *A2++[2],A$A[3][3]
- || LDW *B2++[2],B$A[3][3]
- LDW *A2++[2],A$A[3][4]
- || LDW *B2++[2],B$A[3][4]
- || BNOP _KeccakF1600_int
- ADDKPC ret?,RA
- || LDW *A2++[2],A$A[4][0]
- || LDW *B2++[2],B$A[4][0]
- LDW *A2++[2],A$A[4][1]
- || LDW *B2++[2],B$A[4][1]
- LDW *A2++[2],A$A[4][2]
- || LDW *B2++[2],B$A[4][2]
- LDW *A2++[2],A$A[4][3]
- || LDW *B2++[2],B$A[4][3]
- LDW *A2,A$A[4][4]
- || LDW *B2,B$A[4][4]
- || ADDK -192,A2 ; rewind
- || ADDK -192,B2
- .align 16
- ret?:
- STW A$A[0][0],*A2++[2] ; store A[5][5]
- || STW B$A[0][0],*B2++[2]
- STW A$A[0][1],*A2++[2]
- || STW B$A[0][1],*B2++[2]
- STW A$A[0][2],*A2++[2]
- || STW B$A[0][2],*B2++[2]
- STW A$A[0][3],*A2++[2]
- || STW B$A[0][3],*B2++[2]
- STW A$A[0][4],*A2++[2]
- || STW B$A[0][4],*B2++[2]
- STW A$A[1][0],*A2++[2]
- || STW B$A[1][0],*B2++[2]
- STW A$A[1][1],*A2++[2]
- || STW B$A[1][1],*B2++[2]
- STW A$A[1][2],*A2++[2]
- || STW B$A[1][2],*B2++[2]
- STW A$A[1][3],*A2++[2]
- || STW B$A[1][3],*B2++[2]
- STW A$A[1][4],*A2++[2]
- || STW B$A[1][4],*B2++[2]
- STW A$A[2][0],*A2++[2]
- || STW B$A[2][0],*B2++[2]
- STW A$A[2][1],*A2++[2]
- || STW B$A[2][1],*B2++[2]
- STW A$A[2][2],*A2++[2]
- || STW B$A[2][2],*B2++[2]
- STW A$A[2][3],*A2++[2]
- || STW B$A[2][3],*B2++[2]
- STW A$A[2][4],*A2++[2]
- || STW B$A[2][4],*B2++[2]
- STW A$A[3][0],*A2++[2]
- || STW B$A[3][0],*B2++[2]
- STW A$A[3][1],*A2++[2]
- || STW B$A[3][1],*B2++[2]
- STW A$A[3][2],*A2++[2]
- || STW B$A[3][2],*B2++[2]
- STW A$A[3][3],*A2++[2]
- || STW B$A[3][3],*B2++[2]
- STW A$A[3][4],*A2++[2]
- || STW B$A[3][4],*B2++[2]
- LDW *SP[15],RA
- || LDW *FP[-6],A14
- STW A$A[4][0],*A2++[2]
- || STW B$A[4][0],*B2++[2]
- STW A$A[4][1],*A2++[2]
- || STW B$A[4][1],*B2++[2]
- STW A$A[4][2],*A2++[2]
- || STW B$A[4][2],*B2++[2]
- STW A$A[4][3],*A2++[2]
- || STW B$A[4][3],*B2++[2]
- STW A$A[4][4],*A2
- || STW B$A[4][4],*B2
- || ADDK -192,A2 ; rewind
- MV A2,A4 ; return original A4
- || LDDW *SP[8], B11:B10
- || LDDW *FP[-5],A11:A10
- LDDW *SP[9], B13:B12
- || LDDW *FP[-4],A13:A12
- || BNOP RA
- LDW *++SP(80),FP ; restore frame pointer
- NOP 4 ; wait till FP is committed
- .endasmfunc
- .newblock
- .asg B2,BSZ
- .asg A2,INP
- .asg A3,LEN
- .global _SHA3_absorb
- .align 32
- _SHA3_absorb:
- .asmfunc stack_usage(80)
- STW FP,*SP--(80) ; save frame pointer
- || MV SP,FP
- STDW B13:B12,*SP[9]
- || STDW A13:A12,*FP[-4]
- STDW B11:B10,*SP[8]
- || STDW A11:A10,*FP[-5]
- STW RA, *SP[15]
- || STW A14,*FP[-6]
- STW A4,*SP[1] ; save A[][]
- || MV B4,INP ; reassign arguments
- || MV A6,LEN
- || MV B6,BSZ
- || ADD 4,A4,B4
- LDW *A4++[2],A$A[0][0] ; load A[5][5]
- || LDW *B4++[2],B$A[0][0]
- LDW *A4++[2],A$A[0][1]
- || LDW *B4++[2],B$A[0][1]
- LDW *A4++[2],A$A[0][2]
- || LDW *B4++[2],B$A[0][2]
- LDW *A4++[2],A$A[0][3]
- || LDW *B4++[2],B$A[0][3]
- LDW *A4++[2],A$A[0][4]
- || LDW *B4++[2],B$A[0][4]
- LDW *A4++[2],A$A[1][0]
- || LDW *B4++[2],B$A[1][0]
- LDW *A4++[2],A$A[1][1]
- || LDW *B4++[2],B$A[1][1]
- LDW *A4++[2],A$A[1][2]
- || LDW *B4++[2],B$A[1][2]
- LDW *A4++[2],A$A[1][3]
- || LDW *B4++[2],B$A[1][3]
- LDW *A4++[2],A$A[1][4]
- || LDW *B4++[2],B$A[1][4]
- LDW *A4++[2],A$A[2][0]
- || LDW *B4++[2],B$A[2][0]
- LDW *A4++[2],A$A[2][1]
- || LDW *B4++[2],B$A[2][1]
- LDW *A4++[2],A$A[2][2]
- || LDW *B4++[2],B$A[2][2]
- LDW *A4++[2],A$A[2][3]
- || LDW *B4++[2],B$A[2][3]
- LDW *A4++[2],A$A[2][4]
- || LDW *B4++[2],B$A[2][4]
- LDW *A4++[2],A$A[3][0]
- || LDW *B4++[2],B$A[3][0]
- LDW *A4++[2],A$A[3][1]
- || LDW *B4++[2],B$A[3][1]
- LDW *A4++[2],A$A[3][2]
- || LDW *B4++[2],B$A[3][2]
- LDW *A4++[2],A$A[3][3]
- || LDW *B4++[2],B$A[3][3]
- LDW *A4++[2],A$A[3][4]
- || LDW *B4++[2],B$A[3][4]
- LDW *A4++[2],A$A[4][0]
- || LDW *B4++[2],B$A[4][0]
- LDW *A4++[2],A$A[4][1]
- || LDW *B4++[2],B$A[4][1]
- LDW *A4++[2],A$A[4][2]
- || LDW *B4++[2],B$A[4][2]
- LDW *A4++[2],A$A[4][3]
- || LDW *B4++[2],B$A[4][3]
- LDW *A4,A$A[4][4]
- || LDW *B4,B$A[4][4]
- || ADDKPC loop?,RA
- STDW RA:BSZ,*SP[4]
- loop?:
- CMPLTU LEN,BSZ,A0 ; len < bsz?
- || SHRU BSZ,3,BSZ
- [A0] BNOP ret?
- ||[A0] ZERO BSZ
- ||[A0] LDW *SP[1],A2 ; pull A[][]
- [BSZ] LDNDW *INP++,A1:A0
- ||[BSZ] SUB LEN,8,LEN
- ||[BSZ] SUB BSZ,1,BSZ
- NOP 4
- ___
- for ($y = 0; $y < 5; $y++) { # emit one copy of the block below per state lane A[$y][$x], 24 copies in total
- for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) { # row 4 stops at x=3: lane A[4][4] is emitted by the heredoc that follows this loop
- $code.=<<___; # de-interleave the input word in A1:A0, XOR it into lane [$y][$x], fetch the next word while BSZ is non-zero, branch to _KeccakF1600_cheat once BSZ is zero
- .if .BIG_ENDIAN
- SWAP2 A0,A1
- || SWAP2 A1,A0
- SWAP4 A0,A0
- SWAP4 A1,A1
- ||[!BSZ]BNOP _KeccakF1600_cheat
- ||[!BSZ]STDW LEN:INP,*SP[3]
- || DEAL A0,A0
- .else
- [!BSZ]BNOP _KeccakF1600_cheat
- ||[!BSZ]STDW LEN:INP,*SP[3]
- || DEAL A0,A0
- .endif
- [BSZ] LDNDW *INP++,A1:A0
- || DEAL A1,A1
- [BSZ] SUB LEN,8,LEN
- ||[BSZ] SUB BSZ,1,BSZ
- PACK2 A1,A0,A0
- || PACKH2 A1,A0,A1
- XOR A0,A$A[$y][$x],A$A[$y][$x]
- XOR A1,B$A[$y][$x],B$A[$y][$x]
- ___
- }
- }
- $code.=<<___; # last lane A[4][4] with an unconditional branch to _KeccakF1600_cheat, the _SHA3_absorb epilogue, _SHA3_squeeze, and the iotas table
- .if .BIG_ENDIAN
- SWAP2 A0,A1
- || SWAP2 A1,A0
- SWAP4 A0,A0
- SWAP4 A1,A1
- .endif
- BNOP _KeccakF1600_cheat
- || STDW LEN:INP,*SP[3]
- || DEAL A0,A0
- DEAL A1,A1
- NOP
- PACK2 A1,A0,A0
- || PACKH2 A1,A0,A1
- XOR A0,A$A[4][4],A$A[4][4]
- XOR A1,B$A[4][4],B$A[4][4]
- .align 16
- ret?:
- MV LEN,A4 ; return value
- || ADD 4,A2,B2
- STW A$A[0][0],*A2++[2] ; store A[5][5]
- || STW B$A[0][0],*B2++[2]
- STW A$A[0][1],*A2++[2]
- || STW B$A[0][1],*B2++[2]
- STW A$A[0][2],*A2++[2]
- || STW B$A[0][2],*B2++[2]
- STW A$A[0][3],*A2++[2]
- || STW B$A[0][3],*B2++[2]
- STW A$A[0][4],*A2++[2]
- || STW B$A[0][4],*B2++[2]
- STW A$A[1][0],*A2++[2]
- || STW B$A[1][0],*B2++[2]
- STW A$A[1][1],*A2++[2]
- || STW B$A[1][1],*B2++[2]
- STW A$A[1][2],*A2++[2]
- || STW B$A[1][2],*B2++[2]
- STW A$A[1][3],*A2++[2]
- || STW B$A[1][3],*B2++[2]
- STW A$A[1][4],*A2++[2]
- || STW B$A[1][4],*B2++[2]
- STW A$A[2][0],*A2++[2]
- || STW B$A[2][0],*B2++[2]
- STW A$A[2][1],*A2++[2]
- || STW B$A[2][1],*B2++[2]
- STW A$A[2][2],*A2++[2]
- || STW B$A[2][2],*B2++[2]
- STW A$A[2][3],*A2++[2]
- || STW B$A[2][3],*B2++[2]
- STW A$A[2][4],*A2++[2]
- || STW B$A[2][4],*B2++[2]
- LDW *SP[15],RA
- || LDW *FP[-6],A14
- STW A$A[3][0],*A2++[2]
- || STW B$A[3][0],*B2++[2]
- STW A$A[3][1],*A2++[2]
- || STW B$A[3][1],*B2++[2]
- STW A$A[3][2],*A2++[2]
- || STW B$A[3][2],*B2++[2]
- STW A$A[3][3],*A2++[2]
- || STW B$A[3][3],*B2++[2]
- STW A$A[3][4],*A2++[2]
- || STW B$A[3][4],*B2++[2]
- LDDW *SP[8], B11:B10
- || LDDW *FP[-5],A11:A10
- LDDW *SP[9], B13:B12
- || LDDW *FP[-4],A13:A12
- BNOP RA
- || LDW *++SP(80),FP ; restore frame pointer
- STW A$A[4][0],*A2++[2]
- || STW B$A[4][0],*B2++[2]
- STW A$A[4][1],*A2++[2]
- || STW B$A[4][1],*B2++[2]
- STW A$A[4][2],*A2++[2]
- || STW B$A[4][2],*B2++[2]
- STW A$A[4][3],*A2++[2]
- || STW B$A[4][3],*B2++[2]
- STW A$A[4][4],*A2++[2]
- || STW B$A[4][4],*B2++[2]
- .endasmfunc
- .newblock
- .global _SHA3_squeeze
- .asg A12,OUT
- .asg A13,LEN
- .asg A14,BSZ
- .align 32
- _SHA3_squeeze:
- .asmfunc stack_usage(24)
- STW FP,*SP--(24) ; save frame pointer
- || MV SP,FP
- STW RA, *SP[5]
- || STW A14,*FP[-2]
- STDW A13:A12,*FP[-2]
- || MV B4,OUT ; reassign arguments
- MV A6,LEN
- || MV B6,BSZ
- loop?:
- LDW *SP[5],RA ; reload RA
- || SHRU BSZ,3,A1
- || MV A4,A8
- || ADD 4,A4,B8
- block?:
- CMPLTU LEN,8,A0 ; len < 8?
- [A0] BNOP tail?
- LDW *A8++[2],A9
- || LDW *B8++[2],B9
- || SUB LEN,8,LEN ; len -= 8
- MV LEN,A0
- || SUB A1,1,A1 ; bsz--
- || NOP 4
- .if .BIG_ENDIAN
- SWAP4 A9,A9
- || SWAP4 B9,B9
- SWAP2 A9,A9
- || SWAP2 B9,B9
- .endif
- [!A0] BNOP ret?
- ||[!A0] ZERO A1
- PACK2 B9,A9,B7
- ||[A1] BNOP block?
- PACKH2 B9,A9,B9
- || SHFL B7,B7
- SHFL B9,B9
- STNW B7,*OUT++
- STNW B9,*OUT++
- NOP
- BNOP _KeccakF1600,4
- ADDKPC loop?,RA
- .align 16
- tail?:
- .if .BIG_ENDIAN
- SWAP4 A9,A9
- || SWAP4 B9,B9
- SWAP2 A9,A9
- || SWAP2 B9,B9
- .endif
- PACK2 B9,A9,B7
- PACKH2 B9,A9,B9
- || SHFL B7,B7
- SHFL B9,B9
- STB B7,*OUT++
- || SHRU B7,8,B7
- || ADD LEN,7,A0
- [A0] STB B7,*OUT++
- ||[A0] SHRU B7,8,B7
- ||[A0] SUB A0,1,A0
- [A0] STB B7,*OUT++
- ||[A0] SHRU B7,8,B7
- ||[A0] SUB A0,1,A0
- [A0] STB B7,*OUT++
- ||[A0] SUB A0,1,A0
- [A0] STB B9,*OUT++
- ||[A0] SHRU B9,8,B9
- ||[A0] SUB A0,1,A0
- [A0] STB B9,*OUT++
- ||[A0] SHRU B9,8,B9
- ||[A0] SUB A0,1,A0
- [A0] STB B9,*OUT++
- ret?:
- LDDW *FP[-2],A13:A12
- BNOP RA
- || LDW *FP[-2],A14
- LDW *++SP(24),FP ; restore frame pointer
- NOP 4 ; wait till FP is committed
- .endasmfunc
- .if __TI_EABI__
- .sect ".text:sha_asm.const"
- .else
- .sect ".const:sha_asm"
- .endif
- .align 256
- .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
- iotas:
- .uword 0x00000001, 0x00000000
- .uword 0x00000000, 0x00000089
- .uword 0x00000000, 0x8000008b
- .uword 0x00000000, 0x80008080
- .uword 0x00000001, 0x0000008b
- .uword 0x00000001, 0x00008000
- .uword 0x00000001, 0x80008088
- .uword 0x00000001, 0x80000082
- .uword 0x00000000, 0x0000000b
- .uword 0x00000000, 0x0000000a
- .uword 0x00000001, 0x00008082
- .uword 0x00000000, 0x00008003
- .uword 0x00000001, 0x0000808b
- .uword 0x00000001, 0x8000000b
- .uword 0x00000001, 0x8000008a
- .uword 0x00000001, 0x80000081
- .uword 0x00000000, 0x80000081
- .uword 0x00000000, 0x80000008
- .uword 0x00000000, 0x00000083
- .uword 0x00000000, 0x80008003
- .uword 0x00000001, 0x80008088
- .uword 0x00000000, 0x80000088
- .uword 0x00000001, 0x00008000
- .uword 0x00000000, 0x80008082
- .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
- .align 4
- ___
- # Emit the generated assembly. The output file name is the last
- # command-line argument; when none is given (or it is false, e.g. "0")
- # the code goes to the inherited STDOUT, exactly as before.
- $output = pop;
- if ($output) {
- # Three-argument open keeps characters in the file name from being read
- # as part of the mode; a failed open is fatal instead of silently
- # sending the assembly to the original STDOUT.
- open STDOUT, '>', $output or die "can't open $output: $!";
- }
- print $code;
- # Buffered write errors (e.g. a full disk) only surface at close time.
- close STDOUT or die "error closing STDOUT: $!";
|