123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- #!/usr/bin/env perl
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # March 2010
- #
- # The module implements "4-bit" GCM GHASH function and underlying
- # single multiplication operation in GF(2^128). "4-bit" means that it
- # uses 256 bytes per-key table [+128 bytes shared table]. Even though
- # loops are aggressively modulo-scheduled in respect to references to
- # Htbl and Z.hi updates for 8 cycles per byte, measured performance is
- # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
- # scheduling "glitch," because uprofile(1) indicates uniform sample
- # distribution, as if all instruction bundles execute in 1.5 cycles.
- # Meaning that it could have been even faster, yet 12 cycles is ~60%
- # better than gcc-generated code and ~80% than code generated by vendor
- # compiler.
- $cnt="v0"; # $0
- $t0="t0";
- $t1="t1";
- $t2="t2";
- $Thi0="t3"; # $4
- $Tlo0="t4";
- $Thi1="t5";
- $Tlo1="t6";
- $rem="t7"; # $8
- #################
- $Xi="a0"; # $16, input argument block
- $Htbl="a1";
- $inp="a2";
- $len="a3";
- $nlo="a4"; # $20
- $nhi="a5";
- $Zhi="t8";
- $Zlo="t9";
- $Xhi="t10"; # $24
- $Xlo="t11";
- $remp="t12";
- $rem_4bit="AT"; # $28
- { my $N;
- sub loop() {
- $N++;
- $code.=<<___;
- .align 4
- extbl $Xlo,7,$nlo
- and $nlo,0xf0,$nhi
- sll $nlo,4,$nlo
- and $nlo,0xf0,$nlo
- addq $nlo,$Htbl,$nlo
- ldq $Zlo,8($nlo)
- addq $nhi,$Htbl,$nhi
- ldq $Zhi,0($nlo)
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- lda $cnt,6(zero)
- extbl $Xlo,6,$nlo
- ldq $Tlo1,8($nhi)
- s8addq $remp,$rem_4bit,$remp
- ldq $Thi1,0($nhi)
- srl $Zlo,4,$Zlo
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- and $nlo,0xf0,$nhi
- xor $Tlo1,$Zlo,$Zlo
- sll $nlo,4,$nlo
- xor $Thi1,$Zhi,$Zhi
- and $nlo,0xf0,$nlo
- addq $nlo,$Htbl,$nlo
- ldq $Tlo0,8($nlo)
- addq $nhi,$Htbl,$nhi
- ldq $Thi0,0($nlo)
- .Looplo$N:
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- subq $cnt,1,$cnt
- srl $Zlo,4,$Zlo
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- extbl $Xlo,$cnt,$nlo
- and $nlo,0xf0,$nhi
- xor $Thi0,$Zhi,$Zhi
- xor $Tlo0,$Zlo,$Zlo
- sll $nlo,4,$nlo
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- and $nlo,0xf0,$nlo
- srl $Zlo,4,$Zlo
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- addq $nlo,$Htbl,$nlo
- addq $nhi,$Htbl,$nhi
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- ldq $Tlo0,8($nlo)
- xor $t0,$Zlo,$Zlo
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- ldq $Thi0,0($nlo)
- bne $cnt,.Looplo$N
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- lda $cnt,7(zero)
- srl $Zlo,4,$Zlo
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- extbl $Xhi,$cnt,$nlo
- and $nlo,0xf0,$nhi
- xor $Thi0,$Zhi,$Zhi
- xor $Tlo0,$Zlo,$Zlo
- sll $nlo,4,$nlo
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- and $nlo,0xf0,$nlo
- srl $Zlo,4,$Zlo
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- addq $nlo,$Htbl,$nlo
- addq $nhi,$Htbl,$nhi
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- ldq $Tlo0,8($nlo)
- xor $t0,$Zlo,$Zlo
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- ldq $Thi0,0($nlo)
- unop
- .Loophi$N:
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- subq $cnt,1,$cnt
- srl $Zlo,4,$Zlo
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- extbl $Xhi,$cnt,$nlo
- and $nlo,0xf0,$nhi
- xor $Thi0,$Zhi,$Zhi
- xor $Tlo0,$Zlo,$Zlo
- sll $nlo,4,$nlo
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- and $nlo,0xf0,$nlo
- srl $Zlo,4,$Zlo
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- addq $nlo,$Htbl,$nlo
- addq $nhi,$Htbl,$nhi
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- ldq $Tlo0,8($nlo)
- xor $t0,$Zlo,$Zlo
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- ldq $Thi0,0($nlo)
- bne $cnt,.Loophi$N
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- srl $Zlo,4,$Zlo
- ldq $Tlo1,8($nhi)
- xor $rem,$Zhi,$Zhi
- ldq $Thi1,0($nhi)
- s8addq $remp,$rem_4bit,$remp
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $t0,$Zlo,$Zlo
- xor $Tlo0,$Zlo,$Zlo
- xor $Thi0,$Zhi,$Zhi
- and $Zlo,0x0f,$remp
- sll $Zhi,60,$t0
- srl $Zlo,4,$Zlo
- s8addq $remp,$rem_4bit,$remp
- xor $rem,$Zhi,$Zhi
- ldq $rem,0($remp)
- srl $Zhi,4,$Zhi
- xor $Tlo1,$Zlo,$Zlo
- xor $Thi1,$Zhi,$Zhi
- xor $t0,$Zlo,$Zlo
- xor $rem,$Zhi,$Zhi
- ___
- }}
- $code=<<___;
- #ifdef __linux__
- #include <asm/regdef.h>
- #else
- #include <asm.h>
- #include <regdef.h>
- #endif
- .text
- .set noat
- .set noreorder
- .globl gcm_gmult_4bit
- .align 4
- .ent gcm_gmult_4bit
- gcm_gmult_4bit:
- .frame sp,0,ra
- .prologue 0
- ldq $Xlo,8($Xi)
- ldq $Xhi,0($Xi)
- br $rem_4bit,.Lpic1
- .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
- ___
- &loop();
- $code.=<<___;
- srl $Zlo,24,$t0 # byte swap
- srl $Zlo,8,$t1
- sll $Zlo,8,$t2
- sll $Zlo,24,$Zlo
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- zapnot $Zlo,0x88,$Zlo
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
- or $Zlo,$t0,$Zlo
- srl $Zhi,24,$t0
- srl $Zhi,8,$t1
- or $Zlo,$t2,$Zlo
- sll $Zhi,8,$t2
- sll $Zhi,24,$Zhi
- srl $Zlo,32,$Xlo
- sll $Zlo,32,$Zlo
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- or $Zlo,$Xlo,$Xlo
- zapnot $Zhi,0x88,$Zhi
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
- or $Zhi,$t0,$Zhi
- or $Zhi,$t2,$Zhi
- srl $Zhi,32,$Xhi
- sll $Zhi,32,$Zhi
- or $Zhi,$Xhi,$Xhi
- stq $Xlo,8($Xi)
- stq $Xhi,0($Xi)
- ret (ra)
- .end gcm_gmult_4bit
- ___
- $inhi="s0";
- $inlo="s1";
- $code.=<<___;
- .globl gcm_ghash_4bit
- .align 4
- .ent gcm_ghash_4bit
- gcm_ghash_4bit:
- lda sp,-32(sp)
- stq ra,0(sp)
- stq s0,8(sp)
- stq s1,16(sp)
- .mask 0x04000600,-32
- .frame sp,32,ra
- .prologue 0
- ldq_u $inhi,0($inp)
- ldq_u $Thi0,7($inp)
- ldq_u $inlo,8($inp)
- ldq_u $Tlo0,15($inp)
- ldq $Xhi,0($Xi)
- ldq $Xlo,8($Xi)
- br $rem_4bit,.Lpic2
- .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
- .Louter:
- extql $inhi,$inp,$inhi
- extqh $Thi0,$inp,$Thi0
- or $inhi,$Thi0,$inhi
- lda $inp,16($inp)
- extql $inlo,$inp,$inlo
- extqh $Tlo0,$inp,$Tlo0
- or $inlo,$Tlo0,$inlo
- subq $len,16,$len
- xor $Xlo,$inlo,$Xlo
- xor $Xhi,$inhi,$Xhi
- ___
- &loop();
- $code.=<<___;
- srl $Zlo,24,$t0 # byte swap
- srl $Zlo,8,$t1
- sll $Zlo,8,$t2
- sll $Zlo,24,$Zlo
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- zapnot $Zlo,0x88,$Zlo
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
- or $Zlo,$t0,$Zlo
- srl $Zhi,24,$t0
- srl $Zhi,8,$t1
- or $Zlo,$t2,$Zlo
- sll $Zhi,8,$t2
- sll $Zhi,24,$Zhi
- srl $Zlo,32,$Xlo
- sll $Zlo,32,$Zlo
- beq $len,.Ldone
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- or $Zlo,$Xlo,$Xlo
- ldq_u $inhi,0($inp)
- zapnot $Zhi,0x88,$Zhi
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
- ldq_u $Thi0,7($inp)
- or $Zhi,$t0,$Zhi
- or $Zhi,$t2,$Zhi
- ldq_u $inlo,8($inp)
- ldq_u $Tlo0,15($inp)
- srl $Zhi,32,$Xhi
- sll $Zhi,32,$Zhi
- or $Zhi,$Xhi,$Xhi
- br zero,.Louter
- .Ldone:
- zapnot $t0,0x11,$t0
- zapnot $t1,0x22,$t1
- or $Zlo,$Xlo,$Xlo
- zapnot $Zhi,0x88,$Zhi
- or $t0,$t1,$t0
- zapnot $t2,0x44,$t2
- or $Zhi,$t0,$Zhi
- or $Zhi,$t2,$Zhi
- srl $Zhi,32,$Xhi
- sll $Zhi,32,$Zhi
- or $Zhi,$Xhi,$Xhi
- stq $Xlo,8($Xi)
- stq $Xhi,0($Xi)
- .set noreorder
- /*ldq ra,0(sp)*/
- ldq s0,8(sp)
- ldq s1,16(sp)
- lda sp,32(sp)
- ret (ra)
- .end gcm_ghash_4bit
- .align 4
- rem_4bit:
- .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
- .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
- .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
- .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
- .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
- .align 4
- ___
- $output=shift and open STDOUT,">$output";
- print $code;
- close STDOUT;
|