12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799 |
- #! /usr/bin/env perl
- # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- # I let hardware handle unaligned input, except on page boundaries
- # (see below for details). Otherwise straightforward implementation
- # with X vector in register bank.
- # sha256 | sha512
- # -m64 -m32 | -m64 -m32
- # --------------------------------------+-----------------------
- # PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
- # Power6,xlc-7 +150% +90% | +100% +430%(*)
- #
- # (*) 64-bit code in 32-bit application context, which actually is
- # on TODO list. It should be noted that for safe deployment in
- # 32-bit *multi-threaded* context asynchronous signals should be
- # blocked upon entry to SHA512 block routine. This is because
- # 32-bit signaling procedure invalidates upper halves of GPRs.
- # Context switch procedure preserves them, but not signaling:-(
- # The second version is truly multi-thread safe. The trouble with the original
- # version was that it used the thread-local storage pointer register.
- # Well, it scrupulously preserved it, but the problem would arise the
- # moment asynchronous signal was delivered and signal handler would
- # dereference the TLS pointer. While it's never the case in openssl
- # application or test suite, we have to respect this scenario and not
- # use TLS pointer register. An alternative would be to require the caller to
- # block signals prior to calling this routine. For the record, in 32-bit
- # context R2 serves as the TLS pointer, while in 64-bit context it is R13.
- ($flavour,$output)=(shift,shift); # first argument: target flavour, second: output file name
- # Per-ABI settings: pointer size, offset of the link-register save slot, and
- # the pointer-sized store-with-update/compare/shift/load/store mnemonics.
- if ($flavour =~ /64/) {
- $SIZE_T=8;
- ($STU,$UCMP,$SHL,$POP,$PUSH)=("stdu","cmpld","sldi","ld","std");
- $LRSAVE=2*$SIZE_T;
- }
- elsif ($flavour =~ /32/) {
- $SIZE_T=4;
- ($STU,$UCMP,$SHL,$POP,$PUSH)=("stwu","cmplw","slwi","lwz","stw");
- $LRSAVE=$SIZE_T;
- }
- else {
- die "nonsense $flavour";
- }
- # Non-zero (equal to $SIZE_T) only when the flavour name ends in "le".
- $LITTLE_ENDIAN = 0;
- $LITTLE_ENDIAN = $SIZE_T if ($flavour =~ /le$/);
- # Directory part of this script's path (with trailing separator); used to
- # look for the ppc-xlate.pl translator next to the script or in ../../perlasm.
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
- die "can't locate ppc-xlate.pl";
- # Send everything printed to STDOUT through the translator. The low-precedence
- # "or" is required: with "||" the die was attached to the (always true)
- # command string, so a failed open went unreported.
- open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
- # Choose the hash variant from the output file name: "512" selects SHA-512,
- # anything else SHA-256. $SZ is the hash word size in bytes; $LD/$ST/$ROR/$SHR
- # are the word-sized load, store, rotate-right and shift-right mnemonics.
- if ($output !~ /512/) {
- ($func,$SZ,$rounds)=("sha256_block_ppc",4,64);
- ($LD,$ST,$ROR,$SHR)=("lwz","stw","rotrwi","srwi");
- # rotate/shift counts consumed by the Sigma0/Sigma1/sigma0/sigma1 code
- @Sigma0=( 2,13,22); @Sigma1=( 6,11,25);
- @sigma0=( 7,18, 3); @sigma1=(17,19,10);
- } else {
- ($func,$SZ,$rounds)=("sha512_block_ppc",8,80);
- ($LD,$ST,$ROR,$SHR)=("ld","std","rotrdi","srdi");
- # rotate/shift counts consumed by the Sigma0/Sigma1/sigma0/sigma1 code
- @Sigma0=(28,34,39); @Sigma1=(14,18,41);
- @sigma0=(1, 8, 7); @sigma1=(19,61, 6);
- }
- $FRAME=32*$SIZE_T+16*$SZ; # stack frame size in bytes: 32 pointer-sized slots plus 16 hash words
- $LOCALS=6*$SIZE_T; # offset from $sp of the 16-word scratch area (block copy; X[] on 32-bit SHA-512)
- $sp ="r1"; # base register for all frame accesses
- $toc="r2";
- $ctx="r3"; # zapped by $a0
- $inp="r4"; # zapped by $a1
- $num="r5"; # zapped by $t0
- $T ="r0";
- $a0 ="r3";
- $a1 ="r4";
- $t0 ="r5";
- $t1 ="r6";
- $Tbl="r7";
- $A ="r8";
- $B ="r9";
- $C ="r10";
- $D ="r11";
- $E ="r12";
- $F =$t1; $t1 = "r0"; # $F takes over r6 and $t1 moves to r0, so as to stay away from "r13";
- $G ="r14";
- $H ="r15";
- @V=($A,$B,$C,$D,$E,$F,$G,$H); # registers for the working variables a..h; rotated after every round
- @X=("r16","r17","r18","r19","r20","r21","r22","r23",
- "r24","r25","r26","r27","r28","r29","r30","r31"); # 16-entry X vector kept in registers
- $inp="r31" if($SZ==4 || $SIZE_T==8); # reassigned $inp! aliases with @X[15]
- sub ROUND_00_15 { # append to $code the assembly for one round; $i is the round index
- my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; # $a..$h: registers currently playing the roles a..h
- $code.=<<___; # h += $t1 (table entry) + X[i%16] + Ch(e,f,g) + Sigma1(e); d += h; compute Sigma0(a), Maj(a,b,c)
- $ROR $a0,$e,$Sigma1[0]
- $ROR $a1,$e,$Sigma1[1]
- and $t0,$f,$e
- xor $a0,$a0,$a1
- add $h,$h,$t1
- andc $t1,$g,$e
- $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
- or $t0,$t0,$t1 ; Ch(e,f,g)
- add $h,$h,@X[$i%16]
- xor $a0,$a0,$a1 ; Sigma1(e)
- add $h,$h,$t0
- add $h,$h,$a0
- $ROR $a0,$a,$Sigma0[0]
- $ROR $a1,$a,$Sigma0[1]
- and $t0,$a,$b
- and $t1,$a,$c
- xor $a0,$a0,$a1
- $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
- xor $t0,$t0,$t1
- and $t1,$b,$c
- xor $a0,$a0,$a1 ; Sigma0(a)
- add $d,$d,$h
- xor $t0,$t0,$t1 ; Maj(a,b,c)
- ___
- $code.=<<___ if ($i<15); # rounds 0..14 preload the next table entry into $t1 (ROUND_16_xx loads its own)
- $LD $t1,`($i+1)*$SZ`($Tbl)
- ___
- $code.=<<___; # finish the round: h += Sigma0(a) + Maj(a,b,c)
- add $h,$h,$a0
- add $h,$h,$t0
- ___
- }
- sub ROUND_16_xx { # rounds 16 and up: update X[i%16] in place, then emit the common round body
- my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; # $i is called with 16..31; $a..$h as in ROUND_00_15
- $i-=16; # index into the 16-entry @X window
- $code.=<<___; # X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] (indices mod 16); load table entry i into $t1
- $ROR $a0,@X[($i+1)%16],$sigma0[0]
- $ROR $a1,@X[($i+1)%16],$sigma0[1]
- $ROR $t0,@X[($i+14)%16],$sigma1[0]
- $ROR $t1,@X[($i+14)%16],$sigma1[1]
- xor $a0,$a0,$a1
- $SHR $a1,@X[($i+1)%16],$sigma0[2]
- xor $t0,$t0,$t1
- $SHR $t1,@X[($i+14)%16],$sigma1[2]
- add @X[$i],@X[$i],@X[($i+9)%16]
- xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
- xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
- $LD $t1,`$i*$SZ`($Tbl)
- add @X[$i],@X[$i],$a0
- add @X[$i],@X[$i],$t0
- ___
- &ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); # index >= 16, so ROUND_00_15 skips its own table preload
- }
- $code=<<___; # start of output: directives and the $func prologue (allocate frame, turn $num into a byte length, save $ctx, r14-r31 and LR)
- .machine "any"
- .text
- .globl $func
- .align 6
- $func:
- $STU $sp,-$FRAME($sp)
- mflr r0
- $SHL $num,$num,`log(16*$SZ)/log(2)`
- $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
- $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
- $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
- $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
- $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
- $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
- $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
- $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
- $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
- $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
- $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
- $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
- $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
- $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
- $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
- $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
- $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
- $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
- $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
- $PUSH r0,`$FRAME+$LRSAVE`($sp)
- ___
- if ($SZ==4 || $SIZE_T==8) { # one register per hash word
- $code.=<<___; # load the eight context words into A..H and copy the input pointer from r4 into $inp
- $LD $A,`0*$SZ`($ctx)
- mr $inp,r4 ; incarnate $inp
- $LD $B,`1*$SZ`($ctx)
- $LD $C,`2*$SZ`($ctx)
- $LD $D,`3*$SZ`($ctx)
- $LD $E,`4*$SZ`($ctx)
- $LD $F,`5*$SZ`($ctx)
- $LD $G,`6*$SZ`($ctx)
- $LD $H,`7*$SZ`($ctx)
- ___
- } else { # SHA-512 with 32-bit registers: the context is loaded as sixteen 32-bit halves into r16..r31
- for ($i=16;$i<32;$i++) {
- $code.=<<___; # offset is XORed with $LITTLE_ENDIAN, which swaps the two halves of each 64-bit word on "le" flavours
- lwz r$i,`$LITTLE_ENDIAN^(4*($i-16))`($ctx)
- ___
- }
- }
- $code.=<<___; # call LPICmeup (sets $Tbl), then dispatch: aligned input is hashed directly, unaligned input is split at page boundaries
- bl LPICmeup
- LPICedup:
- andi. r0,$inp,3
- bne Lunaligned
- Laligned:
- add $num,$inp,$num
- $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
- $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
- bl Lsha2_block_private
- b Ldone
- ; PowerPC specification allows an implementation to be ill-behaved
- ; upon unaligned access which crosses page boundary. "Better safe
- ; than sorry" principle makes me treat it specially. But I don't
- ; look for particular offending word, but rather for the input
- ; block which crosses the boundary. Once found that block is aligned
- ; and hashed separately...
- .align 4
- Lunaligned:
- subfic $t1,$inp,4096
- andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
- beq Lcross_page
- $UCMP $num,$t1
- ble Laligned ; didn't cross the page boundary
- subfc $num,$t1,$num
- add $t1,$inp,$t1
- $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
- $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
- $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
- bl Lsha2_block_private
- ; $inp equals to the intermediate end pointer here
- $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
- Lcross_page:
- li $t1,`16*$SZ/4`
- mtctr $t1
- ___
- if ($SZ==4 || $SIZE_T==8) { # r16-r20 are free to use as copy temporaries in this configuration
- $code.=<<___; # byte-wise copy of one input block to $sp+$LOCALS, four bytes per CTR iteration
- addi r20,$sp,$LOCALS ; aligned spot below the frame
- Lmemcpy:
- lbz r16,0($inp)
- lbz r17,1($inp)
- lbz r18,2($inp)
- lbz r19,3($inp)
- addi $inp,$inp,4
- stb r16,0(r20)
- stb r17,1(r20)
- stb r18,2(r20)
- stb r19,3(r20)
- addi r20,r20,4
- bdnz Lmemcpy
- ___
- } else { # 32-bit SHA-512: r16-r31 hold the hash state, so r8-r12 serve as temporaries instead
- $code.=<<___; # same copy loop with a different register set
- addi r12,$sp,$LOCALS ; aligned spot below the frame
- Lmemcpy:
- lbz r8,0($inp)
- lbz r9,1($inp)
- lbz r10,2($inp)
- lbz r11,3($inp)
- addi $inp,$inp,4
- stb r8,0(r12)
- stb r9,1(r12)
- stb r10,2(r12)
- stb r11,3(r12)
- addi r12,r12,4
- bdnz Lmemcpy
- ___
- }
- $code.=<<___; # hash the stack copy as a one-block input, restore the real inp/num, loop while bytes remain; then the epilogue restores r14-r31 and LR
- $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
- addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
- addi $inp,$sp,$LOCALS ; fictitious inp pointer
- $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
- $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
- $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
- bl Lsha2_block_private
- $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
- $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
- addic. $num,$num,`-16*$SZ` ; num--
- bne Lunaligned
- Ldone:
- $POP r0,`$FRAME+$LRSAVE`($sp)
- $POP r14,`$FRAME-$SIZE_T*18`($sp)
- $POP r15,`$FRAME-$SIZE_T*17`($sp)
- $POP r16,`$FRAME-$SIZE_T*16`($sp)
- $POP r17,`$FRAME-$SIZE_T*15`($sp)
- $POP r18,`$FRAME-$SIZE_T*14`($sp)
- $POP r19,`$FRAME-$SIZE_T*13`($sp)
- $POP r20,`$FRAME-$SIZE_T*12`($sp)
- $POP r21,`$FRAME-$SIZE_T*11`($sp)
- $POP r22,`$FRAME-$SIZE_T*10`($sp)
- $POP r23,`$FRAME-$SIZE_T*9`($sp)
- $POP r24,`$FRAME-$SIZE_T*8`($sp)
- $POP r25,`$FRAME-$SIZE_T*7`($sp)
- $POP r26,`$FRAME-$SIZE_T*6`($sp)
- $POP r27,`$FRAME-$SIZE_T*5`($sp)
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- mtlr r0
- addi $sp,$sp,$FRAME
- blr
- .long 0
- .byte 0,12,4,1,0x80,18,3,0
- .long 0
- ___
- if ($SZ==4 || $SIZE_T==8) { # one register per hash word: X[] lives in r16-r31
- $code.=<<___; # entry of the per-block routine; fetch the first table entry into $t1
- .align 4
- Lsha2_block_private:
- $LD $t1,0($Tbl)
- ___
- for($i=0;$i<16;$i++) { # rounds 0..15: load input word i into @X[$i], then emit the round
- $code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN); # 32-bit word, used as loaded
- lwz @X[$i],`$i*$SZ`($inp)
- ___
- $code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN); # 32-bit word, byte-reversed after the load
- lwz $a0,`$i*$SZ`($inp)
- rotlwi @X[$i],$a0,8
- rlwimi @X[$i],$a0,24,0,7
- rlwimi @X[$i],$a0,24,16,23
- ___
- # 64-bit loads are split to 2x32-bit ones, as CPU can't handle
- # unaligned 64-bit loads, only 32-bit ones...
- $code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN); # two 32-bit loads merged into one 64-bit register
- lwz $t0,`$i*$SZ`($inp)
- lwz @X[$i],`$i*$SZ+4`($inp)
- insrdi @X[$i],$t0,32,0
- ___
- $code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN); # as above, with each 32-bit half byte-reversed first
- lwz $a0,`$i*$SZ`($inp)
- lwz $a1,`$i*$SZ+4`($inp)
- rotlwi $t0,$a0,8
- rotlwi @X[$i],$a1,8
- rlwimi $t0,$a0,24,0,7
- rlwimi @X[$i],$a1,24,0,7
- rlwimi $t0,$a0,24,16,23
- rlwimi @X[$i],$a1,24,16,23
- insrdi @X[$i],$t0,32,0
- ___
- &ROUND_00_15($i,@V);
- unshift(@V,pop(@V)); # rotate roles: this round's h register is the next round's a
- }
- $code.=<<___; # CTR = number of remaining 16-round groups; each pass first advances $Tbl by 16 entries
- li $t0,`$rounds/16-1`
- mtctr $t0
- .align 4
- Lrounds:
- addi $Tbl,$Tbl,`16*$SZ`
- ___
- for(;$i<32;$i++) { # 16 unrolled rounds forming the body of the Lrounds loop
- &ROUND_16_xx($i,@V);
- unshift(@V,pop(@V)); # same role rotation as above
- }
- $code.=<<___; # after the last round: reload ctx/inp/end pointer, rewind $Tbl, add the working variables into the context, advance $inp and repeat until the end pointer
- bdnz Lrounds
- $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
- $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
- $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
- subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
- $LD r16,`0*$SZ`($ctx)
- $LD r17,`1*$SZ`($ctx)
- $LD r18,`2*$SZ`($ctx)
- $LD r19,`3*$SZ`($ctx)
- $LD r20,`4*$SZ`($ctx)
- $LD r21,`5*$SZ`($ctx)
- $LD r22,`6*$SZ`($ctx)
- addi $inp,$inp,`16*$SZ` ; advance inp
- $LD r23,`7*$SZ`($ctx)
- add $A,$A,r16
- add $B,$B,r17
- $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
- add $C,$C,r18
- $ST $A,`0*$SZ`($ctx)
- add $D,$D,r19
- $ST $B,`1*$SZ`($ctx)
- add $E,$E,r20
- $ST $C,`2*$SZ`($ctx)
- add $F,$F,r21
- $ST $D,`3*$SZ`($ctx)
- add $G,$G,r22
- $ST $E,`4*$SZ`($ctx)
- add $H,$H,r23
- $ST $F,`5*$SZ`($ctx)
- $ST $G,`6*$SZ`($ctx)
- $UCMP $inp,$num
- $ST $H,`7*$SZ`($ctx)
- bne Lsha2_block_private
- blr
- .long 0
- .byte 0,12,0x14,0,0,0,0,0
- .size $func,.-$func
- ___
- } else {
- ########################################################################
- # SHA512 for PPC32, X vector is off-loaded to stack...
- #
- # | sha512
- # | -m32
- # ----------------------+-----------------------
- # PPC74x0,gcc-4.0.1 | +48%
- # POWER6,gcc-4.4.6 | +124%(*)
- # POWER7,gcc-4.4.6 | +79%(*)
- # e300,gcc-4.1.0 | +167%
- #
- # (*) ~1/3 of -m64 result [and ~20% better than -m32 code generated
- # by xlc-12.1]
- my $XOFF=$LOCALS; # stack offset (from $sp) where x[] is saved and reloaded
- my @V=map("r$_",(16..31)); # A..H
- my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15)); # scratch registers; these lexicals shadow the outer $t0/$t1/$a0/$a1 inside this branch
- my ($x0,$x1)=("r3","r4"); # zaps $ctx and $inp
- sub ROUND_00_15_ppc32 { # append one SHA-512 round built from 32-bit instructions; expects x[i] in $t0 (low) and $t1 (high)
- my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
- $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_; # round index, then the high/low register of each of a..h
- $code.=<<___; # save x[i] to the stack; h += x[i] + K512[i] + Ch(e,f,g) + Sigma1(e); d += h; h += Maj(a,b,c); start Sigma0(a)
- lwz $t2,`$SZ*($i%16)+($LITTLE_ENDIAN^4)`($Tbl)
- xor $a0,$flo,$glo
- lwz $t3,`$SZ*($i%16)+($LITTLE_ENDIAN^0)`($Tbl)
- xor $a1,$fhi,$ghi
- addc $hlo,$hlo,$t0 ; h+=x[i]
- stw $t0,`$XOFF+0+$SZ*($i%16)`($sp) ; save x[i]
- srwi $s0,$elo,$Sigma1[0]
- srwi $s1,$ehi,$Sigma1[0]
- and $a0,$a0,$elo
- adde $hhi,$hhi,$t1
- and $a1,$a1,$ehi
- stw $t1,`$XOFF+4+$SZ*($i%16)`($sp)
- srwi $t0,$elo,$Sigma1[1]
- srwi $t1,$ehi,$Sigma1[1]
- addc $hlo,$hlo,$t2 ; h+=K512[i]
- insrwi $s0,$ehi,$Sigma1[0],0
- insrwi $s1,$elo,$Sigma1[0],0
- xor $a0,$a0,$glo ; Ch(e,f,g)
- adde $hhi,$hhi,$t3
- xor $a1,$a1,$ghi
- insrwi $t0,$ehi,$Sigma1[1],0
- insrwi $t1,$elo,$Sigma1[1],0
- addc $hlo,$hlo,$a0 ; h+=Ch(e,f,g)
- srwi $t2,$ehi,$Sigma1[2]-32
- srwi $t3,$elo,$Sigma1[2]-32
- xor $s0,$s0,$t0
- xor $s1,$s1,$t1
- insrwi $t2,$elo,$Sigma1[2]-32,0
- insrwi $t3,$ehi,$Sigma1[2]-32,0
- xor $a0,$alo,$blo ; a^b, b^c in next round
- adde $hhi,$hhi,$a1
- xor $a1,$ahi,$bhi
- xor $s0,$s0,$t2 ; Sigma1(e)
- xor $s1,$s1,$t3
- srwi $t0,$alo,$Sigma0[0]
- and $a2,$a2,$a0
- addc $hlo,$hlo,$s0 ; h+=Sigma1(e)
- and $a3,$a3,$a1
- srwi $t1,$ahi,$Sigma0[0]
- srwi $s0,$ahi,$Sigma0[1]-32
- adde $hhi,$hhi,$s1
- srwi $s1,$alo,$Sigma0[1]-32
- insrwi $t0,$ahi,$Sigma0[0],0
- insrwi $t1,$alo,$Sigma0[0],0
- xor $a2,$a2,$blo ; Maj(a,b,c)
- addc $dlo,$dlo,$hlo ; d+=h
- xor $a3,$a3,$bhi
- insrwi $s0,$alo,$Sigma0[1]-32,0
- insrwi $s1,$ahi,$Sigma0[1]-32,0
- adde $dhi,$dhi,$hhi
- srwi $t2,$ahi,$Sigma0[2]-32
- srwi $t3,$alo,$Sigma0[2]-32
- xor $s0,$s0,$t0
- addc $hlo,$hlo,$a2 ; h+=Maj(a,b,c)
- xor $s1,$s1,$t1
- insrwi $t2,$alo,$Sigma0[2]-32,0
- insrwi $t3,$ahi,$Sigma0[2]-32,0
- adde $hhi,$hhi,$a3
- ___
- $code.=<<___ if ($i>=15); # from round 15 on: reload x[(i+2)%16] from the stack, the sigma0 input of the next round
- lwz $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
- lwz $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
- ___
- $code.=<<___ if ($i<15 && !$LITTLE_ENDIAN); # rounds 0..14: fetch the next input word, high half into $t1, low half into $t0
- lwz $t1,`$SZ*($i+1)+0`($inp)
- lwz $t0,`$SZ*($i+1)+4`($inp)
- ___
- $code.=<<___ if ($i<15 && $LITTLE_ENDIAN); # same fetch, byte-reversing each 32-bit half
- lwz $a2,`$SZ*($i+1)+0`($inp)
- lwz $a3,`$SZ*($i+1)+4`($inp)
- rotlwi $t1,$a2,8
- rotlwi $t0,$a3,8
- rlwimi $t1,$a2,24,0,7
- rlwimi $t0,$a3,24,0,7
- rlwimi $t1,$a2,24,16,23
- rlwimi $t0,$a3,24,16,23
- ___
- $code.=<<___; # finish the round: h += Sigma0(a)
- xor $s0,$s0,$t2 ; Sigma0(a)
- xor $s1,$s1,$t3
- addc $hlo,$hlo,$s0 ; h+=Sigma0(a)
- adde $hhi,$hhi,$s1
- ___
- $code.=<<___ if ($i==15); # before the first scheduled round: load x[(i+1)%16] into $x0/$x1 for ROUND_16_xx_ppc32 to update
- lwz $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
- lwz $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
- ___
- }
- sub ROUND_16_xx_ppc32 { # rounds 16 and up for 32-bit SHA-512: compute the new x[i] in $x0/$x1, then emit the common round
- my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
- $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_; # same argument layout as ROUND_00_15_ppc32
- $code.=<<___; # on entry $t0/$t1 hold x[i+1] and $x0/$x1 hold x[i]; x[i+14] and x[i+9] are read from the stack
- srwi $s0,$t0,$sigma0[0]
- srwi $s1,$t1,$sigma0[0]
- srwi $t2,$t0,$sigma0[1]
- srwi $t3,$t1,$sigma0[1]
- insrwi $s0,$t1,$sigma0[0],0
- insrwi $s1,$t0,$sigma0[0],0
- srwi $a0,$t0,$sigma0[2]
- insrwi $t2,$t1,$sigma0[1],0
- insrwi $t3,$t0,$sigma0[1],0
- insrwi $a0,$t1,$sigma0[2],0
- xor $s0,$s0,$t2
- lwz $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
- srwi $a1,$t1,$sigma0[2]
- xor $s1,$s1,$t3
- lwz $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
- xor $a0,$a0,$s0
- srwi $s0,$t2,$sigma1[0]
- xor $a1,$a1,$s1
- srwi $s1,$t3,$sigma1[0]
- addc $x0,$x0,$a0 ; x[i]+=sigma0(x[i+1])
- srwi $a0,$t3,$sigma1[1]-32
- insrwi $s0,$t3,$sigma1[0],0
- insrwi $s1,$t2,$sigma1[0],0
- adde $x1,$x1,$a1
- srwi $a1,$t2,$sigma1[1]-32
- insrwi $a0,$t2,$sigma1[1]-32,0
- srwi $t2,$t2,$sigma1[2]
- insrwi $a1,$t3,$sigma1[1]-32,0
- insrwi $t2,$t3,$sigma1[2],0
- xor $s0,$s0,$a0
- lwz $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
- srwi $t3,$t3,$sigma1[2]
- xor $s1,$s1,$a1
- lwz $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
- xor $s0,$s0,$t2
- addc $x0,$x0,$a0 ; x[i]+=x[i+9]
- xor $s1,$s1,$t3
- adde $x1,$x1,$a1
- addc $x0,$x0,$s0 ; x[i]+=sigma1(x[i+14])
- adde $x1,$x1,$s1
- ___
- ($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1); # generation-time renaming: the new x[i] is now called $t0/$t1, and the registers holding x[i+1] become $x0/$x1 for the next round
- &ROUND_00_15_ppc32(@_); # called with the original index (>= 16), so it reloads x[(i+2)%16] from the stack
- }
- $code.=<<___; # entry of the per-block routine for SHA-512 with 32-bit registers
- .align 4
- Lsha2_block_private:
- ___
- $code.=<<___ if (!$LITTLE_ENDIAN); # load x[0] (high half into $t1, low half into $t0) and seed $a2/$a3 with b^c
- lwz $t1,0($inp)
- xor $a2,@V[3],@V[5] ; B^C, magic seed
- lwz $t0,4($inp)
- xor $a3,@V[2],@V[4]
- ___
- $code.=<<___ if ($LITTLE_ENDIAN); # same, byte-reversing each 32-bit half of x[0]
- lwz $a1,0($inp)
- xor $a2,@V[3],@V[5] ; B^C, magic seed
- lwz $a0,4($inp)
- xor $a3,@V[2],@V[4]
- rotlwi $t1,$a1,8
- rotlwi $t0,$a0,8
- rlwimi $t1,$a1,24,0,7
- rlwimi $t0,$a0,24,0,7
- rlwimi $t1,$a1,24,16,23
- rlwimi $t0,$a0,24,16,23
- ___
- for($i=0;$i<16;$i++) { # rounds 0..15
- &ROUND_00_15_ppc32($i,@V);
- unshift(@V,pop(@V)); unshift(@V,pop(@V)); # rotate by two registers: each of a..h is a high/low pair
- ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1); # a^b computed in this round is consumed as b^c by the next
- }
- $code.=<<___; # CTR = number of remaining 16-round groups; each pass first advances $Tbl by 16 entries
- li $a0,`$rounds/16-1`
- mtctr $a0
- .align 4
- Lrounds:
- addi $Tbl,$Tbl,`16*$SZ`
- ___
- for(;$i<32;$i++) { # 16 unrolled rounds forming the body of the Lrounds loop
- &ROUND_16_xx_ppc32($i,@V);
- unshift(@V,pop(@V)); unshift(@V,pop(@V)); # same pair rotation as above
- ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1); # same a^b / b^c hand-over as above
- }
- $code.=<<___; # after the last round: reload ctx/inp/end pointer, rewind $Tbl, add the working variables into the context with carry from low to high half, advance $inp and repeat until the end pointer
- bdnz Lrounds
- $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
- $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
- $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
- subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
- lwz $t0,`$LITTLE_ENDIAN^0`($ctx)
- lwz $t1,`$LITTLE_ENDIAN^4`($ctx)
- lwz $t2,`$LITTLE_ENDIAN^8`($ctx)
- lwz $t3,`$LITTLE_ENDIAN^12`($ctx)
- lwz $a0,`$LITTLE_ENDIAN^16`($ctx)
- lwz $a1,`$LITTLE_ENDIAN^20`($ctx)
- lwz $a2,`$LITTLE_ENDIAN^24`($ctx)
- addc @V[1],@V[1],$t1
- lwz $a3,`$LITTLE_ENDIAN^28`($ctx)
- adde @V[0],@V[0],$t0
- lwz $t0,`$LITTLE_ENDIAN^32`($ctx)
- addc @V[3],@V[3],$t3
- lwz $t1,`$LITTLE_ENDIAN^36`($ctx)
- adde @V[2],@V[2],$t2
- lwz $t2,`$LITTLE_ENDIAN^40`($ctx)
- addc @V[5],@V[5],$a1
- lwz $t3,`$LITTLE_ENDIAN^44`($ctx)
- adde @V[4],@V[4],$a0
- lwz $a0,`$LITTLE_ENDIAN^48`($ctx)
- addc @V[7],@V[7],$a3
- lwz $a1,`$LITTLE_ENDIAN^52`($ctx)
- adde @V[6],@V[6],$a2
- lwz $a2,`$LITTLE_ENDIAN^56`($ctx)
- addc @V[9],@V[9],$t1
- lwz $a3,`$LITTLE_ENDIAN^60`($ctx)
- adde @V[8],@V[8],$t0
- stw @V[0],`$LITTLE_ENDIAN^0`($ctx)
- stw @V[1],`$LITTLE_ENDIAN^4`($ctx)
- addc @V[11],@V[11],$t3
- stw @V[2],`$LITTLE_ENDIAN^8`($ctx)
- stw @V[3],`$LITTLE_ENDIAN^12`($ctx)
- adde @V[10],@V[10],$t2
- stw @V[4],`$LITTLE_ENDIAN^16`($ctx)
- stw @V[5],`$LITTLE_ENDIAN^20`($ctx)
- addc @V[13],@V[13],$a1
- stw @V[6],`$LITTLE_ENDIAN^24`($ctx)
- stw @V[7],`$LITTLE_ENDIAN^28`($ctx)
- adde @V[12],@V[12],$a0
- stw @V[8],`$LITTLE_ENDIAN^32`($ctx)
- stw @V[9],`$LITTLE_ENDIAN^36`($ctx)
- addc @V[15],@V[15],$a3
- stw @V[10],`$LITTLE_ENDIAN^40`($ctx)
- stw @V[11],`$LITTLE_ENDIAN^44`($ctx)
- adde @V[14],@V[14],$a2
- stw @V[12],`$LITTLE_ENDIAN^48`($ctx)
- stw @V[13],`$LITTLE_ENDIAN^52`($ctx)
- stw @V[14],`$LITTLE_ENDIAN^56`($ctx)
- stw @V[15],`$LITTLE_ENDIAN^60`($ctx)
- addi $inp,$inp,`16*$SZ` ; advance inp
- $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
- $UCMP $inp,$num
- bne Lsha2_block_private
- blr
- .long 0
- .byte 0,12,0x14,0,0,0,0,0
- .size $func,.-$func
- ___
- }
- # Ugly hack here, because PPC assembler syntax seems to vary too
- # much from platform to platform...
- $code.=<<___; # LPICmeup: branch-and-link to the next instruction to learn the current address, then add a fixed distance so $Tbl points at the constant table emitted right after
- .align 6
- LPICmeup:
- mflr r0
- bcl 20,31,\$+4
- mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
- addi $Tbl,$Tbl,`64-8`
- mtlr r0
- blr
- .long 0
- .byte 0,12,0x14,0,0,0,0,0
- .space `64-9*4`
- ___
- $code.=<<___ if ($SZ==8); # table of 80 64-bit round constants, emitted only for SHA-512; addressed through $Tbl
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- ___
- $code.=<<___ if ($SZ==4); # table of 64 32-bit round constants, emitted only for SHA-256; addressed through $Tbl
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- ___
- # Replace every `...` span in the generated text with the value of the Perl
- # expression it contains (frame offsets, rotate counts, table sizes), then
- # send the result through the translator pipe opened on STDOUT.
- $code =~ s/\`([^\`]*)\`/eval $1/gem;
- print $code;
- # Closing a pipe waits for the child; check it so that a write error or a
- # failing translator aborts the build instead of leaving bad output behind.
- close STDOUT or die "error closing STDOUT: $!";
|