2
0

sha1-ppc.pl 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise it is a straightforward
# implementation with the X vector kept in the register bank. The module
# is big-endian [which is not a big deal as there are no little-endian
# targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?
#                       -m64    -m32
# ----------------------------------
# PPC970,gcc-4.0.0      +76%    +59%
# Power6,xlc-7          +68%    +33%
  19. $flavour = shift;
  20. if ($flavour =~ /64/) {
  21. $SIZE_T =8;
  22. $LRSAVE =2*$SIZE_T;
  23. $UCMP ="cmpld";
  24. $STU ="stdu";
  25. $POP ="ld";
  26. $PUSH ="std";
  27. } elsif ($flavour =~ /32/) {
  28. $SIZE_T =4;
  29. $LRSAVE =$SIZE_T;
  30. $UCMP ="cmplw";
  31. $STU ="stwu";
  32. $POP ="lwz";
  33. $PUSH ="stw";
  34. } else { die "nonsense $flavour"; }
  35. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  36. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  37. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  38. die "can't locate ppc-xlate.pl";
  39. open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
  40. $FRAME=24*$SIZE_T+64;
  41. $LOCALS=6*$SIZE_T;
  42. $K ="r0";
  43. $sp ="r1";
  44. $toc="r2";
  45. $ctx="r3";
  46. $inp="r4";
  47. $num="r5";
  48. $t0 ="r15";
  49. $t1 ="r6";
  50. $A ="r7";
  51. $B ="r8";
  52. $C ="r9";
  53. $D ="r10";
  54. $E ="r11";
  55. $T ="r12";
  56. @V=($A,$B,$C,$D,$E,$T);
  57. @X=("r16","r17","r18","r19","r20","r21","r22","r23",
  58. "r24","r25","r26","r27","r28","r29","r30","r31");
  59. sub BODY_00_19 {
  60. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  61. my $j=$i+1;
  62. $code.=<<___ if ($i==0);
  63. lwz @X[$i],`$i*4`($inp)
  64. ___
  65. $code.=<<___ if ($i<15);
  66. lwz @X[$j],`$j*4`($inp)
  67. add $f,$K,$e
  68. rotlwi $e,$a,5
  69. add $f,$f,@X[$i]
  70. and $t0,$c,$b
  71. add $f,$f,$e
  72. andc $t1,$d,$b
  73. rotlwi $b,$b,30
  74. or $t0,$t0,$t1
  75. add $f,$f,$t0
  76. ___
  77. $code.=<<___ if ($i>=15);
  78. add $f,$K,$e
  79. rotlwi $e,$a,5
  80. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  81. add $f,$f,@X[$i%16]
  82. and $t0,$c,$b
  83. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  84. add $f,$f,$e
  85. andc $t1,$d,$b
  86. rotlwi $b,$b,30
  87. or $t0,$t0,$t1
  88. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  89. add $f,$f,$t0
  90. rotlwi @X[$j%16],@X[$j%16],1
  91. ___
  92. }
  93. sub BODY_20_39 {
  94. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  95. my $j=$i+1;
  96. $code.=<<___ if ($i<79);
  97. add $f,$K,$e
  98. rotlwi $e,$a,5
  99. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  100. add $f,$f,@X[$i%16]
  101. xor $t0,$b,$c
  102. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  103. add $f,$f,$e
  104. rotlwi $b,$b,30
  105. xor $t0,$t0,$d
  106. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  107. add $f,$f,$t0
  108. rotlwi @X[$j%16],@X[$j%16],1
  109. ___
  110. $code.=<<___ if ($i==79);
  111. add $f,$K,$e
  112. rotlwi $e,$a,5
  113. lwz r16,0($ctx)
  114. add $f,$f,@X[$i%16]
  115. xor $t0,$b,$c
  116. lwz r17,4($ctx)
  117. add $f,$f,$e
  118. rotlwi $b,$b,30
  119. lwz r18,8($ctx)
  120. xor $t0,$t0,$d
  121. lwz r19,12($ctx)
  122. add $f,$f,$t0
  123. lwz r20,16($ctx)
  124. ___
  125. }
  126. sub BODY_40_59 {
  127. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  128. my $j=$i+1;
  129. $code.=<<___;
  130. add $f,$K,$e
  131. rotlwi $e,$a,5
  132. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  133. add $f,$f,@X[$i%16]
  134. and $t0,$b,$c
  135. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  136. add $f,$f,$e
  137. or $t1,$b,$c
  138. rotlwi $b,$b,30
  139. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  140. and $t1,$t1,$d
  141. or $t0,$t0,$t1
  142. rotlwi @X[$j%16],@X[$j%16],1
  143. add $f,$f,$t0
  144. ___
  145. }
# Public entry point, .sha1_block_data_order. On entry $ctx points at the
# five 32-bit state words, $inp at the input and $num holds the number of
# 64-byte blocks (it is moved to the count register, which the private
# block function decrements once per block).
#
# Prologue: allocate $FRAME bytes, save r15..r31 in the top 17 slots of
# the frame and the link register at $FRAME+$LRSAVE, then load the state
# into $A..$E.
#
# Dispatch:
#  - word-aligned input (low two bits of $inp clear): hash all $num blocks
#    in a single call to Lsha1_block_private;
#  - unaligned input: compute the number of whole 64-byte blocks before
#    the next 4096-byte boundary. If all remaining blocks fit, take the
#    aligned path anyway. Otherwise hash the blocks that fit, copy the
#    block that straddles the boundary byte by byte into the 64-byte
#    buffer at $sp+$LOCALS (16 iterations of 4 bytes), hash that copy as
#    a single block with $inp temporarily pointing at the buffer, restore
#    $inp, decrement $num and repeat until $num reaches zero.
#
# Epilogue (Ldone): restore the link register and r15..r31, release the
# frame and return.
$code=<<___;
.machine "any"
.text
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
lwz $D,12($ctx)
lwz $E,16($ctx)
andi. r0,$inp,3
bne Lunaligned
Laligned:
mtctr $num
bl Lsha1_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
  250. # This is private block function, which uses tailored calling
  251. # interface, namely upon entry SHA_CTX is pre-loaded to given
  252. # registers and counter register contains amount of chunks to
  253. # digest...
  254. $code.=<<___;
  255. .align 4
  256. Lsha1_block_private:
  257. ___
  258. $code.=<<___; # load K_00_19
  259. lis $K,0x5a82
  260. ori $K,$K,0x7999
  261. ___
  262. for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  263. $code.=<<___; # load K_20_39
  264. lis $K,0x6ed9
  265. ori $K,$K,0xeba1
  266. ___
  267. for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  268. $code.=<<___; # load K_40_59
  269. lis $K,0x8f1b
  270. ori $K,$K,0xbcdc
  271. ___
  272. for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  273. $code.=<<___; # load K_60_79
  274. lis $K,0xca62
  275. ori $K,$K,0xc1d6
  276. ___
  277. for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  278. $code.=<<___;
  279. add r16,r16,$E
  280. add r17,r17,$T
  281. add r18,r18,$A
  282. add r19,r19,$B
  283. add r20,r20,$C
  284. stw r16,0($ctx)
  285. mr $A,r16
  286. stw r17,4($ctx)
  287. mr $B,r17
  288. stw r18,8($ctx)
  289. mr $C,r18
  290. stw r19,12($ctx)
  291. mr $D,r19
  292. stw r20,16($ctx)
  293. mr $E,r20
  294. addi $inp,$inp,`16*4`
  295. bdnz Lsha1_block_private
  296. blr
  297. .long 0
  298. .byte 0,12,0x14,0,0,0,0,0
  299. ___
  300. $code.=<<___;
  301. .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
  302. ___
  303. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  304. print $code;
  305. close STDOUT;