sha1-ppc.pl 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. #! /usr/bin/env perl
  2. # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # I let hardware handle unaligned input(*), except on page boundaries
  15. # (see below for details). Otherwise straightforward implementation
  16. # with X vector in register bank.
  17. #
  18. # (*) this means that this module is inappropriate for PPC403? Does
  19. # anybody know if pre-POWER3 can sustain unaligned load?
#
#                       -m64    -m32
# ----------------------------------
# PPC970,gcc-4.0.0      +76%    +59%
# Power6,xlc-7          +68%    +33%
  24. $flavour = shift;
  25. if ($flavour =~ /64/) {
  26. $SIZE_T =8;
  27. $LRSAVE =2*$SIZE_T;
  28. $UCMP ="cmpld";
  29. $STU ="stdu";
  30. $POP ="ld";
  31. $PUSH ="std";
  32. } elsif ($flavour =~ /32/) {
  33. $SIZE_T =4;
  34. $LRSAVE =$SIZE_T;
  35. $UCMP ="cmplw";
  36. $STU ="stwu";
  37. $POP ="lwz";
  38. $PUSH ="stw";
  39. } else { die "nonsense $flavour"; }
  40. # Define endianness based on flavour
  41. # i.e.: linux64le
  42. $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
  43. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  44. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  45. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  46. die "can't locate ppc-xlate.pl";
  47. open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
  48. $FRAME=24*$SIZE_T+64;
  49. $LOCALS=6*$SIZE_T;
  50. $K ="r0";
  51. $sp ="r1";
  52. $toc="r2";
  53. $ctx="r3";
  54. $inp="r4";
  55. $num="r5";
  56. $t0 ="r15";
  57. $t1 ="r6";
  58. $A ="r7";
  59. $B ="r8";
  60. $C ="r9";
  61. $D ="r10";
  62. $E ="r11";
  63. $T ="r12";
  64. @V=($A,$B,$C,$D,$E,$T);
  65. @X=("r16","r17","r18","r19","r20","r21","r22","r23",
  66. "r24","r25","r26","r27","r28","r29","r30","r31");
  67. sub loadbe {
  68. my ($dst, $src, $temp_reg) = @_;
  69. $code.=<<___ if (!$LITTLE_ENDIAN);
  70. lwz $dst,$src
  71. ___
  72. $code.=<<___ if ($LITTLE_ENDIAN);
  73. lwz $temp_reg,$src
  74. rotlwi $dst,$temp_reg,8
  75. rlwimi $dst,$temp_reg,24,0,7
  76. rlwimi $dst,$temp_reg,24,16,23
  77. ___
  78. }
  79. sub BODY_00_19 {
  80. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  81. my $j=$i+1;
  82. # Since the last value of $f is discarded, we can use
  83. # it as a temp reg to swap byte-order when needed.
  84. loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
  85. loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
  86. $code.=<<___ if ($i<15);
  87. add $f,$K,$e
  88. rotlwi $e,$a,5
  89. add $f,$f,@X[$i]
  90. and $t0,$c,$b
  91. add $f,$f,$e
  92. andc $t1,$d,$b
  93. rotlwi $b,$b,30
  94. or $t0,$t0,$t1
  95. add $f,$f,$t0
  96. ___
  97. $code.=<<___ if ($i>=15);
  98. add $f,$K,$e
  99. rotlwi $e,$a,5
  100. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  101. add $f,$f,@X[$i%16]
  102. and $t0,$c,$b
  103. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  104. add $f,$f,$e
  105. andc $t1,$d,$b
  106. rotlwi $b,$b,30
  107. or $t0,$t0,$t1
  108. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  109. add $f,$f,$t0
  110. rotlwi @X[$j%16],@X[$j%16],1
  111. ___
  112. }
  113. sub BODY_20_39 {
  114. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  115. my $j=$i+1;
  116. $code.=<<___ if ($i<79);
  117. add $f,$K,$e
  118. xor $t0,$b,$d
  119. rotlwi $e,$a,5
  120. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  121. add $f,$f,@X[$i%16]
  122. xor $t0,$t0,$c
  123. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  124. add $f,$f,$t0
  125. rotlwi $b,$b,30
  126. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  127. add $f,$f,$e
  128. rotlwi @X[$j%16],@X[$j%16],1
  129. ___
  130. $code.=<<___ if ($i==79);
  131. add $f,$K,$e
  132. xor $t0,$b,$d
  133. rotlwi $e,$a,5
  134. lwz r16,0($ctx)
  135. add $f,$f,@X[$i%16]
  136. xor $t0,$t0,$c
  137. lwz r17,4($ctx)
  138. add $f,$f,$t0
  139. rotlwi $b,$b,30
  140. lwz r18,8($ctx)
  141. lwz r19,12($ctx)
  142. add $f,$f,$e
  143. lwz r20,16($ctx)
  144. ___
  145. }
  146. sub BODY_40_59 {
  147. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  148. my $j=$i+1;
  149. $code.=<<___;
  150. add $f,$K,$e
  151. rotlwi $e,$a,5
  152. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  153. add $f,$f,@X[$i%16]
  154. and $t0,$b,$c
  155. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  156. add $f,$f,$e
  157. or $t1,$b,$c
  158. rotlwi $b,$b,30
  159. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  160. and $t1,$t1,$d
  161. or $t0,$t0,$t1
  162. rotlwi @X[$j%16],@X[$j%16],1
  163. add $f,$f,$t0
  164. ___
  165. }
  166. $code=<<___;
  167. .machine "any"
  168. .text
  169. .globl .sha1_block_data_order
  170. .align 4
  171. .sha1_block_data_order:
  172. $STU $sp,-$FRAME($sp)
  173. mflr r0
  174. $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
  175. $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
  176. $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
  177. $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
  178. $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
  179. $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
  180. $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
  181. $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
  182. $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
  183. $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
  184. $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
  185. $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
  186. $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
  187. $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
  188. $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
  189. $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
  190. $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
  191. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  192. lwz $A,0($ctx)
  193. lwz $B,4($ctx)
  194. lwz $C,8($ctx)
  195. lwz $D,12($ctx)
  196. lwz $E,16($ctx)
  197. andi. r0,$inp,3
  198. bne Lunaligned
  199. Laligned:
  200. mtctr $num
  201. bl Lsha1_block_private
  202. b Ldone
  203. ; PowerPC specification allows an implementation to be ill-behaved
  204. ; upon unaligned access which crosses page boundary. "Better safe
  205. ; than sorry" principle makes me treat it specially. But I don't
  206. ; look for particular offending word, but rather for 64-byte input
  207. ; block which crosses the boundary. Once found that block is aligned
  208. ; and hashed separately...
  209. .align 4
  210. Lunaligned:
  211. subfic $t1,$inp,4096
  212. andi. $t1,$t1,4095 ; distance to closest page boundary
  213. srwi. $t1,$t1,6 ; t1/=64
  214. beq Lcross_page
  215. $UCMP $num,$t1
  216. ble Laligned ; didn't cross the page boundary
  217. mtctr $t1
  218. subfc $num,$t1,$num
  219. bl Lsha1_block_private
  220. Lcross_page:
  221. li $t1,16
  222. mtctr $t1
  223. addi r20,$sp,$LOCALS ; spot within the frame
  224. Lmemcpy:
  225. lbz r16,0($inp)
  226. lbz r17,1($inp)
  227. lbz r18,2($inp)
  228. lbz r19,3($inp)
  229. addi $inp,$inp,4
  230. stb r16,0(r20)
  231. stb r17,1(r20)
  232. stb r18,2(r20)
  233. stb r19,3(r20)
  234. addi r20,r20,4
  235. bdnz Lmemcpy
  236. $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
  237. li $t1,1
  238. addi $inp,$sp,$LOCALS
  239. mtctr $t1
  240. bl Lsha1_block_private
  241. $POP $inp,`$FRAME-$SIZE_T*18`($sp)
  242. addic. $num,$num,-1
  243. bne Lunaligned
  244. Ldone:
  245. $POP r0,`$FRAME+$LRSAVE`($sp)
  246. $POP r15,`$FRAME-$SIZE_T*17`($sp)
  247. $POP r16,`$FRAME-$SIZE_T*16`($sp)
  248. $POP r17,`$FRAME-$SIZE_T*15`($sp)
  249. $POP r18,`$FRAME-$SIZE_T*14`($sp)
  250. $POP r19,`$FRAME-$SIZE_T*13`($sp)
  251. $POP r20,`$FRAME-$SIZE_T*12`($sp)
  252. $POP r21,`$FRAME-$SIZE_T*11`($sp)
  253. $POP r22,`$FRAME-$SIZE_T*10`($sp)
  254. $POP r23,`$FRAME-$SIZE_T*9`($sp)
  255. $POP r24,`$FRAME-$SIZE_T*8`($sp)
  256. $POP r25,`$FRAME-$SIZE_T*7`($sp)
  257. $POP r26,`$FRAME-$SIZE_T*6`($sp)
  258. $POP r27,`$FRAME-$SIZE_T*5`($sp)
  259. $POP r28,`$FRAME-$SIZE_T*4`($sp)
  260. $POP r29,`$FRAME-$SIZE_T*3`($sp)
  261. $POP r30,`$FRAME-$SIZE_T*2`($sp)
  262. $POP r31,`$FRAME-$SIZE_T*1`($sp)
  263. mtlr r0
  264. addi $sp,$sp,$FRAME
  265. blr
  266. .long 0
  267. .byte 0,12,4,1,0x80,18,3,0
  268. .long 0
  269. ___
  270. # This is private block function, which uses tailored calling
  271. # interface, namely upon entry SHA_CTX is pre-loaded to given
  272. # registers and counter register contains amount of chunks to
  273. # digest...
  274. $code.=<<___;
  275. .align 4
  276. Lsha1_block_private:
  277. ___
  278. $code.=<<___; # load K_00_19
  279. lis $K,0x5a82
  280. ori $K,$K,0x7999
  281. ___
  282. for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  283. $code.=<<___; # load K_20_39
  284. lis $K,0x6ed9
  285. ori $K,$K,0xeba1
  286. ___
  287. for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  288. $code.=<<___; # load K_40_59
  289. lis $K,0x8f1b
  290. ori $K,$K,0xbcdc
  291. ___
  292. for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  293. $code.=<<___; # load K_60_79
  294. lis $K,0xca62
  295. ori $K,$K,0xc1d6
  296. ___
  297. for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  298. $code.=<<___;
  299. add r16,r16,$E
  300. add r17,r17,$T
  301. add r18,r18,$A
  302. add r19,r19,$B
  303. add r20,r20,$C
  304. stw r16,0($ctx)
  305. mr $A,r16
  306. stw r17,4($ctx)
  307. mr $B,r17
  308. stw r18,8($ctx)
  309. mr $C,r18
  310. stw r19,12($ctx)
  311. mr $D,r19
  312. stw r20,16($ctx)
  313. mr $E,r20
  314. addi $inp,$inp,`16*4`
  315. bdnz Lsha1_block_private
  316. blr
  317. .long 0
  318. .byte 0,12,0x14,0,0,0,0,0
  319. .size .sha1_block_data_order,.-.sha1_block_data_order
  320. ___
  321. $code.=<<___;
  322. .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
  323. ___
  324. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  325. print $code;
  326. close STDOUT;