sha1-ppc.pl 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. #! /usr/bin/env perl
  2. # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # I let hardware handle unaligned input(*), except on page boundaries
  15. # (see below for details). Otherwise straightforward implementation
  16. # with X vector in register bank.
  17. #
  18. # (*) this means that this module is inappropriate for PPC403? Does
  19. # anybody know if pre-POWER3 can sustain unaligned load?
#                       -m64    -m32
# ----------------------------------
# PPC970,gcc-4.0.0      +76%    +59%
# Power6,xlc-7          +68%    +33%
  24. # $output is the last argument if it looks like a file (it has an extension)
  25. # $flavour is the first argument if it doesn't look like a file
  26. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  27. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  28. if ($flavour =~ /64/) {
  29. $SIZE_T =8;
  30. $LRSAVE =2*$SIZE_T;
  31. $UCMP ="cmpld";
  32. $STU ="stdu";
  33. $POP ="ld";
  34. $PUSH ="std";
  35. } elsif ($flavour =~ /32/) {
  36. $SIZE_T =4;
  37. $LRSAVE =$SIZE_T;
  38. $UCMP ="cmplw";
  39. $STU ="stwu";
  40. $POP ="lwz";
  41. $PUSH ="stw";
  42. } else { die "nonsense $flavour"; }
  43. # Define endianness based on flavour
  44. # i.e.: linux64le
  45. $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
  46. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  47. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  48. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  49. die "can't locate ppc-xlate.pl";
  50. open STDOUT,"| $^X $xlate $flavour \"$output\""
  51. or die "can't call $xlate: $!";
  52. $FRAME=24*$SIZE_T+64;
  53. $LOCALS=6*$SIZE_T;
  54. $K ="r0";
  55. $sp ="r1";
  56. $toc="r2";
  57. $ctx="r3";
  58. $inp="r4";
  59. $num="r5";
  60. $t0 ="r15";
  61. $t1 ="r6";
  62. $A ="r7";
  63. $B ="r8";
  64. $C ="r9";
  65. $D ="r10";
  66. $E ="r11";
  67. $T ="r12";
  68. @V=($A,$B,$C,$D,$E,$T);
  69. @X=("r16","r17","r18","r19","r20","r21","r22","r23",
  70. "r24","r25","r26","r27","r28","r29","r30","r31");
  71. sub loadbe {
  72. my ($dst, $src, $temp_reg) = @_;
  73. $code.=<<___ if (!$LITTLE_ENDIAN);
  74. lwz $dst,$src
  75. ___
  76. $code.=<<___ if ($LITTLE_ENDIAN);
  77. lwz $temp_reg,$src
  78. rotlwi $dst,$temp_reg,8
  79. rlwimi $dst,$temp_reg,24,0,7
  80. rlwimi $dst,$temp_reg,24,16,23
  81. ___
  82. }
  83. sub BODY_00_19 {
  84. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  85. my $j=$i+1;
  86. # Since the last value of $f is discarded, we can use
  87. # it as a temp reg to swap byte-order when needed.
  88. loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
  89. loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
  90. $code.=<<___ if ($i<15);
  91. add $f,$K,$e
  92. rotlwi $e,$a,5
  93. add $f,$f,@X[$i]
  94. and $t0,$c,$b
  95. add $f,$f,$e
  96. andc $t1,$d,$b
  97. rotlwi $b,$b,30
  98. or $t0,$t0,$t1
  99. add $f,$f,$t0
  100. ___
  101. $code.=<<___ if ($i>=15);
  102. add $f,$K,$e
  103. rotlwi $e,$a,5
  104. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  105. add $f,$f,@X[$i%16]
  106. and $t0,$c,$b
  107. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  108. add $f,$f,$e
  109. andc $t1,$d,$b
  110. rotlwi $b,$b,30
  111. or $t0,$t0,$t1
  112. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  113. add $f,$f,$t0
  114. rotlwi @X[$j%16],@X[$j%16],1
  115. ___
  116. }
  117. sub BODY_20_39 {
  118. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  119. my $j=$i+1;
  120. $code.=<<___ if ($i<79);
  121. add $f,$K,$e
  122. xor $t0,$b,$d
  123. rotlwi $e,$a,5
  124. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  125. add $f,$f,@X[$i%16]
  126. xor $t0,$t0,$c
  127. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  128. add $f,$f,$t0
  129. rotlwi $b,$b,30
  130. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  131. add $f,$f,$e
  132. rotlwi @X[$j%16],@X[$j%16],1
  133. ___
  134. $code.=<<___ if ($i==79);
  135. add $f,$K,$e
  136. xor $t0,$b,$d
  137. rotlwi $e,$a,5
  138. lwz r16,0($ctx)
  139. add $f,$f,@X[$i%16]
  140. xor $t0,$t0,$c
  141. lwz r17,4($ctx)
  142. add $f,$f,$t0
  143. rotlwi $b,$b,30
  144. lwz r18,8($ctx)
  145. lwz r19,12($ctx)
  146. add $f,$f,$e
  147. lwz r20,16($ctx)
  148. ___
  149. }
  150. sub BODY_40_59 {
  151. my ($i,$a,$b,$c,$d,$e,$f)=@_;
  152. my $j=$i+1;
  153. $code.=<<___;
  154. add $f,$K,$e
  155. rotlwi $e,$a,5
  156. xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
  157. add $f,$f,@X[$i%16]
  158. and $t0,$b,$c
  159. xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
  160. add $f,$f,$e
  161. or $t1,$b,$c
  162. rotlwi $b,$b,30
  163. xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
  164. and $t1,$t1,$d
  165. or $t0,$t0,$t1
  166. rotlwi @X[$j%16],@X[$j%16],1
  167. add $f,$f,$t0
  168. ___
  169. }
  170. $code=<<___;
  171. .machine "any"
  172. .text
  173. .globl .sha1_block_data_order
  174. .align 4
  175. .sha1_block_data_order:
  176. $STU $sp,-$FRAME($sp)
  177. mflr r0
  178. $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
  179. $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
  180. $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
  181. $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
  182. $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
  183. $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
  184. $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
  185. $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
  186. $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
  187. $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
  188. $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
  189. $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
  190. $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
  191. $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
  192. $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
  193. $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
  194. $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
  195. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  196. lwz $A,0($ctx)
  197. lwz $B,4($ctx)
  198. lwz $C,8($ctx)
  199. lwz $D,12($ctx)
  200. lwz $E,16($ctx)
  201. andi. r0,$inp,3
  202. bne Lunaligned
  203. Laligned:
  204. mtctr $num
  205. bl Lsha1_block_private
  206. b Ldone
  207. ; PowerPC specification allows an implementation to be ill-behaved
  208. ; upon unaligned access which crosses page boundary. "Better safe
  209. ; than sorry" principle makes me treat it specially. But I don't
  210. ; look for particular offending word, but rather for 64-byte input
  211. ; block which crosses the boundary. Once found that block is aligned
  212. ; and hashed separately...
  213. .align 4
  214. Lunaligned:
  215. subfic $t1,$inp,4096
  216. andi. $t1,$t1,4095 ; distance to closest page boundary
  217. srwi. $t1,$t1,6 ; t1/=64
  218. beq Lcross_page
  219. $UCMP $num,$t1
  220. ble Laligned ; didn't cross the page boundary
  221. mtctr $t1
  222. subfc $num,$t1,$num
  223. bl Lsha1_block_private
  224. Lcross_page:
  225. li $t1,16
  226. mtctr $t1
  227. addi r20,$sp,$LOCALS ; spot within the frame
  228. Lmemcpy:
  229. lbz r16,0($inp)
  230. lbz r17,1($inp)
  231. lbz r18,2($inp)
  232. lbz r19,3($inp)
  233. addi $inp,$inp,4
  234. stb r16,0(r20)
  235. stb r17,1(r20)
  236. stb r18,2(r20)
  237. stb r19,3(r20)
  238. addi r20,r20,4
  239. bdnz Lmemcpy
  240. $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
  241. li $t1,1
  242. addi $inp,$sp,$LOCALS
  243. mtctr $t1
  244. bl Lsha1_block_private
  245. $POP $inp,`$FRAME-$SIZE_T*18`($sp)
  246. addic. $num,$num,-1
  247. bne Lunaligned
  248. Ldone:
  249. $POP r0,`$FRAME+$LRSAVE`($sp)
  250. $POP r15,`$FRAME-$SIZE_T*17`($sp)
  251. $POP r16,`$FRAME-$SIZE_T*16`($sp)
  252. $POP r17,`$FRAME-$SIZE_T*15`($sp)
  253. $POP r18,`$FRAME-$SIZE_T*14`($sp)
  254. $POP r19,`$FRAME-$SIZE_T*13`($sp)
  255. $POP r20,`$FRAME-$SIZE_T*12`($sp)
  256. $POP r21,`$FRAME-$SIZE_T*11`($sp)
  257. $POP r22,`$FRAME-$SIZE_T*10`($sp)
  258. $POP r23,`$FRAME-$SIZE_T*9`($sp)
  259. $POP r24,`$FRAME-$SIZE_T*8`($sp)
  260. $POP r25,`$FRAME-$SIZE_T*7`($sp)
  261. $POP r26,`$FRAME-$SIZE_T*6`($sp)
  262. $POP r27,`$FRAME-$SIZE_T*5`($sp)
  263. $POP r28,`$FRAME-$SIZE_T*4`($sp)
  264. $POP r29,`$FRAME-$SIZE_T*3`($sp)
  265. $POP r30,`$FRAME-$SIZE_T*2`($sp)
  266. $POP r31,`$FRAME-$SIZE_T*1`($sp)
  267. mtlr r0
  268. addi $sp,$sp,$FRAME
  269. blr
  270. .long 0
  271. .byte 0,12,4,1,0x80,18,3,0
  272. .long 0
  273. ___
  274. # This is private block function, which uses tailored calling
  275. # interface, namely upon entry SHA_CTX is pre-loaded to given
  276. # registers and counter register contains amount of chunks to
  277. # digest...
  278. $code.=<<___;
  279. .align 4
  280. Lsha1_block_private:
  281. ___
  282. $code.=<<___; # load K_00_19
  283. lis $K,0x5a82
  284. ori $K,$K,0x7999
  285. ___
  286. for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  287. $code.=<<___; # load K_20_39
  288. lis $K,0x6ed9
  289. ori $K,$K,0xeba1
  290. ___
  291. for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  292. $code.=<<___; # load K_40_59
  293. lis $K,0x8f1b
  294. ori $K,$K,0xbcdc
  295. ___
  296. for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  297. $code.=<<___; # load K_60_79
  298. lis $K,0xca62
  299. ori $K,$K,0xc1d6
  300. ___
  301. for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  302. $code.=<<___;
  303. add r16,r16,$E
  304. add r17,r17,$T
  305. add r18,r18,$A
  306. add r19,r19,$B
  307. add r20,r20,$C
  308. stw r16,0($ctx)
  309. mr $A,r16
  310. stw r17,4($ctx)
  311. mr $B,r17
  312. stw r18,8($ctx)
  313. mr $C,r18
  314. stw r19,12($ctx)
  315. mr $D,r19
  316. stw r20,16($ctx)
  317. mr $E,r20
  318. addi $inp,$inp,`16*4`
  319. bdnz Lsha1_block_private
  320. blr
  321. .long 0
  322. .byte 0,12,0x14,0,0,0,0,0
  323. .size .sha1_block_data_order,.-.sha1_block_data_order
  324. ___
  325. $code.=<<___;
  326. .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
  327. ___
  328. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  329. print $code;
  330. close STDOUT;