sha1-mips.pl 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # SHA1 block procedure for MIPS.
  9. # Performance improvement is 30% on unaligned input. The "secret" is
  10. # to deploy lwl/lwr pair to load unaligned input. One could have
  11. # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
  12. # compatible subroutine. There is room for minor optimization on
  13. # little-endian platforms...
  14. ######################################################################
  # Several MIPS ABIs are in use; O32 and N32/64 are the most widespread,
  # and NUBI is a newer contender. Code written against the NUBI register
  # layout can be arranged in an ABI-neutral manner, so this module sticks
  # to the NUBI names:
  #
  ($zero, $at, $t0, $t1, $t2) = map { '$' . $_ } (0, 1, 2, 24, 25);
  ($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { '$' . $_ } (4 .. 11);
  ($s0, $s1, $s2, $s3, $s4,  $s5,
   $s6, $s7, $s8, $s9, $s10, $s11) = map { '$' . $_ } (12 .. 23);
  ($gp, $tp, $sp, $fp, $ra) = map { '$' . $_ } (3, 28 .. 31);
  #
  # The return value is placed in $a0. The following coding rules
  # facilitate interoperability:
  #
  # - never ever touch $tp, the "thread pointer" (formerly $gp);
  # - copy the return value to $t0, formerly $v0 [or to $a0 if you are
  #   adapting old code];
  # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary.
  #
  # For reference, this is the register layout of the N32/64 MIPS ABIs:
  #
  # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  #
  # Target ABI flavour, taken from the first command-line argument and
  # defaulting to "o32"; supported flavours are o32,n32,64,nubi32,nubi64.
  $flavour = shift || "o32";

  # Mnemonics for pointer arithmetic and for register save/restore, plus
  # the size in bytes of one register slot. Every flavour matching
  # /64|n32/i gets the 64-bit forms (the d-prefixed pointer operations
  # incidentally work even on n32); all other flavours get the 32-bit ones.
  ($PTR_ADD, $PTR_SUB, $PTR_SLL, $REG_S, $REG_L, $SZREG) =
      ($flavour =~ /64|n32/i)
          ? ("dadd", "dsub", "dsll", "sd", "ld", 8)
          : ("add",  "sub",  "sll",  "sw", "lw", 4);
  57. #
  58. # <appro@openssl.org>
  59. #
  60. ######################################################################
  # Target endianness. When CC is set in the environment the word MIPSEL is
  # piped through "$CC -E -": if the preprocessor leaves the token intact
  # the target is taken to be big-endian, otherwise little-endian. The probe
  # is skipped when CC is unset; running it with an empty command always
  # produced 0 and made the fallback below unreachable.
  $big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});

  # The last command-line argument that looks like "name.ext" names the
  # output file. STDOUT is redirected only when such an argument exists
  # (re-opening it on an empty name would just close it and lose the
  # output), and a failure to open the file is fatal.
  for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
  if (defined($output)) {
      open STDOUT,">",$output or die "can't open $output: $!";
  }

  # Fallback when no compiler was probed: use the byte order of the
  # machine running this script.
  if (!defined($big_endian))
      { $big_endian=(unpack('L',pack('N',1))==1); }

  # offsets of the Most and Least Significant Bytes
  $MSB=$big_endian?0:3;
  $LSB=3&~$MSB;
  # Sixteen registers, $8..$23, hold the message schedule (a4-a7,s0-s11).
  @X = map { '$' . $_ } (8 .. 23);

  # Incoming arguments.
  ($ctx, $inp, $num) = ($a0, $a1, $a2);

  # The five SHA-1 working variables; @V lists them in a,b,c,d,e order.
  ($A, $B, $C, $D, $E) = map { '$' . $_ } (1, 2, 3, 7, 24);
  @V = ($A, $B, $C, $D, $E);

  # Scratch registers and the round constant. $t1 reuses the register of
  # $num, which is why $num is offloaded to the stack; $t2 is fp and $K
  # is ra.
  $t0 = '$25';
  $t1 = $num;
  $t2 = '$30';
  $K  = '$31';
  # BODY_00_14(round, a, b, c, d, e) -- append the assembly for one of the
  # rounds that consume the input words directly.
  #
  # round        index of the round, also of the message word it consumes
  # a,b,c,d,e    names of the registers currently playing those roles
  #
  # On little-endian targets the current word is byte-swapped first. The
  # round itself adds to e: the constant in $K, a rotated left by 5 (built
  # from the sll 5 / srl 27 pair), ((c ^ d) & b) ^ d, and the current word.
  # b is rotated left by 30 in place. The unaligned lwl/lwr load of the next
  # input word is interleaved with the round. Output is appended to the
  # global $code.
  sub BODY_00_14 {
      my ($round, $va, $vb, $vc, $vd, $ve) = @_;
      my $next = $round + 1;
      my ($w, $w_next) = @X[$round, $next];

      unless ($big_endian) {
          $code .= <<___;
srl $t0,$w,24 # byte swap($round)
srl $t1,$w,8
andi $t2,$w,0xFF00
sll $w,$w,24
andi $t1,0xFF00
sll $t2,$t2,8
or $w,$t0
or $t1,$t2
or $w,$t1
___
      }

      $code .= <<___;
lwl $w_next,$next*4+$MSB($inp)
sll $t0,$va,5 # $round
addu $ve,$K
lwr $w_next,$next*4+$LSB($inp)
srl $t1,$va,27
addu $ve,$t0
xor $t0,$vc,$vd
addu $ve,$t1
sll $t2,$vb,30
and $t0,$vb
srl $vb,$vb,2
xor $t0,$vd
addu $ve,$w
or $vb,$t2
addu $ve,$t0
___
  }
  # BODY_15_19(round, a, b, c, d, e) -- append the assembly for one round
  # that uses the same ((c ^ d) & b) ^ d function as BODY_00_14 but derives
  # the next schedule word instead of loading it.
  #
  # round        index of the round
  # a,b,c,d,e    names of the registers currently playing those roles
  #
  # The schedule is a 16-entry ring in @X. The word for round+1 is XORed
  # with the entries 2, 8 and 13 positions further on and then rotated left
  # by one bit (srl 31 / addu to itself / or). On little-endian targets
  # round 15 additionally byte-swaps its own word first. Output is appended
  # to the global $code.
  sub BODY_15_19 {
      my ($round, $va, $vb, $vc, $vd, $ve) = @_;
      my $next = $round + 1;
      my $w_cur = $X[$round % 16];
      my $w_new = $X[$next % 16];
      my ($w2, $w8, $w13) = @X[($next + 2) % 16, ($next + 8) % 16, ($next + 13) % 16];

      if (!$big_endian && $round == 15) {
          $code .= <<___;
srl $t0,$w_cur,24 # byte swap($round)
srl $t1,$w_cur,8
andi $t2,$w_cur,0xFF00
sll $w_cur,$w_cur,24
andi $t1,0xFF00
sll $t2,$t2,8
or $w_cur,$t0
or $w_cur,$t1
or $w_cur,$t2
___
      }

      $code .= <<___;
xor $w_new,$w2
sll $t0,$va,5 # $round
addu $ve,$K
srl $t1,$va,27
addu $ve,$t0
xor $w_new,$w8
xor $t0,$vc,$vd
addu $ve,$t1
xor $w_new,$w13
sll $t2,$vb,30
and $t0,$vb
srl $t1,$w_new,31
addu $w_new,$w_new
srl $vb,$vb,2
xor $t0,$vd
or $w_new,$t1
addu $ve,$w_cur
or $vb,$t2
addu $ve,$t0
___
  }
  # BODY_20_39(round, a, b, c, d, e) -- append the assembly for one round
  # whose boolean function is b ^ c ^ d.
  #
  # round        index of the round
  # a,b,c,d,e    names of the registers currently playing those roles
  #
  # For every round below 79 the next schedule word is derived in the 16-
  # entry ring @X (XOR with the entries 2, 8 and 13 positions further on,
  # then rotate left by one bit). Round 79 has no next word to derive; its
  # free slots are used to load the five context words from $ctx into
  # @X[0..4] instead. Output is appended to the global $code.
  sub BODY_20_39 {
      my ($round, $va, $vb, $vc, $vd, $ve) = @_;
      my $next = $round + 1;
      my $w_cur = $X[$round % 16];
      my $w_new = $X[$next % 16];
      my ($w2, $w8, $w13) = @X[($next + 2) % 16, ($next + 8) % 16, ($next + 13) % 16];

      if ($round < 79) {
          $code .= <<___;
xor $w_new,$w2
sll $t0,$va,5 # $round
addu $ve,$K
srl $t1,$va,27
addu $ve,$t0
xor $w_new,$w8
xor $t0,$vc,$vd
addu $ve,$t1
xor $w_new,$w13
sll $t2,$vb,30
xor $t0,$vb
srl $t1,$w_new,31
addu $w_new,$w_new
srl $vb,$vb,2
addu $ve,$w_cur
or $w_new,$t1
or $vb,$t2
addu $ve,$t0
___
      }
      elsif ($round == 79) {
          $code .= <<___;
lw $X[0],0($ctx)
sll $t0,$va,5 # $round
addu $ve,$K
lw $X[1],4($ctx)
srl $t1,$va,27
addu $ve,$t0
lw $X[2],8($ctx)
xor $t0,$vc,$vd
addu $ve,$t1
lw $X[3],12($ctx)
sll $t2,$vb,30
xor $t0,$vb
lw $X[4],16($ctx)
srl $vb,$vb,2
addu $ve,$w_cur
or $vb,$t2
addu $ve,$t0
___
      }
  }
  # BODY_40_59(round, a, b, c, d, e) -- append the assembly for one round
  # whose boolean function is added to e in two parts: (c & d) first, then
  # ((c ^ d) & b).
  #
  # round        index of the round
  # a,b,c,d,e    names of the registers currently playing those roles
  #
  # The next schedule word is derived in the 16-entry ring @X exactly as in
  # the other rounds (XOR with the entries 2, 8 and 13 positions further
  # on, then rotate left by one bit). Nothing is emitted for a round index
  # of 79 or above. Output is appended to the global $code.
  sub BODY_40_59 {
      my ($round, $va, $vb, $vc, $vd, $ve) = @_;
      my $next = $round + 1;
      my $w_cur = $X[$round % 16];
      my $w_new = $X[$next % 16];
      my ($w2, $w8, $w13) = @X[($next + 2) % 16, ($next + 8) % 16, ($next + 13) % 16];

      if ($round < 79) {
          $code .= <<___;
xor $w_new,$w2
sll $t0,$va,5 # $round
addu $ve,$K
srl $t1,$va,27
addu $ve,$t0
xor $w_new,$w8
and $t0,$vc,$vd
addu $ve,$t1
xor $w_new,$w13
sll $t2,$vb,30
addu $ve,$t0
srl $t1,$w_new,31
xor $t0,$vc,$vd
addu $w_new,$w_new
and $t0,$vb
srl $vb,$vb,2
or $w_new,$t1
addu $ve,$w_cur
or $vb,$t2
addu $ve,$t0
___
      }
  }
  # Stack frame size in register slots; large enough to accommodate the
  # NUBI saved registers.
  $FRAMESIZE = 16;

  my $nubi = ($flavour =~ /nubi/i);
  $SAVED_REGS_MASK = $nubi ? 0xc0fff008 : 0xc0ff0000;

  # Registers spilled in the prologue, listed from the top frame slot
  # downwards. Only the NUBI flavours save $s3..$s0 and $gp, which keeps
  # the prologue of the other flavours shorter.
  my @spilled = ($ra, $fp, $s11, $s10, $s9, $s8, $s7, $s6, $s5, $s4);
  push(@spilled, $s3, $s2, $s1, $s0, $gp) if ($nubi);

  my $slot = 0;
  my $spill_code = "";
  foreach my $reg (@spilled) {
      $slot++;
      $spill_code .= "$REG_S $reg,($FRAMESIZE-$slot)*$SZREG($sp)\n";
  }

  # Function header and frame allocation.
  $code = <<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif
.text
.set noat
.set noreorder
.align 5
.globl sha1_block_data_order
.ent sha1_block_data_order
sha1_block_data_order:
.frame $sp,$FRAMESIZE*$SZREG,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
$PTR_SUB $sp,$FRAMESIZE*$SZREG
___

  $code .= $spill_code;

  # $num (a block count scaled by 64) is turned into the end-of-input
  # pointer and parked at 0($sp) because its register doubles as $t1. The
  # five context words are then loaded into A..E, the last one in the
  # delay slot of the branch. The loop head loads the first input word
  # with lwl/lwr and sets up K_00_19.
  $code .= <<___;
$PTR_SLL $num,6
$PTR_ADD $num,$inp
$REG_S $num,0($sp)
lw $A,0($ctx)
lw $B,4($ctx)
lw $C,8($ctx)
lw $D,12($ctx)
b .Loop
lw $E,16($ctx)
.align 4
.Loop:
.set reorder
lwl @X[0],$MSB($inp)
lui $K,0x5a82
lwr @X[0],$LSB($inp)
ori $K,0x7999 # K_00_19
___
  # Emit the 80 rounds. After every round @V is rotated by one position, so
  # the register that just served as e becomes a for the next round and
  # the others shift down (a->b, b->c, c->d, d->e).

  # Rounds 0..19 use K_00_19, which the loop head has already loaded:
  # 0..14 take their words from the input, 15..19 derive them.
  $i = 0;
  while ($i < 20) {
      if ($i < 15) { BODY_00_14($i, @V); }
      else         { BODY_15_19($i, @V); }
      unshift(@V, pop(@V));
      $i++;
  }

  # Remaining stages: upper and lower half of the round constant, its tag,
  # the first round past the stage and the generator for its rounds.
  foreach my $stage (
      [ "0x6ed9", "0xeba1", "K_20_39", 40, \&BODY_20_39 ],
      [ "0x8f1b", "0xbcdc", "K_40_59", 60, \&BODY_40_59 ],
      [ "0xca62", "0xc1d6", "K_60_79", 80, \&BODY_20_39 ],
  ) {
      my ($k_hi, $k_lo, $tag, $end, $emit) = @$stage;

      $code .= "lui $K,$k_hi\nori $K,$k_lo # $tag\n";
      for (; $i < $end; $i++) {
          $emit->($i, @V);
          unshift(@V, pop(@V));
      }
  }
  # Loop tail: advance the input pointer by one 64-byte block, reload the
  # end-of-input pointer that was parked at 0($sp), add the context words
  # (loaded into @X[0..4] during round 79) to A..E and store the result
  # back to the context. Loop until the input pointer reaches the end,
  # then restore the registers spilled in the prologue.
  $code.=<<___;
$PTR_ADD $inp,64
$REG_L $num,0($sp)
addu $A,$X[0]
addu $B,$X[1]
sw $A,0($ctx)
addu $C,$X[2]
addu $D,$X[3]
sw $B,4($ctx)
addu $E,$X[4]
sw $C,8($ctx)
sw $D,12($ctx)
sw $E,16($ctx)
.set noreorder
bne $inp,$num,.Loop
nop
.set noreorder
$REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
___
  # Only the NUBI flavours restore $s3..$s0 and $gp.
  $code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
___
  # Return; the stack frame is released in the delay slot of the jump.
  $code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end sha1_block_data_order
.rdata
.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

  print $code;
  # Buffered write errors (full disk, broken pipe, STDOUT already closed)
  # only surface at close; fail loudly instead of leaving a truncated
  # assembly file behind a successful exit status.
  close STDOUT or die "error closing STDOUT: $!";