2
0

sha1-x86_64.pl 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  5. # project. Rights for redistribution and usage in source and binary
  6. # forms are granted according to the OpenSSL license.
  7. # ====================================================================
  8. #
  9. # sha1_block procedure for x86_64.
  10. #
  11. # It was brought to my attention that on EM64T compiler-generated code
  12. # was far behind 32-bit assembler implementation. This is unlike on
  13. # Opteron where compiler-generated code was only 15% behind 32-bit
  14. # assembler, which originally made it hard to motivate the effort.
  15. # There was suggestion to mechanically translate 32-bit code, but I
  16. # dismissed it, reasoning that x86_64 offers enough register bank
  17. # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
  18. # implementation:-) However! While 64-bit code does perform better
  19. # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  20. # x86_64 does offer larger *addressable* bank, but out-of-order core
  21. # reaches for even more registers through dynamic aliasing, and EM64T
  22. # core must have managed to run-time optimize even 32-bit code just as
  23. # good as 64-bit one. Performance improvement is summarized in the
  24. # following table:
  25. #
  26. #             gcc 3.4     32-bit asm    cycles/byte
  27. # Opteron     +45%        +20%          6.8
  28. # Xeon        +65%        +0%           9.9
  # First command-line argument; it is handed on, unchanged, to the
  # x86_64-xlate.pl script below.
  $output=shift;

  # From here on everything printed to STDOUT is piped into
  # ../perlasm/x86_64-xlate.pl, run with the same perl binary ($^X).
  # The open is checked so that a missing script or failed fork stops the
  # run instead of silently producing nothing.
  # NOTE(review): the command goes through the shell and $output is
  # interpolated unquoted -- only pass trusted file names.
  open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"
      or die "can't call ../perlasm/x86_64-xlate.pl: $!";
  # Registers in which the three arguments arrive (1st, 2nd, 3rd arg).
  ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");

  # reassign arguments in order to produce more compact code
  ($ctx,$inp,$num)=("%r8","%r9","%r10");

  # $xi holds the message word being processed, $t0/$t1 are temporaries
  # used by the round bodies.
  ($xi,$t0,$t1)=("%eax","%ebx","%ecx");

  # Six registers the round bodies rotate through: the first five are
  # loaded from the context by PROLOGUE, the sixth receives the value
  # computed by a round.
  ($A,$B,$C,$D,$E,$T)=("%edx","%esi","%edi","%ebp","%r11d","%r12d");
  @V=($A,$B,$C,$D,$E,$T);
  # PROLOGUE($func) -- append the entry sequence of function $func to $code.
  #
  # The emitted code saves %rbx, %rbp and %r12, copies the three incoming
  # arguments from %rdi/%rsi/%rdx into $ctx/$inp/$num, reserves 8+16*4
  # bytes below the stack pointer, rounds %rsp down to a multiple of 64,
  # keeps the pre-adjustment %rsp at 16*4(%rsp) (EPILOGUE reloads it from
  # there) and loads the five 32-bit words at 0..16($ctx) into $A..$E.
  # The `...` expressions are left in the text; they are evaluated when
  # the complete $code is post-processed.
  sub PROLOGUE {
      my ($name)=@_;

      $code.=<<___;
.globl $name
.type $name,\@function,3
.align 16
$name:
push %rbx
push %rbp
push %r12
mov %rsp,%rax
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
mov %rax,`16*4`(%rsp)
mov 0($ctx),$A
mov 4($ctx),$B
mov 8($ctx),$C
mov 12($ctx),$D
mov 16($ctx),$E
___
  }
  # EPILOGUE($func) -- append the exit sequence of function $func to $code.
  #
  # The emitted code reloads %rsp from 16*4(%rsp), pops %r12, %rbp and
  # %rbx (reverse of the order PROLOGUE pushes them), returns, and closes
  # the symbol with a .size directive.
  sub EPILOGUE {
      my ($name)=@_;

      $code.=<<___;
mov `16*4`(%rsp),%rsp
pop %r12
pop %rbp
pop %rbx
ret
.size $name,.-$name
___
  }
  # BODY_00_19($i,$a,$b,$c,$d,$e,$f[,$host]) -- append round $i (0..19)
  # to $code.
  #
  #   $i          round number
  #   $a..$e      registers holding the five working values; $e is reused
  #               as scratch for rol(a,5) once it has been consumed
  #   $f          register that receives the value computed by this round
  #   $host       when defined, the input words are stored as loaded;
  #               when omitted, each loaded word is byte-swapped first
  #
  # Every round adds 0x5a827999, the current word in $xi, rol(a,5) and
  # ((c ^ d) & b) ^ d into $f and rotates $b left by 30.  Rounds 0..14
  # also fetch the next input word from $inp into the 16-word buffer at
  # (%rsp); rounds 15..19 instead build the next word from buffer slots
  # j, j+2, j+8 and j+13 (mod 16) xor-ed together and rotated left by 1.
  # The label .Lshortcut is placed in front of round 15.
  sub BODY_00_19 {
      my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
      my $j=$i+1;

      # Decide about byte swapping here rather than inside a deferred
      # `...` expression: interpolating $host into such an expression
      # leaves "defined()" (a test of $_) or "defined(1)" behind, so the
      # outcome would depend on $_ at the time $code is post-processed.
      # The line is empty for host order, exactly as before.
      my $bswap=defined($host)?"":"bswap $xi";

      # Round 0 has no predecessor, so it loads the first word itself.
      $code.=<<___ if ($i==0);
mov `4*$i`($inp),$xi
$bswap
mov $xi,`4*$i`(%rsp)
___
      $code.=<<___ if ($i<15);
lea 0x5a827999($xi,$e),$f
mov $c,$t0
mov `4*$j`($inp),$xi
mov $a,$e
xor $d,$t0
$bswap
rol \$5,$e
and $b,$t0
mov $xi,`4*$j`(%rsp)
add $e,$f
xor $d,$t0
rol \$30,$b
add $t0,$f
___
      $code.=".Lshortcut:\n" if ($i==15);
      $code.=<<___ if ($i>=15);
lea 0x5a827999($xi,$e),$f
mov `4*($j%16)`(%rsp),$xi
mov $c,$t0
mov $a,$e
xor `4*(($j+2)%16)`(%rsp),$xi
xor $d,$t0
rol \$5,$e
xor `4*(($j+8)%16)`(%rsp),$xi
and $b,$t0
add $e,$f
xor `4*(($j+13)%16)`(%rsp),$xi
xor $d,$t0
rol \$30,$b
add $t0,$f
rol \$1,$xi
mov $xi,`4*($j%16)`(%rsp)
___
  }
  # BODY_20_39($i,$a,$b,$c,$d,$e,$f) -- append round $i to $code for the
  # rounds whose mixing function is b ^ c ^ d.  The main script calls it
  # for rounds 20..39 and again for rounds 60..79.
  #
  #   $i          round number; selects the additive constant
  #               (0x6ed9eba1 below 40, 0xca62c1d6 otherwise)
  #   $a..$e      registers holding the five working values; $e is reused
  #               as scratch for rol(a,5)
  #   $f          register that receives the value computed by this round
  #
  # Rounds below 79 also build the next word from buffer slots j, j+2,
  # j+8 and j+13 (mod 16) of the 16-word buffer at (%rsp); round 79 has no
  # successor and skips that part.  Nothing is emitted for $i above 79.
  sub BODY_20_39 {
      my ($i,$a,$b,$c,$d,$e,$f)=@_;
      my $nx=$i+1;

      # The constant is interpolated as a plain (decimal) number.
      my $K=0xca62c1d6;
      $K=0x6ed9eba1 if ($i<40);

      if ($i<79) {
          $code.=<<___;
lea $K($xi,$e),$f
mov `4*($nx%16)`(%rsp),$xi
mov $c,$t0
mov $a,$e
xor `4*(($nx+2)%16)`(%rsp),$xi
xor $b,$t0
rol \$5,$e
xor `4*(($nx+8)%16)`(%rsp),$xi
xor $d,$t0
add $e,$f
xor `4*(($nx+13)%16)`(%rsp),$xi
rol \$30,$b
add $t0,$f
rol \$1,$xi
mov $xi,`4*($nx%16)`(%rsp)
___
      }
      elsif ($i==79) {
          $code.=<<___;
lea $K($xi,$e),$f
mov $c,$t0
mov $a,$e
xor $b,$t0
rol \$5,$e
xor $d,$t0
add $e,$f
rol \$30,$b
add $t0,$f
___
      }
  }
  # BODY_40_59($i,$a,$b,$c,$d,$e,$f) -- append round $i to $code for the
  # rounds whose mixing function is (b & c) | ((b | c) & d).
  #
  #   $i          round number; only used to locate buffer slots
  #   $a..$e      registers holding the five working values; $e is reused
  #               as scratch for rol(a,5)
  #   $f          register that receives the value computed by this round
  #
  # The round adds 0x8f1bbcdc, the current word in $xi, rol(a,5) and the
  # mixing function into $f, rotates $b left by 30, and builds the next
  # word from slots j, j+2, j+8 and j+13 (mod 16) of the buffer at (%rsp).
  # Both temporaries $t0 and $t1 are used.
  sub BODY_40_59 {
      my ($i,$a,$b,$c,$d,$e,$f)=@_;
      my $nx=1+$i;

      $code.=<<___;
lea 0x8f1bbcdc($xi,$e),$f
mov `4*($nx%16)`(%rsp),$xi
mov $b,$t0
mov $b,$t1
xor `4*(($nx+2)%16)`(%rsp),$xi
mov $a,$e
and $c,$t0
xor `4*(($nx+8)%16)`(%rsp),$xi
or $c,$t1
rol \$5,$e
xor `4*(($nx+13)%16)`(%rsp),$xi
and $d,$t1
add $e,$f
rol \$1,$xi
or $t1,$t0
rol \$30,$b
mov $xi,`4*($nx%16)`(%rsp)
add $t0,$f
___
  }
  # ---- sha1_block_asm_data_order ------------------------------------
  # Start the output and emit the entry code followed by the label of the
  # per-block loop.
  $code=".text\n";
  PROLOGUE("sha1_block_asm_data_order");
  $code.=".align 4\n.Lloop:\n";

  # Emit the 80 rounds.  Each entry pairs a round generator with the first
  # round number it no longer handles.  After every round the register
  # list is rotated by one so that the register just written becomes the
  # first argument of the next round.
  $i=0;
  foreach my $stage ([\&BODY_00_19,20],
                     [\&BODY_20_39,40],
                     [\&BODY_40_59,60],
                     [\&BODY_20_39,80]) {
      my ($emit,$limit)=@$stage;
      while ($i<$limit) {
          $emit->($i,@V);
          unshift(@V,pop(@V));
          $i++;
      }
  }

  # 80 rotations of the six-entry list leave the five results in
  # $E,$T,$A,$B,$C.  They are added to the context, written back, moved
  # into $A..$E for the next pass, and the loop repeats until $num
  # reaches zero.
  $code.=<<___;
add 0($ctx),$E
add 4($ctx),$T
add 8($ctx),$A
add 12($ctx),$B
add 16($ctx),$C
mov $E,0($ctx)
mov $T,4($ctx)
mov $A,8($ctx)
mov $B,12($ctx)
mov $C,16($ctx)
xchg $E,$A # mov $E,$A
xchg $T,$B # mov $T,$B
xchg $E,$C # mov $A,$C
xchg $T,$D # mov $B,$D
# mov $C,$E
lea `16*4`($inp),$inp
sub \$1,$num
jnz .Lloop
___
  EPILOGUE("sha1_block_asm_data_order");
  211. ####################################################################
  # ---- sha1_block_asm_host_order ------------------------------------
  # Restart from the initial register order, emit the entry code and
  # rounds 0..14 with the host flag set (no byte swapping of the input),
  # then jump to .Lshortcut, the label BODY_00_19 places in front of
  # round 15 of the data-order function; the remaining rounds and the
  # exit code are shared with that function.
  @V=($A,$B,$C,$D,$E,$T);
  PROLOGUE("sha1_block_asm_host_order");
  $i=0;
  while ($i<15) {
      BODY_00_19($i,@V,1);
      unshift(@V,pop(@V));
      $i++;
  }
  $code.=<<___;
jmp .Lshortcut
.size sha1_block_asm_host_order,.-sha1_block_asm_host_order
___
  # Replace every `...` span in the generated text with the result of
  # evaluating it as Perl (offset arithmetic such as 4*(($j+2)%16)).
  $code =~ s/\`([^\`]*)\`/eval $1/gem;

  # STDOUT is a pipe, so write errors and a failing child process only
  # show up on print/close.  Check both so a broken run is not mistaken
  # for a successful one.  When close fails because the child exited
  # with a non-zero status, $! is 0 and the status is in $?.
  print $code or die "error writing generated code: $!";
  close STDOUT
      or die($! ? "error closing STDOUT: $!"
                : "x86_64-xlate.pl exited with status $?");