sha1-s390x.pl 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # SHA1 block procedure for s390x.
  9. # April 2007.
  10. #
  11. # Performance is >30% better than gcc 3.3 generated code. But the real
  12. # twist is that SHA1 hardware support is detected and utilized. In
  13. # which case performance can reach further >4.5x for larger chunks.
  14. # January 2009.
  15. #
  16. # Optimize Xupdate for amount of memory references and reschedule
  17. # instructions to favour dual-issue z10 pipeline. On z10 hardware is
  18. # "only" ~2.3x faster than software.
  19. # November 2010.
  20. #
  21. # Adapt for -m31 build. If kernel supports what's called "highgprs"
  22. # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
  23. # instructions and achieve "64-bit" performance even in 31-bit legacy
  24. # application context. The feature is not specific to any particular
  25. # processor, as long as it's "z-CPU". Latter implies that the code
  26. # remains z/Architecture specific.
  27. $kimdfunc=1; # magic function code for kimd instruction
  28. $flavour = shift;
  29. if ($flavour =~ /3[12]/) {
  30. $SIZE_T=4;
  31. $g="";
  32. } else {
  33. $SIZE_T=8;
  34. $g="g";
  35. }
  36. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  37. open STDOUT,">$output";
  38. $K_00_39="%r0"; $K=$K_00_39;
  39. $K_40_79="%r1";
  40. $ctx="%r2"; $prefetch="%r2";
  41. $inp="%r3";
  42. $len="%r4";
  43. $A="%r5";
  44. $B="%r6";
  45. $C="%r7";
  46. $D="%r8";
  47. $E="%r9"; @V=($A,$B,$C,$D,$E);
  48. $t0="%r10";
  49. $t1="%r11";
  50. @X=("%r12","%r13","%r14");
  51. $sp="%r15";
  52. $stdframe=16*$SIZE_T+4*8;
  53. $frame=$stdframe+16*4;
  54. sub Xupdate {
  55. my $i=shift;
  56. $code.=<<___ if ($i==15);
  57. lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
  58. lr $X[0],$X[2]
  59. ___
  60. return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
  61. $code.=<<___ if ($i<16);
  62. lg $X[0],`$i*4`($inp) ### Xload($i)
  63. rllg $X[1],$X[0],32
  64. ___
  65. $code.=<<___ if ($i>=16);
  66. xgr $X[0],$prefetch ### Xupdate($i)
  67. lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
  68. xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
  69. xgr $X[0],$prefetch
  70. rll $X[0],$X[0],1
  71. rllg $X[1],$X[0],32
  72. rll $X[1],$X[1],1
  73. rllg $X[0],$X[1],32
  74. lr $X[2],$X[1] # feedback
  75. ___
  76. $code.=<<___ if ($i<=70);
  77. stg $X[0],`$stdframe+4*($i%16)`($sp)
  78. ___
  79. unshift(@X,pop(@X));
  80. }
  81. sub BODY_00_19 {
  82. my ($i,$a,$b,$c,$d,$e)=@_;
  83. my $xi=$X[1];
  84. &Xupdate($i);
  85. $code.=<<___;
  86. alr $e,$K ### $i
  87. rll $t1,$a,5
  88. lr $t0,$d
  89. xr $t0,$c
  90. alr $e,$t1
  91. nr $t0,$b
  92. alr $e,$xi
  93. xr $t0,$d
  94. rll $b,$b,30
  95. alr $e,$t0
  96. ___
  97. }
  98. sub BODY_20_39 {
  99. my ($i,$a,$b,$c,$d,$e)=@_;
  100. my $xi=$X[1];
  101. &Xupdate($i);
  102. $code.=<<___;
  103. alr $e,$K ### $i
  104. rll $t1,$a,5
  105. lr $t0,$b
  106. alr $e,$t1
  107. xr $t0,$c
  108. alr $e,$xi
  109. xr $t0,$d
  110. rll $b,$b,30
  111. alr $e,$t0
  112. ___
  113. }
  114. sub BODY_40_59 {
  115. my ($i,$a,$b,$c,$d,$e)=@_;
  116. my $xi=$X[1];
  117. &Xupdate($i);
  118. $code.=<<___;
  119. alr $e,$K ### $i
  120. rll $t1,$a,5
  121. lr $t0,$b
  122. alr $e,$t1
  123. or $t0,$c
  124. lr $t1,$b
  125. nr $t0,$d
  126. nr $t1,$c
  127. alr $e,$xi
  128. or $t0,$t1
  129. rll $b,$b,30
  130. alr $e,$t0
  131. ___
  132. }
  133. $code.=<<___;
  134. .text
  135. .align 64
  136. .type Ktable,\@object
  137. Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
  138. .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
  139. .size Ktable,.-Ktable
  140. .globl sha1_block_data_order
  141. .type sha1_block_data_order,\@function
  142. sha1_block_data_order:
  143. ___
  144. $code.=<<___ if ($kimdfunc);
  145. larl %r1,OPENSSL_s390xcap_P
  146. lg %r0,0(%r1)
  147. tmhl %r0,0x4000 # check for message-security assist
  148. jz .Lsoftware
  149. lghi %r0,0
  150. la %r1,`2*$SIZE_T`($sp)
  151. .long 0xb93e0002 # kimd %r0,%r2
  152. lg %r0,`2*$SIZE_T`($sp)
  153. tmhh %r0,`0x8000>>$kimdfunc`
  154. jz .Lsoftware
  155. lghi %r0,$kimdfunc
  156. lgr %r1,$ctx
  157. lgr %r2,$inp
  158. sllg %r3,$len,6
  159. .long 0xb93e0002 # kimd %r0,%r2
  160. brc 1,.-4 # pay attention to "partial completion"
  161. br %r14
  162. .align 16
  163. .Lsoftware:
  164. ___
  165. $code.=<<___;
  166. lghi %r1,-$frame
  167. st${g} $ctx,`2*$SIZE_T`($sp)
  168. stm${g} %r6,%r15,`6*$SIZE_T`($sp)
  169. lgr %r0,$sp
  170. la $sp,0(%r1,$sp)
  171. st${g} %r0,0($sp)
  172. larl $t0,Ktable
  173. llgf $A,0($ctx)
  174. llgf $B,4($ctx)
  175. llgf $C,8($ctx)
  176. llgf $D,12($ctx)
  177. llgf $E,16($ctx)
  178. lg $K_00_39,0($t0)
  179. lg $K_40_79,8($t0)
  180. .Lloop:
  181. rllg $K_00_39,$K_00_39,32
  182. ___
  183. for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  184. $code.=<<___;
  185. rllg $K_00_39,$K_00_39,32
  186. ___
  187. for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  188. $code.=<<___; $K=$K_40_79;
  189. rllg $K_40_79,$K_40_79,32
  190. ___
  191. for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  192. $code.=<<___;
  193. rllg $K_40_79,$K_40_79,32
  194. ___
  195. for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  196. $code.=<<___;
  197. l${g} $ctx,`$frame+2*$SIZE_T`($sp)
  198. la $inp,64($inp)
  199. al $A,0($ctx)
  200. al $B,4($ctx)
  201. al $C,8($ctx)
  202. al $D,12($ctx)
  203. al $E,16($ctx)
  204. st $A,0($ctx)
  205. st $B,4($ctx)
  206. st $C,8($ctx)
  207. st $D,12($ctx)
  208. st $E,16($ctx)
  209. brct${g} $len,.Lloop
  210. lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
  211. br %r14
  212. .size sha1_block_data_order,.-sha1_block_data_order
  213. .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
  214. .comm OPENSSL_s390xcap_P,16,8
  215. ___
  216. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  217. print $code;
  218. close STDOUT;