sha1-s390x.pl 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. #! /usr/bin/env perl
  2. # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # SHA1 block procedure for s390x.
  15. # April 2007.
  16. #
  17. # Performance is >30% better than gcc 3.3 generated code. But the real
  18. # twist is that SHA1 hardware support is detected and utilized. In
  19. # which case performance can reach further >4.5x for larger chunks.
  20. # January 2009.
  21. #
  22. # Optimize Xupdate for amount of memory references and reschedule
  23. # instructions to favour dual-issue z10 pipeline. On z10 hardware is
  24. # "only" ~2.3x faster than software.
  25. # November 2010.
  26. #
  27. # Adapt for -m31 build. If kernel supports what's called "highgprs"
  28. # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
  29. # instructions and achieve "64-bit" performance even in 31-bit legacy
  30. # application context. The feature is not specific to any particular
  31. # processor, as long as it's "z-CPU". Latter implies that the code
  32. # remains z/Architecture specific. On z990 it was measured to perform
  33. # 23% better than code generated by gcc 4.3.
  34. $kimdfunc=1; # magic function code for kimd instruction
  35. # $output is the last argument if it looks like a file (it has an extension)
  36. # $flavour is the first argument if it doesn't look like a file
  37. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  38. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  39. if ($flavour =~ /3[12]/) {
  40. $SIZE_T=4;
  41. $g="";
  42. } else {
  43. $SIZE_T=8;
  44. $g="g";
  45. }
  46. $output and open STDOUT,">$output";
  47. $K_00_39="%r0"; $K=$K_00_39;
  48. $K_40_79="%r1";
  49. $ctx="%r2"; $prefetch="%r2";
  50. $inp="%r3";
  51. $len="%r4";
  52. $A="%r5";
  53. $B="%r6";
  54. $C="%r7";
  55. $D="%r8";
  56. $E="%r9"; @V=($A,$B,$C,$D,$E);
  57. $t0="%r10";
  58. $t1="%r11";
  59. @X=("%r12","%r13","%r14");
  60. $sp="%r15";
  61. $stdframe=16*$SIZE_T+4*8;
  62. $frame=$stdframe+16*4;
  63. sub Xupdate {
  64. my $i=shift;
  65. $code.=<<___ if ($i==15);
  66. lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
  67. lr $X[0],$X[2]
  68. ___
  69. return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
  70. $code.=<<___ if ($i<16);
  71. lg $X[0],`$i*4`($inp) ### Xload($i)
  72. rllg $X[1],$X[0],32
  73. ___
  74. $code.=<<___ if ($i>=16);
  75. xgr $X[0],$prefetch ### Xupdate($i)
  76. lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
  77. xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
  78. xgr $X[0],$prefetch
  79. rll $X[0],$X[0],1
  80. rllg $X[1],$X[0],32
  81. rll $X[1],$X[1],1
  82. rllg $X[0],$X[1],32
  83. lr $X[2],$X[1] # feedback
  84. ___
  85. $code.=<<___ if ($i<=70);
  86. stg $X[0],`$stdframe+4*($i%16)`($sp)
  87. ___
  88. unshift(@X,pop(@X));
  89. }
  90. sub BODY_00_19 {
  91. my ($i,$a,$b,$c,$d,$e)=@_;
  92. my $xi=$X[1];
  93. &Xupdate($i);
  94. $code.=<<___;
  95. alr $e,$K ### $i
  96. rll $t1,$a,5
  97. lr $t0,$d
  98. xr $t0,$c
  99. alr $e,$t1
  100. nr $t0,$b
  101. alr $e,$xi
  102. xr $t0,$d
  103. rll $b,$b,30
  104. alr $e,$t0
  105. ___
  106. }
  107. sub BODY_20_39 {
  108. my ($i,$a,$b,$c,$d,$e)=@_;
  109. my $xi=$X[1];
  110. &Xupdate($i);
  111. $code.=<<___;
  112. alr $e,$K ### $i
  113. rll $t1,$a,5
  114. lr $t0,$b
  115. alr $e,$t1
  116. xr $t0,$c
  117. alr $e,$xi
  118. xr $t0,$d
  119. rll $b,$b,30
  120. alr $e,$t0
  121. ___
  122. }
  123. sub BODY_40_59 {
  124. my ($i,$a,$b,$c,$d,$e)=@_;
  125. my $xi=$X[1];
  126. &Xupdate($i);
  127. $code.=<<___;
  128. alr $e,$K ### $i
  129. rll $t1,$a,5
  130. lr $t0,$b
  131. alr $e,$t1
  132. or $t0,$c
  133. lr $t1,$b
  134. nr $t0,$d
  135. nr $t1,$c
  136. alr $e,$xi
  137. or $t0,$t1
  138. rll $b,$b,30
  139. alr $e,$t0
  140. ___
  141. }
  142. $code.=<<___;
  143. #include "s390x_arch.h"
  144. .text
  145. .align 64
  146. .type Ktable,\@object
  147. Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
  148. .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
  149. .size Ktable,.-Ktable
  150. .globl sha1_block_data_order
  151. .type sha1_block_data_order,\@function
  152. sha1_block_data_order:
  153. ___
  154. $code.=<<___ if ($kimdfunc);
  155. larl %r1,OPENSSL_s390xcap_P
  156. lg %r0,S390X_KIMD(%r1) # check kimd capabilities
  157. tmhh %r0,`0x8000>>$kimdfunc`
  158. jz .Lsoftware
  159. lghi %r0,$kimdfunc
  160. lgr %r1,$ctx
  161. lgr %r2,$inp
  162. sllg %r3,$len,6
  163. .long 0xb93e0002 # kimd %r0,%r2
  164. brc 1,.-4 # pay attention to "partial completion"
  165. br %r14
  166. .align 16
  167. .Lsoftware:
  168. ___
  169. $code.=<<___;
  170. lghi %r1,-$frame
  171. st${g} $ctx,`2*$SIZE_T`($sp)
  172. stm${g} %r6,%r15,`6*$SIZE_T`($sp)
  173. lgr %r0,$sp
  174. la $sp,0(%r1,$sp)
  175. st${g} %r0,0($sp)
  176. larl $t0,Ktable
  177. llgf $A,0($ctx)
  178. llgf $B,4($ctx)
  179. llgf $C,8($ctx)
  180. llgf $D,12($ctx)
  181. llgf $E,16($ctx)
  182. lg $K_00_39,0($t0)
  183. lg $K_40_79,8($t0)
  184. .Lloop:
  185. rllg $K_00_39,$K_00_39,32
  186. ___
  187. for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  188. $code.=<<___;
  189. rllg $K_00_39,$K_00_39,32
  190. ___
  191. for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  192. $code.=<<___; $K=$K_40_79;
  193. rllg $K_40_79,$K_40_79,32
  194. ___
  195. for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  196. $code.=<<___;
  197. rllg $K_40_79,$K_40_79,32
  198. ___
  199. for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  200. $code.=<<___;
  201. l${g} $ctx,`$frame+2*$SIZE_T`($sp)
  202. la $inp,64($inp)
  203. al $A,0($ctx)
  204. al $B,4($ctx)
  205. al $C,8($ctx)
  206. al $D,12($ctx)
  207. al $E,16($ctx)
  208. st $A,0($ctx)
  209. st $B,4($ctx)
  210. st $C,8($ctx)
  211. st $D,12($ctx)
  212. st $E,16($ctx)
  213. brct${g} $len,.Lloop
  214. lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
  215. br %r14
  216. .size sha1_block_data_order,.-sha1_block_data_order
  217. .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
  218. ___
  219. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  220. print $code;
  221. close STDOUT;