2
0

sha1-sparcv9.pl 9.2 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. #
  14. # Hardware SPARC T4 support by David S. Miller
  15. # ====================================================================
  16. # Performance improvement is not really impressive on pre-T1 CPU: +8%
  17. # over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
  18. # turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
  19. # >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick.
  20. # X[16] vector is packed into 8 64-bit registers and as a result nothing
  21. # is spilled on stack. In addition input data is loaded in compact
  22. # instruction sequence, thus minimizing the window when the code is
  23. # subject to [inter-thread] cache-thrashing hazard. The goal is to
  24. # ensure scalability on UltraSPARC T1, or rather to avoid decay when
  25. # amount of active threads exceeds the number of physical cores.
  26. # SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
  27. # faster than software. Multi-process benchmark saturates at 11x
  28. # single-process result on 8-core processor, or ~9GBps per 2.85GHz
  29. # socket.
  30. $output=pop;
  31. open STDOUT,">$output";
  32. @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
  33. $rot1m="%g2";
  34. $tmp64="%g3";
  35. $Xi="%g4";
  36. $A="%l0";
  37. $B="%l1";
  38. $C="%l2";
  39. $D="%l3";
  40. $E="%l4";
  41. @V=($A,$B,$C,$D,$E);
  42. $K_00_19="%l5";
  43. $K_20_39="%l6";
  44. $K_40_59="%l7";
  45. $K_60_79="%g5";
  46. @K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
  47. $ctx="%i0";
  48. $inp="%i1";
  49. $len="%i2";
  50. $tmp0="%i3";
  51. $tmp1="%i4";
  52. $tmp2="%i5";
  53. sub BODY_00_15 {
  54. my ($i,$a,$b,$c,$d,$e)=@_;
  55. my $xi=($i&1)?@X[($i/2)%8]:$Xi;
  56. $code.=<<___;
  57. sll $a,5,$tmp0 !! $i
  58. add @K[$i/20],$e,$e
  59. srl $a,27,$tmp1
  60. add $tmp0,$e,$e
  61. and $c,$b,$tmp0
  62. add $tmp1,$e,$e
  63. sll $b,30,$tmp2
  64. andn $d,$b,$tmp1
  65. srl $b,2,$b
  66. or $tmp1,$tmp0,$tmp1
  67. or $tmp2,$b,$b
  68. add $xi,$e,$e
  69. ___
  70. if ($i&1 && $i<15) {
  71. $code.=
  72. " srlx @X[(($i+1)/2)%8],32,$Xi\n";
  73. }
  74. $code.=<<___;
  75. add $tmp1,$e,$e
  76. ___
  77. }
  78. sub Xupdate {
  79. my ($i,$a,$b,$c,$d,$e)=@_;
  80. my $j=$i/2;
  81. if ($i&1) {
  82. $code.=<<___;
  83. sll $a,5,$tmp0 !! $i
  84. add @K[$i/20],$e,$e
  85. srl $a,27,$tmp1
  86. ___
  87. } else {
  88. $code.=<<___;
  89. sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
  90. xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
  91. srlx @X[($j+7)%8],32,$tmp1
  92. xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
  93. sll $a,5,$tmp0 !! $i
  94. or $tmp1,$Xi,$Xi
  95. add @K[$i/20],$e,$e !!
  96. xor $Xi,@X[$j%8],@X[$j%8]
  97. srlx @X[$j%8],31,$Xi
  98. add @X[$j%8],@X[$j%8],@X[$j%8]
  99. and $Xi,$rot1m,$Xi
  100. andn @X[$j%8],$rot1m,@X[$j%8]
  101. srl $a,27,$tmp1 !!
  102. or $Xi,@X[$j%8],@X[$j%8]
  103. ___
  104. }
  105. }
  106. sub BODY_16_19 {
  107. my ($i,$a,$b,$c,$d,$e)=@_;
  108. &Xupdate(@_);
  109. if ($i&1) {
  110. $xi=@X[($i/2)%8];
  111. } else {
  112. $xi=$Xi;
  113. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  114. }
  115. $code.=<<___;
  116. add $tmp0,$e,$e !!
  117. and $c,$b,$tmp0
  118. add $tmp1,$e,$e
  119. sll $b,30,$tmp2
  120. add $xi,$e,$e
  121. andn $d,$b,$tmp1
  122. srl $b,2,$b
  123. or $tmp1,$tmp0,$tmp1
  124. or $tmp2,$b,$b
  125. add $tmp1,$e,$e
  126. ___
  127. }
  128. sub BODY_20_39 {
  129. my ($i,$a,$b,$c,$d,$e)=@_;
  130. my $xi;
  131. &Xupdate(@_);
  132. if ($i&1) {
  133. $xi=@X[($i/2)%8];
  134. } else {
  135. $xi=$Xi;
  136. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  137. }
  138. $code.=<<___;
  139. add $tmp0,$e,$e !!
  140. xor $c,$b,$tmp0
  141. add $tmp1,$e,$e
  142. sll $b,30,$tmp2
  143. xor $d,$tmp0,$tmp1
  144. srl $b,2,$b
  145. add $tmp1,$e,$e
  146. or $tmp2,$b,$b
  147. add $xi,$e,$e
  148. ___
  149. }
  150. sub BODY_40_59 {
  151. my ($i,$a,$b,$c,$d,$e)=@_;
  152. my $xi;
  153. &Xupdate(@_);
  154. if ($i&1) {
  155. $xi=@X[($i/2)%8];
  156. } else {
  157. $xi=$Xi;
  158. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  159. }
  160. $code.=<<___;
  161. add $tmp0,$e,$e !!
  162. and $c,$b,$tmp0
  163. add $tmp1,$e,$e
  164. sll $b,30,$tmp2
  165. or $c,$b,$tmp1
  166. srl $b,2,$b
  167. and $d,$tmp1,$tmp1
  168. add $xi,$e,$e
  169. or $tmp1,$tmp0,$tmp1
  170. or $tmp2,$b,$b
  171. add $tmp1,$e,$e
  172. ___
  173. }
  174. $code.=<<___;
  175. #include "sparc_arch.h"
  176. #ifdef __arch64__
  177. .register %g2,#scratch
  178. .register %g3,#scratch
  179. #endif
  180. .section ".text",#alloc,#execinstr
  181. #ifdef __PIC__
  182. SPARC_PIC_THUNK(%g1)
  183. #endif
  184. .align 32
  185. .globl sha1_block_data_order
  186. sha1_block_data_order:
  187. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  188. ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
  189. andcc %g1, CFR_SHA1, %g0
  190. be .Lsoftware
  191. nop
  192. ld [%o0 + 0x00], %f0 ! load context
  193. ld [%o0 + 0x04], %f1
  194. ld [%o0 + 0x08], %f2
  195. andcc %o1, 0x7, %g0
  196. ld [%o0 + 0x0c], %f3
  197. bne,pn %icc, .Lhwunaligned
  198. ld [%o0 + 0x10], %f4
  199. .Lhw_loop:
  200. ldd [%o1 + 0x00], %f8
  201. ldd [%o1 + 0x08], %f10
  202. ldd [%o1 + 0x10], %f12
  203. ldd [%o1 + 0x18], %f14
  204. ldd [%o1 + 0x20], %f16
  205. ldd [%o1 + 0x28], %f18
  206. ldd [%o1 + 0x30], %f20
  207. subcc %o2, 1, %o2 ! done yet?
  208. ldd [%o1 + 0x38], %f22
  209. add %o1, 0x40, %o1
  210. prefetch [%o1 + 63], 20
  211. .word 0x81b02820 ! SHA1
  212. bne,pt SIZE_T_CC, .Lhw_loop
  213. nop
  214. .Lhwfinish:
  215. st %f0, [%o0 + 0x00] ! store context
  216. st %f1, [%o0 + 0x04]
  217. st %f2, [%o0 + 0x08]
  218. st %f3, [%o0 + 0x0c]
  219. retl
  220. st %f4, [%o0 + 0x10]
  221. .align 8
  222. .Lhwunaligned:
  223. alignaddr %o1, %g0, %o1
  224. ldd [%o1 + 0x00], %f10
  225. .Lhwunaligned_loop:
  226. ldd [%o1 + 0x08], %f12
  227. ldd [%o1 + 0x10], %f14
  228. ldd [%o1 + 0x18], %f16
  229. ldd [%o1 + 0x20], %f18
  230. ldd [%o1 + 0x28], %f20
  231. ldd [%o1 + 0x30], %f22
  232. ldd [%o1 + 0x38], %f24
  233. subcc %o2, 1, %o2 ! done yet?
  234. ldd [%o1 + 0x40], %f26
  235. add %o1, 0x40, %o1
  236. prefetch [%o1 + 63], 20
  237. faligndata %f10, %f12, %f8
  238. faligndata %f12, %f14, %f10
  239. faligndata %f14, %f16, %f12
  240. faligndata %f16, %f18, %f14
  241. faligndata %f18, %f20, %f16
  242. faligndata %f20, %f22, %f18
  243. faligndata %f22, %f24, %f20
  244. faligndata %f24, %f26, %f22
  245. .word 0x81b02820 ! SHA1
  246. bne,pt SIZE_T_CC, .Lhwunaligned_loop
  247. for %f26, %f26, %f10 ! %f10=%f26
  248. ba .Lhwfinish
  249. nop
  250. .align 16
  251. .Lsoftware:
  252. save %sp,-STACK_FRAME,%sp
  253. sllx $len,6,$len
  254. add $inp,$len,$len
  255. or %g0,1,$rot1m
  256. sllx $rot1m,32,$rot1m
  257. or $rot1m,1,$rot1m
  258. ld [$ctx+0],$A
  259. ld [$ctx+4],$B
  260. ld [$ctx+8],$C
  261. ld [$ctx+12],$D
  262. ld [$ctx+16],$E
  263. andn $inp,7,$tmp0
  264. sethi %hi(0x5a827999),$K_00_19
  265. or $K_00_19,%lo(0x5a827999),$K_00_19
  266. sethi %hi(0x6ed9eba1),$K_20_39
  267. or $K_20_39,%lo(0x6ed9eba1),$K_20_39
  268. sethi %hi(0x8f1bbcdc),$K_40_59
  269. or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
  270. sethi %hi(0xca62c1d6),$K_60_79
  271. or $K_60_79,%lo(0xca62c1d6),$K_60_79
  272. .Lloop:
  273. ldx [$tmp0+0],@X[0]
  274. ldx [$tmp0+16],@X[2]
  275. ldx [$tmp0+32],@X[4]
  276. ldx [$tmp0+48],@X[6]
  277. and $inp,7,$tmp1
  278. ldx [$tmp0+8],@X[1]
  279. sll $tmp1,3,$tmp1
  280. ldx [$tmp0+24],@X[3]
  281. subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
  282. ldx [$tmp0+40],@X[5]
  283. bz,pt %icc,.Laligned
  284. ldx [$tmp0+56],@X[7]
  285. sllx @X[0],$tmp1,@X[0]
  286. ldx [$tmp0+64],$tmp64
  287. ___
  288. for($i=0;$i<7;$i++)
  289. { $code.=<<___;
  290. srlx @X[$i+1],$tmp2,$Xi
  291. sllx @X[$i+1],$tmp1,@X[$i+1]
  292. or $Xi,@X[$i],@X[$i]
  293. ___
  294. }
  295. $code.=<<___;
  296. srlx $tmp64,$tmp2,$tmp64
  297. or $tmp64,@X[7],@X[7]
  298. .Laligned:
  299. srlx @X[0],32,$Xi
  300. ___
  301. for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  302. for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
  303. for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  304. for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  305. for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  306. $code.=<<___;
  307. ld [$ctx+0],@X[0]
  308. ld [$ctx+4],@X[1]
  309. ld [$ctx+8],@X[2]
  310. ld [$ctx+12],@X[3]
  311. add $inp,64,$inp
  312. ld [$ctx+16],@X[4]
  313. cmp $inp,$len
  314. add $A,@X[0],$A
  315. st $A,[$ctx+0]
  316. add $B,@X[1],$B
  317. st $B,[$ctx+4]
  318. add $C,@X[2],$C
  319. st $C,[$ctx+8]
  320. add $D,@X[3],$D
  321. st $D,[$ctx+12]
  322. add $E,@X[4],$E
  323. st $E,[$ctx+16]
  324. bne SIZE_T_CC,.Lloop
  325. andn $inp,7,$tmp0
  326. ret
  327. restore
  328. .type sha1_block_data_order,#function
  329. .size sha1_block_data_order,(.-sha1_block_data_order)
  330. .asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  331. .align 4
  332. ___
  333. # Purpose of these subroutines is to explicitly encode VIS instructions,
  334. # so that one can compile the module without having to specify VIS
  335. # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
  336. # Idea is to reserve for option to produce "universal" binary and let
  337. # programmer detect if current CPU is VIS capable at run-time.
  338. sub unvis {
  339. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  340. my $ref,$opf;
  341. my %visopf = ( "faligndata" => 0x048,
  342. "for" => 0x07c );
  343. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  344. if ($opf=$visopf{$mnemonic}) {
  345. foreach ($rs1,$rs2,$rd) {
  346. return $ref if (!/%f([0-9]{1,2})/);
  347. $_=$1;
  348. if ($1>=32) {
  349. return $ref if ($1&1);
  350. # re-encode for upper double register addressing
  351. $_=($1|$1>>5)&31;
  352. }
  353. }
  354. return sprintf ".word\t0x%08x !%s",
  355. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  356. $ref;
  357. } else {
  358. return $ref;
  359. }
  360. }
  361. sub unalignaddr {
  362. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  363. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  364. my $ref="$mnemonic\t$rs1,$rs2,$rd";
  365. foreach ($rs1,$rs2,$rd) {
  366. if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
  367. else { return $ref; }
  368. }
  369. return sprintf ".word\t0x%08x !%s",
  370. 0x81b00300|$rd<<25|$rs1<<14|$rs2,
  371. $ref;
  372. }
  373. foreach (split("\n",$code)) {
  374. s/\`([^\`]*)\`/eval $1/ge;
  375. s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
  376. &unvis($1,$2,$3,$4)
  377. /ge;
  378. s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
  379. &unalignaddr($1,$2,$3,$4)
  380. /ge;
  381. print $_,"\n";
  382. }
  383. close STDOUT;