sha1-sparcv9.pl 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick.
# X[16] vector is packed into 8 64-bit registers and as a result nothing
# is spilled on stack. In addition input data is loaded in a compact
# instruction sequence, thus minimizing the window when the code is
# subject to [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# the number of active threads exceeds the number of physical cores.
# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. Multi-process benchmark saturates at 11x
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.
  30. $output=pop and open STDOUT,">$output";
  31. @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
  32. $rot1m="%g2";
  33. $tmp64="%g3";
  34. $Xi="%g4";
  35. $A="%l0";
  36. $B="%l1";
  37. $C="%l2";
  38. $D="%l3";
  39. $E="%l4";
  40. @V=($A,$B,$C,$D,$E);
  41. $K_00_19="%l5";
  42. $K_20_39="%l6";
  43. $K_40_59="%l7";
  44. $K_60_79="%g5";
  45. @K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
  46. $ctx="%i0";
  47. $inp="%i1";
  48. $len="%i2";
  49. $tmp0="%i3";
  50. $tmp1="%i4";
  51. $tmp2="%i5";
  52. sub BODY_00_15 {
  53. my ($i,$a,$b,$c,$d,$e)=@_;
  54. my $xi=($i&1)?@X[($i/2)%8]:$Xi;
  55. $code.=<<___;
  56. sll $a,5,$tmp0 !! $i
  57. add @K[$i/20],$e,$e
  58. srl $a,27,$tmp1
  59. add $tmp0,$e,$e
  60. and $c,$b,$tmp0
  61. add $tmp1,$e,$e
  62. sll $b,30,$tmp2
  63. andn $d,$b,$tmp1
  64. srl $b,2,$b
  65. or $tmp1,$tmp0,$tmp1
  66. or $tmp2,$b,$b
  67. add $xi,$e,$e
  68. ___
  69. if ($i&1 && $i<15) {
  70. $code.=
  71. " srlx @X[(($i+1)/2)%8],32,$Xi\n";
  72. }
  73. $code.=<<___;
  74. add $tmp1,$e,$e
  75. ___
  76. }
  77. sub Xupdate {
  78. my ($i,$a,$b,$c,$d,$e)=@_;
  79. my $j=$i/2;
  80. if ($i&1) {
  81. $code.=<<___;
  82. sll $a,5,$tmp0 !! $i
  83. add @K[$i/20],$e,$e
  84. srl $a,27,$tmp1
  85. ___
  86. } else {
  87. $code.=<<___;
  88. sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
  89. xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
  90. srlx @X[($j+7)%8],32,$tmp1
  91. xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
  92. sll $a,5,$tmp0 !! $i
  93. or $tmp1,$Xi,$Xi
  94. add @K[$i/20],$e,$e !!
  95. xor $Xi,@X[$j%8],@X[$j%8]
  96. srlx @X[$j%8],31,$Xi
  97. add @X[$j%8],@X[$j%8],@X[$j%8]
  98. and $Xi,$rot1m,$Xi
  99. andn @X[$j%8],$rot1m,@X[$j%8]
  100. srl $a,27,$tmp1 !!
  101. or $Xi,@X[$j%8],@X[$j%8]
  102. ___
  103. }
  104. }
  105. sub BODY_16_19 {
  106. my ($i,$a,$b,$c,$d,$e)=@_;
  107. &Xupdate(@_);
  108. if ($i&1) {
  109. $xi=@X[($i/2)%8];
  110. } else {
  111. $xi=$Xi;
  112. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  113. }
  114. $code.=<<___;
  115. add $tmp0,$e,$e !!
  116. and $c,$b,$tmp0
  117. add $tmp1,$e,$e
  118. sll $b,30,$tmp2
  119. add $xi,$e,$e
  120. andn $d,$b,$tmp1
  121. srl $b,2,$b
  122. or $tmp1,$tmp0,$tmp1
  123. or $tmp2,$b,$b
  124. add $tmp1,$e,$e
  125. ___
  126. }
  127. sub BODY_20_39 {
  128. my ($i,$a,$b,$c,$d,$e)=@_;
  129. my $xi;
  130. &Xupdate(@_);
  131. if ($i&1) {
  132. $xi=@X[($i/2)%8];
  133. } else {
  134. $xi=$Xi;
  135. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  136. }
  137. $code.=<<___;
  138. add $tmp0,$e,$e !!
  139. xor $c,$b,$tmp0
  140. add $tmp1,$e,$e
  141. sll $b,30,$tmp2
  142. xor $d,$tmp0,$tmp1
  143. srl $b,2,$b
  144. add $tmp1,$e,$e
  145. or $tmp2,$b,$b
  146. add $xi,$e,$e
  147. ___
  148. }
  149. sub BODY_40_59 {
  150. my ($i,$a,$b,$c,$d,$e)=@_;
  151. my $xi;
  152. &Xupdate(@_);
  153. if ($i&1) {
  154. $xi=@X[($i/2)%8];
  155. } else {
  156. $xi=$Xi;
  157. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  158. }
  159. $code.=<<___;
  160. add $tmp0,$e,$e !!
  161. and $c,$b,$tmp0
  162. add $tmp1,$e,$e
  163. sll $b,30,$tmp2
  164. or $c,$b,$tmp1
  165. srl $b,2,$b
  166. and $d,$tmp1,$tmp1
  167. add $xi,$e,$e
  168. or $tmp1,$tmp0,$tmp1
  169. or $tmp2,$b,$b
  170. add $tmp1,$e,$e
  171. ___
  172. }
# Emit sha1_block_data_order up to the point where one 64-byte block has been
# loaded into @X.  The routine first tests the CFR_SHA1 bit of
# OPENSSL_sparcv9cap_P[1]; when it is clear it branches to .Lsoftware.
# Otherwise the hardware path keeps the context in %f0-%f4, loads each block
# into %f8-%f22 (going through alignaddr/faligndata when the input pointer is
# not 8-byte aligned) and issues the SHA1 opcode as a raw .word.
# The software path saves a register window, turns $len from a block count
# into an end pointer, builds the 0x0000000100000001 mask in $rot1m, loads
# the context and the four round constants, then loads the block with eight
# ldx from the 8-byte-aligned address in $tmp0.  For misaligned input it
# begins shifting the words into place; the Perl loop that follows this
# heredoc emits the rest of that fixup.
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.align 32
.globl sha1_block_data_order
sha1_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
andcc %g1, CFR_SHA1, %g0
be .Lsoftware
nop
ld [%o0 + 0x00], %f0 ! load context
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
andcc %o1, 0x7, %g0
ld [%o0 + 0x0c], %f3
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x10], %f4
.Lhw_loop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
.word 0x81b02820 ! SHA1
bne,pt SIZE_T_CC, .Lhw_loop
nop
.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
retl
st %f4, [%o0 + 0x10]
.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
.word 0x81b02820 ! SHA1
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
.align 16
.Lsoftware:
save %sp,-STACK_FRAME,%sp
sllx $len,6,$len
add $inp,$len,$len
or %g0,1,$rot1m
sllx $rot1m,32,$rot1m
or $rot1m,1,$rot1m
ld [$ctx+0],$A
ld [$ctx+4],$B
ld [$ctx+8],$C
ld [$ctx+12],$D
ld [$ctx+16],$E
andn $inp,7,$tmp0
sethi %hi(0x5a827999),$K_00_19
or $K_00_19,%lo(0x5a827999),$K_00_19
sethi %hi(0x6ed9eba1),$K_20_39
or $K_20_39,%lo(0x6ed9eba1),$K_20_39
sethi %hi(0x8f1bbcdc),$K_40_59
or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
sethi %hi(0xca62c1d6),$K_60_79
or $K_60_79,%lo(0xca62c1d6),$K_60_79
.Lloop:
ldx [$tmp0+0],@X[0]
ldx [$tmp0+16],@X[2]
ldx [$tmp0+32],@X[4]
ldx [$tmp0+48],@X[6]
and $inp,7,$tmp1
ldx [$tmp0+8],@X[1]
sll $tmp1,3,$tmp1
ldx [$tmp0+24],@X[3]
subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
ldx [$tmp0+40],@X[5]
bz,pt %icc,.Laligned
ldx [$tmp0+56],@X[7]
sllx @X[0],$tmp1,@X[0]
ldx [$tmp0+64],$tmp64
___
  287. for($i=0;$i<7;$i++)
  288. { $code.=<<___;
  289. srlx @X[$i+1],$tmp2,$Xi
  290. sllx @X[$i+1],$tmp1,@X[$i+1]
  291. or $Xi,@X[$i],@X[$i]
  292. ___
  293. }
  294. $code.=<<___;
  295. srlx $tmp64,$tmp2,$tmp64
  296. or $tmp64,@X[7],@X[7]
  297. .Laligned:
  298. srlx @X[0],32,$Xi
  299. ___
  300. for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  301. for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
  302. for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  303. for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  304. for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Emit the tail of the software loop: reload the five context words into
# @X[0..4], add the working variables to them and store the sums back,
# advance $inp by 64 and branch back to .Lloop until it equals the end
# pointer in $len.  The instruction following the branch recomputes the
# 8-byte-aligned load address in $tmp0.  After the loop the routine returns;
# the symbol type/size directives and the identification string follow.
$code.=<<___;
ld [$ctx+0],@X[0]
ld [$ctx+4],@X[1]
ld [$ctx+8],@X[2]
ld [$ctx+12],@X[3]
add $inp,64,$inp
ld [$ctx+16],@X[4]
cmp $inp,$len
add $A,@X[0],$A
st $A,[$ctx+0]
add $B,@X[1],$B
st $B,[$ctx+4]
add $C,@X[2],$C
st $C,[$ctx+8]
add $D,@X[3],$D
st $D,[$ctx+12]
add $E,@X[4],$E
st $E,[$ctx+16]
bne SIZE_T_CC,.Lloop
andn $inp,7,$tmp0
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary and
# to let the programmer detect at run-time whether the current CPU is VIS
# capable.
  337. sub unvis {
  338. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  339. my $ref,$opf;
  340. my %visopf = ( "faligndata" => 0x048,
  341. "for" => 0x07c );
  342. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  343. if ($opf=$visopf{$mnemonic}) {
  344. foreach ($rs1,$rs2,$rd) {
  345. return $ref if (!/%f([0-9]{1,2})/);
  346. $_=$1;
  347. if ($1>=32) {
  348. return $ref if ($1&1);
  349. # re-encode for upper double register addressing
  350. $_=($1|$1>>5)&31;
  351. }
  352. }
  353. return sprintf ".word\t0x%08x !%s",
  354. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  355. $ref;
  356. } else {
  357. return $ref;
  358. }
  359. }
  360. sub unalignaddr {
  361. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  362. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  363. my $ref="$mnemonic\t$rs1,$rs2,$rd";
  364. foreach ($rs1,$rs2,$rd) {
  365. if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
  366. else { return $ref; }
  367. }
  368. return sprintf ".word\t0x%08x !%s",
  369. 0x81b00300|$rd<<25|$rs1<<14|$rs2,
  370. $ref;
  371. }
  372. foreach (split("\n",$code)) {
  373. s/\`([^\`]*)\`/eval $1/ge;
  374. s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
  375. &unvis($1,$2,$3,$4)
  376. /ge;
  377. s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
  378. &unalignaddr($1,$2,$3,$4)
  379. /ge;
  380. print $_,"\n";
  381. }
  382. close STDOUT;