sha1-sparcv9.pl 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. #! /usr/bin/env perl
  2. # Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. #
  14. # Hardware SPARC T4 support by David S. Miller
  15. # ====================================================================
# The performance improvement is not really impressive on pre-T1 CPUs: +8%
# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however, it
# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick:
# the X[16] vector is packed into 8 64-bit registers and as a result nothing
# is spilled to the stack. In addition, input data is loaded in a compact
# instruction sequence, thus minimizing the window in which the code is
# subject to the [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when the
# number of active threads exceeds the number of physical cores.
# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. A multi-process benchmark saturates at 11x the
# single-process result on an 8-core processor, or ~9GBps per 2.85GHz
# socket.
  30. $output=pop and open STDOUT,">$output";
  31. @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
  32. $rot1m="%g2";
  33. $tmp64="%g3";
  34. $Xi="%g4";
  35. $A="%l0";
  36. $B="%l1";
  37. $C="%l2";
  38. $D="%l3";
  39. $E="%l4";
  40. @V=($A,$B,$C,$D,$E);
  41. $K_00_19="%l5";
  42. $K_20_39="%l6";
  43. $K_40_59="%l7";
  44. $K_60_79="%g5";
  45. @K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
  46. $ctx="%i0";
  47. $inp="%i1";
  48. $len="%i2";
  49. $tmp0="%i3";
  50. $tmp1="%i4";
  51. $tmp2="%i5";
  52. sub BODY_00_15 {
  53. my ($i,$a,$b,$c,$d,$e)=@_;
  54. my $xi=($i&1)?@X[($i/2)%8]:$Xi;
  55. $code.=<<___;
  56. sll $a,5,$tmp0 !! $i
  57. add @K[$i/20],$e,$e
  58. srl $a,27,$tmp1
  59. add $tmp0,$e,$e
  60. and $c,$b,$tmp0
  61. add $tmp1,$e,$e
  62. sll $b,30,$tmp2
  63. andn $d,$b,$tmp1
  64. srl $b,2,$b
  65. or $tmp1,$tmp0,$tmp1
  66. or $tmp2,$b,$b
  67. add $xi,$e,$e
  68. ___
  69. if ($i&1 && $i<15) {
  70. $code.=
  71. " srlx @X[(($i+1)/2)%8],32,$Xi\n";
  72. }
  73. $code.=<<___;
  74. add $tmp1,$e,$e
  75. ___
  76. }
  77. sub Xupdate {
  78. my ($i,$a,$b,$c,$d,$e)=@_;
  79. my $j=$i/2;
  80. if ($i&1) {
  81. $code.=<<___;
  82. sll $a,5,$tmp0 !! $i
  83. add @K[$i/20],$e,$e
  84. srl $a,27,$tmp1
  85. ___
  86. } else {
  87. $code.=<<___;
  88. sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
  89. xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
  90. srlx @X[($j+7)%8],32,$tmp1
  91. xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
  92. sll $a,5,$tmp0 !! $i
  93. or $tmp1,$Xi,$Xi
  94. add @K[$i/20],$e,$e !!
  95. xor $Xi,@X[$j%8],@X[$j%8]
  96. srlx @X[$j%8],31,$Xi
  97. add @X[$j%8],@X[$j%8],@X[$j%8]
  98. and $Xi,$rot1m,$Xi
  99. andn @X[$j%8],$rot1m,@X[$j%8]
  100. srl $a,27,$tmp1 !!
  101. or $Xi,@X[$j%8],@X[$j%8]
  102. ___
  103. }
  104. }
  105. sub BODY_16_19 {
  106. my ($i,$a,$b,$c,$d,$e)=@_;
  107. &Xupdate(@_);
  108. if ($i&1) {
  109. $xi=@X[($i/2)%8];
  110. } else {
  111. $xi=$Xi;
  112. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  113. }
  114. $code.=<<___;
  115. add $tmp0,$e,$e !!
  116. and $c,$b,$tmp0
  117. add $tmp1,$e,$e
  118. sll $b,30,$tmp2
  119. add $xi,$e,$e
  120. andn $d,$b,$tmp1
  121. srl $b,2,$b
  122. or $tmp1,$tmp0,$tmp1
  123. or $tmp2,$b,$b
  124. add $tmp1,$e,$e
  125. ___
  126. }
  127. sub BODY_20_39 {
  128. my ($i,$a,$b,$c,$d,$e)=@_;
  129. my $xi;
  130. &Xupdate(@_);
  131. if ($i&1) {
  132. $xi=@X[($i/2)%8];
  133. } else {
  134. $xi=$Xi;
  135. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  136. }
  137. $code.=<<___;
  138. add $tmp0,$e,$e !!
  139. xor $c,$b,$tmp0
  140. add $tmp1,$e,$e
  141. sll $b,30,$tmp2
  142. xor $d,$tmp0,$tmp1
  143. srl $b,2,$b
  144. add $tmp1,$e,$e
  145. or $tmp2,$b,$b
  146. add $xi,$e,$e
  147. ___
  148. }
  149. sub BODY_40_59 {
  150. my ($i,$a,$b,$c,$d,$e)=@_;
  151. my $xi;
  152. &Xupdate(@_);
  153. if ($i&1) {
  154. $xi=@X[($i/2)%8];
  155. } else {
  156. $xi=$Xi;
  157. $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
  158. }
  159. $code.=<<___;
  160. add $tmp0,$e,$e !!
  161. and $c,$b,$tmp0
  162. add $tmp1,$e,$e
  163. sll $b,30,$tmp2
  164. or $c,$b,$tmp1
  165. srl $b,2,$b
  166. and $d,$tmp1,$tmp1
  167. add $xi,$e,$e
  168. or $tmp1,$tmp0,$tmp1
  169. or $tmp2,$b,$b
  170. add $tmp1,$e,$e
  171. ___
  172. }
  173. $code.=<<___;
  174. #ifndef __ASSEMBLER__
  175. # define __ASSEMBLER__ 1
  176. #endif
  177. #include "crypto/sparc_arch.h"
  178. #ifdef __arch64__
  179. .register %g2,#scratch
  180. .register %g3,#scratch
  181. #endif
  182. .section ".text",#alloc,#execinstr
  183. #ifdef __PIC__
  184. SPARC_PIC_THUNK(%g1)
  185. #endif
  186. .align 32
  187. .globl sha1_block_data_order
  188. sha1_block_data_order:
  189. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  190. ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
  191. andcc %g1, CFR_SHA1, %g0
  192. be .Lsoftware
  193. nop
  194. ld [%o0 + 0x00], %f0 ! load context
  195. ld [%o0 + 0x04], %f1
  196. ld [%o0 + 0x08], %f2
  197. andcc %o1, 0x7, %g0
  198. ld [%o0 + 0x0c], %f3
  199. bne,pn %icc, .Lhwunaligned
  200. ld [%o0 + 0x10], %f4
  201. .Lhw_loop:
  202. ldd [%o1 + 0x00], %f8
  203. ldd [%o1 + 0x08], %f10
  204. ldd [%o1 + 0x10], %f12
  205. ldd [%o1 + 0x18], %f14
  206. ldd [%o1 + 0x20], %f16
  207. ldd [%o1 + 0x28], %f18
  208. ldd [%o1 + 0x30], %f20
  209. subcc %o2, 1, %o2 ! done yet?
  210. ldd [%o1 + 0x38], %f22
  211. add %o1, 0x40, %o1
  212. prefetch [%o1 + 63], 20
  213. .word 0x81b02820 ! SHA1
  214. bne,pt SIZE_T_CC, .Lhw_loop
  215. nop
  216. .Lhwfinish:
  217. st %f0, [%o0 + 0x00] ! store context
  218. st %f1, [%o0 + 0x04]
  219. st %f2, [%o0 + 0x08]
  220. st %f3, [%o0 + 0x0c]
  221. retl
  222. st %f4, [%o0 + 0x10]
  223. .align 8
  224. .Lhwunaligned:
  225. alignaddr %o1, %g0, %o1
  226. ldd [%o1 + 0x00], %f10
  227. .Lhwunaligned_loop:
  228. ldd [%o1 + 0x08], %f12
  229. ldd [%o1 + 0x10], %f14
  230. ldd [%o1 + 0x18], %f16
  231. ldd [%o1 + 0x20], %f18
  232. ldd [%o1 + 0x28], %f20
  233. ldd [%o1 + 0x30], %f22
  234. ldd [%o1 + 0x38], %f24
  235. subcc %o2, 1, %o2 ! done yet?
  236. ldd [%o1 + 0x40], %f26
  237. add %o1, 0x40, %o1
  238. prefetch [%o1 + 63], 20
  239. faligndata %f10, %f12, %f8
  240. faligndata %f12, %f14, %f10
  241. faligndata %f14, %f16, %f12
  242. faligndata %f16, %f18, %f14
  243. faligndata %f18, %f20, %f16
  244. faligndata %f20, %f22, %f18
  245. faligndata %f22, %f24, %f20
  246. faligndata %f24, %f26, %f22
  247. .word 0x81b02820 ! SHA1
  248. bne,pt SIZE_T_CC, .Lhwunaligned_loop
  249. for %f26, %f26, %f10 ! %f10=%f26
  250. ba .Lhwfinish
  251. nop
  252. .align 16
  253. .Lsoftware:
  254. save %sp,-STACK_FRAME,%sp
  255. sllx $len,6,$len
  256. add $inp,$len,$len
  257. or %g0,1,$rot1m
  258. sllx $rot1m,32,$rot1m
  259. or $rot1m,1,$rot1m
  260. ld [$ctx+0],$A
  261. ld [$ctx+4],$B
  262. ld [$ctx+8],$C
  263. ld [$ctx+12],$D
  264. ld [$ctx+16],$E
  265. andn $inp,7,$tmp0
  266. sethi %hi(0x5a827999),$K_00_19
  267. or $K_00_19,%lo(0x5a827999),$K_00_19
  268. sethi %hi(0x6ed9eba1),$K_20_39
  269. or $K_20_39,%lo(0x6ed9eba1),$K_20_39
  270. sethi %hi(0x8f1bbcdc),$K_40_59
  271. or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
  272. sethi %hi(0xca62c1d6),$K_60_79
  273. or $K_60_79,%lo(0xca62c1d6),$K_60_79
  274. .Lloop:
  275. ldx [$tmp0+0],@X[0]
  276. ldx [$tmp0+16],@X[2]
  277. ldx [$tmp0+32],@X[4]
  278. ldx [$tmp0+48],@X[6]
  279. and $inp,7,$tmp1
  280. ldx [$tmp0+8],@X[1]
  281. sll $tmp1,3,$tmp1
  282. ldx [$tmp0+24],@X[3]
  283. subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
  284. ldx [$tmp0+40],@X[5]
  285. bz,pt %icc,.Laligned
  286. ldx [$tmp0+56],@X[7]
  287. sllx @X[0],$tmp1,@X[0]
  288. ldx [$tmp0+64],$tmp64
  289. ___
  290. for($i=0;$i<7;$i++)
  291. { $code.=<<___;
  292. srlx @X[$i+1],$tmp2,$Xi
  293. sllx @X[$i+1],$tmp1,@X[$i+1]
  294. or $Xi,@X[$i],@X[$i]
  295. ___
  296. }
  297. $code.=<<___;
  298. srlx $tmp64,$tmp2,$tmp64
  299. or $tmp64,@X[7],@X[7]
  300. .Laligned:
  301. srlx @X[0],32,$Xi
  302. ___
  303. for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  304. for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
  305. for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  306. for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  307. for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  308. $code.=<<___;
  309. ld [$ctx+0],@X[0]
  310. ld [$ctx+4],@X[1]
  311. ld [$ctx+8],@X[2]
  312. ld [$ctx+12],@X[3]
  313. add $inp,64,$inp
  314. ld [$ctx+16],@X[4]
  315. cmp $inp,$len
  316. add $A,@X[0],$A
  317. st $A,[$ctx+0]
  318. add $B,@X[1],$B
  319. st $B,[$ctx+4]
  320. add $C,@X[2],$C
  321. st $C,[$ctx+8]
  322. add $D,@X[3],$D
  323. st $D,[$ctx+12]
  324. add $E,@X[4],$E
  325. st $E,[$ctx+16]
  326. bne SIZE_T_CC,.Lloop
  327. andn $inp,7,$tmp0
  328. ret
  329. restore
  330. .type sha1_block_data_order,#function
  331. .size sha1_block_data_order,(.-sha1_block_data_order)
  332. .asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  333. .align 4
  334. ___
# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# keep open the option of producing a "universal" binary and to let the
# programmer detect at run-time whether the current CPU is VIS capable.
  340. sub unvis {
  341. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  342. my $ref,$opf;
  343. my %visopf = ( "faligndata" => 0x048,
  344. "for" => 0x07c );
  345. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  346. if ($opf=$visopf{$mnemonic}) {
  347. foreach ($rs1,$rs2,$rd) {
  348. return $ref if (!/%f([0-9]{1,2})/);
  349. $_=$1;
  350. if ($1>=32) {
  351. return $ref if ($1&1);
  352. # re-encode for upper double register addressing
  353. $_=($1|$1>>5)&31;
  354. }
  355. }
  356. return sprintf ".word\t0x%08x !%s",
  357. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  358. $ref;
  359. } else {
  360. return $ref;
  361. }
  362. }
  363. sub unalignaddr {
  364. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  365. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  366. my $ref="$mnemonic\t$rs1,$rs2,$rd";
  367. foreach ($rs1,$rs2,$rd) {
  368. if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
  369. else { return $ref; }
  370. }
  371. return sprintf ".word\t0x%08x !%s",
  372. 0x81b00300|$rd<<25|$rs1<<14|$rs2,
  373. $ref;
  374. }
  375. foreach (split("\n",$code)) {
  376. s/\`([^\`]*)\`/eval $1/ge;
  377. s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
  378. &unvis($1,$2,$3,$4)
  379. /ge;
  380. s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
  381. &unalignaddr($1,$2,$3,$4)
  382. /ge;
  383. print $_,"\n";
  384. }
  385. close STDOUT or die "error closing STDOUT: $!";