#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#     in-order, i.e. load instruction has to complete prior next
#     instruction in given thread is executed, even if the latter is
#     not dependent on load result! This means that on T1 two 32-bit
#     loads are always slower than one 64-bit load. Once again this
#     is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#     2x32-bit loads can be as fast as 1x64-bit ones.
  41. $bits=32;
  42. for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
  43. if ($bits==64) { $bias=2047; $frame=192; }
  44. else { $bias=0; $frame=112; }
  45. $output=shift;
  46. open STDOUT,">$output";
  47. if ($output =~ /512/) {
  48. $label="512";
  49. $SZ=8;
  50. $LD="ldx"; # load from memory
  51. $ST="stx"; # store to memory
  52. $SLL="sllx"; # shift left logical
  53. $SRL="srlx"; # shift right logical
  54. @Sigma0=(28,34,39);
  55. @Sigma1=(14,18,41);
  56. @sigma0=( 7, 1, 8); # right shift first
  57. @sigma1=( 6,19,61); # right shift first
  58. $lastK=0x817;
  59. $rounds=80;
  60. $align=4;
  61. $locals=16*$SZ; # X[16]
  62. $A="%o0";
  63. $B="%o1";
  64. $C="%o2";
  65. $D="%o3";
  66. $E="%o4";
  67. $F="%o5";
  68. $G="%g1";
  69. $H="%o7";
  70. @V=($A,$B,$C,$D,$E,$F,$G,$H);
  71. } else {
  72. $label="256";
  73. $SZ=4;
  74. $LD="ld"; # load from memory
  75. $ST="st"; # store to memory
  76. $SLL="sll"; # shift left logical
  77. $SRL="srl"; # shift right logical
  78. @Sigma0=( 2,13,22);
  79. @Sigma1=( 6,11,25);
  80. @sigma0=( 3, 7,18); # right shift first
  81. @sigma1=(10,17,19); # right shift first
  82. $lastK=0x8f2;
  83. $rounds=64;
  84. $align=8;
  85. $locals=0; # X[16] is register resident
  86. @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
  87. $A="%l0";
  88. $B="%l1";
  89. $C="%l2";
  90. $D="%l3";
  91. $E="%l4";
  92. $F="%l5";
  93. $G="%l6";
  94. $H="%l7";
  95. @V=($A,$B,$C,$D,$E,$F,$G,$H);
  96. }
  97. $T1="%g2";
  98. $tmp0="%g3";
  99. $tmp1="%g4";
  100. $tmp2="%g5";
  101. $ctx="%i0";
  102. $inp="%i1";
  103. $len="%i2";
  104. $Ktbl="%i3";
  105. $tmp31="%i4";
  106. $tmp32="%i5";
  107. ########### SHA256
  108. $Xload = sub {
  109. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  110. if ($i==0) {
  111. $code.=<<___;
  112. ldx [$inp+0],@X[0]
  113. ldx [$inp+16],@X[2]
  114. ldx [$inp+32],@X[4]
  115. ldx [$inp+48],@X[6]
  116. ldx [$inp+8],@X[1]
  117. ldx [$inp+24],@X[3]
  118. subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
  119. ldx [$inp+40],@X[5]
  120. bz,pt %icc,.Laligned
  121. ldx [$inp+56],@X[7]
  122. sllx @X[0],$tmp31,@X[0]
  123. ldx [$inp+64],$T1
  124. ___
  125. for($j=0;$j<7;$j++)
  126. { $code.=<<___;
  127. srlx @X[$j+1],$tmp32,$tmp1
  128. sllx @X[$j+1],$tmp31,@X[$j+1]
  129. or $tmp1,@X[$j],@X[$j]
  130. ___
  131. }
  132. $code.=<<___;
  133. srlx $T1,$tmp32,$T1
  134. or $T1,@X[7],@X[7]
  135. .Laligned:
  136. ___
  137. }
  138. if ($i&1) {
  139. $code.="\tadd @X[$i/2],$h,$T1\n";
  140. } else {
  141. $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
  142. }
  143. } if ($SZ==4);
  144. ########### SHA512
  145. $Xload = sub {
  146. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  147. my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
  148. $code.=<<___ if ($i==0);
  149. ld [$inp+0],%l0
  150. ld [$inp+4],%l1
  151. ld [$inp+8],%l2
  152. ld [$inp+12],%l3
  153. ld [$inp+16],%l4
  154. ld [$inp+20],%l5
  155. ld [$inp+24],%l6
  156. ld [$inp+28],%l7
  157. ___
  158. $code.=<<___ if ($i<15);
  159. sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
  160. add $tmp31,32,$tmp0
  161. sllx @pair[0],$tmp0,$tmp1
  162. `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
  163. srlx @pair[2],$tmp32,@pair[1]
  164. or $tmp1,$tmp2,$tmp2
  165. or @pair[1],$tmp2,$tmp2
  166. `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
  167. add $h,$tmp2,$T1
  168. $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
  169. ___
  170. $code.=<<___ if ($i==12);
  171. brnz,a $tmp31,.+8
  172. ld [$inp+128],%l0
  173. ___
  174. $code.=<<___ if ($i==15);
  175. ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
  176. sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
  177. add $tmp31,32,$tmp0
  178. ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
  179. sllx @pair[0],$tmp0,$tmp1
  180. ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
  181. srlx @pair[2],$tmp32,@pair[1]
  182. or $tmp1,$tmp2,$tmp2
  183. ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
  184. or @pair[1],$tmp2,$tmp2
  185. ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
  186. add $h,$tmp2,$T1
  187. $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
  188. ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
  189. ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
  190. ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
  191. ___
  192. } if ($SZ==8);
  193. ########### common
  194. sub BODY_00_15 {
  195. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  196. if ($i<16) {
  197. &$Xload(@_);
  198. } else {
  199. $code.="\tadd $h,$T1,$T1\n";
  200. }
  201. $code.=<<___;
  202. $SRL $e,@Sigma1[0],$h !! $i
  203. xor $f,$g,$tmp2
  204. $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
  205. and $e,$tmp2,$tmp2
  206. $SRL $e,@Sigma1[1],$tmp0
  207. xor $tmp1,$h,$h
  208. $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
  209. xor $tmp0,$h,$h
  210. $SRL $e,@Sigma1[2],$tmp0
  211. xor $tmp1,$h,$h
  212. $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
  213. xor $tmp0,$h,$h
  214. xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
  215. xor $tmp1,$h,$tmp0 ! Sigma1(e)
  216. $SRL $a,@Sigma0[0],$h
  217. add $tmp2,$T1,$T1
  218. $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
  219. $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
  220. add $tmp0,$T1,$T1
  221. $SRL $a,@Sigma0[1],$tmp0
  222. xor $tmp1,$h,$h
  223. $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
  224. xor $tmp0,$h,$h
  225. $SRL $a,@Sigma0[2],$tmp0
  226. xor $tmp1,$h,$h
  227. $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
  228. xor $tmp0,$h,$h
  229. xor $tmp1,$h,$h ! Sigma0(a)
  230. or $a,$b,$tmp0
  231. and $a,$b,$tmp1
  232. and $c,$tmp0,$tmp0
  233. or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
  234. add $tmp2,$T1,$T1 ! +=K[$i]
  235. add $tmp1,$h,$h
  236. add $T1,$d,$d
  237. add $T1,$h,$h
  238. ___
  239. }
  240. ########### SHA256
  241. $BODY_16_XX = sub {
  242. my $i=@_[0];
  243. my $xi;
  244. if ($i&1) {
  245. $xi=$tmp32;
  246. $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
  247. } else {
  248. $xi=@X[(($i+1)/2)%8];
  249. }
  250. $code.=<<___;
  251. srl $xi,@sigma0[0],$T1 !! Xupdate($i)
  252. sll $xi,`32-@sigma0[2]`,$tmp1
  253. srl $xi,@sigma0[1],$tmp0
  254. xor $tmp1,$T1,$T1
  255. sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
  256. xor $tmp0,$T1,$T1
  257. srl $xi,@sigma0[2],$tmp0
  258. xor $tmp1,$T1,$T1
  259. ___
  260. if ($i&1) {
  261. $xi=@X[(($i+14)/2)%8];
  262. } else {
  263. $xi=$tmp32;
  264. $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
  265. }
  266. $code.=<<___;
  267. srl $xi,@sigma1[0],$tmp2
  268. xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
  269. sll $xi,`32-@sigma1[2]`,$tmp1
  270. srl $xi,@sigma1[1],$tmp0
  271. xor $tmp1,$tmp2,$tmp2
  272. sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
  273. xor $tmp0,$tmp2,$tmp2
  274. srl $xi,@sigma1[2],$tmp0
  275. xor $tmp1,$tmp2,$tmp2
  276. ___
  277. if ($i&1) {
  278. $xi=@X[($i/2)%8];
  279. $code.=<<___;
  280. srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
  281. xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
  282. srl @X[($i/2)%8],0,$tmp0
  283. add $tmp2,$tmp1,$tmp1
  284. add $xi,$T1,$T1 ! +=X[i]
  285. xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
  286. add $tmp1,$T1,$T1
  287. srl $T1,0,$T1
  288. or $T1,@X[($i/2)%8],@X[($i/2)%8]
  289. ___
  290. } else {
  291. $xi=@X[(($i+9)/2)%8];
  292. $code.=<<___;
  293. srlx @X[($i/2)%8],32,$tmp1 ! X[i]
  294. xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
  295. add $xi,$T1,$T1 ! +=X[i+9]
  296. add $tmp2,$tmp1,$tmp1
  297. srl @X[($i/2)%8],0,@X[($i/2)%8]
  298. add $tmp1,$T1,$T1
  299. sllx $T1,32,$tmp0
  300. or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
  301. ___
  302. }
  303. &BODY_00_15(@_);
  304. } if ($SZ==4);
  305. ########### SHA512
  306. $BODY_16_XX = sub {
  307. my $i=@_[0];
  308. my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
  309. $code.=<<___;
  310. sllx %l2,32,$tmp0 !! Xupdate($i)
  311. or %l3,$tmp0,$tmp0
  312. srlx $tmp0,@sigma0[0],$T1
  313. ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
  314. sllx $tmp0,`64-@sigma0[2]`,$tmp1
  315. ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
  316. srlx $tmp0,@sigma0[1],$tmp0
  317. xor $tmp1,$T1,$T1
  318. sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
  319. xor $tmp0,$T1,$T1
  320. srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
  321. xor $tmp1,$T1,$T1
  322. sllx %l6,32,$tmp2
  323. xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
  324. or %l7,$tmp2,$tmp2
  325. srlx $tmp2,@sigma1[0],$tmp1
  326. ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
  327. sllx $tmp2,`64-@sigma1[2]`,$tmp0
  328. ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
  329. srlx $tmp2,@sigma1[1],$tmp2
  330. xor $tmp0,$tmp1,$tmp1
  331. sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
  332. xor $tmp2,$tmp1,$tmp1
  333. srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
  334. xor $tmp0,$tmp1,$tmp1
  335. sllx %l4,32,$tmp0
  336. xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
  337. ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
  338. or %l5,$tmp0,$tmp0
  339. ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
  340. sllx %l0,32,$tmp2
  341. add $tmp1,$T1,$T1
  342. ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
  343. or %l1,$tmp2,$tmp2
  344. add $tmp0,$T1,$T1 ! +=X[$i+9]
  345. ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
  346. add $tmp2,$T1,$T1 ! +=X[$i]
  347. $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
  348. ___
  349. &BODY_00_15(@_);
  350. } if ($SZ==8);
  351. $code.=<<___ if ($bits==64);
  352. .register %g2,#scratch
  353. .register %g3,#scratch
  354. ___
  355. $code.=<<___;
  356. .section ".text",#alloc,#execinstr
  357. .align 64
  358. K${label}:
  359. .type K${label},#object
  360. ___
  361. if ($SZ==4) {
  362. $code.=<<___;
  363. .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  364. .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  365. .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  366. .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  367. .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  368. .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  369. .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  370. .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  371. .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  372. .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  373. .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  374. .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  375. .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  376. .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  377. .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  378. .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  379. ___
  380. } else {
  381. $code.=<<___;
  382. .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
  383. .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
  384. .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
  385. .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
  386. .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
  387. .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
  388. .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
  389. .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
  390. .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
  391. .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
  392. .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
  393. .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
  394. .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
  395. .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
  396. .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
  397. .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
  398. .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
  399. .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
  400. .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
  401. .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
  402. .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
  403. .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
  404. .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
  405. .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
  406. .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
  407. .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
  408. .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
  409. .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
  410. .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
  411. .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
  412. .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
  413. .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
  414. .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
  415. .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
  416. .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
  417. .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
  418. .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
  419. .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
  420. .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
  421. .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
  422. ___
  423. }
  424. $code.=<<___;
  425. .size K${label},.-K${label}
  426. .globl sha${label}_block_data_order
  427. sha${label}_block_data_order:
  428. save %sp,`-$frame-$locals`,%sp
  429. and $inp,`$align-1`,$tmp31
  430. sllx $len,`log(16*$SZ)/log(2)`,$len
  431. andn $inp,`$align-1`,$inp
  432. sll $tmp31,3,$tmp31
  433. add $inp,$len,$len
  434. ___
  435. $code.=<<___ if ($SZ==8); # SHA512
  436. mov 32,$tmp32
  437. sub $tmp32,$tmp31,$tmp32
  438. ___
  439. $code.=<<___;
  440. .Lpic: call .+8
  441. add %o7,K${label}-.Lpic,$Ktbl
  442. $LD [$ctx+`0*$SZ`],$A
  443. $LD [$ctx+`1*$SZ`],$B
  444. $LD [$ctx+`2*$SZ`],$C
  445. $LD [$ctx+`3*$SZ`],$D
  446. $LD [$ctx+`4*$SZ`],$E
  447. $LD [$ctx+`5*$SZ`],$F
  448. $LD [$ctx+`6*$SZ`],$G
  449. $LD [$ctx+`7*$SZ`],$H
  450. .Lloop:
  451. ___
  452. for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  453. $code.=".L16_xx:\n";
  454. for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
  455. $code.=<<___;
  456. and $tmp2,0xfff,$tmp2
  457. cmp $tmp2,$lastK
  458. bne .L16_xx
  459. add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
  460. ___
  461. $code.=<<___ if ($SZ==4); # SHA256
  462. $LD [$ctx+`0*$SZ`],@X[0]
  463. $LD [$ctx+`1*$SZ`],@X[1]
  464. $LD [$ctx+`2*$SZ`],@X[2]
  465. $LD [$ctx+`3*$SZ`],@X[3]
  466. $LD [$ctx+`4*$SZ`],@X[4]
  467. $LD [$ctx+`5*$SZ`],@X[5]
  468. $LD [$ctx+`6*$SZ`],@X[6]
  469. $LD [$ctx+`7*$SZ`],@X[7]
  470. add $A,@X[0],$A
  471. $ST $A,[$ctx+`0*$SZ`]
  472. add $B,@X[1],$B
  473. $ST $B,[$ctx+`1*$SZ`]
  474. add $C,@X[2],$C
  475. $ST $C,[$ctx+`2*$SZ`]
  476. add $D,@X[3],$D
  477. $ST $D,[$ctx+`3*$SZ`]
  478. add $E,@X[4],$E
  479. $ST $E,[$ctx+`4*$SZ`]
  480. add $F,@X[5],$F
  481. $ST $F,[$ctx+`5*$SZ`]
  482. add $G,@X[6],$G
  483. $ST $G,[$ctx+`6*$SZ`]
  484. add $H,@X[7],$H
  485. $ST $H,[$ctx+`7*$SZ`]
  486. ___
  487. $code.=<<___ if ($SZ==8); # SHA512
  488. ld [$ctx+`0*$SZ+0`],%l0
  489. ld [$ctx+`0*$SZ+4`],%l1
  490. ld [$ctx+`1*$SZ+0`],%l2
  491. ld [$ctx+`1*$SZ+4`],%l3
  492. ld [$ctx+`2*$SZ+0`],%l4
  493. ld [$ctx+`2*$SZ+4`],%l5
  494. ld [$ctx+`3*$SZ+0`],%l6
  495. sllx %l0,32,$tmp0
  496. ld [$ctx+`3*$SZ+4`],%l7
  497. sllx %l2,32,$tmp1
  498. or %l1,$tmp0,$tmp0
  499. or %l3,$tmp1,$tmp1
  500. add $tmp0,$A,$A
  501. add $tmp1,$B,$B
  502. $ST $A,[$ctx+`0*$SZ`]
  503. sllx %l4,32,$tmp2
  504. $ST $B,[$ctx+`1*$SZ`]
  505. sllx %l6,32,$T1
  506. or %l5,$tmp2,$tmp2
  507. or %l7,$T1,$T1
  508. add $tmp2,$C,$C
  509. $ST $C,[$ctx+`2*$SZ`]
  510. add $T1,$D,$D
  511. $ST $D,[$ctx+`3*$SZ`]
  512. ld [$ctx+`4*$SZ+0`],%l0
  513. ld [$ctx+`4*$SZ+4`],%l1
  514. ld [$ctx+`5*$SZ+0`],%l2
  515. ld [$ctx+`5*$SZ+4`],%l3
  516. ld [$ctx+`6*$SZ+0`],%l4
  517. ld [$ctx+`6*$SZ+4`],%l5
  518. ld [$ctx+`7*$SZ+0`],%l6
  519. sllx %l0,32,$tmp0
  520. ld [$ctx+`7*$SZ+4`],%l7
  521. sllx %l2,32,$tmp1
  522. or %l1,$tmp0,$tmp0
  523. or %l3,$tmp1,$tmp1
  524. add $tmp0,$E,$E
  525. add $tmp1,$F,$F
  526. $ST $E,[$ctx+`4*$SZ`]
  527. sllx %l4,32,$tmp2
  528. $ST $F,[$ctx+`5*$SZ`]
  529. sllx %l6,32,$T1
  530. or %l5,$tmp2,$tmp2
  531. or %l7,$T1,$T1
  532. add $tmp2,$G,$G
  533. $ST $G,[$ctx+`6*$SZ`]
  534. add $T1,$H,$H
  535. $ST $H,[$ctx+`7*$SZ`]
  536. ___
  537. $code.=<<___;
  538. add $inp,`16*$SZ`,$inp ! advance inp
  539. cmp $inp,$len
  540. bne `$bits==64?"%xcc":"%icc"`,.Lloop
  541. sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
  542. ret
  543. restore
  544. .type sha${label}_block_data_order,#function
  545. .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
  546. .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  547. .align 4
  548. ___
  549. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  550. print $code;
  551. close STDOUT;