2
0

sha512-sparcv9.pl 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857
  1. #! /usr/bin/env perl
  2. # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. #
  14. # Hardware SPARC T4 support by David S. Miller
  15. # ====================================================================
  16. # SHA256 performance improvement over compiler generated code varies
  17. # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
  18. # build]. Just like in SHA1 module I aim to ensure scalability on
  19. # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
  20. # SHA512 on pre-T1 UltraSPARC.
  21. #
  22. # Performance is >75% better than 64-bit code generated by Sun C and
  23. # over 2x than 32-bit code. X[16] resides on stack, but access to it
  24. # is scheduled for L2 latency and staged through 32 least significant
  25. # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
  26. # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
  27. # good [optimal coefficient is 50%].
  28. #
  29. # SHA512 on UltraSPARC T1.
  30. #
  31. # It's not any faster than 64-bit code generated by Sun C 5.8. This is
  32. # because 64-bit code generator has the advantage of using 64-bit
  33. # loads(*) to access X[16], which I consciously traded for 32-/64-bit
  34. # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
  35. # code by 60%, not to mention that it doesn't suffer from severe decay
  36. # when running 4 times physical cores threads and that it leaves gcc
  37. # [3.4] behind by over 4x factor! If compared to SHA256, single thread
  38. # performance is only 10% better, but overall throughput for maximum
  39. # amount of threads for given CPU exceeds corresponding one of SHA256
  40. # by 30% [again, optimal coefficient is 50%].
  41. #
  42. # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
  43. # in-order, i.e. load instruction has to complete prior next
  44. # instruction in given thread is executed, even if the latter is
  45. # not dependent on load result! This means that on T1 two 32-bit
  46. # loads are always slower than one 64-bit load. Once again this
  47. # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
  48. # 2x32-bit loads can be as fast as 1x64-bit ones.
  49. #
  50. # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
  51. # which is 9.3x/11.1x faster than software. Multi-process benchmark
  52. # saturates at 11.5x single-process result on 8-core processor, or
  53. # ~11/16GBps per 2.85GHz socket.
  # The last command-line argument names the assembly file to generate.
  # Everything printed later goes to STDOUT, so redirect it there.  The
  # two-argument form of open is kept on purpose so that existing callers
  # relying on its filename handling keep working; the only change is that
  # a failed open is now fatal instead of silently discarding all output.
  $output=pop;
  open STDOUT,">$output" or die "can't open $output: $!";
  # Choose the hash flavour from the output file name: a name containing
  # "512" selects SHA512, anything else selects SHA256.  Each branch sets
  # the word size ($SZ), the load/store/shift mnemonics of matching width,
  # the rotate/shift amounts, and the register names of the eight working
  # variables (@V, also available individually as $A..$H).
  if ($output !~ /512/) {
      # SHA256: 32-bit words, 64 rounds.
      $label="256";
      $SZ=4;
      ($LD,$ST)=("ld","st");          # load from / store to memory
      ($SLL,$SRL)=("sll","srl");      # shift left / right logical
      @Sigma0=(2,13,22);
      @Sigma1=(6,11,25);
      @sigma0=(3,7,18);               # right shift first
      @sigma1=(10,17,19);             # right shift first
      $lastK=0x8f2;                   # compared with low 12 bits of K to end the round loop
      $rounds=64;
      $align=8;                       # input pointer is aligned down to this many bytes
      $locals=0;                      # X[16] is register resident
      @X=((map { "%o$_" } 0..5),"%g1","%o7");
      @V=map { "%l$_" } 0..7;
      ($A,$B,$C,$D,$E,$F,$G,$H)=@V;
  } else {
      # SHA512: 64-bit words, 80 rounds.
      $label="512";
      $SZ=8;
      ($LD,$ST)=("ldx","stx");        # load from / store to memory
      ($SLL,$SRL)=("sllx","srlx");    # shift left / right logical
      @Sigma0=(28,34,39);
      @Sigma1=(14,18,41);
      @sigma0=(7,1,8);                # right shift first
      @sigma1=(6,19,61);              # right shift first
      $lastK=0x817;                   # compared with low 12 bits of K to end the round loop
      $rounds=80;
      $align=4;                       # input pointer is aligned down to this many bytes
      $locals=16*$SZ;                 # X[16]
      @V=((map { "%o$_" } 0..5),"%g1","%o7");
      ($A,$B,$C,$D,$E,$F,$G,$H)=@V;
  }
  # Scratch registers shared by both flavours live in %g2..%g5.
  ($T1,$tmp0,$tmp1,$tmp2)=map { "%g$_" } 2..5;
  # Names for %i0..%i5: context pointer, input pointer, length, K table
  # pointer, and two temporaries holding the input alignment shift counts.
  ($ctx,$inp,$len,$Ktbl,$tmp31,$tmp32)=map { "%i$_" } 0..5;
  116. ########### SHA256
  # SHA256 flavour of $Xload; the trailing "if ($SZ==4)" means it is only
  # installed for SHA256.  BODY_00_15 calls it for rounds 0..15 with the
  # round index $i followed by the register names of the eight working
  # variables.  It appends to $code the instructions that fetch message
  # word X[$i] and leave X[$i]+h in $T1.  Only $i and $h are used here.
  117. $Xload = sub {
  118. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  # Round 0 loads the whole 64-byte block into @X[0..7] with eight 64-bit
  # loads.  $tmp31 is the input misalignment in bits; when it is zero the
  # emitted code branches straight to .Laligned, otherwise a ninth
  # doubleword is fetched into $T1 for the realignment below.
  119. if ($i==0) {
  120. $code.=<<___;
  121. ldx [$inp+0],@X[0]
  122. ldx [$inp+16],@X[2]
  123. ldx [$inp+32],@X[4]
  124. ldx [$inp+48],@X[6]
  125. ldx [$inp+8],@X[1]
  126. ldx [$inp+24],@X[3]
  127. subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
  128. ldx [$inp+40],@X[5]
  129. bz,pt %icc,.Laligned
  130. ldx [$inp+56],@X[7]
  131. sllx @X[0],$tmp31,@X[0]
  132. ldx [$inp+64],$T1
  133. ___
  # Realign: each register receives the bits shifted out of its successor.
  134. for($j=0;$j<7;$j++)
  135. { $code.=<<___;
  136. srlx @X[$j+1],$tmp32,$tmp1
  137. sllx @X[$j+1],$tmp31,@X[$j+1]
  138. or $tmp1,@X[$j],@X[$j]
  139. ___
  140. }
  # The last register is completed from the extra doubleword in $T1.
  141. $code.=<<___;
  142. srlx $T1,$tmp32,$T1
  143. or $T1,@X[7],@X[7]
  144. .Laligned:
  145. ___
  146. }
  # Two message words share one register.  The array subscript truncates
  # $i/2 to an integer, so rounds 2k and 2k+1 both use @X[k]: odd rounds
  # add the register as it is, even rounds first shift the upper 32 bits
  # down into $T1.
  147. if ($i&1) {
  148. $code.="\tadd @X[$i/2],$h,$T1\n";
  149. } else {
  150. $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
  151. }
  152. } if ($SZ==4);
  153. ########### SHA512
  # SHA512 flavour of $Xload; the trailing "if ($SZ==8)" means it is only
  # installed for SHA512.  BODY_00_15 calls it for rounds 0..15 with the
  # round index $i followed by the register names of the eight working
  # variables.  It appends to $code the instructions that assemble the
  # 64-bit message word X[$i] from 32-bit loads, store it in the stack
  # frame slot $i*$SZ and leave X[$i]+h in $T1.  Only $i and $h are used.
  154. $Xload = sub {
  155. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  # @pair[0], @pair[1] name the %l registers holding the two 32-bit halves
  # of X[$i]; @pair[2] is the register holding the next 32-bit word.
  156. my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
  # Round 0: preload the first eight 32-bit words of the block into
  # %l0-%l7 and compare the misalignment shift $tmp31 with zero.
  157. $code.=<<___ if ($i==0);
  158. ld [$inp+0],%l0
  159. ld [$inp+4],%l1
  160. ld [$inp+8],%l2
  161. ld [$inp+12],%l3
  162. ld [$inp+16],%l4
  163. ld [$inp+20],%l5
  164. ld [$inp+24],%l6
  165. cmp $tmp31,0
  166. ld [$inp+28],%l7
  167. ___
  # Rounds 0..14: merge the pair (shifted by $tmp31 / $tmp32 to undo input
  # misalignment) into $tmp2, store it as X[$i] and form $T1.  The two
  # backtick expressions are evaluated by the post-processing loop at the
  # end of the file; for $i<12 they emit loads that refill the pair
  # registers with input words needed four rounds later.
  168. $code.=<<___ if ($i<15);
  169. sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
  170. add $tmp31,32,$tmp0
  171. sllx @pair[0],$tmp0,$tmp1
  172. `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
  173. srlx @pair[2],$tmp32,@pair[1]
  174. or $tmp1,$tmp2,$tmp2
  175. or @pair[1],$tmp2,$tmp2
  176. `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
  177. add $h,$tmp2,$T1
  178. $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
  179. ___
  # Round 12: conditional load of the word just past the 128-byte block.
  # NOTE(review): presumably this relies on the condition codes set by the
  # "cmp $tmp31,0" emitted in round 0 still being intact - confirm.
  180. $code.=<<___ if ($i==12);
  181. bnz,a,pn %icc,.+8
  182. ld [$inp+128],%l0
  183. ___
  # Round 15: same merge as above, interleaved with loads that fetch stack
  # slots (i+1+1)%16, (i+1+9)%16, (i+1+14)%16 and (i+1+0)%16 into %l2/%l3,
  # %l4/%l5, %l6/%l7 and %l0/%l1, the registers the SHA512 $BODY_16_XX
  # reads at the start of the next round.
  184. $code.=<<___ if ($i==15);
  185. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
  186. sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
  187. add $tmp31,32,$tmp0
  188. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
  189. sllx @pair[0],$tmp0,$tmp1
  190. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
  191. srlx @pair[2],$tmp32,@pair[1]
  192. or $tmp1,$tmp2,$tmp2
  193. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
  194. or @pair[1],$tmp2,$tmp2
  195. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
  196. add $h,$tmp2,$T1
  197. $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
  198. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
  199. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
  200. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
  201. ___
  202. } if ($SZ==8);
  203. ########### common
  # Append the assembly for one compression round to $code.
  #   $i       round index; also selects the K table entry at $i*$SZ
  #   $a..$h   register names currently holding the eight working variables
  # For $i<16 the flavour-specific $Xload fetches the message word and
  # leaves X[$i]+h in $T1; for later rounds the caller ($BODY_16_XX) has
  # already left the new message word in $T1, so only h is added here.
  # The emitted code then adds Ch(e,f,g), Sigma1(e) and K[$i] to $T1,
  # builds Sigma0(a)+Maj(a,b,c) in $h, and finishes with d+=T1, h+=T1.
  # Each rotate is built from a right shift and a left shift xor-ed
  # together.
  204. sub BODY_00_15 {
  205. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  206. if ($i<16) {
  207. &$Xload(@_);
  208. } else {
  209. $code.="\tadd $h,$T1,$T1\n";
  210. }
  # The backtick expressions below are shift counts, computed later by the
  # post-processing loop at the end of the file.
  211. $code.=<<___;
  212. $SRL $e,@Sigma1[0],$h !! $i
  213. xor $f,$g,$tmp2
  214. $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
  215. and $e,$tmp2,$tmp2
  216. $SRL $e,@Sigma1[1],$tmp0
  217. xor $tmp1,$h,$h
  218. $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
  219. xor $tmp0,$h,$h
  220. $SRL $e,@Sigma1[2],$tmp0
  221. xor $tmp1,$h,$h
  222. $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
  223. xor $tmp0,$h,$h
  224. xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
  225. xor $tmp1,$h,$tmp0 ! Sigma1(e)
  226. $SRL $a,@Sigma0[0],$h
  227. add $tmp2,$T1,$T1
  228. $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
  229. $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
  230. add $tmp0,$T1,$T1
  231. $SRL $a,@Sigma0[1],$tmp0
  232. xor $tmp1,$h,$h
  233. $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
  234. xor $tmp0,$h,$h
  235. $SRL $a,@Sigma0[2],$tmp0
  236. xor $tmp1,$h,$h
  237. $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
  238. xor $tmp0,$h,$h
  239. xor $tmp1,$h,$h ! Sigma0(a)
  240. or $a,$b,$tmp0
  241. and $a,$b,$tmp1
  242. and $c,$tmp0,$tmp0
  243. or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
  244. add $tmp2,$T1,$T1 ! +=K[$i]
  245. add $tmp1,$h,$h
  246. add $T1,$d,$d
  247. add $T1,$h,$h
  248. ___
  249. }
  250. ########### SHA256
  # SHA256 flavour of $BODY_16_XX (installed only when $SZ==4).  Called
  # with the same argument list as BODY_00_15 for round indices >= 16.  It
  # emits the message schedule update
  #   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]
  # and then hands over to BODY_00_15 for the round itself.  Two 32-bit
  # message words are packed per @X register (even index in the upper
  # half, odd index in the lower half), so whenever the upper half is
  # needed it is first shifted down with srlx.
  251. $BODY_16_XX = sub {
  252. my $i=@_[0];
  253. my $xi;
  # Locate X[i+1]: for odd $i it sits in an upper half and is copied down
  # into $tmp32; for even $i the register is used directly.
  254. if ($i&1) {
  255. $xi=$tmp32;
  256. $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
  257. } else {
  258. $xi=@X[(($i+1)/2)%8];
  259. }
  260. $code.=<<___;
  261. srl $xi,@sigma0[0],$T1 !! Xupdate($i)
  262. sll $xi,`32-@sigma0[2]`,$tmp1
  263. srl $xi,@sigma0[1],$tmp0
  264. xor $tmp1,$T1,$T1
  265. sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
  266. xor $tmp0,$T1,$T1
  267. srl $xi,@sigma0[2],$tmp0
  268. xor $tmp1,$T1,$T1
  269. ___
  # Locate X[i+14]: the parity is opposite to the X[i+1] case above.
  270. if ($i&1) {
  271. $xi=@X[(($i+14)/2)%8];
  272. } else {
  273. $xi=$tmp32;
  274. $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
  275. }
  276. $code.=<<___;
  277. srl $xi,@sigma1[0],$tmp2
  278. xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
  279. sll $xi,`32-@sigma1[2]`,$tmp1
  280. srl $xi,@sigma1[1],$tmp0
  281. xor $tmp1,$tmp2,$tmp2
  282. sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
  283. xor $tmp0,$tmp2,$tmp2
  284. srl $xi,@sigma1[2],$tmp0
  285. xor $tmp1,$tmp2,$tmp2
  286. ___
  # Add X[i] and X[i+9], then write the new word back into its half of
  # @X[($i/2)%8]: odd $i replaces the lower 32 bits, even $i the upper 32
  # bits.  The sum is left in $T1 for BODY_00_15.
  287. if ($i&1) {
  288. $xi=@X[($i/2)%8];
  289. $code.=<<___;
  290. srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
  291. xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
  292. srl @X[($i/2)%8],0,$tmp0
  293. add $tmp2,$tmp1,$tmp1
  294. add $xi,$T1,$T1 ! +=X[i]
  295. xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
  296. add $tmp1,$T1,$T1
  297. srl $T1,0,$T1
  298. or $T1,@X[($i/2)%8],@X[($i/2)%8]
  299. ___
  300. } else {
  301. $xi=@X[(($i+9)/2)%8];
  302. $code.=<<___;
  303. srlx @X[($i/2)%8],32,$tmp1 ! X[i]
  304. xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
  305. add $xi,$T1,$T1 ! +=X[i+9]
  306. add $tmp2,$tmp1,$tmp1
  307. srl @X[($i/2)%8],0,@X[($i/2)%8]
  308. add $tmp1,$T1,$T1
  309. sllx $T1,32,$tmp0
  310. or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
  311. ___
  312. }
  313. &BODY_00_15(@_);
  314. } if ($SZ==4);
  315. ########### SHA512
  # SHA512 flavour of $BODY_16_XX (installed only when $SZ==8).  Called
  # with the same argument list as BODY_00_15 for round indices >= 16.  It
  # emits the message schedule update
  #   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]
  # stores the result in stack slot ($i%16)*$SZ, leaves it in $T1 and
  # then hands over to BODY_00_15.  On entry the emitted code expects the
  # 32-bit halves of X[i], X[i+1], X[i+9] and X[i+14] in %l0/%l1, %l2/%l3,
  # %l4/%l5 and %l6/%l7; while computing it reloads those registers from
  # the stack with the operands of the following round.
  316. $BODY_16_XX = sub {
  317. my $i=@_[0];
  # NOTE(review): @pair is computed but not referenced in this sub.
  318. my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
  319. $code.=<<___;
  320. sllx %l2,32,$tmp0 !! Xupdate($i)
  321. or %l3,$tmp0,$tmp0
  322. srlx $tmp0,@sigma0[0],$T1
  323. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
  324. sllx $tmp0,`64-@sigma0[2]`,$tmp1
  325. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
  326. srlx $tmp0,@sigma0[1],$tmp0
  327. xor $tmp1,$T1,$T1
  328. sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
  329. xor $tmp0,$T1,$T1
  330. srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
  331. xor $tmp1,$T1,$T1
  332. sllx %l6,32,$tmp2
  333. xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
  334. or %l7,$tmp2,$tmp2
  335. srlx $tmp2,@sigma1[0],$tmp1
  336. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
  337. sllx $tmp2,`64-@sigma1[2]`,$tmp0
  338. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
  339. srlx $tmp2,@sigma1[1],$tmp2
  340. xor $tmp0,$tmp1,$tmp1
  341. sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
  342. xor $tmp2,$tmp1,$tmp1
  343. srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
  344. xor $tmp0,$tmp1,$tmp1
  345. sllx %l4,32,$tmp0
  346. xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
  347. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
  348. or %l5,$tmp0,$tmp0
  349. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
  350. sllx %l0,32,$tmp2
  351. add $tmp1,$T1,$T1
  352. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
  353. or %l1,$tmp2,$tmp2
  354. add $tmp0,$T1,$T1 ! +=X[$i+9]
  355. ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
  356. add $tmp2,$T1,$T1 ! +=X[$i]
  357. $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
  358. ___
  359. &BODY_00_15(@_);
  360. } if ($SZ==8);
  # Start of the generated file: pull in sparc_arch.h, declare %g2/%g3 as
  # scratch registers for 64-bit builds, open the text section and begin
  # the 64-byte aligned constant table K256 or K512 (per $label).
  361. $code.=<<___;
  362. #include "sparc_arch.h"
  363. #ifdef __arch64__
  364. .register %g2,#scratch
  365. .register %g3,#scratch
  366. #endif
  367. .section ".text",#alloc,#execinstr
  368. .align 64
  369. K${label}:
  370. .type K${label},#object
  371. ___
  # Body of the K table that the round code indexes as K[$i].  SHA256 gets
  # 64 32-bit entries; SHA512 gets 80 64-bit entries, each written as two
  # 32-bit words with the high word first.  The low 12 bits of the final
  # entry (0x8f2 / 0x817) are the $lastK value the round loop tests for.
  372. if ($SZ==4) {
  373. $code.=<<___;
  374. .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  375. .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  376. .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  377. .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  378. .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  379. .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  380. .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  381. .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  382. .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  383. .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  384. .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  385. .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  386. .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  387. .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  388. .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  389. .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  390. ___
  391. } else {
  392. $code.=<<___;
  393. .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
  394. .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
  395. .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
  396. .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
  397. .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
  398. .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
  399. .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
  400. .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
  401. .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
  402. .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
  403. .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
  404. .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
  405. .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
  406. .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
  407. .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
  408. .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
  409. .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
  410. .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
  411. .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
  412. .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
  413. .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
  414. .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
  415. .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
  416. .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
  417. .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
  418. .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
  419. .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
  420. .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
  421. .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
  422. .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
  423. .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
  424. .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
  425. .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
  426. .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
  427. .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
  428. .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
  429. .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
  430. .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
  431. .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
  432. .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
  433. ___
  434. }
  # Close the K table and emit the public entry point
  # sha${label}_block_data_order.  The emitted code loads word 1 of
  # OPENSSL_sparcv9cap_P and tests the CFR_SHA${label} bit: when it is
  # clear, control goes to the generic .Lsoftware implementation;
  # otherwise it falls through into the hardware path emitted next.
  435. $code.=<<___;
  436. .size K${label},.-K${label}
  437. #ifdef __PIC__
  438. SPARC_PIC_THUNK(%g1)
  439. #endif
  440. .globl sha${label}_block_data_order
  441. .align 32
  442. sha${label}_block_data_order:
  443. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  444. ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
  445. andcc %g1, CFR_SHA${label}, %g0
  446. be .Lsoftware
  447. nop
  448. ___
  # SHA512 hardware path.  The emitted leaf code keeps the 64-byte context
  # in %f0-%f14 and, per 128-byte block, loads the message into %f16-%f46
  # and issues the SHA512 opcode (hand-encoded as .word 0x81b02860).
  # Input that is not 8-byte aligned goes through .Lhwunaligned, which
  # uses alignaddr/faligndata to realign and carries the trailing
  # doubleword (%f50) over into the next iteration as %f18.
  449. $code.=<<___ if ($SZ==8); # SHA512
  450. ldd [%o0 + 0x00], %f0 ! load context
  451. ldd [%o0 + 0x08], %f2
  452. ldd [%o0 + 0x10], %f4
  453. ldd [%o0 + 0x18], %f6
  454. ldd [%o0 + 0x20], %f8
  455. ldd [%o0 + 0x28], %f10
  456. andcc %o1, 0x7, %g0
  457. ldd [%o0 + 0x30], %f12
  458. bne,pn %icc, .Lhwunaligned
  459. ldd [%o0 + 0x38], %f14
  460. .Lhwaligned_loop:
  461. ldd [%o1 + 0x00], %f16
  462. ldd [%o1 + 0x08], %f18
  463. ldd [%o1 + 0x10], %f20
  464. ldd [%o1 + 0x18], %f22
  465. ldd [%o1 + 0x20], %f24
  466. ldd [%o1 + 0x28], %f26
  467. ldd [%o1 + 0x30], %f28
  468. ldd [%o1 + 0x38], %f30
  469. ldd [%o1 + 0x40], %f32
  470. ldd [%o1 + 0x48], %f34
  471. ldd [%o1 + 0x50], %f36
  472. ldd [%o1 + 0x58], %f38
  473. ldd [%o1 + 0x60], %f40
  474. ldd [%o1 + 0x68], %f42
  475. ldd [%o1 + 0x70], %f44
  476. subcc %o2, 1, %o2 ! done yet?
  477. ldd [%o1 + 0x78], %f46
  478. add %o1, 0x80, %o1
  479. prefetch [%o1 + 63], 20
  480. prefetch [%o1 + 64+63], 20
  481. .word 0x81b02860 ! SHA512
  482. bne,pt SIZE_T_CC, .Lhwaligned_loop
  483. nop
  484. .Lhwfinish:
  485. std %f0, [%o0 + 0x00] ! store context
  486. std %f2, [%o0 + 0x08]
  487. std %f4, [%o0 + 0x10]
  488. std %f6, [%o0 + 0x18]
  489. std %f8, [%o0 + 0x20]
  490. std %f10, [%o0 + 0x28]
  491. std %f12, [%o0 + 0x30]
  492. retl
  493. std %f14, [%o0 + 0x38]
  494. .align 16
  495. .Lhwunaligned:
  496. alignaddr %o1, %g0, %o1
  497. ldd [%o1 + 0x00], %f18
  498. .Lhwunaligned_loop:
  499. ldd [%o1 + 0x08], %f20
  500. ldd [%o1 + 0x10], %f22
  501. ldd [%o1 + 0x18], %f24
  502. ldd [%o1 + 0x20], %f26
  503. ldd [%o1 + 0x28], %f28
  504. ldd [%o1 + 0x30], %f30
  505. ldd [%o1 + 0x38], %f32
  506. ldd [%o1 + 0x40], %f34
  507. ldd [%o1 + 0x48], %f36
  508. ldd [%o1 + 0x50], %f38
  509. ldd [%o1 + 0x58], %f40
  510. ldd [%o1 + 0x60], %f42
  511. ldd [%o1 + 0x68], %f44
  512. ldd [%o1 + 0x70], %f46
  513. ldd [%o1 + 0x78], %f48
  514. subcc %o2, 1, %o2 ! done yet?
  515. ldd [%o1 + 0x80], %f50
  516. add %o1, 0x80, %o1
  517. prefetch [%o1 + 63], 20
  518. prefetch [%o1 + 64+63], 20
  519. faligndata %f18, %f20, %f16
  520. faligndata %f20, %f22, %f18
  521. faligndata %f22, %f24, %f20
  522. faligndata %f24, %f26, %f22
  523. faligndata %f26, %f28, %f24
  524. faligndata %f28, %f30, %f26
  525. faligndata %f30, %f32, %f28
  526. faligndata %f32, %f34, %f30
  527. faligndata %f34, %f36, %f32
  528. faligndata %f36, %f38, %f34
  529. faligndata %f38, %f40, %f36
  530. faligndata %f40, %f42, %f38
  531. faligndata %f42, %f44, %f40
  532. faligndata %f44, %f46, %f42
  533. faligndata %f46, %f48, %f44
  534. faligndata %f48, %f50, %f46
  535. .word 0x81b02860 ! SHA512
  536. bne,pt SIZE_T_CC, .Lhwunaligned_loop
  537. for %f50, %f50, %f18 ! %f18=%f50
  538. ba .Lhwfinish
  539. nop
  540. ___
  # SHA256 hardware path.  The emitted leaf code keeps the 32-byte context
  # in %f0-%f7 and, per 64-byte block, loads the message into %f8-%f22 and
  # issues the SHA256 opcode (hand-encoded as .word 0x81b02840).  Input
  # that is not 8-byte aligned goes through .Lhwunaligned, which uses
  # alignaddr/faligndata to realign and carries the trailing doubleword
  # (%f26) over into the next iteration as %f10.
  541. $code.=<<___ if ($SZ==4); # SHA256
  542. ld [%o0 + 0x00], %f0
  543. ld [%o0 + 0x04], %f1
  544. ld [%o0 + 0x08], %f2
  545. ld [%o0 + 0x0c], %f3
  546. ld [%o0 + 0x10], %f4
  547. ld [%o0 + 0x14], %f5
  548. andcc %o1, 0x7, %g0
  549. ld [%o0 + 0x18], %f6
  550. bne,pn %icc, .Lhwunaligned
  551. ld [%o0 + 0x1c], %f7
  552. .Lhwloop:
  553. ldd [%o1 + 0x00], %f8
  554. ldd [%o1 + 0x08], %f10
  555. ldd [%o1 + 0x10], %f12
  556. ldd [%o1 + 0x18], %f14
  557. ldd [%o1 + 0x20], %f16
  558. ldd [%o1 + 0x28], %f18
  559. ldd [%o1 + 0x30], %f20
  560. subcc %o2, 1, %o2 ! done yet?
  561. ldd [%o1 + 0x38], %f22
  562. add %o1, 0x40, %o1
  563. prefetch [%o1 + 63], 20
  564. .word 0x81b02840 ! SHA256
  565. bne,pt SIZE_T_CC, .Lhwloop
  566. nop
  567. .Lhwfinish:
  568. st %f0, [%o0 + 0x00] ! store context
  569. st %f1, [%o0 + 0x04]
  570. st %f2, [%o0 + 0x08]
  571. st %f3, [%o0 + 0x0c]
  572. st %f4, [%o0 + 0x10]
  573. st %f5, [%o0 + 0x14]
  574. st %f6, [%o0 + 0x18]
  575. retl
  576. st %f7, [%o0 + 0x1c]
  577. .align 8
  578. .Lhwunaligned:
  579. alignaddr %o1, %g0, %o1
  580. ldd [%o1 + 0x00], %f10
  581. .Lhwunaligned_loop:
  582. ldd [%o1 + 0x08], %f12
  583. ldd [%o1 + 0x10], %f14
  584. ldd [%o1 + 0x18], %f16
  585. ldd [%o1 + 0x20], %f18
  586. ldd [%o1 + 0x28], %f20
  587. ldd [%o1 + 0x30], %f22
  588. ldd [%o1 + 0x38], %f24
  589. subcc %o2, 1, %o2 ! done yet?
  590. ldd [%o1 + 0x40], %f26
  591. add %o1, 0x40, %o1
  592. prefetch [%o1 + 63], 20
  593. faligndata %f10, %f12, %f8
  594. faligndata %f12, %f14, %f10
  595. faligndata %f14, %f16, %f12
  596. faligndata %f16, %f18, %f14
  597. faligndata %f18, %f20, %f16
  598. faligndata %f20, %f22, %f18
  599. faligndata %f22, %f24, %f20
  600. faligndata %f24, %f26, %f22
  601. .word 0x81b02840 ! SHA256
  602. bne,pt SIZE_T_CC, .Lhwunaligned_loop
  603. for %f26, %f26, %f10 ! %f10=%f26
  604. ba .Lhwfinish
  605. nop
  606. ___
  # Software path prologue.  Opens a register window with $locals extra
  # bytes of stack, splits the input pointer into an aligned base ($inp)
  # and a misalignment expressed in bits ($tmp31 = (inp & (align-1)) * 8),
  # and turns the block count in $len into an end pointer
  # (aligned inp + count * 16 * $SZ).
  607. $code.=<<___;
  608. .align 16
  609. .Lsoftware:
  610. save %sp,-STACK_FRAME-$locals,%sp
  611. and $inp,`$align-1`,$tmp31
  612. sllx $len,`log(16*$SZ)/log(2)`,$len
  613. andn $inp,`$align-1`,$inp
  614. sll $tmp31,3,$tmp31
  615. add $inp,$len,$len
  616. ___
  # SHA512 only: $tmp32 = 32 - $tmp31, the complementary shift count used
  # by the SHA512 $Xload when merging misaligned 32-bit words.
  617. $code.=<<___ if ($SZ==8); # SHA512
  618. mov 32,$tmp32
  619. sub $tmp32,$tmp31,$tmp32
  620. ___
  # Obtain the address of the K table position-independently (call .+8
  # leaves the address of .Lpic in %o7), load the eight state words from
  # the context into $A..$H, and mark the top of the per-block loop.
  621. $code.=<<___;
  622. .Lpic: call .+8
  623. add %o7,K${label}-.Lpic,$Ktbl
  624. $LD [$ctx+`0*$SZ`],$A
  625. $LD [$ctx+`1*$SZ`],$B
  626. $LD [$ctx+`2*$SZ`],$C
  627. $LD [$ctx+`3*$SZ`],$D
  628. $LD [$ctx+`4*$SZ`],$E
  629. $LD [$ctx+`5*$SZ`],$F
  630. $LD [$ctx+`6*$SZ`],$G
  631. $LD [$ctx+`7*$SZ`],$H
  632. .Lloop:
  633. ___
  # Emit rounds 0..15.  After each round @V is rotated right by one, so
  # the register that held h becomes a for the next round and no register
  # moves are needed in the generated code.
  634. for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  635. $code.=".L16_xx:\n";
  # Emit one group of 16 schedule-update rounds (indices 16..31).  The
  # generated code re-executes this group at run time, advancing $Ktbl by
  # 16 entries per pass.
  636. for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
  # Loop control: $tmp2 still holds the K word loaded by the last round;
  # the loop back to .L16_xx ends once its low 12 bits equal $lastK.  The
  # $Ktbl increment sits in the branch delay slot.
  637. $code.=<<___;
  638. and $tmp2,0xfff,$tmp2
  639. cmp $tmp2,$lastK
  640. bne .L16_xx
  641. add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
  642. ___
  # SHA256: add the working variables $A..$H to the eight state words in
  # the context and store the sums back.  The @X registers are reused as
  # temporaries for the previous state values.
  643. $code.=<<___ if ($SZ==4); # SHA256
  644. $LD [$ctx+`0*$SZ`],@X[0]
  645. $LD [$ctx+`1*$SZ`],@X[1]
  646. $LD [$ctx+`2*$SZ`],@X[2]
  647. $LD [$ctx+`3*$SZ`],@X[3]
  648. $LD [$ctx+`4*$SZ`],@X[4]
  649. $LD [$ctx+`5*$SZ`],@X[5]
  650. $LD [$ctx+`6*$SZ`],@X[6]
  651. $LD [$ctx+`7*$SZ`],@X[7]
  652. add $A,@X[0],$A
  653. $ST $A,[$ctx+`0*$SZ`]
  654. add $B,@X[1],$B
  655. $ST $B,[$ctx+`1*$SZ`]
  656. add $C,@X[2],$C
  657. $ST $C,[$ctx+`2*$SZ`]
  658. add $D,@X[3],$D
  659. $ST $D,[$ctx+`3*$SZ`]
  660. add $E,@X[4],$E
  661. $ST $E,[$ctx+`4*$SZ`]
  662. add $F,@X[5],$F
  663. $ST $F,[$ctx+`5*$SZ`]
  664. add $G,@X[6],$G
  665. $ST $G,[$ctx+`6*$SZ`]
  666. add $H,@X[7],$H
  667. $ST $H,[$ctx+`7*$SZ`]
  668. ___
  # SHA512: add the working variables $A..$H to the eight state words in
  # the context and store the sums back.  The previous state is read with
  # 32-bit loads into %l0-%l7 and each pair is recombined into a 64-bit
  # value (high word shifted left by 32, or-ed with the low word) before
  # the addition; the state is handled four words at a time.
  669. $code.=<<___ if ($SZ==8); # SHA512
  670. ld [$ctx+`0*$SZ+0`],%l0
  671. ld [$ctx+`0*$SZ+4`],%l1
  672. ld [$ctx+`1*$SZ+0`],%l2
  673. ld [$ctx+`1*$SZ+4`],%l3
  674. ld [$ctx+`2*$SZ+0`],%l4
  675. ld [$ctx+`2*$SZ+4`],%l5
  676. ld [$ctx+`3*$SZ+0`],%l6
  677. sllx %l0,32,$tmp0
  678. ld [$ctx+`3*$SZ+4`],%l7
  679. sllx %l2,32,$tmp1
  680. or %l1,$tmp0,$tmp0
  681. or %l3,$tmp1,$tmp1
  682. add $tmp0,$A,$A
  683. add $tmp1,$B,$B
  684. $ST $A,[$ctx+`0*$SZ`]
  685. sllx %l4,32,$tmp2
  686. $ST $B,[$ctx+`1*$SZ`]
  687. sllx %l6,32,$T1
  688. or %l5,$tmp2,$tmp2
  689. or %l7,$T1,$T1
  690. add $tmp2,$C,$C
  691. $ST $C,[$ctx+`2*$SZ`]
  692. add $T1,$D,$D
  693. $ST $D,[$ctx+`3*$SZ`]
  694. ld [$ctx+`4*$SZ+0`],%l0
  695. ld [$ctx+`4*$SZ+4`],%l1
  696. ld [$ctx+`5*$SZ+0`],%l2
  697. ld [$ctx+`5*$SZ+4`],%l3
  698. ld [$ctx+`6*$SZ+0`],%l4
  699. ld [$ctx+`6*$SZ+4`],%l5
  700. ld [$ctx+`7*$SZ+0`],%l6
  701. sllx %l0,32,$tmp0
  702. ld [$ctx+`7*$SZ+4`],%l7
  703. sllx %l2,32,$tmp1
  704. or %l1,$tmp0,$tmp0
  705. or %l3,$tmp1,$tmp1
  706. add $tmp0,$E,$E
  707. add $tmp1,$F,$F
  708. $ST $E,[$ctx+`4*$SZ`]
  709. sllx %l4,32,$tmp2
  710. $ST $F,[$ctx+`5*$SZ`]
  711. sllx %l6,32,$T1
  712. or %l5,$tmp2,$tmp2
  713. or %l7,$T1,$T1
  714. add $tmp2,$G,$G
  715. $ST $G,[$ctx+`6*$SZ`]
  716. add $T1,$H,$H
  717. $ST $H,[$ctx+`7*$SZ`]
  718. ___
  # Per-block loop tail and function epilogue.  Advances $inp by one block
  # (16*$SZ bytes) and loops back to .Lloop until it reaches the end
  # pointer in $len; the branch delay slot rewinds $Ktbl by the
  # ($rounds-16) entries it was advanced during the rounds.  Then returns,
  # and emits the symbol type/size and the identification string.
  719. $code.=<<___;
  720. add $inp,`16*$SZ`,$inp ! advance inp
  721. cmp $inp,$len
  722. bne SIZE_T_CC,.Lloop
  723. sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
  724. ret
  725. restore
  726. .type sha${label}_block_data_order,#function
  727. .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
  728. .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  729. .align 4
  730. ___
  731. # Purpose of these subroutines is to explicitly encode VIS instructions,
  732. # so that one can compile the module without having to specify VIS
  733. # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
  734. # Idea is to reserve for option to produce "universal" binary and let
  735. # programmer detect if current CPU is VIS capable at run-time.
  # Encode a three-operand VIS floating-point instruction as a raw .word
  # directive so the output assembles without VIS support enabled.
  #
  #   $mnemonic        instruction name; only "faligndata" and "for" are
  #                    encoded
  #   $rs1,$rs2,$rd    operands written as %fN
  #
  # Returns ".word\t0x........ !<original text>" on success.  When the
  # mnemonic is unknown, an operand is not an %f register, or an operand is
  # an odd-numbered register >= 32, the original instruction text is
  # returned unchanged and left for the assembler to deal with.
  sub unvis {
      my ($mnemonic,$rs1,$rs2,$rd)=@_;
      # Both variables are lexical; the earlier "my $ref,$opf;" declared
      # only $ref and left $opf as a package global.
      my ($ref,$opf);
      my %visopf = ( "faligndata" => 0x048,
                     "for"        => 0x07c );

      $ref = "$mnemonic\t$rs1,$rs2,$rd";

      if ($opf=$visopf{$mnemonic}) {
          # The loop variable aliases $rs1/$rs2/$rd, so assigning to it
          # replaces each register name with its 5-bit field value.
          foreach my $reg ($rs1,$rs2,$rd) {
              return $ref if ($reg !~ /%f([0-9]{1,2})/);
              my $num=$1;
              if ($num>=32) {
                  return $ref if ($num&1);
                  # re-encode for upper double register addressing
                  $num=($num|$num>>5)&31;
              }
              $reg=$num;
          }

          return sprintf ".word\t0x%08x !%s",
                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                         $ref;
      } else {
          return $ref;
      }
  }
  # Encode "alignaddr rs1,rs2,rd" as a raw .word directive.
  #
  #   $mnemonic        instruction name (only used for the trailing comment)
  #   $rs1,$rs2,$rd    integer registers written as %gN, %oN, %lN or %iN
  #
  # Returns ".word\t0x........ !<original text>", or the original
  # instruction text unchanged if any operand is not such a register.
  sub unalignaddr {
      my ($mnemonic,$rs1,$rs2,$rd)=@_;
      my $ref="$mnemonic\t$rs1,$rs2,$rd";
      # Register number = bank base + index within the bank.
      my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
      my @regno;

      for my $operand ($rs1,$rs2,$rd) {
          return $ref unless ($operand =~ /%([goli])([0-7])/);
          push(@regno,$bias{$1}+$2);
      }

      my ($n1,$n2,$nd)=@regno;
      return sprintf ".word\t0x%08x !%s",
                     0x81b00300|$nd<<25|$n1<<14|$n2,
                     $ref;
  }
  # Post-process the accumulated template line by line and print it:
  #   1. evaluate every `...` expression (offsets, shift counts, and the
  #      conditional loads emitted by the SHA512 $Xload),
  #   2. replace three-operand %f instructions whose mnemonic starts with
  #      "f" by their .word encoding via unvis(),
  #   3. replace alignaddr by its .word encoding via unalignaddr().
  foreach (split("\n",$code)) {
      s/\`([^\`]*)\`/eval $1/ge;

      s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
          &unvis($1,$2,$3,$4)
       /ge;
      s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
          &unalignaddr($1,$2,$3,$4)
       /ge;

      print $_,"\n";
  }

  # Buffered write errors (disk full, broken pipe, ...) only surface when
  # the handle is closed, so a failed close must abort the build rather
  # than leave a truncated assembly file behind.
  close STDOUT or die "error closing STDOUT: $!";