# sha512-sparcv9.pl
#! /usr/bin/env perl
# Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#     in-order, i.e. load instruction has to complete prior next
#     instruction in given thread is executed, even if the latter is
#     not dependent on load result! This means that on T1 two 32-bit
#     loads are always slower than one 64-bit load. Once again this
#     is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#     2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
  54. # $output is the last argument if it looks like a file (it has an extension)
  55. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  56. $output and open STDOUT,">$output";
  57. if ($output =~ /512/) {
  58. $label="512";
  59. $SZ=8;
  60. $LD="ldx"; # load from memory
  61. $ST="stx"; # store to memory
  62. $SLL="sllx"; # shift left logical
  63. $SRL="srlx"; # shift right logical
  64. @Sigma0=(28,34,39);
  65. @Sigma1=(14,18,41);
  66. @sigma0=( 7, 1, 8); # right shift first
  67. @sigma1=( 6,19,61); # right shift first
  68. $lastK=0x817;
  69. $rounds=80;
  70. $align=4;
  71. $locals=16*$SZ; # X[16]
  72. $A="%o0";
  73. $B="%o1";
  74. $C="%o2";
  75. $D="%o3";
  76. $E="%o4";
  77. $F="%o5";
  78. $G="%g1";
  79. $H="%o7";
  80. @V=($A,$B,$C,$D,$E,$F,$G,$H);
  81. } else {
  82. $label="256";
  83. $SZ=4;
  84. $LD="ld"; # load from memory
  85. $ST="st"; # store to memory
  86. $SLL="sll"; # shift left logical
  87. $SRL="srl"; # shift right logical
  88. @Sigma0=( 2,13,22);
  89. @Sigma1=( 6,11,25);
  90. @sigma0=( 3, 7,18); # right shift first
  91. @sigma1=(10,17,19); # right shift first
  92. $lastK=0x8f2;
  93. $rounds=64;
  94. $align=8;
  95. $locals=0; # X[16] is register resident
  96. @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
  97. $A="%l0";
  98. $B="%l1";
  99. $C="%l2";
  100. $D="%l3";
  101. $E="%l4";
  102. $F="%l5";
  103. $G="%l6";
  104. $H="%l7";
  105. @V=($A,$B,$C,$D,$E,$F,$G,$H);
  106. }
  107. $T1="%g2";
  108. $tmp0="%g3";
  109. $tmp1="%g4";
  110. $tmp2="%g5";
  111. $ctx="%i0";
  112. $inp="%i1";
  113. $len="%i2";
  114. $Ktbl="%i3";
  115. $tmp31="%i4";
  116. $tmp32="%i5";
########### SHA256
# $Xload (SHA-256 flavour): emit code producing message word X[$i] for
# rounds 0..15.  The sixteen 32-bit words are packed two-per-register in
# the eight 64-bit registers @X[0..7].  On round 0 it also emits the bulk
# 64-byte block load, including the shift/or fix-up for a misaligned
# input pointer ($tmp31 = misalignment in bits).  The emitted code leaves
# $h + X[$i] in $T1 for BODY_00_15 to continue with.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

if ($i==0) {
$code.=<<___;
ldx [$inp+0],@X[0]
ldx [$inp+16],@X[2]
ldx [$inp+32],@X[4]
ldx [$inp+48],@X[6]
ldx [$inp+8],@X[1]
ldx [$inp+24],@X[3]
subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
ldx [$inp+40],@X[5]
bz,pt %icc,.Laligned
ldx [$inp+56],@X[7]
sllx @X[0],$tmp31,@X[0]
ldx [$inp+64],$T1
___
# Realign in registers: shift each 64-bit pair left by the misalignment
# and pull the missing low bits in from the top of its successor.
for($j=0;$j<7;$j++)
{ $code.=<<___;
srlx @X[$j+1],$tmp32,$tmp1
sllx @X[$j+1],$tmp31,@X[$j+1]
or $tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
srlx $T1,$tmp32,$T1
or $T1,@X[7],@X[7]
.Laligned:
___
}

# Even $i: X[$i] is the high 32 bits of @X[$i/2] (extracted with srlx);
# odd $i: the register is used as-is, its low 32 bits being X[$i]
# (Perl's $i/2 truncates when used as an array index).
if ($i&1) {
	$code.="\tadd @X[$i/2],$h,$T1\n";
} else {
	$code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
}
} if ($SZ==4);
########### SHA512
# $Xload (SHA-512 flavour): emit code producing 64-bit message word
# X[$i] for rounds 0..15.  Input is streamed through the 32-bit halves
# held in %l0-%l7; @pair names the even/odd %l pair holding X[$i] plus
# the following even %l register used for the misalignment carry.  The
# reassembled word is stored into the stack-resident X[16] ring buffer
# and $h + X[$i] is left in $T1.  At $i==15 the %l registers are
# reloaded from the stack in the layout the Xupdate code expects.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
ld [$inp+0],%l0
ld [$inp+4],%l1
ld [$inp+8],%l2
ld [$inp+12],%l3
ld [$inp+16],%l4
ld [$inp+20],%l5
ld [$inp+24],%l6
cmp $tmp31,0
ld [$inp+28],%l7
___
$code.=<<___ if ($i<15);
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
sllx @pair[0],$tmp0,$tmp1
`"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
or @pair[1],$tmp2,$tmp2
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
add $h,$tmp2,$T1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
# Annulled conditional skip: the delay-slot load past the block's end is
# executed only when the input was misaligned (condition set at $i==0).
$code.=<<___ if ($i==12);
bnz,a,pn %icc,.+8
ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
sllx @pair[0],$tmp0,$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
or @pair[1],$tmp2,$tmp2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
add $h,$tmp2,$T1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
########### common
# BODY_00_15: emit one compression round for round index $i.
# For $i<16 the message word is produced by the width-specific $Xload
# hook (which leaves $h+X[$i] in $T1); for $i>=16 the caller's Xupdate
# code has already left X[$i] in $T1, so only $h is folded in here.
# Emitted semantics: T1 += Ch(e,f,g) + Sigma1(e) + K[$i];
#                    d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c).
# Rotates are synthesized as paired $SRL/$SLL with complementary shift
# amounts, since no rotate instruction is used here.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

if ($i<16) {
	&$Xload(@_);
} else {
	$code.="\tadd $h,$T1,$T1\n";
}

$code.=<<___;
$SRL $e,@Sigma1[0],$h !! $i
xor $f,$g,$tmp2
$SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
and $e,$tmp2,$tmp2
$SRL $e,@Sigma1[1],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $e,@Sigma1[2],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
xor $tmp0,$h,$h
xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
xor $tmp1,$h,$tmp0 ! Sigma1(e)
$SRL $a,@Sigma0[0],$h
add $tmp2,$T1,$T1
$LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
$SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
add $tmp0,$T1,$T1
$SRL $a,@Sigma0[1],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)
or $a,$b,$tmp0
and $a,$b,$tmp1
and $c,$tmp0,$tmp0
or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
add $tmp2,$T1,$T1 ! +=K[$i]
add $tmp1,$h,$h
add $T1,$d,$d
add $T1,$h,$h
___
}
########### SHA256
# $BODY_16_XX (SHA-256 flavour): emit the message-schedule step
#   X[$i] = sigma0(X[$i+1]) + sigma1(X[$i+14]) + X[$i+9] + X[$i]
# operating on the packed two-words-per-register @X[] layout (even
# indices live in the high 32 bits, odd in the low 32 bits, so odd
# rounds first extract the half with srlx via $tmp32).  The new word is
# merged back into the correct half of its @X[] register, $h+X[$i] ends
# up in $T1, and the round itself is finished by BODY_00_15.
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

# Select X[$i+1]: high half needs extracting on odd rounds.
if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
} else {
	$xi=@X[(($i+1)/2)%8];
}
$code.=<<___;
srl $xi,@sigma0[0],$T1 !! Xupdate($i)
sll $xi,`32-@sigma0[2]`,$tmp1
srl $xi,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srl $xi,@sigma0[2],$tmp0
xor $tmp1,$T1,$T1
___
# Select X[$i+14]: the opposite half relative to X[$i+1].
if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
} else {
	$xi=$tmp32;
	$code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
}
$code.=<<___;
srl $xi,@sigma1[0],$tmp2
xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
sll $xi,`32-@sigma1[2]`,$tmp1
srl $xi,@sigma1[1],$tmp0
xor $tmp1,$tmp2,$tmp2
sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
xor $tmp0,$tmp2,$tmp2
srl $xi,@sigma1[2],$tmp0
xor $tmp1,$tmp2,$tmp2
___
# Fold in X[$i+9] and X[$i], then write the new X[$i] back into the
# low (odd $i) or high (even $i) half of its packed register.
if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
srl @X[($i/2)%8],0,$tmp0
add $tmp2,$tmp1,$tmp1
add $xi,$T1,$T1 ! +=X[i]
xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
add $tmp1,$T1,$T1
srl $T1,0,$T1
or $T1,@X[($i/2)%8],@X[($i/2)%8]
___
} else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
srlx @X[($i/2)%8],32,$tmp1 ! X[i]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
add $xi,$T1,$T1 ! +=X[i+9]
add $tmp2,$tmp1,$tmp1
srl @X[($i/2)%8],0,@X[($i/2)%8]
add $tmp1,$T1,$T1
sllx $T1,32,$tmp0
or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
}
&BODY_00_15(@_);
} if ($SZ==4);
########### SHA512
# $BODY_16_XX (SHA-512 flavour): emit the message-schedule step
#   X[$i] = sigma0(X[$i+1]) + sigma1(X[$i+14]) + X[$i+9] + X[$i]
# for the stack-resident X[16] ring buffer.  The 32-bit halves of
# X[$i+1], X[$i+14], X[$i+9] and X[$i] are staged through %l0-%l7 and
# merged into 64-bit values with sllx/or; loads for the NEXT round's
# halves are interleaved for latency hiding.  The new X[$i] is stored
# back to its stack slot, $h+X[$i] is left in $T1, and the round is
# finished by BODY_00_15.
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
sllx %l2,32,$tmp0 !! Xupdate($i)
or %l3,$tmp0,$tmp0
srlx $tmp0,@sigma0[0],$T1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx $tmp0,`64-@sigma0[2]`,$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
srlx $tmp0,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
xor $tmp1,$T1,$T1
sllx %l6,32,$tmp2
xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
or %l7,$tmp2,$tmp2
srlx $tmp2,@sigma1[0],$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
sllx $tmp2,`64-@sigma1[2]`,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
srlx $tmp2,@sigma1[1],$tmp2
xor $tmp0,$tmp1,$tmp1
sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
xor $tmp2,$tmp1,$tmp1
srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
xor $tmp0,$tmp1,$tmp1
sllx %l4,32,$tmp0
xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
or %l5,$tmp0,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
sllx %l0,32,$tmp2
add $tmp1,$T1,$T1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
or %l1,$tmp2,$tmp2
add $tmp0,$T1,$T1 ! +=X[$i+9]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
add $tmp2,$T1,$T1 ! +=X[$i]
$ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
&BODY_00_15(@_);
} if ($SZ==8);
# File prologue: assembler header, the round-constant table K${label},
# and the exported entry point with its runtime CPU-capability dispatch
# (falls through to the T4 hardware path, branches to .Lsoftware when
# the CFR_SHA${label} bit is absent).
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif

.section ".text",#alloc,#execinstr

.align 64
K${label}:
.type K${label},#object
___
if ($SZ==4) {
# SHA-256 round constants (FIPS 180-4), 64 x 32-bit words.
$code.=<<___;
.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
# SHA-512 round constants (FIPS 180-4), 80 x 64-bit words emitted as
# high/low 32-bit pairs.
$code.=<<___;
.long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl sha${label}_block_data_order
.align 32
sha${label}_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]

andcc %g1, CFR_SHA${label}, %g0
be .Lsoftware
nop
___
# SPARC T4 hardware SHA-512 path.  The digest is kept in %f0-%f14, the
# 128-byte message block in %f16-%f46, and the round function is the
# hand-encoded `.word 0x81b02860` (SHA512 opcode).  A separate loop
# handles 8-byte-misaligned input via alignaddr/faligndata, carrying the
# last doubleword over between iterations in %f18.
$code.=<<___ if ($SZ==8); # SHA512
ldd [%o0 + 0x00], %f0 ! load context
ldd [%o0 + 0x08], %f2
ldd [%o0 + 0x10], %f4
ldd [%o0 + 0x18], %f6
ldd [%o0 + 0x20], %f8
ldd [%o0 + 0x28], %f10
andcc %o1, 0x7, %g0
ldd [%o0 + 0x30], %f12
bne,pn %icc, .Lhwunaligned
ldd [%o0 + 0x38], %f14

.Lhwaligned_loop:
ldd [%o1 + 0x00], %f16
ldd [%o1 + 0x08], %f18
ldd [%o1 + 0x10], %f20
ldd [%o1 + 0x18], %f22
ldd [%o1 + 0x20], %f24
ldd [%o1 + 0x28], %f26
ldd [%o1 + 0x30], %f28
ldd [%o1 + 0x38], %f30
ldd [%o1 + 0x40], %f32
ldd [%o1 + 0x48], %f34
ldd [%o1 + 0x50], %f36
ldd [%o1 + 0x58], %f38
ldd [%o1 + 0x60], %f40
ldd [%o1 + 0x68], %f42
ldd [%o1 + 0x70], %f44
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x78], %f46
add %o1, 0x80, %o1
prefetch [%o1 + 63], 20
prefetch [%o1 + 64+63], 20

.word 0x81b02860 ! SHA512

bne,pt SIZE_T_CC, .Lhwaligned_loop
nop

.Lhwfinish:
std %f0, [%o0 + 0x00] ! store context
std %f2, [%o0 + 0x08]
std %f4, [%o0 + 0x10]
std %f6, [%o0 + 0x18]
std %f8, [%o0 + 0x20]
std %f10, [%o0 + 0x28]
std %f12, [%o0 + 0x30]
retl
std %f14, [%o0 + 0x38]

.align 16
.Lhwunaligned:
alignaddr %o1, %g0, %o1

ldd [%o1 + 0x00], %f18
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f20
ldd [%o1 + 0x10], %f22
ldd [%o1 + 0x18], %f24
ldd [%o1 + 0x20], %f26
ldd [%o1 + 0x28], %f28
ldd [%o1 + 0x30], %f30
ldd [%o1 + 0x38], %f32
ldd [%o1 + 0x40], %f34
ldd [%o1 + 0x48], %f36
ldd [%o1 + 0x50], %f38
ldd [%o1 + 0x58], %f40
ldd [%o1 + 0x60], %f42
ldd [%o1 + 0x68], %f44
ldd [%o1 + 0x70], %f46
ldd [%o1 + 0x78], %f48
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x80], %f50
add %o1, 0x80, %o1
prefetch [%o1 + 63], 20
prefetch [%o1 + 64+63], 20

faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
faligndata %f26, %f28, %f24
faligndata %f28, %f30, %f26
faligndata %f30, %f32, %f28
faligndata %f32, %f34, %f30
faligndata %f34, %f36, %f32
faligndata %f36, %f38, %f34
faligndata %f38, %f40, %f36
faligndata %f40, %f42, %f38
faligndata %f42, %f44, %f40
faligndata %f44, %f46, %f42
faligndata %f46, %f48, %f44
faligndata %f48, %f50, %f46

.word 0x81b02860 ! SHA512

bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f50, %f50, %f18 ! %f18=%f50

ba .Lhwfinish
nop
___
# SPARC T4 hardware SHA-256 path.  The digest is kept in %f0-%f7, the
# 64-byte message block in %f8-%f22, and the round function is the
# hand-encoded `.word 0x81b02840` (SHA256 opcode).  A separate loop
# handles 8-byte-misaligned input via alignaddr/faligndata, carrying the
# last doubleword over between iterations in %f10.
$code.=<<___ if ($SZ==4); # SHA256
ld [%o0 + 0x00], %f0
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
ld [%o0 + 0x0c], %f3
ld [%o0 + 0x10], %f4
ld [%o0 + 0x14], %f5
andcc %o1, 0x7, %g0
ld [%o0 + 0x18], %f6
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x1c], %f7

.Lhwloop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20

.word 0x81b02840 ! SHA256

bne,pt SIZE_T_CC, .Lhwloop
nop

.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
st %f4, [%o0 + 0x10]
st %f5, [%o0 + 0x14]
st %f6, [%o0 + 0x18]
retl
st %f7, [%o0 + 0x1c]

.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1

ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20

faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22

.word 0x81b02840 ! SHA256

bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26

ba .Lhwfinish
nop
___
# Software path.  Prologue computes the input misalignment in bits
# ($tmp31), rounds $inp down, converts the block count in $len into an
# end pointer, and locates the K table PC-relatively.  The main loop is
# fully unrolled 16 rounds at a time: 16 BODY_00_15 rounds, then a
# 16-round .L16_xx group repeated until the K[] word just loaded matches
# $lastK (low 12 bits), i.e. until all $rounds rounds are done.
$code.=<<___;
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
and $inp,`$align-1`,$tmp31
sllx $len,`log(16*$SZ)/log(2)`,$len
andn $inp,`$align-1`,$inp
sll $tmp31,3,$tmp31
add $inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
mov 32,$tmp32
sub $tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic: call .+8
add %o7,K${label}-.Lpic,$Ktbl

$LD [$ctx+`0*$SZ`],$A
$LD [$ctx+`1*$SZ`],$B
$LD [$ctx+`2*$SZ`],$C
$LD [$ctx+`3*$SZ`],$D
$LD [$ctx+`4*$SZ`],$E
$LD [$ctx+`5*$SZ`],$F
$LD [$ctx+`6*$SZ`],$G
$LD [$ctx+`7*$SZ`],$H

.Lloop:
___
# Emit the unrolled rounds; @V is rotated after each so that the same
# code template always sees the working variables in a..h order.
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
and $tmp2,0xfff,$tmp2
cmp $tmp2,$lastK
bne .L16_xx
add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
___
# Final update: add the working variables back into the context.
$code.=<<___ if ($SZ==4); # SHA256
$LD [$ctx+`0*$SZ`],@X[0]
$LD [$ctx+`1*$SZ`],@X[1]
$LD [$ctx+`2*$SZ`],@X[2]
$LD [$ctx+`3*$SZ`],@X[3]
$LD [$ctx+`4*$SZ`],@X[4]
$LD [$ctx+`5*$SZ`],@X[5]
$LD [$ctx+`6*$SZ`],@X[6]
$LD [$ctx+`7*$SZ`],@X[7]
add $A,@X[0],$A
$ST $A,[$ctx+`0*$SZ`]
add $B,@X[1],$B
$ST $B,[$ctx+`1*$SZ`]
add $C,@X[2],$C
$ST $C,[$ctx+`2*$SZ`]
add $D,@X[3],$D
$ST $D,[$ctx+`3*$SZ`]
add $E,@X[4],$E
$ST $E,[$ctx+`4*$SZ`]
add $F,@X[5],$F
$ST $F,[$ctx+`5*$SZ`]
add $G,@X[6],$G
$ST $G,[$ctx+`6*$SZ`]
add $H,@X[7],$H
$ST $H,[$ctx+`7*$SZ`]
___
# SHA-512 flavour reads the context as 32-bit halves (32-/64-bit ABI
# duality) and reassembles 64-bit values before adding.
$code.=<<___ if ($SZ==8); # SHA512
ld [$ctx+`0*$SZ+0`],%l0
ld [$ctx+`0*$SZ+4`],%l1
ld [$ctx+`1*$SZ+0`],%l2
ld [$ctx+`1*$SZ+4`],%l3
ld [$ctx+`2*$SZ+0`],%l4
ld [$ctx+`2*$SZ+4`],%l5
ld [$ctx+`3*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`3*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$A,$A
add $tmp1,$B,$B
$ST $A,[$ctx+`0*$SZ`]
sllx %l4,32,$tmp2
$ST $B,[$ctx+`1*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$C,$C
$ST $C,[$ctx+`2*$SZ`]
add $T1,$D,$D
$ST $D,[$ctx+`3*$SZ`]
ld [$ctx+`4*$SZ+0`],%l0
ld [$ctx+`4*$SZ+4`],%l1
ld [$ctx+`5*$SZ+0`],%l2
ld [$ctx+`5*$SZ+4`],%l3
ld [$ctx+`6*$SZ+0`],%l4
ld [$ctx+`6*$SZ+4`],%l5
ld [$ctx+`7*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`7*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$E,$E
add $tmp1,$F,$F
$ST $E,[$ctx+`4*$SZ`]
sllx %l4,32,$tmp2
$ST $F,[$ctx+`5*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$G,$G
$ST $G,[$ctx+`6*$SZ`]
add $T1,$H,$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___;
add $inp,`16*$SZ`,$inp ! advance inp
cmp $inp,$len
bne SIZE_T_CC,.Lloop
sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl

ret
restore
.type sha${label}_block_data_order,#function
.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
  735. # Purpose of these subroutines is to explicitly encode VIS instructions,
  736. # so that one can compile the module without having to specify VIS
  737. # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
  738. # Idea is to reserve for option to produce "universal" binary and let
  739. # programmer detect if current CPU is VIS capable at run-time.
  740. sub unvis {
  741. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  742. my $ref,$opf;
  743. my %visopf = ( "faligndata" => 0x048,
  744. "for" => 0x07c );
  745. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  746. if ($opf=$visopf{$mnemonic}) {
  747. foreach ($rs1,$rs2,$rd) {
  748. return $ref if (!/%f([0-9]{1,2})/);
  749. $_=$1;
  750. if ($1>=32) {
  751. return $ref if ($1&1);
  752. # re-encode for upper double register addressing
  753. $_=($1|$1>>5)&31;
  754. }
  755. }
  756. return sprintf ".word\t0x%08x !%s",
  757. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  758. $ref;
  759. } else {
  760. return $ref;
  761. }
  762. }
  763. sub unalignaddr {
  764. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  765. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  766. my $ref="$mnemonic\t$rs1,$rs2,$rd";
  767. foreach ($rs1,$rs2,$rd) {
  768. if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
  769. else { return $ref; }
  770. }
  771. return sprintf ".word\t0x%08x !%s",
  772. 0x81b00300|$rd<<25|$rs1<<14|$rs2,
  773. $ref;
  774. }
# Post-process and print the generated code line by line:
#  1. evaluate `...` escapes (constant folding done at generation time);
#  2. re-encode VIS float ops (faligndata/for) via unvis() so no VIS
#     assembler support is required;
#  3. likewise re-encode alignaddr over integer registers.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	/ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	/ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";