#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data in designated lanes of SIMD registers. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
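
# $avx ends up 0, 1 or 2 depending on the assembler/compiler capability
# probed above: 0 emits only the SSSE3 and SHA-NI code paths, 1 additionally
# emits the AVX path, and 2 also emits the 8-lane AVX2 path.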
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
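# For reference, a minimal caller-side sketch in C mirroring the prototype
# above (typedef and variable names here are illustrative, not something this
# file exports). Each state array A[8]..H[8] carries one 32-bit word per lane,
# i.e. lane k's chaining value sits in column k of every array; as used by the
# code below, num=1 covers up to 4 buffers and num=2 up to 8, and lanes whose
# block count is <=0 are skipped:
#
#	typedef struct { unsigned int A[8],B[8],C[8],D[8],
#			 E[8],F[8],G[8],H[8]; } SHA256_MB_CTX;
#	typedef struct { void *ptr; int blocks; } HASH_DESC;
#
#	SHA256_MB_CTX ctx;	/* each lane pre-loaded with the SHA-256 IV */
#	HASH_DESC inp[8];
#	inp[0].ptr = msg0; inp[0].blocks = msg0_len/64;	/* whole 64-byte blocks */
#	inp[1].ptr = msg1; inp[1].blocks = msg1_len/64;
#	for (int i = 2; i < 8; i++) { inp[i].ptr = NULL; inp[i].blocks = 0; }
#	sha256_multi_block(&ctx, inp, 1);	/* lanes 0 and 1 active */
#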
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;
    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
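
# For reference, the SHA-256 round functions that the Sigma1/Ch/Maj/sigma0/
# sigma1 comments below stand for, with ">>>" meaning 32-bit rotate right;
# since SSE/AVX2 have no packed rotate, every rotation below is synthesized
# from a shift pair (psrld/pslld or vpsrld/vpslld):
#
#	Sigma0(a)  = (a >>>  2) ^ (a >>> 13) ^ (a >>> 22)
#	Sigma1(e)  = (e >>>  6) ^ (e >>> 11) ^ (e >>> 25)
#	Ch(e,f,g)  = (e & f) ^ (~e & g)
#	Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c) = Ch(a^b, c, b)
#	sigma0(x)  = (x >>>  7) ^ (x >>> 18) ^ (x >>  3)
#	sigma1(x)  = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)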
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd `4*$i`(@ptr[0]),$Xi
	movd `4*$i`(@ptr[1]),$t1
	movd `4*$i`(@ptr[2]),$t2
	movd `4*$i`(@ptr[3]),$t3
	punpckldq $t2,$Xi
	punpckldq $t3,$t1
	punpckldq $t1,$Xi
___
$code.=<<___ if ($i==15);
	movd `4*$i`(@ptr[0]),$Xi
	lea `16*4`(@ptr[0]),@ptr[0]
	movd `4*$i`(@ptr[1]),$t1
	lea `16*4`(@ptr[1]),@ptr[1]
	movd `4*$i`(@ptr[2]),$t2
	lea `16*4`(@ptr[2]),@ptr[2]
	movd `4*$i`(@ptr[3]),$t3
	lea `16*4`(@ptr[3]),@ptr[3]
	punpckldq $t2,$Xi
	punpckldq $t3,$t1
	punpckldq $t1,$Xi
___
$code.=<<___;
	movdqa $e,$sigma
	`"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
	movdqa $e,$t3
	`"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
	psrld \$6,$sigma
	movdqa $e,$t2
	pslld \$7,$t3
	movdqa $Xi,`&Xi_off($i)`
	paddd $h,$Xi # Xi+=h
	psrld \$11,$t2
	pxor $t3,$sigma
	pslld \$21-7,$t3
	paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
	pxor $t2,$sigma
	psrld \$25-11,$t2
	movdqa $e,$t1
	`"prefetcht0 63(@ptr[0])" if ($i==15)`
	pxor $t3,$sigma
	movdqa $e,$axb # borrow $axb
	pslld \$26-21,$t3
	pandn $g,$t1
	pand $f,$axb
	pxor $t2,$sigma
	`"prefetcht0 63(@ptr[1])" if ($i==15)`
	movdqa $a,$t2
	pxor $t3,$sigma # Sigma1(e)
	movdqa $a,$t3
	psrld \$2,$t2
	paddd $sigma,$Xi # Xi+=Sigma1(e)
	pxor $axb,$t1 # Ch(e,f,g)
	movdqa $b,$axb
	movdqa $a,$sigma
	pslld \$10,$t3
	pxor $a,$axb # a^b, b^c in next round
	`"prefetcht0 63(@ptr[2])" if ($i==15)`
	psrld \$13,$sigma
	pxor $t3,$t2
	paddd $t1,$Xi # Xi+=Ch(e,f,g)
	pslld \$19-10,$t3
	pand $axb,$bxc
	pxor $sigma,$t2
	`"prefetcht0 63(@ptr[3])" if ($i==15)`
	psrld \$22-13,$sigma
	pxor $t3,$t2
	movdqa $b,$h
	pslld \$30-19,$t3
	pxor $t2,$sigma
	pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd $Xi,$d # d+=Xi
	pxor $t3,$sigma # Sigma0(a)
	paddd $Xi,$h # h+=Xi
	paddd $sigma,$h # h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea `32*8`($Tbl),$Tbl
___
($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa `&Xi_off($i+1)`,$Xn
	paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
	movdqa $Xn,$sigma
	movdqa $Xn,$t2
	psrld \$3,$sigma
	movdqa $Xn,$t3
	psrld \$7,$t2
	movdqa `&Xi_off($i+14)`,$t1
	pslld \$14,$t3
	pxor $t2,$sigma
	psrld \$18-7,$t2
	movdqa $t1,$axb # borrow $axb
	pxor $t3,$sigma
	pslld \$25-14,$t3
	pxor $t2,$sigma
	psrld \$10,$t1
	movdqa $axb,$t2
	psrld \$17,$axb
	pxor $t3,$sigma # sigma0(X[i+1])
	pslld \$13,$t2
	paddd $sigma,$Xi # Xi+=sigma0(X[i+1])
	pxor $axb,$t1
	psrld \$19-17,$axb
	pxor $t2,$t1
	pslld \$15-13,$t2
	pxor $axb,$t1
	pxor $t2,$t1 # sigma1(X[i+14])
	paddd $t1,$Xi # Xi+=sigma1(X[i+14])
___
&ROUND_00_15($i,@_);
($Xi,$Xn)=($Xn,$Xi);
}
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl sha256_multi_block
.type sha256_multi_block,\@function,3
.align 32
sha256_multi_block:
.cfi_startproc
	mov OPENSSL_ia32cap_P+4(%rip),%rcx
	bt \$61,%rcx # check SHA bit
	jc _shaext_shortcut
___
$code.=<<___ if ($avx);
	test \$`1<<28`,%ecx
	jnz _avx_shortcut
___
$code.=<<___;
	mov %rsp,%rax
.cfi_def_cfa_register %rax
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,(%rsp)
	movaps %xmm7,0x10(%rsp)
	movaps %xmm8,0x20(%rsp)
	movaps %xmm9,0x30(%rsp)
	movaps %xmm10,-0x78(%rax)
	movaps %xmm11,-0x68(%rax)
	movaps %xmm12,-0x58(%rax)
	movaps %xmm13,-0x48(%rax)
	movaps %xmm14,-0x38(%rax)
	movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
	sub \$`$REG_SZ*18`, %rsp
	and \$-256,%rsp
	mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea K256+128(%rip),$Tbl
	lea `$REG_SZ*16`(%rsp),%rbx
	lea 0x80($ctx),$ctx # size optimization
.Loop_grande:
	mov $num,`$REG_SZ*17+8`(%rsp) # original $num
	xor $num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov `16*$i+0`($inp),@ptr[$i] # input pointer
	mov `16*$i+8`($inp),%ecx # number of blocks
	cmp $num,%ecx
	cmovg %ecx,$num # find maximum
	test %ecx,%ecx
	mov %ecx,`4*$i`(%rbx) # initialize counters
	cmovle $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
	test $num,$num
	jz .Ldone
	movdqu 0x00-0x80($ctx),$A # load context
	lea 128(%rsp),%rax
	movdqu 0x20-0x80($ctx),$B
	movdqu 0x40-0x80($ctx),$C
	movdqu 0x60-0x80($ctx),$D
	movdqu 0x80-0x80($ctx),$E
	movdqu 0xa0-0x80($ctx),$F
	movdqu 0xc0-0x80($ctx),$G
	movdqu 0xe0-0x80($ctx),$H
	movdqu .Lpbswap(%rip),$Xn
	jmp .Loop
.align 32
.Loop:
	movdqa $C,$bxc
	pxor $B,$bxc # magic seed
___
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu `&Xi_off($i)`,$Xi
	mov \$3,%ecx
	jmp .Loop_16_xx
.align 32
.Loop_16_xx:
___
for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec %ecx
	jnz .Loop_16_xx
	mov \$1,%ecx
	lea K256+128(%rip),$Tbl
	movdqa (%rbx),$sigma # pull counters
	cmp 4*0(%rbx),%ecx # examine counters
	pxor $t1,$t1
	cmovge $Tbl,@ptr[0] # cancel input
	cmp 4*1(%rbx),%ecx
	movdqa $sigma,$Xn
	cmovge $Tbl,@ptr[1]
	cmp 4*2(%rbx),%ecx
	pcmpgtd $t1,$Xn # mask value
	cmovge $Tbl,@ptr[2]
	cmp 4*3(%rbx),%ecx
	paddd $Xn,$sigma # counters--
	cmovge $Tbl,@ptr[3]
	movdqu 0x00-0x80($ctx),$t1
	pand $Xn,$A
	movdqu 0x20-0x80($ctx),$t2
	pand $Xn,$B
	movdqu 0x40-0x80($ctx),$t3
	pand $Xn,$C
	movdqu 0x60-0x80($ctx),$Xi
	pand $Xn,$D
	paddd $t1,$A
	movdqu 0x80-0x80($ctx),$t1
	pand $Xn,$E
	paddd $t2,$B
	movdqu 0xa0-0x80($ctx),$t2
	pand $Xn,$F
	paddd $t3,$C
	movdqu 0xc0-0x80($ctx),$t3
	pand $Xn,$G
	paddd $Xi,$D
	movdqu 0xe0-0x80($ctx),$Xi
	pand $Xn,$H
	paddd $t1,$E
	paddd $t2,$F
	movdqu $A,0x00-0x80($ctx)
	paddd $t3,$G
	movdqu $B,0x20-0x80($ctx)
	paddd $Xi,$H
	movdqu $C,0x40-0x80($ctx)
	movdqu $D,0x60-0x80($ctx)
	movdqu $E,0x80-0x80($ctx)
	movdqu $F,0xa0-0x80($ctx)
	movdqu $G,0xc0-0x80($ctx)
	movdqu $H,0xe0-0x80($ctx)
	movdqa $sigma,(%rbx) # save counters
	movdqa .Lpbswap(%rip),$Xn
	dec $num
	jnz .Loop
	mov `$REG_SZ*17+8`(%rsp),$num
	lea $REG_SZ($ctx),$ctx
	lea `16*$REG_SZ/4`($inp),$inp
	dec $num
	jnz .Loop_grande
.Ldone:
	mov `$REG_SZ*17`(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
	movaps -0xb8(%rax),%xmm6
	movaps -0xa8(%rax),%xmm7
	movaps -0x98(%rax),%xmm8
	movaps -0x88(%rax),%xmm9
	movaps -0x78(%rax),%xmm10
	movaps -0x68(%rax),%xmm11
	movaps -0x58(%rax),%xmm12
	movaps -0x48(%rax),%xmm13
	movaps -0x38(%rax),%xmm14
	movaps -0x28(%rax),%xmm15
___
$code.=<<___;
	mov -16(%rax),%rbp
.cfi_restore %rbp
	mov -8(%rax),%rbx
.cfi_restore %rbx
	lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue:
	ret
.cfi_endproc
.size sha256_multi_block,.-sha256_multi_block
___
{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type sha256_multi_block_shaext,\@function,3
.align 32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov %rsp,%rax
.cfi_def_cfa_register %rax
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,(%rsp)
	movaps %xmm7,0x10(%rsp)
	movaps %xmm8,0x20(%rsp)
	movaps %xmm9,0x30(%rsp)
	movaps %xmm10,-0x78(%rax)
	movaps %xmm11,-0x68(%rax)
	movaps %xmm12,-0x58(%rax)
	movaps %xmm13,-0x48(%rax)
	movaps %xmm14,-0x38(%rax)
	movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
	sub \$`$REG_SZ*18`,%rsp
	shl \$1,$num # we process pair at a time
	and \$-256,%rsp
	lea 0x80($ctx),$ctx # size optimization
	mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_shaext:
	lea `$REG_SZ*16`(%rsp),%rbx
	lea K256_shaext+0x80(%rip),$Tbl
.Loop_grande_shaext:
	mov $num,`$REG_SZ*17+8`(%rsp) # original $num
	xor $num,$num
___
for($i=0;$i<2;$i++) {
$code.=<<___;
	mov `16*$i+0`($inp),@ptr[$i] # input pointer
	mov `16*$i+8`($inp),%ecx # number of blocks
	cmp $num,%ecx
	cmovg %ecx,$num # find maximum
	test %ecx,%ecx
	mov %ecx,`4*$i`(%rbx) # initialize counters
	cmovle %rsp,@ptr[$i] # cancel input
___
}
$code.=<<___;
	test $num,$num
	jz .Ldone_shaext
	movq 0x00-0x80($ctx),$ABEF0 # A1.A0
	movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
	movq 0x40-0x80($ctx),$CDGH0 # C1.C0
	movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
	movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
	movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
	movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
	movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
	punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
	punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
	punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
	punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
	movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
	movdqa $ABEF0,$ABEF1
	movdqa $CDGH0,$CDGH1
	punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
	punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
	punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
	punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
	pshufd \$0b00011011,$ABEF0,$ABEF0
	pshufd \$0b00011011,$CDGH0,$CDGH0
	pshufd \$0b00011011,$ABEF1,$ABEF1
	pshufd \$0b00011011,$CDGH1,$CDGH1
	jmp .Loop_shaext
.align 32
.Loop_shaext:
	movdqu 0x00(@ptr[0]),@MSG0[0]
	movdqu 0x00(@ptr[1]),@MSG1[0]
	movdqu 0x10(@ptr[0]),@MSG0[1]
	movdqu 0x10(@ptr[1]),@MSG1[1]
	movdqu 0x20(@ptr[0]),@MSG0[2]
	pshufb $TMPx,@MSG0[0]
	movdqu 0x20(@ptr[1]),@MSG1[2]
	pshufb $TMPx,@MSG1[0]
	movdqu 0x30(@ptr[0]),@MSG0[3]
	lea 0x40(@ptr[0]),@ptr[0]
	movdqu 0x30(@ptr[1]),@MSG1[3]
	lea 0x40(@ptr[1]),@ptr[1]
	movdqa 0*16-0x80($Tbl),$Wi
	pshufb $TMPx,@MSG0[1]
	paddd @MSG0[0],$Wi
	pxor $ABEF0,@MSG0[0] # black magic
	movdqa $Wi,$TMP0
	movdqa 0*16-0x80($Tbl),$TMP1
	pshufb $TMPx,@MSG1[1]
	paddd @MSG1[0],$TMP1
	movdqa $CDGH0,0x50(%rsp) # offload
	sha256rnds2 $ABEF0,$CDGH0 # 0-3
	pxor $ABEF1,@MSG1[0] # black magic
	movdqa $TMP1,$Wi
	movdqa $CDGH1,0x70(%rsp)
	sha256rnds2 $ABEF1,$CDGH1 # 0-3
	pshufd \$0x0e,$TMP0,$Wi
	pxor $ABEF0,@MSG0[0] # black magic
	movdqa $ABEF0,0x40(%rsp) # offload
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	pxor $ABEF1,@MSG1[0] # black magic
	movdqa $ABEF1,0x60(%rsp)
	movdqa 1*16-0x80($Tbl),$TMP0
	paddd @MSG0[1],$TMP0
	pshufb $TMPx,@MSG0[2]
	sha256rnds2 $CDGH1,$ABEF1
	movdqa $TMP0,$Wi
	movdqa 1*16-0x80($Tbl),$TMP1
	paddd @MSG1[1],$TMP1
	sha256rnds2 $ABEF0,$CDGH0 # 4-7
	movdqa $TMP1,$Wi
	prefetcht0 127(@ptr[0])
	pshufb $TMPx,@MSG0[3]
	pshufb $TMPx,@MSG1[2]
	prefetcht0 127(@ptr[1])
	sha256rnds2 $ABEF1,$CDGH1 # 4-7
	pshufd \$0x0e,$TMP0,$Wi
	pshufb $TMPx,@MSG1[3]
	sha256msg1 @MSG0[1],@MSG0[0]
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	movdqa 2*16-0x80($Tbl),$TMP0
	paddd @MSG0[2],$TMP0
	sha256rnds2 $CDGH1,$ABEF1
	movdqa $TMP0,$Wi
	movdqa 2*16-0x80($Tbl),$TMP1
	paddd @MSG1[2],$TMP1
	sha256rnds2 $ABEF0,$CDGH0 # 8-11
	sha256msg1 @MSG1[1],@MSG1[0]
	movdqa $TMP1,$Wi
	movdqa @MSG0[3],$TMPx
	sha256rnds2 $ABEF1,$CDGH1 # 8-11
	pshufd \$0x0e,$TMP0,$Wi
	palignr \$4,@MSG0[2],$TMPx
	paddd $TMPx,@MSG0[0]
	movdqa @MSG1[3],$TMPx
	palignr \$4,@MSG1[2],$TMPx
	sha256msg1 @MSG0[2],@MSG0[1]
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	movdqa 3*16-0x80($Tbl),$TMP0
	paddd @MSG0[3],$TMP0
	sha256rnds2 $CDGH1,$ABEF1
	sha256msg1 @MSG1[2],@MSG1[1]
	movdqa $TMP0,$Wi
	movdqa 3*16-0x80($Tbl),$TMP1
	paddd $TMPx,@MSG1[0]
	paddd @MSG1[3],$TMP1
	sha256msg2 @MSG0[3],@MSG0[0]
	sha256rnds2 $ABEF0,$CDGH0 # 12-15
	movdqa $TMP1,$Wi
	movdqa @MSG0[0],$TMPx
	palignr \$4,@MSG0[3],$TMPx
	sha256rnds2 $ABEF1,$CDGH1 # 12-15
	sha256msg2 @MSG1[3],@MSG1[0]
	pshufd \$0x0e,$TMP0,$Wi
	paddd $TMPx,@MSG0[1]
	movdqa @MSG1[0],$TMPx
	palignr \$4,@MSG1[3],$TMPx
	sha256msg1 @MSG0[3],@MSG0[2]
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	movdqa 4*16-0x80($Tbl),$TMP0
	paddd @MSG0[0],$TMP0
	sha256rnds2 $CDGH1,$ABEF1
	sha256msg1 @MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa $TMP0,$Wi
	movdqa $i*16-0x80($Tbl),$TMP1
	paddd $TMPx,@MSG1[1]
	paddd @MSG1[0],$TMP1
	sha256msg2 @MSG0[0],@MSG0[1]
	sha256rnds2 $ABEF0,$CDGH0 # 16-19...
	movdqa $TMP1,$Wi
	movdqa @MSG0[1],$TMPx
	palignr \$4,@MSG0[0],$TMPx
	sha256rnds2 $ABEF1,$CDGH1 # 16-19...
	sha256msg2 @MSG1[0],@MSG1[1]
	pshufd \$0x0e,$TMP0,$Wi
	paddd $TMPx,@MSG0[2]
	movdqa @MSG1[1],$TMPx
	palignr \$4,@MSG1[0],$TMPx
	sha256msg1 @MSG0[0],@MSG0[3]
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	movdqa `($i+1)*16`-0x80($Tbl),$TMP0
	paddd @MSG0[1],$TMP0
	sha256rnds2 $CDGH1,$ABEF1
	sha256msg1 @MSG1[0],@MSG1[3]
___
push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa $TMP0,$Wi
	movdqa 13*16-0x80($Tbl),$TMP1
	paddd $TMPx,@MSG1[1]
	paddd @MSG1[0],$TMP1
	sha256msg2 @MSG0[0],@MSG0[1]
	sha256rnds2 $ABEF0,$CDGH0 # 52-55
	movdqa $TMP1,$Wi
	movdqa @MSG0[1],$TMPx
	palignr \$4,@MSG0[0],$TMPx
	sha256rnds2 $ABEF1,$CDGH1 # 52-55
	sha256msg2 @MSG1[0],@MSG1[1]
	pshufd \$0x0e,$TMP0,$Wi
	paddd $TMPx,@MSG0[2]
	movdqa @MSG1[1],$TMPx
	palignr \$4,@MSG1[0],$TMPx
	nop
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	movdqa 14*16-0x80($Tbl),$TMP0
	paddd @MSG0[1],$TMP0
	sha256rnds2 $CDGH1,$ABEF1
	movdqa $TMP0,$Wi
	movdqa 14*16-0x80($Tbl),$TMP1
	paddd $TMPx,@MSG1[2]
	paddd @MSG1[1],$TMP1
	sha256msg2 @MSG0[1],@MSG0[2]
	nop
	sha256rnds2 $ABEF0,$CDGH0 # 56-59
	movdqa $TMP1,$Wi
	mov \$1,%ecx
	pxor @MSG0[1],@MSG0[1] # zero
	sha256rnds2 $ABEF1,$CDGH1 # 56-59
	sha256msg2 @MSG1[1],@MSG1[2]
	pshufd \$0x0e,$TMP0,$Wi
	movdqa 15*16-0x80($Tbl),$TMP0
	paddd @MSG0[2],$TMP0
	movq (%rbx),@MSG0[2] # pull counters
	nop
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	movdqa 15*16-0x80($Tbl),$TMP1
	paddd @MSG1[2],$TMP1
	sha256rnds2 $CDGH1,$ABEF1
	movdqa $TMP0,$Wi
	cmp 4*0(%rbx),%ecx # examine counters
	cmovge %rsp,@ptr[0] # cancel input
	cmp 4*1(%rbx),%ecx
	cmovge %rsp,@ptr[1]
	pshufd \$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2 $ABEF0,$CDGH0 # 60-63
	movdqa $TMP1,$Wi
	pshufd \$0x55,@MSG0[2],@MSG1[1]
	movdqa @MSG0[2],@MSG1[2]
	sha256rnds2 $ABEF1,$CDGH1 # 60-63
	pshufd \$0x0e,$TMP0,$Wi
	pcmpgtd @MSG0[1],@MSG1[0]
	pcmpgtd @MSG0[1],@MSG1[1]
	sha256rnds2 $CDGH0,$ABEF0
	pshufd \$0x0e,$TMP1,$Wi
	pcmpgtd @MSG0[1],@MSG1[2] # counter mask
	movdqa K256_shaext-0x10(%rip),$TMPx
	sha256rnds2 $CDGH1,$ABEF1
	pand @MSG1[0],$CDGH0
	pand @MSG1[1],$CDGH1
	pand @MSG1[0],$ABEF0
	pand @MSG1[1],$ABEF1
	paddd @MSG0[2],@MSG1[2] # counters--
	paddd 0x50(%rsp),$CDGH0
	paddd 0x70(%rsp),$CDGH1
	paddd 0x40(%rsp),$ABEF0
	paddd 0x60(%rsp),$ABEF1
	movq @MSG1[2],(%rbx) # save counters
	dec $num
	jnz .Loop_shaext
	mov `$REG_SZ*17+8`(%rsp),$num
	pshufd \$0b00011011,$ABEF0,$ABEF0
	pshufd \$0b00011011,$CDGH0,$CDGH0
	pshufd \$0b00011011,$ABEF1,$ABEF1
	pshufd \$0b00011011,$CDGH1,$CDGH1
	movdqa $ABEF0,@MSG0[0]
	movdqa $CDGH0,@MSG0[1]
	punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
	punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
	punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
	punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
	movq $ABEF0,0x00-0x80($ctx) # A1.A0
	psrldq \$8,$ABEF0
	movq @MSG0[0],0x80-0x80($ctx) # E1.E0
	psrldq \$8,@MSG0[0]
	movq $ABEF0,0x20-0x80($ctx) # B1.B0
	movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
	movq $CDGH0,0x40-0x80($ctx) # C1.C0
	psrldq \$8,$CDGH0
	movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
	psrldq \$8,@MSG0[1]
	movq $CDGH0,0x60-0x80($ctx) # D1.D0
	movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
	lea `$REG_SZ/2`($ctx),$ctx
	lea `16*2`($inp),$inp
	dec $num
	jnz .Loop_grande_shaext
.Ldone_shaext:
	#mov `$REG_SZ*17`(%rsp),%rax # original %rsp
___
$code.=<<___ if ($win64);
	movaps -0xb8(%rax),%xmm6
	movaps -0xa8(%rax),%xmm7
	movaps -0x98(%rax),%xmm8
	movaps -0x88(%rax),%xmm9
	movaps -0x78(%rax),%xmm10
	movaps -0x68(%rax),%xmm11
	movaps -0x58(%rax),%xmm12
	movaps -0x48(%rax),%xmm13
	movaps -0x38(%rax),%xmm14
	movaps -0x28(%rax),%xmm15
___
$code.=<<___;
	mov -16(%rax),%rbp
.cfi_restore %rbp
	mov -8(%rax),%rbx
.cfi_restore %rbx
	lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
}}}
if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd `4*$i`(@ptr[0]),$Xi
	vmovd `4*$i`(@ptr[1]),$t1
	vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq $t1,$Xi,$Xi
	vpshufb $Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd `4*$i`(@ptr[0]),$Xi
	lea `16*4`(@ptr[0]),@ptr[0]
	vmovd `4*$i`(@ptr[1]),$t1
	lea `16*4`(@ptr[1]),@ptr[1]
	vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea `16*4`(@ptr[2]),@ptr[2]
	vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
	lea `16*4`(@ptr[3]),@ptr[3]
	vpunpckldq $t1,$Xi,$Xi
	vpshufb $Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd `4*$i`(@ptr[0]),$Xi
	vmovd `4*$i`(@ptr[4]),$t1
	vmovd `4*$i`(@ptr[1]),$t2
	vmovd `4*$i`(@ptr[5]),$t3
	vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq $t2,$Xi,$Xi
	vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq $t3,$t1,$t1
	vinserti128 $t1,$Xi,$Xi
	vpshufb $Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd `4*$i`(@ptr[0]),$Xi
	lea `16*4`(@ptr[0]),@ptr[0]
	vmovd `4*$i`(@ptr[4]),$t1
	lea `16*4`(@ptr[4]),@ptr[4]
	vmovd `4*$i`(@ptr[1]),$t2
	lea `16*4`(@ptr[1]),@ptr[1]
	vmovd `4*$i`(@ptr[5]),$t3
	lea `16*4`(@ptr[5]),@ptr[5]
	vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea `16*4`(@ptr[2]),@ptr[2]
	vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
	lea `16*4`(@ptr[6]),@ptr[6]
	vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
	lea `16*4`(@ptr[3]),@ptr[3]
	vpunpckldq $t2,$Xi,$Xi
	vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
	lea `16*4`(@ptr[7]),@ptr[7]
	vpunpckldq $t3,$t1,$t1
	vinserti128 $t1,$Xi,$Xi
	vpshufb $Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld \$6,$e,$sigma
	vpslld \$26,$e,$t3
	vmovdqu $Xi,`&Xi_off($i)`
	vpaddd $h,$Xi,$Xi # Xi+=h
	vpsrld \$11,$e,$t2
	vpxor $t3,$sigma,$sigma
	vpslld \$21,$e,$t3
	vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
	vpxor $t2,$sigma,$sigma
	vpsrld \$25,$e,$t2
	vpxor $t3,$sigma,$sigma
	`"prefetcht0 63(@ptr[0])" if ($i==15)`
	vpslld \$7,$e,$t3
	vpandn $g,$e,$t1
	vpand $f,$e,$axb # borrow $axb
	`"prefetcht0 63(@ptr[1])" if ($i==15)`
	vpxor $t2,$sigma,$sigma
	vpsrld \$2,$a,$h # borrow $h
	vpxor $t3,$sigma,$sigma # Sigma1(e)
	`"prefetcht0 63(@ptr[2])" if ($i==15)`
	vpslld \$30,$a,$t2
	vpxor $axb,$t1,$t1 # Ch(e,f,g)
	vpxor $a,$b,$axb # a^b, b^c in next round
	`"prefetcht0 63(@ptr[3])" if ($i==15)`
	vpxor $t2,$h,$h
	vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
	vpsrld \$13,$a,$t2
	`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
	vpslld \$19,$a,$t3
	vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
	vpand $axb,$bxc,$bxc
	`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
	vpxor $t2,$h,$sigma
	vpsrld \$22,$a,$t2
	vpxor $t3,$sigma,$sigma
	`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
	vpslld \$10,$a,$t3
	vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd $Xi,$d,$d # d+=Xi
	`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
	vpxor $t2,$sigma,$sigma
	vpxor $t3,$sigma,$sigma # Sigma0(a)
	vpaddd $Xi,$h,$h # h+=Xi
	vpaddd $sigma,$h,$h # h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add \$`32*8`,$Tbl
___
($axb,$bxc)=($bxc,$axb);
}
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu `&Xi_off($i+1)`,$Xn
	vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
	vpsrld \$3,$Xn,$sigma
	vpsrld \$7,$Xn,$t2
	vpslld \$25,$Xn,$t3
	vpxor $t2,$sigma,$sigma
	vpsrld \$18,$Xn,$t2
	vpxor $t3,$sigma,$sigma
	vpslld \$14,$Xn,$t3
	vmovdqu `&Xi_off($i+14)`,$t1
	vpsrld \$10,$t1,$axb # borrow $axb
	vpxor $t2,$sigma,$sigma
	vpsrld \$17,$t1,$t2
	vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
	vpslld \$15,$t1,$t3
	vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(X[i+1])
	vpxor $t2,$axb,$sigma
	vpsrld \$19,$t1,$t2
	vpxor $t3,$sigma,$sigma
	vpslld \$13,$t1,$t3
	vpxor $t2,$sigma,$sigma
	vpxor $t3,$sigma,$sigma # sigma1(X[i+14])
	vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
___
&ROUND_00_15_avx($i,@_);
($Xi,$Xn)=($Xn,$Xi);
}
$code.=<<___;
.type sha256_multi_block_avx,\@function,3
.align 32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr \$32,%rcx
	cmp \$2,$num
	jb .Lavx
	test \$`1<<5`,%ecx
	jnz _avx2_shortcut
	jmp .Lavx
.align 32
.Lavx:
___
$code.=<<___;
	mov %rsp,%rax
.cfi_def_cfa_register %rax
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,(%rsp)
	movaps %xmm7,0x10(%rsp)
	movaps %xmm8,0x20(%rsp)
	movaps %xmm9,0x30(%rsp)
	movaps %xmm10,-0x78(%rax)
	movaps %xmm11,-0x68(%rax)
	movaps %xmm12,-0x58(%rax)
	movaps %xmm13,-0x48(%rax)
	movaps %xmm14,-0x38(%rax)
	movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
	sub \$`$REG_SZ*18`, %rsp
	and \$-256,%rsp
	mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea K256+128(%rip),$Tbl
	lea `$REG_SZ*16`(%rsp),%rbx
	lea 0x80($ctx),$ctx # size optimization
.Loop_grande_avx:
	mov $num,`$REG_SZ*17+8`(%rsp) # original $num
	xor $num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov `16*$i+0`($inp),@ptr[$i] # input pointer
	mov `16*$i+8`($inp),%ecx # number of blocks
	cmp $num,%ecx
	cmovg %ecx,$num # find maximum
	test %ecx,%ecx
	mov %ecx,`4*$i`(%rbx) # initialize counters
	cmovle $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
	test $num,$num
	jz .Ldone_avx
	vmovdqu 0x00-0x80($ctx),$A # load context
	lea 128(%rsp),%rax
	vmovdqu 0x20-0x80($ctx),$B
	vmovdqu 0x40-0x80($ctx),$C
	vmovdqu 0x60-0x80($ctx),$D
	vmovdqu 0x80-0x80($ctx),$E
	vmovdqu 0xa0-0x80($ctx),$F
	vmovdqu 0xc0-0x80($ctx),$G
	vmovdqu 0xe0-0x80($ctx),$H
	vmovdqu .Lpbswap(%rip),$Xn
	jmp .Loop_avx
.align 32
.Loop_avx:
	vpxor $B,$C,$bxc # magic seed
___
for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu `&Xi_off($i)`,$Xi
	mov \$3,%ecx
	jmp .Loop_16_xx_avx
.align 32
.Loop_16_xx_avx:
___
for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec %ecx
	jnz .Loop_16_xx_avx
	mov \$1,%ecx
	lea K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	cmp `4*$i`(%rbx),%ecx # examine counters
	cmovge $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
	vmovdqa (%rbx),$sigma # pull counters
	vpxor $t1,$t1,$t1
	vmovdqa $sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn # mask value
	vpaddd $Xn,$sigma,$sigma # counters--
	vmovdqu 0x00-0x80($ctx),$t1
	vpand $Xn,$A,$A
	vmovdqu 0x20-0x80($ctx),$t2
	vpand $Xn,$B,$B
	vmovdqu 0x40-0x80($ctx),$t3
	vpand $Xn,$C,$C
	vmovdqu 0x60-0x80($ctx),$Xi
	vpand $Xn,$D,$D
	vpaddd $t1,$A,$A
	vmovdqu 0x80-0x80($ctx),$t1
	vpand $Xn,$E,$E
	vpaddd $t2,$B,$B
	vmovdqu 0xa0-0x80($ctx),$t2
	vpand $Xn,$F,$F
	vpaddd $t3,$C,$C
	vmovdqu 0xc0-0x80($ctx),$t3
	vpand $Xn,$G,$G
	vpaddd $Xi,$D,$D
	vmovdqu 0xe0-0x80($ctx),$Xi
	vpand $Xn,$H,$H
	vpaddd $t1,$E,$E
	vpaddd $t2,$F,$F
	vmovdqu $A,0x00-0x80($ctx)
	vpaddd $t3,$G,$G
	vmovdqu $B,0x20-0x80($ctx)
	vpaddd $Xi,$H,$H
	vmovdqu $C,0x40-0x80($ctx)
	vmovdqu $D,0x60-0x80($ctx)
	vmovdqu $E,0x80-0x80($ctx)
	vmovdqu $F,0xa0-0x80($ctx)
	vmovdqu $G,0xc0-0x80($ctx)
	vmovdqu $H,0xe0-0x80($ctx)
	vmovdqu $sigma,(%rbx) # save counters
	vmovdqu .Lpbswap(%rip),$Xn
	dec $num
	jnz .Loop_avx
	mov `$REG_SZ*17+8`(%rsp),$num
	lea $REG_SZ($ctx),$ctx
	lea `16*$REG_SZ/4`($inp),$inp
	dec $num
	jnz .Loop_grande_avx
.Ldone_avx:
	mov `$REG_SZ*17`(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps -0xb8(%rax),%xmm6
	movaps -0xa8(%rax),%xmm7
	movaps -0x98(%rax),%xmm8
	movaps -0x88(%rax),%xmm9
	movaps -0x78(%rax),%xmm10
	movaps -0x68(%rax),%xmm11
	movaps -0x58(%rax),%xmm12
	movaps -0x48(%rax),%xmm13
	movaps -0x38(%rax),%xmm14
	movaps -0x28(%rax),%xmm15
___
$code.=<<___;
	mov -16(%rax),%rbp
.cfi_restore %rbp
	mov -8(%rax),%rbx
.cfi_restore %rbx
	lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size sha256_multi_block_avx,.-sha256_multi_block_avx
___
if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type sha256_multi_block_avx2,\@function,3
.align 32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov %rsp,%rax
.cfi_def_cfa_register %rax
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,(%rsp)
	movaps %xmm7,0x10(%rsp)
	movaps %xmm8,0x20(%rsp)
	movaps %xmm9,0x30(%rsp)
	movaps %xmm10,0x40(%rsp)
	movaps %xmm11,0x50(%rsp)
	movaps %xmm12,-0x78(%rax)
	movaps %xmm13,-0x68(%rax)
	movaps %xmm14,-0x58(%rax)
	movaps %xmm15,-0x48(%rax)
___
$code.=<<___;
	sub \$`$REG_SZ*18`, %rsp
	and \$-256,%rsp
	mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea K256+128(%rip),$Tbl
	lea 0x80($ctx),$ctx # size optimization
.Loop_grande_avx2:
	mov $num,`$REG_SZ*17+8`(%rsp) # original $num
	xor $num,$num
	lea `$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
$code.=<<___;
	mov `16*$i+0`($inp),@ptr[$i] # input pointer
	mov `16*$i+8`($inp),%ecx # number of blocks
	cmp $num,%ecx
	cmovg %ecx,$num # find maximum
	test %ecx,%ecx
	mov %ecx,`4*$i`(%rbx) # initialize counters
	cmovle $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
	vmovdqu 0x00-0x80($ctx),$A # load context
	lea 128(%rsp),%rax
	vmovdqu 0x20-0x80($ctx),$B
	lea 256+128(%rsp),%rbx
	vmovdqu 0x40-0x80($ctx),$C
	vmovdqu 0x60-0x80($ctx),$D
	vmovdqu 0x80-0x80($ctx),$E
	vmovdqu 0xa0-0x80($ctx),$F
	vmovdqu 0xc0-0x80($ctx),$G
	vmovdqu 0xe0-0x80($ctx),$H
	vmovdqu .Lpbswap(%rip),$Xn
	jmp .Loop_avx2
.align 32
.Loop_avx2:
	vpxor $B,$C,$bxc # magic seed
___
for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu `&Xi_off($i)`,$Xi
	mov \$3,%ecx
	jmp .Loop_16_xx_avx2
.align 32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec %ecx
	jnz .Loop_16_xx_avx2
	mov \$1,%ecx
	lea `$REG_SZ*16`(%rsp),%rbx
	lea K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
$code.=<<___;
	cmp `4*$i`(%rbx),%ecx # examine counters
	cmovge $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
	vmovdqa (%rbx),$sigma # pull counters
	vpxor $t1,$t1,$t1
	vmovdqa $sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn # mask value
	vpaddd $Xn,$sigma,$sigma # counters--
	vmovdqu 0x00-0x80($ctx),$t1
	vpand $Xn,$A,$A
	vmovdqu 0x20-0x80($ctx),$t2
	vpand $Xn,$B,$B
	vmovdqu 0x40-0x80($ctx),$t3
	vpand $Xn,$C,$C
	vmovdqu 0x60-0x80($ctx),$Xi
	vpand $Xn,$D,$D
	vpaddd $t1,$A,$A
	vmovdqu 0x80-0x80($ctx),$t1
	vpand $Xn,$E,$E
	vpaddd $t2,$B,$B
	vmovdqu 0xa0-0x80($ctx),$t2
	vpand $Xn,$F,$F
	vpaddd $t3,$C,$C
	vmovdqu 0xc0-0x80($ctx),$t3
	vpand $Xn,$G,$G
	vpaddd $Xi,$D,$D
	vmovdqu 0xe0-0x80($ctx),$Xi
	vpand $Xn,$H,$H
	vpaddd $t1,$E,$E
	vpaddd $t2,$F,$F
	vmovdqu $A,0x00-0x80($ctx)
	vpaddd $t3,$G,$G
	vmovdqu $B,0x20-0x80($ctx)
	vpaddd $Xi,$H,$H
	vmovdqu $C,0x40-0x80($ctx)
	vmovdqu $D,0x60-0x80($ctx)
	vmovdqu $E,0x80-0x80($ctx)
	vmovdqu $F,0xa0-0x80($ctx)
	vmovdqu $G,0xc0-0x80($ctx)
	vmovdqu $H,0xe0-0x80($ctx)
	vmovdqu $sigma,(%rbx) # save counters
	lea 256+128(%rsp),%rbx
	vmovdqu .Lpbswap(%rip),$Xn
	dec $num
	jnz .Loop_avx2
	#mov `$REG_SZ*17+8`(%rsp),$num
	#lea $REG_SZ($ctx),$ctx
	#lea `16*$REG_SZ/4`($inp),$inp
	#dec $num
	#jnz .Loop_grande_avx2
.Ldone_avx2:
	mov `$REG_SZ*17`(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps -0xd8(%rax),%xmm6
	movaps -0xc8(%rax),%xmm7
	movaps -0xb8(%rax),%xmm8
	movaps -0xa8(%rax),%xmm9
	movaps -0x98(%rax),%xmm10
	movaps -0x88(%rax),%xmm11
	movaps -0x78(%rax),%xmm12
	movaps -0x68(%rax),%xmm13
	movaps -0x58(%rax),%xmm14
	movaps -0x48(%rax),%xmm15
___
$code.=<<___;
	mov -48(%rax),%r15
.cfi_restore %r15
	mov -40(%rax),%r14
.cfi_restore %r14
	mov -32(%rax),%r13
.cfi_restore %r13
	mov -24(%rax),%r12
.cfi_restore %r12
	mov -16(%rax),%rbp
.cfi_restore %rbp
	mov -8(%rax),%rbx
.cfi_restore %rbx
	lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
} }}}

$code.=<<___;
.align 256
K256:
___
sub TABLE {
    foreach (@_) {
	$code.=<<___;
.long $_,$_,$_,$_
.long $_,$_,$_,$_
___
    }
}
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
K256_shaext:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # end of prologue label
	cmp %r10,%rbx # context->Rip<.Lbody
	jb .Lin_prologue
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=.Lepilogue
	jae .Lin_prologue
	mov `16*17`(%rax),%rax # pull saved stack pointer
	mov -8(%rax),%rbx
	mov -16(%rax),%rbp
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	lea -24-10*16(%rax),%rsi
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type avx2_handler,\@abi-omnipotent
.align 16
avx2_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # end of prologue label
	cmp %r10,%rbx # context->Rip<body label
	jb .Lin_prologue
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lin_prologue
	mov `32*17`($context),%rax # pull saved stack pointer
	mov -8(%rax),%rbx
	mov -16(%rax),%rbp
	mov -24(%rax),%r12
	mov -32(%rax),%r13
	mov -40(%rax),%r14
	mov -48(%rax),%r15
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15
	lea -56-10*16(%rax),%rsi
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
	jmp .Lin_prologue
.size avx2_handler,.-avx2_handler
___
$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_sha256_multi_block
.rva .LSEH_end_sha256_multi_block
.rva .LSEH_info_sha256_multi_block
.rva .LSEH_begin_sha256_multi_block_shaext
.rva .LSEH_end_sha256_multi_block_shaext
.rva .LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_sha256_multi_block_avx
.rva .LSEH_end_sha256_multi_block_avx
.rva .LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_sha256_multi_block_avx2
.rva .LSEH_end_sha256_multi_block_avx2
.rva .LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha256_multi_block:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody,.Lepilogue # HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
.byte 9,0,0,0
.rva avx2_handler
.rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04 if ($dst>=8);
    $rex|=0x01 if ($src>=8);
    unshift @opcode,$rex|0x40 if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1" => 0xcc,
		"sha256msg2" => 0xcd );

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
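
# sha256op38() above hand-assembles the three SHA-NI instructions so that the
# module still builds with assemblers that predate them. A hand-worked example
# (illustrative, not taken from this file) of what the substitution in the
# loop below produces:
#
#	sha256rnds2 %xmm8,%xmm1   ->   .byte 0x41,0x0f,0x38,0xcb,0xc8
#
# i.e. REX.B (0x41) because the r/m source register is xmm8, then the
# 0x0f,0x38,0xcb opcode, then ModR/M 0xc8 = mod 11, reg 001 (xmm1), r/m 000 (xmm8).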
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;