#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data into a designated lane of a SIMD register.
# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#               this            +aesni(i)        sha256  aesni-sha256  gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)  23.3/n  +1.28=7.11(n=4)  12.3    +3.75=16.1    +126%
# Atom(ii)      38.7/n  +3.93=13.6(n=4)  20.8    +5.69=26.5    +95%
# Sandy Bridge  (20.5   +5.15=25.7)/n    11.6    13.0          +103%
# Ivy Bridge    (20.4   +5.14=25.5)/n    10.3    11.6          +82%
# Haswell(iii)  (21.0   +5.00=26.0)/n    7.80    8.79          +170%
# Skylake       (18.9   +5.00=23.9)/n    7.70    8.17          +170%
# Bulldozer     (21.6   +5.76=27.4)/n    13.6    13.7          +100%
#
# (i)   multi-block CBC encrypt with 128-bit key;
# (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#       because of lower AES-NI instruction throughput, nor is there
#       AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
#       for n=4 is 20.3+4.44=24.7;
# (iv)  presented improvement coefficients are asymptotic limits and
#       in real-life application are somewhat lower, e.g. for 2KB
#       fragments they range from 75% to 130% (on Haswell);
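#
# In other words, the code transposes its inputs: 32-bit word j of
# buffer i is loaded into lane i of the SIMD register holding X[j],
# and the working variables A..H are likewise kept one lane per
# buffer, so each SIMD round step below advances all n hashes at once.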
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);	/* 1 or 2 */
#
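# For illustration, a hypothetical caller might look roughly like the
# sketch below (type and variable names here are made up, not part of
# this module). Each lane gets its own chaining values in *ctx and its
# own {ptr,blocks} descriptor; an idle lane is marked with blocks==0,
# and num selects how many groups of four buffers to process (1 or 2):
#
#	struct sha256_mb_ctx ctx;		/* the A[8]..H[8] layout above */
#	struct { void *ptr; int blocks; } inp[8];
#
#	for (i = 0; i < n; i++) {
#		inp[i].ptr    = data[i];	/* whole 64-byte blocks */
#		inp[i].blocks = len[i] / 64;
#	}
#	for (; i < 8; i++)
#		inp[i].blocks = 0;		/* park unused lanes */
#
#	sha256_multi_block(&ctx, inp, n > 4 ? 2 : 1);
#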
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;
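# Xi_off($i) returns the stack slot holding message word X[$i % 16].
# The 16-entry schedule is kept as a ring buffer split across an
# %rax-based and an %rbx-based window; the -128 bias (and the second
# window once $REG_SZ grows to 32 for AVX2) presumably keeps every
# displacement within signed 8-bit range.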
sub Xi_off {
my $off = shift;

$off %= 16; $off *= $REG_SZ;
$off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
movd `4*$i`(@ptr[0]),$Xi
movd `4*$i`(@ptr[1]),$t1
movd `4*$i`(@ptr[2]),$t2
movd `4*$i`(@ptr[3]),$t3
punpckldq $t2,$Xi
punpckldq $t3,$t1
punpckldq $t1,$Xi
___
$code.=<<___ if ($i==15);
movd `4*$i`(@ptr[0]),$Xi
lea `16*4`(@ptr[0]),@ptr[0]
movd `4*$i`(@ptr[1]),$t1
lea `16*4`(@ptr[1]),@ptr[1]
movd `4*$i`(@ptr[2]),$t2
lea `16*4`(@ptr[2]),@ptr[2]
movd `4*$i`(@ptr[3]),$t3
lea `16*4`(@ptr[3]),@ptr[3]
punpckldq $t2,$Xi
punpckldq $t3,$t1
punpckldq $t1,$Xi
___
$code.=<<___;
movdqa $e,$sigma
`"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
movdqa $e,$t3
`"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
psrld \$6,$sigma
movdqa $e,$t2
pslld \$7,$t3
movdqa $Xi,`&Xi_off($i)`
paddd $h,$Xi # Xi+=h
psrld \$11,$t2
pxor $t3,$sigma
pslld \$21-7,$t3
paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
pxor $t2,$sigma
psrld \$25-11,$t2
movdqa $e,$t1
`"prefetcht0 63(@ptr[0])" if ($i==15)`
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
pandn $g,$t1
pand $f,$axb
pxor $t2,$sigma
`"prefetcht0 63(@ptr[1])" if ($i==15)`
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
psrld \$2,$t2
paddd $sigma,$Xi # Xi+=Sigma1(e)
pxor $axb,$t1 # Ch(e,f,g)
movdqa $b,$axb
movdqa $a,$sigma
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
`"prefetcht0 63(@ptr[2])" if ($i==15)`
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
pslld \$19-10,$t3
pand $axb,$bxc
pxor $sigma,$t2
`"prefetcht0 63(@ptr[3])" if ($i==15)`
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
pslld \$30-19,$t3
pxor $t2,$sigma
pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
paddd $Xi,$d # d+=Xi
pxor $t3,$sigma # Sigma0(a)
paddd $Xi,$h # h+=Xi
paddd $sigma,$h # h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
lea `32*8`($Tbl),$Tbl
___
($axb,$bxc)=($bxc,$axb);
}
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
movdqa `&Xi_off($i+1)`,$Xn
paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
movdqa $Xn,$sigma
movdqa $Xn,$t2
psrld \$3,$sigma
movdqa $Xn,$t3
psrld \$7,$t2
movdqa `&Xi_off($i+14)`,$t1
pslld \$14,$t3
pxor $t2,$sigma
psrld \$18-7,$t2
movdqa $t1,$axb # borrow $axb
pxor $t3,$sigma
pslld \$25-14,$t3
pxor $t2,$sigma
psrld \$10,$t1
movdqa $axb,$t2
psrld \$17,$axb
pxor $t3,$sigma # sigma0(X[i+1])
pslld \$13,$t2
paddd $sigma,$Xi # Xi+=sigma0(X[i+1])
pxor $axb,$t1
psrld \$19-17,$axb
pxor $t2,$t1
pslld \$15-13,$t2
pxor $axb,$t1
pxor $t2,$t1 # sigma1(X[i+14])
paddd $t1,$Xi # Xi+=sigma1(X[i+14])
___
&ROUND_00_15($i,@_);
($Xi,$Xn)=($Xn,$Xi);
}
$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.globl sha256_multi_block
.type sha256_multi_block,\@function,3
.align 32
sha256_multi_block:
.cfi_startproc
mov OPENSSL_ia32cap_P+4(%rip),%rcx
bt \$61,%rcx # check SHA bit
jc _shaext_shortcut
___
$code.=<<___ if ($avx);
test \$`1<<28`,%ecx
jnz _avx_shortcut
___
$code.=<<___;
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,-0x78(%rax)
movaps %xmm11,-0x68(%rax)
movaps %xmm12,-0x58(%rax)
movaps %xmm13,-0x48(%rax)
movaps %xmm14,-0x38(%rax)
movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody:
lea K256+128(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
lea 0x80($ctx),$ctx # size optimization
.Loop_grande:
mov $num,`$REG_SZ*17+8`(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
test $num,$num
jz .Ldone
movdqu 0x00-0x80($ctx),$A # load context
lea 128(%rsp),%rax
movdqu 0x20-0x80($ctx),$B
movdqu 0x40-0x80($ctx),$C
movdqu 0x60-0x80($ctx),$D
movdqu 0x80-0x80($ctx),$E
movdqu 0xa0-0x80($ctx),$F
movdqu 0xc0-0x80($ctx),$G
movdqu 0xe0-0x80($ctx),$H
movdqu .Lpbswap(%rip),$Xn
jmp .Loop

.align 32
.Loop:
movdqa $C,$bxc
pxor $B,$bxc # magic seed
___
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
movdqu `&Xi_off($i)`,$Xi
mov \$3,%ecx
jmp .Loop_16_xx
.align 32
.Loop_16_xx:
___
for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
dec %ecx
jnz .Loop_16_xx

mov \$1,%ecx
lea K256+128(%rip),$Tbl

movdqa (%rbx),$sigma # pull counters
cmp 4*0(%rbx),%ecx # examine counters
pxor $t1,$t1
cmovge $Tbl,@ptr[0] # cancel input
cmp 4*1(%rbx),%ecx
movdqa $sigma,$Xn
cmovge $Tbl,@ptr[1]
cmp 4*2(%rbx),%ecx
pcmpgtd $t1,$Xn # mask value
cmovge $Tbl,@ptr[2]
cmp 4*3(%rbx),%ecx
paddd $Xn,$sigma # counters--
cmovge $Tbl,@ptr[3]

movdqu 0x00-0x80($ctx),$t1
pand $Xn,$A
movdqu 0x20-0x80($ctx),$t2
pand $Xn,$B
movdqu 0x40-0x80($ctx),$t3
pand $Xn,$C
movdqu 0x60-0x80($ctx),$Xi
pand $Xn,$D
paddd $t1,$A
movdqu 0x80-0x80($ctx),$t1
pand $Xn,$E
paddd $t2,$B
movdqu 0xa0-0x80($ctx),$t2
pand $Xn,$F
paddd $t3,$C
movdqu 0xc0-0x80($ctx),$t3
pand $Xn,$G
paddd $Xi,$D
movdqu 0xe0-0x80($ctx),$Xi
pand $Xn,$H
paddd $t1,$E
paddd $t2,$F
movdqu $A,0x00-0x80($ctx)
paddd $t3,$G
movdqu $B,0x20-0x80($ctx)
paddd $Xi,$H
movdqu $C,0x40-0x80($ctx)
movdqu $D,0x60-0x80($ctx)
movdqu $E,0x80-0x80($ctx)
movdqu $F,0xa0-0x80($ctx)
movdqu $G,0xc0-0x80($ctx)
movdqu $H,0xe0-0x80($ctx)

movdqa $sigma,(%rbx) # save counters
movdqa .Lpbswap(%rip),$Xn
dec $num
jnz .Loop

mov `$REG_SZ*17+8`(%rsp),$num
lea $REG_SZ($ctx),$ctx
lea `16*$REG_SZ/4`($inp),$inp
dec $num
jnz .Loop_grande

.Ldone:
mov `$REG_SZ*17`(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
movaps -0xa8(%rax),%xmm7
movaps -0x98(%rax),%xmm8
movaps -0x88(%rax),%xmm9
movaps -0x78(%rax),%xmm10
movaps -0x68(%rax),%xmm11
movaps -0x58(%rax),%xmm12
movaps -0x48(%rax),%xmm13
movaps -0x38(%rax),%xmm14
movaps -0x28(%rax),%xmm15
___
$code.=<<___;
mov -16(%rax),%rbp
.cfi_restore %rbp
mov -8(%rax),%rbx
.cfi_restore %rbx
lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue:
ret
.cfi_endproc
.size sha256_multi_block,.-sha256_multi_block
___
{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type sha256_multi_block_shaext,\@function,3
.align 32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,-0x78(%rax)
movaps %xmm11,-0x68(%rax)
movaps %xmm12,-0x58(%rax)
movaps %xmm13,-0x48(%rax)
movaps %xmm14,-0x38(%rax)
movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
sub \$`$REG_SZ*18`,%rsp
shl \$1,$num # we process a pair at a time
and \$-256,%rsp
lea 0x80($ctx),$ctx # size optimization
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_shaext:
lea `$REG_SZ*16`(%rsp),%rbx
lea K256_shaext+0x80(%rip),$Tbl
.Loop_grande_shaext:
mov $num,`$REG_SZ*17+8`(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<2;$i++) {
$code.=<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle %rsp,@ptr[$i] # cancel input
___
}
$code.=<<___;
test $num,$num
jz .Ldone_shaext

movq 0x00-0x80($ctx),$ABEF0 # A1.A0
movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
movq 0x40-0x80($ctx),$CDGH0 # C1.C0
movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0

punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0

movdqa K256_shaext-0x10(%rip),$TMPx # byte swap

movdqa $ABEF0,$ABEF1
movdqa $CDGH0,$CDGH1
punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1

pshufd \$0b00011011,$ABEF0,$ABEF0
pshufd \$0b00011011,$CDGH0,$CDGH0
pshufd \$0b00011011,$ABEF1,$ABEF1
pshufd \$0b00011011,$CDGH1,$CDGH1
jmp .Loop_shaext

.align 32
.Loop_shaext:
movdqu 0x00(@ptr[0]),@MSG0[0]
movdqu 0x00(@ptr[1]),@MSG1[0]
movdqu 0x10(@ptr[0]),@MSG0[1]
movdqu 0x10(@ptr[1]),@MSG1[1]
movdqu 0x20(@ptr[0]),@MSG0[2]
pshufb $TMPx,@MSG0[0]
movdqu 0x20(@ptr[1]),@MSG1[2]
pshufb $TMPx,@MSG1[0]
movdqu 0x30(@ptr[0]),@MSG0[3]
lea 0x40(@ptr[0]),@ptr[0]
movdqu 0x30(@ptr[1]),@MSG1[3]
lea 0x40(@ptr[1]),@ptr[1]

movdqa 0*16-0x80($Tbl),$Wi
pshufb $TMPx,@MSG0[1]
paddd @MSG0[0],$Wi
pxor $ABEF0,@MSG0[0] # black magic
movdqa $Wi,$TMP0
movdqa 0*16-0x80($Tbl),$TMP1
pshufb $TMPx,@MSG1[1]
paddd @MSG1[0],$TMP1
movdqa $CDGH0,0x50(%rsp) # offload
sha256rnds2 $ABEF0,$CDGH0 # 0-3
pxor $ABEF1,@MSG1[0] # black magic
movdqa $TMP1,$Wi
movdqa $CDGH1,0x70(%rsp)
sha256rnds2 $ABEF1,$CDGH1 # 0-3
pshufd \$0x0e,$TMP0,$Wi
pxor $ABEF0,@MSG0[0] # black magic
movdqa $ABEF0,0x40(%rsp) # offload
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
pxor $ABEF1,@MSG1[0] # black magic
movdqa $ABEF1,0x60(%rsp)
movdqa 1*16-0x80($Tbl),$TMP0
paddd @MSG0[1],$TMP0
pshufb $TMPx,@MSG0[2]
sha256rnds2 $CDGH1,$ABEF1

movdqa $TMP0,$Wi
movdqa 1*16-0x80($Tbl),$TMP1
paddd @MSG1[1],$TMP1
sha256rnds2 $ABEF0,$CDGH0 # 4-7
movdqa $TMP1,$Wi
prefetcht0 127(@ptr[0])
pshufb $TMPx,@MSG0[3]
pshufb $TMPx,@MSG1[2]
prefetcht0 127(@ptr[1])
sha256rnds2 $ABEF1,$CDGH1 # 4-7
pshufd \$0x0e,$TMP0,$Wi
pshufb $TMPx,@MSG1[3]
sha256msg1 @MSG0[1],@MSG0[0]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 2*16-0x80($Tbl),$TMP0
paddd @MSG0[2],$TMP0
sha256rnds2 $CDGH1,$ABEF1

movdqa $TMP0,$Wi
movdqa 2*16-0x80($Tbl),$TMP1
paddd @MSG1[2],$TMP1
sha256rnds2 $ABEF0,$CDGH0 # 8-11
sha256msg1 @MSG1[1],@MSG1[0]
movdqa $TMP1,$Wi
movdqa @MSG0[3],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 8-11
pshufd \$0x0e,$TMP0,$Wi
palignr \$4,@MSG0[2],$TMPx
paddd $TMPx,@MSG0[0]
movdqa @MSG1[3],$TMPx
palignr \$4,@MSG1[2],$TMPx
sha256msg1 @MSG0[2],@MSG0[1]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 3*16-0x80($Tbl),$TMP0
paddd @MSG0[3],$TMP0
sha256rnds2 $CDGH1,$ABEF1
sha256msg1 @MSG1[2],@MSG1[1]

movdqa $TMP0,$Wi
movdqa 3*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[0]
paddd @MSG1[3],$TMP1
sha256msg2 @MSG0[3],@MSG0[0]
sha256rnds2 $ABEF0,$CDGH0 # 12-15
movdqa $TMP1,$Wi
movdqa @MSG0[0],$TMPx
palignr \$4,@MSG0[3],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 12-15
sha256msg2 @MSG1[3],@MSG1[0]
pshufd \$0x0e,$TMP0,$Wi
paddd $TMPx,@MSG0[1]
movdqa @MSG1[0],$TMPx
palignr \$4,@MSG1[3],$TMPx
sha256msg1 @MSG0[3],@MSG0[2]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 4*16-0x80($Tbl),$TMP0
paddd @MSG0[0],$TMP0
sha256rnds2 $CDGH1,$ABEF1
sha256msg1 @MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
movdqa $TMP0,$Wi
movdqa $i*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[1]
paddd @MSG1[0],$TMP1
sha256msg2 @MSG0[0],@MSG0[1]
sha256rnds2 $ABEF0,$CDGH0 # 16-19...
movdqa $TMP1,$Wi
movdqa @MSG0[1],$TMPx
palignr \$4,@MSG0[0],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 16-19...
sha256msg2 @MSG1[0],@MSG1[1]
pshufd \$0x0e,$TMP0,$Wi
paddd $TMPx,@MSG0[2]
movdqa @MSG1[1],$TMPx
palignr \$4,@MSG1[0],$TMPx
sha256msg1 @MSG0[0],@MSG0[3]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa `($i+1)*16`-0x80($Tbl),$TMP0
paddd @MSG0[1],$TMP0
sha256rnds2 $CDGH1,$ABEF1
sha256msg1 @MSG1[0],@MSG1[3]
___
push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
}
$code.=<<___;
movdqa $TMP0,$Wi
movdqa 13*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[1]
paddd @MSG1[0],$TMP1
sha256msg2 @MSG0[0],@MSG0[1]
sha256rnds2 $ABEF0,$CDGH0 # 52-55
movdqa $TMP1,$Wi
movdqa @MSG0[1],$TMPx
palignr \$4,@MSG0[0],$TMPx
sha256rnds2 $ABEF1,$CDGH1 # 52-55
sha256msg2 @MSG1[0],@MSG1[1]
pshufd \$0x0e,$TMP0,$Wi
paddd $TMPx,@MSG0[2]
movdqa @MSG1[1],$TMPx
palignr \$4,@MSG1[0],$TMPx
nop
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 14*16-0x80($Tbl),$TMP0
paddd @MSG0[1],$TMP0
sha256rnds2 $CDGH1,$ABEF1

movdqa $TMP0,$Wi
movdqa 14*16-0x80($Tbl),$TMP1
paddd $TMPx,@MSG1[2]
paddd @MSG1[1],$TMP1
sha256msg2 @MSG0[1],@MSG0[2]
nop
sha256rnds2 $ABEF0,$CDGH0 # 56-59
movdqa $TMP1,$Wi
mov \$1,%ecx
pxor @MSG0[1],@MSG0[1] # zero
sha256rnds2 $ABEF1,$CDGH1 # 56-59
sha256msg2 @MSG1[1],@MSG1[2]
pshufd \$0x0e,$TMP0,$Wi
movdqa 15*16-0x80($Tbl),$TMP0
paddd @MSG0[2],$TMP0
movq (%rbx),@MSG0[2] # pull counters
nop
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
movdqa 15*16-0x80($Tbl),$TMP1
paddd @MSG1[2],$TMP1
sha256rnds2 $CDGH1,$ABEF1

movdqa $TMP0,$Wi
cmp 4*0(%rbx),%ecx # examine counters
cmovge %rsp,@ptr[0] # cancel input
cmp 4*1(%rbx),%ecx
cmovge %rsp,@ptr[1]
pshufd \$0x00,@MSG0[2],@MSG1[0]
sha256rnds2 $ABEF0,$CDGH0 # 60-63
movdqa $TMP1,$Wi
pshufd \$0x55,@MSG0[2],@MSG1[1]
movdqa @MSG0[2],@MSG1[2]
sha256rnds2 $ABEF1,$CDGH1 # 60-63
pshufd \$0x0e,$TMP0,$Wi
pcmpgtd @MSG0[1],@MSG1[0]
pcmpgtd @MSG0[1],@MSG1[1]
sha256rnds2 $CDGH0,$ABEF0
pshufd \$0x0e,$TMP1,$Wi
pcmpgtd @MSG0[1],@MSG1[2] # counter mask
movdqa K256_shaext-0x10(%rip),$TMPx
sha256rnds2 $CDGH1,$ABEF1

pand @MSG1[0],$CDGH0
pand @MSG1[1],$CDGH1
pand @MSG1[0],$ABEF0
pand @MSG1[1],$ABEF1
paddd @MSG0[2],@MSG1[2] # counters--

paddd 0x50(%rsp),$CDGH0
paddd 0x70(%rsp),$CDGH1
paddd 0x40(%rsp),$ABEF0
paddd 0x60(%rsp),$ABEF1

movq @MSG1[2],(%rbx) # save counters
dec $num
jnz .Loop_shaext

mov `$REG_SZ*17+8`(%rsp),$num

pshufd \$0b00011011,$ABEF0,$ABEF0
pshufd \$0b00011011,$CDGH0,$CDGH0
pshufd \$0b00011011,$ABEF1,$ABEF1
pshufd \$0b00011011,$CDGH1,$CDGH1

movdqa $ABEF0,@MSG0[0]
movdqa $CDGH0,@MSG0[1]
punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0

movq $ABEF0,0x00-0x80($ctx) # A1.A0
psrldq \$8,$ABEF0
movq @MSG0[0],0x80-0x80($ctx) # E1.E0
psrldq \$8,@MSG0[0]
movq $ABEF0,0x20-0x80($ctx) # B1.B0
movq @MSG0[0],0xa0-0x80($ctx) # F1.F0

movq $CDGH0,0x40-0x80($ctx) # C1.C0
psrldq \$8,$CDGH0
movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
psrldq \$8,@MSG0[1]
movq $CDGH0,0x60-0x80($ctx) # D1.D0
movq @MSG0[1],0xe0-0x80($ctx) # H1.H0

lea `$REG_SZ/2`($ctx),$ctx
lea `16*2`($inp),$inp
dec $num
jnz .Loop_grande_shaext

.Ldone_shaext:
#mov `$REG_SZ*17`(%rsp),%rax # original %rsp
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
movaps -0xa8(%rax),%xmm7
movaps -0x98(%rax),%xmm8
movaps -0x88(%rax),%xmm9
movaps -0x78(%rax),%xmm10
movaps -0x68(%rax),%xmm11
movaps -0x58(%rax),%xmm12
movaps -0x48(%rax),%xmm13
movaps -0x38(%rax),%xmm14
movaps -0x28(%rax),%xmm15
___
$code.=<<___;
mov -16(%rax),%rbp
.cfi_restore %rbp
mov -8(%rax),%rbx
.cfi_restore %rbx
lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_shaext:
ret
.cfi_endproc
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
}}}
if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
vmovd `4*$i`(@ptr[0]),$Xi
vmovd `4*$i`(@ptr[1]),$t1
vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
vpunpckldq $t1,$Xi,$Xi
vpshufb $Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
vmovd `4*$i`(@ptr[0]),$Xi
lea `16*4`(@ptr[0]),@ptr[0]
vmovd `4*$i`(@ptr[1]),$t1
lea `16*4`(@ptr[1]),@ptr[1]
vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
lea `16*4`(@ptr[2]),@ptr[2]
vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
lea `16*4`(@ptr[3]),@ptr[3]
vpunpckldq $t1,$Xi,$Xi
vpshufb $Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
vmovd `4*$i`(@ptr[0]),$Xi
vmovd `4*$i`(@ptr[4]),$t1
vmovd `4*$i`(@ptr[1]),$t2
vmovd `4*$i`(@ptr[5]),$t3
vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
vpunpckldq $t2,$Xi,$Xi
vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
vpunpckldq $t3,$t1,$t1
vinserti128 $t1,$Xi,$Xi
vpshufb $Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
vmovd `4*$i`(@ptr[0]),$Xi
lea `16*4`(@ptr[0]),@ptr[0]
vmovd `4*$i`(@ptr[4]),$t1
lea `16*4`(@ptr[4]),@ptr[4]
vmovd `4*$i`(@ptr[1]),$t2
lea `16*4`(@ptr[1]),@ptr[1]
vmovd `4*$i`(@ptr[5]),$t3
lea `16*4`(@ptr[5]),@ptr[5]
vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
lea `16*4`(@ptr[2]),@ptr[2]
vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
lea `16*4`(@ptr[6]),@ptr[6]
vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
lea `16*4`(@ptr[3]),@ptr[3]
vpunpckldq $t2,$Xi,$Xi
vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
lea `16*4`(@ptr[7]),@ptr[7]
vpunpckldq $t3,$t1,$t1
vinserti128 $t1,$Xi,$Xi
vpshufb $Xn,$Xi,$Xi
___
$code.=<<___;
vpsrld \$6,$e,$sigma
vpslld \$26,$e,$t3
vmovdqu $Xi,`&Xi_off($i)`
vpaddd $h,$Xi,$Xi # Xi+=h
vpsrld \$11,$e,$t2
vpxor $t3,$sigma,$sigma
vpslld \$21,$e,$t3
vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
vpxor $t2,$sigma,$sigma
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
`"prefetcht0 63(@ptr[0])" if ($i==15)`
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
`"prefetcht0 63(@ptr[1])" if ($i==15)`
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
`"prefetcht0 63(@ptr[2])" if ($i==15)`
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
`"prefetcht0 63(@ptr[3])" if ($i==15)`
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)
vpaddd $Xi,$h,$h # h+=Xi
vpaddd $sigma,$h,$h # h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
add \$`32*8`,$Tbl
___
($axb,$bxc)=($bxc,$axb);
}
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
vmovdqu `&Xi_off($i+1)`,$Xn
vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
vpsrld \$3,$Xn,$sigma
vpsrld \$7,$Xn,$t2
vpslld \$25,$Xn,$t3
vpxor $t2,$sigma,$sigma
vpsrld \$18,$Xn,$t2
vpxor $t3,$sigma,$sigma
vpslld \$14,$Xn,$t3
vmovdqu `&Xi_off($i+14)`,$t1
vpsrld \$10,$t1,$axb # borrow $axb
vpxor $t2,$sigma,$sigma
vpsrld \$17,$t1,$t2
vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
vpslld \$15,$t1,$t3
vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(X[i+1])
vpxor $t2,$axb,$sigma
vpsrld \$19,$t1,$t2
vpxor $t3,$sigma,$sigma
vpslld \$13,$t1,$t3
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # sigma1(X[i+14])
vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
___
&ROUND_00_15_avx($i,@_);
($Xi,$Xn)=($Xn,$Xi);
}
$code.=<<___;
.type sha256_multi_block_avx,\@function,3
.align 32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
shr \$32,%rcx
cmp \$2,$num
jb .Lavx
test \$`1<<5`,%ecx
jnz _avx2_shortcut
jmp .Lavx
.align 32
.Lavx:
___
$code.=<<___;
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,-0x78(%rax)
movaps %xmm11,-0x68(%rax)
movaps %xmm12,-0x58(%rax)
movaps %xmm13,-0x48(%rax)
movaps %xmm14,-0x38(%rax)
movaps %xmm15,-0x28(%rax)
___
$code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
lea K256+128(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
lea 0x80($ctx),$ctx # size optimization
.Loop_grande_avx:
mov $num,`$REG_SZ*17+8`(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
test $num,$num
jz .Ldone_avx

vmovdqu 0x00-0x80($ctx),$A # load context
lea 128(%rsp),%rax
vmovdqu 0x20-0x80($ctx),$B
vmovdqu 0x40-0x80($ctx),$C
vmovdqu 0x60-0x80($ctx),$D
vmovdqu 0x80-0x80($ctx),$E
vmovdqu 0xa0-0x80($ctx),$F
vmovdqu 0xc0-0x80($ctx),$G
vmovdqu 0xe0-0x80($ctx),$H
vmovdqu .Lpbswap(%rip),$Xn
jmp .Loop_avx

.align 32
.Loop_avx:
vpxor $B,$C,$bxc # magic seed
___
for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
vmovdqu `&Xi_off($i)`,$Xi
mov \$3,%ecx
jmp .Loop_16_xx_avx
.align 32
.Loop_16_xx_avx:
___
for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
dec %ecx
jnz .Loop_16_xx_avx

mov \$1,%ecx
lea K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
$code.=<<___;
cmp `4*$i`(%rbx),%ecx # examine counters
cmovge $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
vmovdqa (%rbx),$sigma # pull counters
vpxor $t1,$t1,$t1
vmovdqa $sigma,$Xn
vpcmpgtd $t1,$Xn,$Xn # mask value
vpaddd $Xn,$sigma,$sigma # counters--

vmovdqu 0x00-0x80($ctx),$t1
vpand $Xn,$A,$A
vmovdqu 0x20-0x80($ctx),$t2
vpand $Xn,$B,$B
vmovdqu 0x40-0x80($ctx),$t3
vpand $Xn,$C,$C
vmovdqu 0x60-0x80($ctx),$Xi
vpand $Xn,$D,$D
vpaddd $t1,$A,$A
vmovdqu 0x80-0x80($ctx),$t1
vpand $Xn,$E,$E
vpaddd $t2,$B,$B
vmovdqu 0xa0-0x80($ctx),$t2
vpand $Xn,$F,$F
vpaddd $t3,$C,$C
vmovdqu 0xc0-0x80($ctx),$t3
vpand $Xn,$G,$G
vpaddd $Xi,$D,$D
vmovdqu 0xe0-0x80($ctx),$Xi
vpand $Xn,$H,$H
vpaddd $t1,$E,$E
vpaddd $t2,$F,$F
vmovdqu $A,0x00-0x80($ctx)
vpaddd $t3,$G,$G
vmovdqu $B,0x20-0x80($ctx)
vpaddd $Xi,$H,$H
vmovdqu $C,0x40-0x80($ctx)
vmovdqu $D,0x60-0x80($ctx)
vmovdqu $E,0x80-0x80($ctx)
vmovdqu $F,0xa0-0x80($ctx)
vmovdqu $G,0xc0-0x80($ctx)
vmovdqu $H,0xe0-0x80($ctx)

vmovdqu $sigma,(%rbx) # save counters
vmovdqu .Lpbswap(%rip),$Xn
dec $num
jnz .Loop_avx

mov `$REG_SZ*17+8`(%rsp),$num
lea $REG_SZ($ctx),$ctx
lea `16*$REG_SZ/4`($inp),$inp
dec $num
jnz .Loop_grande_avx

.Ldone_avx:
mov `$REG_SZ*17`(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
vzeroupper
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
movaps -0xa8(%rax),%xmm7
movaps -0x98(%rax),%xmm8
movaps -0x88(%rax),%xmm9
movaps -0x78(%rax),%xmm10
movaps -0x68(%rax),%xmm11
movaps -0x58(%rax),%xmm12
movaps -0x48(%rax),%xmm13
movaps -0x38(%rax),%xmm14
movaps -0x28(%rax),%xmm15
___
$code.=<<___;
mov -16(%rax),%rbp
.cfi_restore %rbp
mov -8(%rax),%rbx
.cfi_restore %rbx
lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
.cfi_endproc
.size sha256_multi_block_avx,.-sha256_multi_block_avx
___
if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type sha256_multi_block_avx2,\@function,3
.align 32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,-0x78(%rax)
movaps %xmm13,-0x68(%rax)
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
$code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
lea K256+128(%rip),$Tbl
lea 0x80($ctx),$ctx # size optimization
.Loop_grande_avx2:
mov $num,`$REG_SZ*17+8`(%rsp) # original $num
xor $num,$num
lea `$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
$code.=<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
vmovdqu 0x00-0x80($ctx),$A # load context
lea 128(%rsp),%rax
vmovdqu 0x20-0x80($ctx),$B
lea 256+128(%rsp),%rbx
vmovdqu 0x40-0x80($ctx),$C
vmovdqu 0x60-0x80($ctx),$D
vmovdqu 0x80-0x80($ctx),$E
vmovdqu 0xa0-0x80($ctx),$F
vmovdqu 0xc0-0x80($ctx),$G
vmovdqu 0xe0-0x80($ctx),$H
vmovdqu .Lpbswap(%rip),$Xn
jmp .Loop_avx2

.align 32
.Loop_avx2:
vpxor $B,$C,$bxc # magic seed
___
for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
vmovdqu `&Xi_off($i)`,$Xi
mov \$3,%ecx
jmp .Loop_16_xx_avx2
.align 32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
dec %ecx
jnz .Loop_16_xx_avx2

mov \$1,%ecx
lea `$REG_SZ*16`(%rsp),%rbx
lea K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
$code.=<<___;
cmp `4*$i`(%rbx),%ecx # examine counters
cmovge $Tbl,@ptr[$i] # cancel input
___
}
$code.=<<___;
vmovdqa (%rbx),$sigma # pull counters
vpxor $t1,$t1,$t1
vmovdqa $sigma,$Xn
vpcmpgtd $t1,$Xn,$Xn # mask value
vpaddd $Xn,$sigma,$sigma # counters--

vmovdqu 0x00-0x80($ctx),$t1
vpand $Xn,$A,$A
vmovdqu 0x20-0x80($ctx),$t2
vpand $Xn,$B,$B
vmovdqu 0x40-0x80($ctx),$t3
vpand $Xn,$C,$C
vmovdqu 0x60-0x80($ctx),$Xi
vpand $Xn,$D,$D
vpaddd $t1,$A,$A
vmovdqu 0x80-0x80($ctx),$t1
vpand $Xn,$E,$E
vpaddd $t2,$B,$B
vmovdqu 0xa0-0x80($ctx),$t2
vpand $Xn,$F,$F
vpaddd $t3,$C,$C
vmovdqu 0xc0-0x80($ctx),$t3
vpand $Xn,$G,$G
vpaddd $Xi,$D,$D
vmovdqu 0xe0-0x80($ctx),$Xi
vpand $Xn,$H,$H
vpaddd $t1,$E,$E
vpaddd $t2,$F,$F
vmovdqu $A,0x00-0x80($ctx)
vpaddd $t3,$G,$G
vmovdqu $B,0x20-0x80($ctx)
vpaddd $Xi,$H,$H
vmovdqu $C,0x40-0x80($ctx)
vmovdqu $D,0x60-0x80($ctx)
vmovdqu $E,0x80-0x80($ctx)
vmovdqu $F,0xa0-0x80($ctx)
vmovdqu $G,0xc0-0x80($ctx)
vmovdqu $H,0xe0-0x80($ctx)

vmovdqu $sigma,(%rbx) # save counters
lea 256+128(%rsp),%rbx
vmovdqu .Lpbswap(%rip),$Xn
dec $num
jnz .Loop_avx2

#mov `$REG_SZ*17+8`(%rsp),$num
#lea $REG_SZ($ctx),$ctx
#lea `16*$REG_SZ/4`($inp),$inp
#dec $num
#jnz .Loop_grande_avx2

.Ldone_avx2:
mov `$REG_SZ*17`(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
vzeroupper
___
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
movaps -0x88(%rax),%xmm11
movaps -0x78(%rax),%xmm12
movaps -0x68(%rax),%xmm13
movaps -0x58(%rax),%xmm14
movaps -0x48(%rax),%xmm15
___
$code.=<<___;
mov -48(%rax),%r15
.cfi_restore %r15
mov -40(%rax),%r14
.cfi_restore %r14
mov -32(%rax),%r13
.cfi_restore %r13
mov -24(%rax),%r12
.cfi_restore %r12
mov -16(%rax),%rbp
.cfi_restore %rbp
mov -8(%rax),%rbx
.cfi_restore %rbx
lea (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
.cfi_endproc
.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
} }}}
$code.=<<___;
.align 256
K256:
___
sub TABLE {
foreach (@_) {
$code.=<<___;
.long $_,$_,$_,$_
.long $_,$_,$_,$_
___
}
}
&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
K256_shaext:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lbody
jb .Lin_prologue

mov 152($context),%rax # pull context->Rsp

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue

mov `16*17`(%rax),%rax # pull saved stack pointer

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp

lea -24-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq

.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type avx2_handler,\@abi-omnipotent
.align 16
avx2_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<body label
jb .Lin_prologue

mov 152($context),%rax # pull context->Rsp

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue

mov `32*17`($context),%rax # pull saved stack pointer

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

lea -56-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq

jmp .Lin_prologue
.size avx2_handler,.-avx2_handler
___
$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_sha256_multi_block
.rva .LSEH_end_sha256_multi_block
.rva .LSEH_info_sha256_multi_block
.rva .LSEH_begin_sha256_multi_block_shaext
.rva .LSEH_end_sha256_multi_block_shaext
.rva .LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_sha256_multi_block_avx
.rva .LSEH_end_sha256_multi_block_avx
.rva .LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_sha256_multi_block_avx2
.rva .LSEH_end_sha256_multi_block_avx2
.rva .LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha256_multi_block:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody,.Lepilogue # HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
.byte 9,0,0,0
.rva avx2_handler
.rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
####################################################################

sub rex {
local *opcode=shift;
my ($dst,$src)=@_;
my $rex=0;

$rex|=0x04 if ($dst>=8);
$rex|=0x01 if ($src>=8);
unshift @opcode,$rex|0x40 if ($rex);
}

sub sha256op38 {
my $instr = shift;
my %opcodelet = (
	"sha256rnds2" => 0xcb,
	"sha256msg1" => 0xcc,
	"sha256msg2" => 0xcd );

if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
	return ".byte\t".join(',',@opcode);
} else {
	return $instr."\t".@_[0];
}
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;