sha256-mb-x86_64.pl 39 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # Multi-buffer SHA256 procedure processes n buffers in parallel by
  15. # placing buffer data to designated lane of SIMD register. n is
  16. # naturally limited to 4 on pre-AVX2 processors and to 8 on
  17. # AVX2-capable processors such as Haswell.
  18. #
  19. #               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
  20. # -------------------------------------------------------------------
  21. # Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
  22. # Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
  23. # Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
  24. # Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
  25. # Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
  26. # Skylake       (18.9   +5.00=23.9)/n   7.70    8.17            +170%
  27. # Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
  28. #
  29. # (i) multi-block CBC encrypt with 128-bit key;
  30. # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
  31. # because of lower AES-NI instruction throughput, nor is there
  32. # AES-NI-SHA256 stitch for these processors;
  33. # (iii) "this" is for n=8, when we gather twice as much data, result
  34. # for n=4 is 20.3+4.44=24.7;
  35. # (iv) presented improvement coefficients are asymptotic limits and
  36. # in real-life application are somewhat lower, e.g. for 2KB
  37. # fragments they range from 75% to 130% (on Haswell);
  38. # $output is the last argument if it looks like a file (it has an extension)
  39. # $flavour is the first argument if it doesn't look like a file
  # Parse the command line: a trailing argument that has an extension is the
  # output file, a leading argument without a dot is the perlasm flavour.
  40. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  41. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # Win64 handling is enabled for masm/nasm/mingw64 flavours or a .asm output.
  42. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # $dir is the directory part of this script's path (separator included);
  # the translator is looked up next to the script, then in ../../perlasm.
  43. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  44. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  45. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  46. die "can't locate x86_64-xlate.pl";
  # Make the support module loadable from the same two locations.
  47. push(@INC,"${dir}","${dir}../../perlasm");
  48. require "x86_64-support.pl";
  # NOTE(review): pointer_size() is defined in x86_64-support.pl (not visible
  # here); its result is used below as a byte size/offset inside inp[] elements.
  49. $ptr_size=&pointer_size($flavour);
  # $avx controls how much vector code is generated: 0 emits no AVX code,
  # non-zero enables the AVX path, and >1 additionally enables the AVX2 path.
  # It is derived from the version of whichever assembler can be detected.
  50. $avx=0;
  # GNU assembler, probed through $ENV{CC}.
  51. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  52. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  53. $avx = ($1>=2.19) + ($1>=2.22);
  54. }
  # NASM, only considered for Win64 builds.
  55. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  56. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  57. $avx = ($1>=2.09) + ($1>=2.10);
  58. }
  # MASM (ml64), only considered for Win64 builds.
  59. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  60. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  61. $avx = ($1>=10) + ($1>=11);
  62. }
  # clang/LLVM, probed through "$ENV{CC} -v".
  63. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  64. $avx = ($2>=3.0) + ($2>3.0);
  65. }
  # Pipe everything written to STDOUT through the translator script, which
  # receives the flavour and the output file name as its arguments.
  66. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  67. or die "can't call $xlate: $!";
  68. *STDOUT=*OUT;
  69. # void sha256_multi_block (
  70. # struct { unsigned int A[8];
  71. # unsigned int B[8];
  72. # unsigned int C[8];
  73. # unsigned int D[8];
  74. # unsigned int E[8];
  75. # unsigned int F[8];
  76. # unsigned int G[8];
  77. # unsigned int H[8]; } *ctx,
  78. # struct { void *ptr; int blocks; } inp[8],
  79. # int num); /* 1 or 2 */
  80. #
  # Register assignment used by the SSE code path below.
  81. $ctx="%rdi"; # 1st arg
  82. $inp="%rsi"; # 2nd arg
  83. $num="%edx"; # 3rd arg
  # One input pointer register per lane (%r8..%r11).
  84. @ptr=map("%r$_",(8..11));
  # Pointer into the round-constant table (loaded from K256+128 below).
  85. $Tbl="%rbp";
  # Byte size of one inp[] element: two pointer-sized slots, the input pointer
  # at offset 0 and the block count at offset $ptr_size.
  86. $inp_elm_size=2*$ptr_size;
  # Working variables a..h live in %xmm8-%xmm15, scratch values in %xmm0-%xmm7.
  87. @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
  88. ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
  # Bytes per SIMD register; Xi_off() scales slot indices by this value.
  89. $REG_SZ=16;
  # Xi_off(IDX) - memory operand for slot IDX of the 16-entry X[] ring buffer.
  #
  # IDX is reduced modulo 16 and scaled by $REG_SZ to get a byte displacement.
  # Displacements below 256 are addressed off %rax, larger ones off %rbx; both
  # forms keep the textual "-128" bias so the assembler folds the constant.
  # With $REG_SZ==16 the displacement never exceeds 240, so only the %rax form
  # is produced.
  sub Xi_off {
      my ($slot) = @_;
      my $disp = ($slot % 16) * $REG_SZ;

      return "$disp-128(%rax)" if $disp < 256;
      return "$disp-256-128(%rbx)";
  }
  # ROUND_00_15($i, $a, $b, $c, $d, $e, $f, $g, $h)
  #
  # Appends the SSE instructions for one SHA-256 round to $code. $i is the
  # round index and $a..$h name the registers currently holding the working
  # variables. For $i<=15 the message word is first gathered from the four
  # input pointers into $Xi and byte-swapped with the mask held in $Xn; on
  # round 15 the input pointers are also advanced by 64 bytes and the next
  # block is prefetched. When called with $i>=16 (from ROUND_16_XX) the load
  # and byte-swap parts are skipped and $Xi is used as prepared by the caller.
  # Backtick expressions inside the text are evaluated later, when $code is
  # post-processed.
  95. sub ROUND_00_15 {
  96. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  # Rounds 0..14: gather word $i from each lane and interleave into $Xi.
  97. $code.=<<___ if ($i<15);
  98. movd `4*$i`(@ptr[0]),$Xi
  99. movd `4*$i`(@ptr[1]),$t1
  100. movd `4*$i`(@ptr[2]),$t2
  101. movd `4*$i`(@ptr[3]),$t3
  102. punpckldq $t2,$Xi
  103. punpckldq $t3,$t1
  104. punpckldq $t1,$Xi
  105. ___
  # Round 15: same gather, interleaved with advancing each pointer by 16*4.
  106. $code.=<<___ if ($i==15);
  107. movd `4*$i`(@ptr[0]),$Xi
  108. lea `16*4`(@ptr[0]),@ptr[0]
  109. movd `4*$i`(@ptr[1]),$t1
  110. lea `16*4`(@ptr[1]),@ptr[1]
  111. movd `4*$i`(@ptr[2]),$t2
  112. lea `16*4`(@ptr[2]),@ptr[2]
  113. movd `4*$i`(@ptr[3]),$t3
  114. lea `16*4`(@ptr[3]),@ptr[3]
  115. punpckldq $t2,$Xi
  116. punpckldq $t3,$t1
  117. punpckldq $t1,$Xi
  118. ___
  # Round body: stores $Xi into its X[] slot, accumulates h, K[round],
  # Sigma1(e) and Ch(e,f,g) into $Xi, then adds $Xi to d and forms the new
  # value in $h from Maj(a,b,c), $Xi and Sigma0(a).
  119. $code.=<<___;
  120. movdqa $e,$sigma
  121. `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
  122. movdqa $e,$t3
  123. `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
  124. psrld \$6,$sigma
  125. movdqa $e,$t2
  126. pslld \$7,$t3
  127. movdqa $Xi,`&Xi_off($i)`
  128. paddd $h,$Xi # Xi+=h
  129. psrld \$11,$t2
  130. pxor $t3,$sigma
  131. pslld \$21-7,$t3
  132. paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
  133. pxor $t2,$sigma
  134. psrld \$25-11,$t2
  135. movdqa $e,$t1
  136. `"prefetcht0 63(@ptr[0])" if ($i==15)`
  137. pxor $t3,$sigma
  138. movdqa $e,$axb # borrow $axb
  139. pslld \$26-21,$t3
  140. pandn $g,$t1
  141. pand $f,$axb
  142. pxor $t2,$sigma
  143. `"prefetcht0 63(@ptr[1])" if ($i==15)`
  144. movdqa $a,$t2
  145. pxor $t3,$sigma # Sigma1(e)
  146. movdqa $a,$t3
  147. psrld \$2,$t2
  148. paddd $sigma,$Xi # Xi+=Sigma1(e)
  149. pxor $axb,$t1 # Ch(e,f,g)
  150. movdqa $b,$axb
  151. movdqa $a,$sigma
  152. pslld \$10,$t3
  153. pxor $a,$axb # a^b, b^c in next round
  154. `"prefetcht0 63(@ptr[2])" if ($i==15)`
  155. psrld \$13,$sigma
  156. pxor $t3,$t2
  157. paddd $t1,$Xi # Xi+=Ch(e,f,g)
  158. pslld \$19-10,$t3
  159. pand $axb,$bxc
  160. pxor $sigma,$t2
  161. `"prefetcht0 63(@ptr[3])" if ($i==15)`
  162. psrld \$22-13,$sigma
  163. pxor $t3,$t2
  164. movdqa $b,$h
  165. pslld \$30-19,$t3
  166. pxor $t2,$sigma
  167. pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  168. paddd $Xi,$d # d+=Xi
  169. pxor $t3,$sigma # Sigma0(a)
  170. paddd $Xi,$h # h+=Xi
  171. paddd $sigma,$h # h+=Sigma0(a)
  172. ___
  # Every 8th round moves $Tbl forward by 32*8 bytes, since the K offsets
  # above only span ($i%8).
  173. $code.=<<___ if (($i%8)==7);
  174. lea `32*8`($Tbl),$Tbl
  175. ___
  # The a^b just computed in $axb is the b^c of the next round: swap the names.
  176. ($axb,$bxc)=($bxc,$axb);
  177. }
  # ROUND_16_XX($i, $a..$h)
  #
  # Appends the message-schedule update for round $i (>=16) followed by the
  # round itself. X[i+1] is loaded into $Xn, then X[i+9], sigma0(X[i+1]) and
  # sigma1(X[i+14]) are added into $Xi, and ROUND_00_15 is invoked with the
  # remaining arguments. Finally $Xi/$Xn are swapped, so the X[i+1] already in
  # a register becomes the next round's $Xi.
  #
  # NOTE(review): two trailing comments in the emitted text look mislabelled:
  # "Xi+=sigma0(e)" adds the value the preceding line calls sigma0(X[i+1]),
  # and "sigma0(X[i+14])" marks the value the next line adds as
  # sigma1(X[i+14]). The text is left untouched here.
  178. sub ROUND_16_XX {
  179. my $i=shift;
  180. $code.=<<___;
  181. movdqa `&Xi_off($i+1)`,$Xn
  182. paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
  183. movdqa $Xn,$sigma
  184. movdqa $Xn,$t2
  185. psrld \$3,$sigma
  186. movdqa $Xn,$t3
  187. psrld \$7,$t2
  188. movdqa `&Xi_off($i+14)`,$t1
  189. pslld \$14,$t3
  190. pxor $t2,$sigma
  191. psrld \$18-7,$t2
  192. movdqa $t1,$axb # borrow $axb
  193. pxor $t3,$sigma
  194. pslld \$25-14,$t3
  195. pxor $t2,$sigma
  196. psrld \$10,$t1
  197. movdqa $axb,$t2
  198. psrld \$17,$axb
  199. pxor $t3,$sigma # sigma0(X[i+1])
  200. pslld \$13,$t2
  201. paddd $sigma,$Xi # Xi+=sigma0(e)
  202. pxor $axb,$t1
  203. psrld \$19-17,$axb
  204. pxor $t2,$t1
  205. pslld \$15-13,$t2
  206. pxor $axb,$t1
  207. pxor $t2,$t1 # sigma0(X[i+14])
  208. paddd $t1,$Xi # Xi+=sigma1(X[i+14])
  209. ___
  # @_ now holds only the register names ($i was shifted off above).
  210. &ROUND_00_15($i,@_);
  211. ($Xi,$Xn)=($Xn,$Xi);
  212. }
  # Entry point of sha256_multi_block: load the capability word at
  # OPENSSL_ia32cap_P+4 and branch to the SHA-extension implementation when
  # bit 61 is set.
  213. $code.=<<___;
  214. .text
  215. .extern OPENSSL_ia32cap_P
  216. .globl sha256_multi_block
  217. .type sha256_multi_block,\@function,3
  218. .align 32
  219. sha256_multi_block:
  220. .cfi_startproc
  221. mov OPENSSL_ia32cap_P+4(%rip),%rcx
  222. bt \$61,%rcx # check SHA bit
  223. jc _shaext_shortcut
  224. ___
  # Only when AVX code is being generated: branch to it if bit 28 of the low
  # half of the same word is set.
  225. $code.=<<___ if ($avx);
  226. test \$`1<<28`,%ecx
  227. jnz _avx_shortcut
  228. ___
  # Prologue: keep the caller's %rsp in %rax and save %rbx/%rbp.
  229. $code.=<<___;
  230. mov %rsp,%rax
  231. .cfi_def_cfa_register %rax
  232. push %rbx
  233. .cfi_push %rbx
  234. push %rbp
  235. .cfi_push %rbp
  236. ___
  # Win64 only: spill %xmm6-%xmm15 into a 0xa8-byte area below the pushes.
  237. $code.=<<___ if ($win64);
  238. lea -0xa8(%rsp),%rsp
  239. movaps %xmm6,(%rsp)
  240. movaps %xmm7,0x10(%rsp)
  241. movaps %xmm8,0x20(%rsp)
  242. movaps %xmm9,0x30(%rsp)
  243. movaps %xmm10,-0x78(%rax)
  244. movaps %xmm11,-0x68(%rax)
  245. movaps %xmm12,-0x58(%rax)
  246. movaps %xmm13,-0x48(%rax)
  247. movaps %xmm14,-0x38(%rax)
  248. movaps %xmm15,-0x28(%rax)
  249. ___
  # Allocate $REG_SZ*18 bytes, align the frame down to 256 bytes and store the
  # original %rsp at $REG_SZ*17. %rbx points at $REG_SZ*16, where the per-lane
  # counters are kept. Each .Loop_grande pass saves $num at $REG_SZ*17+8 and
  # then reuses the register for the maximum block count.
  250. $code.=<<___;
  251. sub \$`$REG_SZ*18`, %rsp
  252. and \$-256,%rsp
  253. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  254. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  255. .Lbody:
  256. lea K256+128(%rip),$Tbl
  257. lea `$REG_SZ*16`(%rsp),%rbx
  258. lea 0x80($ctx),$ctx # size optimization
  259. .Loop_grande:
  260. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  261. xor $num,$num
  262. ___
  # Per-lane setup, emitted once for each of the four lanes: load the input
  # pointer and block count from inp[$i], keep the largest count in $num,
  # store the count into the counter array at (%rbx), and for a count <= 0
  # redirect the lane's pointer to $Tbl ("cancel input").
  # NOTE(review): pointer_register() is defined outside this file; presumably
  # it returns the register name sized for the flavour's pointer width.
  263. for($i=0;$i<4;$i++) {
  264. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  265. $code.=<<___;
  266. # input pointer
  267. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  268. # number of blocks
  269. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  270. cmp $num,%ecx
  271. cmovg %ecx,$num # find maximum
  272. test %ecx,%ecx
  273. mov %ecx,`4*$i`(%rbx) # initialize counters
  274. cmovle $Tbl,@ptr[$i] # cancel input
  275. ___
  276. }
  # Skip everything when no lane has work; otherwise load the eight context
  # rows, point %rax 128 bytes into the frame (base for Xi_off operands), load
  # the byte-swap mask into $Xn and enter the per-block loop.
  277. $code.=<<___;
  278. test $num,$num
  279. jz .Ldone
  280. movdqu 0x00-0x80($ctx),$A # load context
  281. lea 128(%rsp),%rax
  282. movdqu 0x20-0x80($ctx),$B
  283. movdqu 0x40-0x80($ctx),$C
  284. movdqu 0x60-0x80($ctx),$D
  285. movdqu 0x80-0x80($ctx),$E
  286. movdqu 0xa0-0x80($ctx),$F
  287. movdqu 0xc0-0x80($ctx),$G
  288. movdqu 0xe0-0x80($ctx),$H
  289. movdqu .Lpbswap(%rip),$Xn
  290. jmp .Loop
  291. .align 32
  292. .Loop:
  293. movdqa $C,$bxc
  294. pxor $B,$bxc # magic seed
  295. ___
  # Rounds 0..15, fully unrolled; rotating @V renames the working variables
  # instead of moving data between registers.
  296. for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
  # $i is 16 here, so Xi_off($i) addresses slot 0. The 16 rounds generated
  # next form the body of .Loop_16_xx, which runs three times (%ecx = 3).
  297. $code.=<<___;
  298. movdqu `&Xi_off($i)`,$Xi
  299. mov \$3,%ecx
  300. jmp .Loop_16_xx
  301. .align 32
  302. .Loop_16_xx:
  303. ___
  304. for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
  # End of one block for all lanes:
  #  - close .Loop_16_xx and reset $Tbl to K256+128;
  #  - redirect the pointer of every lane whose counter is <= 1 to $Tbl;
  #  - build a per-lane mask (counter > 0) in $Xn and add it to the counters,
  #    which decrements the active ones;
  #  - mask the working variables, add the stored context and write it back.
  # After $num blocks, $ctx advances by $REG_SZ and $inp by $REG_SZ/4 elements
  # and .Loop_grande repeats while the saved "original $num" is non-zero.
  305. $code.=<<___;
  306. dec %ecx
  307. jnz .Loop_16_xx
  308. mov \$1,%ecx
  309. lea K256+128(%rip),$Tbl
  310. movdqa (%rbx),$sigma # pull counters
  311. cmp 4*0(%rbx),%ecx # examine counters
  312. pxor $t1,$t1
  313. cmovge $Tbl,@ptr[0] # cancel input
  314. cmp 4*1(%rbx),%ecx
  315. movdqa $sigma,$Xn
  316. cmovge $Tbl,@ptr[1]
  317. cmp 4*2(%rbx),%ecx
  318. pcmpgtd $t1,$Xn # mask value
  319. cmovge $Tbl,@ptr[2]
  320. cmp 4*3(%rbx),%ecx
  321. paddd $Xn,$sigma # counters--
  322. cmovge $Tbl,@ptr[3]
  323. movdqu 0x00-0x80($ctx),$t1
  324. pand $Xn,$A
  325. movdqu 0x20-0x80($ctx),$t2
  326. pand $Xn,$B
  327. movdqu 0x40-0x80($ctx),$t3
  328. pand $Xn,$C
  329. movdqu 0x60-0x80($ctx),$Xi
  330. pand $Xn,$D
  331. paddd $t1,$A
  332. movdqu 0x80-0x80($ctx),$t1
  333. pand $Xn,$E
  334. paddd $t2,$B
  335. movdqu 0xa0-0x80($ctx),$t2
  336. pand $Xn,$F
  337. paddd $t3,$C
  338. movdqu 0xc0-0x80($ctx),$t3
  339. pand $Xn,$G
  340. paddd $Xi,$D
  341. movdqu 0xe0-0x80($ctx),$Xi
  342. pand $Xn,$H
  343. paddd $t1,$E
  344. paddd $t2,$F
  345. movdqu $A,0x00-0x80($ctx)
  346. paddd $t3,$G
  347. movdqu $B,0x20-0x80($ctx)
  348. paddd $Xi,$H
  349. movdqu $C,0x40-0x80($ctx)
  350. movdqu $D,0x60-0x80($ctx)
  351. movdqu $E,0x80-0x80($ctx)
  352. movdqu $F,0xa0-0x80($ctx)
  353. movdqu $G,0xc0-0x80($ctx)
  354. movdqu $H,0xe0-0x80($ctx)
  355. movdqa $sigma,(%rbx) # save counters
  356. movdqa .Lpbswap(%rip),$Xn
  357. dec $num
  358. jnz .Loop
  359. mov `$REG_SZ*17+8`(%rsp),$num
  360. lea $REG_SZ($ctx),$ctx
  361. lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  362. dec $num
  363. jnz .Loop_grande
  364. .Ldone:
  365. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  366. .cfi_def_cfa %rax,8
  367. ___
  # Win64 only: reload %xmm6-%xmm15 from the save area, addressed relative to
  # the original %rsp now in %rax.
  368. $code.=<<___ if ($win64);
  369. movaps -0xb8(%rax),%xmm6
  370. movaps -0xa8(%rax),%xmm7
  371. movaps -0x98(%rax),%xmm8
  372. movaps -0x88(%rax),%xmm9
  373. movaps -0x78(%rax),%xmm10
  374. movaps -0x68(%rax),%xmm11
  375. movaps -0x58(%rax),%xmm12
  376. movaps -0x48(%rax),%xmm13
  377. movaps -0x38(%rax),%xmm14
  378. movaps -0x28(%rax),%xmm15
  379. ___
  # Epilogue: restore %rbp/%rbx and the caller's stack pointer, then return.
  380. $code.=<<___;
  381. mov -16(%rax),%rbp
  382. .cfi_restore %rbp
  383. mov -8(%rax),%rbx
  384. .cfi_restore %rbx
  385. lea (%rax),%rsp
  386. .cfi_def_cfa_register %rsp
  387. .Lepilogue:
  388. ret
  389. .cfi_endproc
  390. .size sha256_multi_block,.-sha256_multi_block
  391. ___
  392. {{{
  # Register allocation for the SHA-extension path, which handles two lanes:
  # scratch in %xmm0-%xmm3, lane-0 message words in %xmm4-%xmm7, lane-1
  # message words in %xmm8-%xmm11, and the two lanes' ABEF/CDGH state in
  # %xmm12-%xmm15.
  393. my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
  394. my @MSG0=map("%xmm$_",(4..7));
  395. my @MSG1=map("%xmm$_",(8..11));
  # Function header and prologue; _shaext_shortcut is the label the generic
  # entry point jumps to.
  396. $code.=<<___;
  397. .type sha256_multi_block_shaext,\@function,3
  398. .align 32
  399. sha256_multi_block_shaext:
  400. .cfi_startproc
  401. _shaext_shortcut:
  402. mov %rsp,%rax
  403. .cfi_def_cfa_register %rax
  404. push %rbx
  405. .cfi_push %rbx
  406. push %rbp
  407. .cfi_push %rbp
  408. ___
  # Win64 only: spill %xmm6-%xmm15.
  409. $code.=<<___ if ($win64);
  410. lea -0xa8(%rsp),%rsp
  411. movaps %xmm6,(%rsp)
  412. movaps %xmm7,0x10(%rsp)
  413. movaps %xmm8,0x20(%rsp)
  414. movaps %xmm9,0x30(%rsp)
  415. movaps %xmm10,-0x78(%rax)
  416. movaps %xmm11,-0x68(%rax)
  417. movaps %xmm12,-0x58(%rax)
  418. movaps %xmm13,-0x48(%rax)
  419. movaps %xmm14,-0x38(%rax)
  420. movaps %xmm15,-0x28(%rax)
  421. ___
  # Frame setup as in the SSE path, except that $num is doubled because each
  # .Loop_grande_shaext pass consumes a pair of lanes, and $Tbl points into
  # K256_shaext instead of K256.
  422. $code.=<<___;
  423. sub \$`$REG_SZ*18`,%rsp
  424. shl \$1,$num # we process pair at a time
  425. and \$-256,%rsp
  426. lea 0x80($ctx),$ctx # size optimization
  427. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  428. .Lbody_shaext:
  429. lea `$REG_SZ*16`(%rsp),%rbx
  430. lea K256_shaext+0x80(%rip),$Tbl
  431. .Loop_grande_shaext:
  432. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  433. xor $num,$num
  434. ___
  # Per-lane setup for the two lanes of the current pair: load pointer and
  # block count from inp[$i], track the maximum count in $num, store the count
  # at (%rbx), and for a count <= 0 redirect the pointer to %rsp ("cancel
  # input").
  # NOTE(review): pointer_register() is defined outside this file; presumably
  # it returns the register name sized for the flavour's pointer width.
  435. for($i=0;$i<2;$i++) {
  436. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  437. $code.=<<___;
  438. # input pointer
  439. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  440. # number of blocks
  441. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  442. cmp $num,%ecx
  443. cmovg %ecx,$num # find maximum
  444. test %ecx,%ecx
  445. mov %ecx,`4*$i`(%rbx) # initialize counters
  446. cmovle %rsp,@ptr[$i] # cancel input
  447. ___
  448. }
  # Load the two lanes' state words from the context rows (8 bytes per row),
  # regroup them into the ABEF/CDGH register layout (see the per-line lane
  # comments), then start .Loop_shaext: load and byte-swap one 64-byte block
  # per lane and run rounds 0-15 for both lanes interleaved. The incoming
  # state is saved at 0x40..0x70(%rsp) ("offload") for the final addition, and
  # the message-schedule instructions for later rounds are started here.
  449. $code.=<<___;
  450. test $num,$num
  451. jz .Ldone_shaext
  452. movq 0x00-0x80($ctx),$ABEF0 # A1.A0
  453. movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
  454. movq 0x40-0x80($ctx),$CDGH0 # C1.C0
  455. movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
  456. movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
  457. movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
  458. movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
  459. movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
  460. punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
  461. punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
  462. punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
  463. punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
  464. movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
  465. movdqa $ABEF0,$ABEF1
  466. movdqa $CDGH0,$CDGH1
  467. punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
  468. punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
  469. punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
  470. punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
  471. pshufd \$0b00011011,$ABEF0,$ABEF0
  472. pshufd \$0b00011011,$CDGH0,$CDGH0
  473. pshufd \$0b00011011,$ABEF1,$ABEF1
  474. pshufd \$0b00011011,$CDGH1,$CDGH1
  475. jmp .Loop_shaext
  476. .align 32
  477. .Loop_shaext:
  478. movdqu 0x00(@ptr[0]),@MSG0[0]
  479. movdqu 0x00(@ptr[1]),@MSG1[0]
  480. movdqu 0x10(@ptr[0]),@MSG0[1]
  481. movdqu 0x10(@ptr[1]),@MSG1[1]
  482. movdqu 0x20(@ptr[0]),@MSG0[2]
  483. pshufb $TMPx,@MSG0[0]
  484. movdqu 0x20(@ptr[1]),@MSG1[2]
  485. pshufb $TMPx,@MSG1[0]
  486. movdqu 0x30(@ptr[0]),@MSG0[3]
  487. lea 0x40(@ptr[0]),@ptr[0]
  488. movdqu 0x30(@ptr[1]),@MSG1[3]
  489. lea 0x40(@ptr[1]),@ptr[1]
  490. movdqa 0*16-0x80($Tbl),$Wi
  491. pshufb $TMPx,@MSG0[1]
  492. paddd @MSG0[0],$Wi
  493. pxor $ABEF0,@MSG0[0] # black magic
  494. movdqa $Wi,$TMP0
  495. movdqa 0*16-0x80($Tbl),$TMP1
  496. pshufb $TMPx,@MSG1[1]
  497. paddd @MSG1[0],$TMP1
  498. movdqa $CDGH0,0x50(%rsp) # offload
  499. sha256rnds2 $ABEF0,$CDGH0 # 0-3
  500. pxor $ABEF1,@MSG1[0] # black magic
  501. movdqa $TMP1,$Wi
  502. movdqa $CDGH1,0x70(%rsp)
  503. sha256rnds2 $ABEF1,$CDGH1 # 0-3
  504. pshufd \$0x0e,$TMP0,$Wi
  505. pxor $ABEF0,@MSG0[0] # black magic
  506. movdqa $ABEF0,0x40(%rsp) # offload
  507. sha256rnds2 $CDGH0,$ABEF0
  508. pshufd \$0x0e,$TMP1,$Wi
  509. pxor $ABEF1,@MSG1[0] # black magic
  510. movdqa $ABEF1,0x60(%rsp)
  511. movdqa 1*16-0x80($Tbl),$TMP0
  512. paddd @MSG0[1],$TMP0
  513. pshufb $TMPx,@MSG0[2]
  514. sha256rnds2 $CDGH1,$ABEF1
  515. movdqa $TMP0,$Wi
  516. movdqa 1*16-0x80($Tbl),$TMP1
  517. paddd @MSG1[1],$TMP1
  518. sha256rnds2 $ABEF0,$CDGH0 # 4-7
  519. movdqa $TMP1,$Wi
  520. prefetcht0 127(@ptr[0])
  521. pshufb $TMPx,@MSG0[3]
  522. pshufb $TMPx,@MSG1[2]
  523. prefetcht0 127(@ptr[1])
  524. sha256rnds2 $ABEF1,$CDGH1 # 4-7
  525. pshufd \$0x0e,$TMP0,$Wi
  526. pshufb $TMPx,@MSG1[3]
  527. sha256msg1 @MSG0[1],@MSG0[0]
  528. sha256rnds2 $CDGH0,$ABEF0
  529. pshufd \$0x0e,$TMP1,$Wi
  530. movdqa 2*16-0x80($Tbl),$TMP0
  531. paddd @MSG0[2],$TMP0
  532. sha256rnds2 $CDGH1,$ABEF1
  533. movdqa $TMP0,$Wi
  534. movdqa 2*16-0x80($Tbl),$TMP1
  535. paddd @MSG1[2],$TMP1
  536. sha256rnds2 $ABEF0,$CDGH0 # 8-11
  537. sha256msg1 @MSG1[1],@MSG1[0]
  538. movdqa $TMP1,$Wi
  539. movdqa @MSG0[3],$TMPx
  540. sha256rnds2 $ABEF1,$CDGH1 # 8-11
  541. pshufd \$0x0e,$TMP0,$Wi
  542. palignr \$4,@MSG0[2],$TMPx
  543. paddd $TMPx,@MSG0[0]
  544. movdqa @MSG1[3],$TMPx
  545. palignr \$4,@MSG1[2],$TMPx
  546. sha256msg1 @MSG0[2],@MSG0[1]
  547. sha256rnds2 $CDGH0,$ABEF0
  548. pshufd \$0x0e,$TMP1,$Wi
  549. movdqa 3*16-0x80($Tbl),$TMP0
  550. paddd @MSG0[3],$TMP0
  551. sha256rnds2 $CDGH1,$ABEF1
  552. sha256msg1 @MSG1[2],@MSG1[1]
  553. movdqa $TMP0,$Wi
  554. movdqa 3*16-0x80($Tbl),$TMP1
  555. paddd $TMPx,@MSG1[0]
  556. paddd @MSG1[3],$TMP1
  557. sha256msg2 @MSG0[3],@MSG0[0]
  558. sha256rnds2 $ABEF0,$CDGH0 # 12-15
  559. movdqa $TMP1,$Wi
  560. movdqa @MSG0[0],$TMPx
  561. palignr \$4,@MSG0[3],$TMPx
  562. sha256rnds2 $ABEF1,$CDGH1 # 12-15
  563. sha256msg2 @MSG1[3],@MSG1[0]
  564. pshufd \$0x0e,$TMP0,$Wi
  565. paddd $TMPx,@MSG0[1]
  566. movdqa @MSG1[0],$TMPx
  567. palignr \$4,@MSG1[3],$TMPx
  568. sha256msg1 @MSG0[3],@MSG0[2]
  569. sha256rnds2 $CDGH0,$ABEF0
  570. pshufd \$0x0e,$TMP1,$Wi
  571. movdqa 4*16-0x80($Tbl),$TMP0
  572. paddd @MSG0[0],$TMP0
  573. sha256rnds2 $CDGH1,$ABEF1
  574. sha256msg1 @MSG1[3],@MSG1[2]
  575. ___
  # Rounds 16..51: nine iterations ($i = 4..12), each emitting four rounds for
  # both lanes together with the matching message-schedule step. $i selects
  # the 16-byte row of the constant table. Rotating @MSG0/@MSG1 after each
  # iteration renames the message registers instead of moving data.
  576. for($i=4;$i<16-3;$i++) {
  577. $code.=<<___;
  578. movdqa $TMP0,$Wi
  579. movdqa $i*16-0x80($Tbl),$TMP1
  580. paddd $TMPx,@MSG1[1]
  581. paddd @MSG1[0],$TMP1
  582. sha256msg2 @MSG0[0],@MSG0[1]
  583. sha256rnds2 $ABEF0,$CDGH0 # 16-19...
  584. movdqa $TMP1,$Wi
  585. movdqa @MSG0[1],$TMPx
  586. palignr \$4,@MSG0[0],$TMPx
  587. sha256rnds2 $ABEF1,$CDGH1 # 16-19...
  588. sha256msg2 @MSG1[0],@MSG1[1]
  589. pshufd \$0x0e,$TMP0,$Wi
  590. paddd $TMPx,@MSG0[2]
  591. movdqa @MSG1[1],$TMPx
  592. palignr \$4,@MSG1[0],$TMPx
  593. sha256msg1 @MSG0[0],@MSG0[3]
  594. sha256rnds2 $CDGH0,$ABEF0
  595. pshufd \$0x0e,$TMP1,$Wi
  596. movdqa `($i+1)*16`-0x80($Tbl),$TMP0
  597. paddd @MSG0[1],$TMP0
  598. sha256rnds2 $CDGH1,$ABEF1
  599. sha256msg1 @MSG1[0],@MSG1[3]
  600. ___
  601. push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
  602. }
  # Rounds 52..63 and end-of-block processing for the lane pair:
  #  - the final rounds are interleaved with pulling the two counters from
  #    (%rbx) and redirecting the pointer of any lane whose counter is <= 1
  #    to %rsp;
  #  - each counter is broadcast and compared against zero to form a per-lane
  #    mask; the state is masked and the values offloaded at 0x40..0x70(%rsp)
  #    are added back; adding the mask to the counters decrements active ones;
  #  - after $num blocks the state is regrouped into per-word rows and stored,
  #    $ctx advances by $REG_SZ/2 and $inp by two elements.
  # The @MSG0/@MSG1 names here refer to the registers as rotated by the
  # preceding generator loop.
  603. $code.=<<___;
  604. movdqa $TMP0,$Wi
  605. movdqa 13*16-0x80($Tbl),$TMP1
  606. paddd $TMPx,@MSG1[1]
  607. paddd @MSG1[0],$TMP1
  608. sha256msg2 @MSG0[0],@MSG0[1]
  609. sha256rnds2 $ABEF0,$CDGH0 # 52-55
  610. movdqa $TMP1,$Wi
  611. movdqa @MSG0[1],$TMPx
  612. palignr \$4,@MSG0[0],$TMPx
  613. sha256rnds2 $ABEF1,$CDGH1 # 52-55
  614. sha256msg2 @MSG1[0],@MSG1[1]
  615. pshufd \$0x0e,$TMP0,$Wi
  616. paddd $TMPx,@MSG0[2]
  617. movdqa @MSG1[1],$TMPx
  618. palignr \$4,@MSG1[0],$TMPx
  619. nop
  620. sha256rnds2 $CDGH0,$ABEF0
  621. pshufd \$0x0e,$TMP1,$Wi
  622. movdqa 14*16-0x80($Tbl),$TMP0
  623. paddd @MSG0[1],$TMP0
  624. sha256rnds2 $CDGH1,$ABEF1
  625. movdqa $TMP0,$Wi
  626. movdqa 14*16-0x80($Tbl),$TMP1
  627. paddd $TMPx,@MSG1[2]
  628. paddd @MSG1[1],$TMP1
  629. sha256msg2 @MSG0[1],@MSG0[2]
  630. nop
  631. sha256rnds2 $ABEF0,$CDGH0 # 56-59
  632. movdqa $TMP1,$Wi
  633. mov \$1,%ecx
  634. pxor @MSG0[1],@MSG0[1] # zero
  635. sha256rnds2 $ABEF1,$CDGH1 # 56-59
  636. sha256msg2 @MSG1[1],@MSG1[2]
  637. pshufd \$0x0e,$TMP0,$Wi
  638. movdqa 15*16-0x80($Tbl),$TMP0
  639. paddd @MSG0[2],$TMP0
  640. movq (%rbx),@MSG0[2] # pull counters
  641. nop
  642. sha256rnds2 $CDGH0,$ABEF0
  643. pshufd \$0x0e,$TMP1,$Wi
  644. movdqa 15*16-0x80($Tbl),$TMP1
  645. paddd @MSG1[2],$TMP1
  646. sha256rnds2 $CDGH1,$ABEF1
  647. movdqa $TMP0,$Wi
  648. cmp 4*0(%rbx),%ecx # examine counters
  649. cmovge %rsp,@ptr[0] # cancel input
  650. cmp 4*1(%rbx),%ecx
  651. cmovge %rsp,@ptr[1]
  652. pshufd \$0x00,@MSG0[2],@MSG1[0]
  653. sha256rnds2 $ABEF0,$CDGH0 # 60-63
  654. movdqa $TMP1,$Wi
  655. pshufd \$0x55,@MSG0[2],@MSG1[1]
  656. movdqa @MSG0[2],@MSG1[2]
  657. sha256rnds2 $ABEF1,$CDGH1 # 60-63
  658. pshufd \$0x0e,$TMP0,$Wi
  659. pcmpgtd @MSG0[1],@MSG1[0]
  660. pcmpgtd @MSG0[1],@MSG1[1]
  661. sha256rnds2 $CDGH0,$ABEF0
  662. pshufd \$0x0e,$TMP1,$Wi
  663. pcmpgtd @MSG0[1],@MSG1[2] # counter mask
  664. movdqa K256_shaext-0x10(%rip),$TMPx
  665. sha256rnds2 $CDGH1,$ABEF1
  666. pand @MSG1[0],$CDGH0
  667. pand @MSG1[1],$CDGH1
  668. pand @MSG1[0],$ABEF0
  669. pand @MSG1[1],$ABEF1
  670. paddd @MSG0[2],@MSG1[2] # counters--
  671. paddd 0x50(%rsp),$CDGH0
  672. paddd 0x70(%rsp),$CDGH1
  673. paddd 0x40(%rsp),$ABEF0
  674. paddd 0x60(%rsp),$ABEF1
  675. movq @MSG1[2],(%rbx) # save counters
  676. dec $num
  677. jnz .Loop_shaext
  678. mov `$REG_SZ*17+8`(%rsp),$num
  679. pshufd \$0b00011011,$ABEF0,$ABEF0
  680. pshufd \$0b00011011,$CDGH0,$CDGH0
  681. pshufd \$0b00011011,$ABEF1,$ABEF1
  682. pshufd \$0b00011011,$CDGH1,$CDGH1
  683. movdqa $ABEF0,@MSG0[0]
  684. movdqa $CDGH0,@MSG0[1]
  685. punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
  686. punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
  687. punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
  688. punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
  689. movq $ABEF0,0x00-0x80($ctx) # A1.A0
  690. psrldq \$8,$ABEF0
  691. movq @MSG0[0],0x80-0x80($ctx) # E1.E0
  692. psrldq \$8,@MSG0[0]
  693. movq $ABEF0,0x20-0x80($ctx) # B1.B0
  694. movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
  695. movq $CDGH0,0x40-0x80($ctx) # C1.C0
  696. psrldq \$8,$CDGH0
  697. movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
  698. psrldq \$8,@MSG0[1]
  699. movq $CDGH0,0x60-0x80($ctx) # D1.D0
  700. movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
  701. lea `$REG_SZ/2`($ctx),$ctx
  702. lea `$inp_elm_size*2`($inp),$inp
  703. dec $num
  704. jnz .Loop_grande_shaext
  705. .Ldone_shaext:
  706. #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  707. ___
  # Win64 only: reload %xmm6-%xmm15 relative to %rax.
  # NOTE(review): the reload of %rax above is commented out in the emitted
  # text; nothing visible in this path writes %rax after the prologue, so it
  # presumably still holds the original %rsp - confirm before changing.
  708. $code.=<<___ if ($win64);
  709. movaps -0xb8(%rax),%xmm6
  710. movaps -0xa8(%rax),%xmm7
  711. movaps -0x98(%rax),%xmm8
  712. movaps -0x88(%rax),%xmm9
  713. movaps -0x78(%rax),%xmm10
  714. movaps -0x68(%rax),%xmm11
  715. movaps -0x58(%rax),%xmm12
  716. movaps -0x48(%rax),%xmm13
  717. movaps -0x38(%rax),%xmm14
  718. movaps -0x28(%rax),%xmm15
  719. ___
  # Epilogue: restore %rbp/%rbx and the caller's stack pointer, then return.
  720. $code.=<<___;
  721. mov -16(%rax),%rbp
  722. .cfi_restore %rbp
  723. mov -8(%rax),%rbx
  724. .cfi_restore %rbx
  725. lea (%rax),%rsp
  726. .cfi_def_cfa_register %rsp
  727. .Lepilogue_shaext:
  728. ret
  729. .cfi_endproc
  730. .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
  731. ___
  732. }}}
  733. if ($avx) {{{
  # ROUND_00_15_avx($i, $a, $b, $c, $d, $e, $f, $g, $h)
  #
  # AVX counterpart of ROUND_00_15: appends one SHA-256 round to $code using
  # three-operand VEX instructions. The message-word gather depends on
  # $REG_SZ: with 16 the word is collected from @ptr[0..3], with 32 from
  # @ptr[0..7], the two halves being joined by vinserti128. Each gather ends
  # with the byte swap through $Xn; on round 15 the input pointers are also
  # advanced by 64 bytes. For $i>=16 none of the gather sections is emitted.
  734. sub ROUND_00_15_avx {
  735. my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  # 4-lane gather, rounds 0..14.
  736. $code.=<<___ if ($i<15 && $REG_SZ==16);
  737. vmovd `4*$i`(@ptr[0]),$Xi
  738. vmovd `4*$i`(@ptr[1]),$t1
  739. vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
  740. vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
  741. vpunpckldq $t1,$Xi,$Xi
  742. vpshufb $Xn,$Xi,$Xi
  743. ___
  # 4-lane gather, round 15 (also advances the pointers).
  744. $code.=<<___ if ($i==15 && $REG_SZ==16);
  745. vmovd `4*$i`(@ptr[0]),$Xi
  746. lea `16*4`(@ptr[0]),@ptr[0]
  747. vmovd `4*$i`(@ptr[1]),$t1
  748. lea `16*4`(@ptr[1]),@ptr[1]
  749. vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
  750. lea `16*4`(@ptr[2]),@ptr[2]
  751. vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
  752. lea `16*4`(@ptr[3]),@ptr[3]
  753. vpunpckldq $t1,$Xi,$Xi
  754. vpshufb $Xn,$Xi,$Xi
  755. ___
  # 8-lane gather, rounds 0..14.
  756. $code.=<<___ if ($i<15 && $REG_SZ==32);
  757. vmovd `4*$i`(@ptr[0]),$Xi
  758. vmovd `4*$i`(@ptr[4]),$t1
  759. vmovd `4*$i`(@ptr[1]),$t2
  760. vmovd `4*$i`(@ptr[5]),$t3
  761. vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
  762. vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
  763. vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
  764. vpunpckldq $t2,$Xi,$Xi
  765. vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
  766. vpunpckldq $t3,$t1,$t1
  767. vinserti128 $t1,$Xi,$Xi
  768. vpshufb $Xn,$Xi,$Xi
  769. ___
  # 8-lane gather, round 15 (also advances the pointers).
  770. $code.=<<___ if ($i==15 && $REG_SZ==32);
  771. vmovd `4*$i`(@ptr[0]),$Xi
  772. lea `16*4`(@ptr[0]),@ptr[0]
  773. vmovd `4*$i`(@ptr[4]),$t1
  774. lea `16*4`(@ptr[4]),@ptr[4]
  775. vmovd `4*$i`(@ptr[1]),$t2
  776. lea `16*4`(@ptr[1]),@ptr[1]
  777. vmovd `4*$i`(@ptr[5]),$t3
  778. lea `16*4`(@ptr[5]),@ptr[5]
  779. vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
  780. lea `16*4`(@ptr[2]),@ptr[2]
  781. vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
  782. lea `16*4`(@ptr[6]),@ptr[6]
  783. vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
  784. lea `16*4`(@ptr[3]),@ptr[3]
  785. vpunpckldq $t2,$Xi,$Xi
  786. vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
  787. lea `16*4`(@ptr[7]),@ptr[7]
  788. vpunpckldq $t3,$t1,$t1
  789. vinserti128 $t1,$Xi,$Xi
  790. vpshufb $Xn,$Xi,$Xi
  791. ___
  # Round body: stores $Xi into its X[] slot, accumulates h, K[round],
  # Sigma1(e) and Ch(e,f,g) into $Xi, adds $Xi to d and forms the new value
  # in $h. Prefetches of the next block are emitted on round 15 only, those
  # for @ptr[4..7] only when $REG_SZ==32.
  792. $code.=<<___;
  793. vpsrld \$6,$e,$sigma
  794. vpslld \$26,$e,$t3
  795. vmovdqu $Xi,`&Xi_off($i)`
  796. vpaddd $h,$Xi,$Xi # Xi+=h
  797. vpsrld \$11,$e,$t2
  798. vpxor $t3,$sigma,$sigma
  799. vpslld \$21,$e,$t3
  800. vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
  801. vpxor $t2,$sigma,$sigma
  802. vpsrld \$25,$e,$t2
  803. vpxor $t3,$sigma,$sigma
  804. `"prefetcht0 63(@ptr[0])" if ($i==15)`
  805. vpslld \$7,$e,$t3
  806. vpandn $g,$e,$t1
  807. vpand $f,$e,$axb # borrow $axb
  808. `"prefetcht0 63(@ptr[1])" if ($i==15)`
  809. vpxor $t2,$sigma,$sigma
  810. vpsrld \$2,$a,$h # borrow $h
  811. vpxor $t3,$sigma,$sigma # Sigma1(e)
  812. `"prefetcht0 63(@ptr[2])" if ($i==15)`
  813. vpslld \$30,$a,$t2
  814. vpxor $axb,$t1,$t1 # Ch(e,f,g)
  815. vpxor $a,$b,$axb # a^b, b^c in next round
  816. `"prefetcht0 63(@ptr[3])" if ($i==15)`
  817. vpxor $t2,$h,$h
  818. vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
  819. vpsrld \$13,$a,$t2
  820. `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
  821. vpslld \$19,$a,$t3
  822. vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
  823. vpand $axb,$bxc,$bxc
  824. `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
  825. vpxor $t2,$h,$sigma
  826. vpsrld \$22,$a,$t2
  827. vpxor $t3,$sigma,$sigma
  828. `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
  829. vpslld \$10,$a,$t3
  830. vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  831. vpaddd $Xi,$d,$d # d+=Xi
  832. `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
  833. vpxor $t2,$sigma,$sigma
  834. vpxor $t3,$sigma,$sigma # Sigma0(a)
  835. vpaddd $Xi,$h,$h # h+=Xi
  836. vpaddd $sigma,$h,$h # h+=Sigma0(a)
  837. ___
  # Every 8th round moves $Tbl forward by 32*8 bytes.
  838. $code.=<<___ if (($i%8)==7);
  839. add \$`32*8`,$Tbl
  840. ___
  # The a^b just computed in $axb is the b^c of the next round: swap the names.
  841. ($axb,$bxc)=($bxc,$axb);
  842. }
  # ROUND_16_XX_avx($i, $a..$h)
  #
  # AVX counterpart of ROUND_16_XX: loads X[i+1] into $Xn, adds X[i+9],
  # sigma0(X[i+1]) and sigma1(X[i+14]) into $Xi, then emits the round through
  # ROUND_00_15_avx and swaps $Xi/$Xn for the next round.
  #
  # NOTE(review): two trailing comments in the emitted text look mislabelled:
  # "Xi+=sigma0(e)" adds the value the preceding line calls sigma0(X[i+1]),
  # and "sigma0(X[i+14])" marks the value the next line adds as
  # sigma1(X[i+14]). The text is left untouched here.
  843. sub ROUND_16_XX_avx {
  844. my $i=shift;
  845. $code.=<<___;
  846. vmovdqu `&Xi_off($i+1)`,$Xn
  847. vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
  848. vpsrld \$3,$Xn,$sigma
  849. vpsrld \$7,$Xn,$t2
  850. vpslld \$25,$Xn,$t3
  851. vpxor $t2,$sigma,$sigma
  852. vpsrld \$18,$Xn,$t2
  853. vpxor $t3,$sigma,$sigma
  854. vpslld \$14,$Xn,$t3
  855. vmovdqu `&Xi_off($i+14)`,$t1
  856. vpsrld \$10,$t1,$axb # borrow $axb
  857. vpxor $t2,$sigma,$sigma
  858. vpsrld \$17,$t1,$t2
  859. vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
  860. vpslld \$15,$t1,$t3
  861. vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
  862. vpxor $t2,$axb,$sigma
  863. vpsrld \$19,$t1,$t2
  864. vpxor $t3,$sigma,$sigma
  865. vpslld \$13,$t1,$t3
  866. vpxor $t2,$sigma,$sigma
  867. vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
  868. vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
  869. ___
  # @_ now holds only the register names ($i was shifted off above).
  870. &ROUND_00_15_avx($i,@_);
  871. ($Xi,$Xn)=($Xn,$Xi);
  872. }
  # Header of the AVX implementation; _avx_shortcut is the label the generic
  # entry point jumps to.
  873. $code.=<<___;
  874. .type sha256_multi_block_avx,\@function,3
  875. .align 32
  876. sha256_multi_block_avx:
  877. .cfi_startproc
  878. _avx_shortcut:
  879. ___
  # Only when AVX2 code is generated too: for $num >= 2, test bit 5 of the
  # upper half of the capability word still in %rcx and branch to
  # _avx2_shortcut if it is set; otherwise fall into the AVX code at .Lavx.
  880. $code.=<<___ if ($avx>1);
  881. shr \$32,%rcx
  882. cmp \$2,$num
  883. jb .Lavx
  884. test \$`1<<5`,%ecx
  885. jnz _avx2_shortcut
  886. jmp .Lavx
  887. .align 32
  888. .Lavx:
  889. ___
  # Prologue: keep the caller's %rsp in %rax and save %rbx/%rbp.
  890. $code.=<<___;
  891. mov %rsp,%rax
  892. .cfi_def_cfa_register %rax
  893. push %rbx
  894. .cfi_push %rbx
  895. push %rbp
  896. .cfi_push %rbp
  897. ___
  # Win64 only: spill %xmm6-%xmm15.
  898. $code.=<<___ if ($win64);
  899. lea -0xa8(%rsp),%rsp
  900. movaps %xmm6,(%rsp)
  901. movaps %xmm7,0x10(%rsp)
  902. movaps %xmm8,0x20(%rsp)
  903. movaps %xmm9,0x30(%rsp)
  904. movaps %xmm10,-0x78(%rax)
  905. movaps %xmm11,-0x68(%rax)
  906. movaps %xmm12,-0x58(%rax)
  907. movaps %xmm13,-0x48(%rax)
  908. movaps %xmm14,-0x38(%rax)
  909. movaps %xmm15,-0x28(%rax)
  910. ___
  # Frame setup, same layout as the SSE path: original %rsp at $REG_SZ*17,
  # saved $num at $REG_SZ*17+8, per-lane counters at $REG_SZ*16 (%rbx).
  911. $code.=<<___;
  912. sub \$`$REG_SZ*18`, %rsp
  913. and \$-256,%rsp
  914. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  915. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  916. .Lbody_avx:
  917. lea K256+128(%rip),$Tbl
  918. lea `$REG_SZ*16`(%rsp),%rbx
  919. lea 0x80($ctx),$ctx # size optimization
  920. .Loop_grande_avx:
  921. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  922. xor $num,$num
  923. ___
  # Per-lane setup for the four lanes: load pointer and block count from
  # inp[$i], keep the largest count in $num, store the count at (%rbx), and
  # for a count <= 0 redirect the pointer to $Tbl ("cancel input").
  # NOTE(review): pointer_register() is defined outside this file; presumably
  # it returns the register name sized for the flavour's pointer width.
  924. for($i=0;$i<4;$i++) {
  925. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  926. $code.=<<___;
  927. # input pointer
  928. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  929. # number of blocks
  930. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  931. cmp $num,%ecx
  932. cmovg %ecx,$num # find maximum
  933. test %ecx,%ecx
  934. mov %ecx,`4*$i`(%rbx) # initialize counters
  935. cmovle $Tbl,@ptr[$i] # cancel input
  936. ___
  937. }
  # Skip everything when no lane has work; otherwise load the eight context
  # rows, point %rax 128 bytes into the frame (base for Xi_off operands), load
  # the byte-swap mask into $Xn and enter the per-block loop.
  938. $code.=<<___;
  939. test $num,$num
  940. jz .Ldone_avx
  941. vmovdqu 0x00-0x80($ctx),$A # load context
  942. lea 128(%rsp),%rax
  943. vmovdqu 0x20-0x80($ctx),$B
  944. vmovdqu 0x40-0x80($ctx),$C
  945. vmovdqu 0x60-0x80($ctx),$D
  946. vmovdqu 0x80-0x80($ctx),$E
  947. vmovdqu 0xa0-0x80($ctx),$F
  948. vmovdqu 0xc0-0x80($ctx),$G
  949. vmovdqu 0xe0-0x80($ctx),$H
  950. vmovdqu .Lpbswap(%rip),$Xn
  951. jmp .Loop_avx
  952. .align 32
  953. .Loop_avx:
  954. vpxor $B,$C,$bxc # magic seed
  955. ___
  # Rounds 0..15, fully unrolled; rotating @V renames the working variables.
  956. for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
  # $i is 16 here, so Xi_off($i) addresses slot 0. The 16 rounds generated
  # next form the body of .Loop_16_xx_avx, which runs three times (%ecx = 3).
  957. $code.=<<___;
  958. vmovdqu `&Xi_off($i)`,$Xi
  959. mov \$3,%ecx
  960. jmp .Loop_16_xx_avx
  961. .align 32
  962. .Loop_16_xx_avx:
  963. ___
  964. for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
  # Close .Loop_16_xx_avx and reset $Tbl to K256+128.
  965. $code.=<<___;
  966. dec %ecx
  967. jnz .Loop_16_xx_avx
  968. mov \$1,%ecx
  969. lea K256+128(%rip),$Tbl
  970. ___
  # Redirect the pointer of every lane whose counter is <= 1 to $Tbl.
  971. for($i=0;$i<4;$i++) {
  972. $code.=<<___;
  973. cmp `4*$i`(%rbx),%ecx # examine counters
  974. cmovge $Tbl,@ptr[$i] # cancel input
  975. ___
  976. }
  # Build the per-lane mask (counter > 0) in $Xn and add it to the counters,
  # which decrements the active ones; mask the working variables, add the
  # stored context and write it back. After $num blocks, $ctx advances by
  # $REG_SZ and $inp by $REG_SZ/4 elements, and .Loop_grande_avx repeats while
  # the saved "original $num" is non-zero. vzeroupper is issued before the
  # SSE-encoded restore code that follows.
  977. $code.=<<___;
  978. vmovdqa (%rbx),$sigma # pull counters
  979. vpxor $t1,$t1,$t1
  980. vmovdqa $sigma,$Xn
  981. vpcmpgtd $t1,$Xn,$Xn # mask value
  982. vpaddd $Xn,$sigma,$sigma # counters--
  983. vmovdqu 0x00-0x80($ctx),$t1
  984. vpand $Xn,$A,$A
  985. vmovdqu 0x20-0x80($ctx),$t2
  986. vpand $Xn,$B,$B
  987. vmovdqu 0x40-0x80($ctx),$t3
  988. vpand $Xn,$C,$C
  989. vmovdqu 0x60-0x80($ctx),$Xi
  990. vpand $Xn,$D,$D
  991. vpaddd $t1,$A,$A
  992. vmovdqu 0x80-0x80($ctx),$t1
  993. vpand $Xn,$E,$E
  994. vpaddd $t2,$B,$B
  995. vmovdqu 0xa0-0x80($ctx),$t2
  996. vpand $Xn,$F,$F
  997. vpaddd $t3,$C,$C
  998. vmovdqu 0xc0-0x80($ctx),$t3
  999. vpand $Xn,$G,$G
  1000. vpaddd $Xi,$D,$D
  1001. vmovdqu 0xe0-0x80($ctx),$Xi
  1002. vpand $Xn,$H,$H
  1003. vpaddd $t1,$E,$E
  1004. vpaddd $t2,$F,$F
  1005. vmovdqu $A,0x00-0x80($ctx)
  1006. vpaddd $t3,$G,$G
  1007. vmovdqu $B,0x20-0x80($ctx)
  1008. vpaddd $Xi,$H,$H
  1009. vmovdqu $C,0x40-0x80($ctx)
  1010. vmovdqu $D,0x60-0x80($ctx)
  1011. vmovdqu $E,0x80-0x80($ctx)
  1012. vmovdqu $F,0xa0-0x80($ctx)
  1013. vmovdqu $G,0xc0-0x80($ctx)
  1014. vmovdqu $H,0xe0-0x80($ctx)
  1015. vmovdqu $sigma,(%rbx) # save counters
  1016. vmovdqu .Lpbswap(%rip),$Xn
  1017. dec $num
  1018. jnz .Loop_avx
  1019. mov `$REG_SZ*17+8`(%rsp),$num
  1020. lea $REG_SZ($ctx),$ctx
  1021. lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  1022. dec $num
  1023. jnz .Loop_grande_avx
  1024. .Ldone_avx:
  1025. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  1026. .cfi_def_cfa %rax,8
  1027. vzeroupper
  1028. ___
  # Win64 only: reload %xmm6-%xmm15 relative to the original %rsp in %rax.
  1029. $code.=<<___ if ($win64);
  1030. movaps -0xb8(%rax),%xmm6
  1031. movaps -0xa8(%rax),%xmm7
  1032. movaps -0x98(%rax),%xmm8
  1033. movaps -0x88(%rax),%xmm9
  1034. movaps -0x78(%rax),%xmm10
  1035. movaps -0x68(%rax),%xmm11
  1036. movaps -0x58(%rax),%xmm12
  1037. movaps -0x48(%rax),%xmm13
  1038. movaps -0x38(%rax),%xmm14
  1039. movaps -0x28(%rax),%xmm15
  1040. ___
  # Epilogue: restore %rbp/%rbx and the caller's stack pointer, then return.
  1041. $code.=<<___;
  1042. mov -16(%rax),%rbp
  1043. .cfi_restore %rbp
  1044. mov -8(%rax),%rbx
  1045. .cfi_restore %rbx
  1046. lea (%rax),%rsp
  1047. .cfi_def_cfa_register %rsp
  1048. .Lepilogue_avx:
  1049. ret
  1050. .cfi_endproc
  1051. .size sha256_multi_block_avx,.-sha256_multi_block_avx
  1052. ___
  # AVX2 flavour of the multi-block routine. When $avx>1 this block appends the
  # assembly text of sha256_multi_block_avx2 (eight lanes, %ymm registers) to
  # $code.
  1053. if ($avx>1) {
  # Expand every `...` expression accumulated in $code so far, before $REG_SZ
  # and the register maps are redefined for the AVX2 code below.
  # NOTE(review): presumably needed because helpers invoked from those
  # expressions (e.g. Xi_off) read $REG_SZ when they are evaluated -- confirm.
  1054. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  # From here on $REG_SZ is 32.
  1055. $REG_SZ=32;
  # Eight general-purpose registers, one input pointer per lane.
  1056. @ptr=map("%r$_",(12..15,8..11));
  # %ymm8-%ymm15 carry the working variables A..H; @V is rotated after each
  # generated round further down.
  1057. @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
  # %ymm0-%ymm7 are the temporaries.
  1058. ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
  # Prologue: keep the caller's %rsp in %rax and push %rbx, %rbp, %r12-%r15.
  1059. $code.=<<___;
  1060. .type sha256_multi_block_avx2,\@function,3
  1061. .align 32
  1062. sha256_multi_block_avx2:
  1063. .cfi_startproc
  1064. _avx2_shortcut:
  1065. mov %rsp,%rax
  1066. .cfi_def_cfa_register %rax
  1067. push %rbx
  1068. .cfi_push %rbx
  1069. push %rbp
  1070. .cfi_push %rbp
  1071. push %r12
  1072. .cfi_push %r12
  1073. push %r13
  1074. .cfi_push %r13
  1075. push %r14
  1076. .cfi_push %r14
  1077. push %r15
  1078. .cfi_push %r15
  1079. ___
  # Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15 in them (the last
  # four stores are addressed relative to %rax, the caller's %rsp).
  1080. $code.=<<___ if ($win64);
  1081. lea -0xa8(%rsp),%rsp
  1082. movaps %xmm6,(%rsp)
  1083. movaps %xmm7,0x10(%rsp)
  1084. movaps %xmm8,0x20(%rsp)
  1085. movaps %xmm9,0x30(%rsp)
  1086. movaps %xmm10,0x40(%rsp)
  1087. movaps %xmm11,0x50(%rsp)
  1088. movaps %xmm12,-0x78(%rax)
  1089. movaps %xmm13,-0x68(%rax)
  1090. movaps %xmm14,-0x58(%rax)
  1091. movaps %xmm15,-0x48(%rax)
  1092. ___
  # Frame setup: carve out $REG_SZ*18 bytes aligned to 256, remember the
  # original %rsp at $REG_SZ*17(%rsp), point $Tbl at K256+128 and bias $ctx by
  # 0x80. At .Loop_grande_avx2 the caller's $num is parked at
  # $REG_SZ*17+8(%rsp), $num is cleared so it can collect a maximum, and %rbx
  # is pointed at the per-lane counter slots at $REG_SZ*16(%rsp).
  1093. $code.=<<___;
  1094. sub \$`$REG_SZ*18`, %rsp
  1095. and \$-256,%rsp
  1096. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  1097. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  1098. .Lbody_avx2:
  1099. lea K256+128(%rip),$Tbl
  1100. lea 0x80($ctx),$ctx # size optimization
  1101. .Loop_grande_avx2:
  1102. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  1103. xor $num,$num
  1104. lea `$REG_SZ*16`(%rsp),%rbx
  1105. ___
  # Per-lane setup, emitted once for each of the 8 lanes: load the lane's input
  # pointer and block count from the descriptor at $inp, keep the largest count
  # in $num, store the count in the lane's counter slot, and replace the
  # pointer by $Tbl when the count is <= 0.
  1106. for($i=0;$i<8;$i++) {
  # NOTE(review): pointer_register is defined outside this chunk; presumably it
  # picks the register name form matching the pointer size of $flavour.
  1107. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  1108. $code.=<<___;
  1109. # input pointer
  1110. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  1111. # number of blocks
  1112. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  1113. cmp $num,%ecx
  1114. cmovg %ecx,$num # find maximum
  1115. test %ecx,%ecx
  1116. mov %ecx,`4*$i`(%rbx) # initialize counters
  1117. cmovle $Tbl,@ptr[$i] # cancel input
  1118. ___
  1119. }
  # Load the eight context rows into A..H, set %rax=%rsp+128 and
  # %rbx=%rsp+256+128, load the .Lpbswap mask into $Xn and enter .Loop_avx2,
  # whose first instruction seeds $bxc with B^C.
  1120. $code.=<<___;
  1121. vmovdqu 0x00-0x80($ctx),$A # load context
  1122. lea 128(%rsp),%rax
  1123. vmovdqu 0x20-0x80($ctx),$B
  1124. lea 256+128(%rsp),%rbx
  1125. vmovdqu 0x40-0x80($ctx),$C
  1126. vmovdqu 0x60-0x80($ctx),$D
  1127. vmovdqu 0x80-0x80($ctx),$E
  1128. vmovdqu 0xa0-0x80($ctx),$F
  1129. vmovdqu 0xc0-0x80($ctx),$G
  1130. vmovdqu 0xe0-0x80($ctx),$H
  1131. vmovdqu .Lpbswap(%rip),$Xn
  1132. jmp .Loop_avx2
  1133. .align 32
  1134. .Loop_avx2:
  1135. vpxor $B,$C,$bxc # magic seed
  1136. ___
  # Rounds 0..15, fully unrolled; @V is rotated by one position after every
  # round. The global $i is deliberately left at 16 for the code that follows.
  1137. for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
  # $i is interpolated right here, so the first line becomes `&Xi_off(16)`.
  # %ecx counts three passes over the 16-round body at .Loop_16_xx_avx2.
  1138. $code.=<<___;
  1139. vmovdqu `&Xi_off($i)`,$Xi
  1140. mov \$3,%ecx
  1141. jmp .Loop_16_xx_avx2
  1142. .align 32
  1143. .Loop_16_xx_avx2:
  1144. ___
  # Body of .Loop_16_xx_avx2: 16 rounds generated for $i = 16..31 (the loop
  # continues from the $i value left by the loop above).
  1145. for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
  # Close the 16-round loop, then prepare the lane bookkeeping: %ecx=1, %rbx
  # back at the counter slots, $Tbl reloaded with K256+128.
  1146. $code.=<<___;
  1147. dec %ecx
  1148. jnz .Loop_16_xx_avx2
  1149. mov \$1,%ecx
  1150. lea `$REG_SZ*16`(%rsp),%rbx
  1151. lea K256+128(%rip),$Tbl
  1152. ___
  # For each lane: if 1 >= counter (signed), replace the lane's input pointer
  # by $Tbl.
  1153. for($i=0;$i<8;$i++) {
  1154. $code.=<<___;
  1155. cmp `4*$i`(%rbx),%ecx # examine counters
  1156. cmovge $Tbl,@ptr[$i] # cancel input
  1157. ___
  1158. }
  # Fold the block into the stored context: build a per-lane mask from
  # counters > 0, add that mask to the counters, AND each of A..H with the mask
  # and add the value previously stored in the context, then write A..H and the
  # counters back. %rbx and $Xn are restored and .Loop_avx2 repeats while the
  # decremented $num is non-zero. The instructions that would move on to a
  # further group of inputs (.Loop_grande_avx2) are commented out in the
  # emitted text. .Ldone_avx2 reloads the original %rsp into %rax.
  1159. $code.=<<___;
  1160. vmovdqa (%rbx),$sigma # pull counters
  1161. vpxor $t1,$t1,$t1
  1162. vmovdqa $sigma,$Xn
  1163. vpcmpgtd $t1,$Xn,$Xn # mask value
  1164. vpaddd $Xn,$sigma,$sigma # counters--
  1165. vmovdqu 0x00-0x80($ctx),$t1
  1166. vpand $Xn,$A,$A
  1167. vmovdqu 0x20-0x80($ctx),$t2
  1168. vpand $Xn,$B,$B
  1169. vmovdqu 0x40-0x80($ctx),$t3
  1170. vpand $Xn,$C,$C
  1171. vmovdqu 0x60-0x80($ctx),$Xi
  1172. vpand $Xn,$D,$D
  1173. vpaddd $t1,$A,$A
  1174. vmovdqu 0x80-0x80($ctx),$t1
  1175. vpand $Xn,$E,$E
  1176. vpaddd $t2,$B,$B
  1177. vmovdqu 0xa0-0x80($ctx),$t2
  1178. vpand $Xn,$F,$F
  1179. vpaddd $t3,$C,$C
  1180. vmovdqu 0xc0-0x80($ctx),$t3
  1181. vpand $Xn,$G,$G
  1182. vpaddd $Xi,$D,$D
  1183. vmovdqu 0xe0-0x80($ctx),$Xi
  1184. vpand $Xn,$H,$H
  1185. vpaddd $t1,$E,$E
  1186. vpaddd $t2,$F,$F
  1187. vmovdqu $A,0x00-0x80($ctx)
  1188. vpaddd $t3,$G,$G
  1189. vmovdqu $B,0x20-0x80($ctx)
  1190. vpaddd $Xi,$H,$H
  1191. vmovdqu $C,0x40-0x80($ctx)
  1192. vmovdqu $D,0x60-0x80($ctx)
  1193. vmovdqu $E,0x80-0x80($ctx)
  1194. vmovdqu $F,0xa0-0x80($ctx)
  1195. vmovdqu $G,0xc0-0x80($ctx)
  1196. vmovdqu $H,0xe0-0x80($ctx)
  1197. vmovdqu $sigma,(%rbx) # save counters
  1198. lea 256+128(%rsp),%rbx
  1199. vmovdqu .Lpbswap(%rip),$Xn
  1200. dec $num
  1201. jnz .Loop_avx2
  1202. #mov `$REG_SZ*17+8`(%rsp),$num
  1203. #lea $REG_SZ($ctx),$ctx
  1204. #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  1205. #dec $num
  1206. #jnz .Loop_grande_avx2
  1207. .Ldone_avx2:
  1208. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  1209. .cfi_def_cfa %rax,8
  1210. vzeroupper
  1211. ___
  # Win64 only: reload %xmm6-%xmm15 from below the original stack pointer.
  1212. $code.=<<___ if ($win64);
  1213. movaps -0xd8(%rax),%xmm6
  1214. movaps -0xc8(%rax),%xmm7
  1215. movaps -0xb8(%rax),%xmm8
  1216. movaps -0xa8(%rax),%xmm9
  1217. movaps -0x98(%rax),%xmm10
  1218. movaps -0x88(%rax),%xmm11
  1219. movaps -0x78(%rax),%xmm12
  1220. movaps -0x68(%rax),%xmm13
  1221. movaps -0x58(%rax),%xmm14
  1222. movaps -0x48(%rax),%xmm15
  1223. ___
  # Epilogue: reload %r15..%rbx from the six slots below the original %rsp,
  # switch %rsp back and return.
  1224. $code.=<<___;
  1225. mov -48(%rax),%r15
  1226. .cfi_restore %r15
  1227. mov -40(%rax),%r14
  1228. .cfi_restore %r14
  1229. mov -32(%rax),%r13
  1230. .cfi_restore %r13
  1231. mov -24(%rax),%r12
  1232. .cfi_restore %r12
  1233. mov -16(%rax),%rbp
  1234. .cfi_restore %rbp
  1235. mov -8(%rax),%rbx
  1236. .cfi_restore %rbx
  1237. lea (%rax),%rsp
  1238. .cfi_def_cfa_register %rsp
  1239. .Lepilogue_avx2:
  1240. ret
  1241. .cfi_endproc
  1242. .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
  1243. ___
  1244. } }}}
  # Open the K256 constant table: 256-byte alignment followed by the K256
  # label. The table rows themselves are appended by the TABLE() call below.
  1245. $code.=<<___;
  1246. .align 256
  1247. K256:
  1248. ___
  # Append one table entry per argument to $code. Every argument is written
  # out eight times, as two ".long" rows of four copies each.
  sub TABLE {
      for my $word (@_) {
          my $row = join(',', ($word) x 4);
          $code .= ".long $row\n" x 2;
      }
  }
  # Body of the K256 table: the 64 round constants, each replicated eight
  # times by TABLE().
  1257. &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
  1258. 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
  1259. 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
  1260. 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
  1261. 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
  1262. 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
  1263. 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
  1264. 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
  1265. 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
  1266. 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
  1267. 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
  1268. 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
  1269. 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
  1270. 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
  1271. 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
  1272. 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
  # Data that directly follows the replicated table:
  #   .Lpbswap    - two identical 16-byte rows; the AVX code paths load it
  #                 into $Xn;
  #   K256_shaext - the same 64 constants, one copy each;
  #   the trailing .asciz identification string.
  1273. $code.=<<___;
  1274. .Lpbswap:
  1275. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
  1276. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
  1277. K256_shaext:
  1278. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  1279. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  1280. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  1281. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  1282. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  1283. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  1284. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  1285. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  1286. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  1287. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  1288. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  1289. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  1290. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  1291. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  1292. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  1293. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  1294. .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1295. ___
  # Win64 structured-exception-handling (SEH) support, generated only when
  # $win64 is set. Appends to $code:
  #   * se_handler   - unwind handler that restores Rbx, Rbp and Xmm6-Xmm15;
  #   * avx2_handler - variant for sha256_multi_block_avx2 (only if $avx>1)
  #                    that additionally restores R12-R15;
  #   * .pdata/.xdata entries binding every routine to its handler and to the
  #     body/epilogue label pair the handler receives as HandlerData[].
  if ($win64) {
  # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  $rec="%rcx";
  $frame="%rdx";
  $context="%r8";
  $disp="%r9";

  # se_handler:
  #   - Rip below HandlerData[0] (body label): the frame is not set up yet,
  #     context->Rax is used as the frame base;
  #   - Rip at or above HandlerData[1] (epilogue label): context->Rsp is used
  #     as the frame base;
  #   - otherwise: the original stack pointer saved by the routine is read
  #     from 16*17 bytes above context->Rsp, then Rbx, Rbp and 20 quadwords of
  #     Xmm6-Xmm15 are copied back into the CONTEXT record.
  # .Lin_prologue then restores Rsp/Rsi/Rdi, copies the record (154 quadwords)
  # to disp->ContextRecord, calls RtlVirtualUnwind and returns 1
  # (ExceptionContinueSearch).
  $code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lbody
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
mov `16*17`(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
lea -24-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
  # avx2_handler performs the same prologue/epilogue range checks and then
  # joins se_handler at .Lin_prologue. The AVX2 routine stores its original
  # %rsp at 32*17 bytes above its aligned stack pointer, so the saved value
  # must be read relative to %rax (which holds context->Rsp at that point),
  # not relative to the CONTEXT record itself. Besides Rbx/Rbp it also
  # restores R12-R15, and the Xmm save area sits 56 bytes (six pushed
  # registers) plus 10*16 bytes below the original stack pointer.
  $code.=<<___ if ($avx>1);
.type avx2_handler,\@abi-omnipotent
.align 16
avx2_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<body label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
mov `32*17`(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
lea -56-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lin_prologue
.size avx2_handler,.-avx2_handler
___
  # .pdata: one (begin, end, unwind-info) triple per generated routine. The
  # AVX and AVX2 entries are emitted only when those routines were generated.
  $code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_sha256_multi_block
.rva .LSEH_end_sha256_multi_block
.rva .LSEH_info_sha256_multi_block
.rva .LSEH_begin_sha256_multi_block_shaext
.rva .LSEH_end_sha256_multi_block_shaext
.rva .LSEH_info_sha256_multi_block_shaext
___
  $code.=<<___ if ($avx);
.rva .LSEH_begin_sha256_multi_block_avx
.rva .LSEH_end_sha256_multi_block_avx
.rva .LSEH_info_sha256_multi_block_avx
___
  $code.=<<___ if ($avx>1);
.rva .LSEH_begin_sha256_multi_block_avx2
.rva .LSEH_end_sha256_multi_block_avx2
.rva .LSEH_info_sha256_multi_block_avx2
___
  # .xdata: per-routine unwind info naming the handler and the body/epilogue
  # labels that become HandlerData[0] and HandlerData[1].
  $code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha256_multi_block:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody,.Lepilogue # HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
  $code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
___
  $code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
.byte 9,0,0,0
.rva avx2_handler
.rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
___
  }
  1469. ####################################################################
  # Prepend a REX prefix byte to an instruction encoding when it is needed.
  #
  #   $opcode - reference to the array of opcode bytes; modified in place
  #   $dst    - number of the register encoded in ModR/M.reg
  #   $src    - number of the register encoded in ModR/M.rm
  #
  # Bit 0x04 is set when $dst >= 8 and bit 0x01 when $src >= 8. If either bit
  # is set, the byte (0x40 | bits) is inserted in front of the existing bytes;
  # otherwise the array is left untouched.
  sub rex {
      my ($opcode, $dst, $src) = @_;
      my $rex = 0;

      $rex |= 0x04 if ($dst >= 8);
      $rex |= 0x01 if ($src >= 8);
      unshift @$opcode, $rex | 0x40 if ($rex);
  }
  # Hand-encode a SHA extension instruction as a ".byte" directive.
  #
  #   $instr - mnemonic (sha256rnds2, sha256msg1 or sha256msg2)
  #   $args  - operand text as it appears after the mnemonic
  #
  # When the mnemonic is known and the operands are two %xmm registers, the
  # result is ".byte\t" followed by the comma-separated bytes: an optional REX
  # prefix, 0x0f, 0x38, the instruction's opcode byte and a register-direct
  # ModR/M byte (first operand in the r/m field, second operand in the reg
  # field). In every other case the instruction is returned as
  # "$instr\t$args", i.e. left for the assembler.
  sub sha256op38 {
      my ($instr, $args) = @_;
      my %opcodelet = (
          "sha256rnds2" => 0xcb,
          "sha256msg1"  => 0xcc,
          "sha256msg2"  => 0xcd,
      );

      if (exists $opcodelet{$instr}
          && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
          # Copy the captures right away so that nothing called below can
          # disturb them.
          my ($src, $dst) = ($1, $2);
          my @opcode = (0x0f, 0x38);

          rex(\@opcode, $dst, $src);
          push @opcode, $opcodelet{$instr};
          push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);   # ModR/M
          return ".byte\t" . join(',', @opcode);
      }

      return $instr . "\t" . $args;
  }
  # Final pass over the generated text, one line at a time:
  #   1. every `...` expression is replaced by the result of evaluating it;
  #   2. the rewrites below are tried in order and the first one that changes
  #      the line ends the search (SHA mnemonics are handed to sha256op38,
  #      the remaining rules turn %ymm operand names into %xmm ones for the
  #      listed instructions);
  #   3. the line is printed.
  # Each rewrite works on $_, the line currently being processed.
  my @rewrites = (
      sub { s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo },
      sub { s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go },
      sub { s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go },
      sub { s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go },
      sub { s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go },
      sub { s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go },
      sub { s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go },
  );

  for (split("\n", $code)) {
      s/\`([^\`]*)\`/eval($1)/ge;

      for my $rewrite (@rewrites) {
          last if $rewrite->();
      }

      print $_, "\n";
  }

  # Buffered write errors only surface when the handle is closed.
  close STDOUT or die "error closing STDOUT: $!";