ghash-x86_64.pl 44 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # March, June 2010
  17. #
  18. # The module implements "4-bit" GCM GHASH function and underlying
  19. # single multiplication operation in GF(2^128). "4-bit" means that
  20. # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
  21. # function features so called "528B" variant utilizing additional
  22. # 256+16 bytes of per-key storage [+512 bytes shared table].
  23. # Performance results are for this streamed GHASH subroutine and are
  24. # expressed in cycles per processed byte, less is better:
  25. #
  26. # gcc 3.4.x(*) assembler
  27. #
  28. # P4 28.6 14.0 +100%
  29. # Opteron 19.3 7.7 +150%
  30. # Core2 17.8 8.1(**) +120%
  31. # Atom 31.6 16.8 +88%
  32. # VIA Nano 21.8 10.1 +115%
  33. #
  34. # (*) comparison is not completely fair, because C results are
  35. # for vanilla "256B" implementation, while assembler results
  36. # are for "528B";-)
  37. # (**) it's a mystery [to me] why the Core2 result is not the same as for
  38. # Opteron;
  39. # May 2010
  40. #
  41. # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
  42. # See ghash-x86.pl for background information and details about coding
  43. # techniques.
  44. #
  45. # Special thanks to David Woodhouse for providing access to a
  46. # Westmere-based system on behalf of Intel Open Source Technology Centre.
  47. # December 2012
  48. #
  49. # Overhaul: aggregate Karatsuba post-processing, improve ILP in
  50. # reduction_alg9, increase reduction aggregate factor to 4x. As for
  51. # the latter, ghash-x86.pl discusses that it makes lesser sense to
  52. # increase aggregate factor. Then why increase here? Critical path
  53. # consists of 3 independent pclmulqdq instructions, Karatsuba post-
  54. # processing and reduction. "On top" of this we lay down aggregated
  55. # multiplication operations, triplets of independent pclmulqdq's. As
  56. # issue rate for pclmulqdq is limited, it makes lesser sense to
  57. # aggregate more multiplications than it takes to perform remaining
  58. # non-multiplication operations. 2x is near-optimal coefficient for
  59. # contemporary Intel CPUs (therefore modest improvement coefficient),
  60. # but not for Bulldozer. Latter is because logical SIMD operations
  61. # are twice as slow in comparison to Intel, so that critical path is
  62. # longer. A CPU with higher pclmulqdq issue rate would also benefit
  63. # from higher aggregate factor...
  64. #
  65. # Westmere 1.78(+13%)
  66. # Sandy Bridge 1.80(+8%)
  67. # Ivy Bridge 1.80(+7%)
  68. # Haswell 0.55(+93%) (if system doesn't support AVX)
  69. # Broadwell 0.45(+110%)(if system doesn't support AVX)
  70. # Skylake 0.44(+110%)(if system doesn't support AVX)
  71. # Bulldozer 1.49(+27%)
  72. # Silvermont 2.88(+13%)
  73. # Knights L 2.12(-) (if system doesn't support AVX)
  74. # Goldmont 1.08(+24%)
  75. # March 2013
  76. #
  77. # ... 8x aggregate factor AVX code path is using reduction algorithm
  78. # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
  79. # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
  80. # sub-optimally in comparison to above mentioned version. But thanks
  81. # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
  82. # it performs in 0.41 cycles per byte on Haswell processor, in
  83. # 0.29 on Broadwell, and in 0.36 on Skylake.
  84. #
  85. # Knights Landing achieves 1.09 cpb.
  86. #
  87. # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  88. # $output is the last argument if it looks like a file (it has an extension)
  89. # $flavour is the first argument if it doesn't look like a file
  $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

  # Win64 code paths are selected by an nasm/masm/mingw64 flavour or by an
  # output file with the .asm extension.
  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

  # Locate the perlasm translator: next to this script first, then in
  # ../../perlasm relative to it.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";

  # Probe the toolchain.  Each successful probe sets $avx to the sum of two
  # version comparisons (0, 1 or 2); later probes only run while $avx is
  # still false.  The AVX entry points further down are emitted only when
  # $avx is true.
  if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
          =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
      $avx = ($1>=2.20) + ($1>=2.22);
  }

  if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
      $avx = ($1>=2.09) + ($1>=2.10);
  }

  if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
      $avx = ($1>=10) + ($1>=11);
  }

  # The compiler name is deliberately NOT anchored to the start of the
  # output: vendor builds print banners such as "Apple clang version ..." or
  # "Ubuntu clang version ...", which a "^clang" anchor can never match, so
  # the AVX code would be silently left out for those compilers.
  if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
      $avx = ($2>=3.0) + ($2>3.0);
  }

  # Everything printed to STDOUT from here on is piped through the
  # translator, which writes the final assembly to $output.
  open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;
  # $do4xaggr: when true, gcm_init_clmul additionally stores H^3/H^4 and
  # gcm_ghash_clmul gets its 4-blocks-per-iteration (.Lmod4_loop) path.
  115. $do4xaggr=1;
  # Register names used by the table-driven ("4-bit") code below.  $Xi and
  # $Htbl are the first two arguments in the Unix x86_64 calling convention.
  116. # common register layout
  117. $nlo="%rax";
  118. $nhi="%rbx";
  119. $Zlo="%r8";
  120. $Zhi="%r9";
  121. $tmp="%r10";
  122. $rem_4bit = "%r11";
  123. $Xi="%rdi";
  124. $Htbl="%rsi";
  # These two are re-assigned before gcm_ghash_4bit is generated.
  125. # per-function register layout
  126. $cnt="%rcx";
  127. $rem="%rdx";
  # LB(REG) - return the low-byte alias of a general-purpose register name.
  #
  #   %rax/%eax .. %rdx/%edx  ->  %al .. %dl
  #   %rsi/%esi, %rdi/%edi    ->  %sil, %dil
  #   %rbp/%ebp               ->  %bpl
  #   %rN / %rNd              ->  %rNb
  #
  # Any other string is returned unchanged.  The rules are tried in order and
  # the first one that matches wins.  The sub takes one argument, so it is
  # declared without the (misleading) empty prototype; callers use &LB(...).
  sub LB {
      my $r = shift;

      $r =~ s/%[er]([a-d])x/%${1}l/   or
      $r =~ s/%[er]([sd]i)/%${1}l/    or
      $r =~ s/%[er](bp)/%${1}l/       or
      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;

      $r;
  }
  # Catch-all for undefined subs such as &mov(DST,SRC): treats the sub name
  # as an instruction mnemonic and appends one line to $code.  Operands are
  # given destination-first and are written out in reverse order; a purely
  # numeric last operand gets a '$' prefix.
  sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
  {
      (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
      my @operands = @_;
      my $source = pop @operands;

      # A value that survives a numeric round-trip unchanged is an immediate.
      if ($source*1 eq $source) {
          $source = "\$$source";
      }

      $code .= "\t$mnemonic\t" . join(',', $source, reverse @operands) . "\n";
  }
  # loop(INP): append the table-driven ("4-bit") multiplication sequence to
  # $code.
  #
  #   INP - register holding the base address the loop reads bytes from,
  #         via ($inp,$cnt).
  #
  # $cnt starts at 14 and is decremented once per .Loop pass; the loop is
  # left through .Lbreak when the decrement goes negative.  The emitted code
  # expects $Zlo to be pre-loaded by the caller and finishes by byte-swapping
  # $Zlo and $Zhi.  $N is a private counter that gives every expansion its
  # own .Loop$N/.Lbreak$N labels.  The `&LB(...)` back-tick expressions are
  # left in the text here; presumably they are evaluated when $code is
  # post-processed at the end of the file (not visible in this chunk).
  138. { my $N;
  139. sub loop() {
  140. my $inp = shift;
  141. $N++;
  142. $code.=<<___;
  143. xor $nlo,$nlo
  144. xor $nhi,$nhi
  145. mov `&LB("$Zlo")`,`&LB("$nlo")`
  146. mov `&LB("$Zlo")`,`&LB("$nhi")`
  147. shl \$4,`&LB("$nlo")`
  148. mov \$14,$cnt
  149. mov 8($Htbl,$nlo),$Zlo
  150. mov ($Htbl,$nlo),$Zhi
  151. and \$0xf0,`&LB("$nhi")`
  152. mov $Zlo,$rem
  153. jmp .Loop$N
  154. .align 16
  155. .Loop$N:
  156. shr \$4,$Zlo
  157. and \$0xf,$rem
  158. mov $Zhi,$tmp
  159. mov ($inp,$cnt),`&LB("$nlo")`
  160. shr \$4,$Zhi
  161. xor 8($Htbl,$nhi),$Zlo
  162. shl \$60,$tmp
  163. xor ($Htbl,$nhi),$Zhi
  164. mov `&LB("$nlo")`,`&LB("$nhi")`
  165. xor ($rem_4bit,$rem,8),$Zhi
  166. mov $Zlo,$rem
  167. shl \$4,`&LB("$nlo")`
  168. xor $tmp,$Zlo
  169. dec $cnt
  170. js .Lbreak$N
  171. shr \$4,$Zlo
  172. and \$0xf,$rem
  173. mov $Zhi,$tmp
  174. shr \$4,$Zhi
  175. xor 8($Htbl,$nlo),$Zlo
  176. shl \$60,$tmp
  177. xor ($Htbl,$nlo),$Zhi
  178. and \$0xf0,`&LB("$nhi")`
  179. xor ($rem_4bit,$rem,8),$Zhi
  180. mov $Zlo,$rem
  181. xor $tmp,$Zlo
  182. jmp .Loop$N
  183. .align 16
  184. .Lbreak$N:
  185. shr \$4,$Zlo
  186. and \$0xf,$rem
  187. mov $Zhi,$tmp
  188. shr \$4,$Zhi
  189. xor 8($Htbl,$nlo),$Zlo
  190. shl \$60,$tmp
  191. xor ($Htbl,$nlo),$Zhi
  192. and \$0xf0,`&LB("$nhi")`
  193. xor ($rem_4bit,$rem,8),$Zhi
  194. mov $Zlo,$rem
  195. xor $tmp,$Zlo
  196. shr \$4,$Zlo
  197. and \$0xf,$rem
  198. mov $Zhi,$tmp
  199. shr \$4,$Zhi
  200. xor 8($Htbl,$nhi),$Zlo
  201. shl \$60,$tmp
  202. xor ($Htbl,$nhi),$Zhi
  203. xor $tmp,$Zlo
  204. xor ($rem_4bit,$rem,8),$Zhi
  205. bswap $Zlo
  206. bswap $Zhi
  207. ___
  208. }}
  # Start of the generated text: section header, the OPENSSL_ia32cap_P
  # import and gcm_gmult_4bit (declared with 2 arguments).  The prologue
  # pushes six callee-saved registers and reserves 280 bytes so the frame has
  # the same shape as gcm_ghash_4bit's (see the in-line remark about the
  # Win64 exception handler); the epilogue reloads only %rbx before
  # restoring %rsp.  The multiplication itself comes from &loop, reading
  # from $Xi, and the result is stored back to ($Xi) and 8($Xi).
  209. $code=<<___;
  210. .text
  211. .extern OPENSSL_ia32cap_P
  212. .globl gcm_gmult_4bit
  213. .type gcm_gmult_4bit,\@function,2
  214. .align 16
  215. gcm_gmult_4bit:
  216. .cfi_startproc
  217. endbranch
  218. push %rbx
  219. .cfi_push %rbx
  220. push %rbp # %rbp and others are pushed exclusively in
  221. .cfi_push %rbp
  222. push %r12 # order to reuse Win64 exception handler...
  223. .cfi_push %r12
  224. push %r13
  225. .cfi_push %r13
  226. push %r14
  227. .cfi_push %r14
  228. push %r15
  229. .cfi_push %r15
  230. sub \$280,%rsp
  231. .cfi_adjust_cfa_offset 280
  232. .Lgmult_prologue:
  233. movzb 15($Xi),$Zlo
  234. lea .Lrem_4bit(%rip),$rem_4bit
  235. ___
  236. &loop ($Xi);
  237. $code.=<<___;
  238. mov $Zlo,8($Xi)
  239. mov $Zhi,($Xi)
  240. lea 280+48(%rsp),%rsi
  241. .cfi_def_cfa %rsi,8
  242. mov -8(%rsi),%rbx
  243. .cfi_restore %rbx
  244. lea (%rsi),%rsp
  245. .cfi_def_cfa_register %rsp
  246. .Lgmult_epilogue:
  247. ret
  248. .cfi_endproc
  249. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  250. ___
  # gcm_ghash_4bit (declared with 4 arguments: $Xi, $Htbl, $inp, $len).
  # The generator works in three steps:
  #   1. prologue: save callee-saved registers, reserve 280 bytes, move
  #      $inp/$len into %r14/%r15;
  #   2. an unrolled setup loop that fills stack tables from $Htbl (values
  #      shifted right by 4, plus the shifted-out low bits as bytes);
  #   3. .Louter_loop, which consumes 16 input bytes per pass until $inp
  #      reaches the end pointer kept in $len.
  # The instruction order is hand-scheduled; do not reorder.
  251. # per-function register layout
  252. $inp="%rdx";
  253. $len="%rcx";
  # The 8-bit remainder table pointer reuses the register of $rem_4bit.
  254. $rem_8bit=$rem_4bit;
  255. $code.=<<___;
  256. .globl gcm_ghash_4bit
  257. .type gcm_ghash_4bit,\@function,4
  258. .align 16
  259. gcm_ghash_4bit:
  260. .cfi_startproc
  261. endbranch
  262. push %rbx
  263. .cfi_push %rbx
  264. push %rbp
  265. .cfi_push %rbp
  266. push %r12
  267. .cfi_push %r12
  268. push %r13
  269. .cfi_push %r13
  270. push %r14
  271. .cfi_push %r14
  272. push %r15
  273. .cfi_push %r15
  274. sub \$280,%rsp
  275. .cfi_adjust_cfa_offset 280
  276. .Lghash_prologue:
  277. mov $inp,%r14 # reassign couple of args
  278. mov $len,%r15
  279. ___
  # Lexical register names for the body; $inp/$len now name the copies
  # made in the prologue.  @nhi and @rem are two-element sets that are
  # rotated at the end of every unrolled step.
  280. { my $inp="%r14";
  281. my $dat="%edx";
  282. my $len="%r15";
  283. my @nhi=("%ebx","%ecx");
  284. my @rem=("%r12","%r13");
  285. my $Hshr4="%rbp";
  # $Htbl is biased by +128 (sub of -128) for the duration of the setup
  # loop and un-biased again by the matching add of -128 afterwards.
  286. &sub ($Htbl,-128); # size optimization
  287. &lea ($Hshr4,"16+128(%rsp)");
  288. { my @lo =($nlo,$nhi);
  289. my @hi =($Zlo,$Zhi);
  290. &xor ($dat,$dat);
  # Software-pipelined over 18 steps: step $i loads table entry $i
  # (while $i<16), shifts the previous entry (0<$i<17) and stores the entry
  # before that at index $j=$i-2 ($i>1).  @lo/@hi swap roles every step.
  291. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  292. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  293. &or ($lo[0],$tmp) if ($i>1);
  294. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  295. &shr ($lo[1],4) if ($i>0 && $i<17);
  296. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  297. &shr ($hi[1],4) if ($i>0 && $i<17);
  298. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  299. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  300. &shl (&LB($dat),4) if ($i>0 && $i<17);
  301. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  302. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  303. &shl ($tmp,60) if ($i>0 && $i<17);
  304. push (@lo,shift(@lo));
  305. push (@hi,shift(@hi));
  306. }
  307. }
  308. &add ($Htbl,-128);
  309. &mov ($Zlo,"8($Xi)");
  310. &mov ($Zhi,"0($Xi)");
  311. &add ($len,$inp); # pointer to the end of data
  312. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  313. &jmp (".Louter_loop");
  # One pass per 16-byte input block: xor the block into the running
  # value, store it at ($Xi), then walk it with the unrolled loop below.
  314. $code.=".align 16\n.Louter_loop:\n";
  315. &xor ($Zhi,"($inp)");
  316. &mov ("%rdx","8($inp)");
  317. &lea ($inp,"16($inp)");
  318. &xor ("%rdx",$Zlo);
  319. &mov ("($Xi)",$Zhi);
  320. &mov ("8($Xi)","%rdx");
  321. &shr ("%rdx",32);
  322. &xor ($nlo,$nlo);
  323. &rol ($dat,8);
  324. &mov (&LB($nlo),&LB($dat));
  325. &movz ($nhi[0],&LB($dat));
  326. &shl (&LB($nlo),4);
  327. &shr ($nhi[0],4);
  # 15 unrolled steps.  $j counts down from 11; whenever the decremented
  # value is a multiple of 4 the next 32-bit word is loaded from $j($Xi).
  328. for ($j=11,$i=0;$i<15;$i++) {
  329. &rol ($dat,8);
  330. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  331. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  332. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  333. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  334. &mov (&LB($nlo),&LB($dat));
  335. &xor ($Zlo,$tmp) if ($i>0);
  336. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  337. &movz ($nhi[1],&LB($dat));
  338. &shl (&LB($nlo),4);
  339. &movzb ($rem[0],"(%rsp,$nhi[0])");
  340. &shr ($nhi[1],4) if ($i<14);
  341. &and ($nhi[1],0xf0) if ($i==14);
  342. &shl ($rem[1],48) if ($i>0);
  343. &xor ($rem[0],$Zlo);
  344. &mov ($tmp,$Zhi);
  345. &xor ($Zhi,$rem[1]) if ($i>0);
  346. &shr ($Zlo,8);
  347. &movz ($rem[0],&LB($rem[0]));
  348. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  349. &shr ($Zhi,8);
  350. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  351. &shl ($tmp,56);
  352. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  353. unshift (@nhi,pop(@nhi)); # "rotate" registers
  354. unshift (@rem,pop(@rem));
  355. }
  # Tail of the block: last table lookups, byte-swap the result and loop
  # while $inp is still below the end pointer.
  356. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  357. &xor ($Zlo,"8($Htbl,$nlo)");
  358. &xor ($Zhi,"($Htbl,$nlo)");
  359. &shl ($rem[1],48);
  360. &xor ($Zlo,$tmp);
  361. &xor ($Zhi,$rem[1]);
  362. &movz ($rem[0],&LB($Zlo));
  363. &shr ($Zlo,4);
  364. &mov ($tmp,$Zhi);
  365. &shl (&LB($rem[0]),4);
  366. &shr ($Zhi,4);
  367. &xor ($Zlo,"8($Htbl,$nhi[0])");
  368. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  369. &shl ($tmp,60);
  370. &xor ($Zhi,"($Htbl,$nhi[0])");
  371. &xor ($Zlo,$tmp);
  372. &shl ($rem[0],48);
  373. &bswap ($Zlo);
  374. &xor ($Zhi,$rem[0]);
  375. &bswap ($Zhi);
  376. &cmp ($inp,$len);
  377. &jb (".Louter_loop");
  378. }
  # Epilogue: store the result, reload all six saved registers from above
  # the 280-byte frame and restore %rsp.
  379. $code.=<<___;
  380. mov $Zlo,8($Xi)
  381. mov $Zhi,($Xi)
  382. lea 280+48(%rsp),%rsi
  383. .cfi_def_cfa %rsi,8
  384. mov -48(%rsi),%r15
  385. .cfi_restore %r15
  386. mov -40(%rsi),%r14
  387. .cfi_restore %r14
  388. mov -32(%rsi),%r13
  389. .cfi_restore %r13
  390. mov -24(%rsi),%r12
  391. .cfi_restore %r12
  392. mov -16(%rsi),%rbp
  393. .cfi_restore %rbp
  394. mov -8(%rsi),%rbx
  395. .cfi_restore %rbx
  396. lea 0(%rsi),%rsp
  397. .cfi_def_cfa_register %rsp
  398. .Lghash_epilogue:
  399. ret
  400. .cfi_endproc
  401. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  402. ___
  403. ######################################################################
  404. # PCLMULQDQ version.
  # Argument registers for the "abi-omnipotent" functions below, picked at
  # generation time according to $win64, followed by the default xmm
  # register assignment.  Note that $Xi is re-purposed here: from this point
  # on it names %xmm0, not %rdi.
  405. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  406. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  407. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  408. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  # clmul64x64_T2(XHI, XI, HKEY [, HK]): append a carry-less 128x128-bit
  # multiplication of XI by HKEY using three pclmulqdq instructions; the
  # result is left in XHI:XI.
  #
  #   HK - optional register supplying the operand of the third (middle)
  #        multiplication.  When omitted, the code derives it in $T2 by
  #        xor-ing HKEY with a qword-swapped copy of itself.
  #
  # Clobbers the global scratch registers $T1 and $T2.
  409. sub clmul64x64_T2 { # minimal register pressure
  410. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  411. if (!defined($HK)) { $HK = $T2;
  412. $code.=<<___;
  413. movdqa $Xi,$Xhi #
  414. pshufd \$0b01001110,$Xi,$T1
  415. pshufd \$0b01001110,$Hkey,$T2
  416. pxor $Xi,$T1 #
  417. pxor $Hkey,$T2
  418. ___
  419. } else {
  420. $code.=<<___;
  421. movdqa $Xi,$Xhi #
  422. pshufd \$0b01001110,$Xi,$T1
  423. pxor $Xi,$T1 #
  424. ___
  425. }
  # Common part: the three multiplications, then the middle product is
  # combined with the other two and folded into XHI:XI.
  426. $code.=<<___;
  427. pclmulqdq \$0x00,$Hkey,$Xi #######
  428. pclmulqdq \$0x11,$Hkey,$Xhi #######
  429. pclmulqdq \$0x00,$HK,$T1 #######
  430. pxor $Xi,$T1 #
  431. pxor $Xhi,$T1 #
  432. movdqa $T1,$T2 #
  433. psrldq \$8,$T1
  434. pslldq \$8,$T2 #
  435. pxor $T1,$Xhi
  436. pxor $T2,$Xi #
  437. ___
  438. }
  # reduction_alg9(XHI, XI): append the two-phase shift-and-xor reduction
  # that folds the double-width value XHI:XI into XI.  XHI is modified along
  # the way and the global scratch registers $T1 and $T2 are clobbered.  The
  # sub's return value is that of the final `$code.=` (a non-empty string),
  # which one caller relies on inside a boolean expression.
  439. sub reduction_alg9 { # 17/11 times faster than Intel version
  440. my ($Xhi,$Xi) = @_;
  441. $code.=<<___;
  442. # 1st phase
  443. movdqa $Xi,$T2 #
  444. movdqa $Xi,$T1
  445. psllq \$5,$Xi
  446. pxor $Xi,$T1 #
  447. psllq \$1,$Xi
  448. pxor $T1,$Xi #
  449. psllq \$57,$Xi #
  450. movdqa $Xi,$T1 #
  451. pslldq \$8,$Xi
  452. psrldq \$8,$T1 #
  453. pxor $T2,$Xi
  454. pxor $T1,$Xhi #
  455. # 2nd phase
  456. movdqa $Xi,$T2
  457. psrlq \$1,$Xi
  458. pxor $T2,$Xhi #
  459. pxor $Xi,$T2
  460. psrlq \$5,$Xi
  461. pxor $T2,$Xi #
  462. psrlq \$1,$Xi #
  463. pxor $Xhi,$Xi #
  464. ___
  465. }
  # gcm_init_clmul(Htbl, Xip): generate the PCLMULQDQ key-setup routine.
  #
  # The emitted code loads the hash key from ($Xip), applies the "<<1 twist"
  # with conditional xor of .L0x1c2_polynomial, and stores into $Htbl:
  #   0x00 H      0x10 H^2    0x20 Karatsuba "salt" for H/H^2
  #   0x30 H^3    0x40 H^4    0x50 Karatsuba "salt" for H^3/H^4
  # (the second row only when $do4xaggr is set).
  #
  # On Win64 %xmm6 ($HK) is callee-saved, so it is spilled to a 0x18-byte
  # stack slot around the body.
  #
  # Like every other exported entry point in this file, the function starts
  # with `endbranch` so that it stays a valid indirect-call target when CET
  # indirect-branch tracking is enabled.  .L_init_clmul is placed after it
  # because it is only reached by a direct jmp from gcm_init_avx.
  { my ($Htbl,$Xip)=@_4args;
    my $HK="%xmm6";

  $code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
	endbranch
.L_init_clmul:
___
  # '$' is escaped in the comment below: unescaped, "$0" would interpolate
  # the script name into the generated file.
  $code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	\$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
  $code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
  $code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
  if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
  $code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
  $code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
  }
  $code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
  $code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
  }
  # gcm_gmult_clmul(Xip, Htbl): generate the single-block PCLMULQDQ multiply.
  # The emitted code loads the value at ($Xip), byte-swaps it with
  # .Lbswap_mask, multiplies it by the key at ($Htbl) using the "salt"
  # stored at 0x20($Htbl), reduces, byte-swaps back and stores to ($Xip).
  # .L_gmult_clmul is the target gcm_gmult_avx jumps to.
  544. { my ($Xip,$Htbl)=@_4args;
  545. $code.=<<___;
  546. .globl gcm_gmult_clmul
  547. .type gcm_gmult_clmul,\@abi-omnipotent
  548. .align 16
  549. gcm_gmult_clmul:
  550. .cfi_startproc
  551. endbranch
  552. .L_gmult_clmul:
  553. movdqu ($Xip),$Xi
  554. movdqa .Lbswap_mask(%rip),$T3
  555. movdqu ($Htbl),$Hkey
  556. movdqu 0x20($Htbl),$T2
  557. pshufb $T3,$Xi
  558. ___
  559. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  # The condition on the next statement is always false, so the
  # "experimental alternative" text is never appended.  It is written this
  # way because evaluating the condition calls reduction_alg9 for its side
  # effect (the regular reduction IS emitted); changing the leading 0 to 1
  # would short-circuit that call and select the alternative instead.
  560. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  561. # experimental alternative. special thing about is that there
  562. # no dependency between the two multiplications...
  563. mov \$`0xE1<<1`,%eax
  564. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  565. mov \$0x07,%r11d
  566. movq %rax,$T1
  567. movq %r10,$T2
  568. movq %r11,$T3 # borrow $T3
  569. pand $Xi,$T3
  570. pshufb $T3,$T2 # ($Xi&7)·0xE0
  571. movq %rax,$T3
  572. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  573. pxor $Xi,$T2
  574. pslldq \$15,$T2
  575. paddd $T2,$T2 # <<(64+56+1)
  576. pxor $T2,$Xi
  577. pclmulqdq \$0x01,$T3,$Xi
  578. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  579. psrldq \$1,$T1
  580. pxor $T1,$Xhi
  581. pslldq \$7,$Xi
  582. pxor $Xhi,$Xi
  583. ___
  584. $code.=<<___;
  585. pshufb $T3,$Xi
  586. movdqu $Xi,($Xip)
  587. ret
  588. .cfi_endproc
  589. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  590. ___
  591. }
  # gcm_ghash_clmul(Xip, Htbl, inp, len): generate the streamed PCLMULQDQ
  # GHASH routine.  Structure of the emitted code:
  #   - optional Win64 prologue saving %xmm6-%xmm15;
  #   - .Lmod4_loop/.Ltail4x: 4 blocks per iteration (only if $do4xaggr),
  #     skipped at run time for short inputs or when the OPENSSL_ia32cap_P
  #     test selects .Lskip4x;
  #   - .Lmod_loop/.Leven_tail: 2 blocks per iteration;
  #   - .Lodd_tail: a final single block;
  #   - .Ldone: byte-swap and store the result to ($Xip).
  # Multiplication and reduction are interleaved instruction by instruction;
  # the order inside the heredocs is deliberate and must not be changed.
  592. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  # Lexical register names; these $T1..$T3 shadow the global ones inside
  # this scope, including in the helper calls that take explicit registers.
  593. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  594. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  595. $code.=<<___;
  596. .globl gcm_ghash_clmul
  597. .type gcm_ghash_clmul,\@abi-omnipotent
  598. .align 32
  599. gcm_ghash_clmul:
  600. .cfi_startproc
  601. endbranch
  602. .L_ghash_clmul:
  603. ___
  # Win64 only: the two lea instructions lower %rsp by 0xa8 in total and the
  # ten registers are stored relative to %rax; the epilogue mirrors this.
  604. $code.=<<___ if ($win64);
  605. lea -0x88(%rsp),%rax
  606. .LSEH_begin_gcm_ghash_clmul:
  607. # I can't trust assembler to use specific encoding:-(
  608. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  609. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  610. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  611. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  612. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  613. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  614. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  615. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  616. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  617. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  618. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  619. ___
  # Common entry: load state and key material; exactly one block of input
  # goes straight to .Lodd_tail.
  620. $code.=<<___;
  621. movdqa .Lbswap_mask(%rip),$T3
  622. movdqu ($Xip),$Xi
  623. movdqu ($Htbl),$Hkey
  624. movdqu 0x20($Htbl),$HK
  625. pshufb $T3,$Xi
  626. sub \$0x10,$len
  627. jz .Lodd_tail
  628. movdqu 0x10($Htbl),$Hkey2
  629. ___
  # 4x aggregated path; uses five more xmm registers (11..15).
  630. if ($do4xaggr) {
  631. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  632. $code.=<<___;
  633. mov OPENSSL_ia32cap_P+4(%rip),%eax
  634. cmp \$0x30,$len
  635. jb .Lskip4x
  636. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  637. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  638. je .Lskip4x
  639. sub \$0x30,$len
  640. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  641. movdqu 0x30($Htbl),$Hkey3
  642. movdqu 0x40($Htbl),$Hkey4
  643. #######
  644. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  645. #
  646. movdqu 0x30($inp),$Xln
  647. movdqu 0x20($inp),$Xl
  648. pshufb $T3,$Xln
  649. pshufb $T3,$Xl
  650. movdqa $Xln,$Xhn
  651. pshufd \$0b01001110,$Xln,$Xmn
  652. pxor $Xln,$Xmn
  653. pclmulqdq \$0x00,$Hkey,$Xln
  654. pclmulqdq \$0x11,$Hkey,$Xhn
  655. pclmulqdq \$0x00,$HK,$Xmn
  656. movdqa $Xl,$Xh
  657. pshufd \$0b01001110,$Xl,$Xm
  658. pxor $Xl,$Xm
  659. pclmulqdq \$0x00,$Hkey2,$Xl
  660. pclmulqdq \$0x11,$Hkey2,$Xh
  661. pclmulqdq \$0x10,$HK,$Xm
  662. xorps $Xl,$Xln
  663. xorps $Xh,$Xhn
  664. movups 0x50($Htbl),$HK
  665. xorps $Xm,$Xmn
  666. movdqu 0x10($inp),$Xl
  667. movdqu 0($inp),$T1
  668. pshufb $T3,$Xl
  669. pshufb $T3,$T1
  670. movdqa $Xl,$Xh
  671. pshufd \$0b01001110,$Xl,$Xm
  672. pxor $T1,$Xi
  673. pxor $Xl,$Xm
  674. pclmulqdq \$0x00,$Hkey3,$Xl
  675. movdqa $Xi,$Xhi
  676. pshufd \$0b01001110,$Xi,$T1
  677. pxor $Xi,$T1
  678. pclmulqdq \$0x11,$Hkey3,$Xh
  679. pclmulqdq \$0x00,$HK,$Xm
  680. xorps $Xl,$Xln
  681. xorps $Xh,$Xhn
  682. lea 0x40($inp),$inp
  683. sub \$0x40,$len
  684. jc .Ltail4x
  685. jmp .Lmod4_loop
  686. .align 32
  687. .Lmod4_loop:
  688. pclmulqdq \$0x00,$Hkey4,$Xi
  689. xorps $Xm,$Xmn
  690. movdqu 0x30($inp),$Xl
  691. pshufb $T3,$Xl
  692. pclmulqdq \$0x11,$Hkey4,$Xhi
  693. xorps $Xln,$Xi
  694. movdqu 0x20($inp),$Xln
  695. movdqa $Xl,$Xh
  696. pclmulqdq \$0x10,$HK,$T1
  697. pshufd \$0b01001110,$Xl,$Xm
  698. xorps $Xhn,$Xhi
  699. pxor $Xl,$Xm
  700. pshufb $T3,$Xln
  701. movups 0x20($Htbl),$HK
  702. xorps $Xmn,$T1
  703. pclmulqdq \$0x00,$Hkey,$Xl
  704. pshufd \$0b01001110,$Xln,$Xmn
  705. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  706. movdqa $Xln,$Xhn
  707. pxor $Xhi,$T1 #
  708. pxor $Xln,$Xmn
  709. movdqa $T1,$T2 #
  710. pclmulqdq \$0x11,$Hkey,$Xh
  711. pslldq \$8,$T1
  712. psrldq \$8,$T2 #
  713. pxor $T1,$Xi
  714. movdqa .L7_mask(%rip),$T1
  715. pxor $T2,$Xhi #
  716. movq %rax,$T2
  717. pand $Xi,$T1 # 1st phase
  718. pshufb $T1,$T2 #
  719. pxor $Xi,$T2 #
  720. pclmulqdq \$0x00,$HK,$Xm
  721. psllq \$57,$T2 #
  722. movdqa $T2,$T1 #
  723. pslldq \$8,$T2
  724. pclmulqdq \$0x00,$Hkey2,$Xln
  725. psrldq \$8,$T1 #
  726. pxor $T2,$Xi
  727. pxor $T1,$Xhi #
  728. movdqu 0($inp),$T1
  729. movdqa $Xi,$T2 # 2nd phase
  730. psrlq \$1,$Xi
  731. pclmulqdq \$0x11,$Hkey2,$Xhn
  732. xorps $Xl,$Xln
  733. movdqu 0x10($inp),$Xl
  734. pshufb $T3,$Xl
  735. pclmulqdq \$0x10,$HK,$Xmn
  736. xorps $Xh,$Xhn
  737. movups 0x50($Htbl),$HK
  738. pshufb $T3,$T1
  739. pxor $T2,$Xhi #
  740. pxor $Xi,$T2
  741. psrlq \$5,$Xi
  742. movdqa $Xl,$Xh
  743. pxor $Xm,$Xmn
  744. pshufd \$0b01001110,$Xl,$Xm
  745. pxor $T2,$Xi #
  746. pxor $T1,$Xhi
  747. pxor $Xl,$Xm
  748. pclmulqdq \$0x00,$Hkey3,$Xl
  749. psrlq \$1,$Xi #
  750. pxor $Xhi,$Xi #
  751. movdqa $Xi,$Xhi
  752. pclmulqdq \$0x11,$Hkey3,$Xh
  753. xorps $Xl,$Xln
  754. pshufd \$0b01001110,$Xi,$T1
  755. pxor $Xi,$T1
  756. pclmulqdq \$0x00,$HK,$Xm
  757. xorps $Xh,$Xhn
  758. lea 0x40($inp),$inp
  759. sub \$0x40,$len
  760. jnc .Lmod4_loop
  761. .Ltail4x:
  762. pclmulqdq \$0x00,$Hkey4,$Xi
  763. pclmulqdq \$0x11,$Hkey4,$Xhi
  764. pclmulqdq \$0x10,$HK,$T1
  765. xorps $Xm,$Xmn
  766. xorps $Xln,$Xi
  767. xorps $Xhn,$Xhi
  768. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  769. pxor $Xmn,$T1
  770. pxor $Xhi,$T1 #
  771. pxor $Xi,$Xhi
  772. movdqa $T1,$T2 #
  773. psrldq \$8,$T1
  774. pslldq \$8,$T2 #
  775. pxor $T1,$Xhi
  776. pxor $T2,$Xi #
  777. ___
  778. &reduction_alg9($Xhi,$Xi);
  # After the 4x tail: finish if nothing is left, otherwise reload the
  # first "salt" and fall into the 2x code (or .Lodd_tail for one block).
  779. $code.=<<___;
  780. add \$0x40,$len
  781. jz .Ldone
  782. movdqu 0x20($Htbl),$HK
  783. sub \$0x10,$len
  784. jz .Lodd_tail
  785. .Lskip4x:
  786. ___
  787. }
  # 2x aggregated path.
  788. $code.=<<___;
  789. #######
  790. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  791. # [(H*Ii+1) + (H*Xi+1)] mod P =
  792. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  793. #
  794. movdqu ($inp),$T1 # Ii
  795. movdqu 16($inp),$Xln # Ii+1
  796. pshufb $T3,$T1
  797. pshufb $T3,$Xln
  798. pxor $T1,$Xi # Ii+Xi
  799. movdqa $Xln,$Xhn
  800. pshufd \$0b01001110,$Xln,$Xmn
  801. pxor $Xln,$Xmn
  802. pclmulqdq \$0x00,$Hkey,$Xln
  803. pclmulqdq \$0x11,$Hkey,$Xhn
  804. pclmulqdq \$0x00,$HK,$Xmn
  805. lea 32($inp),$inp # i+=2
  806. nop
  807. sub \$0x20,$len
  808. jbe .Leven_tail
  809. nop
  810. jmp .Lmod_loop
  811. .align 32
  812. .Lmod_loop:
  813. movdqa $Xi,$Xhi
  814. movdqa $Xmn,$T1
  815. pshufd \$0b01001110,$Xi,$Xmn #
  816. pxor $Xi,$Xmn #
  817. pclmulqdq \$0x00,$Hkey2,$Xi
  818. pclmulqdq \$0x11,$Hkey2,$Xhi
  819. pclmulqdq \$0x10,$HK,$Xmn
  820. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  821. pxor $Xhn,$Xhi
  822. movdqu ($inp),$T2 # Ii
  823. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  824. pshufb $T3,$T2
  825. movdqu 16($inp),$Xln # Ii+1
  826. pxor $Xhi,$T1
  827. pxor $T2,$Xhi # "Ii+Xi", consume early
  828. pxor $T1,$Xmn
  829. pshufb $T3,$Xln
  830. movdqa $Xmn,$T1 #
  831. psrldq \$8,$T1
  832. pslldq \$8,$Xmn #
  833. pxor $T1,$Xhi
  834. pxor $Xmn,$Xi #
  835. movdqa $Xln,$Xhn #
  836. movdqa $Xi,$T2 # 1st phase
  837. movdqa $Xi,$T1
  838. psllq \$5,$Xi
  839. pxor $Xi,$T1 #
  840. pclmulqdq \$0x00,$Hkey,$Xln #######
  841. psllq \$1,$Xi
  842. pxor $T1,$Xi #
  843. psllq \$57,$Xi #
  844. movdqa $Xi,$T1 #
  845. pslldq \$8,$Xi
  846. psrldq \$8,$T1 #
  847. pxor $T2,$Xi
  848. pshufd \$0b01001110,$Xhn,$Xmn
  849. pxor $T1,$Xhi #
  850. pxor $Xhn,$Xmn #
  851. movdqa $Xi,$T2 # 2nd phase
  852. psrlq \$1,$Xi
  853. pclmulqdq \$0x11,$Hkey,$Xhn #######
  854. pxor $T2,$Xhi #
  855. pxor $Xi,$T2
  856. psrlq \$5,$Xi
  857. pxor $T2,$Xi #
  858. lea 32($inp),$inp
  859. psrlq \$1,$Xi #
  860. pclmulqdq \$0x00,$HK,$Xmn #######
  861. pxor $Xhi,$Xi #
  862. sub \$0x20,$len
  863. ja .Lmod_loop
  864. .Leven_tail:
  865. movdqa $Xi,$Xhi
  866. movdqa $Xmn,$T1
  867. pshufd \$0b01001110,$Xi,$Xmn #
  868. pxor $Xi,$Xmn #
  869. pclmulqdq \$0x00,$Hkey2,$Xi
  870. pclmulqdq \$0x11,$Hkey2,$Xhi
  871. pclmulqdq \$0x10,$HK,$Xmn
  872. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  873. pxor $Xhn,$Xhi
  874. pxor $Xi,$T1
  875. pxor $Xhi,$T1
  876. pxor $T1,$Xmn
  877. movdqa $Xmn,$T1 #
  878. psrldq \$8,$T1
  879. pslldq \$8,$Xmn #
  880. pxor $T1,$Xhi
  881. pxor $Xmn,$Xi #
  882. ___
  883. &reduction_alg9 ($Xhi,$Xi);
  # Non-zero $len here goes to .Ldone; zero falls through to .Lodd_tail,
  # which hashes one more block.
  884. $code.=<<___;
  885. test $len,$len
  886. jnz .Ldone
  887. .Lodd_tail:
  888. movdqu ($inp),$T1 # Ii
  889. pshufb $T3,$T1
  890. pxor $T1,$Xi # Ii+Xi
  891. ___
  892. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  893. &reduction_alg9 ($Xhi,$Xi);
  894. $code.=<<___;
  895. .Ldone:
  896. pshufb $T3,$Xi
  897. movdqu $Xi,($Xip)
  898. ___
  # Win64 only: restore %xmm6-%xmm15 and release the 0xa8-byte save area.
  899. $code.=<<___ if ($win64);
  900. movaps (%rsp),%xmm6
  901. movaps 0x10(%rsp),%xmm7
  902. movaps 0x20(%rsp),%xmm8
  903. movaps 0x30(%rsp),%xmm9
  904. movaps 0x40(%rsp),%xmm10
  905. movaps 0x50(%rsp),%xmm11
  906. movaps 0x60(%rsp),%xmm12
  907. movaps 0x70(%rsp),%xmm13
  908. movaps 0x80(%rsp),%xmm14
  909. movaps 0x90(%rsp),%xmm15
  910. lea 0xa8(%rsp),%rsp
  911. .LSEH_end_gcm_ghash_clmul:
  912. ___
  913. $code.=<<___;
  914. ret
  915. .cfi_endproc
  916. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  917. ___
  918. }
  # gcm_init_avx(Htbl, Xip): generate the AVX key-setup routine.
  #
  # With $avx set, the emitted code twists the key like gcm_init_clmul and
  # then runs a 4-iteration loop (%r10 counts down from 4); each iteration
  # stores an odd power, the following even power and the Karatsuba "salt"
  # for the pair, advancing $Htbl by 0x30, i.e. H^1..H^8 in total.  Without
  # assembler AVX support the function is just a jump to the PCLMULQDQ
  # setup code at .L_init_clmul.
  #
  # The entry point carries `endbranch`, like every other exported entry
  # point in this file, so it remains a valid indirect-call target when CET
  # indirect-branch tracking is enabled.
  $code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
	endbranch
___
  if ($avx) {
  my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

  # Win64: %xmm6 ($HK) is callee-saved.  '$' is escaped in the comment
  # below: unescaped, "$0" would interpolate the script name.
  $code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	\$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
  $code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

  # clmul64x64_avx(XHI, XI, HKEY [, HK]): AVX counterpart of clmul64x64_T2.
  # Appends a three-multiplication carry-less multiply of XI by HKEY, result
  # in XHI:XI.  HK supplies the operand of the middle multiplication; when
  # omitted it is derived in $T2.  Clobbers $T1 and $T2.
  sub clmul64x64_avx {
  my ($Xhi,$Xi,$Hkey,$HK)=@_;

  if (!defined($HK)) {	$HK = $T2;
  $code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
  } else {
  $code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
  }
  $code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
  }

  # reduction_avx(XHI, XI): AVX counterpart of reduction_alg9.  Appends the
  # two-phase reduction of XHI:XI into XI.  Clobbers $T1 and $T2.
  sub reduction_avx {
  my ($Xhi,$Xi) = @_;

  $code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
  }

  # Loop body.  The first pass enters at .Linit_start_avx, so the "salt"
  # store at the loop head always belongs to the pair written by the
  # previous pass (hence the -0x10 offset after $Htbl has been advanced).
  $code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
  $code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
  $code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
  $code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
  $code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
  } else {
  # No AVX-capable assembler: fall back to the PCLMULQDQ setup.  The jump is
  # direct, so its target needs no endbranch of its own.
  $code.=<<___;
	jmp	.L_init_clmul
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
  }
  # gcm_gmult_avx is emitted unconditionally and has no AVX-specific body:
  # after the CFI/endbranch preamble it jumps straight to .L_gmult_clmul,
  # a label defined outside this section.
  1049. $code.=<<___;
  1050. .globl gcm_gmult_avx
  1051. .type gcm_gmult_avx,\@abi-omnipotent
  1052. .align 32
  1053. gcm_gmult_avx:
  1054. .cfi_startproc
  1055. endbranch
  1056. jmp .L_gmult_clmul
  1057. .cfi_endproc
  1058. .size gcm_gmult_avx,.-gcm_gmult_avx
  1059. ___
  # gcm_ghash_avx: the symbol, alignment and CFI/endbranch preamble are
  # emitted unconditionally; the body that follows depends on $avx.
  1060. $code.=<<___;
  1061. .globl gcm_ghash_avx
  1062. .type gcm_ghash_avx,\@abi-omnipotent
  1063. .align 32
  1064. gcm_ghash_avx:
  1065. .cfi_startproc
  1066. endbranch
  1067. ___
  1068. if ($avx) {
  # Argument registers come from @_4args (defined outside this section), in
  # the order: Xi pointer, H table pointer, input pointer, byte length.
  # The sixteen working names below map one-to-one onto %xmm0..%xmm15,
  # i.e. $Xlo is %xmm0 and $Ij is %xmm15.
  1069. my ($Xip,$Htbl,$inp,$len)=@_4args;
  1070. my ($Xlo,$Xhi,$Xmi,
  1071. $Zlo,$Zhi,$Zmi,
  1072. $Hkey,$HK,$T1,$T2,
  1073. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  # Win64 prologue: lowers %rsp by 0xa8 in total (lea -0x88(%rsp),%rax then
  # lea -0x20(%rax),%rsp) and saves %xmm6-%xmm15.  The instructions are
  # spelled as raw .byte sequences so that their encodings are fixed; the
  # intended mnemonic is given in the comment on each line.
  1074. $code.=<<___ if ($win64);
  1075. lea -0x88(%rsp),%rax
  1076. .LSEH_begin_gcm_ghash_avx:
  1077. # I can't trust assembler to use specific encoding:-(
  1078. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1079. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1080. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1081. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1082. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1083. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1084. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1085. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1086. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1087. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1088. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1089. ___
  # Main body.
  #   setup:  Xi is loaded from ($Xip) and byte-reversed with .Lbswap_mask;
  #           %r10 points at .L0x1c2_polynomial; $Htbl is biased by +0x40,
  #           which is why table displacements are written as "N-0x40".
  #   >= 0x80 bytes: the first eight blocks are multiplied by $Hkey^1..^8 up
  #           front; .Loop8x_avx then repeats while another 0x80 bytes
  #           remain, overlapping the loads and byte swaps of the next eight
  #           blocks with the Karatsuba post-processing and the two
  #           reduction phases of the previous result.
  #   .Lshort_avx: walks backwards from the end of the remaining input, one
  #           16-byte block per step (at most seven loads per pass), moving
  #           to the next $Hkey power each step.  $Zlo/$Zhi/$Zmi are seeded
  #           by copying $Xlo/$Xhi/$Xmi because every step XORs the X
  #           accumulators into the Z ones.
  #   .Ltail_avx: XORs Xi into the final block (.Ltail_no_xor_avx skips that
  #           because the 8x loop has already done it), completes the
  #           Karatsuba post-processing and the two-phase reduction, and
  #           jumps back to .Lshort_avx while $len is non-zero.
  #   exit:   Xi is byte-reversed again and stored back to ($Xip).
  1090. $code.=<<___;
  1091. vzeroupper
  1092. vmovdqu ($Xip),$Xi # load $Xi
  1093. lea .L0x1c2_polynomial(%rip),%r10
  1094. lea 0x40($Htbl),$Htbl # size optimization
  1095. vmovdqu .Lbswap_mask(%rip),$bswap
  1096. vpshufb $bswap,$Xi,$Xi
  1097. cmp \$0x80,$len
  1098. jb .Lshort_avx
  1099. sub \$0x80,$len
  1100. vmovdqu 0x70($inp),$Ii # I[7]
  1101. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1102. vpshufb $bswap,$Ii,$Ii
  1103. vmovdqu 0x20-0x40($Htbl),$HK
  1104. vpunpckhqdq $Ii,$Ii,$T2
  1105. vmovdqu 0x60($inp),$Ij # I[6]
  1106. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1107. vpxor $Ii,$T2,$T2
  1108. vpshufb $bswap,$Ij,$Ij
  1109. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1110. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1111. vpunpckhqdq $Ij,$Ij,$T1
  1112. vmovdqu 0x50($inp),$Ii # I[5]
  1113. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1114. vpxor $Ij,$T1,$T1
  1115. vpshufb $bswap,$Ii,$Ii
  1116. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1117. vpunpckhqdq $Ii,$Ii,$T2
  1118. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1119. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1120. vpxor $Ii,$T2,$T2
  1121. vmovdqu 0x40($inp),$Ij # I[4]
  1122. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1123. vmovdqu 0x50-0x40($Htbl),$HK
  1124. vpshufb $bswap,$Ij,$Ij
  1125. vpxor $Xlo,$Zlo,$Zlo
  1126. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1127. vpxor $Xhi,$Zhi,$Zhi
  1128. vpunpckhqdq $Ij,$Ij,$T1
  1129. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1130. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1131. vpxor $Xmi,$Zmi,$Zmi
  1132. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1133. vpxor $Ij,$T1,$T1
  1134. vmovdqu 0x30($inp),$Ii # I[3]
  1135. vpxor $Zlo,$Xlo,$Xlo
  1136. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1137. vpxor $Zhi,$Xhi,$Xhi
  1138. vpshufb $bswap,$Ii,$Ii
  1139. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1140. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1141. vpxor $Zmi,$Xmi,$Xmi
  1142. vpunpckhqdq $Ii,$Ii,$T2
  1143. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1144. vmovdqu 0x80-0x40($Htbl),$HK
  1145. vpxor $Ii,$T2,$T2
  1146. vmovdqu 0x20($inp),$Ij # I[2]
  1147. vpxor $Xlo,$Zlo,$Zlo
  1148. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1149. vpxor $Xhi,$Zhi,$Zhi
  1150. vpshufb $bswap,$Ij,$Ij
  1151. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1152. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1153. vpxor $Xmi,$Zmi,$Zmi
  1154. vpunpckhqdq $Ij,$Ij,$T1
  1155. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1156. vpxor $Ij,$T1,$T1
  1157. vmovdqu 0x10($inp),$Ii # I[1]
  1158. vpxor $Zlo,$Xlo,$Xlo
  1159. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1160. vpxor $Zhi,$Xhi,$Xhi
  1161. vpshufb $bswap,$Ii,$Ii
  1162. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1163. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1164. vpxor $Zmi,$Xmi,$Xmi
  1165. vpunpckhqdq $Ii,$Ii,$T2
  1166. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1167. vmovdqu 0xb0-0x40($Htbl),$HK
  1168. vpxor $Ii,$T2,$T2
  1169. vmovdqu ($inp),$Ij # I[0]
  1170. vpxor $Xlo,$Zlo,$Zlo
  1171. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1172. vpxor $Xhi,$Zhi,$Zhi
  1173. vpshufb $bswap,$Ij,$Ij
  1174. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1175. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1176. vpxor $Xmi,$Zmi,$Zmi
  1177. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1178. lea 0x80($inp),$inp
  1179. cmp \$0x80,$len
  1180. jb .Ltail_avx
  1181. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1182. sub \$0x80,$len
  1183. jmp .Loop8x_avx
  1184. .align 32
  1185. .Loop8x_avx:
  1186. vpunpckhqdq $Ij,$Ij,$T1
  1187. vmovdqu 0x70($inp),$Ii # I[7]
  1188. vpxor $Xlo,$Zlo,$Zlo
  1189. vpxor $Ij,$T1,$T1
  1190. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1191. vpshufb $bswap,$Ii,$Ii
  1192. vpxor $Xhi,$Zhi,$Zhi
  1193. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1194. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1195. vpunpckhqdq $Ii,$Ii,$T2
  1196. vpxor $Xmi,$Zmi,$Zmi
  1197. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1198. vmovdqu 0x20-0x40($Htbl),$HK
  1199. vpxor $Ii,$T2,$T2
  1200. vmovdqu 0x60($inp),$Ij # I[6]
  1201. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1202. vpxor $Zlo,$Xi,$Xi # collect result
  1203. vpshufb $bswap,$Ij,$Ij
  1204. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1205. vxorps $Zhi,$Xo,$Xo
  1206. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1207. vpunpckhqdq $Ij,$Ij,$T1
  1208. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1209. vpxor $Zmi,$Tred,$Tred
  1210. vxorps $Ij,$T1,$T1
  1211. vmovdqu 0x50($inp),$Ii # I[5]
  1212. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1213. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1214. vpxor $Xo,$Tred,$Tred
  1215. vpslldq \$8,$Tred,$T2
  1216. vpxor $Xlo,$Zlo,$Zlo
  1217. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1218. vpsrldq \$8,$Tred,$Tred
  1219. vpxor $T2, $Xi, $Xi
  1220. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1221. vpshufb $bswap,$Ii,$Ii
  1222. vxorps $Tred,$Xo, $Xo
  1223. vpxor $Xhi,$Zhi,$Zhi
  1224. vpunpckhqdq $Ii,$Ii,$T2
  1225. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1226. vmovdqu 0x50-0x40($Htbl),$HK
  1227. vpxor $Ii,$T2,$T2
  1228. vpxor $Xmi,$Zmi,$Zmi
  1229. vmovdqu 0x40($inp),$Ij # I[4]
  1230. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1231. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1232. vpshufb $bswap,$Ij,$Ij
  1233. vpxor $Zlo,$Xlo,$Xlo
  1234. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1235. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1236. vpunpckhqdq $Ij,$Ij,$T1
  1237. vpxor $Zhi,$Xhi,$Xhi
  1238. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1239. vxorps $Ij,$T1,$T1
  1240. vpxor $Zmi,$Xmi,$Xmi
  1241. vmovdqu 0x30($inp),$Ii # I[3]
  1242. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1243. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1244. vpshufb $bswap,$Ii,$Ii
  1245. vpxor $Xlo,$Zlo,$Zlo
  1246. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1247. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1248. vpunpckhqdq $Ii,$Ii,$T2
  1249. vpxor $Xhi,$Zhi,$Zhi
  1250. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1251. vmovdqu 0x80-0x40($Htbl),$HK
  1252. vpxor $Ii,$T2,$T2
  1253. vpxor $Xmi,$Zmi,$Zmi
  1254. vmovdqu 0x20($inp),$Ij # I[2]
  1255. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1256. vpshufb $bswap,$Ij,$Ij
  1257. vpxor $Zlo,$Xlo,$Xlo
  1258. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1259. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1260. vpunpckhqdq $Ij,$Ij,$T1
  1261. vpxor $Zhi,$Xhi,$Xhi
  1262. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1263. vpxor $Ij,$T1,$T1
  1264. vpxor $Zmi,$Xmi,$Xmi
  1265. vxorps $Tred,$Xi,$Xi
  1266. vmovdqu 0x10($inp),$Ii # I[1]
  1267. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1268. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1269. vpshufb $bswap,$Ii,$Ii
  1270. vpxor $Xlo,$Zlo,$Zlo
  1271. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1272. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1273. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1274. vxorps $Xo,$Tred,$Tred
  1275. vpunpckhqdq $Ii,$Ii,$T2
  1276. vpxor $Xhi,$Zhi,$Zhi
  1277. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1278. vmovdqu 0xb0-0x40($Htbl),$HK
  1279. vpxor $Ii,$T2,$T2
  1280. vpxor $Xmi,$Zmi,$Zmi
  1281. vmovdqu ($inp),$Ij # I[0]
  1282. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1283. vpshufb $bswap,$Ij,$Ij
  1284. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1285. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1286. vpxor $Tred,$Ij,$Ij
  1287. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1288. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1289. lea 0x80($inp),$inp
  1290. sub \$0x80,$len
  1291. jnc .Loop8x_avx
  1292. add \$0x80,$len
  1293. jmp .Ltail_no_xor_avx
  1294. .align 32
  1295. .Lshort_avx:
  1296. vmovdqu -0x10($inp,$len),$Ii # very last word
  1297. lea ($inp,$len),$inp
  1298. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1299. vmovdqu 0x20-0x40($Htbl),$HK
  1300. vpshufb $bswap,$Ii,$Ij
  1301. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1302. vmovdqa $Xhi,$Zhi # $Zhi and
  1303. vmovdqa $Xmi,$Zmi # $Zmi
  1304. sub \$0x10,$len
  1305. jz .Ltail_avx
  1306. vpunpckhqdq $Ij,$Ij,$T1
  1307. vpxor $Xlo,$Zlo,$Zlo
  1308. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1309. vpxor $Ij,$T1,$T1
  1310. vmovdqu -0x20($inp),$Ii
  1311. vpxor $Xhi,$Zhi,$Zhi
  1312. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1313. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1314. vpshufb $bswap,$Ii,$Ij
  1315. vpxor $Xmi,$Zmi,$Zmi
  1316. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1317. vpsrldq \$8,$HK,$HK
  1318. sub \$0x10,$len
  1319. jz .Ltail_avx
  1320. vpunpckhqdq $Ij,$Ij,$T1
  1321. vpxor $Xlo,$Zlo,$Zlo
  1322. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1323. vpxor $Ij,$T1,$T1
  1324. vmovdqu -0x30($inp),$Ii
  1325. vpxor $Xhi,$Zhi,$Zhi
  1326. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1327. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1328. vpshufb $bswap,$Ii,$Ij
  1329. vpxor $Xmi,$Zmi,$Zmi
  1330. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1331. vmovdqu 0x50-0x40($Htbl),$HK
  1332. sub \$0x10,$len
  1333. jz .Ltail_avx
  1334. vpunpckhqdq $Ij,$Ij,$T1
  1335. vpxor $Xlo,$Zlo,$Zlo
  1336. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1337. vpxor $Ij,$T1,$T1
  1338. vmovdqu -0x40($inp),$Ii
  1339. vpxor $Xhi,$Zhi,$Zhi
  1340. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1341. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1342. vpshufb $bswap,$Ii,$Ij
  1343. vpxor $Xmi,$Zmi,$Zmi
  1344. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1345. vpsrldq \$8,$HK,$HK
  1346. sub \$0x10,$len
  1347. jz .Ltail_avx
  1348. vpunpckhqdq $Ij,$Ij,$T1
  1349. vpxor $Xlo,$Zlo,$Zlo
  1350. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1351. vpxor $Ij,$T1,$T1
  1352. vmovdqu -0x50($inp),$Ii
  1353. vpxor $Xhi,$Zhi,$Zhi
  1354. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1355. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1356. vpshufb $bswap,$Ii,$Ij
  1357. vpxor $Xmi,$Zmi,$Zmi
  1358. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1359. vmovdqu 0x80-0x40($Htbl),$HK
  1360. sub \$0x10,$len
  1361. jz .Ltail_avx
  1362. vpunpckhqdq $Ij,$Ij,$T1
  1363. vpxor $Xlo,$Zlo,$Zlo
  1364. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1365. vpxor $Ij,$T1,$T1
  1366. vmovdqu -0x60($inp),$Ii
  1367. vpxor $Xhi,$Zhi,$Zhi
  1368. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1369. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1370. vpshufb $bswap,$Ii,$Ij
  1371. vpxor $Xmi,$Zmi,$Zmi
  1372. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1373. vpsrldq \$8,$HK,$HK
  1374. sub \$0x10,$len
  1375. jz .Ltail_avx
  1376. vpunpckhqdq $Ij,$Ij,$T1
  1377. vpxor $Xlo,$Zlo,$Zlo
  1378. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1379. vpxor $Ij,$T1,$T1
  1380. vmovdqu -0x70($inp),$Ii
  1381. vpxor $Xhi,$Zhi,$Zhi
  1382. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1383. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1384. vpshufb $bswap,$Ii,$Ij
  1385. vpxor $Xmi,$Zmi,$Zmi
  1386. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1387. vmovq 0xb8-0x40($Htbl),$HK
  1388. sub \$0x10,$len
  1389. jmp .Ltail_avx
  1390. .align 32
  1391. .Ltail_avx:
  1392. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1393. .Ltail_no_xor_avx:
  1394. vpunpckhqdq $Ij,$Ij,$T1
  1395. vpxor $Xlo,$Zlo,$Zlo
  1396. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1397. vpxor $Ij,$T1,$T1
  1398. vpxor $Xhi,$Zhi,$Zhi
  1399. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1400. vpxor $Xmi,$Zmi,$Zmi
  1401. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1402. vmovdqu (%r10),$Tred
  1403. vpxor $Xlo,$Zlo,$Xi
  1404. vpxor $Xhi,$Zhi,$Xo
  1405. vpxor $Xmi,$Zmi,$Zmi
  1406. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1407. vpxor $Xo, $Zmi,$Zmi
  1408. vpslldq \$8, $Zmi,$T2
  1409. vpsrldq \$8, $Zmi,$Zmi
  1410. vpxor $T2, $Xi, $Xi
  1411. vpxor $Zmi,$Xo, $Xo
  1412. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1413. vpalignr \$8,$Xi,$Xi,$Xi
  1414. vpxor $T2,$Xi,$Xi
  1415. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1416. vpalignr \$8,$Xi,$Xi,$Xi
  1417. vpxor $Xo,$Xi,$Xi
  1418. vpxor $T2,$Xi,$Xi
  1419. cmp \$0,$len
  1420. jne .Lshort_avx
  1421. vpshufb $bswap,$Xi,$Xi
  1422. vmovdqu $Xi,($Xip)
  1423. vzeroupper
  1424. ___
  # Win64 epilogue: reload %xmm6-%xmm15 from the frame and release the 0xa8
  # bytes reserved by the prologue above.
  1425. $code.=<<___ if ($win64);
  1426. movaps (%rsp),%xmm6
  1427. movaps 0x10(%rsp),%xmm7
  1428. movaps 0x20(%rsp),%xmm8
  1429. movaps 0x30(%rsp),%xmm9
  1430. movaps 0x40(%rsp),%xmm10
  1431. movaps 0x50(%rsp),%xmm11
  1432. movaps 0x60(%rsp),%xmm12
  1433. movaps 0x70(%rsp),%xmm13
  1434. movaps 0x80(%rsp),%xmm14
  1435. movaps 0x90(%rsp),%xmm15
  1436. lea 0xa8(%rsp),%rsp
  1437. .LSEH_end_gcm_ghash_avx:
  1438. ___
  # Common return and symbol-size bookkeeping for the AVX implementation.
  1439. $code.=<<___;
  1440. ret
  1441. .cfi_endproc
  1442. .size gcm_ghash_avx,.-gcm_ghash_avx
  1443. ___
  1444. } else {
  # $avx is false: gcm_ghash_avx still exists as a symbol but only jumps to
  # .L_ghash_clmul, a label defined outside this section.
  1445. $code.=<<___;
  1446. jmp .L_ghash_clmul
  1447. .cfi_endproc
  1448. .size gcm_ghash_avx,.-gcm_ghash_avx
  1449. ___
  1450. }
  # Shared constants and lookup tables, emitted once after all routines:
  #   .Lbswap_mask        byte indices 15..0; used with vpshufb in
  #                       gcm_ghash_avx to reverse the byte order of a block
  #   .L0x1c2_polynomial  16-byte constant (low byte 1, high byte 0xc2) that
  #                       gcm_ghash_avx addresses through %r10 for reduction
  #   .L7_mask, .L7_mask_poly
  #                       not referenced in this section of the file
  #   .Lrem_4bit          16 entries of 8 bytes (128 bytes)
  #   .Lrem_8bit          256 entries of 2 bytes (512 bytes)
  # NOTE(review): the two .Lrem tables are presumably the 128-byte and
  # 512-byte "shared tables" mentioned in the file header -- confirm against
  # the 4-bit routines.
  # Backtick-quoted expressions such as `0x1C20<<16` are not assembler
  # syntax; they are evaluated as Perl by the substitution at the end of the
  # script before the text is printed.
  1451. $code.=<<___;
  1452. .align 64
  1453. .Lbswap_mask:
  1454. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1455. .L0x1c2_polynomial:
  1456. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1457. .L7_mask:
  1458. .long 7,0,7,0
  1459. .L7_mask_poly:
  1460. .long 7,0,`0xE1<<1`,0
  1461. .align 64
  1462. .type .Lrem_4bit,\@object
  1463. .Lrem_4bit:
  1464. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1465. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1466. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1467. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1468. .type .Lrem_8bit,\@object
  1469. .Lrem_8bit:
  1470. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1471. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1472. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1473. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1474. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1475. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1476. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1477. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1478. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1479. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1480. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1481. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1482. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1483. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1484. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1485. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1486. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1487. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1488. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1489. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1490. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1491. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1492. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1493. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1494. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1495. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1496. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1497. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1498. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1499. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1500. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1501. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1502. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1503. .align 64
  1504. ___
  1505. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1506. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1507. if ($win64) {
  # Win64 structured-exception support; nothing here is emitted otherwise.
  #
  # The four names below are the registers carrying the handler's arguments
  # in the order given in the prototype above.  Only $context and $disp are
  # interpolated into the emitted code; $rec and $frame are not referenced
  # anywhere in this block.
  #
  # Flow of the emitted se_handler:
  #   1. save the callee-saved registers and flags, reserve 64 bytes;
  #   2. read context->Rax and context->Rip; HandlerData[0] and
  #      HandlerData[1] are image-relative addresses of the prologue and
  #      epilogue labels of the faulting routine;
  #   3. if Rip lies between those labels, take context->Rsp, step over the
  #      routine's frame (48+280 bytes) and copy the rbx/rbp/r12-r15 values
  #      stored just below that point back into the CONTEXT; otherwise go
  #      straight to .Lin_prologue with the value already in %rax;
  #   4. at .Lin_prologue, write back Rsp/Rsi/Rdi, copy the CONTEXT
  #      (1232 bytes, as 1232/8 quadwords) to disp->ContextRecord, call
  #      RtlVirtualUnwind and return 1 (ExceptionContinueSearch).
  1508. $rec="%rcx";
  1509. $frame="%rdx";
  1510. $context="%r8";
  1511. $disp="%r9";
  1512. $code.=<<___;
  1513. .extern __imp_RtlVirtualUnwind
  1514. .type se_handler,\@abi-omnipotent
  1515. .align 16
  1516. se_handler:
  1517. push %rsi
  1518. push %rdi
  1519. push %rbx
  1520. push %rbp
  1521. push %r12
  1522. push %r13
  1523. push %r14
  1524. push %r15
  1525. pushfq
  1526. sub \$64,%rsp
  1527. mov 120($context),%rax # pull context->Rax
  1528. mov 248($context),%rbx # pull context->Rip
  1529. mov 8($disp),%rsi # disp->ImageBase
  1530. mov 56($disp),%r11 # disp->HandlerData
  1531. mov 0(%r11),%r10d # HandlerData[0]
  1532. lea (%rsi,%r10),%r10 # prologue label
  1533. cmp %r10,%rbx # context->Rip<prologue label
  1534. jb .Lin_prologue
  1535. mov 152($context),%rax # pull context->Rsp
  1536. mov 4(%r11),%r10d # HandlerData[1]
  1537. lea (%rsi,%r10),%r10 # epilogue label
  1538. cmp %r10,%rbx # context->Rip>=epilogue label
  1539. jae .Lin_prologue
  1540. lea 48+280(%rax),%rax # adjust "rsp"
  1541. mov -8(%rax),%rbx
  1542. mov -16(%rax),%rbp
  1543. mov -24(%rax),%r12
  1544. mov -32(%rax),%r13
  1545. mov -40(%rax),%r14
  1546. mov -48(%rax),%r15
  1547. mov %rbx,144($context) # restore context->Rbx
  1548. mov %rbp,160($context) # restore context->Rbp
  1549. mov %r12,216($context) # restore context->R12
  1550. mov %r13,224($context) # restore context->R13
  1551. mov %r14,232($context) # restore context->R14
  1552. mov %r15,240($context) # restore context->R15
  1553. .Lin_prologue:
  1554. mov 8(%rax),%rdi
  1555. mov 16(%rax),%rsi
  1556. mov %rax,152($context) # restore context->Rsp
  1557. mov %rsi,168($context) # restore context->Rsi
  1558. mov %rdi,176($context) # restore context->Rdi
  1559. mov 40($disp),%rdi # disp->ContextRecord
  1560. mov $context,%rsi # context
  1561. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1562. .long 0xa548f3fc # cld; rep movsq
  1563. mov $disp,%rsi
  1564. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1565. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1566. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1567. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1568. mov 40(%rsi),%r10 # disp->ContextRecord
  1569. lea 56(%rsi),%r11 # &disp->HandlerData
  1570. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1571. mov %r10,32(%rsp) # arg5
  1572. mov %r11,40(%rsp) # arg6
  1573. mov %r12,48(%rsp) # arg7
  1574. mov %rcx,56(%rsp) # arg8, (NULL)
  1575. call *__imp_RtlVirtualUnwind(%rip)
  1576. mov \$1,%eax # ExceptionContinueSearch
  1577. add \$64,%rsp
  1578. popfq
  1579. pop %r15
  1580. pop %r14
  1581. pop %r13
  1582. pop %r12
  1583. pop %rbp
  1584. pop %rbx
  1585. pop %rdi
  1586. pop %rsi
  1587. ret
  1588. .size se_handler,.-se_handler
  1589. .section .pdata
  1590. .align 4
  1591. .rva .LSEH_begin_gcm_gmult_4bit
  1592. .rva .LSEH_end_gcm_gmult_4bit
  1593. .rva .LSEH_info_gcm_gmult_4bit
  1594. .rva .LSEH_begin_gcm_ghash_4bit
  1595. .rva .LSEH_end_gcm_ghash_4bit
  1596. .rva .LSEH_info_gcm_ghash_4bit
  1597. .rva .LSEH_begin_gcm_init_clmul
  1598. .rva .LSEH_end_gcm_init_clmul
  1599. .rva .LSEH_info_gcm_init_clmul
  1600. .rva .LSEH_begin_gcm_ghash_clmul
  1601. .rva .LSEH_end_gcm_ghash_clmul
  1602. .rva .LSEH_info_gcm_ghash_clmul
  1603. ___
  # The AVX routines get their own begin/end ranges but reuse the clmul
  # unwind descriptors.  The frames visible in this file agree with them:
  # gcm_init_avx releases 0x18 bytes after reloading %xmm6, and
  # gcm_ghash_avx uses a 0xa8-byte frame holding %xmm6-%xmm15.
  1604. $code.=<<___ if ($avx);
  1605. .rva .LSEH_begin_gcm_init_avx
  1606. .rva .LSEH_end_gcm_init_avx
  1607. .rva .LSEH_info_gcm_init_clmul
  1608. .rva .LSEH_begin_gcm_ghash_avx
  1609. .rva .LSEH_end_gcm_ghash_avx
  1610. .rva .LSEH_info_gcm_ghash_clmul
  1611. ___
  # .xdata: the two 4bit descriptors name se_handler and pass a
  # [prologue label, epilogue label] pair as HandlerData (read by the
  # handler as HandlerData[0] and HandlerData[1]).  The two clmul
  # descriptors carry no handler, only unwind codes; the operation each
  # code describes is given in its trailing comment.
  1612. $code.=<<___;
  1613. .section .xdata
  1614. .align 8
  1615. .LSEH_info_gcm_gmult_4bit:
  1616. .byte 9,0,0,0
  1617. .rva se_handler
  1618. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1619. .LSEH_info_gcm_ghash_4bit:
  1620. .byte 9,0,0,0
  1621. .rva se_handler
  1622. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1623. .LSEH_info_gcm_init_clmul:
  1624. .byte 0x01,0x08,0x03,0x00
  1625. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1626. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1627. .LSEH_info_gcm_ghash_clmul:
  1628. .byte 0x01,0x33,0x16,0x00
  1629. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1630. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1631. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1632. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1633. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1634. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1635. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1636. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1637. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1638. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1639. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1640. ___
  1641. }
  # Final pass over the accumulated assembly text: every `...` span is
  # evaluated as a Perl expression and replaced by its value (this is what
  # turns `1232/8` or `0x1C20<<16` into plain numbers).  The result is then
  # written to STDOUT, and a failing close is fatal so that a write error
  # cannot pass silently.
  1642. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1643. print $code;
  1644. close STDOUT or die "error closing STDOUT: $!";