#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. The GHASH function also features a so-called "528B" variant
# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
# shared table]. A short sketch of the table-driven scheme follows the
# figures below. Performance results are for the streamed GHASH
# subroutine and are expressed in cycles per processed byte, less is
# better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as for Opteron;
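#
# A rough sketch of the table-driven scheme (informational only; the
# exact bit order and indexing are those set up by gcm_init_4bit in
# crypto/modes/gcm128.c): the 256-byte per-key table holds the sixteen
# 128-bit multiples of the hash key H indexed by a 4-bit nibble, and
# the 128-byte shared table (.Lrem_4bit below) holds the reduction
# constants for the bits shifted out at each step. The hash value Z is
# then folded in one nibble at a time, roughly:
#
#	rem = Z & 0xf			# bits about to drop out
#	Z   = (Z >> 4) ^ rem_4bit[rem]	# shift and reduce
#	Z  ^= Htbl[nibble]		# table lookup
#
# so one table lookup and a couple of xors replace four conditional
# shift-and-xor iterations of the bit-by-bit multiplication.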
#
# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.
#
# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficient), but not for Bulldozer, whose
# logical SIMD operations are twice as slow, making the critical path
# longer. A CPU with a higher pclmulqdq issue rate would also benefit
# from a higher aggregate factor...
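#
# For reference, an "aggregate factor" of n means that n input blocks
# are multiplied by successive powers of H and the result is reduced
# only once, as spelled out for n=4 further down in this file:
#
#	Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
# i.e. one reduction and one aggregated Karatsuba post-processing pass
# are amortized over n independent pclmulqdq triplets.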
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%)	(if system doesn't support AVX)
# Broadwell	0.45(+110%)	(if system doesn't support AVX)
# Skylake	0.44(+110%)	(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)		(if system doesn't support AVX)
# Goldmont	1.08(+24%)
#
# March 2013
#
# ... the 8x aggregate factor AVX code path uses the reduction
# algorithm suggested by Shay Gueron[1]. Even though contemporary
# AVX-capable CPUs such as Sandy Bridge and Ivy Bridge can execute it,
# the code performs sub-optimally on them in comparison to the above
# mentioned version. But thanks to Ilya Albrekht and Max Locktyukhin
# of Intel Corp. we know that it performs at 0.41 cycles per byte on
# a Haswell processor, 0.29 on Broadwell, and 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  88. $flavour = shift;
  89. $output = shift;
  90. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  91. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  92. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  93. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  94. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  95. die "can't locate x86_64-xlate.pl";
  96. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  97. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  98. $avx = ($1>=2.20) + ($1>=2.22);
  99. }
  100. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  101. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  102. $avx = ($1>=2.09) + ($1>=2.10);
  103. }
  104. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  105. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  106. $avx = ($1>=10) + ($1>=11);
  107. }
  108. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  109. $avx = ($2>=3.0) + ($2>3.0);
  110. }
  111. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  112. *STDOUT=*OUT;
  113. $do4xaggr=1;
  114. # common register layout
  115. $nlo="%rax";
  116. $nhi="%rbx";
  117. $Zlo="%r8";
  118. $Zhi="%r9";
  119. $tmp="%r10";
  120. $rem_4bit = "%r11";
  121. $Xi="%rdi";
  122. $Htbl="%rsi";
  123. # per-function register layout
  124. $cnt="%rcx";
  125. $rem="%rdx";
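# LB() maps a register name to its low-byte alias, e.g. %rax -> %al,
# %r10 -> %r10b.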
  126. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  127. $r =~ s/%[er]([sd]i)/%\1l/ or
  128. $r =~ s/%[er](bp)/%\1l/ or
  129. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
  130. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  131. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  132. my $arg = pop;
  133. $arg = "\$$arg" if ($arg*1 eq $arg);
  134. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  135. }
  136. { my $N;
  137. sub loop() {
  138. my $inp = shift;
  139. $N++;
  140. $code.=<<___;
  141. xor $nlo,$nlo
  142. xor $nhi,$nhi
  143. mov `&LB("$Zlo")`,`&LB("$nlo")`
  144. mov `&LB("$Zlo")`,`&LB("$nhi")`
  145. shl \$4,`&LB("$nlo")`
  146. mov \$14,$cnt
  147. mov 8($Htbl,$nlo),$Zlo
  148. mov ($Htbl,$nlo),$Zhi
  149. and \$0xf0,`&LB("$nhi")`
  150. mov $Zlo,$rem
  151. jmp .Loop$N
  152. .align 16
  153. .Loop$N:
  154. shr \$4,$Zlo
  155. and \$0xf,$rem
  156. mov $Zhi,$tmp
  157. mov ($inp,$cnt),`&LB("$nlo")`
  158. shr \$4,$Zhi
  159. xor 8($Htbl,$nhi),$Zlo
  160. shl \$60,$tmp
  161. xor ($Htbl,$nhi),$Zhi
  162. mov `&LB("$nlo")`,`&LB("$nhi")`
  163. xor ($rem_4bit,$rem,8),$Zhi
  164. mov $Zlo,$rem
  165. shl \$4,`&LB("$nlo")`
  166. xor $tmp,$Zlo
  167. dec $cnt
  168. js .Lbreak$N
  169. shr \$4,$Zlo
  170. and \$0xf,$rem
  171. mov $Zhi,$tmp
  172. shr \$4,$Zhi
  173. xor 8($Htbl,$nlo),$Zlo
  174. shl \$60,$tmp
  175. xor ($Htbl,$nlo),$Zhi
  176. and \$0xf0,`&LB("$nhi")`
  177. xor ($rem_4bit,$rem,8),$Zhi
  178. mov $Zlo,$rem
  179. xor $tmp,$Zlo
  180. jmp .Loop$N
  181. .align 16
  182. .Lbreak$N:
  183. shr \$4,$Zlo
  184. and \$0xf,$rem
  185. mov $Zhi,$tmp
  186. shr \$4,$Zhi
  187. xor 8($Htbl,$nlo),$Zlo
  188. shl \$60,$tmp
  189. xor ($Htbl,$nlo),$Zhi
  190. and \$0xf0,`&LB("$nhi")`
  191. xor ($rem_4bit,$rem,8),$Zhi
  192. mov $Zlo,$rem
  193. xor $tmp,$Zlo
  194. shr \$4,$Zlo
  195. and \$0xf,$rem
  196. mov $Zhi,$tmp
  197. shr \$4,$Zhi
  198. xor 8($Htbl,$nhi),$Zlo
  199. shl \$60,$tmp
  200. xor ($Htbl,$nhi),$Zhi
  201. xor $tmp,$Zlo
  202. xor ($rem_4bit,$rem,8),$Zhi
  203. bswap $Zlo
  204. bswap $Zhi
  205. ___
  206. }}
  207. $code=<<___;
  208. .text
  209. .extern OPENSSL_ia32cap_P
  210. .globl gcm_gmult_4bit
  211. .type gcm_gmult_4bit,\@function,2
  212. .align 16
  213. gcm_gmult_4bit:
  214. .cfi_startproc
  215. push %rbx
  216. .cfi_push %rbx
  217. push %rbp # %rbp and others are pushed exclusively in
  218. .cfi_push %rbp
  219. push %r12 # order to reuse Win64 exception handler...
  220. .cfi_push %r12
  221. push %r13
  222. .cfi_push %r13
  223. push %r14
  224. .cfi_push %r14
  225. push %r15
  226. .cfi_push %r15
  227. sub \$280,%rsp
  228. .cfi_adjust_cfa_offset 280
  229. .Lgmult_prologue:
  230. movzb 15($Xi),$Zlo
  231. lea .Lrem_4bit(%rip),$rem_4bit
  232. ___
  233. &loop ($Xi);
  234. $code.=<<___;
  235. mov $Zlo,8($Xi)
  236. mov $Zhi,($Xi)
  237. lea 280+48(%rsp),%rsi
  238. .cfi_def_cfa %rsi,8
  239. mov -8(%rsi),%rbx
  240. .cfi_restore %rbx
  241. lea (%rsi),%rsp
  242. .cfi_def_cfa_register %rsp
  243. .Lgmult_epilogue:
  244. ret
  245. .cfi_endproc
  246. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  247. ___
  248. # per-function register layout
  249. $inp="%rdx";
  250. $len="%rcx";
  251. $rem_8bit=$rem_4bit;
  252. $code.=<<___;
  253. .globl gcm_ghash_4bit
  254. .type gcm_ghash_4bit,\@function,4
  255. .align 16
  256. gcm_ghash_4bit:
  257. .cfi_startproc
  258. push %rbx
  259. .cfi_push %rbx
  260. push %rbp
  261. .cfi_push %rbp
  262. push %r12
  263. .cfi_push %r12
  264. push %r13
  265. .cfi_push %r13
  266. push %r14
  267. .cfi_push %r14
  268. push %r15
  269. .cfi_push %r15
  270. sub \$280,%rsp
  271. .cfi_adjust_cfa_offset 280
  272. .Lghash_prologue:
  273. mov $inp,%r14 # reassign couple of args
  274. mov $len,%r15
  275. ___
  276. { my $inp="%r14";
  277. my $dat="%edx";
  278. my $len="%r15";
  279. my @nhi=("%ebx","%ecx");
  280. my @rem=("%r12","%r13");
  281. my $Hshr4="%rbp";
  282. &sub ($Htbl,-128); # size optimization
  283. &lea ($Hshr4,"16+128(%rsp)");
  284. { my @lo =($nlo,$nhi);
  285. my @hi =($Zlo,$Zhi);
  286. &xor ($dat,$dat);
  287. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  288. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  289. &or ($lo[0],$tmp) if ($i>1);
  290. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  291. &shr ($lo[1],4) if ($i>0 && $i<17);
  292. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  293. &shr ($hi[1],4) if ($i>0 && $i<17);
  294. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  295. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  296. &shl (&LB($dat),4) if ($i>0 && $i<17);
  297. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  298. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  299. &shl ($tmp,60) if ($i>0 && $i<17);
  300. push (@lo,shift(@lo));
  301. push (@hi,shift(@hi));
  302. }
  303. }
  304. &add ($Htbl,-128);
  305. &mov ($Zlo,"8($Xi)");
  306. &mov ($Zhi,"0($Xi)");
  307. &add ($len,$inp); # pointer to the end of data
  308. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  309. &jmp (".Louter_loop");
  310. $code.=".align 16\n.Louter_loop:\n";
  311. &xor ($Zhi,"($inp)");
  312. &mov ("%rdx","8($inp)");
  313. &lea ($inp,"16($inp)");
  314. &xor ("%rdx",$Zlo);
  315. &mov ("($Xi)",$Zhi);
  316. &mov ("8($Xi)","%rdx");
  317. &shr ("%rdx",32);
  318. &xor ($nlo,$nlo);
  319. &rol ($dat,8);
  320. &mov (&LB($nlo),&LB($dat));
  321. &movz ($nhi[0],&LB($dat));
  322. &shl (&LB($nlo),4);
  323. &shr ($nhi[0],4);
  324. for ($j=11,$i=0;$i<15;$i++) {
  325. &rol ($dat,8);
  326. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  327. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  328. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  329. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  330. &mov (&LB($nlo),&LB($dat));
  331. &xor ($Zlo,$tmp) if ($i>0);
  332. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  333. &movz ($nhi[1],&LB($dat));
  334. &shl (&LB($nlo),4);
  335. &movzb ($rem[0],"(%rsp,$nhi[0])");
  336. &shr ($nhi[1],4) if ($i<14);
  337. &and ($nhi[1],0xf0) if ($i==14);
  338. &shl ($rem[1],48) if ($i>0);
  339. &xor ($rem[0],$Zlo);
  340. &mov ($tmp,$Zhi);
  341. &xor ($Zhi,$rem[1]) if ($i>0);
  342. &shr ($Zlo,8);
  343. &movz ($rem[0],&LB($rem[0]));
  344. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  345. &shr ($Zhi,8);
  346. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  347. &shl ($tmp,56);
  348. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  349. unshift (@nhi,pop(@nhi)); # "rotate" registers
  350. unshift (@rem,pop(@rem));
  351. }
  352. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  353. &xor ($Zlo,"8($Htbl,$nlo)");
  354. &xor ($Zhi,"($Htbl,$nlo)");
  355. &shl ($rem[1],48);
  356. &xor ($Zlo,$tmp);
  357. &xor ($Zhi,$rem[1]);
  358. &movz ($rem[0],&LB($Zlo));
  359. &shr ($Zlo,4);
  360. &mov ($tmp,$Zhi);
  361. &shl (&LB($rem[0]),4);
  362. &shr ($Zhi,4);
  363. &xor ($Zlo,"8($Htbl,$nhi[0])");
  364. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  365. &shl ($tmp,60);
  366. &xor ($Zhi,"($Htbl,$nhi[0])");
  367. &xor ($Zlo,$tmp);
  368. &shl ($rem[0],48);
  369. &bswap ($Zlo);
  370. &xor ($Zhi,$rem[0]);
  371. &bswap ($Zhi);
  372. &cmp ($inp,$len);
  373. &jb (".Louter_loop");
  374. }
  375. $code.=<<___;
  376. mov $Zlo,8($Xi)
  377. mov $Zhi,($Xi)
  378. lea 280+48(%rsp),%rsi
  379. .cfi_def_cfa %rsi,8
  380. mov -48(%rsi),%r15
  381. .cfi_restore %r15
  382. mov -40(%rsi),%r14
  383. .cfi_restore %r14
  384. mov -32(%rsi),%r13
  385. .cfi_restore %r13
  386. mov -24(%rsi),%r12
  387. .cfi_restore %r12
  388. mov -16(%rsi),%rbp
  389. .cfi_restore %rbp
  390. mov -8(%rsi),%rbx
  391. .cfi_restore %rbx
  392. lea 0(%rsi),%rsp
  393. .cfi_def_cfa_register %rsp
  394. .Lghash_epilogue:
  395. ret
  396. .cfi_endproc
  397. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  398. ___
  399. ######################################################################
  400. # PCLMULQDQ version.
  401. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  402. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  403. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  404. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  405. sub clmul64x64_T2 { # minimal register pressure
  406. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  407. if (!defined($HK)) { $HK = $T2;
  408. $code.=<<___;
  409. movdqa $Xi,$Xhi #
  410. pshufd \$0b01001110,$Xi,$T1
  411. pshufd \$0b01001110,$Hkey,$T2
  412. pxor $Xi,$T1 #
  413. pxor $Hkey,$T2
  414. ___
  415. } else {
  416. $code.=<<___;
  417. movdqa $Xi,$Xhi #
  418. pshufd \$0b01001110,$Xi,$T1
  419. pxor $Xi,$T1 #
  420. ___
  421. }
  422. $code.=<<___;
  423. pclmulqdq \$0x00,$Hkey,$Xi #######
  424. pclmulqdq \$0x11,$Hkey,$Xhi #######
  425. pclmulqdq \$0x00,$HK,$T1 #######
  426. pxor $Xi,$T1 #
  427. pxor $Xhi,$T1 #
  428. movdqa $T1,$T2 #
  429. psrldq \$8,$T1
  430. pslldq \$8,$T2 #
  431. pxor $T1,$Xhi
  432. pxor $T2,$Xi #
  433. ___
  434. }
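#
# In other words, clmul64x64_T2 forms the carry-less 128x128-bit
# product Xhi:Xi = Xi*Hkey with one level of Karatsuba, i.e. three
# pclmulqdq instead of four 64x64 multiplications (informal sketch):
#
#	lo  = Xi.lo*H.lo
#	hi  = Xi.hi*H.hi
#	mid = (Xi.lo^Xi.hi)*(H.lo^H.hi) ^ lo ^ hi
#	Xhi:Xi = hi:lo ^ (mid<<64)
#
# $HK, when supplied, is the precomputed H.lo^H.hi "Karatsuba salt"
# saved by gcm_init_clmul at 0x20($Htbl).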
  435. sub reduction_alg9 { # 17/11 times faster than Intel version
  436. my ($Xhi,$Xi) = @_;
  437. $code.=<<___;
  438. # 1st phase
  439. movdqa $Xi,$T2 #
  440. movdqa $Xi,$T1
  441. psllq \$5,$Xi
  442. pxor $Xi,$T1 #
  443. psllq \$1,$Xi
  444. pxor $T1,$Xi #
  445. psllq \$57,$Xi #
  446. movdqa $Xi,$T1 #
  447. pslldq \$8,$Xi
  448. psrldq \$8,$T1 #
  449. pxor $T2,$Xi
  450. pxor $T1,$Xhi #
  451. # 2nd phase
  452. movdqa $Xi,$T2
  453. psrlq \$1,$Xi
  454. pxor $T2,$Xhi #
  455. pxor $Xi,$T2
  456. psrlq \$5,$Xi
  457. pxor $T2,$Xi #
  458. psrlq \$1,$Xi #
  459. pxor $Xhi,$Xi #
  460. ___
  461. }
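#
# reduction_alg9 folds the 256-bit Karatsuba result Xhi:Xi back into a
# 128-bit remainder modulo the GHASH polynomial x^128+x^7+x^2+x+1,
# using only shifts and xors in two phases. The representation is
# bit-reflected and carries the "<<1 twist" applied to H by
# gcm_init_clmul, which is why the reduction constant appears as
# 0x1c2 (= 0xE1<<1) in .L0x1c2_polynomial and .L7_mask_poly below.
# See ghash-x86.pl for background on the derivation.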
  462. { my ($Htbl,$Xip)=@_4args;
  463. my $HK="%xmm6";
  464. $code.=<<___;
  465. .globl gcm_init_clmul
  466. .type gcm_init_clmul,\@abi-omnipotent
  467. .align 16
  468. gcm_init_clmul:
  469. .L_init_clmul:
  470. ___
  471. $code.=<<___ if ($win64);
  472. .LSEH_begin_gcm_init_clmul:
  473. # I can't trust assembler to use specific encoding:-(
  474. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  475. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  476. ___
  477. $code.=<<___;
  478. movdqu ($Xip),$Hkey
  479. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  480. # <<1 twist
  481. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  482. movdqa $Hkey,$T1
  483. psllq \$1,$Hkey
  484. pxor $T3,$T3 #
  485. psrlq \$63,$T1
  486. pcmpgtd $T2,$T3 # broadcast carry bit
  487. pslldq \$8,$T1
  488. por $T1,$Hkey # H<<=1
  489. # magic reduction
  490. pand .L0x1c2_polynomial(%rip),$T3
  491. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  492. # calculate H^2
  493. pshufd \$0b01001110,$Hkey,$HK
  494. movdqa $Hkey,$Xi
  495. pxor $Hkey,$HK
  496. ___
  497. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  498. &reduction_alg9 ($Xhi,$Xi);
  499. $code.=<<___;
  500. pshufd \$0b01001110,$Hkey,$T1
  501. pshufd \$0b01001110,$Xi,$T2
  502. pxor $Hkey,$T1 # Karatsuba pre-processing
  503. movdqu $Hkey,0x00($Htbl) # save H
  504. pxor $Xi,$T2 # Karatsuba pre-processing
  505. movdqu $Xi,0x10($Htbl) # save H^2
  506. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  507. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  508. ___
  509. if ($do4xaggr) {
  510. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  511. &reduction_alg9 ($Xhi,$Xi);
  512. $code.=<<___;
  513. movdqa $Xi,$T3
  514. ___
  515. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  516. &reduction_alg9 ($Xhi,$Xi);
  517. $code.=<<___;
  518. pshufd \$0b01001110,$T3,$T1
  519. pshufd \$0b01001110,$Xi,$T2
  520. pxor $T3,$T1 # Karatsuba pre-processing
  521. movdqu $T3,0x30($Htbl) # save H^3
  522. pxor $Xi,$T2 # Karatsuba pre-processing
  523. movdqu $Xi,0x40($Htbl) # save H^4
  524. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  525. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  526. ___
  527. }
  528. $code.=<<___ if ($win64);
  529. movaps (%rsp),%xmm6
  530. lea 0x18(%rsp),%rsp
  531. .LSEH_end_gcm_init_clmul:
  532. ___
  533. $code.=<<___;
  534. ret
  535. .size gcm_init_clmul,.-gcm_init_clmul
  536. ___
  537. }
  538. { my ($Xip,$Htbl)=@_4args;
  539. $code.=<<___;
  540. .globl gcm_gmult_clmul
  541. .type gcm_gmult_clmul,\@abi-omnipotent
  542. .align 16
  543. gcm_gmult_clmul:
  544. .L_gmult_clmul:
  545. movdqu ($Xip),$Xi
  546. movdqa .Lbswap_mask(%rip),$T3
  547. movdqu ($Htbl),$Hkey
  548. movdqu 0x20($Htbl),$T2
  549. pshufb $T3,$Xi
  550. ___
  551. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  552. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about it is that there is
# no dependency between the two multiplications...
  555. mov \$`0xE1<<1`,%eax
  556. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  557. mov \$0x07,%r11d
  558. movq %rax,$T1
  559. movq %r10,$T2
  560. movq %r11,$T3 # borrow $T3
  561. pand $Xi,$T3
  562. pshufb $T3,$T2 # ($Xi&7)·0xE0
  563. movq %rax,$T3
  564. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  565. pxor $Xi,$T2
  566. pslldq \$15,$T2
  567. paddd $T2,$T2 # <<(64+56+1)
  568. pxor $T2,$Xi
  569. pclmulqdq \$0x01,$T3,$Xi
  570. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  571. psrldq \$1,$T1
  572. pxor $T1,$Xhi
  573. pslldq \$7,$Xi
  574. pxor $Xhi,$Xi
  575. ___
  576. $code.=<<___;
  577. pshufb $T3,$Xi
  578. movdqu $Xi,($Xip)
  579. ret
  580. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  581. ___
  582. }
  583. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  584. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  585. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  586. $code.=<<___;
  587. .globl gcm_ghash_clmul
  588. .type gcm_ghash_clmul,\@abi-omnipotent
  589. .align 32
  590. gcm_ghash_clmul:
  591. .L_ghash_clmul:
  592. ___
  593. $code.=<<___ if ($win64);
  594. lea -0x88(%rsp),%rax
  595. .LSEH_begin_gcm_ghash_clmul:
  596. # I can't trust assembler to use specific encoding:-(
  597. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  598. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  599. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  600. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  601. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  602. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  603. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  604. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  605. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  606. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  607. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  608. ___
  609. $code.=<<___;
  610. movdqa .Lbswap_mask(%rip),$T3
  611. movdqu ($Xip),$Xi
  612. movdqu ($Htbl),$Hkey
  613. movdqu 0x20($Htbl),$HK
  614. pshufb $T3,$Xi
  615. sub \$0x10,$len
  616. jz .Lodd_tail
  617. movdqu 0x10($Htbl),$Hkey2
  618. ___
  619. if ($do4xaggr) {
  620. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  621. $code.=<<___;
  622. mov OPENSSL_ia32cap_P+4(%rip),%eax
  623. cmp \$0x30,$len
  624. jb .Lskip4x
  625. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  626. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  627. je .Lskip4x
  628. sub \$0x30,$len
  629. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  630. movdqu 0x30($Htbl),$Hkey3
  631. movdqu 0x40($Htbl),$Hkey4
  632. #######
  633. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  634. #
  635. movdqu 0x30($inp),$Xln
  636. movdqu 0x20($inp),$Xl
  637. pshufb $T3,$Xln
  638. pshufb $T3,$Xl
  639. movdqa $Xln,$Xhn
  640. pshufd \$0b01001110,$Xln,$Xmn
  641. pxor $Xln,$Xmn
  642. pclmulqdq \$0x00,$Hkey,$Xln
  643. pclmulqdq \$0x11,$Hkey,$Xhn
  644. pclmulqdq \$0x00,$HK,$Xmn
  645. movdqa $Xl,$Xh
  646. pshufd \$0b01001110,$Xl,$Xm
  647. pxor $Xl,$Xm
  648. pclmulqdq \$0x00,$Hkey2,$Xl
  649. pclmulqdq \$0x11,$Hkey2,$Xh
  650. pclmulqdq \$0x10,$HK,$Xm
  651. xorps $Xl,$Xln
  652. xorps $Xh,$Xhn
  653. movups 0x50($Htbl),$HK
  654. xorps $Xm,$Xmn
  655. movdqu 0x10($inp),$Xl
  656. movdqu 0($inp),$T1
  657. pshufb $T3,$Xl
  658. pshufb $T3,$T1
  659. movdqa $Xl,$Xh
  660. pshufd \$0b01001110,$Xl,$Xm
  661. pxor $T1,$Xi
  662. pxor $Xl,$Xm
  663. pclmulqdq \$0x00,$Hkey3,$Xl
  664. movdqa $Xi,$Xhi
  665. pshufd \$0b01001110,$Xi,$T1
  666. pxor $Xi,$T1
  667. pclmulqdq \$0x11,$Hkey3,$Xh
  668. pclmulqdq \$0x00,$HK,$Xm
  669. xorps $Xl,$Xln
  670. xorps $Xh,$Xhn
  671. lea 0x40($inp),$inp
  672. sub \$0x40,$len
  673. jc .Ltail4x
  674. jmp .Lmod4_loop
  675. .align 32
  676. .Lmod4_loop:
  677. pclmulqdq \$0x00,$Hkey4,$Xi
  678. xorps $Xm,$Xmn
  679. movdqu 0x30($inp),$Xl
  680. pshufb $T3,$Xl
  681. pclmulqdq \$0x11,$Hkey4,$Xhi
  682. xorps $Xln,$Xi
  683. movdqu 0x20($inp),$Xln
  684. movdqa $Xl,$Xh
  685. pclmulqdq \$0x10,$HK,$T1
  686. pshufd \$0b01001110,$Xl,$Xm
  687. xorps $Xhn,$Xhi
  688. pxor $Xl,$Xm
  689. pshufb $T3,$Xln
  690. movups 0x20($Htbl),$HK
  691. xorps $Xmn,$T1
  692. pclmulqdq \$0x00,$Hkey,$Xl
  693. pshufd \$0b01001110,$Xln,$Xmn
  694. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  695. movdqa $Xln,$Xhn
  696. pxor $Xhi,$T1 #
  697. pxor $Xln,$Xmn
  698. movdqa $T1,$T2 #
  699. pclmulqdq \$0x11,$Hkey,$Xh
  700. pslldq \$8,$T1
  701. psrldq \$8,$T2 #
  702. pxor $T1,$Xi
  703. movdqa .L7_mask(%rip),$T1
  704. pxor $T2,$Xhi #
  705. movq %rax,$T2
  706. pand $Xi,$T1 # 1st phase
  707. pshufb $T1,$T2 #
  708. pxor $Xi,$T2 #
  709. pclmulqdq \$0x00,$HK,$Xm
  710. psllq \$57,$T2 #
  711. movdqa $T2,$T1 #
  712. pslldq \$8,$T2
  713. pclmulqdq \$0x00,$Hkey2,$Xln
  714. psrldq \$8,$T1 #
  715. pxor $T2,$Xi
  716. pxor $T1,$Xhi #
  717. movdqu 0($inp),$T1
  718. movdqa $Xi,$T2 # 2nd phase
  719. psrlq \$1,$Xi
  720. pclmulqdq \$0x11,$Hkey2,$Xhn
  721. xorps $Xl,$Xln
  722. movdqu 0x10($inp),$Xl
  723. pshufb $T3,$Xl
  724. pclmulqdq \$0x10,$HK,$Xmn
  725. xorps $Xh,$Xhn
  726. movups 0x50($Htbl),$HK
  727. pshufb $T3,$T1
  728. pxor $T2,$Xhi #
  729. pxor $Xi,$T2
  730. psrlq \$5,$Xi
  731. movdqa $Xl,$Xh
  732. pxor $Xm,$Xmn
  733. pshufd \$0b01001110,$Xl,$Xm
  734. pxor $T2,$Xi #
  735. pxor $T1,$Xhi
  736. pxor $Xl,$Xm
  737. pclmulqdq \$0x00,$Hkey3,$Xl
  738. psrlq \$1,$Xi #
  739. pxor $Xhi,$Xi #
  740. movdqa $Xi,$Xhi
  741. pclmulqdq \$0x11,$Hkey3,$Xh
  742. xorps $Xl,$Xln
  743. pshufd \$0b01001110,$Xi,$T1
  744. pxor $Xi,$T1
  745. pclmulqdq \$0x00,$HK,$Xm
  746. xorps $Xh,$Xhn
  747. lea 0x40($inp),$inp
  748. sub \$0x40,$len
  749. jnc .Lmod4_loop
  750. .Ltail4x:
  751. pclmulqdq \$0x00,$Hkey4,$Xi
  752. pclmulqdq \$0x11,$Hkey4,$Xhi
  753. pclmulqdq \$0x10,$HK,$T1
  754. xorps $Xm,$Xmn
  755. xorps $Xln,$Xi
  756. xorps $Xhn,$Xhi
  757. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  758. pxor $Xmn,$T1
  759. pxor $Xhi,$T1 #
  760. pxor $Xi,$Xhi
  761. movdqa $T1,$T2 #
  762. psrldq \$8,$T1
  763. pslldq \$8,$T2 #
  764. pxor $T1,$Xhi
  765. pxor $T2,$Xi #
  766. ___
  767. &reduction_alg9($Xhi,$Xi);
  768. $code.=<<___;
  769. add \$0x40,$len
  770. jz .Ldone
  771. movdqu 0x20($Htbl),$HK
  772. sub \$0x10,$len
  773. jz .Lodd_tail
  774. .Lskip4x:
  775. ___
  776. }
  777. $code.=<<___;
  778. #######
  779. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  780. # [(H*Ii+1) + (H*Xi+1)] mod P =
  781. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  782. #
  783. movdqu ($inp),$T1 # Ii
  784. movdqu 16($inp),$Xln # Ii+1
  785. pshufb $T3,$T1
  786. pshufb $T3,$Xln
  787. pxor $T1,$Xi # Ii+Xi
  788. movdqa $Xln,$Xhn
  789. pshufd \$0b01001110,$Xln,$Xmn
  790. pxor $Xln,$Xmn
  791. pclmulqdq \$0x00,$Hkey,$Xln
  792. pclmulqdq \$0x11,$Hkey,$Xhn
  793. pclmulqdq \$0x00,$HK,$Xmn
  794. lea 32($inp),$inp # i+=2
  795. nop
  796. sub \$0x20,$len
  797. jbe .Leven_tail
  798. nop
  799. jmp .Lmod_loop
  800. .align 32
  801. .Lmod_loop:
  802. movdqa $Xi,$Xhi
  803. movdqa $Xmn,$T1
  804. pshufd \$0b01001110,$Xi,$Xmn #
  805. pxor $Xi,$Xmn #
  806. pclmulqdq \$0x00,$Hkey2,$Xi
  807. pclmulqdq \$0x11,$Hkey2,$Xhi
  808. pclmulqdq \$0x10,$HK,$Xmn
  809. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  810. pxor $Xhn,$Xhi
  811. movdqu ($inp),$T2 # Ii
  812. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  813. pshufb $T3,$T2
  814. movdqu 16($inp),$Xln # Ii+1
  815. pxor $Xhi,$T1
  816. pxor $T2,$Xhi # "Ii+Xi", consume early
  817. pxor $T1,$Xmn
  818. pshufb $T3,$Xln
  819. movdqa $Xmn,$T1 #
  820. psrldq \$8,$T1
  821. pslldq \$8,$Xmn #
  822. pxor $T1,$Xhi
  823. pxor $Xmn,$Xi #
  824. movdqa $Xln,$Xhn #
  825. movdqa $Xi,$T2 # 1st phase
  826. movdqa $Xi,$T1
  827. psllq \$5,$Xi
  828. pxor $Xi,$T1 #
  829. pclmulqdq \$0x00,$Hkey,$Xln #######
  830. psllq \$1,$Xi
  831. pxor $T1,$Xi #
  832. psllq \$57,$Xi #
  833. movdqa $Xi,$T1 #
  834. pslldq \$8,$Xi
  835. psrldq \$8,$T1 #
  836. pxor $T2,$Xi
  837. pshufd \$0b01001110,$Xhn,$Xmn
  838. pxor $T1,$Xhi #
  839. pxor $Xhn,$Xmn #
  840. movdqa $Xi,$T2 # 2nd phase
  841. psrlq \$1,$Xi
  842. pclmulqdq \$0x11,$Hkey,$Xhn #######
  843. pxor $T2,$Xhi #
  844. pxor $Xi,$T2
  845. psrlq \$5,$Xi
  846. pxor $T2,$Xi #
  847. lea 32($inp),$inp
  848. psrlq \$1,$Xi #
  849. pclmulqdq \$0x00,$HK,$Xmn #######
  850. pxor $Xhi,$Xi #
  851. sub \$0x20,$len
  852. ja .Lmod_loop
  853. .Leven_tail:
  854. movdqa $Xi,$Xhi
  855. movdqa $Xmn,$T1
  856. pshufd \$0b01001110,$Xi,$Xmn #
  857. pxor $Xi,$Xmn #
  858. pclmulqdq \$0x00,$Hkey2,$Xi
  859. pclmulqdq \$0x11,$Hkey2,$Xhi
  860. pclmulqdq \$0x10,$HK,$Xmn
  861. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  862. pxor $Xhn,$Xhi
  863. pxor $Xi,$T1
  864. pxor $Xhi,$T1
  865. pxor $T1,$Xmn
  866. movdqa $Xmn,$T1 #
  867. psrldq \$8,$T1
  868. pslldq \$8,$Xmn #
  869. pxor $T1,$Xhi
  870. pxor $Xmn,$Xi #
  871. ___
  872. &reduction_alg9 ($Xhi,$Xi);
  873. $code.=<<___;
  874. test $len,$len
  875. jnz .Ldone
  876. .Lodd_tail:
  877. movdqu ($inp),$T1 # Ii
  878. pshufb $T3,$T1
  879. pxor $T1,$Xi # Ii+Xi
  880. ___
  881. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  882. &reduction_alg9 ($Xhi,$Xi);
  883. $code.=<<___;
  884. .Ldone:
  885. pshufb $T3,$Xi
  886. movdqu $Xi,($Xip)
  887. ___
  888. $code.=<<___ if ($win64);
  889. movaps (%rsp),%xmm6
  890. movaps 0x10(%rsp),%xmm7
  891. movaps 0x20(%rsp),%xmm8
  892. movaps 0x30(%rsp),%xmm9
  893. movaps 0x40(%rsp),%xmm10
  894. movaps 0x50(%rsp),%xmm11
  895. movaps 0x60(%rsp),%xmm12
  896. movaps 0x70(%rsp),%xmm13
  897. movaps 0x80(%rsp),%xmm14
  898. movaps 0x90(%rsp),%xmm15
  899. lea 0xa8(%rsp),%rsp
  900. .LSEH_end_gcm_ghash_clmul:
  901. ___
  902. $code.=<<___;
  903. ret
  904. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  905. ___
  906. }
  907. $code.=<<___;
  908. .globl gcm_init_avx
  909. .type gcm_init_avx,\@abi-omnipotent
  910. .align 32
  911. gcm_init_avx:
  912. ___
  913. if ($avx) {
  914. my ($Htbl,$Xip)=@_4args;
  915. my $HK="%xmm6";
  916. $code.=<<___ if ($win64);
  917. .LSEH_begin_gcm_init_avx:
  918. # I can't trust assembler to use specific encoding:-(
  919. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  920. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  921. ___
  922. $code.=<<___;
  923. vzeroupper
  924. vmovdqu ($Xip),$Hkey
  925. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  926. # <<1 twist
  927. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  928. vpsrlq \$63,$Hkey,$T1
  929. vpsllq \$1,$Hkey,$Hkey
  930. vpxor $T3,$T3,$T3 #
  931. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  932. vpslldq \$8,$T1,$T1
  933. vpor $T1,$Hkey,$Hkey # H<<=1
  934. # magic reduction
  935. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  936. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  937. vpunpckhqdq $Hkey,$Hkey,$HK
  938. vmovdqa $Hkey,$Xi
  939. vpxor $Hkey,$HK,$HK
  940. mov \$4,%r10 # up to H^8
  941. jmp .Linit_start_avx
  942. ___
  943. sub clmul64x64_avx {
  944. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  945. if (!defined($HK)) { $HK = $T2;
  946. $code.=<<___;
  947. vpunpckhqdq $Xi,$Xi,$T1
  948. vpunpckhqdq $Hkey,$Hkey,$T2
  949. vpxor $Xi,$T1,$T1 #
  950. vpxor $Hkey,$T2,$T2
  951. ___
  952. } else {
  953. $code.=<<___;
  954. vpunpckhqdq $Xi,$Xi,$T1
  955. vpxor $Xi,$T1,$T1 #
  956. ___
  957. }
  958. $code.=<<___;
  959. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  960. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  961. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  962. vpxor $Xi,$Xhi,$T2 #
  963. vpxor $T2,$T1,$T1 #
  964. vpslldq \$8,$T1,$T2 #
  965. vpsrldq \$8,$T1,$T1
  966. vpxor $T2,$Xi,$Xi #
  967. vpxor $T1,$Xhi,$Xhi
  968. ___
  969. }
  970. sub reduction_avx {
  971. my ($Xhi,$Xi) = @_;
  972. $code.=<<___;
  973. vpsllq \$57,$Xi,$T1 # 1st phase
  974. vpsllq \$62,$Xi,$T2
  975. vpxor $T1,$T2,$T2 #
  976. vpsllq \$63,$Xi,$T1
  977. vpxor $T1,$T2,$T2 #
  978. vpslldq \$8,$T2,$T1 #
  979. vpsrldq \$8,$T2,$T2
  980. vpxor $T1,$Xi,$Xi #
  981. vpxor $T2,$Xhi,$Xhi
  982. vpsrlq \$1,$Xi,$T2 # 2nd phase
  983. vpxor $Xi,$Xhi,$Xhi
  984. vpxor $T2,$Xi,$Xi #
  985. vpsrlq \$5,$T2,$T2
  986. vpxor $T2,$Xi,$Xi #
  987. vpsrlq \$1,$Xi,$Xi #
  988. vpxor $Xhi,$Xi,$Xi #
  989. ___
  990. }
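# clmul64x64_avx and reduction_avx above are the AVX-encoded
# counterparts of clmul64x64_T2 and reduction_alg9; they are used only
# here, to compute the table of powers of H consumed by gcm_ghash_avx.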
  991. $code.=<<___;
  992. .align 32
  993. .Linit_loop_avx:
  994. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  995. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  996. ___
  997. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  998. &reduction_avx ($Xhi,$Xi);
  999. $code.=<<___;
  1000. .Linit_start_avx:
  1001. vmovdqa $Xi,$T3
  1002. ___
  1003. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  1004. &reduction_avx ($Xhi,$Xi);
  1005. $code.=<<___;
  1006. vpshufd \$0b01001110,$T3,$T1
  1007. vpshufd \$0b01001110,$Xi,$T2
  1008. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  1009. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  1010. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  1011. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  1012. lea 0x30($Htbl),$Htbl
  1013. sub \$1,%r10
  1014. jnz .Linit_loop_avx
  1015. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  1016. vmovdqu $T3,-0x10($Htbl)
  1017. vzeroupper
  1018. ___
  1019. $code.=<<___ if ($win64);
  1020. movaps (%rsp),%xmm6
  1021. lea 0x18(%rsp),%rsp
  1022. .LSEH_end_gcm_init_avx:
  1023. ___
  1024. $code.=<<___;
  1025. ret
  1026. .size gcm_init_avx,.-gcm_init_avx
  1027. ___
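#
# A note on the table produced above (best-effort description): each
# 0x30-byte stride of $Htbl holds an odd power H^(2k-1), the next even
# power H^(2k) and their Karatsuba "salt", so after four iterations the
# table covers H^1..H^8. gcm_ghash_avx below addresses the same layout
# with a fixed -0x40 bias after its "lea 0x40($Htbl),$Htbl" size
# optimization.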
  1028. } else {
  1029. $code.=<<___;
  1030. jmp .L_init_clmul
  1031. .size gcm_init_avx,.-gcm_init_avx
  1032. ___
  1033. }
  1034. $code.=<<___;
  1035. .globl gcm_gmult_avx
  1036. .type gcm_gmult_avx,\@abi-omnipotent
  1037. .align 32
  1038. gcm_gmult_avx:
  1039. jmp .L_gmult_clmul
  1040. .size gcm_gmult_avx,.-gcm_gmult_avx
  1041. ___
  1042. $code.=<<___;
  1043. .globl gcm_ghash_avx
  1044. .type gcm_ghash_avx,\@abi-omnipotent
  1045. .align 32
  1046. gcm_ghash_avx:
  1047. ___
  1048. if ($avx) {
  1049. my ($Xip,$Htbl,$inp,$len)=@_4args;
  1050. my ($Xlo,$Xhi,$Xmi,
  1051. $Zlo,$Zhi,$Zmi,
  1052. $Hkey,$HK,$T1,$T2,
  1053. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
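#
# The main loop below is the 8x-aggregated analogue of the 4x formula
# quoted earlier, i.e. (sketch):
#
#	Xi+8 = [H^8*(Ii+Xi) + H^7*Ii+1 + ... + H*Ii+7] mod P
#
# with the reduction interleaved into the multiplication schedule as
# two vpclmulqdq's by .L0x1c2_polynomial (the "1st phase"/"2nd phase"
# comments), per Shay Gueron's method referenced in the March 2013
# note above.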
  1054. $code.=<<___ if ($win64);
  1055. lea -0x88(%rsp),%rax
  1056. .LSEH_begin_gcm_ghash_avx:
  1057. # I can't trust assembler to use specific encoding:-(
  1058. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1059. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1060. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1061. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1062. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1063. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1064. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1065. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1066. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1067. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1068. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1069. ___
  1070. $code.=<<___;
  1071. vzeroupper
  1072. vmovdqu ($Xip),$Xi # load $Xi
  1073. lea .L0x1c2_polynomial(%rip),%r10
  1074. lea 0x40($Htbl),$Htbl # size optimization
  1075. vmovdqu .Lbswap_mask(%rip),$bswap
  1076. vpshufb $bswap,$Xi,$Xi
  1077. cmp \$0x80,$len
  1078. jb .Lshort_avx
  1079. sub \$0x80,$len
  1080. vmovdqu 0x70($inp),$Ii # I[7]
  1081. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1082. vpshufb $bswap,$Ii,$Ii
  1083. vmovdqu 0x20-0x40($Htbl),$HK
  1084. vpunpckhqdq $Ii,$Ii,$T2
  1085. vmovdqu 0x60($inp),$Ij # I[6]
  1086. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1087. vpxor $Ii,$T2,$T2
  1088. vpshufb $bswap,$Ij,$Ij
  1089. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1090. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1091. vpunpckhqdq $Ij,$Ij,$T1
  1092. vmovdqu 0x50($inp),$Ii # I[5]
  1093. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1094. vpxor $Ij,$T1,$T1
  1095. vpshufb $bswap,$Ii,$Ii
  1096. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1097. vpunpckhqdq $Ii,$Ii,$T2
  1098. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1099. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1100. vpxor $Ii,$T2,$T2
  1101. vmovdqu 0x40($inp),$Ij # I[4]
  1102. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1103. vmovdqu 0x50-0x40($Htbl),$HK
  1104. vpshufb $bswap,$Ij,$Ij
  1105. vpxor $Xlo,$Zlo,$Zlo
  1106. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1107. vpxor $Xhi,$Zhi,$Zhi
  1108. vpunpckhqdq $Ij,$Ij,$T1
  1109. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1110. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1111. vpxor $Xmi,$Zmi,$Zmi
  1112. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1113. vpxor $Ij,$T1,$T1
  1114. vmovdqu 0x30($inp),$Ii # I[3]
  1115. vpxor $Zlo,$Xlo,$Xlo
  1116. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1117. vpxor $Zhi,$Xhi,$Xhi
  1118. vpshufb $bswap,$Ii,$Ii
  1119. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1120. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1121. vpxor $Zmi,$Xmi,$Xmi
  1122. vpunpckhqdq $Ii,$Ii,$T2
  1123. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1124. vmovdqu 0x80-0x40($Htbl),$HK
  1125. vpxor $Ii,$T2,$T2
  1126. vmovdqu 0x20($inp),$Ij # I[2]
  1127. vpxor $Xlo,$Zlo,$Zlo
  1128. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1129. vpxor $Xhi,$Zhi,$Zhi
  1130. vpshufb $bswap,$Ij,$Ij
  1131. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1132. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1133. vpxor $Xmi,$Zmi,$Zmi
  1134. vpunpckhqdq $Ij,$Ij,$T1
  1135. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1136. vpxor $Ij,$T1,$T1
  1137. vmovdqu 0x10($inp),$Ii # I[1]
  1138. vpxor $Zlo,$Xlo,$Xlo
  1139. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1140. vpxor $Zhi,$Xhi,$Xhi
  1141. vpshufb $bswap,$Ii,$Ii
  1142. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1143. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1144. vpxor $Zmi,$Xmi,$Xmi
  1145. vpunpckhqdq $Ii,$Ii,$T2
  1146. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1147. vmovdqu 0xb0-0x40($Htbl),$HK
  1148. vpxor $Ii,$T2,$T2
  1149. vmovdqu ($inp),$Ij # I[0]
  1150. vpxor $Xlo,$Zlo,$Zlo
  1151. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1152. vpxor $Xhi,$Zhi,$Zhi
  1153. vpshufb $bswap,$Ij,$Ij
  1154. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1155. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1156. vpxor $Xmi,$Zmi,$Zmi
  1157. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1158. lea 0x80($inp),$inp
  1159. cmp \$0x80,$len
  1160. jb .Ltail_avx
  1161. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1162. sub \$0x80,$len
  1163. jmp .Loop8x_avx
  1164. .align 32
  1165. .Loop8x_avx:
  1166. vpunpckhqdq $Ij,$Ij,$T1
  1167. vmovdqu 0x70($inp),$Ii # I[7]
  1168. vpxor $Xlo,$Zlo,$Zlo
  1169. vpxor $Ij,$T1,$T1
  1170. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1171. vpshufb $bswap,$Ii,$Ii
  1172. vpxor $Xhi,$Zhi,$Zhi
  1173. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1174. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1175. vpunpckhqdq $Ii,$Ii,$T2
  1176. vpxor $Xmi,$Zmi,$Zmi
  1177. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1178. vmovdqu 0x20-0x40($Htbl),$HK
  1179. vpxor $Ii,$T2,$T2
  1180. vmovdqu 0x60($inp),$Ij # I[6]
  1181. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1182. vpxor $Zlo,$Xi,$Xi # collect result
  1183. vpshufb $bswap,$Ij,$Ij
  1184. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1185. vxorps $Zhi,$Xo,$Xo
  1186. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1187. vpunpckhqdq $Ij,$Ij,$T1
  1188. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1189. vpxor $Zmi,$Tred,$Tred
  1190. vxorps $Ij,$T1,$T1
  1191. vmovdqu 0x50($inp),$Ii # I[5]
  1192. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1193. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1194. vpxor $Xo,$Tred,$Tred
  1195. vpslldq \$8,$Tred,$T2
  1196. vpxor $Xlo,$Zlo,$Zlo
  1197. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1198. vpsrldq \$8,$Tred,$Tred
  1199. vpxor $T2, $Xi, $Xi
  1200. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1201. vpshufb $bswap,$Ii,$Ii
  1202. vxorps $Tred,$Xo, $Xo
  1203. vpxor $Xhi,$Zhi,$Zhi
  1204. vpunpckhqdq $Ii,$Ii,$T2
  1205. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1206. vmovdqu 0x50-0x40($Htbl),$HK
  1207. vpxor $Ii,$T2,$T2
  1208. vpxor $Xmi,$Zmi,$Zmi
  1209. vmovdqu 0x40($inp),$Ij # I[4]
  1210. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1211. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1212. vpshufb $bswap,$Ij,$Ij
  1213. vpxor $Zlo,$Xlo,$Xlo
  1214. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1215. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1216. vpunpckhqdq $Ij,$Ij,$T1
  1217. vpxor $Zhi,$Xhi,$Xhi
  1218. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1219. vxorps $Ij,$T1,$T1
  1220. vpxor $Zmi,$Xmi,$Xmi
  1221. vmovdqu 0x30($inp),$Ii # I[3]
  1222. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1223. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1224. vpshufb $bswap,$Ii,$Ii
  1225. vpxor $Xlo,$Zlo,$Zlo
  1226. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1227. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1228. vpunpckhqdq $Ii,$Ii,$T2
  1229. vpxor $Xhi,$Zhi,$Zhi
  1230. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1231. vmovdqu 0x80-0x40($Htbl),$HK
  1232. vpxor $Ii,$T2,$T2
  1233. vpxor $Xmi,$Zmi,$Zmi
  1234. vmovdqu 0x20($inp),$Ij # I[2]
  1235. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1236. vpshufb $bswap,$Ij,$Ij
  1237. vpxor $Zlo,$Xlo,$Xlo
  1238. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1239. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1240. vpunpckhqdq $Ij,$Ij,$T1
  1241. vpxor $Zhi,$Xhi,$Xhi
  1242. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1243. vpxor $Ij,$T1,$T1
  1244. vpxor $Zmi,$Xmi,$Xmi
  1245. vxorps $Tred,$Xi,$Xi
  1246. vmovdqu 0x10($inp),$Ii # I[1]
  1247. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1248. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1249. vpshufb $bswap,$Ii,$Ii
  1250. vpxor $Xlo,$Zlo,$Zlo
  1251. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1252. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1253. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1254. vxorps $Xo,$Tred,$Tred
  1255. vpunpckhqdq $Ii,$Ii,$T2
  1256. vpxor $Xhi,$Zhi,$Zhi
  1257. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1258. vmovdqu 0xb0-0x40($Htbl),$HK
  1259. vpxor $Ii,$T2,$T2
  1260. vpxor $Xmi,$Zmi,$Zmi
  1261. vmovdqu ($inp),$Ij # I[0]
  1262. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1263. vpshufb $bswap,$Ij,$Ij
  1264. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1265. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1266. vpxor $Tred,$Ij,$Ij
  1267. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1268. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1269. lea 0x80($inp),$inp
  1270. sub \$0x80,$len
  1271. jnc .Loop8x_avx
  1272. add \$0x80,$len
  1273. jmp .Ltail_no_xor_avx
  1274. .align 32
  1275. .Lshort_avx:
  1276. vmovdqu -0x10($inp,$len),$Ii # very last word
  1277. lea ($inp,$len),$inp
  1278. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1279. vmovdqu 0x20-0x40($Htbl),$HK
  1280. vpshufb $bswap,$Ii,$Ij
  1281. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1282. vmovdqa $Xhi,$Zhi # $Zhi and
  1283. vmovdqa $Xmi,$Zmi # $Zmi
  1284. sub \$0x10,$len
  1285. jz .Ltail_avx
  1286. vpunpckhqdq $Ij,$Ij,$T1
  1287. vpxor $Xlo,$Zlo,$Zlo
  1288. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1289. vpxor $Ij,$T1,$T1
  1290. vmovdqu -0x20($inp),$Ii
  1291. vpxor $Xhi,$Zhi,$Zhi
  1292. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1293. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1294. vpshufb $bswap,$Ii,$Ij
  1295. vpxor $Xmi,$Zmi,$Zmi
  1296. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1297. vpsrldq \$8,$HK,$HK
  1298. sub \$0x10,$len
  1299. jz .Ltail_avx
  1300. vpunpckhqdq $Ij,$Ij,$T1
  1301. vpxor $Xlo,$Zlo,$Zlo
  1302. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1303. vpxor $Ij,$T1,$T1
  1304. vmovdqu -0x30($inp),$Ii
  1305. vpxor $Xhi,$Zhi,$Zhi
  1306. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1307. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1308. vpshufb $bswap,$Ii,$Ij
  1309. vpxor $Xmi,$Zmi,$Zmi
  1310. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1311. vmovdqu 0x50-0x40($Htbl),$HK
  1312. sub \$0x10,$len
  1313. jz .Ltail_avx
  1314. vpunpckhqdq $Ij,$Ij,$T1
  1315. vpxor $Xlo,$Zlo,$Zlo
  1316. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1317. vpxor $Ij,$T1,$T1
  1318. vmovdqu -0x40($inp),$Ii
  1319. vpxor $Xhi,$Zhi,$Zhi
  1320. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1321. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1322. vpshufb $bswap,$Ii,$Ij
  1323. vpxor $Xmi,$Zmi,$Zmi
  1324. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1325. vpsrldq \$8,$HK,$HK
  1326. sub \$0x10,$len
  1327. jz .Ltail_avx
  1328. vpunpckhqdq $Ij,$Ij,$T1
  1329. vpxor $Xlo,$Zlo,$Zlo
  1330. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1331. vpxor $Ij,$T1,$T1
  1332. vmovdqu -0x50($inp),$Ii
  1333. vpxor $Xhi,$Zhi,$Zhi
  1334. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1335. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1336. vpshufb $bswap,$Ii,$Ij
  1337. vpxor $Xmi,$Zmi,$Zmi
  1338. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1339. vmovdqu 0x80-0x40($Htbl),$HK
  1340. sub \$0x10,$len
  1341. jz .Ltail_avx
  1342. vpunpckhqdq $Ij,$Ij,$T1
  1343. vpxor $Xlo,$Zlo,$Zlo
  1344. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1345. vpxor $Ij,$T1,$T1
  1346. vmovdqu -0x60($inp),$Ii
  1347. vpxor $Xhi,$Zhi,$Zhi
  1348. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1349. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1350. vpshufb $bswap,$Ii,$Ij
  1351. vpxor $Xmi,$Zmi,$Zmi
  1352. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1353. vpsrldq \$8,$HK,$HK
  1354. sub \$0x10,$len
  1355. jz .Ltail_avx
  1356. vpunpckhqdq $Ij,$Ij,$T1
  1357. vpxor $Xlo,$Zlo,$Zlo
  1358. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1359. vpxor $Ij,$T1,$T1
  1360. vmovdqu -0x70($inp),$Ii
  1361. vpxor $Xhi,$Zhi,$Zhi
  1362. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1363. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1364. vpshufb $bswap,$Ii,$Ij
  1365. vpxor $Xmi,$Zmi,$Zmi
  1366. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1367. vmovq 0xb8-0x40($Htbl),$HK
  1368. sub \$0x10,$len
  1369. jmp .Ltail_avx
  1370. .align 32
  1371. .Ltail_avx:
  1372. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1373. .Ltail_no_xor_avx:
  1374. vpunpckhqdq $Ij,$Ij,$T1
  1375. vpxor $Xlo,$Zlo,$Zlo
  1376. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1377. vpxor $Ij,$T1,$T1
  1378. vpxor $Xhi,$Zhi,$Zhi
  1379. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1380. vpxor $Xmi,$Zmi,$Zmi
  1381. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1382. vmovdqu (%r10),$Tred
  1383. vpxor $Xlo,$Zlo,$Xi
  1384. vpxor $Xhi,$Zhi,$Xo
  1385. vpxor $Xmi,$Zmi,$Zmi
  1386. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1387. vpxor $Xo, $Zmi,$Zmi
  1388. vpslldq \$8, $Zmi,$T2
  1389. vpsrldq \$8, $Zmi,$Zmi
  1390. vpxor $T2, $Xi, $Xi
  1391. vpxor $Zmi,$Xo, $Xo
  1392. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1393. vpalignr \$8,$Xi,$Xi,$Xi
  1394. vpxor $T2,$Xi,$Xi
  1395. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1396. vpalignr \$8,$Xi,$Xi,$Xi
  1397. vpxor $Xo,$Xi,$Xi
  1398. vpxor $T2,$Xi,$Xi
  1399. cmp \$0,$len
  1400. jne .Lshort_avx
  1401. vpshufb $bswap,$Xi,$Xi
  1402. vmovdqu $Xi,($Xip)
  1403. vzeroupper
  1404. ___
  1405. $code.=<<___ if ($win64);
  1406. movaps (%rsp),%xmm6
  1407. movaps 0x10(%rsp),%xmm7
  1408. movaps 0x20(%rsp),%xmm8
  1409. movaps 0x30(%rsp),%xmm9
  1410. movaps 0x40(%rsp),%xmm10
  1411. movaps 0x50(%rsp),%xmm11
  1412. movaps 0x60(%rsp),%xmm12
  1413. movaps 0x70(%rsp),%xmm13
  1414. movaps 0x80(%rsp),%xmm14
  1415. movaps 0x90(%rsp),%xmm15
  1416. lea 0xa8(%rsp),%rsp
  1417. .LSEH_end_gcm_ghash_avx:
  1418. ___
  1419. $code.=<<___;
  1420. ret
  1421. .size gcm_ghash_avx,.-gcm_ghash_avx
  1422. ___
  1423. } else {
  1424. $code.=<<___;
  1425. jmp .L_ghash_clmul
  1426. .size gcm_ghash_avx,.-gcm_ghash_avx
  1427. ___
  1428. }
  1429. $code.=<<___;
  1430. .align 64
  1431. .Lbswap_mask:
  1432. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1433. .L0x1c2_polynomial:
  1434. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1435. .L7_mask:
  1436. .long 7,0,7,0
  1437. .L7_mask_poly:
  1438. .long 7,0,`0xE1<<1`,0
  1439. .align 64
  1440. .type .Lrem_4bit,\@object
  1441. .Lrem_4bit:
  1442. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1443. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1444. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1445. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1446. .type .Lrem_8bit,\@object
  1447. .Lrem_8bit:
  1448. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1449. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1450. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1451. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1452. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1453. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1454. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1455. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1456. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1457. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1458. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1459. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1460. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1461. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1462. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1463. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1464. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1465. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1466. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1467. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1468. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1469. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1470. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1471. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1472. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1473. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1474. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1475. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1476. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1477. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1478. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1479. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1480. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1481. .align 64
  1482. ___
  1483. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1484. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1485. if ($win64) {
  1486. $rec="%rcx";
  1487. $frame="%rdx";
  1488. $context="%r8";
  1489. $disp="%r9";
  1490. $code.=<<___;
  1491. .extern __imp_RtlVirtualUnwind
  1492. .type se_handler,\@abi-omnipotent
  1493. .align 16
  1494. se_handler:
  1495. push %rsi
  1496. push %rdi
  1497. push %rbx
  1498. push %rbp
  1499. push %r12
  1500. push %r13
  1501. push %r14
  1502. push %r15
  1503. pushfq
  1504. sub \$64,%rsp
  1505. mov 120($context),%rax # pull context->Rax
  1506. mov 248($context),%rbx # pull context->Rip
  1507. mov 8($disp),%rsi # disp->ImageBase
  1508. mov 56($disp),%r11 # disp->HandlerData
  1509. mov 0(%r11),%r10d # HandlerData[0]
  1510. lea (%rsi,%r10),%r10 # prologue label
  1511. cmp %r10,%rbx # context->Rip<prologue label
  1512. jb .Lin_prologue
  1513. mov 152($context),%rax # pull context->Rsp
  1514. mov 4(%r11),%r10d # HandlerData[1]
  1515. lea (%rsi,%r10),%r10 # epilogue label
  1516. cmp %r10,%rbx # context->Rip>=epilogue label
  1517. jae .Lin_prologue
  1518. lea 48+280(%rax),%rax # adjust "rsp"
  1519. mov -8(%rax),%rbx
  1520. mov -16(%rax),%rbp
  1521. mov -24(%rax),%r12
  1522. mov -32(%rax),%r13
  1523. mov -40(%rax),%r14
  1524. mov -48(%rax),%r15
  1525. mov %rbx,144($context) # restore context->Rbx
  1526. mov %rbp,160($context) # restore context->Rbp
  1527. mov %r12,216($context) # restore context->R12
  1528. mov %r13,224($context) # restore context->R13
  1529. mov %r14,232($context) # restore context->R14
  1530. mov %r15,240($context) # restore context->R15
  1531. .Lin_prologue:
  1532. mov 8(%rax),%rdi
  1533. mov 16(%rax),%rsi
  1534. mov %rax,152($context) # restore context->Rsp
  1535. mov %rsi,168($context) # restore context->Rsi
  1536. mov %rdi,176($context) # restore context->Rdi
  1537. mov 40($disp),%rdi # disp->ContextRecord
  1538. mov $context,%rsi # context
  1539. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1540. .long 0xa548f3fc # cld; rep movsq
  1541. mov $disp,%rsi
  1542. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1543. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1544. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1545. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1546. mov 40(%rsi),%r10 # disp->ContextRecord
  1547. lea 56(%rsi),%r11 # &disp->HandlerData
  1548. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1549. mov %r10,32(%rsp) # arg5
  1550. mov %r11,40(%rsp) # arg6
  1551. mov %r12,48(%rsp) # arg7
  1552. mov %rcx,56(%rsp) # arg8, (NULL)
  1553. call *__imp_RtlVirtualUnwind(%rip)
  1554. mov \$1,%eax # ExceptionContinueSearch
  1555. add \$64,%rsp
  1556. popfq
  1557. pop %r15
  1558. pop %r14
  1559. pop %r13
  1560. pop %r12
  1561. pop %rbp
  1562. pop %rbx
  1563. pop %rdi
  1564. pop %rsi
  1565. ret
  1566. .size se_handler,.-se_handler
  1567. .section .pdata
  1568. .align 4
  1569. .rva .LSEH_begin_gcm_gmult_4bit
  1570. .rva .LSEH_end_gcm_gmult_4bit
  1571. .rva .LSEH_info_gcm_gmult_4bit
  1572. .rva .LSEH_begin_gcm_ghash_4bit
  1573. .rva .LSEH_end_gcm_ghash_4bit
  1574. .rva .LSEH_info_gcm_ghash_4bit
  1575. .rva .LSEH_begin_gcm_init_clmul
  1576. .rva .LSEH_end_gcm_init_clmul
  1577. .rva .LSEH_info_gcm_init_clmul
  1578. .rva .LSEH_begin_gcm_ghash_clmul
  1579. .rva .LSEH_end_gcm_ghash_clmul
  1580. .rva .LSEH_info_gcm_ghash_clmul
  1581. ___
  1582. $code.=<<___ if ($avx);
  1583. .rva .LSEH_begin_gcm_init_avx
  1584. .rva .LSEH_end_gcm_init_avx
  1585. .rva .LSEH_info_gcm_init_clmul
  1586. .rva .LSEH_begin_gcm_ghash_avx
  1587. .rva .LSEH_end_gcm_ghash_avx
  1588. .rva .LSEH_info_gcm_ghash_clmul
  1589. ___
  1590. $code.=<<___;
  1591. .section .xdata
  1592. .align 8
  1593. .LSEH_info_gcm_gmult_4bit:
  1594. .byte 9,0,0,0
  1595. .rva se_handler
  1596. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1597. .LSEH_info_gcm_ghash_4bit:
  1598. .byte 9,0,0,0
  1599. .rva se_handler
  1600. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1601. .LSEH_info_gcm_init_clmul:
  1602. .byte 0x01,0x08,0x03,0x00
  1603. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1604. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1605. .LSEH_info_gcm_ghash_clmul:
  1606. .byte 0x01,0x33,0x16,0x00
  1607. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1608. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1609. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1610. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1611. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1612. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1613. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1614. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1615. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1616. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1617. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1618. ___
  1619. }
  1620. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1621. print $code;
  1622. close STDOUT;