ghash-x86_64.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # March, June 2010
  17. #
  18. # The module implements the "4-bit" GCM GHASH function and the underlying
  19. # single multiplication operation in GF(2^128). "4-bit" means that
  20. # it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
  21. # function features a so-called "528B" variant utilizing an additional
  22. # 256+16 bytes of per-key storage [+512 bytes shared table].
  23. # Performance results are for this streamed GHASH subroutine and are
  24. # expressed in cycles per processed byte, less is better:
  25. #
  26. #              gcc 3.4.x(*)   assembler
  27. #
  28. # P4           28.6           14.0         +100%
  29. # Opteron      19.3           7.7          +150%
  30. # Core2        17.8           8.1(**)      +120%
  31. # Atom         31.6           16.8         +88%
  32. # VIA Nano     21.8           10.1         +115%
  33. #
  34. # (*)  the comparison is not completely fair, because the C results are
  35. #      for the vanilla "256B" implementation, while the assembler results
  36. #      are for "528B" ;-)
  37. # (**) it's a mystery [to me] why the Core2 result is not the same as
  38. #      for Opteron;
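######################################################################
# For reference only (not emitted, not used by the generated assembly):
# a minimal Perl sketch of the GF(2^128) multiplication and GHASH update
# that the table-driven code below computes, following the bit-by-bit
# algorithm of NIST SP 800-38D. gf128_mul() and ghash_update() are
# illustrative names only, and the sketch is far too slow for real use.
use Math::BigInt;

sub gf128_mul {                         # Z = X*Y in GF(2^128), GCM bit order
    my ($x, $y) = @_;                   # 128-bit Math::BigInt values
    my $R = Math::BigInt->new("0xE1000000000000000000000000000000");
    my $z = Math::BigInt->bzero();
    my $v = $y->copy();
    for my $i (0 .. 127) {              # scan X from its most significant bit
        $z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
        my $carry = $v->copy()->band(1)->is_one();
        $v->brsft(1);
        $v->bxor($R) if $carry;         # conditional reduction by 0xE1||0^120
    }
    return $z;
}

sub ghash_update {                      # Xi = (Xi ^ block) * H
    my ($Xi, $H, $block) = @_;
    return gf128_mul($Xi->copy()->bxor($block), $H);
}
######################################################################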
  39. # May 2010
  40. #
  41. # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
  42. # See ghash-x86.pl for background information and details about coding
  43. # techniques.
  44. #
  45. # Special thanks to David Woodhouse for providing access to a
  46. # Westmere-based system on behalf of Intel Open Source Technology Centre.
  47. # December 2012
  48. #
  49. # Overhaul: aggregate Karatsuba post-processing, improve ILP in
  50. # reduction_alg9, increase the reduction aggregate factor to 4x. As for
  51. # the latter: ghash-x86.pl discusses why it makes less sense to
  52. # increase the aggregate factor. Then why increase it here? The critical
  53. # path consists of 3 independent pclmulqdq instructions, Karatsuba post-
  54. # processing and reduction. "On top" of this we lay down the aggregated
  55. # multiplication operations, triplets of independent pclmulqdq's. As the
  56. # issue rate for pclmulqdq is limited, it makes less sense to
  57. # aggregate more multiplications than it takes to perform the remaining
  58. # non-multiplication operations. 2x is a near-optimal coefficient for
  59. # contemporary Intel CPUs (hence the modest improvement coefficient),
  60. # but not for Bulldozer. The latter is because logical SIMD operations
  61. # are twice as slow there as on Intel, so the critical path is
  62. # longer. A CPU with a higher pclmulqdq issue rate would also benefit
  63. # from a higher aggregate factor...
  64. #
  65. # Westmere      1.78(+13%)
  66. # Sandy Bridge  1.80(+8%)
  67. # Ivy Bridge    1.80(+7%)
  68. # Haswell       0.55(+93%)  (if system doesn't support AVX)
  69. # Broadwell     0.45(+110%) (if system doesn't support AVX)
  70. # Skylake       0.44(+110%) (if system doesn't support AVX)
  71. # Bulldozer     1.49(+27%)
  72. # Silvermont    2.88(+13%)
  73. # Knights L     2.12(-)     (if system doesn't support AVX)
  74. # Goldmont      1.08(+24%)
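######################################################################
# Illustrative Perl sketch of the 4x folding (ghash_4blocks() is an
# assumed name, building on gf128_mul() above). It only demonstrates
# the algebraic identity behind the aggregation; the assembly also
# defers the reduction so that it is paid once per four blocks.
sub ghash_4blocks {     # Xi+4 = H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3
    my ($Xi, $H, $H2, $H3, $H4, @I) = @_;       # @I = (Ii, Ii+1, Ii+2, Ii+3)
    my $acc = gf128_mul($Xi->copy()->bxor($I[0]), $H4);
    $acc->bxor(gf128_mul($I[1], $H3));
    $acc->bxor(gf128_mul($I[2], $H2));
    $acc->bxor(gf128_mul($I[3], $H));
    return $acc;
}
######################################################################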
  75. # March 2013
  76. #
  77. # ... the 8x aggregate factor AVX code path uses the reduction algorithm
  78. # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
  79. # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
  80. # sub-optimally on them in comparison to the above-mentioned version. But
  81. # thanks to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
  82. # it performs at 0.41 cycles per byte on the Haswell processor, at
  83. # 0.29 on Broadwell, and at 0.36 on Skylake.
  84. #
  85. # Knights Landing achieves 1.09 cpb.
  86. #
  87. # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  88. # $output is the last argument if it looks like a file (it has an extension)
  89. # $flavour is the first argument if it doesn't look like a file
  90. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  91. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  92. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  93. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  94. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  95. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  96. die "can't locate x86_64-xlate.pl";
  97. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  98. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  99. $avx = ($1>=2.20) + ($1>=2.22);
  100. }
  101. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  102. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  103. $avx = ($1>=2.09) + ($1>=2.10);
  104. }
  105. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  106. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  107. $avx = ($1>=10) + ($1>=11);
  108. }
  109. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  110. $avx = ($2>=3.0) + ($2>3.0);
  111. }
  112. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  113. or die "can't call $xlate: $!";
  114. *STDOUT=*OUT;
  115. $do4xaggr=1;
  116. # common register layout
  117. $nlo="%rax";
  118. $nhi="%rbx";
  119. $Zlo="%r8";
  120. $Zhi="%r9";
  121. $tmp="%r10";
  122. $rem_4bit = "%r11";
  123. $Xi="%rdi";
  124. $Htbl="%rsi";
  125. # per-function register layout
  126. $cnt="%rcx";
  127. $rem="%rdx";
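# LB($reg) rewrites a general-purpose register name into its low-byte
# alias, e.g. %rax -> %al, %rsi -> %sil, %rbp -> %bpl, %r10 -> %r10b.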
  128. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  129. $r =~ s/%[er]([sd]i)/%\1l/ or
  130. $r =~ s/%[er](bp)/%\1l/ or
  131. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
  132. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  133. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  134. my $arg = pop;
  135. $arg = "\$$arg" if ($arg*1 eq $arg);
  136. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  137. }
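# loop() emits the 4-bit (Shoup) table-lookup inner loop: Xi is consumed
# one byte at a time from offset 15 down to 0, each byte is split into its
# low and high nibbles which index the 16-byte-per-entry table at $Htbl,
# and the bits shifted out of Z on every step are folded back in through
# the shared .Lrem_4bit table.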
  138. { my $N;
  139. sub loop() {
  140. my $inp = shift;
  141. $N++;
  142. $code.=<<___;
  143. xor $nlo,$nlo
  144. xor $nhi,$nhi
  145. mov `&LB("$Zlo")`,`&LB("$nlo")`
  146. mov `&LB("$Zlo")`,`&LB("$nhi")`
  147. shl \$4,`&LB("$nlo")`
  148. mov \$14,$cnt
  149. mov 8($Htbl,$nlo),$Zlo
  150. mov ($Htbl,$nlo),$Zhi
  151. and \$0xf0,`&LB("$nhi")`
  152. mov $Zlo,$rem
  153. jmp .Loop$N
  154. .align 16
  155. .Loop$N:
  156. shr \$4,$Zlo
  157. and \$0xf,$rem
  158. mov $Zhi,$tmp
  159. mov ($inp,$cnt),`&LB("$nlo")`
  160. shr \$4,$Zhi
  161. xor 8($Htbl,$nhi),$Zlo
  162. shl \$60,$tmp
  163. xor ($Htbl,$nhi),$Zhi
  164. mov `&LB("$nlo")`,`&LB("$nhi")`
  165. xor ($rem_4bit,$rem,8),$Zhi
  166. mov $Zlo,$rem
  167. shl \$4,`&LB("$nlo")`
  168. xor $tmp,$Zlo
  169. dec $cnt
  170. js .Lbreak$N
  171. shr \$4,$Zlo
  172. and \$0xf,$rem
  173. mov $Zhi,$tmp
  174. shr \$4,$Zhi
  175. xor 8($Htbl,$nlo),$Zlo
  176. shl \$60,$tmp
  177. xor ($Htbl,$nlo),$Zhi
  178. and \$0xf0,`&LB("$nhi")`
  179. xor ($rem_4bit,$rem,8),$Zhi
  180. mov $Zlo,$rem
  181. xor $tmp,$Zlo
  182. jmp .Loop$N
  183. .align 16
  184. .Lbreak$N:
  185. shr \$4,$Zlo
  186. and \$0xf,$rem
  187. mov $Zhi,$tmp
  188. shr \$4,$Zhi
  189. xor 8($Htbl,$nlo),$Zlo
  190. shl \$60,$tmp
  191. xor ($Htbl,$nlo),$Zhi
  192. and \$0xf0,`&LB("$nhi")`
  193. xor ($rem_4bit,$rem,8),$Zhi
  194. mov $Zlo,$rem
  195. xor $tmp,$Zlo
  196. shr \$4,$Zlo
  197. and \$0xf,$rem
  198. mov $Zhi,$tmp
  199. shr \$4,$Zhi
  200. xor 8($Htbl,$nhi),$Zlo
  201. shl \$60,$tmp
  202. xor ($Htbl,$nhi),$Zhi
  203. xor $tmp,$Zlo
  204. xor ($rem_4bit,$rem,8),$Zhi
  205. bswap $Zlo
  206. bswap $Zhi
  207. ___
  208. }}
  209. $code=<<___;
  210. .text
  211. .extern OPENSSL_ia32cap_P
  212. .globl gcm_gmult_4bit
  213. .type gcm_gmult_4bit,\@function,2
  214. .align 16
  215. gcm_gmult_4bit:
  216. .cfi_startproc
  217. push %rbx
  218. .cfi_push %rbx
  219. push %rbp # %rbp and others are pushed exclusively in
  220. .cfi_push %rbp
  221. push %r12 # order to reuse Win64 exception handler...
  222. .cfi_push %r12
  223. push %r13
  224. .cfi_push %r13
  225. push %r14
  226. .cfi_push %r14
  227. push %r15
  228. .cfi_push %r15
  229. sub \$280,%rsp
  230. .cfi_adjust_cfa_offset 280
  231. .Lgmult_prologue:
  232. movzb 15($Xi),$Zlo
  233. lea .Lrem_4bit(%rip),$rem_4bit
  234. ___
  235. &loop ($Xi);
  236. $code.=<<___;
  237. mov $Zlo,8($Xi)
  238. mov $Zhi,($Xi)
  239. lea 280+48(%rsp),%rsi
  240. .cfi_def_cfa %rsi,8
  241. mov -8(%rsi),%rbx
  242. .cfi_restore %rbx
  243. lea (%rsi),%rsp
  244. .cfi_def_cfa_register %rsp
  245. .Lgmult_epilogue:
  246. ret
  247. .cfi_endproc
  248. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  249. ___
  250. # per-function register layout
  251. $inp="%rdx";
  252. $len="%rcx";
  253. $rem_8bit=$rem_4bit;
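# gcm_ghash_4bit is the streamed "528B" variant mentioned in the header:
# it builds a nibble-shifted copy of the key table on the stack ($Hshr4),
# then walks the input a byte at a time, folding carries through the
# 512-byte shared .Lrem_8bit table.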
  254. $code.=<<___;
  255. .globl gcm_ghash_4bit
  256. .type gcm_ghash_4bit,\@function,4
  257. .align 16
  258. gcm_ghash_4bit:
  259. .cfi_startproc
  260. push %rbx
  261. .cfi_push %rbx
  262. push %rbp
  263. .cfi_push %rbp
  264. push %r12
  265. .cfi_push %r12
  266. push %r13
  267. .cfi_push %r13
  268. push %r14
  269. .cfi_push %r14
  270. push %r15
  271. .cfi_push %r15
  272. sub \$280,%rsp
  273. .cfi_adjust_cfa_offset 280
  274. .Lghash_prologue:
  275. mov $inp,%r14 # reassign couple of args
  276. mov $len,%r15
  277. ___
  278. { my $inp="%r14";
  279. my $dat="%edx";
  280. my $len="%r15";
  281. my @nhi=("%ebx","%ecx");
  282. my @rem=("%r12","%r13");
  283. my $Hshr4="%rbp";
  284. &sub ($Htbl,-128); # size optimization
  285. &lea ($Hshr4,"16+128(%rsp)");
  286. { my @lo =($nlo,$nhi);
  287. my @hi =($Zlo,$Zhi);
  288. &xor ($dat,$dat);
  289. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  290. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  291. &or ($lo[0],$tmp) if ($i>1);
  292. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  293. &shr ($lo[1],4) if ($i>0 && $i<17);
  294. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  295. &shr ($hi[1],4) if ($i>0 && $i<17);
  296. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  297. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  298. &shl (&LB($dat),4) if ($i>0 && $i<17);
  299. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  300. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  301. &shl ($tmp,60) if ($i>0 && $i<17);
  302. push (@lo,shift(@lo));
  303. push (@hi,shift(@hi));
  304. }
  305. }
  306. &add ($Htbl,-128);
  307. &mov ($Zlo,"8($Xi)");
  308. &mov ($Zhi,"0($Xi)");
  309. &add ($len,$inp); # pointer to the end of data
  310. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  311. &jmp (".Louter_loop");
  312. $code.=".align 16\n.Louter_loop:\n";
  313. &xor ($Zhi,"($inp)");
  314. &mov ("%rdx","8($inp)");
  315. &lea ($inp,"16($inp)");
  316. &xor ("%rdx",$Zlo);
  317. &mov ("($Xi)",$Zhi);
  318. &mov ("8($Xi)","%rdx");
  319. &shr ("%rdx",32);
  320. &xor ($nlo,$nlo);
  321. &rol ($dat,8);
  322. &mov (&LB($nlo),&LB($dat));
  323. &movz ($nhi[0],&LB($dat));
  324. &shl (&LB($nlo),4);
  325. &shr ($nhi[0],4);
  326. for ($j=11,$i=0;$i<15;$i++) {
  327. &rol ($dat,8);
  328. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  329. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  330. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  331. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  332. &mov (&LB($nlo),&LB($dat));
  333. &xor ($Zlo,$tmp) if ($i>0);
  334. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  335. &movz ($nhi[1],&LB($dat));
  336. &shl (&LB($nlo),4);
  337. &movzb ($rem[0],"(%rsp,$nhi[0])");
  338. &shr ($nhi[1],4) if ($i<14);
  339. &and ($nhi[1],0xf0) if ($i==14);
  340. &shl ($rem[1],48) if ($i>0);
  341. &xor ($rem[0],$Zlo);
  342. &mov ($tmp,$Zhi);
  343. &xor ($Zhi,$rem[1]) if ($i>0);
  344. &shr ($Zlo,8);
  345. &movz ($rem[0],&LB($rem[0]));
  346. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  347. &shr ($Zhi,8);
  348. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  349. &shl ($tmp,56);
  350. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  351. unshift (@nhi,pop(@nhi)); # "rotate" registers
  352. unshift (@rem,pop(@rem));
  353. }
  354. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  355. &xor ($Zlo,"8($Htbl,$nlo)");
  356. &xor ($Zhi,"($Htbl,$nlo)");
  357. &shl ($rem[1],48);
  358. &xor ($Zlo,$tmp);
  359. &xor ($Zhi,$rem[1]);
  360. &movz ($rem[0],&LB($Zlo));
  361. &shr ($Zlo,4);
  362. &mov ($tmp,$Zhi);
  363. &shl (&LB($rem[0]),4);
  364. &shr ($Zhi,4);
  365. &xor ($Zlo,"8($Htbl,$nhi[0])");
  366. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  367. &shl ($tmp,60);
  368. &xor ($Zhi,"($Htbl,$nhi[0])");
  369. &xor ($Zlo,$tmp);
  370. &shl ($rem[0],48);
  371. &bswap ($Zlo);
  372. &xor ($Zhi,$rem[0]);
  373. &bswap ($Zhi);
  374. &cmp ($inp,$len);
  375. &jb (".Louter_loop");
  376. }
  377. $code.=<<___;
  378. mov $Zlo,8($Xi)
  379. mov $Zhi,($Xi)
  380. lea 280+48(%rsp),%rsi
  381. .cfi_def_cfa %rsi,8
  382. mov -48(%rsi),%r15
  383. .cfi_restore %r15
  384. mov -40(%rsi),%r14
  385. .cfi_restore %r14
  386. mov -32(%rsi),%r13
  387. .cfi_restore %r13
  388. mov -24(%rsi),%r12
  389. .cfi_restore %r12
  390. mov -16(%rsi),%rbp
  391. .cfi_restore %rbp
  392. mov -8(%rsi),%rbx
  393. .cfi_restore %rbx
  394. lea 0(%rsi),%rsp
  395. .cfi_def_cfa_register %rsp
  396. .Lghash_epilogue:
  397. ret
  398. .cfi_endproc
  399. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  400. ___
  401. ######################################################################
  402. # PCLMULQDQ version.
  403. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  404. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  405. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  406. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  407. sub clmul64x64_T2 { # minimal register pressure
  408. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  409. if (!defined($HK)) { $HK = $T2;
  410. $code.=<<___;
  411. movdqa $Xi,$Xhi #
  412. pshufd \$0b01001110,$Xi,$T1
  413. pshufd \$0b01001110,$Hkey,$T2
  414. pxor $Xi,$T1 #
  415. pxor $Hkey,$T2
  416. ___
  417. } else {
  418. $code.=<<___;
  419. movdqa $Xi,$Xhi #
  420. pshufd \$0b01001110,$Xi,$T1
  421. pxor $Xi,$T1 #
  422. ___
  423. }
  424. $code.=<<___;
  425. pclmulqdq \$0x00,$Hkey,$Xi #######
  426. pclmulqdq \$0x11,$Hkey,$Xhi #######
  427. pclmulqdq \$0x00,$HK,$T1 #######
  428. pxor $Xi,$T1 #
  429. pxor $Xhi,$T1 #
  430. movdqa $T1,$T2 #
  431. psrldq \$8,$T1
  432. pslldq \$8,$T2 #
  433. pxor $T1,$Xhi
  434. pxor $T2,$Xi #
  435. ___
  436. }
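# Reference sketch (assumed helper names, not used by the generated code)
# of what clmul64x64_T2 computes, with a naive carry-less multiply in Perl:
# three pclmulqdq's produce lo = Xl*Hl, hi = Xh*Hh and (Xl^Xh)*(Hl^Hh),
# and the Karatsuba post-processing recovers the middle 128 bits of the
# 256-bit product.
use Math::BigInt;

sub clmul64 {                   # carry-less 64x64 -> 128-bit multiply
    my ($a, $b) = @_;           # Math::BigInt values, at most 64 bits each
    my $r = Math::BigInt->bzero();
    for my $i (0 .. 63) {
        $r->bxor($a->copy()->blsft($i))
            if $b->copy()->brsft($i)->band(1)->is_one();
    }
    return $r;
}

sub clmul128_karatsuba {        # 256-bit carry-less product of 128-bit values
    my ($x, $h) = @_;
    my $mask = Math::BigInt->new("0xFFFFFFFFFFFFFFFF");
    my ($xl, $xh) = ($x->copy()->band($mask), $x->copy()->brsft(64));
    my ($hl, $hh) = ($h->copy()->band($mask), $h->copy()->brsft(64));
    my $lo  = clmul64($xl, $hl);                        # the 0x00 pclmulqdq
    my $hi  = clmul64($xh, $hh);                        # the 0x11 pclmulqdq
    my $mid = clmul64($xl->copy()->bxor($xh),           # the third pclmulqdq
                      $hl->copy()->bxor($hh));
    $mid->bxor($lo)->bxor($hi);                         # Karatsuba post-processing
    return $hi->blsft(128)->bxor($mid->blsft(64))->bxor($lo);
}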
  437. sub reduction_alg9 { # 17/11 times faster than Intel version
  438. my ($Xhi,$Xi) = @_;
  439. $code.=<<___;
  440. # 1st phase
  441. movdqa $Xi,$T2 #
  442. movdqa $Xi,$T1
  443. psllq \$5,$Xi
  444. pxor $Xi,$T1 #
  445. psllq \$1,$Xi
  446. pxor $T1,$Xi #
  447. psllq \$57,$Xi #
  448. movdqa $Xi,$T1 #
  449. pslldq \$8,$Xi
  450. psrldq \$8,$T1 #
  451. pxor $T2,$Xi
  452. pxor $T1,$Xhi #
  453. # 2nd phase
  454. movdqa $Xi,$T2
  455. psrlq \$1,$Xi
  456. pxor $T2,$Xhi #
  457. pxor $Xi,$T2
  458. psrlq \$5,$Xi
  459. pxor $T2,$Xi #
  460. psrlq \$1,$Xi #
  461. pxor $Xhi,$Xi #
  462. ___
  463. }
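# The mathematical job of reduction_alg9 is to reduce the 256-bit carry-less
# product modulo g(x) = x^128+x^7+x^2+x+1. The sketch below (gf128_reduce()
# is an assumed name) does this naively in the plain polynomial basis; the
# assembly performs the equivalent folding on the bit-reflected form GHASH
# uses, in two shift-based phases instead of a loop.
use Math::BigInt;

sub gf128_reduce {
    my ($p) = @_;               # up to 256-bit Math::BigInt product
    my $g = Math::BigInt->bone()->blsft(128)->bxor(0x87); # x^128+x^7+x^2+x+1
    for my $i (reverse 128 .. 255) {
        $p->bxor($g->copy()->blsft($i - 128))
            if $p->copy()->brsft($i)->band(1)->is_one();
    }
    return $p;                  # now at most 128 bits
}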
  464. { my ($Htbl,$Xip)=@_4args;
  465. my $HK="%xmm6";
  466. $code.=<<___;
  467. .globl gcm_init_clmul
  468. .type gcm_init_clmul,\@abi-omnipotent
  469. .align 16
  470. gcm_init_clmul:
  471. .cfi_startproc
  472. .L_init_clmul:
  473. ___
  474. $code.=<<___ if ($win64);
  475. .LSEH_begin_gcm_init_clmul:
  476. # I can't trust assembler to use specific encoding:-(
  477. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  478. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  479. ___
  480. $code.=<<___;
  481. movdqu ($Xip),$Hkey
  482. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  483. # <<1 twist
  484. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  485. movdqa $Hkey,$T1
  486. psllq \$1,$Hkey
  487. pxor $T3,$T3 #
  488. psrlq \$63,$T1
  489. pcmpgtd $T2,$T3 # broadcast carry bit
  490. pslldq \$8,$T1
  491. por $T1,$Hkey # H<<=1
  492. # magic reduction
  493. pand .L0x1c2_polynomial(%rip),$T3
  494. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  495. # calculate H^2
  496. pshufd \$0b01001110,$Hkey,$HK
  497. movdqa $Hkey,$Xi
  498. pxor $Hkey,$HK
  499. ___
  500. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  501. &reduction_alg9 ($Xhi,$Xi);
  502. $code.=<<___;
  503. pshufd \$0b01001110,$Hkey,$T1
  504. pshufd \$0b01001110,$Xi,$T2
  505. pxor $Hkey,$T1 # Karatsuba pre-processing
  506. movdqu $Hkey,0x00($Htbl) # save H
  507. pxor $Xi,$T2 # Karatsuba pre-processing
  508. movdqu $Xi,0x10($Htbl) # save H^2
  509. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  510. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  511. ___
  512. if ($do4xaggr) {
  513. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  514. &reduction_alg9 ($Xhi,$Xi);
  515. $code.=<<___;
  516. movdqa $Xi,$T3
  517. ___
  518. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  519. &reduction_alg9 ($Xhi,$Xi);
  520. $code.=<<___;
  521. pshufd \$0b01001110,$T3,$T1
  522. pshufd \$0b01001110,$Xi,$T2
  523. pxor $T3,$T1 # Karatsuba pre-processing
  524. movdqu $T3,0x30($Htbl) # save H^3
  525. pxor $Xi,$T2 # Karatsuba pre-processing
  526. movdqu $Xi,0x40($Htbl) # save H^4
  527. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  528. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  529. ___
  530. }
  531. $code.=<<___ if ($win64);
  532. movaps (%rsp),%xmm6
  533. lea 0x18(%rsp),%rsp
  534. .LSEH_end_gcm_init_clmul:
  535. ___
  536. $code.=<<___;
  537. ret
  538. .cfi_endproc
  539. .size gcm_init_clmul,.-gcm_init_clmul
  540. ___
  541. }
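# Layout produced by gcm_init_clmul (offsets into $Htbl):
#   0x00  H      0x10  H^2    0x20  Karatsuba "salt" (H.lo^H.hi | H^2.lo^H^2.hi)
#   0x30  H^3    0x40  H^4    0x50  Karatsuba "salt" for H^3/H^4
# gcm_gmult_clmul and gcm_ghash_clmul below rely on exactly this arrangement.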
  542. { my ($Xip,$Htbl)=@_4args;
  543. $code.=<<___;
  544. .globl gcm_gmult_clmul
  545. .type gcm_gmult_clmul,\@abi-omnipotent
  546. .align 16
  547. gcm_gmult_clmul:
  548. .cfi_startproc
  549. .L_gmult_clmul:
  550. movdqu ($Xip),$Xi
  551. movdqa .Lbswap_mask(%rip),$T3
  552. movdqu ($Htbl),$Hkey
  553. movdqu 0x20($Htbl),$T2
  554. pshufb $T3,$Xi
  555. ___
  556. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  557. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  558. # experimental alternative. the special thing about it is that there is
  559. # no dependency between the two multiplications...
  560. mov \$`0xE1<<1`,%eax
  561. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  562. mov \$0x07,%r11d
  563. movq %rax,$T1
  564. movq %r10,$T2
  565. movq %r11,$T3 # borrow $T3
  566. pand $Xi,$T3
  567. pshufb $T3,$T2 # ($Xi&7)·0xE0
  568. movq %rax,$T3
  569. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  570. pxor $Xi,$T2
  571. pslldq \$15,$T2
  572. paddd $T2,$T2 # <<(64+56+1)
  573. pxor $T2,$Xi
  574. pclmulqdq \$0x01,$T3,$Xi
  575. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  576. psrldq \$1,$T1
  577. pxor $T1,$Xhi
  578. pslldq \$7,$Xi
  579. pxor $Xhi,$Xi
  580. ___
  581. $code.=<<___;
  582. pshufb $T3,$Xi
  583. movdqu $Xi,($Xip)
  584. ret
  585. .cfi_endproc
  586. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  587. ___
  588. }
  589. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  590. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  591. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  592. $code.=<<___;
  593. .globl gcm_ghash_clmul
  594. .type gcm_ghash_clmul,\@abi-omnipotent
  595. .align 32
  596. gcm_ghash_clmul:
  597. .cfi_startproc
  598. .L_ghash_clmul:
  599. ___
  600. $code.=<<___ if ($win64);
  601. lea -0x88(%rsp),%rax
  602. .LSEH_begin_gcm_ghash_clmul:
  603. # I can't trust assembler to use specific encoding:-(
  604. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  605. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  606. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  607. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  608. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  609. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  610. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  611. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  612. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  613. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  614. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  615. ___
  616. $code.=<<___;
  617. movdqa .Lbswap_mask(%rip),$T3
  618. movdqu ($Xip),$Xi
  619. movdqu ($Htbl),$Hkey
  620. movdqu 0x20($Htbl),$HK
  621. pshufb $T3,$Xi
  622. sub \$0x10,$len
  623. jz .Lodd_tail
  624. movdqu 0x10($Htbl),$Hkey2
  625. ___
  626. if ($do4xaggr) {
  627. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  628. $code.=<<___;
  629. mov OPENSSL_ia32cap_P+4(%rip),%eax
  630. cmp \$0x30,$len
  631. jb .Lskip4x
  632. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  633. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  634. je .Lskip4x
  635. sub \$0x30,$len
  636. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  637. movdqu 0x30($Htbl),$Hkey3
  638. movdqu 0x40($Htbl),$Hkey4
  639. #######
  640. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  641. #
  642. movdqu 0x30($inp),$Xln
  643. movdqu 0x20($inp),$Xl
  644. pshufb $T3,$Xln
  645. pshufb $T3,$Xl
  646. movdqa $Xln,$Xhn
  647. pshufd \$0b01001110,$Xln,$Xmn
  648. pxor $Xln,$Xmn
  649. pclmulqdq \$0x00,$Hkey,$Xln
  650. pclmulqdq \$0x11,$Hkey,$Xhn
  651. pclmulqdq \$0x00,$HK,$Xmn
  652. movdqa $Xl,$Xh
  653. pshufd \$0b01001110,$Xl,$Xm
  654. pxor $Xl,$Xm
  655. pclmulqdq \$0x00,$Hkey2,$Xl
  656. pclmulqdq \$0x11,$Hkey2,$Xh
  657. pclmulqdq \$0x10,$HK,$Xm
  658. xorps $Xl,$Xln
  659. xorps $Xh,$Xhn
  660. movups 0x50($Htbl),$HK
  661. xorps $Xm,$Xmn
  662. movdqu 0x10($inp),$Xl
  663. movdqu 0($inp),$T1
  664. pshufb $T3,$Xl
  665. pshufb $T3,$T1
  666. movdqa $Xl,$Xh
  667. pshufd \$0b01001110,$Xl,$Xm
  668. pxor $T1,$Xi
  669. pxor $Xl,$Xm
  670. pclmulqdq \$0x00,$Hkey3,$Xl
  671. movdqa $Xi,$Xhi
  672. pshufd \$0b01001110,$Xi,$T1
  673. pxor $Xi,$T1
  674. pclmulqdq \$0x11,$Hkey3,$Xh
  675. pclmulqdq \$0x00,$HK,$Xm
  676. xorps $Xl,$Xln
  677. xorps $Xh,$Xhn
  678. lea 0x40($inp),$inp
  679. sub \$0x40,$len
  680. jc .Ltail4x
  681. jmp .Lmod4_loop
  682. .align 32
  683. .Lmod4_loop:
  684. pclmulqdq \$0x00,$Hkey4,$Xi
  685. xorps $Xm,$Xmn
  686. movdqu 0x30($inp),$Xl
  687. pshufb $T3,$Xl
  688. pclmulqdq \$0x11,$Hkey4,$Xhi
  689. xorps $Xln,$Xi
  690. movdqu 0x20($inp),$Xln
  691. movdqa $Xl,$Xh
  692. pclmulqdq \$0x10,$HK,$T1
  693. pshufd \$0b01001110,$Xl,$Xm
  694. xorps $Xhn,$Xhi
  695. pxor $Xl,$Xm
  696. pshufb $T3,$Xln
  697. movups 0x20($Htbl),$HK
  698. xorps $Xmn,$T1
  699. pclmulqdq \$0x00,$Hkey,$Xl
  700. pshufd \$0b01001110,$Xln,$Xmn
  701. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  702. movdqa $Xln,$Xhn
  703. pxor $Xhi,$T1 #
  704. pxor $Xln,$Xmn
  705. movdqa $T1,$T2 #
  706. pclmulqdq \$0x11,$Hkey,$Xh
  707. pslldq \$8,$T1
  708. psrldq \$8,$T2 #
  709. pxor $T1,$Xi
  710. movdqa .L7_mask(%rip),$T1
  711. pxor $T2,$Xhi #
  712. movq %rax,$T2
  713. pand $Xi,$T1 # 1st phase
  714. pshufb $T1,$T2 #
  715. pxor $Xi,$T2 #
  716. pclmulqdq \$0x00,$HK,$Xm
  717. psllq \$57,$T2 #
  718. movdqa $T2,$T1 #
  719. pslldq \$8,$T2
  720. pclmulqdq \$0x00,$Hkey2,$Xln
  721. psrldq \$8,$T1 #
  722. pxor $T2,$Xi
  723. pxor $T1,$Xhi #
  724. movdqu 0($inp),$T1
  725. movdqa $Xi,$T2 # 2nd phase
  726. psrlq \$1,$Xi
  727. pclmulqdq \$0x11,$Hkey2,$Xhn
  728. xorps $Xl,$Xln
  729. movdqu 0x10($inp),$Xl
  730. pshufb $T3,$Xl
  731. pclmulqdq \$0x10,$HK,$Xmn
  732. xorps $Xh,$Xhn
  733. movups 0x50($Htbl),$HK
  734. pshufb $T3,$T1
  735. pxor $T2,$Xhi #
  736. pxor $Xi,$T2
  737. psrlq \$5,$Xi
  738. movdqa $Xl,$Xh
  739. pxor $Xm,$Xmn
  740. pshufd \$0b01001110,$Xl,$Xm
  741. pxor $T2,$Xi #
  742. pxor $T1,$Xhi
  743. pxor $Xl,$Xm
  744. pclmulqdq \$0x00,$Hkey3,$Xl
  745. psrlq \$1,$Xi #
  746. pxor $Xhi,$Xi #
  747. movdqa $Xi,$Xhi
  748. pclmulqdq \$0x11,$Hkey3,$Xh
  749. xorps $Xl,$Xln
  750. pshufd \$0b01001110,$Xi,$T1
  751. pxor $Xi,$T1
  752. pclmulqdq \$0x00,$HK,$Xm
  753. xorps $Xh,$Xhn
  754. lea 0x40($inp),$inp
  755. sub \$0x40,$len
  756. jnc .Lmod4_loop
  757. .Ltail4x:
  758. pclmulqdq \$0x00,$Hkey4,$Xi
  759. pclmulqdq \$0x11,$Hkey4,$Xhi
  760. pclmulqdq \$0x10,$HK,$T1
  761. xorps $Xm,$Xmn
  762. xorps $Xln,$Xi
  763. xorps $Xhn,$Xhi
  764. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  765. pxor $Xmn,$T1
  766. pxor $Xhi,$T1 #
  767. pxor $Xi,$Xhi
  768. movdqa $T1,$T2 #
  769. psrldq \$8,$T1
  770. pslldq \$8,$T2 #
  771. pxor $T1,$Xhi
  772. pxor $T2,$Xi #
  773. ___
  774. &reduction_alg9($Xhi,$Xi);
  775. $code.=<<___;
  776. add \$0x40,$len
  777. jz .Ldone
  778. movdqu 0x20($Htbl),$HK
  779. sub \$0x10,$len
  780. jz .Lodd_tail
  781. .Lskip4x:
  782. ___
  783. }
  784. $code.=<<___;
  785. #######
  786. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  787. # [(H*Ii+1) + (H*Xi+1)] mod P =
  788. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  789. #
  790. movdqu ($inp),$T1 # Ii
  791. movdqu 16($inp),$Xln # Ii+1
  792. pshufb $T3,$T1
  793. pshufb $T3,$Xln
  794. pxor $T1,$Xi # Ii+Xi
  795. movdqa $Xln,$Xhn
  796. pshufd \$0b01001110,$Xln,$Xmn
  797. pxor $Xln,$Xmn
  798. pclmulqdq \$0x00,$Hkey,$Xln
  799. pclmulqdq \$0x11,$Hkey,$Xhn
  800. pclmulqdq \$0x00,$HK,$Xmn
  801. lea 32($inp),$inp # i+=2
  802. nop
  803. sub \$0x20,$len
  804. jbe .Leven_tail
  805. nop
  806. jmp .Lmod_loop
  807. .align 32
  808. .Lmod_loop:
  809. movdqa $Xi,$Xhi
  810. movdqa $Xmn,$T1
  811. pshufd \$0b01001110,$Xi,$Xmn #
  812. pxor $Xi,$Xmn #
  813. pclmulqdq \$0x00,$Hkey2,$Xi
  814. pclmulqdq \$0x11,$Hkey2,$Xhi
  815. pclmulqdq \$0x10,$HK,$Xmn
  816. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  817. pxor $Xhn,$Xhi
  818. movdqu ($inp),$T2 # Ii
  819. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  820. pshufb $T3,$T2
  821. movdqu 16($inp),$Xln # Ii+1
  822. pxor $Xhi,$T1
  823. pxor $T2,$Xhi # "Ii+Xi", consume early
  824. pxor $T1,$Xmn
  825. pshufb $T3,$Xln
  826. movdqa $Xmn,$T1 #
  827. psrldq \$8,$T1
  828. pslldq \$8,$Xmn #
  829. pxor $T1,$Xhi
  830. pxor $Xmn,$Xi #
  831. movdqa $Xln,$Xhn #
  832. movdqa $Xi,$T2 # 1st phase
  833. movdqa $Xi,$T1
  834. psllq \$5,$Xi
  835. pxor $Xi,$T1 #
  836. pclmulqdq \$0x00,$Hkey,$Xln #######
  837. psllq \$1,$Xi
  838. pxor $T1,$Xi #
  839. psllq \$57,$Xi #
  840. movdqa $Xi,$T1 #
  841. pslldq \$8,$Xi
  842. psrldq \$8,$T1 #
  843. pxor $T2,$Xi
  844. pshufd \$0b01001110,$Xhn,$Xmn
  845. pxor $T1,$Xhi #
  846. pxor $Xhn,$Xmn #
  847. movdqa $Xi,$T2 # 2nd phase
  848. psrlq \$1,$Xi
  849. pclmulqdq \$0x11,$Hkey,$Xhn #######
  850. pxor $T2,$Xhi #
  851. pxor $Xi,$T2
  852. psrlq \$5,$Xi
  853. pxor $T2,$Xi #
  854. lea 32($inp),$inp
  855. psrlq \$1,$Xi #
  856. pclmulqdq \$0x00,$HK,$Xmn #######
  857. pxor $Xhi,$Xi #
  858. sub \$0x20,$len
  859. ja .Lmod_loop
  860. .Leven_tail:
  861. movdqa $Xi,$Xhi
  862. movdqa $Xmn,$T1
  863. pshufd \$0b01001110,$Xi,$Xmn #
  864. pxor $Xi,$Xmn #
  865. pclmulqdq \$0x00,$Hkey2,$Xi
  866. pclmulqdq \$0x11,$Hkey2,$Xhi
  867. pclmulqdq \$0x10,$HK,$Xmn
  868. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  869. pxor $Xhn,$Xhi
  870. pxor $Xi,$T1
  871. pxor $Xhi,$T1
  872. pxor $T1,$Xmn
  873. movdqa $Xmn,$T1 #
  874. psrldq \$8,$T1
  875. pslldq \$8,$Xmn #
  876. pxor $T1,$Xhi
  877. pxor $Xmn,$Xi #
  878. ___
  879. &reduction_alg9 ($Xhi,$Xi);
  880. $code.=<<___;
  881. test $len,$len
  882. jnz .Ldone
  883. .Lodd_tail:
  884. movdqu ($inp),$T1 # Ii
  885. pshufb $T3,$T1
  886. pxor $T1,$Xi # Ii+Xi
  887. ___
  888. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  889. &reduction_alg9 ($Xhi,$Xi);
  890. $code.=<<___;
  891. .Ldone:
  892. pshufb $T3,$Xi
  893. movdqu $Xi,($Xip)
  894. ___
  895. $code.=<<___ if ($win64);
  896. movaps (%rsp),%xmm6
  897. movaps 0x10(%rsp),%xmm7
  898. movaps 0x20(%rsp),%xmm8
  899. movaps 0x30(%rsp),%xmm9
  900. movaps 0x40(%rsp),%xmm10
  901. movaps 0x50(%rsp),%xmm11
  902. movaps 0x60(%rsp),%xmm12
  903. movaps 0x70(%rsp),%xmm13
  904. movaps 0x80(%rsp),%xmm14
  905. movaps 0x90(%rsp),%xmm15
  906. lea 0xa8(%rsp),%rsp
  907. .LSEH_end_gcm_ghash_clmul:
  908. ___
  909. $code.=<<___;
  910. ret
  911. .cfi_endproc
  912. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  913. ___
  914. }
  915. $code.=<<___;
  916. .globl gcm_init_avx
  917. .type gcm_init_avx,\@abi-omnipotent
  918. .align 32
  919. gcm_init_avx:
  920. .cfi_startproc
  921. ___
  922. if ($avx) {
  923. my ($Htbl,$Xip)=@_4args;
  924. my $HK="%xmm6";
  925. $code.=<<___ if ($win64);
  926. .LSEH_begin_gcm_init_avx:
  927. # I can't trust assembler to use specific encoding:-(
  928. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  929. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  930. ___
  931. $code.=<<___;
  932. vzeroupper
  933. vmovdqu ($Xip),$Hkey
  934. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  935. # <<1 twist
  936. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  937. vpsrlq \$63,$Hkey,$T1
  938. vpsllq \$1,$Hkey,$Hkey
  939. vpxor $T3,$T3,$T3 #
  940. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  941. vpslldq \$8,$T1,$T1
  942. vpor $T1,$Hkey,$Hkey # H<<=1
  943. # magic reduction
  944. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  945. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  946. vpunpckhqdq $Hkey,$Hkey,$HK
  947. vmovdqa $Hkey,$Xi
  948. vpxor $Hkey,$HK,$HK
  949. mov \$4,%r10 # up to H^8
  950. jmp .Linit_start_avx
  951. ___
  952. sub clmul64x64_avx {
  953. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  954. if (!defined($HK)) { $HK = $T2;
  955. $code.=<<___;
  956. vpunpckhqdq $Xi,$Xi,$T1
  957. vpunpckhqdq $Hkey,$Hkey,$T2
  958. vpxor $Xi,$T1,$T1 #
  959. vpxor $Hkey,$T2,$T2
  960. ___
  961. } else {
  962. $code.=<<___;
  963. vpunpckhqdq $Xi,$Xi,$T1
  964. vpxor $Xi,$T1,$T1 #
  965. ___
  966. }
  967. $code.=<<___;
  968. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  969. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  970. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  971. vpxor $Xi,$Xhi,$T2 #
  972. vpxor $T2,$T1,$T1 #
  973. vpslldq \$8,$T1,$T2 #
  974. vpsrldq \$8,$T1,$T1
  975. vpxor $T2,$Xi,$Xi #
  976. vpxor $T1,$Xhi,$Xhi
  977. ___
  978. }
  979. sub reduction_avx {
  980. my ($Xhi,$Xi) = @_;
  981. $code.=<<___;
  982. vpsllq \$57,$Xi,$T1 # 1st phase
  983. vpsllq \$62,$Xi,$T2
  984. vpxor $T1,$T2,$T2 #
  985. vpsllq \$63,$Xi,$T1
  986. vpxor $T1,$T2,$T2 #
  987. vpslldq \$8,$T2,$T1 #
  988. vpsrldq \$8,$T2,$T2
  989. vpxor $T1,$Xi,$Xi #
  990. vpxor $T2,$Xhi,$Xhi
  991. vpsrlq \$1,$Xi,$T2 # 2nd phase
  992. vpxor $Xi,$Xhi,$Xhi
  993. vpxor $T2,$Xi,$Xi #
  994. vpsrlq \$5,$T2,$T2
  995. vpxor $T2,$Xi,$Xi #
  996. vpsrlq \$1,$Xi,$Xi #
  997. vpxor $Xhi,$Xi,$Xi #
  998. ___
  999. }
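# reduction_avx is the same two-phase reduction as reduction_alg9 above
# (its first phase computes Xi<<57 ^ Xi<<62 ^ Xi<<63 directly instead of
# in factored form), expressed with non-destructive 3-operand AVX
# instructions.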
  1000. $code.=<<___;
  1001. .align 32
  1002. .Linit_loop_avx:
  1003. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  1004. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  1005. ___
  1006. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  1007. &reduction_avx ($Xhi,$Xi);
  1008. $code.=<<___;
  1009. .Linit_start_avx:
  1010. vmovdqa $Xi,$T3
  1011. ___
  1012. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  1013. &reduction_avx ($Xhi,$Xi);
  1014. $code.=<<___;
  1015. vpshufd \$0b01001110,$T3,$T1
  1016. vpshufd \$0b01001110,$Xi,$T2
  1017. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  1018. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  1019. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  1020. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  1021. lea 0x30($Htbl),$Htbl
  1022. sub \$1,%r10
  1023. jnz .Linit_loop_avx
  1024. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  1025. vmovdqu $T3,-0x10($Htbl)
  1026. vzeroupper
  1027. ___
  1028. $code.=<<___ if ($win64);
  1029. movaps (%rsp),%xmm6
  1030. lea 0x18(%rsp),%rsp
  1031. .LSEH_end_gcm_init_avx:
  1032. ___
  1033. $code.=<<___;
  1034. ret
  1035. .cfi_endproc
  1036. .size gcm_init_avx,.-gcm_init_avx
  1037. ___
  1038. } else {
  1039. $code.=<<___;
  1040. jmp .L_init_clmul
  1041. .cfi_endproc
  1042. .size gcm_init_avx,.-gcm_init_avx
  1043. ___
  1044. }
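# gcm_init_avx lays the powers out in 0x30-byte groups: an odd power of H
# at +0x00, the following even power at +0x10 and their Karatsuba "salt"
# at +0x20, giving H^1..H^8 in four groups; gcm_ghash_avx below expects
# exactly this arrangement.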
  1045. $code.=<<___;
  1046. .globl gcm_gmult_avx
  1047. .type gcm_gmult_avx,\@abi-omnipotent
  1048. .align 32
  1049. gcm_gmult_avx:
  1050. .cfi_startproc
  1051. jmp .L_gmult_clmul
  1052. .cfi_endproc
  1053. .size gcm_gmult_avx,.-gcm_gmult_avx
  1054. ___
  1055. $code.=<<___;
  1056. .globl gcm_ghash_avx
  1057. .type gcm_ghash_avx,\@abi-omnipotent
  1058. .align 32
  1059. gcm_ghash_avx:
  1060. .cfi_startproc
  1061. ___
  1062. if ($avx) {
  1063. my ($Xip,$Htbl,$inp,$len)=@_4args;
  1064. my ($Xlo,$Xhi,$Xmi,
  1065. $Zlo,$Zhi,$Zmi,
  1066. $Hkey,$HK,$T1,$T2,
  1067. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  1068. $code.=<<___ if ($win64);
  1069. lea -0x88(%rsp),%rax
  1070. .LSEH_begin_gcm_ghash_avx:
  1071. # I can't trust assembler to use specific encoding:-(
  1072. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1073. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1074. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1075. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1076. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1077. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1078. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1079. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1080. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1081. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1082. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1083. ___
  1084. $code.=<<___;
  1085. vzeroupper
  1086. vmovdqu ($Xip),$Xi # load $Xi
  1087. lea .L0x1c2_polynomial(%rip),%r10
  1088. lea 0x40($Htbl),$Htbl # size optimization
  1089. vmovdqu .Lbswap_mask(%rip),$bswap
  1090. vpshufb $bswap,$Xi,$Xi
  1091. cmp \$0x80,$len
  1092. jb .Lshort_avx
  1093. sub \$0x80,$len
  1094. vmovdqu 0x70($inp),$Ii # I[7]
  1095. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1096. vpshufb $bswap,$Ii,$Ii
  1097. vmovdqu 0x20-0x40($Htbl),$HK
  1098. vpunpckhqdq $Ii,$Ii,$T2
  1099. vmovdqu 0x60($inp),$Ij # I[6]
  1100. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1101. vpxor $Ii,$T2,$T2
  1102. vpshufb $bswap,$Ij,$Ij
  1103. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1104. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1105. vpunpckhqdq $Ij,$Ij,$T1
  1106. vmovdqu 0x50($inp),$Ii # I[5]
  1107. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1108. vpxor $Ij,$T1,$T1
  1109. vpshufb $bswap,$Ii,$Ii
  1110. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1111. vpunpckhqdq $Ii,$Ii,$T2
  1112. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1113. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1114. vpxor $Ii,$T2,$T2
  1115. vmovdqu 0x40($inp),$Ij # I[4]
  1116. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1117. vmovdqu 0x50-0x40($Htbl),$HK
  1118. vpshufb $bswap,$Ij,$Ij
  1119. vpxor $Xlo,$Zlo,$Zlo
  1120. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1121. vpxor $Xhi,$Zhi,$Zhi
  1122. vpunpckhqdq $Ij,$Ij,$T1
  1123. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1124. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1125. vpxor $Xmi,$Zmi,$Zmi
  1126. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1127. vpxor $Ij,$T1,$T1
  1128. vmovdqu 0x30($inp),$Ii # I[3]
  1129. vpxor $Zlo,$Xlo,$Xlo
  1130. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1131. vpxor $Zhi,$Xhi,$Xhi
  1132. vpshufb $bswap,$Ii,$Ii
  1133. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1134. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1135. vpxor $Zmi,$Xmi,$Xmi
  1136. vpunpckhqdq $Ii,$Ii,$T2
  1137. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1138. vmovdqu 0x80-0x40($Htbl),$HK
  1139. vpxor $Ii,$T2,$T2
  1140. vmovdqu 0x20($inp),$Ij # I[2]
  1141. vpxor $Xlo,$Zlo,$Zlo
  1142. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1143. vpxor $Xhi,$Zhi,$Zhi
  1144. vpshufb $bswap,$Ij,$Ij
  1145. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1146. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1147. vpxor $Xmi,$Zmi,$Zmi
  1148. vpunpckhqdq $Ij,$Ij,$T1
  1149. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1150. vpxor $Ij,$T1,$T1
  1151. vmovdqu 0x10($inp),$Ii # I[1]
  1152. vpxor $Zlo,$Xlo,$Xlo
  1153. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1154. vpxor $Zhi,$Xhi,$Xhi
  1155. vpshufb $bswap,$Ii,$Ii
  1156. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1157. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1158. vpxor $Zmi,$Xmi,$Xmi
  1159. vpunpckhqdq $Ii,$Ii,$T2
  1160. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1161. vmovdqu 0xb0-0x40($Htbl),$HK
  1162. vpxor $Ii,$T2,$T2
  1163. vmovdqu ($inp),$Ij # I[0]
  1164. vpxor $Xlo,$Zlo,$Zlo
  1165. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1166. vpxor $Xhi,$Zhi,$Zhi
  1167. vpshufb $bswap,$Ij,$Ij
  1168. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1169. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1170. vpxor $Xmi,$Zmi,$Zmi
  1171. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1172. lea 0x80($inp),$inp
  1173. cmp \$0x80,$len
  1174. jb .Ltail_avx
  1175. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1176. sub \$0x80,$len
  1177. jmp .Loop8x_avx
  1178. .align 32
  1179. .Loop8x_avx:
  1180. vpunpckhqdq $Ij,$Ij,$T1
  1181. vmovdqu 0x70($inp),$Ii # I[7]
  1182. vpxor $Xlo,$Zlo,$Zlo
  1183. vpxor $Ij,$T1,$T1
  1184. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1185. vpshufb $bswap,$Ii,$Ii
  1186. vpxor $Xhi,$Zhi,$Zhi
  1187. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1188. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1189. vpunpckhqdq $Ii,$Ii,$T2
  1190. vpxor $Xmi,$Zmi,$Zmi
  1191. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1192. vmovdqu 0x20-0x40($Htbl),$HK
  1193. vpxor $Ii,$T2,$T2
  1194. vmovdqu 0x60($inp),$Ij # I[6]
  1195. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1196. vpxor $Zlo,$Xi,$Xi # collect result
  1197. vpshufb $bswap,$Ij,$Ij
  1198. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1199. vxorps $Zhi,$Xo,$Xo
  1200. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1201. vpunpckhqdq $Ij,$Ij,$T1
  1202. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1203. vpxor $Zmi,$Tred,$Tred
  1204. vxorps $Ij,$T1,$T1
  1205. vmovdqu 0x50($inp),$Ii # I[5]
  1206. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1207. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1208. vpxor $Xo,$Tred,$Tred
  1209. vpslldq \$8,$Tred,$T2
  1210. vpxor $Xlo,$Zlo,$Zlo
  1211. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1212. vpsrldq \$8,$Tred,$Tred
  1213. vpxor $T2, $Xi, $Xi
  1214. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1215. vpshufb $bswap,$Ii,$Ii
  1216. vxorps $Tred,$Xo, $Xo
  1217. vpxor $Xhi,$Zhi,$Zhi
  1218. vpunpckhqdq $Ii,$Ii,$T2
  1219. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1220. vmovdqu 0x50-0x40($Htbl),$HK
  1221. vpxor $Ii,$T2,$T2
  1222. vpxor $Xmi,$Zmi,$Zmi
  1223. vmovdqu 0x40($inp),$Ij # I[4]
  1224. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1225. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1226. vpshufb $bswap,$Ij,$Ij
  1227. vpxor $Zlo,$Xlo,$Xlo
  1228. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1229. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1230. vpunpckhqdq $Ij,$Ij,$T1
  1231. vpxor $Zhi,$Xhi,$Xhi
  1232. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1233. vxorps $Ij,$T1,$T1
  1234. vpxor $Zmi,$Xmi,$Xmi
  1235. vmovdqu 0x30($inp),$Ii # I[3]
  1236. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1237. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1238. vpshufb $bswap,$Ii,$Ii
  1239. vpxor $Xlo,$Zlo,$Zlo
  1240. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1241. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1242. vpunpckhqdq $Ii,$Ii,$T2
  1243. vpxor $Xhi,$Zhi,$Zhi
  1244. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1245. vmovdqu 0x80-0x40($Htbl),$HK
  1246. vpxor $Ii,$T2,$T2
  1247. vpxor $Xmi,$Zmi,$Zmi
  1248. vmovdqu 0x20($inp),$Ij # I[2]
  1249. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1250. vpshufb $bswap,$Ij,$Ij
  1251. vpxor $Zlo,$Xlo,$Xlo
  1252. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1253. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1254. vpunpckhqdq $Ij,$Ij,$T1
  1255. vpxor $Zhi,$Xhi,$Xhi
  1256. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1257. vpxor $Ij,$T1,$T1
  1258. vpxor $Zmi,$Xmi,$Xmi
  1259. vxorps $Tred,$Xi,$Xi
  1260. vmovdqu 0x10($inp),$Ii # I[1]
  1261. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1262. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1263. vpshufb $bswap,$Ii,$Ii
  1264. vpxor $Xlo,$Zlo,$Zlo
  1265. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1266. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1267. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1268. vxorps $Xo,$Tred,$Tred
  1269. vpunpckhqdq $Ii,$Ii,$T2
  1270. vpxor $Xhi,$Zhi,$Zhi
  1271. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1272. vmovdqu 0xb0-0x40($Htbl),$HK
  1273. vpxor $Ii,$T2,$T2
  1274. vpxor $Xmi,$Zmi,$Zmi
  1275. vmovdqu ($inp),$Ij # I[0]
  1276. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1277. vpshufb $bswap,$Ij,$Ij
  1278. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1279. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1280. vpxor $Tred,$Ij,$Ij
  1281. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1282. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1283. lea 0x80($inp),$inp
  1284. sub \$0x80,$len
  1285. jnc .Loop8x_avx
  1286. add \$0x80,$len
  1287. jmp .Ltail_no_xor_avx
  1288. .align 32
  1289. .Lshort_avx:
  1290. vmovdqu -0x10($inp,$len),$Ii # very last word
  1291. lea ($inp,$len),$inp
  1292. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1293. vmovdqu 0x20-0x40($Htbl),$HK
  1294. vpshufb $bswap,$Ii,$Ij
  1295. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1296. vmovdqa $Xhi,$Zhi # $Zhi and
  1297. vmovdqa $Xmi,$Zmi # $Zmi
  1298. sub \$0x10,$len
  1299. jz .Ltail_avx
  1300. vpunpckhqdq $Ij,$Ij,$T1
  1301. vpxor $Xlo,$Zlo,$Zlo
  1302. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1303. vpxor $Ij,$T1,$T1
  1304. vmovdqu -0x20($inp),$Ii
  1305. vpxor $Xhi,$Zhi,$Zhi
  1306. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1307. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1308. vpshufb $bswap,$Ii,$Ij
  1309. vpxor $Xmi,$Zmi,$Zmi
  1310. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1311. vpsrldq \$8,$HK,$HK
  1312. sub \$0x10,$len
  1313. jz .Ltail_avx
  1314. vpunpckhqdq $Ij,$Ij,$T1
  1315. vpxor $Xlo,$Zlo,$Zlo
  1316. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1317. vpxor $Ij,$T1,$T1
  1318. vmovdqu -0x30($inp),$Ii
  1319. vpxor $Xhi,$Zhi,$Zhi
  1320. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1321. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1322. vpshufb $bswap,$Ii,$Ij
  1323. vpxor $Xmi,$Zmi,$Zmi
  1324. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1325. vmovdqu 0x50-0x40($Htbl),$HK
  1326. sub \$0x10,$len
  1327. jz .Ltail_avx
  1328. vpunpckhqdq $Ij,$Ij,$T1
  1329. vpxor $Xlo,$Zlo,$Zlo
  1330. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1331. vpxor $Ij,$T1,$T1
  1332. vmovdqu -0x40($inp),$Ii
  1333. vpxor $Xhi,$Zhi,$Zhi
  1334. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1335. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1336. vpshufb $bswap,$Ii,$Ij
  1337. vpxor $Xmi,$Zmi,$Zmi
  1338. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1339. vpsrldq \$8,$HK,$HK
  1340. sub \$0x10,$len
  1341. jz .Ltail_avx
  1342. vpunpckhqdq $Ij,$Ij,$T1
  1343. vpxor $Xlo,$Zlo,$Zlo
  1344. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1345. vpxor $Ij,$T1,$T1
  1346. vmovdqu -0x50($inp),$Ii
  1347. vpxor $Xhi,$Zhi,$Zhi
  1348. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1349. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1350. vpshufb $bswap,$Ii,$Ij
  1351. vpxor $Xmi,$Zmi,$Zmi
  1352. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1353. vmovdqu 0x80-0x40($Htbl),$HK
  1354. sub \$0x10,$len
  1355. jz .Ltail_avx
  1356. vpunpckhqdq $Ij,$Ij,$T1
  1357. vpxor $Xlo,$Zlo,$Zlo
  1358. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1359. vpxor $Ij,$T1,$T1
  1360. vmovdqu -0x60($inp),$Ii
  1361. vpxor $Xhi,$Zhi,$Zhi
  1362. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1363. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1364. vpshufb $bswap,$Ii,$Ij
  1365. vpxor $Xmi,$Zmi,$Zmi
  1366. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1367. vpsrldq \$8,$HK,$HK
  1368. sub \$0x10,$len
  1369. jz .Ltail_avx
  1370. vpunpckhqdq $Ij,$Ij,$T1
  1371. vpxor $Xlo,$Zlo,$Zlo
  1372. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1373. vpxor $Ij,$T1,$T1
  1374. vmovdqu -0x70($inp),$Ii
  1375. vpxor $Xhi,$Zhi,$Zhi
  1376. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1377. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1378. vpshufb $bswap,$Ii,$Ij
  1379. vpxor $Xmi,$Zmi,$Zmi
  1380. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1381. vmovq 0xb8-0x40($Htbl),$HK
  1382. sub \$0x10,$len
  1383. jmp .Ltail_avx
  1384. .align 32
  1385. .Ltail_avx:
  1386. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1387. .Ltail_no_xor_avx:
  1388. vpunpckhqdq $Ij,$Ij,$T1
  1389. vpxor $Xlo,$Zlo,$Zlo
  1390. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1391. vpxor $Ij,$T1,$T1
  1392. vpxor $Xhi,$Zhi,$Zhi
  1393. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1394. vpxor $Xmi,$Zmi,$Zmi
  1395. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1396. vmovdqu (%r10),$Tred
  1397. vpxor $Xlo,$Zlo,$Xi
  1398. vpxor $Xhi,$Zhi,$Xo
  1399. vpxor $Xmi,$Zmi,$Zmi
  1400. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1401. vpxor $Xo, $Zmi,$Zmi
  1402. vpslldq \$8, $Zmi,$T2
  1403. vpsrldq \$8, $Zmi,$Zmi
  1404. vpxor $T2, $Xi, $Xi
  1405. vpxor $Zmi,$Xo, $Xo
  1406. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1407. vpalignr \$8,$Xi,$Xi,$Xi
  1408. vpxor $T2,$Xi,$Xi
  1409. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1410. vpalignr \$8,$Xi,$Xi,$Xi
  1411. vpxor $Xo,$Xi,$Xi
  1412. vpxor $T2,$Xi,$Xi
  1413. cmp \$0,$len
  1414. jne .Lshort_avx
  1415. vpshufb $bswap,$Xi,$Xi
  1416. vmovdqu $Xi,($Xip)
  1417. vzeroupper
  1418. ___
  1419. $code.=<<___ if ($win64);
  1420. movaps (%rsp),%xmm6
  1421. movaps 0x10(%rsp),%xmm7
  1422. movaps 0x20(%rsp),%xmm8
  1423. movaps 0x30(%rsp),%xmm9
  1424. movaps 0x40(%rsp),%xmm10
  1425. movaps 0x50(%rsp),%xmm11
  1426. movaps 0x60(%rsp),%xmm12
  1427. movaps 0x70(%rsp),%xmm13
  1428. movaps 0x80(%rsp),%xmm14
  1429. movaps 0x90(%rsp),%xmm15
  1430. lea 0xa8(%rsp),%rsp
  1431. .LSEH_end_gcm_ghash_avx:
  1432. ___
  1433. $code.=<<___;
  1434. ret
  1435. .cfi_endproc
  1436. .size gcm_ghash_avx,.-gcm_ghash_avx
  1437. ___
  1438. } else {
  1439. $code.=<<___;
  1440. jmp .L_ghash_clmul
  1441. .cfi_endproc
  1442. .size gcm_ghash_avx,.-gcm_ghash_avx
  1443. ___
  1444. }
  1445. $code.=<<___;
  1446. .align 64
  1447. .Lbswap_mask:
  1448. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1449. .L0x1c2_polynomial:
  1450. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1451. .L7_mask:
  1452. .long 7,0,7,0
  1453. .L7_mask_poly:
  1454. .long 7,0,`0xE1<<1`,0
  1455. .align 64
  1456. .type .Lrem_4bit,\@object
  1457. .Lrem_4bit:
  1458. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1459. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1460. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1461. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1462. .type .Lrem_8bit,\@object
  1463. .Lrem_8bit:
  1464. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1465. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1466. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1467. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1468. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1469. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1470. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1471. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1472. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1473. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1474. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1475. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1476. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1477. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1478. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1479. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1480. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1481. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1482. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1483. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1484. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1485. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1486. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1487. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1488. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1489. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1490. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1491. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1492. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1493. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1494. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1495. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1496. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1497. .align 64
  1498. ___
  1499. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1500. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1501. if ($win64) {
  1502. $rec="%rcx";
  1503. $frame="%rdx";
  1504. $context="%r8";
  1505. $disp="%r9";
  1506. $code.=<<___;
  1507. .extern __imp_RtlVirtualUnwind
  1508. .type se_handler,\@abi-omnipotent
  1509. .align 16
  1510. se_handler:
  1511. push %rsi
  1512. push %rdi
  1513. push %rbx
  1514. push %rbp
  1515. push %r12
  1516. push %r13
  1517. push %r14
  1518. push %r15
  1519. pushfq
  1520. sub \$64,%rsp
  1521. mov 120($context),%rax # pull context->Rax
  1522. mov 248($context),%rbx # pull context->Rip
  1523. mov 8($disp),%rsi # disp->ImageBase
  1524. mov 56($disp),%r11 # disp->HandlerData
  1525. mov 0(%r11),%r10d # HandlerData[0]
  1526. lea (%rsi,%r10),%r10 # prologue label
  1527. cmp %r10,%rbx # context->Rip<prologue label
  1528. jb .Lin_prologue
  1529. mov 152($context),%rax # pull context->Rsp
  1530. mov 4(%r11),%r10d # HandlerData[1]
  1531. lea (%rsi,%r10),%r10 # epilogue label
  1532. cmp %r10,%rbx # context->Rip>=epilogue label
  1533. jae .Lin_prologue
  1534. lea 48+280(%rax),%rax # adjust "rsp"
  1535. mov -8(%rax),%rbx
  1536. mov -16(%rax),%rbp
  1537. mov -24(%rax),%r12
  1538. mov -32(%rax),%r13
  1539. mov -40(%rax),%r14
  1540. mov -48(%rax),%r15
  1541. mov %rbx,144($context) # restore context->Rbx
  1542. mov %rbp,160($context) # restore context->Rbp
  1543. mov %r12,216($context) # restore context->R12
  1544. mov %r13,224($context) # restore context->R13
  1545. mov %r14,232($context) # restore context->R14
  1546. mov %r15,240($context) # restore context->R15
  1547. .Lin_prologue:
  1548. mov 8(%rax),%rdi
  1549. mov 16(%rax),%rsi
  1550. mov %rax,152($context) # restore context->Rsp
  1551. mov %rsi,168($context) # restore context->Rsi
  1552. mov %rdi,176($context) # restore context->Rdi
  1553. mov 40($disp),%rdi # disp->ContextRecord
  1554. mov $context,%rsi # context
  1555. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1556. .long 0xa548f3fc # cld; rep movsq
  1557. mov $disp,%rsi
  1558. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1559. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1560. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1561. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1562. mov 40(%rsi),%r10 # disp->ContextRecord
  1563. lea 56(%rsi),%r11 # &disp->HandlerData
  1564. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1565. mov %r10,32(%rsp) # arg5
  1566. mov %r11,40(%rsp) # arg6
  1567. mov %r12,48(%rsp) # arg7
  1568. mov %rcx,56(%rsp) # arg8, (NULL)
  1569. call *__imp_RtlVirtualUnwind(%rip)
  1570. mov \$1,%eax # ExceptionContinueSearch
  1571. add \$64,%rsp
  1572. popfq
  1573. pop %r15
  1574. pop %r14
  1575. pop %r13
  1576. pop %r12
  1577. pop %rbp
  1578. pop %rbx
  1579. pop %rdi
  1580. pop %rsi
  1581. ret
  1582. .size se_handler,.-se_handler
  1583. .section .pdata
  1584. .align 4
  1585. .rva .LSEH_begin_gcm_gmult_4bit
  1586. .rva .LSEH_end_gcm_gmult_4bit
  1587. .rva .LSEH_info_gcm_gmult_4bit
  1588. .rva .LSEH_begin_gcm_ghash_4bit
  1589. .rva .LSEH_end_gcm_ghash_4bit
  1590. .rva .LSEH_info_gcm_ghash_4bit
  1591. .rva .LSEH_begin_gcm_init_clmul
  1592. .rva .LSEH_end_gcm_init_clmul
  1593. .rva .LSEH_info_gcm_init_clmul
  1594. .rva .LSEH_begin_gcm_ghash_clmul
  1595. .rva .LSEH_end_gcm_ghash_clmul
  1596. .rva .LSEH_info_gcm_ghash_clmul
  1597. ___
  1598. $code.=<<___ if ($avx);
  1599. .rva .LSEH_begin_gcm_init_avx
  1600. .rva .LSEH_end_gcm_init_avx
  1601. .rva .LSEH_info_gcm_init_clmul
  1602. .rva .LSEH_begin_gcm_ghash_avx
  1603. .rva .LSEH_end_gcm_ghash_avx
  1604. .rva .LSEH_info_gcm_ghash_clmul
  1605. ___
  1606. $code.=<<___;
  1607. .section .xdata
  1608. .align 8
  1609. .LSEH_info_gcm_gmult_4bit:
  1610. .byte 9,0,0,0
  1611. .rva se_handler
  1612. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1613. .LSEH_info_gcm_ghash_4bit:
  1614. .byte 9,0,0,0
  1615. .rva se_handler
  1616. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1617. .LSEH_info_gcm_init_clmul:
  1618. .byte 0x01,0x08,0x03,0x00
  1619. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1620. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1621. .LSEH_info_gcm_ghash_clmul:
  1622. .byte 0x01,0x33,0x16,0x00
  1623. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1624. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1625. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1626. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1627. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1628. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1629. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1630. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1631. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1632. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1633. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1634. ___
  1635. }
  1636. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1637. print $code;
  1638. close STDOUT or die "error closing STDOUT";