aesni-sha256-x86_64.pl

  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # January 2013
  11. #
  12. # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  13. # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
  14. # that since AESNI-CBC encryption exhibits *very* low instruction-level
  15. # parallelism, interleaving it with another algorithm allows processor
  16. # resources to be utilized better, for better overall performance.
  17. # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  18. # the AESNI code is woven into them. As SHA256 dominates execution time,
  19. # stitch performance does not depend on AES key length. Below are
  20. # performance numbers in cycles per processed byte, less is better,
  21. # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  22. # subroutine:
  23. #
  24. #                AES-128/-192/-256+SHA256   this(**)  gain
  25. # Sandy Bridge   5.05/6.05/7.05+11.6        13.0      +28%/36%/43%
  26. # Ivy Bridge     5.05/6.05/7.05+10.3        11.6      +32%/41%/50%
  27. # Haswell        4.43/5.29/6.19+7.80        8.79      +39%/49%/59%
  28. # Bulldozer      5.77/6.89/8.00+13.7        13.7      +42%/50%/58%
  29. #
  30. # (*)  there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
  31. #      is omitted from the loop; this is because the gain was not
  32. #      estimated high enough to justify the effort;
  33. # (**) these are EVP-free results; results obtained with 'speed
  34. #      -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
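
# Editor's note, for illustration only (not used by this module): a minimal
# plain-Perl sketch of the "stitching" idea described above, where one AES-CBC
# step is spliced into each SHA256 round so the two instruction streams fill
# each other's pipeline bubbles.  The real interleave is driven by
# @aesni_cbc_block and $aesni_cbc_idx further down; names here are made up.
sub stitch_sketch_example {
	my ($sha_rounds,$aes_steps)=@_;		# array refs of code fragments
	my @woven;
	for (my $i=0;$i<@$sha_rounds;$i++) {
		push @woven,$sha_rounds->[$i];			  # SHA256 work
		push @woven,$aes_steps->[$i] if ($i<@$aes_steps); # one AES step per round
	}
	return @woven;
}
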
  35. $flavour = shift;
  36. $output = shift;
  37. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  38. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  39. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  40. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  41. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  42. die "can't locate x86_64-xlate.pl";
  43. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  44. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  45. $avx = ($1>=2.19) + ($1>=2.22);
  46. }
  47. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  48. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  49. $avx = ($1>=2.09) + ($1>=2.10);
  50. }
  51. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  52. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  53. $avx = ($1>=10) + ($1>=12);
  54. }
  55. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  56. $avx = ($2>=3.0) + ($2>3.0);
  57. }
  58. $shaext=$avx; ### set to zero if compiling for 1.0.1
  59. $avx=1 if (!$shaext && $avx);
  60. open OUT,"| \"$^X\" $xlate $flavour $output";
  61. *STDOUT=*OUT;
  62. $func="aesni_cbc_sha256_enc";
  63. $TABLE="K256";
  64. $SZ=4;
  65. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  66. "%r8d","%r9d","%r10d","%r11d");
  67. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  68. @Sigma0=( 2,13,22);
  69. @Sigma1=( 6,11,25);
  70. @sigma0=( 7,18, 3);
  71. @sigma1=(17,19,10);
  72. $rounds=64;
  73. ########################################################################
  74. # void aesni_cbc_sha256_enc(const void *inp,
  75. # void *out,
  76. # size_t length,
  77. # const AES_KEY *key,
  78. # unsigned char *iv,
  79. # SHA256_CTX *ctx,
  80. # const void *in0);
  81. ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
  82. ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  83. $Tbl="%rbp";
  84. $_inp="16*$SZ+0*8(%rsp)";
  85. $_out="16*$SZ+1*8(%rsp)";
  86. $_end="16*$SZ+2*8(%rsp)";
  87. $_key="16*$SZ+3*8(%rsp)";
  88. $_ivp="16*$SZ+4*8(%rsp)";
  89. $_ctx="16*$SZ+5*8(%rsp)";
  90. $_in0="16*$SZ+6*8(%rsp)";
  91. $_rsp="16*$SZ+7*8(%rsp)";
  92. $framesz=16*$SZ+8*8;
  93. $code=<<___;
  94. .text
  95. .extern OPENSSL_ia32cap_P
  96. .globl $func
  97. .type $func,\@abi-omnipotent
  98. .align 16
  99. $func:
  100. ___
  101. if ($avx) {
  102. $code.=<<___;
  103. lea OPENSSL_ia32cap_P(%rip),%r11
  104. mov \$1,%eax
  105. cmp \$0,`$win64?"%rcx":"%rdi"`
  106. je .Lprobe
  107. mov 0(%r11),%eax
  108. mov 4(%r11),%r10
  109. ___
  110. $code.=<<___ if ($shaext);
  111. bt \$61,%r10 # check for SHA
  112. jc ${func}_shaext
  113. ___
  114. $code.=<<___;
  115. mov %r10,%r11
  116. shr \$32,%r11
  117. test \$`1<<11`,%r10d # check for XOP
  118. jnz ${func}_xop
  119. ___
  120. $code.=<<___ if ($avx>1);
  121. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  122. cmp \$`1<<8|1<<5|1<<3`,%r11d
  123. je ${func}_avx2
  124. ___
  125. $code.=<<___;
  126. and \$`1<<28`,%r10d # check for AVX
  127. jnz ${func}_avx
  128. ud2
  129. ___
  130. }
  131. $code.=<<___;
  132. xor %eax,%eax
  133. cmp \$0,`$win64?"%rcx":"%rdi"`
  134. je .Lprobe
  135. ud2
  136. .Lprobe:
  137. ret
  138. .size $func,.-$func
  139. .align 64
  140. .type $TABLE,\@object
  141. $TABLE:
  142. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  143. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  144. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  145. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  146. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  147. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  148. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  149. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  150. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  151. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  152. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  153. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  154. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  155. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  156. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  157. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  158. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  159. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  160. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  161. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  162. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  163. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  164. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  165. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  166. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  167. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  168. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  169. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  170. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  171. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  172. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  173. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  174. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  175. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  176. .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
  177. .long 0,0,0,0, 0,0,0,0
  178. .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  179. .align 64
  180. ___
  181. ######################################################################
  182. # SIMD code paths
  183. #
  184. {{{
  185. ($iv,$inout,$roundkey,$temp,
  186. $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
  187. $aesni_cbc_idx=0;
  188. @aesni_cbc_block = (
  189. ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
  190. ## &vmovdqu ($inout,($inp));
  191. ## &mov ($_inp,$inp);
  192. '&vpxor ($inout,$inout,$roundkey);'.
  193. ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
  194. '&vpxor ($inout,$inout,$iv);',
  195. '&vaesenc ($inout,$inout,$roundkey);'.
  196. ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
  197. '&vaesenc ($inout,$inout,$roundkey);'.
  198. ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
  199. '&vaesenc ($inout,$inout,$roundkey);'.
  200. ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
  201. '&vaesenc ($inout,$inout,$roundkey);'.
  202. ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
  203. '&vaesenc ($inout,$inout,$roundkey);'.
  204. ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
  205. '&vaesenc ($inout,$inout,$roundkey);'.
  206. ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
  207. '&vaesenc ($inout,$inout,$roundkey);'.
  208. ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
  209. '&vaesenc ($inout,$inout,$roundkey);'.
  210. ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
  211. '&vaesenc ($inout,$inout,$roundkey);'.
  212. ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
  213. '&vaesenclast ($temp,$inout,$roundkey);'.
  214. ' &vaesenc ($inout,$inout,$roundkey);'.
  215. ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
  216. '&vpand ($iv,$temp,$mask10);'.
  217. ' &vaesenc ($inout,$inout,$roundkey);'.
  218. ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
  219. '&vaesenclast ($temp,$inout,$roundkey);'.
  220. ' &vaesenc ($inout,$inout,$roundkey);'.
  221. ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
  222. '&vpand ($temp,$temp,$mask12);'.
  223. ' &vaesenc ($inout,$inout,$roundkey);'.
  224. '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
  225. '&vpor ($iv,$iv,$temp);'.
  226. ' &vaesenclast ($temp,$inout,$roundkey);'.
  227. ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
  228. ## &mov ($inp,$_inp);
  229. ## &mov ($out,$_out);
  230. ## &vpand ($temp,$temp,$mask14);
  231. ## &vpor ($iv,$iv,$temp);
  232. ## &vmovdqu ($iv,($out,$inp);
  233. ## &lea (inp,16($inp));
  234. );
  235. my $a4=$T1;
  236. my ($a,$b,$c,$d,$e,$f,$g,$h);
  237. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  238. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  239. my $arg = pop;
  240. $arg = "\$$arg" if ($arg*1 eq $arg);
  241. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  242. }
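
# Example of what the thunk above produces (editor's note): a call such as
#	&ror ($a0,2)		# with $a0 eq "%r13d"
# appends the line
#	ror	$2,%r13d
# to $code, i.e. the last Perl argument becomes the first (immediate) operand
# in AT&T syntax, and bare numbers are given a '$' prefix.
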
  243. sub body_00_15 () {
  244. (
  245. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  246. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  247. '&mov ($a,$a1)',
  248. '&mov ($a4,$f)',
  249. '&xor ($a0,$e)',
  250. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  251. '&xor ($a4,$g)', # f^g
  252. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  253. '&xor ($a1,$a)',
  254. '&and ($a4,$e)', # (f^g)&e
  255. @aesni_cbc_block[$aesni_cbc_idx++].
  256. '&xor ($a0,$e)',
  257. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  258. '&mov ($a2,$a)',
  259. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  260. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  261. '&xor ($a2,$b)', # a^b, b^c in next round
  262. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  263. '&add ($h,$a4)', # h+=Ch(e,f,g)
  264. '&and ($a3,$a2)', # (b^c)&(a^b)
  265. '&xor ($a1,$a)',
  266. '&add ($h,$a0)', # h+=Sigma1(e)
  267. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  268. '&add ($d,$h)', # d+=h
  269. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  270. '&add ($h,$a3)', # h+=Maj(a,b,c)
  271. '&mov ($a0,$d)',
  272. '&add ($a1,$h);'. # h+=Sigma0(a)
  273. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  274. );
  275. }
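
# For reference (editor's addition, never called by this module): a plain-Perl
# model of the single SHA256 round that body_00_15 above spells out in scalar
# instructions.  Rotation amounts match @Sigma0/@Sigma1; T1/T2 follow FIPS 180-4.
sub sha256_round_reference {
	my ($W,$K,@S)=@_;		# message word, round constant, state (a..h)
	my ($a,$b,$c,$d,$e,$f,$g,$h)=@S;
	my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff };
	my $Sigma1 = $rotr->($e,6)^$rotr->($e,11)^$rotr->($e,25);
	my $Sigma0 = $rotr->($a,2)^$rotr->($a,13)^$rotr->($a,22);
	my $Ch  = ($e&$f)^((~$e)&$g);
	my $Maj = ($a&$b)^($a&$c)^($b&$c);
	my $T1 = ($h+$Sigma1+$Ch+$K+$W)&0xffffffff;
	my $T2 = ($Sigma0+$Maj)&0xffffffff;
	# returns the new (a..h): a=T1+T2, e=d+T1, everything else shifts down
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}
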
  276. if ($avx) {{
  277. ######################################################################
  278. # XOP code path
  279. #
  280. $code.=<<___;
  281. .type ${func}_xop,\@function,6
  282. .align 64
  283. ${func}_xop:
  284. .Lxop_shortcut:
  285. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  286. push %rbx
  287. push %rbp
  288. push %r12
  289. push %r13
  290. push %r14
  291. push %r15
  292. mov %rsp,%r11 # copy %rsp
  293. sub \$`$framesz+$win64*16*10`,%rsp
  294. and \$-64,%rsp # align stack frame
  295. shl \$6,$len
  296. sub $inp,$out # re-bias
  297. sub $inp,$in0
  298. add $inp,$len # end of input
  299. #mov $inp,$_inp # saved later
  300. mov $out,$_out
  301. mov $len,$_end
  302. #mov $key,$_key # remains resident in $inp register
  303. mov $ivp,$_ivp
  304. mov $ctx,$_ctx
  305. mov $in0,$_in0
  306. mov %r11,$_rsp
  307. ___
  308. $code.=<<___ if ($win64);
  309. movaps %xmm6,`$framesz+16*0`(%rsp)
  310. movaps %xmm7,`$framesz+16*1`(%rsp)
  311. movaps %xmm8,`$framesz+16*2`(%rsp)
  312. movaps %xmm9,`$framesz+16*3`(%rsp)
  313. movaps %xmm10,`$framesz+16*4`(%rsp)
  314. movaps %xmm11,`$framesz+16*5`(%rsp)
  315. movaps %xmm12,`$framesz+16*6`(%rsp)
  316. movaps %xmm13,`$framesz+16*7`(%rsp)
  317. movaps %xmm14,`$framesz+16*8`(%rsp)
  318. movaps %xmm15,`$framesz+16*9`(%rsp)
  319. ___
  320. $code.=<<___;
  321. .Lprologue_xop:
  322. vzeroall
  323. mov $inp,%r12 # borrow $a4
  324. lea 0x80($key),$inp # size optimization, reassign
  325. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
  326. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  327. mov $ctx,%r15 # borrow $a2
  328. mov $in0,%rsi # borrow $a3
  329. vmovdqu ($ivp),$iv # load IV
  330. sub \$9,%r14
  331. mov $SZ*0(%r15),$A
  332. mov $SZ*1(%r15),$B
  333. mov $SZ*2(%r15),$C
  334. mov $SZ*3(%r15),$D
  335. mov $SZ*4(%r15),$E
  336. mov $SZ*5(%r15),$F
  337. mov $SZ*6(%r15),$G
  338. mov $SZ*7(%r15),$H
  339. vmovdqa 0x00(%r13,%r14,8),$mask14
  340. vmovdqa 0x10(%r13,%r14,8),$mask12
  341. vmovdqa 0x20(%r13,%r14,8),$mask10
  342. vmovdqu 0x00-0x80($inp),$roundkey
  343. jmp .Lloop_xop
  344. ___
  345. if ($SZ==4) { # SHA256
  346. my @X = map("%xmm$_",(0..3));
  347. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  348. $code.=<<___;
  349. .align 16
  350. .Lloop_xop:
  351. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  352. vmovdqu 0x00(%rsi,%r12),@X[0]
  353. vmovdqu 0x10(%rsi,%r12),@X[1]
  354. vmovdqu 0x20(%rsi,%r12),@X[2]
  355. vmovdqu 0x30(%rsi,%r12),@X[3]
  356. vpshufb $t3,@X[0],@X[0]
  357. lea $TABLE(%rip),$Tbl
  358. vpshufb $t3,@X[1],@X[1]
  359. vpshufb $t3,@X[2],@X[2]
  360. vpaddd 0x00($Tbl),@X[0],$t0
  361. vpshufb $t3,@X[3],@X[3]
  362. vpaddd 0x20($Tbl),@X[1],$t1
  363. vpaddd 0x40($Tbl),@X[2],$t2
  364. vpaddd 0x60($Tbl),@X[3],$t3
  365. vmovdqa $t0,0x00(%rsp)
  366. mov $A,$a1
  367. vmovdqa $t1,0x10(%rsp)
  368. mov $B,$a3
  369. vmovdqa $t2,0x20(%rsp)
  370. xor $C,$a3 # magic
  371. vmovdqa $t3,0x30(%rsp)
  372. mov $E,$a0
  373. jmp .Lxop_00_47
  374. .align 16
  375. .Lxop_00_47:
  376. sub \$-16*2*$SZ,$Tbl # size optimization
  377. vmovdqu (%r12),$inout # $a4
  378. mov %r12,$_inp # $a4
  379. ___
  380. sub XOP_256_00_47 () {
  381. my $j = shift;
  382. my $body = shift;
  383. my @X = @_;
  384. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  385. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  386. eval(shift(@insns));
  387. eval(shift(@insns));
  388. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  389. eval(shift(@insns));
  390. eval(shift(@insns));
  391. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  392. eval(shift(@insns));
  393. eval(shift(@insns));
  394. &vpsrld ($t0,$t0,$sigma0[2]);
  395. eval(shift(@insns));
  396. eval(shift(@insns));
  397. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  398. eval(shift(@insns));
  399. eval(shift(@insns));
  400. eval(shift(@insns));
  401. eval(shift(@insns));
  402. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  403. eval(shift(@insns));
  404. eval(shift(@insns));
  405. &vpxor ($t0,$t0,$t1);
  406. eval(shift(@insns));
  407. eval(shift(@insns));
  408. eval(shift(@insns));
  409. eval(shift(@insns));
  410. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  411. eval(shift(@insns));
  412. eval(shift(@insns));
  413. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  414. eval(shift(@insns));
  415. eval(shift(@insns));
  416. &vpsrld ($t2,@X[3],$sigma1[2]);
  417. eval(shift(@insns));
  418. eval(shift(@insns));
  419. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  420. eval(shift(@insns));
  421. eval(shift(@insns));
  422. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  423. eval(shift(@insns));
  424. eval(shift(@insns));
  425. &vpxor ($t3,$t3,$t2);
  426. eval(shift(@insns));
  427. eval(shift(@insns));
  428. eval(shift(@insns));
  429. eval(shift(@insns));
  430. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  431. eval(shift(@insns));
  432. eval(shift(@insns));
  433. eval(shift(@insns));
  434. eval(shift(@insns));
  435. &vpsrldq ($t3,$t3,8);
  436. eval(shift(@insns));
  437. eval(shift(@insns));
  438. eval(shift(@insns));
  439. eval(shift(@insns));
  440. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  441. eval(shift(@insns));
  442. eval(shift(@insns));
  443. eval(shift(@insns));
  444. eval(shift(@insns));
  445. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  446. eval(shift(@insns));
  447. eval(shift(@insns));
  448. &vpsrld ($t2,@X[0],$sigma1[2]);
  449. eval(shift(@insns));
  450. eval(shift(@insns));
  451. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  452. eval(shift(@insns));
  453. eval(shift(@insns));
  454. &vpxor ($t3,$t3,$t2);
  455. eval(shift(@insns));
  456. eval(shift(@insns));
  457. eval(shift(@insns));
  458. eval(shift(@insns));
  459. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  460. eval(shift(@insns));
  461. eval(shift(@insns));
  462. eval(shift(@insns));
  463. eval(shift(@insns));
  464. &vpslldq ($t3,$t3,8); # 22 instructions
  465. eval(shift(@insns));
  466. eval(shift(@insns));
  467. eval(shift(@insns));
  468. eval(shift(@insns));
  469. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  470. eval(shift(@insns));
  471. eval(shift(@insns));
  472. eval(shift(@insns));
  473. eval(shift(@insns));
  474. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  475. foreach (@insns) { eval; } # remaining instructions
  476. &vmovdqa (16*$j."(%rsp)",$t2);
  477. }
  478. $aesni_cbc_idx=0;
  479. for ($i=0,$j=0; $j<4; $j++) {
  480. &XOP_256_00_47($j,\&body_00_15,@X);
  481. push(@X,shift(@X)); # rotate(@X)
  482. }
  483. &mov ("%r12",$_inp); # borrow $a4
  484. &vpand ($temp,$temp,$mask14);
  485. &mov ("%r15",$_out); # borrow $a2
  486. &vpor ($iv,$iv,$temp);
  487. &vmovdqu ("(%r15,%r12)",$iv); # write output
  488. &lea ("%r12","16(%r12)"); # inp++
  489. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  490. &jne (".Lxop_00_47");
  491. &vmovdqu ($inout,"(%r12)");
  492. &mov ($_inp,"%r12");
  493. $aesni_cbc_idx=0;
  494. for ($i=0; $i<16; ) {
  495. foreach(body_00_15()) { eval; }
  496. }
  497. }
  498. $code.=<<___;
  499. mov $_inp,%r12 # borrow $a4
  500. mov $_out,%r13 # borrow $a0
  501. mov $_ctx,%r15 # borrow $a2
  502. mov $_in0,%rsi # borrow $a3
  503. vpand $mask14,$temp,$temp
  504. mov $a1,$A
  505. vpor $temp,$iv,$iv
  506. vmovdqu $iv,(%r13,%r12) # write output
  507. lea 16(%r12),%r12 # inp++
  508. add $SZ*0(%r15),$A
  509. add $SZ*1(%r15),$B
  510. add $SZ*2(%r15),$C
  511. add $SZ*3(%r15),$D
  512. add $SZ*4(%r15),$E
  513. add $SZ*5(%r15),$F
  514. add $SZ*6(%r15),$G
  515. add $SZ*7(%r15),$H
  516. cmp $_end,%r12
  517. mov $A,$SZ*0(%r15)
  518. mov $B,$SZ*1(%r15)
  519. mov $C,$SZ*2(%r15)
  520. mov $D,$SZ*3(%r15)
  521. mov $E,$SZ*4(%r15)
  522. mov $F,$SZ*5(%r15)
  523. mov $G,$SZ*6(%r15)
  524. mov $H,$SZ*7(%r15)
  525. jb .Lloop_xop
  526. mov $_ivp,$ivp
  527. mov $_rsp,%rsi
  528. vmovdqu $iv,($ivp) # output IV
  529. vzeroall
  530. ___
  531. $code.=<<___ if ($win64);
  532. movaps `$framesz+16*0`(%rsp),%xmm6
  533. movaps `$framesz+16*1`(%rsp),%xmm7
  534. movaps `$framesz+16*2`(%rsp),%xmm8
  535. movaps `$framesz+16*3`(%rsp),%xmm9
  536. movaps `$framesz+16*4`(%rsp),%xmm10
  537. movaps `$framesz+16*5`(%rsp),%xmm11
  538. movaps `$framesz+16*6`(%rsp),%xmm12
  539. movaps `$framesz+16*7`(%rsp),%xmm13
  540. movaps `$framesz+16*8`(%rsp),%xmm14
  541. movaps `$framesz+16*9`(%rsp),%xmm15
  542. ___
  543. $code.=<<___;
  544. mov (%rsi),%r15
  545. mov 8(%rsi),%r14
  546. mov 16(%rsi),%r13
  547. mov 24(%rsi),%r12
  548. mov 32(%rsi),%rbp
  549. mov 40(%rsi),%rbx
  550. lea 48(%rsi),%rsp
  551. .Lepilogue_xop:
  552. ret
  553. .size ${func}_xop,.-${func}_xop
  554. ___
  555. ######################################################################
  556. # AVX+shrd code path
  557. #
  558. local *ror = sub { &shrd(@_[0],@_) };
  559. $code.=<<___;
  560. .type ${func}_avx,\@function,6
  561. .align 64
  562. ${func}_avx:
  563. .Lavx_shortcut:
  564. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  565. push %rbx
  566. push %rbp
  567. push %r12
  568. push %r13
  569. push %r14
  570. push %r15
  571. mov %rsp,%r11 # copy %rsp
  572. sub \$`$framesz+$win64*16*10`,%rsp
  573. and \$-64,%rsp # align stack frame
  574. shl \$6,$len
  575. sub $inp,$out # re-bias
  576. sub $inp,$in0
  577. add $inp,$len # end of input
  578. #mov $inp,$_inp # saved later
  579. mov $out,$_out
  580. mov $len,$_end
  581. #mov $key,$_key # remains resident in $inp register
  582. mov $ivp,$_ivp
  583. mov $ctx,$_ctx
  584. mov $in0,$_in0
  585. mov %r11,$_rsp
  586. ___
  587. $code.=<<___ if ($win64);
  588. movaps %xmm6,`$framesz+16*0`(%rsp)
  589. movaps %xmm7,`$framesz+16*1`(%rsp)
  590. movaps %xmm8,`$framesz+16*2`(%rsp)
  591. movaps %xmm9,`$framesz+16*3`(%rsp)
  592. movaps %xmm10,`$framesz+16*4`(%rsp)
  593. movaps %xmm11,`$framesz+16*5`(%rsp)
  594. movaps %xmm12,`$framesz+16*6`(%rsp)
  595. movaps %xmm13,`$framesz+16*7`(%rsp)
  596. movaps %xmm14,`$framesz+16*8`(%rsp)
  597. movaps %xmm15,`$framesz+16*9`(%rsp)
  598. ___
  599. $code.=<<___;
  600. .Lprologue_avx:
  601. vzeroall
  602. mov $inp,%r12 # borrow $a4
  603. lea 0x80($key),$inp # size optimization, reassign
  604. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
  605. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  606. mov $ctx,%r15 # borrow $a2
  607. mov $in0,%rsi # borrow $a3
  608. vmovdqu ($ivp),$iv # load IV
  609. sub \$9,%r14
  610. mov $SZ*0(%r15),$A
  611. mov $SZ*1(%r15),$B
  612. mov $SZ*2(%r15),$C
  613. mov $SZ*3(%r15),$D
  614. mov $SZ*4(%r15),$E
  615. mov $SZ*5(%r15),$F
  616. mov $SZ*6(%r15),$G
  617. mov $SZ*7(%r15),$H
  618. vmovdqa 0x00(%r13,%r14,8),$mask14
  619. vmovdqa 0x10(%r13,%r14,8),$mask12
  620. vmovdqa 0x20(%r13,%r14,8),$mask10
  621. vmovdqu 0x00-0x80($inp),$roundkey
  622. ___
  623. if ($SZ==4) { # SHA256
  624. my @X = map("%xmm$_",(0..3));
  625. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  626. $code.=<<___;
  627. jmp .Lloop_avx
  628. .align 16
  629. .Lloop_avx:
  630. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  631. vmovdqu 0x00(%rsi,%r12),@X[0]
  632. vmovdqu 0x10(%rsi,%r12),@X[1]
  633. vmovdqu 0x20(%rsi,%r12),@X[2]
  634. vmovdqu 0x30(%rsi,%r12),@X[3]
  635. vpshufb $t3,@X[0],@X[0]
  636. lea $TABLE(%rip),$Tbl
  637. vpshufb $t3,@X[1],@X[1]
  638. vpshufb $t3,@X[2],@X[2]
  639. vpaddd 0x00($Tbl),@X[0],$t0
  640. vpshufb $t3,@X[3],@X[3]
  641. vpaddd 0x20($Tbl),@X[1],$t1
  642. vpaddd 0x40($Tbl),@X[2],$t2
  643. vpaddd 0x60($Tbl),@X[3],$t3
  644. vmovdqa $t0,0x00(%rsp)
  645. mov $A,$a1
  646. vmovdqa $t1,0x10(%rsp)
  647. mov $B,$a3
  648. vmovdqa $t2,0x20(%rsp)
  649. xor $C,$a3 # magic
  650. vmovdqa $t3,0x30(%rsp)
  651. mov $E,$a0
  652. jmp .Lavx_00_47
  653. .align 16
  654. .Lavx_00_47:
  655. sub \$-16*2*$SZ,$Tbl # size optimization
  656. vmovdqu (%r12),$inout # $a4
  657. mov %r12,$_inp # $a4
  658. ___
  659. sub Xupdate_256_AVX () {
  660. (
  661. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  662. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  663. '&vpsrld ($t2,$t0,$sigma0[0]);',
  664. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  665. '&vpsrld ($t3,$t0,$sigma0[2])',
  666. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  667. '&vpxor ($t0,$t3,$t2)',
  668. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  669. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  670. '&vpxor ($t0,$t0,$t1)',
  671. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  672. '&vpxor ($t0,$t0,$t2)',
  673. '&vpsrld ($t2,$t3,$sigma1[2]);',
  674. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  675. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  676. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  677. '&vpxor ($t2,$t2,$t3);',
  678. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  679. '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
  680. '&vpshufd ($t2,$t2,0b10000100)',
  681. '&vpsrldq ($t2,$t2,8)',
  682. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  683. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  684. '&vpsrld ($t2,$t3,$sigma1[2])',
  685. '&vpsrlq ($t3,$t3,$sigma1[0])',
  686. '&vpxor ($t2,$t2,$t3);',
  687. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  688. '&vpxor ($t2,$t2,$t3)',
  689. '&vpshufd ($t2,$t2,0b11101000)',
  690. '&vpslldq ($t2,$t2,8)',
  691. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  692. );
  693. }
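
# For reference (editor's addition, never called by this module): the scalar
# message-schedule recurrence that Xupdate_256_AVX above computes four words
# at a time, with rotation/shift amounts matching @sigma0/@sigma1.
sub sha256_schedule_reference {
	my @W=@_;			# W[0..15], 32-bit message words
	my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff };
	for (my $i=16;$i<64;$i++) {
		my $s0 = $rotr->($W[$i-15],7)^$rotr->($W[$i-15],18)^($W[$i-15]>>3);
		my $s1 = $rotr->($W[$i-2],17)^$rotr->($W[$i-2],19)^($W[$i-2]>>10);
		$W[$i] = ($W[$i-16]+$s0+$W[$i-7]+$s1)&0xffffffff;
	}
	return @W;
}
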
  694. sub AVX_256_00_47 () {
  695. my $j = shift;
  696. my $body = shift;
  697. my @X = @_;
  698. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  699. foreach (Xupdate_256_AVX()) { # 29 instructions
  700. eval;
  701. eval(shift(@insns));
  702. eval(shift(@insns));
  703. eval(shift(@insns));
  704. }
  705. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  706. foreach (@insns) { eval; } # remaining instructions
  707. &vmovdqa (16*$j."(%rsp)",$t2);
  708. }
  709. $aesni_cbc_idx=0;
  710. for ($i=0,$j=0; $j<4; $j++) {
  711. &AVX_256_00_47($j,\&body_00_15,@X);
  712. push(@X,shift(@X)); # rotate(@X)
  713. }
  714. &mov ("%r12",$_inp); # borrow $a4
  715. &vpand ($temp,$temp,$mask14);
  716. &mov ("%r15",$_out); # borrow $a2
  717. &vpor ($iv,$iv,$temp);
  718. &vmovdqu ("(%r15,%r12)",$iv); # write output
  719. &lea ("%r12","16(%r12)"); # inp++
  720. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  721. &jne (".Lavx_00_47");
  722. &vmovdqu ($inout,"(%r12)");
  723. &mov ($_inp,"%r12");
  724. $aesni_cbc_idx=0;
  725. for ($i=0; $i<16; ) {
  726. foreach(body_00_15()) { eval; }
  727. }
  728. }
  729. $code.=<<___;
  730. mov $_inp,%r12 # borrow $a4
  731. mov $_out,%r13 # borrow $a0
  732. mov $_ctx,%r15 # borrow $a2
  733. mov $_in0,%rsi # borrow $a3
  734. vpand $mask14,$temp,$temp
  735. mov $a1,$A
  736. vpor $temp,$iv,$iv
  737. vmovdqu $iv,(%r13,%r12) # write output
  738. lea 16(%r12),%r12 # inp++
  739. add $SZ*0(%r15),$A
  740. add $SZ*1(%r15),$B
  741. add $SZ*2(%r15),$C
  742. add $SZ*3(%r15),$D
  743. add $SZ*4(%r15),$E
  744. add $SZ*5(%r15),$F
  745. add $SZ*6(%r15),$G
  746. add $SZ*7(%r15),$H
  747. cmp $_end,%r12
  748. mov $A,$SZ*0(%r15)
  749. mov $B,$SZ*1(%r15)
  750. mov $C,$SZ*2(%r15)
  751. mov $D,$SZ*3(%r15)
  752. mov $E,$SZ*4(%r15)
  753. mov $F,$SZ*5(%r15)
  754. mov $G,$SZ*6(%r15)
  755. mov $H,$SZ*7(%r15)
  756. jb .Lloop_avx
  757. mov $_ivp,$ivp
  758. mov $_rsp,%rsi
  759. vmovdqu $iv,($ivp) # output IV
  760. vzeroall
  761. ___
  762. $code.=<<___ if ($win64);
  763. movaps `$framesz+16*0`(%rsp),%xmm6
  764. movaps `$framesz+16*1`(%rsp),%xmm7
  765. movaps `$framesz+16*2`(%rsp),%xmm8
  766. movaps `$framesz+16*3`(%rsp),%xmm9
  767. movaps `$framesz+16*4`(%rsp),%xmm10
  768. movaps `$framesz+16*5`(%rsp),%xmm11
  769. movaps `$framesz+16*6`(%rsp),%xmm12
  770. movaps `$framesz+16*7`(%rsp),%xmm13
  771. movaps `$framesz+16*8`(%rsp),%xmm14
  772. movaps `$framesz+16*9`(%rsp),%xmm15
  773. ___
  774. $code.=<<___;
  775. mov (%rsi),%r15
  776. mov 8(%rsi),%r14
  777. mov 16(%rsi),%r13
  778. mov 24(%rsi),%r12
  779. mov 32(%rsi),%rbp
  780. mov 40(%rsi),%rbx
  781. lea 48(%rsi),%rsp
  782. .Lepilogue_avx:
  783. ret
  784. .size ${func}_avx,.-${func}_avx
  785. ___
  786. if ($avx>1) {{
  787. ######################################################################
  788. # AVX2+BMI code path
  789. #
  790. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  791. my $PUSH8=8*2*$SZ;
  792. use integer;
  793. sub bodyx_00_15 () {
  794. # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
  795. (
  796. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  797. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  798. '&and ($a4,$e)', # f&e
  799. '&rorx ($a0,$e,$Sigma1[2])',
  800. '&rorx ($a2,$e,$Sigma1[1])',
  801. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  802. '&lea ($h,"($h,$a4)")',
  803. '&andn ($a4,$e,$g)', # ~e&g
  804. '&xor ($a0,$a2)',
  805. '&rorx ($a1,$e,$Sigma1[0])',
  806. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  807. '&xor ($a0,$a1)', # Sigma1(e)
  808. '&mov ($a2,$a)',
  809. '&rorx ($a4,$a,$Sigma0[2])',
  810. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  811. '&xor ($a2,$b)', # a^b, b^c in next round
  812. '&rorx ($a1,$a,$Sigma0[1])',
  813. '&rorx ($a0,$a,$Sigma0[0])',
  814. '&lea ($d,"($d,$h)")', # d+=h
  815. '&and ($a3,$a2)', # (b^c)&(a^b)
  816. @aesni_cbc_block[$aesni_cbc_idx++].
  817. '&xor ($a1,$a4)',
  818. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  819. '&xor ($a1,$a0)', # Sigma0(a)
  820. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  821. '&mov ($a4,$e)', # copy of f in future
  822. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  823. );
  824. # and at the finish one has to do $a+=$a1
  825. }
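
# Editor's note: bodyx_00_15 accumulates Ch(e,f,g) as (e&f)+(~e&g) with two
# lea instructions instead of the xor form used in body_00_15; the results are
# identical because the two addends never have bits in common.  A quick
# plain-Perl check (never called by this module):
sub chx_equivalence_check {
	my ($e,$f,$g)=map { $_&0xffffffff } @_;
	my $xor_form = (($e&$f)^((~$e)&$g))&0xffffffff;
	my $add_form = (($e&$f)+((~$e)&$g))&0xffffffff;
	return $xor_form==$add_form;	# always true: the terms are disjoint
}
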
  826. $code.=<<___;
  827. .type ${func}_avx2,\@function,6
  828. .align 64
  829. ${func}_avx2:
  830. .Lavx2_shortcut:
  831. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  832. push %rbx
  833. push %rbp
  834. push %r12
  835. push %r13
  836. push %r14
  837. push %r15
  838. mov %rsp,%r11 # copy %rsp
  839. sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
  840. and \$-256*$SZ,%rsp # align stack frame
  841. add \$`2*$SZ*($rounds-8)`,%rsp
  842. shl \$6,$len
  843. sub $inp,$out # re-bias
  844. sub $inp,$in0
  845. add $inp,$len # end of input
  846. #mov $inp,$_inp # saved later
  847. #mov $out,$_out # kept in $offload
  848. mov $len,$_end
  849. #mov $key,$_key # remains resident in $inp register
  850. mov $ivp,$_ivp
  851. mov $ctx,$_ctx
  852. mov $in0,$_in0
  853. mov %r11,$_rsp
  854. ___
  855. $code.=<<___ if ($win64);
  856. movaps %xmm6,`$framesz+16*0`(%rsp)
  857. movaps %xmm7,`$framesz+16*1`(%rsp)
  858. movaps %xmm8,`$framesz+16*2`(%rsp)
  859. movaps %xmm9,`$framesz+16*3`(%rsp)
  860. movaps %xmm10,`$framesz+16*4`(%rsp)
  861. movaps %xmm11,`$framesz+16*5`(%rsp)
  862. movaps %xmm12,`$framesz+16*6`(%rsp)
  863. movaps %xmm13,`$framesz+16*7`(%rsp)
  864. movaps %xmm14,`$framesz+16*8`(%rsp)
  865. movaps %xmm15,`$framesz+16*9`(%rsp)
  866. ___
  867. $code.=<<___;
  868. .Lprologue_avx2:
  869. vzeroall
  870. mov $inp,%r13 # borrow $a0
  871. vpinsrq \$1,$out,$offload,$offload
  872. lea 0x80($key),$inp # size optimization, reassign
  873. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
  874. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  875. mov $ctx,%r15 # borrow $a2
  876. mov $in0,%rsi # borrow $a3
  877. vmovdqu ($ivp),$iv # load IV
  878. lea -9(%r14),%r14
  879. vmovdqa 0x00(%r12,%r14,8),$mask14
  880. vmovdqa 0x10(%r12,%r14,8),$mask12
  881. vmovdqa 0x20(%r12,%r14,8),$mask10
  882. sub \$-16*$SZ,%r13 # inp++, size optimization
  883. mov $SZ*0(%r15),$A
  884. lea (%rsi,%r13),%r12 # borrow $a0
  885. mov $SZ*1(%r15),$B
  886. cmp $len,%r13 # $_end
  887. mov $SZ*2(%r15),$C
  888. cmove %rsp,%r12 # next block or random data
  889. mov $SZ*3(%r15),$D
  890. mov $SZ*4(%r15),$E
  891. mov $SZ*5(%r15),$F
  892. mov $SZ*6(%r15),$G
  893. mov $SZ*7(%r15),$H
  894. vmovdqu 0x00-0x80($inp),$roundkey
  895. ___
  896. if ($SZ==4) { # SHA256
  897. my @X = map("%ymm$_",(0..3));
  898. my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
  899. $code.=<<___;
  900. jmp .Loop_avx2
  901. .align 16
  902. .Loop_avx2:
  903. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  904. vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
  905. vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
  906. vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
  907. vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
  908. vinserti128 \$1,(%r12),@X[0],@X[0]
  909. vinserti128 \$1,16(%r12),@X[1],@X[1]
  910. vpshufb $t3,@X[0],@X[0]
  911. vinserti128 \$1,32(%r12),@X[2],@X[2]
  912. vpshufb $t3,@X[1],@X[1]
  913. vinserti128 \$1,48(%r12),@X[3],@X[3]
  914. lea $TABLE(%rip),$Tbl
  915. vpshufb $t3,@X[2],@X[2]
  916. lea -16*$SZ(%r13),%r13
  917. vpaddd 0x00($Tbl),@X[0],$t0
  918. vpshufb $t3,@X[3],@X[3]
  919. vpaddd 0x20($Tbl),@X[1],$t1
  920. vpaddd 0x40($Tbl),@X[2],$t2
  921. vpaddd 0x60($Tbl),@X[3],$t3
  922. vmovdqa $t0,0x00(%rsp)
  923. xor $a1,$a1
  924. vmovdqa $t1,0x20(%rsp)
  925. lea -$PUSH8(%rsp),%rsp
  926. mov $B,$a3
  927. vmovdqa $t2,0x00(%rsp)
  928. xor $C,$a3 # magic
  929. vmovdqa $t3,0x20(%rsp)
  930. mov $F,$a4
  931. sub \$-16*2*$SZ,$Tbl # size optimization
  932. jmp .Lavx2_00_47
  933. .align 16
  934. .Lavx2_00_47:
  935. vmovdqu (%r13),$inout
  936. vpinsrq \$0,%r13,$offload,$offload
  937. ___
  938. sub AVX2_256_00_47 () {
  939. my $j = shift;
  940. my $body = shift;
  941. my @X = @_;
  942. my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
  943. my $base = "+2*$PUSH8(%rsp)";
  944. &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
  945. foreach (Xupdate_256_AVX()) { # 29 instructions
  946. eval;
  947. eval(shift(@insns));
  948. eval(shift(@insns));
  949. eval(shift(@insns));
  950. }
  951. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  952. foreach (@insns) { eval; } # remaining instructions
  953. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  954. }
  955. $aesni_cbc_idx=0;
  956. for ($i=0,$j=0; $j<4; $j++) {
  957. &AVX2_256_00_47($j,\&bodyx_00_15,@X);
  958. push(@X,shift(@X)); # rotate(@X)
  959. }
  960. &vmovq ("%r13",$offload); # borrow $a0
  961. &vpextrq ("%r15",$offload,1); # borrow $a2
  962. &vpand ($temp,$temp,$mask14);
  963. &vpor ($iv,$iv,$temp);
  964. &vmovdqu ("(%r15,%r13)",$iv); # write output
  965. &lea ("%r13","16(%r13)"); # inp++
  966. &lea ($Tbl,16*2*$SZ."($Tbl)");
  967. &cmpb (($SZ-1)."($Tbl)",0);
  968. &jne (".Lavx2_00_47");
  969. &vmovdqu ($inout,"(%r13)");
  970. &vpinsrq ($offload,$offload,"%r13",0);
  971. $aesni_cbc_idx=0;
  972. for ($i=0; $i<16; ) {
  973. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  974. foreach(bodyx_00_15()) { eval; }
  975. }
  976. }
  977. $code.=<<___;
  978. vpextrq \$1,$offload,%r12 # $_out, borrow $a4
  979. vmovq $offload,%r13 # $_inp, borrow $a0
  980. mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
  981. add $a1,$A
  982. lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
  983. vpand $mask14,$temp,$temp
  984. vpor $temp,$iv,$iv
  985. vmovdqu $iv,(%r12,%r13) # write output
  986. lea 16(%r13),%r13
  987. add $SZ*0(%r15),$A
  988. add $SZ*1(%r15),$B
  989. add $SZ*2(%r15),$C
  990. add $SZ*3(%r15),$D
  991. add $SZ*4(%r15),$E
  992. add $SZ*5(%r15),$F
  993. add $SZ*6(%r15),$G
  994. add $SZ*7(%r15),$H
  995. mov $A,$SZ*0(%r15)
  996. mov $B,$SZ*1(%r15)
  997. mov $C,$SZ*2(%r15)
  998. mov $D,$SZ*3(%r15)
  999. mov $E,$SZ*4(%r15)
  1000. mov $F,$SZ*5(%r15)
  1001. mov $G,$SZ*6(%r15)
  1002. mov $H,$SZ*7(%r15)
  1003. cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
  1004. je .Ldone_avx2
  1005. xor $a1,$a1
  1006. mov $B,$a3
  1007. mov $F,$a4
  1008. xor $C,$a3 # magic
  1009. jmp .Lower_avx2
  1010. .align 16
  1011. .Lower_avx2:
  1012. vmovdqu (%r13),$inout
  1013. vpinsrq \$0,%r13,$offload,$offload
  1014. ___
  1015. $aesni_cbc_idx=0;
  1016. for ($i=0; $i<16; ) {
  1017. my $base="+16($Tbl)";
  1018. foreach(bodyx_00_15()) { eval; }
  1019. &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
  1020. }
  1021. $code.=<<___;
  1022. vmovq $offload,%r13 # borrow $a0
  1023. vpextrq \$1,$offload,%r15 # borrow $a2
  1024. vpand $mask14,$temp,$temp
  1025. vpor $temp,$iv,$iv
  1026. lea -$PUSH8($Tbl),$Tbl
  1027. vmovdqu $iv,(%r15,%r13) # write output
  1028. lea 16(%r13),%r13 # inp++
  1029. cmp %rsp,$Tbl
  1030. jae .Lower_avx2
  1031. mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
  1032. lea 16*$SZ(%r13),%r13
  1033. mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
  1034. add $a1,$A
  1035. lea `2*$SZ*($rounds-8)`(%rsp),%rsp
  1036. add $SZ*0(%r15),$A
  1037. add $SZ*1(%r15),$B
  1038. add $SZ*2(%r15),$C
  1039. add $SZ*3(%r15),$D
  1040. add $SZ*4(%r15),$E
  1041. add $SZ*5(%r15),$F
  1042. add $SZ*6(%r15),$G
  1043. lea (%rsi,%r13),%r12
  1044. add $SZ*7(%r15),$H
  1045. cmp $_end,%r13
  1046. mov $A,$SZ*0(%r15)
  1047. cmove %rsp,%r12 # next block or stale data
  1048. mov $B,$SZ*1(%r15)
  1049. mov $C,$SZ*2(%r15)
  1050. mov $D,$SZ*3(%r15)
  1051. mov $E,$SZ*4(%r15)
  1052. mov $F,$SZ*5(%r15)
  1053. mov $G,$SZ*6(%r15)
  1054. mov $H,$SZ*7(%r15)
  1055. jbe .Loop_avx2
  1056. lea (%rsp),$Tbl
  1057. .Ldone_avx2:
  1058. lea ($Tbl),%rsp
  1059. mov $_ivp,$ivp
  1060. mov $_rsp,%rsi
  1061. vmovdqu $iv,($ivp) # output IV
  1062. vzeroall
  1063. ___
  1064. $code.=<<___ if ($win64);
  1065. movaps `$framesz+16*0`(%rsp),%xmm6
  1066. movaps `$framesz+16*1`(%rsp),%xmm7
  1067. movaps `$framesz+16*2`(%rsp),%xmm8
  1068. movaps `$framesz+16*3`(%rsp),%xmm9
  1069. movaps `$framesz+16*4`(%rsp),%xmm10
  1070. movaps `$framesz+16*5`(%rsp),%xmm11
  1071. movaps `$framesz+16*6`(%rsp),%xmm12
  1072. movaps `$framesz+16*7`(%rsp),%xmm13
  1073. movaps `$framesz+16*8`(%rsp),%xmm14
  1074. movaps `$framesz+16*9`(%rsp),%xmm15
  1075. ___
  1076. $code.=<<___;
  1077. mov (%rsi),%r15
  1078. mov 8(%rsi),%r14
  1079. mov 16(%rsi),%r13
  1080. mov 24(%rsi),%r12
  1081. mov 32(%rsi),%rbp
  1082. mov 40(%rsi),%rbx
  1083. lea 48(%rsi),%rsp
  1084. .Lepilogue_avx2:
  1085. ret
  1086. .size ${func}_avx2,.-${func}_avx2
  1087. ___
  1088. }}
  1089. }}
  1090. {{
  1091. my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  1092. my ($rounds,$Tbl)=("%r11d","%rbx");
  1093. my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
  1094. my @rndkey=("%xmm4","%xmm5");
  1095. my $r=0;
  1096. my $sn=0;
  1097. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
  1098. my @MSG=map("%xmm$_",(10..13));
  1099. my $aesenc=sub {
  1100. use integer;
  1101. my ($n,$k)=($r/10,$r%10);
  1102. if ($k==0) {
  1103. $code.=<<___;
  1104. movups `16*$n`($in0),$in # load input
  1105. xorps $rndkey0,$in
  1106. ___
  1107. $code.=<<___ if ($n);
  1108. movups $iv,`16*($n-1)`($out,$in0) # write output
  1109. ___
  1110. $code.=<<___;
  1111. xorps $in,$iv
  1112. movups `32+16*$k-112`($key),$rndkey[1]
  1113. aesenc $rndkey[0],$iv
  1114. ___
  1115. } elsif ($k==9) {
  1116. $sn++;
  1117. $code.=<<___;
  1118. cmp \$11,$rounds
  1119. jb .Laesenclast$sn
  1120. movups `32+16*($k+0)-112`($key),$rndkey[1]
  1121. aesenc $rndkey[0],$iv
  1122. movups `32+16*($k+1)-112`($key),$rndkey[0]
  1123. aesenc $rndkey[1],$iv
  1124. je .Laesenclast$sn
  1125. movups `32+16*($k+2)-112`($key),$rndkey[1]
  1126. aesenc $rndkey[0],$iv
  1127. movups `32+16*($k+3)-112`($key),$rndkey[0]
  1128. aesenc $rndkey[1],$iv
  1129. .Laesenclast$sn:
  1130. aesenclast $rndkey[0],$iv
  1131. movups 16-112($key),$rndkey[1] # forward reference
  1132. nop
  1133. ___
  1134. } else {
  1135. $code.=<<___;
  1136. movups `32+16*$k-112`($key),$rndkey[1]
  1137. aesenc $rndkey[0],$iv
  1138. ___
  1139. }
  1140. $r++; unshift(@rndkey,pop(@rndkey));
  1141. };
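
# Editor's note: the closure above emits the AES rounds of plain CBC
# encryption, out[n] = AES-Encrypt(key, in[n] xor out[n-1]) with out[-1]=IV,
# one aesenc at a time so they interleave with the SHA rounds that follow.
# A plain-Perl model of that recurrence, with a caller-supplied (hypothetical)
# block-cipher coderef standing in for the aesenc/aesenclast sequence:
sub cbc_encrypt_reference {
	my ($encrypt_block,$iv,@blocks)=@_;	# $encrypt_block: 16-byte string in/out
	my @out;
	for my $block (@blocks) {
		$iv = $encrypt_block->($iv ^ $block);	# chain previous ciphertext
		push @out,$iv;
	}
	return ($iv,@out);			# final IV plus ciphertext blocks
}
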
  1142. if ($shaext) {
  1143. my $Tbl="%rax";
  1144. $code.=<<___;
  1145. .type ${func}_shaext,\@function,6
  1146. .align 32
  1147. ${func}_shaext:
  1148. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  1149. ___
  1150. $code.=<<___ if ($win64);
  1151. lea `-8-10*16`(%rsp),%rsp
  1152. movaps %xmm6,-8-10*16(%rax)
  1153. movaps %xmm7,-8-9*16(%rax)
  1154. movaps %xmm8,-8-8*16(%rax)
  1155. movaps %xmm9,-8-7*16(%rax)
  1156. movaps %xmm10,-8-6*16(%rax)
  1157. movaps %xmm11,-8-5*16(%rax)
  1158. movaps %xmm12,-8-4*16(%rax)
  1159. movaps %xmm13,-8-3*16(%rax)
  1160. movaps %xmm14,-8-2*16(%rax)
  1161. movaps %xmm15,-8-1*16(%rax)
  1162. .Lprologue_shaext:
  1163. ___
  1164. $code.=<<___;
  1165. lea K256+0x80(%rip),$Tbl
  1166. movdqu ($ctx),$ABEF # DCBA
  1167. movdqu 16($ctx),$CDGH # HGFE
  1168. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  1169. mov 240($key),$rounds
  1170. sub $in0,$out
  1171. movups ($key),$rndkey0 # $key[0]
  1172. movups ($ivp),$iv # load IV
  1173. movups 16($key),$rndkey[0] # forward reference
  1174. lea 112($key),$key # size optimization
  1175. pshufd \$0x1b,$ABEF,$Wi # ABCD
  1176. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  1177. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  1178. movdqa $TMP,$BSWAP # offload
  1179. palignr \$8,$CDGH,$ABEF # ABEF
  1180. punpcklqdq $Wi,$CDGH # CDGH
  1181. jmp .Loop_shaext
  1182. .align 16
  1183. .Loop_shaext:
  1184. movdqu ($inp),@MSG[0]
  1185. movdqu 0x10($inp),@MSG[1]
  1186. movdqu 0x20($inp),@MSG[2]
  1187. pshufb $TMP,@MSG[0]
  1188. movdqu 0x30($inp),@MSG[3]
  1189. movdqa 0*32-0x80($Tbl),$Wi
  1190. paddd @MSG[0],$Wi
  1191. pshufb $TMP,@MSG[1]
  1192. movdqa $CDGH,$CDGH_SAVE # offload
  1193. movdqa $ABEF,$ABEF_SAVE # offload
  1194. ___
  1195. &$aesenc();
  1196. $code.=<<___;
  1197. sha256rnds2 $ABEF,$CDGH # 0-3
  1198. pshufd \$0x0e,$Wi,$Wi
  1199. ___
  1200. &$aesenc();
  1201. $code.=<<___;
  1202. sha256rnds2 $CDGH,$ABEF
  1203. movdqa 1*32-0x80($Tbl),$Wi
  1204. paddd @MSG[1],$Wi
  1205. pshufb $TMP,@MSG[2]
  1206. lea 0x40($inp),$inp
  1207. ___
  1208. &$aesenc();
  1209. $code.=<<___;
  1210. sha256rnds2 $ABEF,$CDGH # 4-7
  1211. pshufd \$0x0e,$Wi,$Wi
  1212. ___
  1213. &$aesenc();
  1214. $code.=<<___;
  1215. sha256rnds2 $CDGH,$ABEF
  1216. movdqa 2*32-0x80($Tbl),$Wi
  1217. paddd @MSG[2],$Wi
  1218. pshufb $TMP,@MSG[3]
  1219. sha256msg1 @MSG[1],@MSG[0]
  1220. ___
  1221. &$aesenc();
  1222. $code.=<<___;
  1223. sha256rnds2 $ABEF,$CDGH # 8-11
  1224. pshufd \$0x0e,$Wi,$Wi
  1225. movdqa @MSG[3],$TMP
  1226. palignr \$4,@MSG[2],$TMP
  1227. paddd $TMP,@MSG[0]
  1228. ___
  1229. &$aesenc();
  1230. $code.=<<___;
  1231. sha256rnds2 $CDGH,$ABEF
  1232. movdqa 3*32-0x80($Tbl),$Wi
  1233. paddd @MSG[3],$Wi
  1234. sha256msg2 @MSG[3],@MSG[0]
  1235. sha256msg1 @MSG[2],@MSG[1]
  1236. ___
  1237. &$aesenc();
  1238. $code.=<<___;
  1239. sha256rnds2 $ABEF,$CDGH # 12-15
  1240. pshufd \$0x0e,$Wi,$Wi
  1241. ___
  1242. &$aesenc();
  1243. $code.=<<___;
  1244. movdqa @MSG[0],$TMP
  1245. palignr \$4,@MSG[3],$TMP
  1246. paddd $TMP,@MSG[1]
  1247. sha256rnds2 $CDGH,$ABEF
  1248. ___
  1249. for($i=4;$i<16-3;$i++) {
  1250. &$aesenc() if (($r%10)==0);
  1251. $code.=<<___;
  1252. movdqa $i*32-0x80($Tbl),$Wi
  1253. paddd @MSG[0],$Wi
  1254. sha256msg2 @MSG[0],@MSG[1]
  1255. sha256msg1 @MSG[3],@MSG[2]
  1256. ___
  1257. &$aesenc();
  1258. $code.=<<___;
  1259. sha256rnds2 $ABEF,$CDGH # 16-19...
  1260. pshufd \$0x0e,$Wi,$Wi
  1261. movdqa @MSG[1],$TMP
  1262. palignr \$4,@MSG[0],$TMP
  1263. paddd $TMP,@MSG[2]
  1264. ___
  1265. &$aesenc();
  1266. &$aesenc() if ($r==19);
  1267. $code.=<<___;
  1268. sha256rnds2 $CDGH,$ABEF
  1269. ___
  1270. push(@MSG,shift(@MSG));
  1271. }
  1272. $code.=<<___;
  1273. movdqa 13*32-0x80($Tbl),$Wi
  1274. paddd @MSG[0],$Wi
  1275. sha256msg2 @MSG[0],@MSG[1]
  1276. sha256msg1 @MSG[3],@MSG[2]
  1277. ___
  1278. &$aesenc();
  1279. $code.=<<___;
  1280. sha256rnds2 $ABEF,$CDGH # 52-55
  1281. pshufd \$0x0e,$Wi,$Wi
  1282. movdqa @MSG[1],$TMP
  1283. palignr \$4,@MSG[0],$TMP
  1284. paddd $TMP,@MSG[2]
  1285. ___
  1286. &$aesenc();
  1287. &$aesenc();
  1288. $code.=<<___;
  1289. sha256rnds2 $CDGH,$ABEF
  1290. movdqa 14*32-0x80($Tbl),$Wi
  1291. paddd @MSG[1],$Wi
  1292. sha256msg2 @MSG[1],@MSG[2]
  1293. movdqa $BSWAP,$TMP
  1294. ___
  1295. &$aesenc();
  1296. $code.=<<___;
  1297. sha256rnds2 $ABEF,$CDGH # 56-59
  1298. pshufd \$0x0e,$Wi,$Wi
  1299. ___
  1300. &$aesenc();
  1301. $code.=<<___;
  1302. sha256rnds2 $CDGH,$ABEF
  1303. movdqa 15*32-0x80($Tbl),$Wi
  1304. paddd @MSG[2],$Wi
  1305. ___
  1306. &$aesenc();
  1307. &$aesenc();
  1308. $code.=<<___;
  1309. sha256rnds2 $ABEF,$CDGH # 60-63
  1310. pshufd \$0x0e,$Wi,$Wi
  1311. ___
  1312. &$aesenc();
  1313. $code.=<<___;
  1314. sha256rnds2 $CDGH,$ABEF
  1315. #pxor $CDGH,$rndkey0 # black magic
  1316. ___
  1317. while ($r<40) { &$aesenc(); } # remaining aesenc's
  1318. $code.=<<___;
  1319. #xorps $CDGH,$rndkey0 # black magic
  1320. paddd $CDGH_SAVE,$CDGH
  1321. paddd $ABEF_SAVE,$ABEF
  1322. dec $len
  1323. movups $iv,48($out,$in0) # write output
  1324. lea 64($in0),$in0
  1325. jnz .Loop_shaext
  1326. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  1327. pshufd \$0x1b,$ABEF,$TMP # FEBA
  1328. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  1329. punpckhqdq $CDGH,$ABEF # DCBA
  1330. palignr \$8,$TMP,$CDGH # HGFE
  1331. movups $iv,($ivp) # write IV
  1332. movdqu $ABEF,($ctx)
  1333. movdqu $CDGH,16($ctx)
  1334. ___
  1335. $code.=<<___ if ($win64);
  1336. movaps 0*16(%rsp),%xmm6
  1337. movaps 1*16(%rsp),%xmm7
  1338. movaps 2*16(%rsp),%xmm8
  1339. movaps 3*16(%rsp),%xmm9
  1340. movaps 4*16(%rsp),%xmm10
  1341. movaps 5*16(%rsp),%xmm11
  1342. movaps 6*16(%rsp),%xmm12
  1343. movaps 7*16(%rsp),%xmm13
  1344. movaps 8*16(%rsp),%xmm14
  1345. movaps 9*16(%rsp),%xmm15
  1346. lea 8+10*16(%rsp),%rsp
  1347. .Lepilogue_shaext:
  1348. ___
  1349. $code.=<<___;
  1350. ret
  1351. .size ${func}_shaext,.-${func}_shaext
  1352. ___
  1353. }
  1354. }}}}}
  1355. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1356. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1357. if ($win64 && $avx) {
  1358. $rec="%rcx";
  1359. $frame="%rdx";
  1360. $context="%r8";
  1361. $disp="%r9";
  1362. $code.=<<___;
  1363. .extern __imp_RtlVirtualUnwind
  1364. .type se_handler,\@abi-omnipotent
  1365. .align 16
  1366. se_handler:
  1367. push %rsi
  1368. push %rdi
  1369. push %rbx
  1370. push %rbp
  1371. push %r12
  1372. push %r13
  1373. push %r14
  1374. push %r15
  1375. pushfq
  1376. sub \$64,%rsp
  1377. mov 120($context),%rax # pull context->Rax
  1378. mov 248($context),%rbx # pull context->Rip
  1379. mov 8($disp),%rsi # disp->ImageBase
  1380. mov 56($disp),%r11 # disp->HandlerData
  1381. mov 0(%r11),%r10d # HandlerData[0]
  1382. lea (%rsi,%r10),%r10 # prologue label
  1383. cmp %r10,%rbx # context->Rip<prologue label
  1384. jb .Lin_prologue
  1385. mov 152($context),%rax # pull context->Rsp
  1386. mov 4(%r11),%r10d # HandlerData[1]
  1387. lea (%rsi,%r10),%r10 # epilogue label
  1388. cmp %r10,%rbx # context->Rip>=epilogue label
  1389. jae .Lin_prologue
  1390. ___
  1391. $code.=<<___ if ($shaext);
  1392. lea aesni_cbc_sha256_enc_shaext(%rip),%r10
  1393. cmp %r10,%rbx
  1394. jb .Lnot_in_shaext
  1395. lea (%rax),%rsi
  1396. lea 512($context),%rdi # &context.Xmm6
  1397. mov \$20,%ecx
  1398. .long 0xa548f3fc # cld; rep movsq
  1399. lea 168(%rax),%rax # adjust stack pointer
  1400. jmp .Lin_prologue
  1401. .Lnot_in_shaext:
  1402. ___
  1403. $code.=<<___ if ($avx>1);
  1404. lea .Lavx2_shortcut(%rip),%r10
  1405. cmp %r10,%rbx # context->Rip<avx2_shortcut
  1406. jb .Lnot_in_avx2
  1407. and \$-256*$SZ,%rax
  1408. add \$`2*$SZ*($rounds-8)`,%rax
  1409. .Lnot_in_avx2:
  1410. ___
  1411. $code.=<<___;
  1412. mov %rax,%rsi # put aside Rsp
  1413. mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
  1414. lea 48(%rax),%rax
  1415. mov -8(%rax),%rbx
  1416. mov -16(%rax),%rbp
  1417. mov -24(%rax),%r12
  1418. mov -32(%rax),%r13
  1419. mov -40(%rax),%r14
  1420. mov -48(%rax),%r15
  1421. mov %rbx,144($context) # restore context->Rbx
  1422. mov %rbp,160($context) # restore context->Rbp
  1423. mov %r12,216($context) # restore context->R12
  1424. mov %r13,224($context) # restore context->R13
  1425. mov %r14,232($context) # restore context->R14
  1426. mov %r15,240($context) # restore context->R15
  1427. lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
  1428. lea 512($context),%rdi # &context.Xmm6
  1429. mov \$20,%ecx
  1430. .long 0xa548f3fc # cld; rep movsq
  1431. .Lin_prologue:
  1432. mov 8(%rax),%rdi
  1433. mov 16(%rax),%rsi
  1434. mov %rax,152($context) # restore context->Rsp
  1435. mov %rsi,168($context) # restore context->Rsi
  1436. mov %rdi,176($context) # restore context->Rdi
  1437. mov 40($disp),%rdi # disp->ContextRecord
  1438. mov $context,%rsi # context
  1439. mov \$154,%ecx # sizeof(CONTEXT)
  1440. .long 0xa548f3fc # cld; rep movsq
  1441. mov $disp,%rsi
  1442. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1443. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1444. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1445. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1446. mov 40(%rsi),%r10 # disp->ContextRecord
  1447. lea 56(%rsi),%r11 # &disp->HandlerData
  1448. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1449. mov %r10,32(%rsp) # arg5
  1450. mov %r11,40(%rsp) # arg6
  1451. mov %r12,48(%rsp) # arg7
  1452. mov %rcx,56(%rsp) # arg8, (NULL)
  1453. call *__imp_RtlVirtualUnwind(%rip)
  1454. mov \$1,%eax # ExceptionContinueSearch
  1455. add \$64,%rsp
  1456. popfq
  1457. pop %r15
  1458. pop %r14
  1459. pop %r13
  1460. pop %r12
  1461. pop %rbp
  1462. pop %rbx
  1463. pop %rdi
  1464. pop %rsi
  1465. ret
  1466. .size se_handler,.-se_handler
  1467. .section .pdata
  1468. .rva .LSEH_begin_${func}_xop
  1469. .rva .LSEH_end_${func}_xop
  1470. .rva .LSEH_info_${func}_xop
  1471. .rva .LSEH_begin_${func}_avx
  1472. .rva .LSEH_end_${func}_avx
  1473. .rva .LSEH_info_${func}_avx
  1474. ___
  1475. $code.=<<___ if ($avx>1);
  1476. .rva .LSEH_begin_${func}_avx2
  1477. .rva .LSEH_end_${func}_avx2
  1478. .rva .LSEH_info_${func}_avx2
  1479. ___
  1480. $code.=<<___ if ($shaext);
  1481. .rva .LSEH_begin_${func}_shaext
  1482. .rva .LSEH_end_${func}_shaext
  1483. .rva .LSEH_info_${func}_shaext
  1484. ___
  1485. $code.=<<___;
  1486. .section .xdata
  1487. .align 8
  1488. .LSEH_info_${func}_xop:
  1489. .byte 9,0,0,0
  1490. .rva se_handler
  1491. .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
  1492. .LSEH_info_${func}_avx:
  1493. .byte 9,0,0,0
  1494. .rva se_handler
  1495. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  1496. ___
  1497. $code.=<<___ if ($avx>1);
  1498. .LSEH_info_${func}_avx2:
  1499. .byte 9,0,0,0
  1500. .rva se_handler
  1501. .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
  1502. ___
  1503. $code.=<<___ if ($shaext);
  1504. .LSEH_info_${func}_shaext:
  1505. .byte 9,0,0,0
  1506. .rva se_handler
  1507. .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
  1508. ___
  1509. }
  1510. ####################################################################
  1511. sub rex {
  1512. local *opcode=shift;
  1513. my ($dst,$src)=@_;
  1514. my $rex=0;
  1515. $rex|=0x04 if($dst>=8);
  1516. $rex|=0x01 if($src>=8);
  1517. unshift @opcode,$rex|0x40 if($rex);
  1518. }
  1519. {
  1520. my %opcodelet = (
  1521. "sha256rnds2" => 0xcb,
  1522. "sha256msg1" => 0xcc,
  1523. "sha256msg2" => 0xcd );
  1524. sub sha256op38 {
  1525. my $instr = shift;
  1526. if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  1527. my @opcode=(0x0f,0x38);
  1528. rex(\@opcode,$2,$1);
  1529. push @opcode,$opcodelet{$instr};
  1530. push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
  1531. return ".byte\t".join(',',@opcode);
  1532. } else {
  1533. return $instr."\t".@_[0];
  1534. }
  1535. }
  1536. }
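
# Worked example (editor's note): for "sha256rnds2 %xmm4,%xmm2" the regexp
# captures src=4 and dst=2; neither register is >=8, so no REX byte is
# prepended, and the emitted line is
#	.byte	15,56,203,212
# i.e. 0x0f,0x38,0xcb (sha256rnds2) followed by ModR/M 0xc0|4|(2<<3)=0xd4.
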
  1537. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1538. $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
  1539. print $code;
  1540. close STDOUT;