aesni-sha256-x86_64.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # January 2013
  17. #
  18. # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  19. # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
  20. # that since AESNI-CBC encryption exhibits *very* low instruction-level
  21. # parallelism, interleaving it with another algorithm allows better
  22. # utilization of processor resources and hence better performance.
  23. # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  24. # the AESNI code is woven into them. As SHA256 dominates execution time,
  25. # stitch performance does not depend on AES key length. Below are
  26. # performance numbers in cycles per processed byte, less is better,
  27. # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
  28. # subroutine:
  29. #
  30. #                 AES-128/-192/-256+SHA256   this(**)      gain
  31. # Sandy Bridge    5.05/6.05/7.05+11.6        13.0          +28%/36%/43%
  32. # Ivy Bridge      5.05/6.05/7.05+10.3        11.6          +32%/41%/50%
  33. # Haswell         4.43/5.29/6.19+7.80        8.79          +39%/49%/59%
  34. # Skylake         2.62/3.14/3.62+7.70        8.10          +27%/34%/40%
  35. # Bulldozer       5.77/6.89/8.00+13.7        13.7          +42%/50%/58%
  36. # Ryzen(***)      2.71/-/3.71+2.05           2.74/-/3.73   +74%/-/54%
  37. # Goldmont(***)   3.82/-/5.35+4.16           4.73/-/5.94   +69%/-/60%
  38. #
  39. # (*) there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
  40. # is left out; the gain there was not estimated high enough to
  41. # justify the effort;
  42. # (**) these are EVP-free results; results obtained with 'speed
  43. # -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
  44. # (***) these are SHAEXT results;
  45. # $output is the last argument if it looks like a file (it has an extension)
  46. # $flavour is the first argument if it doesn't look like a file
  47. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  48. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  49. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  50. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  51. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  52. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  53. die "can't locate x86_64-xlate.pl";
  54. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  55. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  56. $avx = ($1>=2.19) + ($1>=2.22);
  57. }
  58. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  59. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  60. $avx = ($1>=2.09) + ($1>=2.10);
  61. }
  62. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  63. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  64. $avx = ($1>=10) + ($1>=12);
  65. }
  66. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  67. $avx = ($2>=3.0) + ($2>3.0);
  68. }
  69. $shaext=$avx; ### set to zero if compiling for 1.0.1
  70. $avx=1 if (!$shaext && $avx);
  71. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  72. or die "can't call $xlate: $!";
  73. *STDOUT=*OUT;
  74. $func="aesni_cbc_sha256_enc";
  75. $TABLE="K256";
  76. $SZ=4;
  77. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  78. "%r8d","%r9d","%r10d","%r11d");
  79. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  80. @Sigma0=( 2,13,22);
  81. @Sigma1=( 6,11,25);
  82. @sigma0=( 7,18, 3);
  83. @sigma1=(17,19,10);
  84. $rounds=64;
  85. ########################################################################
  86. # void aesni_cbc_sha256_enc(const void *inp,
  87. # void *out,
  88. # size_t length,
  89. # const AES_KEY *key,
  90. # unsigned char *iv,
  91. # SHA256_CTX *ctx,
  92. # const void *in0);
  93. ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
  94. ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
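# An illustrative call from C (caller-side names are hypothetical; the
# "blocks" interpretation follows from the code below, which either executes
# "shl \$6,$len" or counts $len down once per 64-byte block, and the last
# parameter is fetched from the stack):
#
#   unsigned char iv[16];        /* CBC IV, updated in place on return */
#   SHA256_CTX md;               /* running SHA256 state               */
#   AES_KEY ks;                  /* AES encryption key schedule        */
#   size_t blocks = len >> 6;    /* whole 64-byte blocks only          */
#   aesni_cbc_sha256_enc(cipher_in, cipher_out, blocks, &ks, iv, &md, hash_in);
#
# i.e. "blocks" 64-byte chunks of hash_in are hashed into md while the same
# amount of cipher_in is AES-CBC-encrypted into cipher_out.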
  95. $Tbl="%rbp";
  96. $_inp="16*$SZ+0*8(%rsp)";
  97. $_out="16*$SZ+1*8(%rsp)";
  98. $_end="16*$SZ+2*8(%rsp)";
  99. $_key="16*$SZ+3*8(%rsp)";
  100. $_ivp="16*$SZ+4*8(%rsp)";
  101. $_ctx="16*$SZ+5*8(%rsp)";
  102. $_in0="16*$SZ+6*8(%rsp)";
  103. $_rsp="`16*$SZ+7*8`(%rsp)";
  104. $framesz=16*$SZ+8*8;
  105. $code=<<___;
  106. .text
  107. .extern OPENSSL_ia32cap_P
  108. .globl $func
  109. .type $func,\@abi-omnipotent
  110. .align 16
  111. $func:
  112. .cfi_startproc
  113. ___
  114. if ($avx) {
  115. $code.=<<___;
  116. lea OPENSSL_ia32cap_P(%rip),%r11
  117. mov \$1,%eax
  118. cmp \$0,`$win64?"%rcx":"%rdi"`
  119. je .Lprobe
  120. mov 0(%r11),%eax
  121. mov 4(%r11),%r10
  122. ___
  123. $code.=<<___ if ($shaext);
  124. bt \$61,%r10 # check for SHA
  125. jc ${func}_shaext
  126. ___
  127. $code.=<<___;
  128. mov %r10,%r11
  129. shr \$32,%r11
  130. test \$`1<<11`,%r10d # check for XOP
  131. jnz ${func}_xop
  132. ___
  133. $code.=<<___ if ($avx>1);
  134. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  135. cmp \$`1<<8|1<<5|1<<3`,%r11d
  136. je ${func}_avx2
  137. ___
  138. $code.=<<___;
  139. and \$`1<<28`,%r10d # check for AVX
  140. jnz ${func}_avx
  141. ud2
  142. ___
  143. }
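# Dispatch summary: when the first argument (%rdi, or %rcx under the Win64
# ABI) is NULL, $func only reports whether a stitched path was compiled in,
# returning 1 from an AVX-enabled build and 0 from the fallback block emitted
# just below. Otherwise it selects a body from OPENSSL_ia32cap_P exactly as
# the inline comments say: the SHA extension bit routes to ${func}_shaext
# (when compiled in), XOP to ${func}_xop, BMI1+AVX2+BMI2 to ${func}_avx2,
# plain AVX to ${func}_avx, and anything else hits ud2. An illustrative probe
# from C (hypothetical caller):
#
#   int stitch_usable = aesni_cbc_sha256_enc(NULL, NULL, 0, NULL, NULL, NULL, NULL);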
  144. $code.=<<___;
  145. xor %eax,%eax
  146. cmp \$0,`$win64?"%rcx":"%rdi"`
  147. je .Lprobe
  148. ud2
  149. .Lprobe:
  150. ret
  151. .cfi_endproc
  152. .size $func,.-$func
  153. .align 64
  154. .type $TABLE,\@object
  155. $TABLE:
  156. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  157. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  158. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  159. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  160. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  161. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  162. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  163. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  164. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  165. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  166. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  167. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  168. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  169. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  170. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  171. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  172. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  173. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  174. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  175. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  176. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  177. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  178. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  179. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  180. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  181. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  182. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  183. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  184. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  185. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  186. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  187. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  188. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  189. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  190. .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
  191. .long 0,0,0,0, 0,0,0,0
  192. .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  193. .align 64
  194. ___
  195. ######################################################################
  196. # SIMD code paths
  197. #
  198. {{{
  199. ($iv,$inout,$roundkey,$temp,
  200. $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
  201. $aesni_cbc_idx=0;
  202. @aesni_cbc_block = (
  203. ## &vmovdqu ($roundkey,"0x00-0x80($inp)");
  204. ## &vmovdqu ($inout,($inp));
  205. ## &mov ($_inp,$inp);
  206. '&vpxor ($inout,$inout,$roundkey);'.
  207. ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
  208. '&vpxor ($inout,$inout,$iv);',
  209. '&vaesenc ($inout,$inout,$roundkey);'.
  210. ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
  211. '&vaesenc ($inout,$inout,$roundkey);'.
  212. ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
  213. '&vaesenc ($inout,$inout,$roundkey);'.
  214. ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
  215. '&vaesenc ($inout,$inout,$roundkey);'.
  216. ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
  217. '&vaesenc ($inout,$inout,$roundkey);'.
  218. ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
  219. '&vaesenc ($inout,$inout,$roundkey);'.
  220. ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
  221. '&vaesenc ($inout,$inout,$roundkey);'.
  222. ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
  223. '&vaesenc ($inout,$inout,$roundkey);'.
  224. ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
  225. '&vaesenc ($inout,$inout,$roundkey);'.
  226. ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
  227. '&vaesenclast ($temp,$inout,$roundkey);'.
  228. ' &vaesenc ($inout,$inout,$roundkey);'.
  229. ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
  230. '&vpand ($iv,$temp,$mask10);'.
  231. ' &vaesenc ($inout,$inout,$roundkey);'.
  232. ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
  233. '&vaesenclast ($temp,$inout,$roundkey);'.
  234. ' &vaesenc ($inout,$inout,$roundkey);'.
  235. ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
  236. '&vpand ($temp,$temp,$mask12);'.
  237. ' &vaesenc ($inout,$inout,$roundkey);'.
  238. '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
  239. '&vpor ($iv,$iv,$temp);'.
  240. ' &vaesenclast ($temp,$inout,$roundkey);'.
  241. ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
  242. ## &mov ($inp,$_inp);
  243. ## &mov ($out,$_out);
  244. ## &vpand ($temp,$temp,$mask14);
  245. ## &vpor ($iv,$iv,$temp);
  246. ## &vmovdqu ("($out,$inp)",$iv);
  247. ## &lea ($inp,"16($inp)");
  248. );
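# How these fragments are consumed: body_00_15()/bodyx_00_15() below splice
# exactly one entry per SHA256 round via $aesni_cbc_idx, so the 16 fragments
# encrypt one 16-byte CBC block every 16 rounds, i.e. four AES blocks
# (64 bytes) per 64-round SHA256 block, the same 64 bytes that are being
# hashed. Rather than branching on the key length, the three vaesenclast
# results, produced with the final round key of a 10-, 12- and 14-round
# schedule respectively, are masked with $mask10/$mask12/$mask14 and OR-ed
# into $iv, so only the one matching the actual schedule length survives.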
  249. my $a4=$T1;
  250. my ($a,$b,$c,$d,$e,$f,$g,$h);
  251. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  252. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  253. my $arg = pop;
  254. $arg = "\$$arg" if ($arg*1 eq $arg);
  255. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  256. }
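# Example of what this thunk emits: eval'ing '&ror ($a0,$Sigma1[2]-$Sigma1[1])'
# with $a0="%r13d" appends "\tror\t$14,%r13d\n" to $code; register arguments
# pass through unchanged, while a purely numeric last argument is given an
# immediate '$' prefix.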
  257. sub body_00_15 () {
  258. (
  259. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  260. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  261. '&mov ($a,$a1)',
  262. '&mov ($a4,$f)',
  263. '&xor ($a0,$e)',
  264. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  265. '&xor ($a4,$g)', # f^g
  266. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  267. '&xor ($a1,$a)',
  268. '&and ($a4,$e)', # (f^g)&e
  269. @aesni_cbc_block[$aesni_cbc_idx++].
  270. '&xor ($a0,$e)',
  271. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  272. '&mov ($a2,$a)',
  273. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  274. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  275. '&xor ($a2,$b)', # a^b, b^c in next round
  276. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  277. '&add ($h,$a4)', # h+=Ch(e,f,g)
  278. '&and ($a3,$a2)', # (b^c)&(a^b)
  279. '&xor ($a1,$a)',
  280. '&add ($h,$a0)', # h+=Sigma1(e)
  281. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  282. '&add ($d,$h)', # d+=h
  283. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  284. '&add ($h,$a3)', # h+=Maj(a,b,c)
  285. '&mov ($a0,$d)',
  286. '&add ($a1,$h);'. # h+=Sigma0(a)
  287. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  288. );
  289. }
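# Each pass over the list above emits one scalar SHA256 round:
#
#   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[i]+X[i]);   d += T1;
#   a' = T1 + Sigma0(a) + Maj(a,b,c)
#
# Maj is computed as Ch(a^b,c,b) from the b^c value carried over in $a3, the
# new a is accumulated in $a1 and only written back by the '&mov ($a,$a1)' at
# the start of the next round, one @aesni_cbc_block fragment is injected per
# round, and the trailing rotation of @ROT lets the same code serve all 64
# rounds.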
  290. if ($avx) {{
  291. ######################################################################
  292. # XOP code path
  293. #
  294. $code.=<<___;
  295. .type ${func}_xop,\@function,6
  296. .align 64
  297. ${func}_xop:
  298. .cfi_startproc
  299. .Lxop_shortcut:
  300. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  301. mov %rsp,%rax # copy %rsp
  302. .cfi_def_cfa_register %rax
  303. push %rbx
  304. .cfi_push %rbx
  305. push %rbp
  306. .cfi_push %rbp
  307. push %r12
  308. .cfi_push %r12
  309. push %r13
  310. .cfi_push %r13
  311. push %r14
  312. .cfi_push %r14
  313. push %r15
  314. .cfi_push %r15
  315. sub \$`$framesz+$win64*16*10`,%rsp
  316. and \$-64,%rsp # align stack frame
  317. shl \$6,$len
  318. sub $inp,$out # re-bias
  319. sub $inp,$in0
  320. add $inp,$len # end of input
  321. #mov $inp,$_inp # saved later
  322. mov $out,$_out
  323. mov $len,$_end
  324. #mov $key,$_key # remains resident in $inp register
  325. mov $ivp,$_ivp
  326. mov $ctx,$_ctx
  327. mov $in0,$_in0
  328. mov %rax,$_rsp
  329. .cfi_cfa_expression $_rsp,deref,+8
  330. ___
  331. $code.=<<___ if ($win64);
  332. movaps %xmm6,`$framesz+16*0`(%rsp)
  333. movaps %xmm7,`$framesz+16*1`(%rsp)
  334. movaps %xmm8,`$framesz+16*2`(%rsp)
  335. movaps %xmm9,`$framesz+16*3`(%rsp)
  336. movaps %xmm10,`$framesz+16*4`(%rsp)
  337. movaps %xmm11,`$framesz+16*5`(%rsp)
  338. movaps %xmm12,`$framesz+16*6`(%rsp)
  339. movaps %xmm13,`$framesz+16*7`(%rsp)
  340. movaps %xmm14,`$framesz+16*8`(%rsp)
  341. movaps %xmm15,`$framesz+16*9`(%rsp)
  342. ___
  343. $code.=<<___;
  344. .Lprologue_xop:
  345. vzeroall
  346. mov $inp,%r12 # borrow $a4
  347. lea 0x80($key),$inp # size optimization, reassign
  348. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
  349. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  350. mov $ctx,%r15 # borrow $a2
  351. mov $in0,%rsi # borrow $a3
  352. vmovdqu ($ivp),$iv # load IV
  353. sub \$9,%r14
  354. mov $SZ*0(%r15),$A
  355. mov $SZ*1(%r15),$B
  356. mov $SZ*2(%r15),$C
  357. mov $SZ*3(%r15),$D
  358. mov $SZ*4(%r15),$E
  359. mov $SZ*5(%r15),$F
  360. mov $SZ*6(%r15),$G
  361. mov $SZ*7(%r15),$H
  362. vmovdqa 0x00(%r13,%r14,8),$mask14
  363. vmovdqa 0x10(%r13,%r14,8),$mask12
  364. vmovdqa 0x20(%r13,%r14,8),$mask10
  365. vmovdqu 0x00-0x80($inp),$roundkey
  366. jmp .Lloop_xop
  367. ___
  368. if ($SZ==4) { # SHA256
  369. my @X = map("%xmm$_",(0..3));
  370. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  371. $code.=<<___;
  372. .align 16
  373. .Lloop_xop:
  374. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  375. vmovdqu 0x00(%rsi,%r12),@X[0]
  376. vmovdqu 0x10(%rsi,%r12),@X[1]
  377. vmovdqu 0x20(%rsi,%r12),@X[2]
  378. vmovdqu 0x30(%rsi,%r12),@X[3]
  379. vpshufb $t3,@X[0],@X[0]
  380. lea $TABLE(%rip),$Tbl
  381. vpshufb $t3,@X[1],@X[1]
  382. vpshufb $t3,@X[2],@X[2]
  383. vpaddd 0x00($Tbl),@X[0],$t0
  384. vpshufb $t3,@X[3],@X[3]
  385. vpaddd 0x20($Tbl),@X[1],$t1
  386. vpaddd 0x40($Tbl),@X[2],$t2
  387. vpaddd 0x60($Tbl),@X[3],$t3
  388. vmovdqa $t0,0x00(%rsp)
  389. mov $A,$a1
  390. vmovdqa $t1,0x10(%rsp)
  391. mov $B,$a3
  392. vmovdqa $t2,0x20(%rsp)
  393. xor $C,$a3 # magic
  394. vmovdqa $t3,0x30(%rsp)
  395. mov $E,$a0
  396. jmp .Lxop_00_47
  397. .align 16
  398. .Lxop_00_47:
  399. sub \$-16*2*$SZ,$Tbl # size optimization
  400. vmovdqu (%r12),$inout # $a4
  401. mov %r12,$_inp # $a4
  402. ___
  403. sub XOP_256_00_47 () {
  404. my $j = shift;
  405. my $body = shift;
  406. my @X = @_;
  407. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  408. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  409. eval(shift(@insns));
  410. eval(shift(@insns));
  411. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  412. eval(shift(@insns));
  413. eval(shift(@insns));
  414. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  415. eval(shift(@insns));
  416. eval(shift(@insns));
  417. &vpsrld ($t0,$t0,$sigma0[2]);
  418. eval(shift(@insns));
  419. eval(shift(@insns));
  420. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  421. eval(shift(@insns));
  422. eval(shift(@insns));
  423. eval(shift(@insns));
  424. eval(shift(@insns));
  425. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  426. eval(shift(@insns));
  427. eval(shift(@insns));
  428. &vpxor ($t0,$t0,$t1);
  429. eval(shift(@insns));
  430. eval(shift(@insns));
  431. eval(shift(@insns));
  432. eval(shift(@insns));
  433. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  434. eval(shift(@insns));
  435. eval(shift(@insns));
  436. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  437. eval(shift(@insns));
  438. eval(shift(@insns));
  439. &vpsrld ($t2,@X[3],$sigma1[2]);
  440. eval(shift(@insns));
  441. eval(shift(@insns));
  442. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  443. eval(shift(@insns));
  444. eval(shift(@insns));
  445. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  446. eval(shift(@insns));
  447. eval(shift(@insns));
  448. &vpxor ($t3,$t3,$t2);
  449. eval(shift(@insns));
  450. eval(shift(@insns));
  451. eval(shift(@insns));
  452. eval(shift(@insns));
  453. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  454. eval(shift(@insns));
  455. eval(shift(@insns));
  456. eval(shift(@insns));
  457. eval(shift(@insns));
  458. &vpsrldq ($t3,$t3,8);
  459. eval(shift(@insns));
  460. eval(shift(@insns));
  461. eval(shift(@insns));
  462. eval(shift(@insns));
  463. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  464. eval(shift(@insns));
  465. eval(shift(@insns));
  466. eval(shift(@insns));
  467. eval(shift(@insns));
  468. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  469. eval(shift(@insns));
  470. eval(shift(@insns));
  471. &vpsrld ($t2,@X[0],$sigma1[2]);
  472. eval(shift(@insns));
  473. eval(shift(@insns));
  474. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  475. eval(shift(@insns));
  476. eval(shift(@insns));
  477. &vpxor ($t3,$t3,$t2);
  478. eval(shift(@insns));
  479. eval(shift(@insns));
  480. eval(shift(@insns));
  481. eval(shift(@insns));
  482. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  483. eval(shift(@insns));
  484. eval(shift(@insns));
  485. eval(shift(@insns));
  486. eval(shift(@insns));
  487. &vpslldq ($t3,$t3,8); # 22 instructions
  488. eval(shift(@insns));
  489. eval(shift(@insns));
  490. eval(shift(@insns));
  491. eval(shift(@insns));
  492. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  493. eval(shift(@insns));
  494. eval(shift(@insns));
  495. eval(shift(@insns));
  496. eval(shift(@insns));
  497. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  498. foreach (@insns) { eval; } # remaining instructions
  499. &vmovdqa (16*$j."(%rsp)",$t2);
  500. }
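# XOP_256_00_47() performs the same four-lane message-schedule update as the
# AVX code path further down, but uses XOP's vprotd rotates to form
# sigma0/sigma1 directly instead of composing them from shift/xor sequences,
# and interleaves those vector instructions with four rounds' worth
# (4 x 26 = 104 fragments) of the scalar SHA256 + AES-CBC code produced by
# $body.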
  501. $aesni_cbc_idx=0;
  502. for ($i=0,$j=0; $j<4; $j++) {
  503. &XOP_256_00_47($j,\&body_00_15,@X);
  504. push(@X,shift(@X)); # rotate(@X)
  505. }
  506. &mov ("%r12",$_inp); # borrow $a4
  507. &vpand ($temp,$temp,$mask14);
  508. &mov ("%r15",$_out); # borrow $a2
  509. &vpor ($iv,$iv,$temp);
  510. &vmovdqu ("(%r15,%r12)",$iv); # write output
  511. &lea ("%r12","16(%r12)"); # inp++
  512. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  513. &jne (".Lxop_00_47");
  514. &vmovdqu ($inout,"(%r12)");
  515. &mov ($_inp,"%r12");
  516. $aesni_cbc_idx=0;
  517. for ($i=0; $i<16; ) {
  518. foreach(body_00_15()) { eval; }
  519. }
  520. }
  521. $code.=<<___;
  522. mov $_inp,%r12 # borrow $a4
  523. mov $_out,%r13 # borrow $a0
  524. mov $_ctx,%r15 # borrow $a2
  525. mov $_in0,%rsi # borrow $a3
  526. vpand $mask14,$temp,$temp
  527. mov $a1,$A
  528. vpor $temp,$iv,$iv
  529. vmovdqu $iv,(%r13,%r12) # write output
  530. lea 16(%r12),%r12 # inp++
  531. add $SZ*0(%r15),$A
  532. add $SZ*1(%r15),$B
  533. add $SZ*2(%r15),$C
  534. add $SZ*3(%r15),$D
  535. add $SZ*4(%r15),$E
  536. add $SZ*5(%r15),$F
  537. add $SZ*6(%r15),$G
  538. add $SZ*7(%r15),$H
  539. cmp $_end,%r12
  540. mov $A,$SZ*0(%r15)
  541. mov $B,$SZ*1(%r15)
  542. mov $C,$SZ*2(%r15)
  543. mov $D,$SZ*3(%r15)
  544. mov $E,$SZ*4(%r15)
  545. mov $F,$SZ*5(%r15)
  546. mov $G,$SZ*6(%r15)
  547. mov $H,$SZ*7(%r15)
  548. jb .Lloop_xop
  549. mov $_ivp,$ivp
  550. mov $_rsp,%rsi
  551. .cfi_def_cfa %rsi,8
  552. vmovdqu $iv,($ivp) # output IV
  553. vzeroall
  554. ___
  555. $code.=<<___ if ($win64);
  556. movaps `$framesz+16*0`(%rsp),%xmm6
  557. movaps `$framesz+16*1`(%rsp),%xmm7
  558. movaps `$framesz+16*2`(%rsp),%xmm8
  559. movaps `$framesz+16*3`(%rsp),%xmm9
  560. movaps `$framesz+16*4`(%rsp),%xmm10
  561. movaps `$framesz+16*5`(%rsp),%xmm11
  562. movaps `$framesz+16*6`(%rsp),%xmm12
  563. movaps `$framesz+16*7`(%rsp),%xmm13
  564. movaps `$framesz+16*8`(%rsp),%xmm14
  565. movaps `$framesz+16*9`(%rsp),%xmm15
  566. ___
  567. $code.=<<___;
  568. mov -48(%rsi),%r15
  569. .cfi_restore %r15
  570. mov -40(%rsi),%r14
  571. .cfi_restore %r14
  572. mov -32(%rsi),%r13
  573. .cfi_restore %r13
  574. mov -24(%rsi),%r12
  575. .cfi_restore %r12
  576. mov -16(%rsi),%rbp
  577. .cfi_restore %rbp
  578. mov -8(%rsi),%rbx
  579. .cfi_restore %rbx
  580. lea (%rsi),%rsp
  581. .cfi_def_cfa_register %rsp
  582. .Lepilogue_xop:
  583. ret
  584. .cfi_endproc
  585. .size ${func}_xop,.-${func}_xop
  586. ___
  587. ######################################################################
  588. # AVX+shrd code path
  589. #
  590. local *ror = sub { &shrd(@_[0],@_) };
  591. $code.=<<___;
  592. .type ${func}_avx,\@function,6
  593. .align 64
  594. ${func}_avx:
  595. .cfi_startproc
  596. .Lavx_shortcut:
  597. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  598. mov %rsp,%rax # copy %rsp
  599. .cfi_def_cfa_register %rax
  600. push %rbx
  601. .cfi_push %rbx
  602. push %rbp
  603. .cfi_push %rbp
  604. push %r12
  605. .cfi_push %r12
  606. push %r13
  607. .cfi_push %r13
  608. push %r14
  609. .cfi_push %r14
  610. push %r15
  611. .cfi_push %r15
  612. sub \$`$framesz+$win64*16*10`,%rsp
  613. and \$-64,%rsp # align stack frame
  614. shl \$6,$len
  615. sub $inp,$out # re-bias
  616. sub $inp,$in0
  617. add $inp,$len # end of input
  618. #mov $inp,$_inp # saved later
  619. mov $out,$_out
  620. mov $len,$_end
  621. #mov $key,$_key # remains resident in $inp register
  622. mov $ivp,$_ivp
  623. mov $ctx,$_ctx
  624. mov $in0,$_in0
  625. mov %rax,$_rsp
  626. .cfi_cfa_expression $_rsp,deref,+8
  627. ___
  628. $code.=<<___ if ($win64);
  629. movaps %xmm6,`$framesz+16*0`(%rsp)
  630. movaps %xmm7,`$framesz+16*1`(%rsp)
  631. movaps %xmm8,`$framesz+16*2`(%rsp)
  632. movaps %xmm9,`$framesz+16*3`(%rsp)
  633. movaps %xmm10,`$framesz+16*4`(%rsp)
  634. movaps %xmm11,`$framesz+16*5`(%rsp)
  635. movaps %xmm12,`$framesz+16*6`(%rsp)
  636. movaps %xmm13,`$framesz+16*7`(%rsp)
  637. movaps %xmm14,`$framesz+16*8`(%rsp)
  638. movaps %xmm15,`$framesz+16*9`(%rsp)
  639. ___
  640. $code.=<<___;
  641. .Lprologue_avx:
  642. vzeroall
  643. mov $inp,%r12 # borrow $a4
  644. lea 0x80($key),$inp # size optimization, reassign
  645. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
  646. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  647. mov $ctx,%r15 # borrow $a2
  648. mov $in0,%rsi # borrow $a3
  649. vmovdqu ($ivp),$iv # load IV
  650. sub \$9,%r14
  651. mov $SZ*0(%r15),$A
  652. mov $SZ*1(%r15),$B
  653. mov $SZ*2(%r15),$C
  654. mov $SZ*3(%r15),$D
  655. mov $SZ*4(%r15),$E
  656. mov $SZ*5(%r15),$F
  657. mov $SZ*6(%r15),$G
  658. mov $SZ*7(%r15),$H
  659. vmovdqa 0x00(%r13,%r14,8),$mask14
  660. vmovdqa 0x10(%r13,%r14,8),$mask12
  661. vmovdqa 0x20(%r13,%r14,8),$mask10
  662. vmovdqu 0x00-0x80($inp),$roundkey
  663. ___
  664. if ($SZ==4) { # SHA256
  665. my @X = map("%xmm$_",(0..3));
  666. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  667. $code.=<<___;
  668. jmp .Lloop_avx
  669. .align 16
  670. .Lloop_avx:
  671. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  672. vmovdqu 0x00(%rsi,%r12),@X[0]
  673. vmovdqu 0x10(%rsi,%r12),@X[1]
  674. vmovdqu 0x20(%rsi,%r12),@X[2]
  675. vmovdqu 0x30(%rsi,%r12),@X[3]
  676. vpshufb $t3,@X[0],@X[0]
  677. lea $TABLE(%rip),$Tbl
  678. vpshufb $t3,@X[1],@X[1]
  679. vpshufb $t3,@X[2],@X[2]
  680. vpaddd 0x00($Tbl),@X[0],$t0
  681. vpshufb $t3,@X[3],@X[3]
  682. vpaddd 0x20($Tbl),@X[1],$t1
  683. vpaddd 0x40($Tbl),@X[2],$t2
  684. vpaddd 0x60($Tbl),@X[3],$t3
  685. vmovdqa $t0,0x00(%rsp)
  686. mov $A,$a1
  687. vmovdqa $t1,0x10(%rsp)
  688. mov $B,$a3
  689. vmovdqa $t2,0x20(%rsp)
  690. xor $C,$a3 # magic
  691. vmovdqa $t3,0x30(%rsp)
  692. mov $E,$a0
  693. jmp .Lavx_00_47
  694. .align 16
  695. .Lavx_00_47:
  696. sub \$-16*2*$SZ,$Tbl # size optimization
  697. vmovdqu (%r12),$inout # $a4
  698. mov %r12,$_inp # $a4
  699. ___
  700. sub Xupdate_256_AVX () {
  701. (
  702. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  703. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  704. '&vpsrld ($t2,$t0,$sigma0[0]);',
  705. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  706. '&vpsrld ($t3,$t0,$sigma0[2])',
  707. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  708. '&vpxor ($t0,$t3,$t2)',
  709. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  710. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  711. '&vpxor ($t0,$t0,$t1)',
  712. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  713. '&vpxor ($t0,$t0,$t2)',
  714. '&vpsrld ($t2,$t3,$sigma1[2]);',
  715. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  716. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  717. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  718. '&vpxor ($t2,$t2,$t3);',
  719. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  720. '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
  721. '&vpshufd ($t2,$t2,0b10000100)',
  722. '&vpsrldq ($t2,$t2,8)',
  723. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  724. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  725. '&vpsrld ($t2,$t3,$sigma1[2])',
  726. '&vpsrlq ($t3,$t3,$sigma1[0])',
  727. '&vpxor ($t2,$t2,$t3);',
  728. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  729. '&vpxor ($t2,$t2,$t3)',
  730. '&vpshufd ($t2,$t2,0b11101000)',
  731. '&vpslldq ($t2,$t2,8)',
  732. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  733. );
  734. }
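# Xupdate_256_AVX() vectorizes the SHA256 message-schedule recurrence
#
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# four elements at a time: the X[1..4] window supplies the sigma0 term, the
# X[9..12] window the X[i-7] term, and sigma1 is applied first to X[14..15]
# and then to the freshly formed X[16..17], as the inline comments mark.
# Callers interleave these ~29 instructions with the scalar rounds.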
  735. sub AVX_256_00_47 () {
  736. my $j = shift;
  737. my $body = shift;
  738. my @X = @_;
  739. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  740. foreach (Xupdate_256_AVX()) { # 29 instructions
  741. eval;
  742. eval(shift(@insns));
  743. eval(shift(@insns));
  744. eval(shift(@insns));
  745. }
  746. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  747. foreach (@insns) { eval; } # remaining instructions
  748. &vmovdqa (16*$j."(%rsp)",$t2);
  749. }
  750. $aesni_cbc_idx=0;
  751. for ($i=0,$j=0; $j<4; $j++) {
  752. &AVX_256_00_47($j,\&body_00_15,@X);
  753. push(@X,shift(@X)); # rotate(@X)
  754. }
  755. &mov ("%r12",$_inp); # borrow $a4
  756. &vpand ($temp,$temp,$mask14);
  757. &mov ("%r15",$_out); # borrow $a2
  758. &vpor ($iv,$iv,$temp);
  759. &vmovdqu ("(%r15,%r12)",$iv); # write output
  760. &lea ("%r12","16(%r12)"); # inp++
  761. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  762. &jne (".Lavx_00_47");
  763. &vmovdqu ($inout,"(%r12)");
  764. &mov ($_inp,"%r12");
  765. $aesni_cbc_idx=0;
  766. for ($i=0; $i<16; ) {
  767. foreach(body_00_15()) { eval; }
  768. }
  769. }
  770. $code.=<<___;
  771. mov $_inp,%r12 # borrow $a4
  772. mov $_out,%r13 # borrow $a0
  773. mov $_ctx,%r15 # borrow $a2
  774. mov $_in0,%rsi # borrow $a3
  775. vpand $mask14,$temp,$temp
  776. mov $a1,$A
  777. vpor $temp,$iv,$iv
  778. vmovdqu $iv,(%r13,%r12) # write output
  779. lea 16(%r12),%r12 # inp++
  780. add $SZ*0(%r15),$A
  781. add $SZ*1(%r15),$B
  782. add $SZ*2(%r15),$C
  783. add $SZ*3(%r15),$D
  784. add $SZ*4(%r15),$E
  785. add $SZ*5(%r15),$F
  786. add $SZ*6(%r15),$G
  787. add $SZ*7(%r15),$H
  788. cmp $_end,%r12
  789. mov $A,$SZ*0(%r15)
  790. mov $B,$SZ*1(%r15)
  791. mov $C,$SZ*2(%r15)
  792. mov $D,$SZ*3(%r15)
  793. mov $E,$SZ*4(%r15)
  794. mov $F,$SZ*5(%r15)
  795. mov $G,$SZ*6(%r15)
  796. mov $H,$SZ*7(%r15)
  797. jb .Lloop_avx
  798. mov $_ivp,$ivp
  799. mov $_rsp,%rsi
  800. .cfi_def_cfa %rsi,8
  801. vmovdqu $iv,($ivp) # output IV
  802. vzeroall
  803. ___
  804. $code.=<<___ if ($win64);
  805. movaps `$framesz+16*0`(%rsp),%xmm6
  806. movaps `$framesz+16*1`(%rsp),%xmm7
  807. movaps `$framesz+16*2`(%rsp),%xmm8
  808. movaps `$framesz+16*3`(%rsp),%xmm9
  809. movaps `$framesz+16*4`(%rsp),%xmm10
  810. movaps `$framesz+16*5`(%rsp),%xmm11
  811. movaps `$framesz+16*6`(%rsp),%xmm12
  812. movaps `$framesz+16*7`(%rsp),%xmm13
  813. movaps `$framesz+16*8`(%rsp),%xmm14
  814. movaps `$framesz+16*9`(%rsp),%xmm15
  815. ___
  816. $code.=<<___;
  817. mov -48(%rsi),%r15
  818. .cfi_restore %r15
  819. mov -40(%rsi),%r14
  820. .cfi_restore %r14
  821. mov -32(%rsi),%r13
  822. .cfi_restore %r13
  823. mov -24(%rsi),%r12
  824. .cfi_restore %r12
  825. mov -16(%rsi),%rbp
  826. .cfi_restore %rbp
  827. mov -8(%rsi),%rbx
  828. .cfi_restore %rbx
  829. lea (%rsi),%rsp
  830. .cfi_def_cfa_register %rsp
  831. .Lepilogue_avx:
  832. ret
  833. .cfi_endproc
  834. .size ${func}_avx,.-${func}_avx
  835. ___
  836. if ($avx>1) {{
  837. ######################################################################
  838. # AVX2+BMI code path
  839. #
  840. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  841. my $PUSH8=8*2*$SZ;
  842. use integer;
  843. sub bodyx_00_15 () {
  844. # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
  845. (
  846. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  847. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  848. '&and ($a4,$e)', # f&e
  849. '&rorx ($a0,$e,$Sigma1[2])',
  850. '&rorx ($a2,$e,$Sigma1[1])',
  851. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  852. '&lea ($h,"($h,$a4)")',
  853. '&andn ($a4,$e,$g)', # ~e&g
  854. '&xor ($a0,$a2)',
  855. '&rorx ($a1,$e,$Sigma1[0])',
  856. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  857. '&xor ($a0,$a1)', # Sigma1(e)
  858. '&mov ($a2,$a)',
  859. '&rorx ($a4,$a,$Sigma0[2])',
  860. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  861. '&xor ($a2,$b)', # a^b, b^c in next round
  862. '&rorx ($a1,$a,$Sigma0[1])',
  863. '&rorx ($a0,$a,$Sigma0[0])',
  864. '&lea ($d,"($d,$h)")', # d+=h
  865. '&and ($a3,$a2)', # (b^c)&(a^b)
  866. @aesni_cbc_block[$aesni_cbc_idx++].
  867. '&xor ($a1,$a4)',
  868. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  869. '&xor ($a1,$a0)', # Sigma0(a)
  870. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  871. '&mov ($a4,$e)', # copy of f in future
  872. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  873. );
  874. # and at the finish one has to $a+=$a1
  875. }
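# The BMI2 flavour keeps the same round structure but uses rorx (which leaves
# the flags alone) plus lea for the additions, computes Ch(e,f,g) as
# (e&f)+(~e&g) via andn, and defers the Sigma0(a) term: it is left in $a1 and
# folded into the next round's a by the '&lea ($a,"($a,$a1)")' above, which
# is why $a1 must start out as zero and why the callers finish with
# "add $a1,$A".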
  876. $code.=<<___;
  877. .type ${func}_avx2,\@function,6
  878. .align 64
  879. ${func}_avx2:
  880. .cfi_startproc
  881. .Lavx2_shortcut:
  882. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  883. mov %rsp,%rax # copy %rsp
  884. .cfi_def_cfa_register %rax
  885. push %rbx
  886. .cfi_push %rbx
  887. push %rbp
  888. .cfi_push %rbp
  889. push %r12
  890. .cfi_push %r12
  891. push %r13
  892. .cfi_push %r13
  893. push %r14
  894. .cfi_push %r14
  895. push %r15
  896. .cfi_push %r15
  897. sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
  898. and \$-256*$SZ,%rsp # align stack frame
  899. add \$`2*$SZ*($rounds-8)`,%rsp
  900. shl \$6,$len
  901. sub $inp,$out # re-bias
  902. sub $inp,$in0
  903. add $inp,$len # end of input
  904. #mov $inp,$_inp # saved later
  905. #mov $out,$_out # kept in $offload
  906. mov $len,$_end
  907. #mov $key,$_key # remains resident in $inp register
  908. mov $ivp,$_ivp
  909. mov $ctx,$_ctx
  910. mov $in0,$_in0
  911. mov %rax,$_rsp
  912. .cfi_cfa_expression $_rsp,deref,+8
  913. ___
  914. $code.=<<___ if ($win64);
  915. movaps %xmm6,`$framesz+16*0`(%rsp)
  916. movaps %xmm7,`$framesz+16*1`(%rsp)
  917. movaps %xmm8,`$framesz+16*2`(%rsp)
  918. movaps %xmm9,`$framesz+16*3`(%rsp)
  919. movaps %xmm10,`$framesz+16*4`(%rsp)
  920. movaps %xmm11,`$framesz+16*5`(%rsp)
  921. movaps %xmm12,`$framesz+16*6`(%rsp)
  922. movaps %xmm13,`$framesz+16*7`(%rsp)
  923. movaps %xmm14,`$framesz+16*8`(%rsp)
  924. movaps %xmm15,`$framesz+16*9`(%rsp)
  925. ___
  926. $code.=<<___;
  927. .Lprologue_avx2:
  928. vzeroall
  929. mov $inp,%r13 # borrow $a0
  930. vpinsrq \$1,$out,$offload,$offload
  931. lea 0x80($key),$inp # size optimization, reassign
  932. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
  933. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  934. mov $ctx,%r15 # borrow $a2
  935. mov $in0,%rsi # borrow $a3
  936. vmovdqu ($ivp),$iv # load IV
  937. lea -9(%r14),%r14
  938. vmovdqa 0x00(%r12,%r14,8),$mask14
  939. vmovdqa 0x10(%r12,%r14,8),$mask12
  940. vmovdqa 0x20(%r12,%r14,8),$mask10
  941. sub \$-16*$SZ,%r13 # inp++, size optimization
  942. mov $SZ*0(%r15),$A
  943. lea (%rsi,%r13),%r12 # borrow $a0
  944. mov $SZ*1(%r15),$B
  945. cmp $len,%r13 # $_end
  946. mov $SZ*2(%r15),$C
  947. cmove %rsp,%r12 # next block or random data
  948. mov $SZ*3(%r15),$D
  949. mov $SZ*4(%r15),$E
  950. mov $SZ*5(%r15),$F
  951. mov $SZ*6(%r15),$G
  952. mov $SZ*7(%r15),$H
  953. vmovdqu 0x00-0x80($inp),$roundkey
  954. ___
  955. if ($SZ==4) { # SHA256
  956. my @X = map("%ymm$_",(0..3));
  957. my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
  958. $code.=<<___;
  959. jmp .Loop_avx2
  960. .align 16
  961. .Loop_avx2:
  962. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  963. vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
  964. vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
  965. vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
  966. vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
  967. vinserti128 \$1,(%r12),@X[0],@X[0]
  968. vinserti128 \$1,16(%r12),@X[1],@X[1]
  969. vpshufb $t3,@X[0],@X[0]
  970. vinserti128 \$1,32(%r12),@X[2],@X[2]
  971. vpshufb $t3,@X[1],@X[1]
  972. vinserti128 \$1,48(%r12),@X[3],@X[3]
  973. lea $TABLE(%rip),$Tbl
  974. vpshufb $t3,@X[2],@X[2]
  975. lea -16*$SZ(%r13),%r13
  976. vpaddd 0x00($Tbl),@X[0],$t0
  977. vpshufb $t3,@X[3],@X[3]
  978. vpaddd 0x20($Tbl),@X[1],$t1
  979. vpaddd 0x40($Tbl),@X[2],$t2
  980. vpaddd 0x60($Tbl),@X[3],$t3
  981. vmovdqa $t0,0x00(%rsp)
  982. xor $a1,$a1
  983. vmovdqa $t1,0x20(%rsp)
  984. ___
  985. $code.=<<___ if (!$win64);
  986. # temporarily use %rsi as frame pointer
  987. mov $_rsp,%rsi
  988. .cfi_def_cfa %rsi,8
  989. ___
  990. $code.=<<___;
  991. lea -$PUSH8(%rsp),%rsp
  992. ___
  993. $code.=<<___ if (!$win64);
  994. # the frame info is at $_rsp, but the stack is moving...
  995. # so a second frame pointer is saved at -8(%rsp)
  996. # that is in the red zone
  997. mov %rsi,-8(%rsp)
  998. .cfi_cfa_expression %rsp-8,deref,+8
  999. ___
  1000. $code.=<<___;
  1001. mov $B,$a3
  1002. vmovdqa $t2,0x00(%rsp)
  1003. xor $C,$a3 # magic
  1004. vmovdqa $t3,0x20(%rsp)
  1005. mov $F,$a4
  1006. sub \$-16*2*$SZ,$Tbl # size optimization
  1007. jmp .Lavx2_00_47
  1008. .align 16
  1009. .Lavx2_00_47:
  1010. vmovdqu (%r13),$inout
  1011. vpinsrq \$0,%r13,$offload,$offload
  1012. ___
  1013. sub AVX2_256_00_47 () {
  1014. my $j = shift;
  1015. my $body = shift;
  1016. my @X = @_;
  1017. my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
  1018. my $base = "+2*$PUSH8(%rsp)";
  1019. if (($j%2)==0) {
  1020. &lea ("%rsp","-$PUSH8(%rsp)");
  1021. $code.=<<___ if (!$win64);
  1022. .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
  1023. # copy secondary frame pointer to new location again at -8(%rsp)
  1024. pushq $PUSH8-8(%rsp)
  1025. .cfi_cfa_expression %rsp,deref,+8
  1026. lea 8(%rsp),%rsp
  1027. .cfi_cfa_expression %rsp-8,deref,+8
  1028. ___
  1029. }
  1030. foreach (Xupdate_256_AVX()) { # 29 instructions
  1031. eval;
  1032. eval(shift(@insns));
  1033. eval(shift(@insns));
  1034. eval(shift(@insns));
  1035. }
  1036. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1037. foreach (@insns) { eval; } # remaining instructions
  1038. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  1039. }
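# The AVX2 path schedules two 64-byte blocks at once: the low 128-bit lanes
# of @X carry the current block and the high lanes (filled by vinserti128)
# the following one. Every other AVX2_256_00_47() call grows the stack by
# $PUSH8 so that each round group's X+K row stays resident, and the
# .Lower_avx2 loop later replays the scalar rounds for the second block
# directly from those saved rows instead of recomputing the schedule.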
  1040. $aesni_cbc_idx=0;
  1041. for ($i=0,$j=0; $j<4; $j++) {
  1042. &AVX2_256_00_47($j,\&bodyx_00_15,@X);
  1043. push(@X,shift(@X)); # rotate(@X)
  1044. }
  1045. &vmovq ("%r13",$offload); # borrow $a0
  1046. &vpextrq ("%r15",$offload,1); # borrow $a2
  1047. &vpand ($temp,$temp,$mask14);
  1048. &vpor ($iv,$iv,$temp);
  1049. &vmovdqu ("(%r15,%r13)",$iv); # write output
  1050. &lea ("%r13","16(%r13)"); # inp++
  1051. &lea ($Tbl,16*2*$SZ."($Tbl)");
  1052. &cmpb (($SZ-1)."($Tbl)",0);
  1053. &jne (".Lavx2_00_47");
  1054. &vmovdqu ($inout,"(%r13)");
  1055. &vpinsrq ($offload,$offload,"%r13",0);
  1056. $aesni_cbc_idx=0;
  1057. for ($i=0; $i<16; ) {
  1058. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  1059. foreach(bodyx_00_15()) { eval; }
  1060. }
  1061. }
  1062. $code.=<<___;
  1063. vpextrq \$1,$offload,%r12 # $_out, borrow $a4
  1064. vmovq $offload,%r13 # $_inp, borrow $a0
  1065. mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
  1066. add $a1,$A
  1067. lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
  1068. vpand $mask14,$temp,$temp
  1069. vpor $temp,$iv,$iv
  1070. vmovdqu $iv,(%r12,%r13) # write output
  1071. lea 16(%r13),%r13
  1072. add $SZ*0(%r15),$A
  1073. add $SZ*1(%r15),$B
  1074. add $SZ*2(%r15),$C
  1075. add $SZ*3(%r15),$D
  1076. add $SZ*4(%r15),$E
  1077. add $SZ*5(%r15),$F
  1078. add $SZ*6(%r15),$G
  1079. add $SZ*7(%r15),$H
  1080. mov $A,$SZ*0(%r15)
  1081. mov $B,$SZ*1(%r15)
  1082. mov $C,$SZ*2(%r15)
  1083. mov $D,$SZ*3(%r15)
  1084. mov $E,$SZ*4(%r15)
  1085. mov $F,$SZ*5(%r15)
  1086. mov $G,$SZ*6(%r15)
  1087. mov $H,$SZ*7(%r15)
  1088. cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
  1089. je .Ldone_avx2
  1090. xor $a1,$a1
  1091. mov $B,$a3
  1092. mov $F,$a4
  1093. xor $C,$a3 # magic
  1094. jmp .Lower_avx2
  1095. .align 16
  1096. .Lower_avx2:
  1097. vmovdqu (%r13),$inout
  1098. vpinsrq \$0,%r13,$offload,$offload
  1099. ___
  1100. $aesni_cbc_idx=0;
  1101. for ($i=0; $i<16; ) {
  1102. my $base="+16($Tbl)";
  1103. foreach(bodyx_00_15()) { eval; }
  1104. &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
  1105. }
  1106. $code.=<<___;
  1107. vmovq $offload,%r13 # borrow $a0
  1108. vpextrq \$1,$offload,%r15 # borrow $a2
  1109. vpand $mask14,$temp,$temp
  1110. vpor $temp,$iv,$iv
  1111. lea -$PUSH8($Tbl),$Tbl
  1112. vmovdqu $iv,(%r15,%r13) # write output
  1113. lea 16(%r13),%r13 # inp++
  1114. cmp %rsp,$Tbl
  1115. jae .Lower_avx2
  1116. mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
  1117. lea 16*$SZ(%r13),%r13
  1118. mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
  1119. add $a1,$A
  1120. lea `2*$SZ*($rounds-8)`(%rsp),%rsp
  1121. add $SZ*0(%r15),$A
  1122. add $SZ*1(%r15),$B
  1123. add $SZ*2(%r15),$C
  1124. add $SZ*3(%r15),$D
  1125. add $SZ*4(%r15),$E
  1126. add $SZ*5(%r15),$F
  1127. add $SZ*6(%r15),$G
  1128. lea (%rsi,%r13),%r12
  1129. add $SZ*7(%r15),$H
  1130. cmp $_end,%r13
  1131. mov $A,$SZ*0(%r15)
  1132. cmove %rsp,%r12 # next block or stale data
  1133. mov $B,$SZ*1(%r15)
  1134. mov $C,$SZ*2(%r15)
  1135. mov $D,$SZ*3(%r15)
  1136. mov $E,$SZ*4(%r15)
  1137. mov $F,$SZ*5(%r15)
  1138. mov $G,$SZ*6(%r15)
  1139. mov $H,$SZ*7(%r15)
  1140. jbe .Loop_avx2
  1141. lea (%rsp),$Tbl
  1142. # temporarily use $Tbl as index to $_rsp
  1143. # this avoids the need to save a secondary frame pointer at -8(%rsp)
  1144. .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8
  1145. .Ldone_avx2:
  1146. mov 16*$SZ+4*8($Tbl),$ivp
  1147. mov 16*$SZ+7*8($Tbl),%rsi
  1148. .cfi_def_cfa %rsi,8
  1149. vmovdqu $iv,($ivp) # output IV
  1150. vzeroall
  1151. ___
  1152. $code.=<<___ if ($win64);
  1153. movaps `$framesz+16*0`($Tbl),%xmm6
  1154. movaps `$framesz+16*1`($Tbl),%xmm7
  1155. movaps `$framesz+16*2`($Tbl),%xmm8
  1156. movaps `$framesz+16*3`($Tbl),%xmm9
  1157. movaps `$framesz+16*4`($Tbl),%xmm10
  1158. movaps `$framesz+16*5`($Tbl),%xmm11
  1159. movaps `$framesz+16*6`($Tbl),%xmm12
  1160. movaps `$framesz+16*7`($Tbl),%xmm13
  1161. movaps `$framesz+16*8`($Tbl),%xmm14
  1162. movaps `$framesz+16*9`($Tbl),%xmm15
  1163. ___
  1164. $code.=<<___;
  1165. mov -48(%rsi),%r15
  1166. .cfi_restore %r15
  1167. mov -40(%rsi),%r14
  1168. .cfi_restore %r14
  1169. mov -32(%rsi),%r13
  1170. .cfi_restore %r13
  1171. mov -24(%rsi),%r12
  1172. .cfi_restore %r12
  1173. mov -16(%rsi),%rbp
  1174. .cfi_restore %rbp
  1175. mov -8(%rsi),%rbx
  1176. .cfi_restore %rbx
  1177. lea (%rsi),%rsp
  1178. .cfi_def_cfa_register %rsp
  1179. .Lepilogue_avx2:
  1180. ret
  1181. .cfi_endproc
  1182. .size ${func}_avx2,.-${func}_avx2
  1183. ___
  1184. }}
  1185. }}
  1186. {{
  1187. my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  1188. my ($rounds,$Tbl)=("%r11d","%rbx");
  1189. my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
  1190. my @rndkey=("%xmm4","%xmm5");
  1191. my $r=0;
  1192. my $sn=0;
  1193. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
  1194. my @MSG=map("%xmm$_",(10..13));
  1195. my $aesenc=sub {
  1196. use integer;
  1197. my ($n,$k)=($r/10,$r%10);
  1198. if ($k==0) {
  1199. $code.=<<___;
  1200. movups `16*$n`($in0),$in # load input
  1201. xorps $rndkey0,$in
  1202. ___
  1203. $code.=<<___ if ($n);
  1204. movups $iv,`16*($n-1)`($out,$in0) # write output
  1205. ___
  1206. $code.=<<___;
  1207. xorps $in,$iv
  1208. movups `32+16*$k-112`($key),$rndkey[1]
  1209. aesenc $rndkey[0],$iv
  1210. ___
  1211. } elsif ($k==9) {
  1212. $sn++;
  1213. $code.=<<___;
  1214. cmp \$11,$rounds
  1215. jb .Laesenclast$sn
  1216. movups `32+16*($k+0)-112`($key),$rndkey[1]
  1217. aesenc $rndkey[0],$iv
  1218. movups `32+16*($k+1)-112`($key),$rndkey[0]
  1219. aesenc $rndkey[1],$iv
  1220. je .Laesenclast$sn
  1221. movups `32+16*($k+2)-112`($key),$rndkey[1]
  1222. aesenc $rndkey[0],$iv
  1223. movups `32+16*($k+3)-112`($key),$rndkey[0]
  1224. aesenc $rndkey[1],$iv
  1225. .Laesenclast$sn:
  1226. aesenclast $rndkey[0],$iv
  1227. movups 16-112($key),$rndkey[1] # forward reference
  1228. nop
  1229. ___
  1230. } else {
  1231. $code.=<<___;
  1232. movups `32+16*$k-112`($key),$rndkey[1]
  1233. aesenc $rndkey[0],$iv
  1234. ___
  1235. }
  1236. $r++; unshift(@rndkey,pop(@rndkey));
  1237. };
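# $aesenc() emits one AES-CBC round per call, ten calls per 16-byte block:
# call 0 of a block loads the next input, xors it with round key 0 and the
# running IV, and stores the previous ciphertext block, while call 9 branches
# on $rounds so that 10-, 12- and 14-round key schedules all finish in the
# proper aesenclast. Interleaving these calls between the sha256rnds2 pairs
# below is what hides the serial AES-CBC dependency chain behind the SHA
# rounds.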
  1238. if ($shaext) {
  1239. my $Tbl="%rax";
  1240. $code.=<<___;
  1241. .type ${func}_shaext,\@function,6
  1242. .align 32
  1243. ${func}_shaext:
  1244. .cfi_startproc
  1245. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  1246. ___
  1247. $code.=<<___ if ($win64);
  1248. lea `-8-10*16`(%rsp),%rsp
  1249. movaps %xmm6,-8-10*16(%rax)
  1250. movaps %xmm7,-8-9*16(%rax)
  1251. movaps %xmm8,-8-8*16(%rax)
  1252. movaps %xmm9,-8-7*16(%rax)
  1253. movaps %xmm10,-8-6*16(%rax)
  1254. movaps %xmm11,-8-5*16(%rax)
  1255. movaps %xmm12,-8-4*16(%rax)
  1256. movaps %xmm13,-8-3*16(%rax)
  1257. movaps %xmm14,-8-2*16(%rax)
  1258. movaps %xmm15,-8-1*16(%rax)
  1259. .Lprologue_shaext:
  1260. ___
  1261. $code.=<<___;
  1262. lea K256+0x80(%rip),$Tbl
  1263. movdqu ($ctx),$ABEF # DCBA
  1264. movdqu 16($ctx),$CDGH # HGFE
  1265. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  1266. mov 240($key),$rounds
  1267. sub $in0,$out
  1268. movups ($key),$rndkey0 # $key[0]
  1269. movups ($ivp),$iv # load IV
  1270. movups 16($key),$rndkey[0] # forward reference
  1271. lea 112($key),$key # size optimization
  1272. pshufd \$0x1b,$ABEF,$Wi # ABCD
  1273. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  1274. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  1275. movdqa $TMP,$BSWAP # offload
  1276. palignr \$8,$CDGH,$ABEF # ABEF
  1277. punpcklqdq $Wi,$CDGH # CDGH
  1278. jmp .Loop_shaext
  1279. .align 16
  1280. .Loop_shaext:
  1281. movdqu ($inp),@MSG[0]
  1282. movdqu 0x10($inp),@MSG[1]
  1283. movdqu 0x20($inp),@MSG[2]
  1284. pshufb $TMP,@MSG[0]
  1285. movdqu 0x30($inp),@MSG[3]
  1286. movdqa 0*32-0x80($Tbl),$Wi
  1287. paddd @MSG[0],$Wi
  1288. pshufb $TMP,@MSG[1]
  1289. movdqa $CDGH,$CDGH_SAVE # offload
  1290. movdqa $ABEF,$ABEF_SAVE # offload
  1291. ___
  1292. &$aesenc();
  1293. $code.=<<___;
  1294. sha256rnds2 $ABEF,$CDGH # 0-3
  1295. pshufd \$0x0e,$Wi,$Wi
  1296. ___
  1297. &$aesenc();
  1298. $code.=<<___;
  1299. sha256rnds2 $CDGH,$ABEF
  1300. movdqa 1*32-0x80($Tbl),$Wi
  1301. paddd @MSG[1],$Wi
  1302. pshufb $TMP,@MSG[2]
  1303. lea 0x40($inp),$inp
  1304. ___
  1305. &$aesenc();
  1306. $code.=<<___;
  1307. sha256rnds2 $ABEF,$CDGH # 4-7
  1308. pshufd \$0x0e,$Wi,$Wi
  1309. ___
  1310. &$aesenc();
  1311. $code.=<<___;
  1312. sha256rnds2 $CDGH,$ABEF
  1313. movdqa 2*32-0x80($Tbl),$Wi
  1314. paddd @MSG[2],$Wi
  1315. pshufb $TMP,@MSG[3]
  1316. sha256msg1 @MSG[1],@MSG[0]
  1317. ___
  1318. &$aesenc();
  1319. $code.=<<___;
  1320. sha256rnds2 $ABEF,$CDGH # 8-11
  1321. pshufd \$0x0e,$Wi,$Wi
  1322. movdqa @MSG[3],$TMP
  1323. palignr \$4,@MSG[2],$TMP
  1324. paddd $TMP,@MSG[0]
  1325. ___
  1326. &$aesenc();
  1327. $code.=<<___;
  1328. sha256rnds2 $CDGH,$ABEF
  1329. movdqa 3*32-0x80($Tbl),$Wi
  1330. paddd @MSG[3],$Wi
  1331. sha256msg2 @MSG[3],@MSG[0]
  1332. sha256msg1 @MSG[2],@MSG[1]
  1333. ___
  1334. &$aesenc();
  1335. $code.=<<___;
  1336. sha256rnds2 $ABEF,$CDGH # 12-15
  1337. pshufd \$0x0e,$Wi,$Wi
  1338. ___
  1339. &$aesenc();
  1340. $code.=<<___;
  1341. movdqa @MSG[0],$TMP
  1342. palignr \$4,@MSG[3],$TMP
  1343. paddd $TMP,@MSG[1]
  1344. sha256rnds2 $CDGH,$ABEF
  1345. ___
  1346. for($i=4;$i<16-3;$i++) {
  1347. &$aesenc() if (($r%10)==0);
  1348. $code.=<<___;
  1349. movdqa $i*32-0x80($Tbl),$Wi
  1350. paddd @MSG[0],$Wi
  1351. sha256msg2 @MSG[0],@MSG[1]
  1352. sha256msg1 @MSG[3],@MSG[2]
  1353. ___
  1354. &$aesenc();
  1355. $code.=<<___;
  1356. sha256rnds2 $ABEF,$CDGH # 16-19...
  1357. pshufd \$0x0e,$Wi,$Wi
  1358. movdqa @MSG[1],$TMP
  1359. palignr \$4,@MSG[0],$TMP
  1360. paddd $TMP,@MSG[2]
  1361. ___
  1362. &$aesenc();
  1363. &$aesenc() if ($r==19);
  1364. $code.=<<___;
  1365. sha256rnds2 $CDGH,$ABEF
  1366. ___
  1367. push(@MSG,shift(@MSG));
  1368. }
  1369. $code.=<<___;
  1370. movdqa 13*32-0x80($Tbl),$Wi
  1371. paddd @MSG[0],$Wi
  1372. sha256msg2 @MSG[0],@MSG[1]
  1373. sha256msg1 @MSG[3],@MSG[2]
  1374. ___
  1375. &$aesenc();
  1376. $code.=<<___;
  1377. sha256rnds2 $ABEF,$CDGH # 52-55
  1378. pshufd \$0x0e,$Wi,$Wi
  1379. movdqa @MSG[1],$TMP
  1380. palignr \$4,@MSG[0],$TMP
  1381. paddd $TMP,@MSG[2]
  1382. ___
  1383. &$aesenc();
  1384. &$aesenc();
  1385. $code.=<<___;
  1386. sha256rnds2 $CDGH,$ABEF
  1387. movdqa 14*32-0x80($Tbl),$Wi
  1388. paddd @MSG[1],$Wi
  1389. sha256msg2 @MSG[1],@MSG[2]
  1390. movdqa $BSWAP,$TMP
  1391. ___
  1392. &$aesenc();
  1393. $code.=<<___;
  1394. sha256rnds2 $ABEF,$CDGH # 56-59
  1395. pshufd \$0x0e,$Wi,$Wi
  1396. ___
  1397. &$aesenc();
  1398. $code.=<<___;
  1399. sha256rnds2 $CDGH,$ABEF
  1400. movdqa 15*32-0x80($Tbl),$Wi
  1401. paddd @MSG[2],$Wi
  1402. ___
  1403. &$aesenc();
  1404. &$aesenc();
  1405. $code.=<<___;
  1406. sha256rnds2 $ABEF,$CDGH # 60-63
  1407. pshufd \$0x0e,$Wi,$Wi
  1408. ___
  1409. &$aesenc();
  1410. $code.=<<___;
  1411. sha256rnds2 $CDGH,$ABEF
  1412. #pxor $CDGH,$rndkey0 # black magic
  1413. ___
  1414. while ($r<40) { &$aesenc(); } # remaining aesenc's
  1415. $code.=<<___;
  1416. #xorps $CDGH,$rndkey0 # black magic
  1417. paddd $CDGH_SAVE,$CDGH
  1418. paddd $ABEF_SAVE,$ABEF
  1419. dec $len
  1420. movups $iv,48($out,$in0) # write output
  1421. lea 64($in0),$in0
  1422. jnz .Loop_shaext
  1423. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  1424. pshufd \$0x1b,$ABEF,$TMP # FEBA
  1425. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  1426. punpckhqdq $CDGH,$ABEF # DCBA
  1427. palignr \$8,$TMP,$CDGH # HGFE
  1428. movups $iv,($ivp) # write IV
  1429. movdqu $ABEF,($ctx)
  1430. movdqu $CDGH,16($ctx)
  1431. ___
  1432. $code.=<<___ if ($win64);
  1433. movaps 0*16(%rsp),%xmm6
  1434. movaps 1*16(%rsp),%xmm7
  1435. movaps 2*16(%rsp),%xmm8
  1436. movaps 3*16(%rsp),%xmm9
  1437. movaps 4*16(%rsp),%xmm10
  1438. movaps 5*16(%rsp),%xmm11
  1439. movaps 6*16(%rsp),%xmm12
  1440. movaps 7*16(%rsp),%xmm13
  1441. movaps 8*16(%rsp),%xmm14
  1442. movaps 9*16(%rsp),%xmm15
  1443. lea 8+10*16(%rsp),%rsp
  1444. .Lepilogue_shaext:
  1445. ___
  1446. $code.=<<___;
  1447. ret
  1448. .cfi_endproc
  1449. .size ${func}_shaext,.-${func}_shaext
  1450. ___
  1451. }
  1452. }}}}}
  1453. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1454. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
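# se_handler() follows the usual perlasm unwind pattern: HandlerData[0] and
# HandlerData[1] of each .LSEH_info_* entry are the RVAs of the prologue and
# epilogue labels, and only faults between them require recovering the saved
# GPRs/XMM registers and the caller's %rsp (read back via $_rsp, with an
# extra fix-up for the avx2 frame whose stack pointer moves) before handing
# off to RtlVirtualUnwind.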
  1455. if ($win64 && $avx) {
  1456. $rec="%rcx";
  1457. $frame="%rdx";
  1458. $context="%r8";
  1459. $disp="%r9";
  1460. $code.=<<___;
  1461. .extern __imp_RtlVirtualUnwind
  1462. .type se_handler,\@abi-omnipotent
  1463. .align 16
  1464. se_handler:
  1465. push %rsi
  1466. push %rdi
  1467. push %rbx
  1468. push %rbp
  1469. push %r12
  1470. push %r13
  1471. push %r14
  1472. push %r15
  1473. pushfq
  1474. sub \$64,%rsp
  1475. mov 120($context),%rax # pull context->Rax
  1476. mov 248($context),%rbx # pull context->Rip
  1477. mov 8($disp),%rsi # disp->ImageBase
  1478. mov 56($disp),%r11 # disp->HandlerData
  1479. mov 0(%r11),%r10d # HandlerData[0]
  1480. lea (%rsi,%r10),%r10 # prologue label
  1481. cmp %r10,%rbx # context->Rip<prologue label
  1482. jb .Lin_prologue
  1483. mov 152($context),%rax # pull context->Rsp
  1484. mov 4(%r11),%r10d # HandlerData[1]
  1485. lea (%rsi,%r10),%r10 # epilogue label
  1486. cmp %r10,%rbx # context->Rip>=epilogue label
  1487. jae .Lin_prologue
  1488. ___
  1489. $code.=<<___ if ($shaext);
  1490. lea aesni_cbc_sha256_enc_shaext(%rip),%r10
  1491. cmp %r10,%rbx
  1492. jb .Lnot_in_shaext
  1493. lea (%rax),%rsi
  1494. lea 512($context),%rdi # &context.Xmm6
  1495. mov \$20,%ecx
  1496. .long 0xa548f3fc # cld; rep movsq
  1497. lea 168(%rax),%rax # adjust stack pointer
  1498. jmp .Lin_prologue
  1499. .Lnot_in_shaext:
  1500. ___
  1501. $code.=<<___ if ($avx>1);
  1502. lea .Lavx2_shortcut(%rip),%r10
  1503. cmp %r10,%rbx # context->Rip<avx2_shortcut
  1504. jb .Lnot_in_avx2
  1505. and \$-256*$SZ,%rax
  1506. add \$`2*$SZ*($rounds-8)`,%rax
  1507. .Lnot_in_avx2:
  1508. ___
  1509. $code.=<<___;
  1510. mov %rax,%rsi # put aside Rsp
  1511. mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
  1512. mov -8(%rax),%rbx
  1513. mov -16(%rax),%rbp
  1514. mov -24(%rax),%r12
  1515. mov -32(%rax),%r13
  1516. mov -40(%rax),%r14
  1517. mov -48(%rax),%r15
  1518. mov %rbx,144($context) # restore context->Rbx
  1519. mov %rbp,160($context) # restore context->Rbp
  1520. mov %r12,216($context) # restore context->R12
  1521. mov %r13,224($context) # restore context->R13
  1522. mov %r14,232($context) # restore context->R14
  1523. mov %r15,240($context) # restore context->R15
  1524. lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
  1525. lea 512($context),%rdi # &context.Xmm6
  1526. mov \$20,%ecx
  1527. .long 0xa548f3fc # cld; rep movsq
  1528. .Lin_prologue:
  1529. mov 8(%rax),%rdi
  1530. mov 16(%rax),%rsi
  1531. mov %rax,152($context) # restore context->Rsp
  1532. mov %rsi,168($context) # restore context->Rsi
  1533. mov %rdi,176($context) # restore context->Rdi
  1534. mov 40($disp),%rdi # disp->ContextRecord
  1535. mov $context,%rsi # context
  1536. mov \$154,%ecx # sizeof(CONTEXT)
  1537. .long 0xa548f3fc # cld; rep movsq
  1538. mov $disp,%rsi
  1539. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1540. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1541. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1542. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1543. mov 40(%rsi),%r10 # disp->ContextRecord
  1544. lea 56(%rsi),%r11 # &disp->HandlerData
  1545. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1546. mov %r10,32(%rsp) # arg5
  1547. mov %r11,40(%rsp) # arg6
  1548. mov %r12,48(%rsp) # arg7
  1549. mov %rcx,56(%rsp) # arg8, (NULL)
  1550. call *__imp_RtlVirtualUnwind(%rip)
  1551. mov \$1,%eax # ExceptionContinueSearch
  1552. add \$64,%rsp
  1553. popfq
  1554. pop %r15
  1555. pop %r14
  1556. pop %r13
  1557. pop %r12
  1558. pop %rbp
  1559. pop %rbx
  1560. pop %rdi
  1561. pop %rsi
  1562. ret
  1563. .size se_handler,.-se_handler
  1564. .section .pdata
  1565. .rva .LSEH_begin_${func}_xop
  1566. .rva .LSEH_end_${func}_xop
  1567. .rva .LSEH_info_${func}_xop
  1568. .rva .LSEH_begin_${func}_avx
  1569. .rva .LSEH_end_${func}_avx
  1570. .rva .LSEH_info_${func}_avx
  1571. ___
  1572. $code.=<<___ if ($avx>1);
  1573. .rva .LSEH_begin_${func}_avx2
  1574. .rva .LSEH_end_${func}_avx2
  1575. .rva .LSEH_info_${func}_avx2
  1576. ___
  1577. $code.=<<___ if ($shaext);
  1578. .rva .LSEH_begin_${func}_shaext
  1579. .rva .LSEH_end_${func}_shaext
  1580. .rva .LSEH_info_${func}_shaext
  1581. ___
  1582. $code.=<<___;
  1583. .section .xdata
  1584. .align 8
  1585. .LSEH_info_${func}_xop:
  1586. .byte 9,0,0,0
  1587. .rva se_handler
  1588. .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
  1589. .LSEH_info_${func}_avx:
  1590. .byte 9,0,0,0
  1591. .rva se_handler
  1592. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  1593. ___
  1594. $code.=<<___ if ($avx>1);
  1595. .LSEH_info_${func}_avx2:
  1596. .byte 9,0,0,0
  1597. .rva se_handler
  1598. .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
  1599. ___
  1600. $code.=<<___ if ($shaext);
  1601. .LSEH_info_${func}_shaext:
  1602. .byte 9,0,0,0
  1603. .rva se_handler
  1604. .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
  1605. ___
  1606. }
  1607. ####################################################################
  1608. sub rex {
  1609. local *opcode=shift;
  1610. my ($dst,$src)=@_;
  1611. my $rex=0;
  1612. $rex|=0x04 if($dst>=8);
  1613. $rex|=0x01 if($src>=8);
  1614. unshift @opcode,$rex|0x40 if($rex);
  1615. }
  1616. {
  1617. my %opcodelet = (
  1618. "sha256rnds2" => 0xcb,
  1619. "sha256msg1" => 0xcc,
  1620. "sha256msg2" => 0xcd );
  1621. sub sha256op38 {
  1622. my $instr = shift;
  1623. if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  1624. my @opcode=(0x0f,0x38);
  1625. rex(\@opcode,$2,$1);
  1626. push @opcode,$opcodelet{$instr};
  1627. push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
  1628. return ".byte\t".join(',',@opcode);
  1629. } else {
  1630. return $instr."\t".@_[0];
  1631. }
  1632. }
  1633. }
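# Illustration of the encoder above (derived from %opcodelet and rex()):
# "sha256rnds2 %xmm0,%xmm2" becomes ".byte 0x0f,0x38,0xcb,0xd0", i.e. opcode
# 0xcb with ModR/M 0xd0 encoding dst=%xmm2, src=%xmm0, and a REX prefix is
# prepended whenever either register is %xmm8 or higher. This keeps the
# output assembleable even where the assembler lacks SHA extension support.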
  1634. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1635. $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
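# Final pass: the first substitution evaluates every `...` arithmetic
# expression embedded in the assembly text (e.g. stack offsets built from
# $framesz) at generation time, and the second rewrites sha256* mnemonics
# through sha256op38() above; the result is then printed through the
# x86_64-xlate.pl pipe opened earlier.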
  1636. print $code;
  1637. close STDOUT or die "error closing STDOUT: $!";