aesni-sha1-x86_64.pl

  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # June 2011
  11. #
  12. # This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
  13. # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
  14. # that since AESNI-CBC encrypt exhibits *very* low instruction-level
  15. # parallelism, interleaving it with another algorithm allows one to
  16. # utilize processor resources better and achieve better performance.
  17. # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
  18. # AESNI code is woven into them. Below are performance numbers in
  19. # cycles per processed byte (less is better) for standalone AESNI-CBC
  20. # encrypt, the sum of the latter and standalone SHA1, and the "stitched"
  21. # subroutine:
  22. #
  23. #                AES-128-CBC      +SHA1         stitch      gain
  24. # Westmere       3.77[+5.3]        9.07          6.55       +38%
  25. # Sandy Bridge   5.05[+5.0(6.1)]  10.06(11.15)   5.98(7.05)  +68%(+58%)
  26. # Ivy Bridge     5.05[+4.6]        9.65          5.54       +74%
  27. # Haswell        4.43[+3.6(4.2)]   8.00(8.58)    4.55(5.21)  +75%(+65%)
  28. # Bulldozer      5.77[+6.0]       11.72          6.37       +84%
  29. #
  30. # AES-192-CBC
  31. # Westmere       4.51              9.81          6.80       +44%
  32. # Sandy Bridge   6.05             11.06(12.15)   6.11(7.19)  +81%(+69%)
  33. # Ivy Bridge     6.05             10.65          6.07       +75%
  34. # Haswell        5.29              8.86(9.44)    5.32(5.32)  +67%(+77%)
  35. # Bulldozer      6.89             12.84          6.96       +84%
  36. #
  37. # AES-256-CBC
  38. # Westmere       5.25             10.55          7.21       +46%
  39. # Sandy Bridge   7.05             12.06(13.15)   7.12(7.72)  +69%(+70%)
  40. # Ivy Bridge     7.05             11.65          7.12       +64%
  41. # Haswell        6.19              9.76(10.34)   6.21(6.25)  +57%(+65%)
  42. # Bulldozer      8.00             13.95          8.25       +69%
  43. #
  44. # (*) There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
  45. # background information. The numbers in parentheses above are SSSE3
  46. # results collected on an AVX-capable CPU, i.e. they apply to OSes that
  47. # don't support AVX.
  48. #
  49. # Needless to mention, it makes no sense to implement a "stitched"
  50. # *decrypt* subroutine: because *both* AESNI-CBC decrypt and SHA1
  51. # already fully utilize available parallelism, stitching would not
  52. # give any gain anyway. Well, there might be some, e.g. because of
  53. # better cache locality... For reference, here are performance
  54. # results for standalone AESNI-CBC decrypt:
  55. #
  56. #                AES-128-CBC    AES-192-CBC    AES-256-CBC
  57. # Westmere           1.25           1.50           1.75
  58. # Sandy Bridge       0.74           0.91           1.09
  59. # Ivy Bridge         0.74           0.90           1.11
  60. # Haswell            0.63           0.76           0.88
  61. # Bulldozer           0.70           0.85           0.99
  62. # And indeed:
  63. #
  64. #                AES-256-CBC      +SHA1         stitch      gain
  65. # Westmere       1.75              7.20          6.68       +7.8%
  66. # Sandy Bridge   1.09              6.09(7.22)    5.82(6.95)  +4.6%(+3.9%)
  67. # Ivy Bridge     1.11              5.70          5.45       +4.6%
  68. # Haswell        0.88              4.45(5.00)    4.39(4.69)  +1.4%(*)(+6.6%)
  69. # Bulldozer      0.99              6.95          5.95       +17%(**)
  70. #
  71. # (*)  The tiny improvement on Haswell is because the AVX1 stitch is
  72. #      compared against a sum that includes AVX2 SHA1.
  73. # (**) Execution is fully dominated by the integer code sequence and
  74. #      SIMD still hardly shows [in a single-process benchmark;-]
  75. $flavour = shift;
  76. $output = shift;
  77. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  78. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  79. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  80. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  81. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  82. die "can't locate x86_64-xlate.pl";
  83. $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  84. =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
  85. $1>=2.19);
  86. $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  87. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
  88. $1>=2.09);
  89. $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  90. `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
  91. $1>=10);
  92. $avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
  93. $shaext=1; ### set to zero if compiling for 1.0.1
  94. $stitched_decrypt=0;
  95. open OUT,"| \"$^X\" $xlate $flavour $output";
  96. *STDOUT=*OUT;
  97. # void aesni_cbc_sha1_enc(const void *inp,
  98. # void *out,
  99. # size_t length,
  100. # const AES_KEY *key,
  101. # unsigned char *iv,
  102. # SHA_CTX *ctx,
  103. # const void *in0);
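#
# A hypothetical caller sketch (editorial illustration only; buffer and key
# names are made up). As the code below shows (shl \$6,$len), |length| is
# given in 64-byte SHA-1 blocks; the 1st argument is the data fed to
# AES-CBC and the 7th is the data fed to SHA1:
#
#	aesni_cbc_sha1_enc(ptext, ctext, blocks, &aes_key, iv, &sha_ctx, ptext);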
  104. $code.=<<___;
  105. .text
  106. .extern OPENSSL_ia32cap_P
  107. .globl aesni_cbc_sha1_enc
  108. .type aesni_cbc_sha1_enc,\@abi-omnipotent
  109. .align 32
  110. aesni_cbc_sha1_enc:
  111. # caller should check for SSSE3 and AES-NI bits
  112. mov OPENSSL_ia32cap_P+0(%rip),%r10d
  113. mov OPENSSL_ia32cap_P+4(%rip),%r11
  114. ___
  115. $code.=<<___ if ($shaext);
  116. bt \$61,%r11 # check SHA bit
  117. jc aesni_cbc_sha1_enc_shaext
  118. ___
  119. $code.=<<___ if ($avx);
  120. and \$`1<<28`,%r11d # mask AVX bit
  121. and \$`1<<30`,%r10d # mask "Intel CPU" bit
  122. or %r11d,%r10d
  123. cmp \$`1<<28|1<<30`,%r10d
  124. je aesni_cbc_sha1_enc_avx
  125. ___
  126. $code.=<<___;
  127. jmp aesni_cbc_sha1_enc_ssse3
  128. ret
  129. .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
  130. ___
  131. my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  132. my $Xi=4;
  133. my @X=map("%xmm$_",(4..7,0..3));
  134. my @Tx=map("%xmm$_",(8..10));
  135. my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
  136. my @T=("%esi","%edi");
  137. my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
  138. my $K_XX_XX="%r11";
  139. my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc
  140. my @rndkey=("%xmm14","%xmm15"); # for enc
  141. my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
  142. if (1) { # reassign for Atom Silvermont
  143. # The goal is to minimize the number of instructions with more than
  144. # 3 prefix bytes. Or, in more practical terms, to keep AES-NI *and*
  145. # SSSE3 instructions in the upper half of the register bank.
  146. @X=map("%xmm$_",(8..11,4..7));
  147. @Tx=map("%xmm$_",(12,13,3));
  148. ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
  149. @rndkey=("%xmm0","%xmm1");
  150. }
  151. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  152. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  153. my $arg = pop;
  154. $arg = "\$$arg" if ($arg*1 eq $arg);
  155. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  156. }
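#
# Editorial illustration (assuming $a="%eax", $b="%ebx"): calls like
#	&rol($a,5);			# appends "\trol\t\$5,%eax\n" to $code
#	&ror($b,7);			# appends "\tror\t\$7,%ebx\n"
# have no explicit sub and fall through to the AUTOLOAD thunk above, which
# turns a trailing numeric argument into an immediate and emits operands in
# AT&T (source-first) order.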
  157. my $_rol=sub { &rol(@_) };
  158. my $_ror=sub { &ror(@_) };
  159. $code.=<<___;
  160. .type aesni_cbc_sha1_enc_ssse3,\@function,6
  161. .align 32
  162. aesni_cbc_sha1_enc_ssse3:
  163. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  164. #shr \$6,$len # debugging artefact
  165. #jz .Lepilogue_ssse3 # debugging artefact
  166. push %rbx
  167. push %rbp
  168. push %r12
  169. push %r13
  170. push %r14
  171. push %r15
  172. lea `-104-($win64?10*16:0)`(%rsp),%rsp
  173. #mov $in0,$inp # debugging artefact
  174. #lea 64(%rsp),$ctx # debugging artefact
  175. ___
  176. $code.=<<___ if ($win64);
  177. movaps %xmm6,96+0(%rsp)
  178. movaps %xmm7,96+16(%rsp)
  179. movaps %xmm8,96+32(%rsp)
  180. movaps %xmm9,96+48(%rsp)
  181. movaps %xmm10,96+64(%rsp)
  182. movaps %xmm11,96+80(%rsp)
  183. movaps %xmm12,96+96(%rsp)
  184. movaps %xmm13,96+112(%rsp)
  185. movaps %xmm14,96+128(%rsp)
  186. movaps %xmm15,96+144(%rsp)
  187. .Lprologue_ssse3:
  188. ___
  189. $code.=<<___;
  190. mov $in0,%r12 # reassign arguments
  191. mov $out,%r13
  192. mov $len,%r14
  193. lea 112($key),%r15 # size optimization
  194. movdqu ($ivp),$iv # load IV
  195. mov $ivp,88(%rsp) # save $ivp
  196. ___
  197. ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
  198. my $rounds="${ivp}d";
  199. $code.=<<___;
  200. shl \$6,$len
  201. sub $in0,$out
  202. mov 240-112($key),$rounds
  203. add $inp,$len # end of input
  204. lea K_XX_XX(%rip),$K_XX_XX
  205. mov 0($ctx),$A # load context
  206. mov 4($ctx),$B
  207. mov 8($ctx),$C
  208. mov 12($ctx),$D
  209. mov $B,@T[0] # magic seed
  210. mov 16($ctx),$E
  211. mov $C,@T[1]
  212. xor $D,@T[1]
  213. and @T[1],@T[0]
  214. movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
  215. movdqa 0($K_XX_XX),@Tx[1] # K_00_19
  216. movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
  217. movdqu 16($inp),@X[-3&7]
  218. movdqu 32($inp),@X[-2&7]
  219. movdqu 48($inp),@X[-1&7]
  220. pshufb @Tx[2],@X[-4&7] # byte swap
  221. pshufb @Tx[2],@X[-3&7]
  222. pshufb @Tx[2],@X[-2&7]
  223. add \$64,$inp
  224. paddd @Tx[1],@X[-4&7] # add K_00_19
  225. pshufb @Tx[2],@X[-1&7]
  226. paddd @Tx[1],@X[-3&7]
  227. paddd @Tx[1],@X[-2&7]
  228. movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
  229. psubd @Tx[1],@X[-4&7] # restore X[]
  230. movdqa @X[-3&7],16(%rsp)
  231. psubd @Tx[1],@X[-3&7]
  232. movdqa @X[-2&7],32(%rsp)
  233. psubd @Tx[1],@X[-2&7]
  234. movups -112($key),$rndkey0 # $key[0]
  235. movups 16-112($key),$rndkey[0] # forward reference
  236. jmp .Loop_ssse3
  237. ___
  238. my $aesenc=sub {
  239. use integer;
  240. my ($n,$k)=($r/10,$r%10);
  241. if ($k==0) {
  242. $code.=<<___;
  243. movups `16*$n`($in0),$in # load input
  244. xorps $rndkey0,$in
  245. ___
  246. $code.=<<___ if ($n);
  247. movups $iv,`16*($n-1)`($out,$in0) # write output
  248. ___
  249. $code.=<<___;
  250. xorps $in,$iv
  251. movups `32+16*$k-112`($key),$rndkey[1]
  252. aesenc $rndkey[0],$iv
  253. ___
  254. } elsif ($k==9) {
  255. $sn++;
  256. $code.=<<___;
  257. cmp \$11,$rounds
  258. jb .Laesenclast$sn
  259. movups `32+16*($k+0)-112`($key),$rndkey[1]
  260. aesenc $rndkey[0],$iv
  261. movups `32+16*($k+1)-112`($key),$rndkey[0]
  262. aesenc $rndkey[1],$iv
  263. je .Laesenclast$sn
  264. movups `32+16*($k+2)-112`($key),$rndkey[1]
  265. aesenc $rndkey[0],$iv
  266. movups `32+16*($k+3)-112`($key),$rndkey[0]
  267. aesenc $rndkey[1],$iv
  268. .Laesenclast$sn:
  269. aesenclast $rndkey[0],$iv
  270. movups 16-112($key),$rndkey[1] # forward reference
  271. ___
  272. } else {
  273. $code.=<<___;
  274. movups `32+16*$k-112`($key),$rndkey[1]
  275. aesenc $rndkey[0],$iv
  276. ___
  277. }
  278. $r++; unshift(@rndkey,pop(@rndkey));
  279. };
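#
# Editorial illustration: the very first invocation ($r==0, i.e. $n==0,
# $k==0) emits roughly the following ($key points 112 bytes into the key
# schedule, so -112($key) is round key 0):
#
#	movups	0($in0),$in		# load plaintext block 0
#	xorps	$rndkey0,$in		# pre-XOR with round key 0
#	xorps	$in,$iv			# $iv ^= block ^ rk[0]: CBC + whitening
#	movups	-80($key),$rndkey[1]	# fetch rk[2] while...
#	aesenc	$rndkey[0],$iv		# ...round 1 runs on pre-loaded rk[1]
#
# Each call advances $r; $r/10 selects the 16-byte block and $r%10 the round
# within it, and unshift(@rndkey,pop(@rndkey)) swaps the two round-key
# registers so fetch and use keep alternating.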
  280. sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
  281. { use integer;
  282. my $body = shift;
  283. my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
  284. my ($a,$b,$c,$d,$e);
  285. eval(shift(@insns)); # ror
  286. &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
  287. eval(shift(@insns));
  288. &movdqa (@Tx[0],@X[-1&7]);
  289. &paddd (@Tx[1],@X[-1&7]);
  290. eval(shift(@insns));
  291. eval(shift(@insns));
  292. &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
  293. eval(shift(@insns));
  294. eval(shift(@insns)); # rol
  295. eval(shift(@insns));
  296. &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
  297. eval(shift(@insns));
  298. eval(shift(@insns));
  299. &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
  300. eval(shift(@insns));
  301. eval(shift(@insns)); # ror
  302. &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
  303. eval(shift(@insns));
  304. eval(shift(@insns));
  305. eval(shift(@insns));
  306. &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
  307. eval(shift(@insns));
  308. eval(shift(@insns)); # rol
  309. &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  310. eval(shift(@insns));
  311. eval(shift(@insns));
  312. &movdqa (@Tx[2],@X[0]);
  313. eval(shift(@insns));
  314. eval(shift(@insns));
  315. eval(shift(@insns)); # ror
  316. &movdqa (@Tx[0],@X[0]);
  317. eval(shift(@insns));
  318. &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
  319. &paddd (@X[0],@X[0]);
  320. eval(shift(@insns));
  321. eval(shift(@insns));
  322. &psrld (@Tx[0],31);
  323. eval(shift(@insns));
  324. eval(shift(@insns)); # rol
  325. eval(shift(@insns));
  326. &movdqa (@Tx[1],@Tx[2]);
  327. eval(shift(@insns));
  328. eval(shift(@insns));
  329. &psrld (@Tx[2],30);
  330. eval(shift(@insns));
  331. eval(shift(@insns)); # ror
  332. &por (@X[0],@Tx[0]); # "X[0]"<<<=1
  333. eval(shift(@insns));
  334. eval(shift(@insns));
  335. eval(shift(@insns));
  336. &pslld (@Tx[1],2);
  337. &pxor (@X[0],@Tx[2]);
  338. eval(shift(@insns));
  339. &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
  340. eval(shift(@insns)); # rol
  341. eval(shift(@insns));
  342. eval(shift(@insns));
  343. &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
  344. &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
  345. foreach (@insns) { eval; } # remaining instructions [if any]
  346. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  347. push(@Tx,shift(@Tx));
  348. }
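#
# Editorial note on the tail above: each call produces four new W[] words,
# but the "X[-3]" term for the top lane is not available yet within the same
# vector, so it is taken as zero and patched afterwards. Because
# rol(x^y,1) == rol(x,1)^rol(y,1), the missing contribution to lane 3 is a
# rol-by-2 of the pre-rotation lane-0 value, which is what the
# pslldq/psrld/pslld/pxor sequence injects ('"X[0]"^=("X[0]">>96)<<<2').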
  349. sub Xupdate_ssse3_32_79()
  350. { use integer;
  351. my $body = shift;
  352. my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
  353. my ($a,$b,$c,$d,$e);
  354. eval(shift(@insns)) if ($Xi==8);
  355. &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
  356. eval(shift(@insns)) if ($Xi==8);
  357. eval(shift(@insns)); # body_20_39
  358. eval(shift(@insns));
  359. eval(shift(@insns)) if (@insns[1] =~ /_ror/);
  360. eval(shift(@insns)) if (@insns[0] =~ /_ror/);
  361. &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
  362. eval(shift(@insns));
  363. eval(shift(@insns)); # rol
  364. &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
  365. eval(shift(@insns));
  366. eval(shift(@insns));
  367. if ($Xi%5) {
  368. &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
  369. } else { # ... or load next one
  370. &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
  371. }
  372. eval(shift(@insns)); # ror
  373. &paddd (@Tx[1],@X[-1&7]);
  374. eval(shift(@insns));
  375. &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
  376. eval(shift(@insns)); # body_20_39
  377. eval(shift(@insns));
  378. eval(shift(@insns));
  379. eval(shift(@insns)); # rol
  380. eval(shift(@insns)) if (@insns[0] =~ /_ror/);
  381. &movdqa (@Tx[0],@X[0]);
  382. eval(shift(@insns));
  383. eval(shift(@insns));
  384. &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  385. eval(shift(@insns)); # ror
  386. eval(shift(@insns));
  387. eval(shift(@insns)); # body_20_39
  388. &pslld (@X[0],2);
  389. eval(shift(@insns));
  390. eval(shift(@insns));
  391. &psrld (@Tx[0],30);
  392. eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
  393. eval(shift(@insns));
  394. eval(shift(@insns));
  395. eval(shift(@insns)); # ror
  396. &por (@X[0],@Tx[0]); # "X[0]"<<<=2
  397. eval(shift(@insns));
  398. eval(shift(@insns)); # body_20_39
  399. eval(shift(@insns)) if (@insns[1] =~ /_rol/);
  400. eval(shift(@insns)) if (@insns[0] =~ /_rol/);
  401. &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
  402. eval(shift(@insns));
  403. eval(shift(@insns)); # rol
  404. eval(shift(@insns));
  405. eval(shift(@insns));
  406. eval(shift(@insns)); # rol
  407. eval(shift(@insns));
  408. foreach (@insns) { eval; } # remaining instructions
  409. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  410. push(@Tx,shift(@Tx));
  411. }
  412. sub Xuplast_ssse3_80()
  413. { use integer;
  414. my $body = shift;
  415. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  416. my ($a,$b,$c,$d,$e);
  417. eval(shift(@insns));
  418. eval(shift(@insns));
  419. eval(shift(@insns));
  420. eval(shift(@insns));
  421. &paddd (@Tx[1],@X[-1&7]);
  422. eval(shift(@insns));
  423. eval(shift(@insns));
  424. &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
  425. foreach (@insns) { eval; } # remaining instructions
  426. &cmp ($inp,$len);
  427. &je (shift);
  428. unshift(@Tx,pop(@Tx));
  429. &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask
  430. &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
  431. &movdqu (@X[-4&7],"0($inp)"); # load input
  432. &movdqu (@X[-3&7],"16($inp)");
  433. &movdqu (@X[-2&7],"32($inp)");
  434. &movdqu (@X[-1&7],"48($inp)");
  435. &pshufb (@X[-4&7],@Tx[2]); # byte swap
  436. &add ($inp,64);
  437. $Xi=0;
  438. }
  439. sub Xloop_ssse3()
  440. { use integer;
  441. my $body = shift;
  442. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  443. my ($a,$b,$c,$d,$e);
  444. eval(shift(@insns));
  445. eval(shift(@insns));
  446. eval(shift(@insns));
  447. &pshufb (@X[($Xi-3)&7],@Tx[2]);
  448. eval(shift(@insns));
  449. eval(shift(@insns));
  450. eval(shift(@insns));
  451. eval(shift(@insns));
  452. &paddd (@X[($Xi-4)&7],@Tx[1]);
  453. eval(shift(@insns));
  454. eval(shift(@insns));
  455. eval(shift(@insns));
  456. eval(shift(@insns));
  457. &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
  458. eval(shift(@insns));
  459. eval(shift(@insns));
  460. eval(shift(@insns));
  461. eval(shift(@insns));
  462. &psubd (@X[($Xi-4)&7],@Tx[1]);
  463. foreach (@insns) { eval; }
  464. $Xi++;
  465. }
  466. sub Xtail_ssse3()
  467. { use integer;
  468. my $body = shift;
  469. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  470. my ($a,$b,$c,$d,$e);
  471. foreach (@insns) { eval; }
  472. }
  473. my @body_00_19 = (
  474. '($a,$b,$c,$d,$e)=@V;'.
  475. '&$_ror ($b,$j?7:2);', # $b>>>2
  476. '&xor (@T[0],$d);',
  477. '&mov (@T[1],$a);', # $b for next round
  478. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
  479. '&xor ($b,$c);', # $c^$d for next round
  480. '&$_rol ($a,5);',
  481. '&add ($e,@T[0]);',
  482. '&and (@T[1],$b);', # ($b&($c^$d)) for next round
  483. '&xor ($b,$c);', # restore $b
  484. '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  485. );
  486. sub body_00_19 () { # ((c^d)&b)^d
  487. # on start @T[0]=(c^d)&b
  488. return &body_20_39() if ($rx==19); $rx++;
  489. use integer;
  490. my ($k,$n);
  491. my @r=@body_00_19;
  492. $n = scalar(@r);
  493. $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
  494. @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
  495. $jj++;
  496. return @r;
  497. }
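#
# Editorial illustration of the scheduling arithmetic above (assuming
# scalar(@body_00_19)==10; "use integer" makes all divisions truncate):
#
#	perl -Minteger -e 'for my $jj (0..19) {
#	    my $k = (($jj+1)*12/20)*20*10/12;
#	    print "$jj " if ($jj == $k/10);
#	}'	# prints: 0 1 3 5 6 8 10 11 13 15 16 18
#
# i.e. 12 of these 20 rounds get one aesenc call spliced in, roughly evenly
# spaced, with $k%$n picking the slot inside the round.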
  498. my @body_20_39 = (
  499. '($a,$b,$c,$d,$e)=@V;'.
  500. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
  501. '&xor (@T[0],$d) if($j==19);'.
  502. '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
  503. '&mov (@T[1],$a);', # $b for next round
  504. '&$_rol ($a,5);',
  505. '&add ($e,@T[0]);',
  506. '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round
  507. '&$_ror ($b,7);', # $b>>>2
  508. '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  509. );
  510. sub body_20_39 () { # b^d^c
  511. # on entry @T[0]=b^d
  512. return &body_40_59() if ($rx==39); $rx++;
  513. use integer;
  514. my ($k,$n);
  515. my @r=@body_20_39;
  516. $n = scalar(@r);
  517. $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
  518. @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20);
  519. $jj++;
  520. return @r;
  521. }
  522. my @body_40_59 = (
  523. '($a,$b,$c,$d,$e)=@V;'.
  524. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
  525. '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
  526. '&xor ($c,$d) if ($j>=40);', # restore $c
  527. '&$_ror ($b,7);', # $b>>>2
  528. '&mov (@T[1],$a);', # $b for next round
  529. '&xor (@T[0],$c);',
  530. '&$_rol ($a,5);',
  531. '&add ($e,@T[0]);',
  532. '&xor (@T[1],$c) if ($j==59);'.
  533. '&xor (@T[1],$b) if ($j< 59);', # b^c for next round
  534. '&xor ($b,$c) if ($j< 59);', # c^d for next round
  535. '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  536. );
  537. sub body_40_59 () { # ((b^c)&(c^d))^c
  538. # on entry @T[0]=(b^c), (c^=d)
  539. $rx++;
  540. use integer;
  541. my ($k,$n);
  542. my @r=@body_40_59;
  543. $n = scalar(@r);
  544. $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
  545. @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40);
  546. $jj++;
  547. return @r;
  548. }
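#
# Editorial note: over the 80 rounds of one 64-byte SHA-1 block the three
# bodies above schedule 12+8+12+8 = 40 calls into the $aesenc closure, i.e.
# 10 per 16-byte AES block ($r/10 selecting the block), with the $k==9 call
# also covering the extra rounds of 192/256-bit keys; the $rx!=20/$rx!=40
# guards prevent an injection on the hand-off call between round groups.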
  549. $code.=<<___;
  550. .align 32
  551. .Loop_ssse3:
  552. ___
  553. &Xupdate_ssse3_16_31(\&body_00_19);
  554. &Xupdate_ssse3_16_31(\&body_00_19);
  555. &Xupdate_ssse3_16_31(\&body_00_19);
  556. &Xupdate_ssse3_16_31(\&body_00_19);
  557. &Xupdate_ssse3_32_79(\&body_00_19);
  558. &Xupdate_ssse3_32_79(\&body_20_39);
  559. &Xupdate_ssse3_32_79(\&body_20_39);
  560. &Xupdate_ssse3_32_79(\&body_20_39);
  561. &Xupdate_ssse3_32_79(\&body_20_39);
  562. &Xupdate_ssse3_32_79(\&body_20_39);
  563. &Xupdate_ssse3_32_79(\&body_40_59);
  564. &Xupdate_ssse3_32_79(\&body_40_59);
  565. &Xupdate_ssse3_32_79(\&body_40_59);
  566. &Xupdate_ssse3_32_79(\&body_40_59);
  567. &Xupdate_ssse3_32_79(\&body_40_59);
  568. &Xupdate_ssse3_32_79(\&body_20_39);
  569. &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
  570. $saved_j=$j; @saved_V=@V;
  571. $saved_r=$r; @saved_rndkey=@rndkey;
  572. &Xloop_ssse3(\&body_20_39);
  573. &Xloop_ssse3(\&body_20_39);
  574. &Xloop_ssse3(\&body_20_39);
  575. $code.=<<___;
  576. movups $iv,48($out,$in0) # write output
  577. lea 64($in0),$in0
  578. add 0($ctx),$A # update context
  579. add 4($ctx),@T[0]
  580. add 8($ctx),$C
  581. add 12($ctx),$D
  582. mov $A,0($ctx)
  583. add 16($ctx),$E
  584. mov @T[0],4($ctx)
  585. mov @T[0],$B # magic seed
  586. mov $C,8($ctx)
  587. mov $C,@T[1]
  588. mov $D,12($ctx)
  589. xor $D,@T[1]
  590. mov $E,16($ctx)
  591. and @T[1],@T[0]
  592. jmp .Loop_ssse3
  593. .Ldone_ssse3:
  594. ___
  595. $jj=$j=$saved_j; @V=@saved_V;
  596. $r=$saved_r; @rndkey=@saved_rndkey;
  597. &Xtail_ssse3(\&body_20_39);
  598. &Xtail_ssse3(\&body_20_39);
  599. &Xtail_ssse3(\&body_20_39);
  600. $code.=<<___;
  601. movups $iv,48($out,$in0) # write output
  602. mov 88(%rsp),$ivp # restore $ivp
  603. add 0($ctx),$A # update context
  604. add 4($ctx),@T[0]
  605. add 8($ctx),$C
  606. mov $A,0($ctx)
  607. add 12($ctx),$D
  608. mov @T[0],4($ctx)
  609. add 16($ctx),$E
  610. mov $C,8($ctx)
  611. mov $D,12($ctx)
  612. mov $E,16($ctx)
  613. movups $iv,($ivp) # write IV
  614. ___
  615. $code.=<<___ if ($win64);
  616. movaps 96+0(%rsp),%xmm6
  617. movaps 96+16(%rsp),%xmm7
  618. movaps 96+32(%rsp),%xmm8
  619. movaps 96+48(%rsp),%xmm9
  620. movaps 96+64(%rsp),%xmm10
  621. movaps 96+80(%rsp),%xmm11
  622. movaps 96+96(%rsp),%xmm12
  623. movaps 96+112(%rsp),%xmm13
  624. movaps 96+128(%rsp),%xmm14
  625. movaps 96+144(%rsp),%xmm15
  626. ___
  627. $code.=<<___;
  628. lea `104+($win64?10*16:0)`(%rsp),%rsi
  629. mov 0(%rsi),%r15
  630. mov 8(%rsi),%r14
  631. mov 16(%rsi),%r13
  632. mov 24(%rsi),%r12
  633. mov 32(%rsi),%rbp
  634. mov 40(%rsi),%rbx
  635. lea 48(%rsi),%rsp
  636. .Lepilogue_ssse3:
  637. ret
  638. .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
  639. ___
  640. if ($stitched_decrypt) {{{
  641. # reset
  642. ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  643. $j=$jj=$r=$rx=0;
  644. $Xi=4;
  645. # reassign for Atom Silvermont (see above)
  646. ($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
  647. @X=map("%xmm$_",(8..13,6,7));
  648. @Tx=map("%xmm$_",(14,15,5));
  649. my @aes256_dec = (
  650. '&movdqu($inout0,"0x00($in0)");',
  651. '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);',
  652. '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);',
  653. '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);',
  654. '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
  655. '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3]
  656. undef,undef
  657. );
  658. for ($i=0;$i<13;$i++) {
  659. push (@aes256_dec,(
  660. '&aesdec ($inout0,$rndkey0);',
  661. '&aesdec ($inout1,$rndkey0);',
  662. '&aesdec ($inout2,$rndkey0);',
  663. '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
  664. ));
  665. push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
  666. push (@aes256_dec,(undef,undef)) if ($i==5);
  667. }
  668. push(@aes256_dec,(
  669. '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
  670. '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
  671. '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
  672. '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',
  673. '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");',
  674. '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);',
  675. '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);',
  676. '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);',
  677. '&movups ("0x30($out,$in0)",$inout3);'
  678. ));
  679. sub body_00_19_dec () { # ((c^d)&b)^d
  680. # on start @T[0]=(c^d)&b
  681. return &body_20_39_dec() if ($rx==19);
  682. my @r=@body_00_19;
  683. unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
  684. $rx++;
  685. return @r;
  686. }
  687. sub body_20_39_dec () { # b^d^c
  688. # on entry @T[0]=b^d
  689. return &body_40_59_dec() if ($rx==39);
  690. my @r=@body_20_39;
  691. unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
  692. $rx++;
  693. return @r;
  694. }
  695. sub body_40_59_dec () { # ((b^c)&(c^d))^c
  696. # on entry @T[0]=(b^c), (c^=d)
  697. my @r=@body_40_59;
  698. unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
  699. $rx++;
  700. return @r;
  701. }
  702. $code.=<<___;
  703. .globl aesni256_cbc_sha1_dec
  704. .type aesni256_cbc_sha1_dec,\@abi-omnipotent
  705. .align 32
  706. aesni256_cbc_sha1_dec:
  707. # caller should check for SSSE3 and AES-NI bits
  708. mov OPENSSL_ia32cap_P+0(%rip),%r10d
  709. mov OPENSSL_ia32cap_P+4(%rip),%r11d
  710. ___
  711. $code.=<<___ if ($avx);
  712. and \$`1<<28`,%r11d # mask AVX bit
  713. and \$`1<<30`,%r10d # mask "Intel CPU" bit
  714. or %r11d,%r10d
  715. cmp \$`1<<28|1<<30`,%r10d
  716. je aesni256_cbc_sha1_dec_avx
  717. ___
  718. $code.=<<___;
  719. jmp aesni256_cbc_sha1_dec_ssse3
  720. ret
  721. .size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
  722. .type aesni256_cbc_sha1_dec_ssse3,\@function,6
  723. .align 32
  724. aesni256_cbc_sha1_dec_ssse3:
  725. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  726. push %rbx
  727. push %rbp
  728. push %r12
  729. push %r13
  730. push %r14
  731. push %r15
  732. lea `-104-($win64?10*16:0)`(%rsp),%rsp
  733. ___
  734. $code.=<<___ if ($win64);
  735. movaps %xmm6,96+0(%rsp)
  736. movaps %xmm7,96+16(%rsp)
  737. movaps %xmm8,96+32(%rsp)
  738. movaps %xmm9,96+48(%rsp)
  739. movaps %xmm10,96+64(%rsp)
  740. movaps %xmm11,96+80(%rsp)
  741. movaps %xmm12,96+96(%rsp)
  742. movaps %xmm13,96+112(%rsp)
  743. movaps %xmm14,96+128(%rsp)
  744. movaps %xmm15,96+144(%rsp)
  745. .Lprologue_dec_ssse3:
  746. ___
  747. $code.=<<___;
  748. mov $in0,%r12 # reassign arguments
  749. mov $out,%r13
  750. mov $len,%r14
  751. lea 112($key),%r15 # size optimization
  752. movdqu ($ivp),@X[3] # load IV
  753. #mov $ivp,88(%rsp) # save $ivp
  754. ___
  755. ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
  756. $code.=<<___;
  757. shl \$6,$len
  758. sub $in0,$out
  759. add $inp,$len # end of input
  760. lea K_XX_XX(%rip),$K_XX_XX
  761. mov 0($ctx),$A # load context
  762. mov 4($ctx),$B
  763. mov 8($ctx),$C
  764. mov 12($ctx),$D
  765. mov $B,@T[0] # magic seed
  766. mov 16($ctx),$E
  767. mov $C,@T[1]
  768. xor $D,@T[1]
  769. and @T[1],@T[0]
  770. movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
  771. movdqa 0($K_XX_XX),@Tx[1] # K_00_19
  772. movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
  773. movdqu 16($inp),@X[-3&7]
  774. movdqu 32($inp),@X[-2&7]
  775. movdqu 48($inp),@X[-1&7]
  776. pshufb @Tx[2],@X[-4&7] # byte swap
  777. add \$64,$inp
  778. pshufb @Tx[2],@X[-3&7]
  779. pshufb @Tx[2],@X[-2&7]
  780. pshufb @Tx[2],@X[-1&7]
  781. paddd @Tx[1],@X[-4&7] # add K_00_19
  782. paddd @Tx[1],@X[-3&7]
  783. paddd @Tx[1],@X[-2&7]
  784. movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
  785. psubd @Tx[1],@X[-4&7] # restore X[]
  786. movdqa @X[-3&7],16(%rsp)
  787. psubd @Tx[1],@X[-3&7]
  788. movdqa @X[-2&7],32(%rsp)
  789. psubd @Tx[1],@X[-2&7]
  790. movdqu -112($key),$rndkey0 # $key[0]
  791. jmp .Loop_dec_ssse3
  792. .align 32
  793. .Loop_dec_ssse3:
  794. ___
  795. &Xupdate_ssse3_16_31(\&body_00_19_dec);
  796. &Xupdate_ssse3_16_31(\&body_00_19_dec);
  797. &Xupdate_ssse3_16_31(\&body_00_19_dec);
  798. &Xupdate_ssse3_16_31(\&body_00_19_dec);
  799. &Xupdate_ssse3_32_79(\&body_00_19_dec);
  800. &Xupdate_ssse3_32_79(\&body_20_39_dec);
  801. &Xupdate_ssse3_32_79(\&body_20_39_dec);
  802. &Xupdate_ssse3_32_79(\&body_20_39_dec);
  803. &Xupdate_ssse3_32_79(\&body_20_39_dec);
  804. &Xupdate_ssse3_32_79(\&body_20_39_dec);
  805. &Xupdate_ssse3_32_79(\&body_40_59_dec);
  806. &Xupdate_ssse3_32_79(\&body_40_59_dec);
  807. &Xupdate_ssse3_32_79(\&body_40_59_dec);
  808. &Xupdate_ssse3_32_79(\&body_40_59_dec);
  809. &Xupdate_ssse3_32_79(\&body_40_59_dec);
  810. &Xupdate_ssse3_32_79(\&body_20_39_dec);
  811. &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
  812. $saved_j=$j; @saved_V=@V;
  813. $saved_rx=$rx;
  814. &Xloop_ssse3(\&body_20_39_dec);
  815. &Xloop_ssse3(\&body_20_39_dec);
  816. &Xloop_ssse3(\&body_20_39_dec);
  817. eval(@aes256_dec[-1]); # last store
  818. $code.=<<___;
  819. lea 64($in0),$in0
  820. add 0($ctx),$A # update context
  821. add 4($ctx),@T[0]
  822. add 8($ctx),$C
  823. add 12($ctx),$D
  824. mov $A,0($ctx)
  825. add 16($ctx),$E
  826. mov @T[0],4($ctx)
  827. mov @T[0],$B # magic seed
  828. mov $C,8($ctx)
  829. mov $C,@T[1]
  830. mov $D,12($ctx)
  831. xor $D,@T[1]
  832. mov $E,16($ctx)
  833. and @T[1],@T[0]
  834. jmp .Loop_dec_ssse3
  835. .Ldone_dec_ssse3:
  836. ___
  837. $jj=$j=$saved_j; @V=@saved_V;
  838. $rx=$saved_rx;
  839. &Xtail_ssse3(\&body_20_39_dec);
  840. &Xtail_ssse3(\&body_20_39_dec);
  841. &Xtail_ssse3(\&body_20_39_dec);
  842. eval(@aes256_dec[-1]); # last store
  843. $code.=<<___;
  844. add 0($ctx),$A # update context
  845. add 4($ctx),@T[0]
  846. add 8($ctx),$C
  847. mov $A,0($ctx)
  848. add 12($ctx),$D
  849. mov @T[0],4($ctx)
  850. add 16($ctx),$E
  851. mov $C,8($ctx)
  852. mov $D,12($ctx)
  853. mov $E,16($ctx)
  854. movups @X[3],($ivp) # write IV
  855. ___
  856. $code.=<<___ if ($win64);
  857. movaps 96+0(%rsp),%xmm6
  858. movaps 96+16(%rsp),%xmm7
  859. movaps 96+32(%rsp),%xmm8
  860. movaps 96+48(%rsp),%xmm9
  861. movaps 96+64(%rsp),%xmm10
  862. movaps 96+80(%rsp),%xmm11
  863. movaps 96+96(%rsp),%xmm12
  864. movaps 96+112(%rsp),%xmm13
  865. movaps 96+128(%rsp),%xmm14
  866. movaps 96+144(%rsp),%xmm15
  867. ___
  868. $code.=<<___;
  869. lea `104+($win64?10*16:0)`(%rsp),%rsi
  870. mov 0(%rsi),%r15
  871. mov 8(%rsi),%r14
  872. mov 16(%rsi),%r13
  873. mov 24(%rsi),%r12
  874. mov 32(%rsi),%rbp
  875. mov 40(%rsi),%rbx
  876. lea 48(%rsi),%rsp
  877. .Lepilogue_dec_ssse3:
  878. ret
  879. .size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
  880. ___
  881. }}}
  882. $j=$jj=$r=$rx=0;
  883. if ($avx) {
  884. my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  885. my $Xi=4;
  886. my @X=map("%xmm$_",(4..7,0..3));
  887. my @Tx=map("%xmm$_",(8..10));
  888. my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
  889. my @T=("%esi","%edi");
  890. my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
  891. my @rndkey=("%xmm14","%xmm15");
  892. my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
  893. my $Kx=@Tx[2];
  894. my $_rol=sub { &shld(@_[0],@_) };
  895. my $_ror=sub { &shrd(@_[0],@_) };
  896. $code.=<<___;
  897. .type aesni_cbc_sha1_enc_avx,\@function,6
  898. .align 32
  899. aesni_cbc_sha1_enc_avx:
  900. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  901. #shr \$6,$len # debugging artefact
  902. #jz .Lepilogue_avx # debugging artefact
  903. push %rbx
  904. push %rbp
  905. push %r12
  906. push %r13
  907. push %r14
  908. push %r15
  909. lea `-104-($win64?10*16:0)`(%rsp),%rsp
  910. #mov $in0,$inp # debugging artefact
  911. #lea 64(%rsp),$ctx # debugging artefact
  912. ___
  913. $code.=<<___ if ($win64);
  914. movaps %xmm6,96+0(%rsp)
  915. movaps %xmm7,96+16(%rsp)
  916. movaps %xmm8,96+32(%rsp)
  917. movaps %xmm9,96+48(%rsp)
  918. movaps %xmm10,96+64(%rsp)
  919. movaps %xmm11,96+80(%rsp)
  920. movaps %xmm12,96+96(%rsp)
  921. movaps %xmm13,96+112(%rsp)
  922. movaps %xmm14,96+128(%rsp)
  923. movaps %xmm15,96+144(%rsp)
  924. .Lprologue_avx:
  925. ___
  926. $code.=<<___;
  927. vzeroall
  928. mov $in0,%r12 # reassign arguments
  929. mov $out,%r13
  930. mov $len,%r14
  931. lea 112($key),%r15 # size optimization
  932. vmovdqu ($ivp),$iv # load IV
  933. mov $ivp,88(%rsp) # save $ivp
  934. ___
  935. ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
  936. my $rounds="${ivp}d";
  937. $code.=<<___;
  938. shl \$6,$len
  939. sub $in0,$out
  940. mov 240-112($key),$rounds
  941. add $inp,$len # end of input
  942. lea K_XX_XX(%rip),$K_XX_XX
  943. mov 0($ctx),$A # load context
  944. mov 4($ctx),$B
  945. mov 8($ctx),$C
  946. mov 12($ctx),$D
  947. mov $B,@T[0] # magic seed
  948. mov 16($ctx),$E
  949. mov $C,@T[1]
  950. xor $D,@T[1]
  951. and @T[1],@T[0]
  952. vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
  953. vmovdqa 0($K_XX_XX),$Kx # K_00_19
  954. vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
  955. vmovdqu 16($inp),@X[-3&7]
  956. vmovdqu 32($inp),@X[-2&7]
  957. vmovdqu 48($inp),@X[-1&7]
  958. vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
  959. add \$64,$inp
  960. vpshufb @X[2],@X[-3&7],@X[-3&7]
  961. vpshufb @X[2],@X[-2&7],@X[-2&7]
  962. vpshufb @X[2],@X[-1&7],@X[-1&7]
  963. vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
  964. vpaddd $Kx,@X[-3&7],@X[1]
  965. vpaddd $Kx,@X[-2&7],@X[2]
  966. vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
  967. vmovdqa @X[1],16(%rsp)
  968. vmovdqa @X[2],32(%rsp)
  969. vmovups -112($key),$rndkey[1] # $key[0]
  970. vmovups 16-112($key),$rndkey[0] # forward reference
  971. jmp .Loop_avx
  972. ___
  973. my $aesenc=sub {
  974. use integer;
  975. my ($n,$k)=($r/10,$r%10);
  976. if ($k==0) {
  977. $code.=<<___;
  978. vmovdqu `16*$n`($in0),$in # load input
  979. vpxor $rndkey[1],$in,$in
  980. ___
  981. $code.=<<___ if ($n);
  982. vmovups $iv,`16*($n-1)`($out,$in0) # write output
  983. ___
  984. $code.=<<___;
  985. vpxor $in,$iv,$iv
  986. vaesenc $rndkey[0],$iv,$iv
  987. vmovups `32+16*$k-112`($key),$rndkey[1]
  988. ___
  989. } elsif ($k==9) {
  990. $sn++;
  991. $code.=<<___;
  992. cmp \$11,$rounds
  993. jb .Lvaesenclast$sn
  994. vaesenc $rndkey[0],$iv,$iv
  995. vmovups `32+16*($k+0)-112`($key),$rndkey[1]
  996. vaesenc $rndkey[1],$iv,$iv
  997. vmovups `32+16*($k+1)-112`($key),$rndkey[0]
  998. je .Lvaesenclast$sn
  999. vaesenc $rndkey[0],$iv,$iv
  1000. vmovups `32+16*($k+2)-112`($key),$rndkey[1]
  1001. vaesenc $rndkey[1],$iv,$iv
  1002. vmovups `32+16*($k+3)-112`($key),$rndkey[0]
  1003. .Lvaesenclast$sn:
  1004. vaesenclast $rndkey[0],$iv,$iv
  1005. vmovups -112($key),$rndkey[0]
  1006. vmovups 16-112($key),$rndkey[1] # forward reference
  1007. ___
  1008. } else {
  1009. $code.=<<___;
  1010. vaesenc $rndkey[0],$iv,$iv
  1011. vmovups `32+16*$k-112`($key),$rndkey[1]
  1012. ___
  1013. }
  1014. $r++; unshift(@rndkey,pop(@rndkey));
  1015. };
  1016. sub Xupdate_avx_16_31() # recall that $Xi starts with 4
  1017. { use integer;
  1018. my $body = shift;
  1019. my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
  1020. my ($a,$b,$c,$d,$e);
  1021. eval(shift(@insns));
  1022. eval(shift(@insns));
  1023. &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
  1024. eval(shift(@insns));
  1025. eval(shift(@insns));
  1026. &vpaddd (@Tx[1],$Kx,@X[-1&7]);
  1027. eval(shift(@insns));
  1028. eval(shift(@insns));
  1029. &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
  1030. eval(shift(@insns));
  1031. eval(shift(@insns));
  1032. &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
  1033. eval(shift(@insns));
  1034. eval(shift(@insns));
  1035. &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
  1036. eval(shift(@insns));
  1037. eval(shift(@insns));
  1038. eval(shift(@insns));
  1039. eval(shift(@insns));
  1040. &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
  1041. eval(shift(@insns));
  1042. eval(shift(@insns));
  1043. &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  1044. eval(shift(@insns));
  1045. eval(shift(@insns));
  1046. &vpsrld (@Tx[0],@X[0],31);
  1047. eval(shift(@insns));
  1048. eval(shift(@insns));
  1049. eval(shift(@insns));
  1050. eval(shift(@insns));
  1051. &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword
  1052. &vpaddd (@X[0],@X[0],@X[0]);
  1053. eval(shift(@insns));
  1054. eval(shift(@insns));
  1055. eval(shift(@insns));
  1056. eval(shift(@insns));
  1057. &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
  1058. &vpsrld (@Tx[0],@Tx[1],30);
  1059. eval(shift(@insns));
  1060. eval(shift(@insns));
  1061. eval(shift(@insns));
  1062. eval(shift(@insns));
  1063. &vpslld (@Tx[1],@Tx[1],2);
  1064. &vpxor (@X[0],@X[0],@Tx[0]);
  1065. eval(shift(@insns));
  1066. eval(shift(@insns));
  1067. eval(shift(@insns));
  1068. eval(shift(@insns));
  1069. &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
  1070. eval(shift(@insns));
  1071. eval(shift(@insns));
  1072. &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
  1073. eval(shift(@insns));
  1074. eval(shift(@insns));
  1075. foreach (@insns) { eval; } # remaining instructions [if any]
  1076. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  1077. }
  1078. sub Xupdate_avx_32_79()
  1079. { use integer;
  1080. my $body = shift;
  1081. my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
  1082. my ($a,$b,$c,$d,$e);
  1083. &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
  1084. &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
  1085. eval(shift(@insns)); # body_20_39
  1086. eval(shift(@insns));
  1087. eval(shift(@insns));
  1088. eval(shift(@insns)); # rol
  1089. &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
  1090. eval(shift(@insns));
  1091. eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
  1092. &vpaddd (@Tx[1],$Kx,@X[-1&7]);
  1093. &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0);
  1094. eval(shift(@insns)); # ror
  1095. eval(shift(@insns));
  1096. &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
  1097. eval(shift(@insns)); # body_20_39
  1098. eval(shift(@insns));
  1099. eval(shift(@insns));
  1100. eval(shift(@insns)); # rol
  1101. &vpsrld (@Tx[0],@X[0],30);
  1102. &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  1103. eval(shift(@insns));
  1104. eval(shift(@insns));
  1105. eval(shift(@insns)); # ror
  1106. eval(shift(@insns));
  1107. &vpslld (@X[0],@X[0],2);
  1108. eval(shift(@insns)); # body_20_39
  1109. eval(shift(@insns));
  1110. eval(shift(@insns));
  1111. eval(shift(@insns)); # rol
  1112. eval(shift(@insns));
  1113. eval(shift(@insns));
  1114. eval(shift(@insns)); # ror
  1115. eval(shift(@insns));
  1116. &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
  1117. eval(shift(@insns)); # body_20_39
  1118. eval(shift(@insns));
  1119. eval(shift(@insns));
  1120. eval(shift(@insns)); # rol
  1121. eval(shift(@insns));
  1122. eval(shift(@insns));
  1123. eval(shift(@insns)); # rol
  1124. eval(shift(@insns));
  1125. foreach (@insns) { eval; } # remaining instructions
  1126. $Xi++; push(@X,shift(@X)); # "rotate" X[]
  1127. }
  1128. sub Xuplast_avx_80()
  1129. { use integer;
  1130. my $body = shift;
  1131. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  1132. my ($a,$b,$c,$d,$e);
  1133. eval(shift(@insns));
  1134. &vpaddd (@Tx[1],$Kx,@X[-1&7]);
  1135. eval(shift(@insns));
  1136. eval(shift(@insns));
  1137. eval(shift(@insns));
  1138. eval(shift(@insns));
  1139. &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
  1140. foreach (@insns) { eval; } # remaining instructions
  1141. &cmp ($inp,$len);
  1142. &je (shift);
  1143. &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask
  1144. &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19
  1145. &vmovdqu(@X[-4&7],"0($inp)"); # load input
  1146. &vmovdqu(@X[-3&7],"16($inp)");
  1147. &vmovdqu(@X[-2&7],"32($inp)");
  1148. &vmovdqu(@X[-1&7],"48($inp)");
  1149. &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap
  1150. &add ($inp,64);
  1151. $Xi=0;
  1152. }
  1153. sub Xloop_avx()
  1154. { use integer;
  1155. my $body = shift;
  1156. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  1157. my ($a,$b,$c,$d,$e);
  1158. eval(shift(@insns));
  1159. eval(shift(@insns));
  1160. &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
  1161. eval(shift(@insns));
  1162. eval(shift(@insns));
  1163. &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
  1164. eval(shift(@insns));
  1165. eval(shift(@insns));
  1166. eval(shift(@insns));
  1167. eval(shift(@insns));
  1168. &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
  1169. eval(shift(@insns));
  1170. eval(shift(@insns));
  1171. foreach (@insns) { eval; }
  1172. $Xi++;
  1173. }
  1174. sub Xtail_avx()
  1175. { use integer;
  1176. my $body = shift;
  1177. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  1178. my ($a,$b,$c,$d,$e);
  1179. foreach (@insns) { eval; }
  1180. }
  1181. $code.=<<___;
  1182. .align 32
  1183. .Loop_avx:
  1184. ___
  1185. &Xupdate_avx_16_31(\&body_00_19);
  1186. &Xupdate_avx_16_31(\&body_00_19);
  1187. &Xupdate_avx_16_31(\&body_00_19);
  1188. &Xupdate_avx_16_31(\&body_00_19);
  1189. &Xupdate_avx_32_79(\&body_00_19);
  1190. &Xupdate_avx_32_79(\&body_20_39);
  1191. &Xupdate_avx_32_79(\&body_20_39);
  1192. &Xupdate_avx_32_79(\&body_20_39);
  1193. &Xupdate_avx_32_79(\&body_20_39);
  1194. &Xupdate_avx_32_79(\&body_20_39);
  1195. &Xupdate_avx_32_79(\&body_40_59);
  1196. &Xupdate_avx_32_79(\&body_40_59);
  1197. &Xupdate_avx_32_79(\&body_40_59);
  1198. &Xupdate_avx_32_79(\&body_40_59);
  1199. &Xupdate_avx_32_79(\&body_40_59);
  1200. &Xupdate_avx_32_79(\&body_20_39);
  1201. &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"
  1202. $saved_j=$j; @saved_V=@V;
  1203. $saved_r=$r; @saved_rndkey=@rndkey;
  1204. &Xloop_avx(\&body_20_39);
  1205. &Xloop_avx(\&body_20_39);
  1206. &Xloop_avx(\&body_20_39);
  1207. $code.=<<___;
  1208. vmovups $iv,48($out,$in0) # write output
  1209. lea 64($in0),$in0
  1210. add 0($ctx),$A # update context
  1211. add 4($ctx),@T[0]
  1212. add 8($ctx),$C
  1213. add 12($ctx),$D
  1214. mov $A,0($ctx)
  1215. add 16($ctx),$E
  1216. mov @T[0],4($ctx)
  1217. mov @T[0],$B # magic seed
  1218. mov $C,8($ctx)
  1219. mov $C,@T[1]
  1220. mov $D,12($ctx)
  1221. xor $D,@T[1]
  1222. mov $E,16($ctx)
  1223. and @T[1],@T[0]
  1224. jmp .Loop_avx
  1225. .Ldone_avx:
  1226. ___
  1227. $jj=$j=$saved_j; @V=@saved_V;
  1228. $r=$saved_r; @rndkey=@saved_rndkey;
  1229. &Xtail_avx(\&body_20_39);
  1230. &Xtail_avx(\&body_20_39);
  1231. &Xtail_avx(\&body_20_39);
  1232. $code.=<<___;
  1233. vmovups $iv,48($out,$in0) # write output
  1234. mov 88(%rsp),$ivp # restore $ivp
  1235. add 0($ctx),$A # update context
  1236. add 4($ctx),@T[0]
  1237. add 8($ctx),$C
  1238. mov $A,0($ctx)
  1239. add 12($ctx),$D
  1240. mov @T[0],4($ctx)
  1241. add 16($ctx),$E
  1242. mov $C,8($ctx)
  1243. mov $D,12($ctx)
  1244. mov $E,16($ctx)
  1245. vmovups $iv,($ivp) # write IV
  1246. vzeroall
  1247. ___
  1248. $code.=<<___ if ($win64);
  1249. movaps 96+0(%rsp),%xmm6
  1250. movaps 96+16(%rsp),%xmm7
  1251. movaps 96+32(%rsp),%xmm8
  1252. movaps 96+48(%rsp),%xmm9
  1253. movaps 96+64(%rsp),%xmm10
  1254. movaps 96+80(%rsp),%xmm11
  1255. movaps 96+96(%rsp),%xmm12
  1256. movaps 96+112(%rsp),%xmm13
  1257. movaps 96+128(%rsp),%xmm14
  1258. movaps 96+144(%rsp),%xmm15
  1259. ___
  1260. $code.=<<___;
  1261. lea `104+($win64?10*16:0)`(%rsp),%rsi
  1262. mov 0(%rsi),%r15
  1263. mov 8(%rsi),%r14
  1264. mov 16(%rsi),%r13
  1265. mov 24(%rsi),%r12
  1266. mov 32(%rsi),%rbp
  1267. mov 40(%rsi),%rbx
  1268. lea 48(%rsi),%rsp
  1269. .Lepilogue_avx:
  1270. ret
  1271. .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
  1272. ___
  1273. if ($stitched_decrypt) {{{
  1274. # reset
  1275. ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  1276. $j=$jj=$r=$rx=0;
  1277. $Xi=4;
  1278. @aes256_dec = (
  1279. '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
  1280. '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
  1281. '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
  1282. '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
  1283. '&vmovups($rndkey0,"16-112($key)");',
  1284. '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3]
  1285. undef,undef
  1286. );
  1287. for ($i=0;$i<13;$i++) {
  1288. push (@aes256_dec,(
  1289. '&vaesdec ($inout0,$inout0,$rndkey0);',
  1290. '&vaesdec ($inout1,$inout1,$rndkey0);',
  1291. '&vaesdec ($inout2,$inout2,$rndkey0);',
  1292. '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
  1293. ));
  1294. push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
  1295. push (@aes256_dec,(undef,undef)) if ($i==5);
  1296. }
  1297. push(@aes256_dec,(
  1298. '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
  1299. '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
  1300. '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
  1301. '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',
  1302. '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
  1303. '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
  1304. '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
  1305. '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',
  1306. '&vmovups ("0x30($out,$in0)",$inout3);'
  1307. ));
  1308. $code.=<<___;
  1309. .type aesni256_cbc_sha1_dec_avx,\@function,6
  1310. .align 32
  1311. aesni256_cbc_sha1_dec_avx:
  1312. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  1313. push %rbx
  1314. push %rbp
  1315. push %r12
  1316. push %r13
  1317. push %r14
  1318. push %r15
  1319. lea `-104-($win64?10*16:0)`(%rsp),%rsp
  1320. ___
  1321. $code.=<<___ if ($win64);
  1322. movaps %xmm6,96+0(%rsp)
  1323. movaps %xmm7,96+16(%rsp)
  1324. movaps %xmm8,96+32(%rsp)
  1325. movaps %xmm9,96+48(%rsp)
  1326. movaps %xmm10,96+64(%rsp)
  1327. movaps %xmm11,96+80(%rsp)
  1328. movaps %xmm12,96+96(%rsp)
  1329. movaps %xmm13,96+112(%rsp)
  1330. movaps %xmm14,96+128(%rsp)
  1331. movaps %xmm15,96+144(%rsp)
  1332. .Lprologue_dec_avx:
  1333. ___
  1334. $code.=<<___;
  1335. vzeroall
  1336. mov $in0,%r12 # reassign arguments
  1337. mov $out,%r13
  1338. mov $len,%r14
  1339. lea 112($key),%r15 # size optimization
  1340. vmovdqu ($ivp),@X[3] # load IV
  1341. ___
  1342. ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
  1343. $code.=<<___;
  1344. shl \$6,$len
  1345. sub $in0,$out
  1346. add $inp,$len # end of input
  1347. lea K_XX_XX(%rip),$K_XX_XX
  1348. mov 0($ctx),$A # load context
  1349. mov 4($ctx),$B
  1350. mov 8($ctx),$C
  1351. mov 12($ctx),$D
  1352. mov $B,@T[0] # magic seed
  1353. mov 16($ctx),$E
  1354. mov $C,@T[1]
  1355. xor $D,@T[1]
  1356. and @T[1],@T[0]
  1357. vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
  1358. vmovdqa 0($K_XX_XX),$Kx # K_00_19
  1359. vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
  1360. vmovdqu 16($inp),@X[-3&7]
  1361. vmovdqu 32($inp),@X[-2&7]
  1362. vmovdqu 48($inp),@X[-1&7]
  1363. vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
  1364. add \$64,$inp
  1365. vpshufb @X[2],@X[-3&7],@X[-3&7]
  1366. vpshufb @X[2],@X[-2&7],@X[-2&7]
  1367. vpshufb @X[2],@X[-1&7],@X[-1&7]
  1368. vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
  1369. vpaddd $Kx,@X[-3&7],@X[1]
  1370. vpaddd $Kx,@X[-2&7],@X[2]
  1371. vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
  1372. vmovdqa @X[1],16(%rsp)
  1373. vmovdqa @X[2],32(%rsp)
  1374. vmovups -112($key),$rndkey0 # $key[0]
  1375. jmp .Loop_dec_avx
  1376. .align 32
  1377. .Loop_dec_avx:
  1378. ___
  1379. &Xupdate_avx_16_31(\&body_00_19_dec);
  1380. &Xupdate_avx_16_31(\&body_00_19_dec);
  1381. &Xupdate_avx_16_31(\&body_00_19_dec);
  1382. &Xupdate_avx_16_31(\&body_00_19_dec);
  1383. &Xupdate_avx_32_79(\&body_00_19_dec);
  1384. &Xupdate_avx_32_79(\&body_20_39_dec);
  1385. &Xupdate_avx_32_79(\&body_20_39_dec);
  1386. &Xupdate_avx_32_79(\&body_20_39_dec);
  1387. &Xupdate_avx_32_79(\&body_20_39_dec);
  1388. &Xupdate_avx_32_79(\&body_20_39_dec);
  1389. &Xupdate_avx_32_79(\&body_40_59_dec);
  1390. &Xupdate_avx_32_79(\&body_40_59_dec);
  1391. &Xupdate_avx_32_79(\&body_40_59_dec);
  1392. &Xupdate_avx_32_79(\&body_40_59_dec);
  1393. &Xupdate_avx_32_79(\&body_40_59_dec);
  1394. &Xupdate_avx_32_79(\&body_20_39_dec);
  1395. &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"
$saved_j=$j; @saved_V=@V;
$saved_rx=$rx;
	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);
	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_avx
.Ldone_dec_avx:
___
$jj=$j=$saved_j; @V=@saved_V;
$rx=$saved_rx;
	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);
	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	@X[3],($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_dec_avx:
	ret
.size	aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
}}}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0	# byte-n-word swap mask (K_XX_XX+0x50, used by the SHAEXT path)
.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$rounds="%r11d";
($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;
my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));
$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	mov	%rsp,%rax			# keep entry %rsp: the saves below, the epilogue and the SEH handler are all %rax-relative
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap
	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0			# $key[0]
	movups	($ivp),$iv			# load IV
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E		# flip word order
	jmp	.Loop_shaext
.align	16
.Loop_shaext:
___
	&$aesenc();
$code.=<<___;
	movdqu	($inp),@MSG[0]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqa	$ABCD,$ABCD_SAVE		# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb	$BSWAP,@MSG[1]
	paddd	@MSG[0],$E
	movdqu	0x20($inp),@MSG[2]
	lea	0x40($inp),$inp
	pxor	$E_SAVE,@MSG[0]			# black magic
___
	&$aesenc();
$code.=<<___;
	pxor	$E_SAVE,@MSG[0]			# black magic
	movdqa	$ABCD,$E_
	pshufb	$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD		# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu	-0x10($inp),@MSG[3]
	movdqa	$ABCD,$E
	pshufb	$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD		# 4-7
	sha1nexte	@MSG[2],$E
	pxor	@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa	$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor	@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));
	&$aesenc();
}
$code.=<<___;
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor	@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa	$E_SAVE,@MSG[0]
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec	$len
	paddd	$ABCD_SAVE,$ABCD
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0
	jnz	.Loop_shaext
	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movups	$iv,($ivp)			# write IV
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
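# ssse3_handler below serves all three code paths: when the faulting RIP lies
# between the prologue and epilogue labels recorded in HandlerData[], it copies
# the ten saved XMM registers back into the CONTEXT (and, for the SSSE3/AVX
# frames, the six saved general-purpose registers) before handing control to
# RtlVirtualUnwind.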
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext
	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quad-words
	.long	0xa548f3fc		# cld; rep movsq
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
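# Each .LSEH_info_* record above is an UNWIND_INFO block: ".byte 9,0,0,0" is
# the header (version 1, UNW_FLAG_EHANDLER, no unwind codes), followed by the
# handler RVA and the two label RVAs that ssse3_handler reads back as
# HandlerData[0] and HandlerData[1].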
}
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;
  $rex|=0x04 if($dst>=8);
  $rex|=0x01 if($src>=8);
  unshift @opcode,$rex|0x40 if($rex);
}

sub sha1rnds4 {
  if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my @opcode=(0x0f,0x3a,0xcc);
    rex(\@opcode,$3,$2);
    push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
    my $c=$1;
    push @opcode,$c=~/^0/?oct($c):$c;
    return ".byte\t".join(',',@opcode);
  } else {
    return "sha1rnds4\t".@_[0];
  }
}
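# As a hand-checked example: the "# 0-3" round emitted above,
# "sha1rnds4 $0,%xmm9,%xmm8", should translate to ".byte 69,15,58,204,193,0",
# i.e. a REX prefix (0x45) for the two high registers, the 0F 3A CC opcode,
# a register-direct ModR/M byte and the immediate, printed as decimal byte
# values so that assemblers without SHA extension support still accept them.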
sub sha1op38 {
  my $instr = shift;
  my %opcodelet = (
	"sha1nexte" => 0xc8,
	"sha1msg1"  => 0xc9,
	"sha1msg2"  => 0xca );
  if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my @opcode=(0x0f,0x38);
    rex(\@opcode,$2,$1);
    push @opcode,$opcodelet{$instr};
    push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
    return ".byte\t".join(',',@opcode);
  } else {
    return $instr."\t".@_[0];
  }
}
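# Likewise for the two-operand helpers: "sha1nexte %xmm4,%xmm10" (MSG[1] into
# $E_ in the first round group) should come out as ".byte 68,15,56,200,212",
# i.e. REX.R (0x44), the 0F 38 C8 opcode and a register-direct ModR/M byte.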
sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);
  if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my %opcodelet = (
	"aesenc" => 0xdc,	"aesenclast" => 0xdd,
	"aesdec" => 0xde,	"aesdeclast" => 0xdf
    );
    return undef if (!defined($opcodelet{$1}));
    rex(\@opcode,$3,$2);
    push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
    unshift @opcode,0x66;
    return ".byte\t".join(',',@opcode);
  }
  return $line;
}
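# AES-NI encodings differ only in the mandatory 0x66 prefix and the
# 0F 38 DC..DF opcodes; as a hand-checked example, "aesenc %xmm0,%xmm2"
# (%xmm0 and %xmm2 being $rndkey[0] and $iv in the SHAEXT path) should be
# emitted as ".byte 102,15,56,220,208".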
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo	or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT;