aesni-x86.pl 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. #
  9. # This module implements support for Intel AES-NI extension. In
  10. # OpenSSL context it's used with Intel engine, but can also be used as
  11. # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
  12. # details].
  13. #
  14. # Performance.
  15. #
  16. # To start with see corresponding paragraph in aesni-x86_64.pl...
  17. # Instead of filling table similar to one found there I've chosen to
  18. # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
  19. # The simplified table below represents 32-bit performance relative
  20. # to 64-bit one in every given point. Ratios vary for different
  21. # encryption modes, therefore interval values.
  22. #
  23. # 16-byte 64-byte 256-byte 1-KB 8-KB
  24. # 53-67% 67-84% 91-94% 95-98% 97-99.5%
  25. #
  26. # Lower ratios for smaller block sizes are perfectly understandable,
  27. # because function call overhead is higher in 32-bit mode. Largest
  28. # 8-KB block performance is virtually same: 32-bit code is less than
  29. # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
  30. # January 2011
  31. #
  32. # See aesni-x86_64.pl for details. Unlike x86_64 version this module
  33. # interleaves at most 6 aes[enc|dec] instructions, because there are
  34. # not enough registers for 8x interleave [which should be optimal for
  35. # Sandy Bridge]. Actually, performance results for 6x interleave
  36. # factor presented in aesni-x86_64.pl (except for CTR) are for this
  37. # module.
  38. # April 2011
  39. #
  40. # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
  41. # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
  42. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
  43. # generates drop-in replacement for
  44. # crypto/aes/asm/aes-586.pl:-)
  45. $inline=1; # inline _aesni_[en|de]crypt
  46. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  47. push(@INC,"${dir}","${dir}../../perlasm");
  48. require "x86asm.pl";
  49. &asm_init($ARGV[0],$0);
  50. if ($PREFIX eq "aesni") { $movekey=*movups; }
  51. else { $movekey=*movups; }
  52. $len="eax";
  53. $rounds="ecx";
  54. $key="edx";
  55. $inp="esi";
  56. $out="edi";
  57. $rounds_="ebx"; # backup copy for $rounds
  58. $key_="ebp"; # backup copy for $key
  59. $rndkey0="xmm0";
  60. $rndkey1="xmm1";
  61. $inout0="xmm2";
  62. $inout1="xmm3";
  63. $inout2="xmm4";
  64. $inout3="xmm5"; $in1="xmm5";
  65. $inout4="xmm6"; $in0="xmm6";
  66. $inout5="xmm7"; $ivec="xmm7";
  67. # AESNI extension
  68. sub aeskeygenassist
  69. { my($dst,$src,$imm)=@_;
  70. if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
  71. { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
  72. }
  73. sub aescommon
  74. { my($opcodelet,$dst,$src)=@_;
  75. if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
  76. { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
  77. }
  78. sub aesimc { aescommon(0xdb,@_); }
  79. sub aesenc { aescommon(0xdc,@_); }
  80. sub aesenclast { aescommon(0xdd,@_); }
  81. sub aesdec { aescommon(0xde,@_); }
  82. sub aesdeclast { aescommon(0xdf,@_); }
  83. # Inline version of internal aesni_[en|de]crypt1
  84. { my $sn;
  85. sub aesni_inline_generate1
  86. { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  87. $sn++;
  88. &$movekey ($rndkey0,&QWP(0,$key));
  89. &$movekey ($rndkey1,&QWP(16,$key));
  90. &xorps ($ivec,$rndkey0) if (defined($ivec));
  91. &lea ($key,&DWP(32,$key));
  92. &xorps ($inout,$ivec) if (defined($ivec));
  93. &xorps ($inout,$rndkey0) if (!defined($ivec));
  94. &set_label("${p}1_loop_$sn");
  95. eval"&aes${p} ($inout,$rndkey1)";
  96. &dec ($rounds);
  97. &$movekey ($rndkey1,&QWP(0,$key));
  98. &lea ($key,&DWP(16,$key));
  99. &jnz (&label("${p}1_loop_$sn"));
  100. eval"&aes${p}last ($inout,$rndkey1)";
  101. }}
  # aesni_generate1($p,$inout) emits the private routine _aesni_${p}rypt1;
  # callers in this file pass "enc" or "dec" as $p.  $inout names the register
  # that is processed in place and defaults to $inout0.  The generated code
  # reads round keys through $key, dispatches on $rounds, and overwrites $key,
  # $rndkey0 and $rndkey1.
  102. sub aesni_generate1 # fully unrolled loop
  103. { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  104. &function_begin_B("_aesni_${p}rypt1");
  105. &movups ($rndkey0,&QWP(0,$key));
  106. &$movekey ($rndkey1,&QWP(0x10,$key));
  # xor the block with the first round key
  107. &xorps ($inout,$rndkey0);
  108. &$movekey ($rndkey0,&QWP(0x20,$key));
  109. &lea ($key,&DWP(0x30,$key));
  # $rounds below 11 goes straight to the ${p}128 tail, $rounds equal to 11
  # enters at ${p}192, anything larger falls through.  lea does not modify
  # the flags, so the je below still tests the cmp result.
  110. &cmp ($rounds,11);
  111. &jb (&label("${p}128"));
  112. &lea ($key,&DWP(0x20,$key));
  113. &je (&label("${p}192"));
  114. &lea ($key,&DWP(0x20,$key));
  # fall-through path: two extra rounds before joining ${p}192
  115. eval"&aes${p} ($inout,$rndkey1)";
  116. &$movekey ($rndkey1,&QWP(-0x40,$key));
  117. eval"&aes${p} ($inout,$rndkey0)";
  118. &$movekey ($rndkey0,&QWP(-0x30,$key));
  # ${p}192 entry: two extra rounds before joining ${p}128
  119. &set_label("${p}192");
  120. eval"&aes${p} ($inout,$rndkey1)";
  121. &$movekey ($rndkey1,&QWP(-0x20,$key));
  122. eval"&aes${p} ($inout,$rndkey0)";
  123. &$movekey ($rndkey0,&QWP(-0x10,$key));
  # common tail: nine aes${p} rounds alternating between $rndkey1 and
  # $rndkey0, followed by aes${p}last
  124. &set_label("${p}128");
  125. eval"&aes${p} ($inout,$rndkey1)";
  126. &$movekey ($rndkey1,&QWP(0,$key));
  127. eval"&aes${p} ($inout,$rndkey0)";
  128. &$movekey ($rndkey0,&QWP(0x10,$key));
  129. eval"&aes${p} ($inout,$rndkey1)";
  130. &$movekey ($rndkey1,&QWP(0x20,$key));
  131. eval"&aes${p} ($inout,$rndkey0)";
  132. &$movekey ($rndkey0,&QWP(0x30,$key));
  133. eval"&aes${p} ($inout,$rndkey1)";
  134. &$movekey ($rndkey1,&QWP(0x40,$key));
  135. eval"&aes${p} ($inout,$rndkey0)";
  136. &$movekey ($rndkey0,&QWP(0x50,$key));
  137. eval"&aes${p} ($inout,$rndkey1)";
  138. &$movekey ($rndkey1,&QWP(0x60,$key));
  139. eval"&aes${p} ($inout,$rndkey0)";
  140. &$movekey ($rndkey0,&QWP(0x70,$key));
  141. eval"&aes${p} ($inout,$rndkey1)";
  142. eval"&aes${p}last ($inout,$rndkey0)";
  143. &ret();
  144. &function_end_B("_aesni_${p}rypt1");
  145. }
  146. # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
  147. &aesni_generate1("enc") if (!$inline);
  148. &function_begin_B("${PREFIX}_encrypt");
  149. &mov ("eax",&wparam(0));
  150. &mov ($key,&wparam(2));
  151. &movups ($inout0,&QWP(0,"eax"));
  152. &mov ($rounds,&DWP(240,$key));
  153. &mov ("eax",&wparam(1));
  154. if ($inline)
  155. { &aesni_inline_generate1("enc"); }
  156. else
  157. { &call ("_aesni_encrypt1"); }
  158. &movups (&QWP(0,"eax"),$inout0);
  159. &ret ();
  160. &function_end_B("${PREFIX}_encrypt");
  161. # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
  162. &aesni_generate1("dec") if(!$inline);
  163. &function_begin_B("${PREFIX}_decrypt");
  164. &mov ("eax",&wparam(0));
  165. &mov ($key,&wparam(2));
  166. &movups ($inout0,&QWP(0,"eax"));
  167. &mov ($rounds,&DWP(240,$key));
  168. &mov ("eax",&wparam(1));
  169. if ($inline)
  170. { &aesni_inline_generate1("dec"); }
  171. else
  172. { &call ("_aesni_decrypt1"); }
  173. &movups (&QWP(0,"eax"),$inout0);
  174. &ret ();
  175. &function_end_B("${PREFIX}_decrypt");
  176. # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
  177. # factor. Why were 3x subroutines originally used in loops? Even though
  178. # aes[enc|dec] latency was originally 6, it could be scheduled only
  179. # every *2nd* cycle. Thus 3x interleave was the one providing optimal
  180. # utilization, i.e. when subroutine's throughput is virtually same as
  181. # of non-interleaved subroutine [for number of input blocks up to 3].
  182. # This is why it makes no sense to implement 2x subroutine.
  183. # aes[enc|dec] latency in next processor generation is 8, but the
  184. # instructions can be scheduled every cycle. Optimal interleave for
  185. # new processor is therefore 8x, but it's unfeasible to accommodate it
  186. # in XMM registers addressable in 32-bit mode and therefore 6x is
  187. # used instead...
  # aesni_generate3($p) emits the private routine _aesni_${p}rypt3, which runs
  # $inout0, $inout1 and $inout2 through the same round keys side by side.
  # The generated code halves $rounds, advances $key and overwrites
  # $rndkey0/$rndkey1, so callers that need those values must keep copies.
  188. sub aesni_generate3
  189. { my $p=shift;
  190. &function_begin_B("_aesni_${p}rypt3");
  191. &$movekey ($rndkey0,&QWP(0,$key));
  # the loop below applies two round keys per iteration, hence the halving
  192. &shr ($rounds,1);
  193. &$movekey ($rndkey1,&QWP(16,$key));
  194. &lea ($key,&DWP(32,$key));
  # xor all three blocks with the first round key
  195. &xorps ($inout0,$rndkey0);
  196. &pxor ($inout1,$rndkey0);
  197. &pxor ($inout2,$rndkey0);
  198. &$movekey ($rndkey0,&QWP(0,$key));
  199. &set_label("${p}3_loop");
  200. eval"&aes${p} ($inout0,$rndkey1)";
  201. eval"&aes${p} ($inout1,$rndkey1)";
  # the counter is decremented here; the jnz at the bottom of the loop
  # consumes the resulting flags
  202. &dec ($rounds);
  203. eval"&aes${p} ($inout2,$rndkey1)";
  204. &$movekey ($rndkey1,&QWP(16,$key));
  205. eval"&aes${p} ($inout0,$rndkey0)";
  206. eval"&aes${p} ($inout1,$rndkey0)";
  207. &lea ($key,&DWP(32,$key));
  208. eval"&aes${p} ($inout2,$rndkey0)";
  209. &$movekey ($rndkey0,&QWP(0,$key));
  210. &jnz (&label("${p}3_loop"));
  # after the loop: one more round with $rndkey1, then the final round
  # with $rndkey0
  211. eval"&aes${p} ($inout0,$rndkey1)";
  212. eval"&aes${p} ($inout1,$rndkey1)";
  213. eval"&aes${p} ($inout2,$rndkey1)";
  214. eval"&aes${p}last ($inout0,$rndkey0)";
  215. eval"&aes${p}last ($inout1,$rndkey0)";
  216. eval"&aes${p}last ($inout2,$rndkey0)";
  217. &ret();
  218. &function_end_B("_aesni_${p}rypt3");
  219. }
  220. # 4x interleave is implemented to improve small block performance,
  221. # most notably [and naturally] 4 block by ~30%. One can argue that one
  222. # should have implemented 5x as well, but improvement would be <20%,
  223. # so it's not worth it...
  # aesni_generate4($p) emits the private routine _aesni_${p}rypt4, which runs
  # $inout0..$inout3 through the same round keys side by side.  The generated
  # code halves $rounds, advances $key and overwrites $rndkey0/$rndkey1.
  224. sub aesni_generate4
  225. { my $p=shift;
  226. &function_begin_B("_aesni_${p}rypt4");
  227. &$movekey ($rndkey0,&QWP(0,$key));
  228. &$movekey ($rndkey1,&QWP(16,$key));
  # the loop below applies two round keys per iteration, hence the halving
  229. &shr ($rounds,1);
  230. &lea ($key,&DWP(32,$key));
  # xor all four blocks with the first round key
  231. &xorps ($inout0,$rndkey0);
  232. &pxor ($inout1,$rndkey0);
  233. &pxor ($inout2,$rndkey0);
  234. &pxor ($inout3,$rndkey0);
  235. &$movekey ($rndkey0,&QWP(0,$key));
  236. &set_label("${p}4_loop");
  237. eval"&aes${p} ($inout0,$rndkey1)";
  238. eval"&aes${p} ($inout1,$rndkey1)";
  # the counter is decremented here; the jnz at the bottom of the loop
  # consumes the resulting flags
  239. &dec ($rounds);
  240. eval"&aes${p} ($inout2,$rndkey1)";
  241. eval"&aes${p} ($inout3,$rndkey1)";
  242. &$movekey ($rndkey1,&QWP(16,$key));
  243. eval"&aes${p} ($inout0,$rndkey0)";
  244. eval"&aes${p} ($inout1,$rndkey0)";
  245. &lea ($key,&DWP(32,$key));
  246. eval"&aes${p} ($inout2,$rndkey0)";
  247. eval"&aes${p} ($inout3,$rndkey0)";
  248. &$movekey ($rndkey0,&QWP(0,$key));
  249. &jnz (&label("${p}4_loop"));
  # after the loop: one more round with $rndkey1, then the final round
  # with $rndkey0
  250. eval"&aes${p} ($inout0,$rndkey1)";
  251. eval"&aes${p} ($inout1,$rndkey1)";
  252. eval"&aes${p} ($inout2,$rndkey1)";
  253. eval"&aes${p} ($inout3,$rndkey1)";
  254. eval"&aes${p}last ($inout0,$rndkey0)";
  255. eval"&aes${p}last ($inout1,$rndkey0)";
  256. eval"&aes${p}last ($inout2,$rndkey0)";
  257. eval"&aes${p}last ($inout3,$rndkey0)";
  258. &ret();
  259. &function_end_B("_aesni_${p}rypt4");
  260. }
  # aesni_generate6($p) emits the private routine _aesni_${p}rypt6, which runs
  # $inout0..$inout5 through the same round keys side by side.  The first
  # round is interleaved with the initial round-key xor, after which control
  # jumps into the middle of the loop at _aesni_${p}rypt6_enter.  The
  # generated code halves $rounds, advances $key and overwrites
  # $rndkey0/$rndkey1.
  261. sub aesni_generate6
  262. { my $p=shift;
  263. &function_begin_B("_aesni_${p}rypt6");
  # NOTE(review): static_label presumably lets the entry label be referenced
  # from outside this function -- confirm against x86asm.pl
  264. &static_label("_aesni_${p}rypt6_enter");
  265. &$movekey ($rndkey0,&QWP(0,$key));
  # the loop below applies two round keys per iteration, hence the halving
  266. &shr ($rounds,1);
  267. &$movekey ($rndkey1,&QWP(16,$key));
  268. &lea ($key,&DWP(32,$key));
  # xor with the first round key, interleaved with the first $rndkey1 round
  269. &xorps ($inout0,$rndkey0);
  270. &pxor ($inout1,$rndkey0); # pxor does better here
  271. eval"&aes${p} ($inout0,$rndkey1)";
  272. &pxor ($inout2,$rndkey0);
  273. eval"&aes${p} ($inout1,$rndkey1)";
  274. &pxor ($inout3,$rndkey0);
  # this dec accounts for the round pair that is completed at the entry
  # label; its flags are consumed by the jnz at the bottom of the loop
  275. &dec ($rounds);
  276. eval"&aes${p} ($inout2,$rndkey1)";
  277. &pxor ($inout4,$rndkey0);
  278. eval"&aes${p} ($inout3,$rndkey1)";
  279. &pxor ($inout5,$rndkey0);
  280. eval"&aes${p} ($inout4,$rndkey1)";
  281. &$movekey ($rndkey0,&QWP(0,$key));
  282. eval"&aes${p} ($inout5,$rndkey1)";
  283. &jmp (&label("_aesni_${p}rypt6_enter"));
  # NOTE(review): the second set_label argument is presumably an alignment
  # request -- confirm against x86asm.pl
  284. &set_label("${p}6_loop",16);
  285. eval"&aes${p} ($inout0,$rndkey1)";
  286. eval"&aes${p} ($inout1,$rndkey1)";
  287. &dec ($rounds);
  288. eval"&aes${p} ($inout2,$rndkey1)";
  289. eval"&aes${p} ($inout3,$rndkey1)";
  290. eval"&aes${p} ($inout4,$rndkey1)";
  291. eval"&aes${p} ($inout5,$rndkey1)";
  # second half of each iteration: the $rndkey0 round
  292. &set_label("_aesni_${p}rypt6_enter",16);
  293. &$movekey ($rndkey1,&QWP(16,$key));
  294. eval"&aes${p} ($inout0,$rndkey0)";
  295. eval"&aes${p} ($inout1,$rndkey0)";
  296. &lea ($key,&DWP(32,$key));
  297. eval"&aes${p} ($inout2,$rndkey0)";
  298. eval"&aes${p} ($inout3,$rndkey0)";
  299. eval"&aes${p} ($inout4,$rndkey0)";
  300. eval"&aes${p} ($inout5,$rndkey0)";
  301. &$movekey ($rndkey0,&QWP(0,$key));
  302. &jnz (&label("${p}6_loop"));
  # after the loop: one more round with $rndkey1, then the final round
  # with $rndkey0
  303. eval"&aes${p} ($inout0,$rndkey1)";
  304. eval"&aes${p} ($inout1,$rndkey1)";
  305. eval"&aes${p} ($inout2,$rndkey1)";
  306. eval"&aes${p} ($inout3,$rndkey1)";
  307. eval"&aes${p} ($inout4,$rndkey1)";
  308. eval"&aes${p} ($inout5,$rndkey1)";
  309. eval"&aes${p}last ($inout0,$rndkey0)";
  310. eval"&aes${p}last ($inout1,$rndkey0)";
  311. eval"&aes${p}last ($inout2,$rndkey0)";
  312. eval"&aes${p}last ($inout3,$rndkey0)";
  313. eval"&aes${p}last ($inout4,$rndkey0)";
  314. eval"&aes${p}last ($inout5,$rndkey0)";
  315. &ret();
  316. &function_end_B("_aesni_${p}rypt6");
  317. }
  318. &aesni_generate3("enc") if ($PREFIX eq "aesni");
  319. &aesni_generate3("dec");
  320. &aesni_generate4("enc") if ($PREFIX eq "aesni");
  321. &aesni_generate4("dec");
  322. &aesni_generate6("enc") if ($PREFIX eq "aesni");
  323. &aesni_generate6("dec");
  324. if ($PREFIX eq "aesni") {
  325. ######################################################################
  326. # void aesni_ecb_encrypt (const void *in, void *out,
  327. # size_t length, const AES_KEY *key,
  328. # int enc);
  # aesni_ecb_encrypt: arguments are fetched with wparam(0..4) into $inp,
  # $out, $len, $key and $rounds_ (the enc flag).  $len is rounded down to a
  # multiple of 16; a zero enc flag selects the decrypt path.  Both paths
  # process 0x60 bytes (six 16-byte blocks) per main-loop iteration and
  # finish with a tail that handles one to five remaining blocks.
  329. &function_begin("aesni_ecb_encrypt");
  330. &mov ($inp,&wparam(0));
  331. &mov ($out,&wparam(1));
  332. &mov ($len,&wparam(2));
  333. &mov ($key,&wparam(3));
  334. &mov ($rounds_,&wparam(4));
  # drop any partial trailing block; nothing to do if no full block remains
  335. &and ($len,-16);
  336. &jz (&label("ecb_ret"));
  337. &mov ($rounds,&DWP(240,$key));
  338. &test ($rounds_,$rounds_);
  339. &jz (&label("ecb_decrypt"));
  # $key and $rounds are modified by the _aesni_encryptN helpers, so copies
  # are kept in $key_ and $rounds_
  340. &mov ($key_,$key); # backup $key
  341. &mov ($rounds_,$rounds); # backup $rounds
  342. &cmp ($len,0x60);
  343. &jb (&label("ecb_enc_tail"));
  # load the first six blocks and enter the loop at the call
  344. &movdqu ($inout0,&QWP(0,$inp));
  345. &movdqu ($inout1,&QWP(0x10,$inp));
  346. &movdqu ($inout2,&QWP(0x20,$inp));
  347. &movdqu ($inout3,&QWP(0x30,$inp));
  348. &movdqu ($inout4,&QWP(0x40,$inp));
  349. &movdqu ($inout5,&QWP(0x50,$inp));
  350. &lea ($inp,&DWP(0x60,$inp));
  351. &sub ($len,0x60);
  352. &jmp (&label("ecb_enc_loop6_enter"));
  # main loop: store the previous six results while loading the next six
  # inputs
  353. &set_label("ecb_enc_loop6",16);
  354. &movups (&QWP(0,$out),$inout0);
  355. &movdqu ($inout0,&QWP(0,$inp));
  356. &movups (&QWP(0x10,$out),$inout1);
  357. &movdqu ($inout1,&QWP(0x10,$inp));
  358. &movups (&QWP(0x20,$out),$inout2);
  359. &movdqu ($inout2,&QWP(0x20,$inp));
  360. &movups (&QWP(0x30,$out),$inout3);
  361. &movdqu ($inout3,&QWP(0x30,$inp));
  362. &movups (&QWP(0x40,$out),$inout4);
  363. &movdqu ($inout4,&QWP(0x40,$inp));
  364. &movups (&QWP(0x50,$out),$inout5);
  365. &lea ($out,&DWP(0x60,$out));
  366. &movdqu ($inout5,&QWP(0x50,$inp));
  367. &lea ($inp,&DWP(0x60,$inp));
  368. &set_label("ecb_enc_loop6_enter");
  369. &call ("_aesni_encrypt6");
  370. &mov ($key,$key_); # restore $key
  371. &mov ($rounds,$rounds_); # restore $rounds
  # keep looping while another full group of six blocks is available
  372. &sub ($len,0x60);
  373. &jnc (&label("ecb_enc_loop6"));
  # store the last group of six results
  374. &movups (&QWP(0,$out),$inout0);
  375. &movups (&QWP(0x10,$out),$inout1);
  376. &movups (&QWP(0x20,$out),$inout2);
  377. &movups (&QWP(0x30,$out),$inout3);
  378. &movups (&QWP(0x40,$out),$inout4);
  379. &movups (&QWP(0x50,$out),$inout5);
  380. &lea ($out,&DWP(0x60,$out));
  # undo the last subtraction to get the number of bytes still pending
  381. &add ($len,0x60);
  382. &jz (&label("ecb_ret"));
  # tail: dispatch on the number of remaining blocks (one to five), loading
  # inputs between the compare and the dependent branches
  383. &set_label("ecb_enc_tail");
  384. &movups ($inout0,&QWP(0,$inp));
  385. &cmp ($len,0x20);
  386. &jb (&label("ecb_enc_one"));
  387. &movups ($inout1,&QWP(0x10,$inp));
  388. &je (&label("ecb_enc_two"));
  389. &movups ($inout2,&QWP(0x20,$inp));
  390. &cmp ($len,0x40);
  391. &jb (&label("ecb_enc_three"));
  392. &movups ($inout3,&QWP(0x30,$inp));
  393. &je (&label("ecb_enc_four"));
  # five blocks: the sixth register is zeroed and the 6x helper is used;
  # only five results are stored
  394. &movups ($inout4,&QWP(0x40,$inp));
  395. &xorps ($inout5,$inout5);
  396. &call ("_aesni_encrypt6");
  397. &movups (&QWP(0,$out),$inout0);
  398. &movups (&QWP(0x10,$out),$inout1);
  399. &movups (&QWP(0x20,$out),$inout2);
  400. &movups (&QWP(0x30,$out),$inout3);
  401. &movups (&QWP(0x40,$out),$inout4);
  # NOTE(review): this call lacks the & sigil used everywhere else in the
  # file; presumably it still resolves as a function call because of the
  # parentheses -- confirm, and consider adding the sigil for consistency
  402. jmp (&label("ecb_ret"));
  403. &set_label("ecb_enc_one",16);
  404. if ($inline)
  405. { &aesni_inline_generate1("enc"); }
  406. else
  407. { &call ("_aesni_encrypt1"); }
  408. &movups (&QWP(0,$out),$inout0);
  409. &jmp (&label("ecb_ret"));
  # two blocks: the third register is zeroed and the 3x helper is used
  410. &set_label("ecb_enc_two",16);
  411. &xorps ($inout2,$inout2);
  412. &call ("_aesni_encrypt3");
  413. &movups (&QWP(0,$out),$inout0);
  414. &movups (&QWP(0x10,$out),$inout1);
  415. &jmp (&label("ecb_ret"));
  416. &set_label("ecb_enc_three",16);
  417. &call ("_aesni_encrypt3");
  418. &movups (&QWP(0,$out),$inout0);
  419. &movups (&QWP(0x10,$out),$inout1);
  420. &movups (&QWP(0x20,$out),$inout2);
  421. &jmp (&label("ecb_ret"));
  422. &set_label("ecb_enc_four",16);
  423. &call ("_aesni_encrypt4");
  424. &movups (&QWP(0,$out),$inout0);
  425. &movups (&QWP(0x10,$out),$inout1);
  426. &movups (&QWP(0x20,$out),$inout2);
  427. &movups (&QWP(0x30,$out),$inout3);
  428. &jmp (&label("ecb_ret"));
  429. ######################################################################
  # decrypt path: same structure as the encrypt path above, using the
  # _aesni_decryptN helpers
  430. &set_label("ecb_decrypt",16);
  431. &mov ($key_,$key); # backup $key
  432. &mov ($rounds_,$rounds); # backup $rounds
  433. &cmp ($len,0x60);
  434. &jb (&label("ecb_dec_tail"));
  435. &movdqu ($inout0,&QWP(0,$inp));
  436. &movdqu ($inout1,&QWP(0x10,$inp));
  437. &movdqu ($inout2,&QWP(0x20,$inp));
  438. &movdqu ($inout3,&QWP(0x30,$inp));
  439. &movdqu ($inout4,&QWP(0x40,$inp));
  440. &movdqu ($inout5,&QWP(0x50,$inp));
  441. &lea ($inp,&DWP(0x60,$inp));
  442. &sub ($len,0x60);
  443. &jmp (&label("ecb_dec_loop6_enter"));
  444. &set_label("ecb_dec_loop6",16);
  445. &movups (&QWP(0,$out),$inout0);
  446. &movdqu ($inout0,&QWP(0,$inp));
  447. &movups (&QWP(0x10,$out),$inout1);
  448. &movdqu ($inout1,&QWP(0x10,$inp));
  449. &movups (&QWP(0x20,$out),$inout2);
  450. &movdqu ($inout2,&QWP(0x20,$inp));
  451. &movups (&QWP(0x30,$out),$inout3);
  452. &movdqu ($inout3,&QWP(0x30,$inp));
  453. &movups (&QWP(0x40,$out),$inout4);
  454. &movdqu ($inout4,&QWP(0x40,$inp));
  455. &movups (&QWP(0x50,$out),$inout5);
  456. &lea ($out,&DWP(0x60,$out));
  457. &movdqu ($inout5,&QWP(0x50,$inp));
  458. &lea ($inp,&DWP(0x60,$inp));
  459. &set_label("ecb_dec_loop6_enter");
  460. &call ("_aesni_decrypt6");
  461. &mov ($key,$key_); # restore $key
  462. &mov ($rounds,$rounds_); # restore $rounds
  463. &sub ($len,0x60);
  464. &jnc (&label("ecb_dec_loop6"));
  465. &movups (&QWP(0,$out),$inout0);
  466. &movups (&QWP(0x10,$out),$inout1);
  467. &movups (&QWP(0x20,$out),$inout2);
  468. &movups (&QWP(0x30,$out),$inout3);
  469. &movups (&QWP(0x40,$out),$inout4);
  470. &movups (&QWP(0x50,$out),$inout5);
  471. &lea ($out,&DWP(0x60,$out));
  472. &add ($len,0x60);
  473. &jz (&label("ecb_ret"));
  474. &set_label("ecb_dec_tail");
  475. &movups ($inout0,&QWP(0,$inp));
  476. &cmp ($len,0x20);
  477. &jb (&label("ecb_dec_one"));
  478. &movups ($inout1,&QWP(0x10,$inp));
  479. &je (&label("ecb_dec_two"));
  480. &movups ($inout2,&QWP(0x20,$inp));
  481. &cmp ($len,0x40);
  482. &jb (&label("ecb_dec_three"));
  483. &movups ($inout3,&QWP(0x30,$inp));
  484. &je (&label("ecb_dec_four"));
  485. &movups ($inout4,&QWP(0x40,$inp));
  486. &xorps ($inout5,$inout5);
  487. &call ("_aesni_decrypt6");
  488. &movups (&QWP(0,$out),$inout0);
  489. &movups (&QWP(0x10,$out),$inout1);
  490. &movups (&QWP(0x20,$out),$inout2);
  491. &movups (&QWP(0x30,$out),$inout3);
  492. &movups (&QWP(0x40,$out),$inout4);
  493. &jmp (&label("ecb_ret"));
  494. &set_label("ecb_dec_one",16);
  495. if ($inline)
  496. { &aesni_inline_generate1("dec"); }
  497. else
  498. { &call ("_aesni_decrypt1"); }
  499. &movups (&QWP(0,$out),$inout0);
  500. &jmp (&label("ecb_ret"));
  501. &set_label("ecb_dec_two",16);
  502. &xorps ($inout2,$inout2);
  503. &call ("_aesni_decrypt3");
  504. &movups (&QWP(0,$out),$inout0);
  505. &movups (&QWP(0x10,$out),$inout1);
  506. &jmp (&label("ecb_ret"));
  507. &set_label("ecb_dec_three",16);
  508. &call ("_aesni_decrypt3");
  509. &movups (&QWP(0,$out),$inout0);
  510. &movups (&QWP(0x10,$out),$inout1);
  511. &movups (&QWP(0x20,$out),$inout2);
  512. &jmp (&label("ecb_ret"));
  513. &set_label("ecb_dec_four",16);
  514. &call ("_aesni_decrypt4");
  515. &movups (&QWP(0,$out),$inout0);
  516. &movups (&QWP(0x10,$out),$inout1);
  517. &movups (&QWP(0x20,$out),$inout2);
  518. &movups (&QWP(0x30,$out),$inout3);
  # the four-block decrypt case falls through to the common exit
  519. &set_label("ecb_ret");
  520. &function_end("aesni_ecb_encrypt");
  521. ######################################################################
  522. # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
  523. # size_t blocks, const AES_KEY *key,
  524. # const char *ivec,char *cmac);
  525. #
  526. # Handles only complete blocks, operates on 64-bit counter and
  527. # does not update *ivec! Nor does it finalize CMAC value
  528. # (see engine/eng_aesni.c for details)
  529. #
  530. { my $cmac=$inout1;
  # aesni_ccm64_encrypt_blocks: arguments are fetched with wparam(0..5) into
  # $inp, $out, $len, $key, $rounds_ and $rounds.  The last two only hold the
  # ivec and cmac pointers until both values have been loaded into $ivec and
  # $cmac; afterwards $rounds receives the round count from offset 240 of
  # the key.  One 16-byte block is processed per outer-loop iteration and
  # $len counts those iterations.
  531. &function_begin("aesni_ccm64_encrypt_blocks");
  532. &mov ($inp,&wparam(0));
  533. &mov ($out,&wparam(1));
  534. &mov ($len,&wparam(2));
  535. &mov ($key,&wparam(3));
  536. &mov ($rounds_,&wparam(4));
  537. &mov ($rounds,&wparam(5));
  # carve out a 16-byte aligned scratch area; the caller's esp is kept at
  # 48(esp) and restored before returning
  538. &mov ($key_,"esp");
  539. &sub ("esp",60);
  540. &and ("esp",-16); # align stack
  541. &mov (&DWP(48,"esp"),$key_);
  542. &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
  543. &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
  544. &mov ($rounds,&DWP(240,$key));
  545. # compose byte-swap control mask for pshufb on stack
  546. &mov (&DWP(0,"esp"),0x0c0d0e0f);
  547. &mov (&DWP(4,"esp"),0x08090a0b);
  548. &mov (&DWP(8,"esp"),0x04050607);
  549. &mov (&DWP(12,"esp"),0x00010203);
  550. # compose counter increment vector on stack
  # the vector at 16(esp) is the dwords 1,0,0,0
  551. &mov ($rounds_,1);
  552. &xor ($key_,$key_);
  553. &mov (&DWP(16,"esp"),$rounds_);
  554. &mov (&DWP(20,"esp"),$key_);
  555. &mov (&DWP(24,"esp"),$key_);
  556. &mov (&DWP(28,"esp"),$key_);
  # the inner loop applies two round keys per iteration, hence the halving;
  # $key_ and $rounds_ keep the values that are reloaded for every block
  557. &shr ($rounds,1);
  558. &lea ($key_,&DWP(0,$key));
  559. &movdqa ($inout0,$ivec);
  560. &mov ($rounds_,$rounds);
  # $inout3 holds the byte-swap mask for the rest of the function
  561. &movdqa ($inout3,&QWP(0,"esp"));
  562. &set_label("ccm64_enc_outer");
  563. &$movekey ($rndkey0,&QWP(0,$key_));
  564. &mov ($rounds,$rounds_);
  565. &movups ($in0,&QWP(0,$inp));
  566. &xorps ($inout0,$rndkey0);
  567. &$movekey ($rndkey1,&QWP(16,$key_));
  # $rndkey0 becomes (first round key ^ input), so the next xorps both
  # folds the input into $cmac and applies the first round key
  568. &xorps ($rndkey0,$in0);
  569. &lea ($key,&DWP(32,$key_));
  570. &xorps ($cmac,$rndkey0); # cmac^=inp
  571. &$movekey ($rndkey0,&QWP(0,$key));
  # inner loop: the counter block in $inout0 and $cmac go through the same
  # rounds side by side
  572. &set_label("ccm64_enc2_loop");
  573. &aesenc ($inout0,$rndkey1);
  574. &dec ($rounds);
  575. &aesenc ($cmac,$rndkey1);
  576. &$movekey ($rndkey1,&QWP(16,$key));
  577. &aesenc ($inout0,$rndkey0);
  578. &lea ($key,&DWP(32,$key));
  579. &aesenc ($cmac,$rndkey0);
  580. &$movekey ($rndkey0,&QWP(0,$key));
  581. &jnz (&label("ccm64_enc2_loop"));
  # byte-swap $ivec, add the increment vector, and swap back further down
  582. &pshufb ($ivec,$inout3);
  583. &aesenc ($inout0,$rndkey1);
  584. &aesenc ($cmac,$rndkey1);
  585. &paddq ($ivec,&QWP(16,"esp"));
  586. &aesenclast ($inout0,$rndkey0);
  587. &aesenclast ($cmac,$rndkey0);
  # the flags from this dec are consumed by the jnz that closes the outer loop
  588. &dec ($len);
  589. &lea ($inp,&DWP(16,$inp));
  590. &xorps ($in0,$inout0); # inp^=E(ivec)
  591. &movdqa ($inout0,$ivec);
  592. &movups (&QWP(0,$out),$in0); # save output
  593. &lea ($out,&DWP(16,$out));
  594. &pshufb ($ivec,$inout3);
  595. &jnz (&label("ccm64_enc_outer"));
  # restore the caller's stack pointer, then write $cmac back through the
  # sixth argument
  596. &mov ("esp",&DWP(48,"esp"));
  597. &mov ($out,&wparam(5));
  598. &movups (&QWP(0,$out),$cmac);
  599. &function_end("aesni_ccm64_encrypt_blocks");
  # aesni_ccm64_decrypt_blocks
  #
  # Decrypts whole 16-byte blocks: every input block is XORed with the
  # encrypted counter block, and every recovered output block is XORed into
  # $cmac, which is then encrypted with the same key schedule.
  #
  # Arguments, as read through wparam():
  #   0 input pointer, 1 output pointer, 2 block count,
  #   3 key schedule (the round count is read at offset 240),
  #   4 pointer to the 16-byte counter block (ivec),
  #   5 pointer to the 16-byte cmac value (loaded on entry, stored on exit).
  # The block count must be at least 1: the first block is processed before
  # the count is tested.
  #
  # stack layout:
  # 0 pshufb mask (reverses all 16 bytes)
  # 16 counter increment vector: dword 0 is 1, the others are 0
  # 48 saved %esp
  #
  # Counter handling: $ivec is byte-reversed once and stays reversed for the
  # whole run, so every paddq bumps the 64-bit counter held in its low qword.
  # Only the copy moved into $inout0 is reversed back before it is encrypted.
  # Swapping $ivec itself back and forth (as this routine used to do) left it
  # in memory byte order when the in-loop paddq ran, which corrupted every
  # counter block from the third one on.
  600. &function_begin("aesni_ccm64_decrypt_blocks");
  601. &mov ($inp,&wparam(0));
  602. &mov ($out,&wparam(1));
  603. &mov ($len,&wparam(2));
  604. &mov ($key,&wparam(3));
  # $rounds_ and $rounds merely carry the ivec and cmac pointers until the two
  # loads below; both registers are reused afterwards.
  605. &mov ($rounds_,&wparam(4));
  606. &mov ($rounds,&wparam(5));
  607. &mov ($key_,"esp");
  608. &sub ("esp",60);
  609. &and ("esp",-16); # align stack
  610. &mov (&DWP(48,"esp"),$key_);
  611. &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
  612. &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
  613. &mov ($rounds,&DWP(240,$key));
  614. # compose byte-swap control mask for pshufb on stack
  615. &mov (&DWP(0,"esp"),0x0c0d0e0f);
  616. &mov (&DWP(4,"esp"),0x08090a0b);
  617. &mov (&DWP(8,"esp"),0x04050607);
  618. &mov (&DWP(12,"esp"),0x00010203);
  619. # compose counter increment vector on stack
  620. &mov ($rounds_,1);
  621. &xor ($key_,$key_);
  622. &mov (&DWP(16,"esp"),$rounds_);
  623. &mov (&DWP(20,"esp"),$key_);
  624. &mov (&DWP(24,"esp"),$key_);
  625. &mov (&DWP(28,"esp"),$key_);
  626. &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
  627. &movdqa ($inout0,$ivec); # first counter block, still in memory byte order
  628. &mov ($key_,$key); # backup $key
  629. &mov ($rounds_,$rounds); # backup $rounds
  630. &pshufb ($ivec,$inout3); # reverse once; $ivec stays reversed from here on
  # encrypt the first counter block on its own; there is no output to fold
  # into $cmac yet
  631. if ($inline)
  632. { &aesni_inline_generate1("enc"); }
  633. else
  634. { &call ("_aesni_encrypt1"); }
  635. &movups ($in0,&QWP(0,$inp)); # load inp
  636. &paddq ($ivec,&QWP(16,"esp")); # counter+1 (reversed form)
  638. &lea ($inp,&QWP(16,$inp));
  639. &jmp (&label("ccm64_dec_outer"));
  # On entry to each pass: $inout0 = encrypted counter block for the block
  # held in $in0, $ivec = next counter value in reversed form.
  640. &set_label("ccm64_dec_outer",16);
  641. &xorps ($in0,$inout0); # inp ^= E(ivec)
  642. &movdqa ($inout0,$ivec); # next counter, reversed form
  643. &mov ($rounds,$rounds_);
  644. &movups (&QWP(0,$out),$in0); # save output
  645. &lea ($out,&DWP(16,$out));
  &pshufb ($inout0,$inout3); # back to memory byte order for encryption
  646. &sub ($len,1);
  647. &jz (&label("ccm64_dec_break"));
  648. &$movekey ($rndkey0,&QWP(0,$key_));
  649. &shr ($rounds,1); # the loop below performs two rounds per pass
  650. &$movekey ($rndkey1,&QWP(16,$key_));
  651. &xorps ($in0,$rndkey0);
  652. &lea ($key,&DWP(32,$key_));
  653. &xorps ($inout0,$rndkey0);
  654. &xorps ($cmac,$in0); # cmac^=out
  655. &$movekey ($rndkey0,&QWP(0,$key));
  # encrypt the next counter block and the updated cmac side by side
  656. &set_label("ccm64_dec2_loop");
  657. &aesenc ($inout0,$rndkey1);
  658. &dec ($rounds);
  659. &aesenc ($cmac,$rndkey1);
  660. &$movekey ($rndkey1,&QWP(16,$key));
  661. &aesenc ($inout0,$rndkey0);
  662. &lea ($key,&DWP(32,$key));
  663. &aesenc ($cmac,$rndkey0);
  664. &$movekey ($rndkey0,&QWP(0,$key));
  665. &jnz (&label("ccm64_dec2_loop"));
  666. &movups ($in0,&QWP(0,$inp)); # load inp
  667. &paddq ($ivec,&QWP(16,"esp")); # $ivec is reversed, so this bumps the counter
  668. &aesenc ($inout0,$rndkey1);
  669. &aesenc ($cmac,$rndkey1);
  671. &lea ($inp,&QWP(16,$inp));
  672. &aesenclast ($inout0,$rndkey0);
  673. &aesenclast ($cmac,$rndkey0);
  674. &jmp (&label("ccm64_dec_outer"));
  # Last block has been written; fold it into $cmac and encrypt $cmac once
  # more. $rounds already holds the full round count restored in the outer
  # loop.
  # NOTE(review): the non-inline branch only passes $cmac as an extra argument
  # to &call; whether _aesni_encrypt1 then operates on $cmac and folds in $in0
  # cannot be confirmed from this part of the file.
  675. &set_label("ccm64_dec_break",16);
  676. &mov ($key,$key_);
  677. if ($inline)
  678. { &aesni_inline_generate1("enc",$cmac,$in0); }
  679. else
  680. { &call ("_aesni_encrypt1",$cmac); }
  681. &mov ("esp",&DWP(48,"esp")); # restore %esp
  682. &mov ($out,&wparam(5));
  683. &movups (&QWP(0,$out),$cmac); # store updated cmac
  684. &function_end("aesni_ccm64_decrypt_blocks");
  685. }
  686. ######################################################################
  687. # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
  688. # size_t blocks, const AES_KEY *key,
  689. # const char *ivec);
  690. #
  691. # Handles only complete blocks, operates on 32-bit counter and
  692. # does not update *ivec! (see engine/eng_aesni.c for details)
  693. #
  694. # stack layout:
  695. # 0 pshufb mask
  696. # 16 vector addend: 0,6,6,6
  697. # 32 counter-less ivec
  698. # 48 1st triplet of counter vector
  699. # 64 2nd triplet of counter vector
  700. # 80 saved %esp
  # Arguments, as read through wparam(): 0 input pointer, 1 output pointer,
  # 2 block count, 3 key schedule (round count read at offset 240),
  # 4 pointer to the 16-byte ivec (held in $rounds_ until it has been loaded).
  #
  # Flow: a single block takes a shortcut that encrypts the ivec as loaded.
  # Otherwise six counter values are kept on the stack as two vectors of three
  # native-order dwords; the main loop handles six blocks per pass and a tail
  # dispatch handles the remaining one to five blocks.
  # NOTE(review): a block count of 0 is not special-cased; it misses the
  # shortcut, takes the "fewer than 6" branch and ends up in ctr32_one, which
  # processes one block.
  701. &function_begin("aesni_ctr32_encrypt_blocks");
  702. &mov ($inp,&wparam(0));
  703. &mov ($out,&wparam(1));
  704. &mov ($len,&wparam(2));
  705. &mov ($key,&wparam(3));
  706. &mov ($rounds_,&wparam(4));
  707. &mov ($key_,"esp");
  708. &sub ("esp",88);
  709. &and ("esp",-16); # align stack
  710. &mov (&DWP(80,"esp"),$key_);
  711. &cmp ($len,1);
  712. &je (&label("ctr32_one_shortcut"));
  713. &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
  714. # compose byte-swap control mask for pshufb on stack
  # (as laid out, the mask reverses all 16 bytes of a vector)
  715. &mov (&DWP(0,"esp"),0x0c0d0e0f);
  716. &mov (&DWP(4,"esp"),0x08090a0b);
  717. &mov (&DWP(8,"esp"),0x04050607);
  718. &mov (&DWP(12,"esp"),0x00010203);
  719. # compose counter increment vector on stack
  # (dwords 0..2 hold 6, dword 3 holds 0)
  720. &mov ($rounds,6);
  721. &xor ($key_,$key_);
  722. &mov (&DWP(16,"esp"),$rounds);
  723. &mov (&DWP(20,"esp"),$rounds);
  724. &mov (&DWP(24,"esp"),$rounds);
  725. &mov (&DWP(28,"esp"),$key_);
  726. &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
  727. &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
  728. &mov ($rounds,&DWP(240,$key)); # key->rounds
  729. # compose 2 vectors of 3x32-bit counters
  # $rndkey1 receives counter+0,+1,+2 and $rndkey0 counter+3,+4,+5 in dwords
  # 0..2; dword 3 of both stays zero.
  730. &bswap ($rounds_);
  731. &pxor ($rndkey1,$rndkey1);
  732. &pxor ($rndkey0,$rndkey0);
  733. &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
  734. &pinsrd ($rndkey1,$rounds_,0);
  735. &lea ($key_,&DWP(3,$rounds_));
  736. &pinsrd ($rndkey0,$key_,0);
  737. &inc ($rounds_);
  738. &pinsrd ($rndkey1,$rounds_,1);
  739. &inc ($key_);
  740. &pinsrd ($rndkey0,$key_,1);
  741. &inc ($rounds_);
  742. &pinsrd ($rndkey1,$rounds_,2);
  743. &inc ($key_);
  744. &pinsrd ($rndkey0,$key_,2);
  745. &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
  746. &pshufb ($rndkey1,$inout0); # byte swap
  747. &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
  748. &pshufb ($rndkey0,$inout0); # byte swap
  # After the full reversal the three counters sit byte-swapped in dwords
  # 3,2,1 and dword 0 is zero. pshufd with k<<6 copies dword k to the top and
  # fills the lower three dwords from dword 0, i.e. with zeros.
  749. &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
  750. &pshufd ($inout1,$rndkey1,2<<6);
  751. &cmp ($len,6);
  752. &jb (&label("ctr32_tail"));
  753. &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
  754. &shr ($rounds,1);
  755. &mov ($key_,$key); # backup $key
  756. &mov ($rounds_,$rounds); # backup $rounds
  757. &sub ($len,6);
  758. &jmp (&label("ctr32_loop6"));
  # Six blocks per pass. $inout0 and $inout1 arrive already prepared, either
  # from the set-up above or from the end of the previous pass.
  759. &set_label("ctr32_loop6",16);
  760. &pshufd ($inout2,$rndkey1,1<<6);
  761. &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
  762. &pshufd ($inout3,$rndkey0,3<<6);
  763. &por ($inout0,$rndkey1); # merge counter-less ivec
  764. &pshufd ($inout4,$rndkey0,2<<6);
  765. &por ($inout1,$rndkey1);
  766. &pshufd ($inout5,$rndkey0,1<<6);
  767. &por ($inout2,$rndkey1);
  768. &por ($inout3,$rndkey1);
  769. &por ($inout4,$rndkey1);
  770. &por ($inout5,$rndkey1);
  771. # inlining _aesni_encrypt6's prologue gives ~4% improvement...
  772. &$movekey ($rndkey0,&QWP(0,$key_));
  773. &$movekey ($rndkey1,&QWP(16,$key_));
  774. &lea ($key,&DWP(32,$key_));
  775. &dec ($rounds);
  776. &pxor ($inout0,$rndkey0);
  777. &pxor ($inout1,$rndkey0);
  778. &aesenc ($inout0,$rndkey1);
  779. &pxor ($inout2,$rndkey0);
  780. &aesenc ($inout1,$rndkey1);
  781. &pxor ($inout3,$rndkey0);
  782. &aesenc ($inout2,$rndkey1);
  783. &pxor ($inout4,$rndkey0);
  784. &aesenc ($inout3,$rndkey1);
  785. &pxor ($inout5,$rndkey0);
  786. &aesenc ($inout4,$rndkey1);
  787. &$movekey ($rndkey0,&QWP(0,$key));
  788. &aesenc ($inout5,$rndkey1);
  789. &call (&label("_aesni_encrypt6_enter"));
  # XOR the six results with the input and store them; interleaved with that,
  # both counter triplets are advanced by 6, saved in native order, byte
  # swapped again, and $inout0/$inout1 are prepared for the next pass.
  790. &movups ($rndkey1,&QWP(0,$inp));
  791. &movups ($rndkey0,&QWP(0x10,$inp));
  792. &xorps ($inout0,$rndkey1);
  793. &movups ($rndkey1,&QWP(0x20,$inp));
  794. &xorps ($inout1,$rndkey0);
  795. &movups (&QWP(0,$out),$inout0);
  796. &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
  797. &xorps ($inout2,$rndkey1);
  798. &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
  799. &movups (&QWP(0x10,$out),$inout1);
  800. &movups (&QWP(0x20,$out),$inout2);
  801. &paddd ($rndkey1,$rndkey0); # 1st triplet increment
  802. &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
  803. &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
  804. &movups ($inout1,&QWP(0x30,$inp));
  805. &movups ($inout2,&QWP(0x40,$inp));
  806. &xorps ($inout3,$inout1);
  807. &movups ($inout1,&QWP(0x50,$inp));
  808. &lea ($inp,&DWP(0x60,$inp));
  809. &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
  810. &pshufb ($rndkey1,$inout0); # byte swap
  811. &xorps ($inout4,$inout2);
  812. &movups (&QWP(0x30,$out),$inout3);
  813. &xorps ($inout5,$inout1);
  814. &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
  815. &pshufb ($rndkey0,$inout0); # byte swap
  816. &movups (&QWP(0x40,$out),$inout4);
  817. &pshufd ($inout0,$rndkey1,3<<6);
  818. &movups (&QWP(0x50,$out),$inout5);
  819. &lea ($out,&DWP(0x60,$out));
  820. &mov ($rounds,$rounds_);
  821. &pshufd ($inout1,$rndkey1,2<<6);
  822. &sub ($len,6);
  823. &jnc (&label("ctr32_loop6"));
  # fewer than six blocks remain; undo the bias and leave if nothing is left
  824. &add ($len,6);
  825. &jz (&label("ctr32_ret"));
  826. &mov ($key,$key_);
  827. &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
  828. &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
  # Tail: $len is below 6 here. Counter blocks are completed one at a time and
  # control leaves for the matching handler as soon as enough are ready
  # (below 2 -> one, 2 -> two, 3 -> three, 4 -> four, otherwise five).
  829. &set_label("ctr32_tail");
  830. &por ($inout0,$inout5);
  831. &cmp ($len,2);
  832. &jb (&label("ctr32_one"));
  833. &pshufd ($inout2,$rndkey1,1<<6);
  834. &por ($inout1,$inout5);
  835. &je (&label("ctr32_two"));
  836. &pshufd ($inout3,$rndkey0,3<<6);
  837. &por ($inout2,$inout5);
  838. &cmp ($len,4);
  839. &jb (&label("ctr32_three"));
  840. &pshufd ($inout4,$rndkey0,2<<6);
  841. &por ($inout3,$inout5);
  842. &je (&label("ctr32_four"));
  # five blocks: only the first five results of the call are used
  843. &por ($inout4,$inout5);
  844. &call ("_aesni_encrypt6");
  845. &movups ($rndkey1,&QWP(0,$inp));
  846. &movups ($rndkey0,&QWP(0x10,$inp));
  847. &xorps ($inout0,$rndkey1);
  848. &movups ($rndkey1,&QWP(0x20,$inp));
  849. &xorps ($inout1,$rndkey0);
  850. &movups ($rndkey0,&QWP(0x30,$inp));
  851. &xorps ($inout2,$rndkey1);
  852. &movups ($rndkey1,&QWP(0x40,$inp));
  853. &xorps ($inout3,$rndkey0);
  854. &movups (&QWP(0,$out),$inout0);
  855. &xorps ($inout4,$rndkey1);
  856. &movups (&QWP(0x10,$out),$inout1);
  857. &movups (&QWP(0x20,$out),$inout2);
  858. &movups (&QWP(0x30,$out),$inout3);
  859. &movups (&QWP(0x40,$out),$inout4);
  860. &jmp (&label("ctr32_ret"));
  # single-block shortcut: $rounds_ still holds the ivec pointer, and the ivec
  # is encrypted exactly as loaded
  861. &set_label("ctr32_one_shortcut",16);
  862. &movups ($inout0,&QWP(0,$rounds_)); # load ivec
  863. &mov ($rounds,&DWP(240,$key));
  864. &set_label("ctr32_one");
  865. if ($inline)
  866. { &aesni_inline_generate1("enc"); }
  867. else
  868. { &call ("_aesni_encrypt1"); }
  869. &movups ($in0,&QWP(0,$inp));
  870. &xorps ($in0,$inout0);
  871. &movups (&QWP(0,$out),$in0);
  872. &jmp (&label("ctr32_ret"));
  # two blocks: only the first two results of the call are used
  873. &set_label("ctr32_two",16);
  874. &call ("_aesni_encrypt3");
  875. &movups ($inout3,&QWP(0,$inp));
  876. &movups ($inout4,&QWP(0x10,$inp));
  877. &xorps ($inout0,$inout3);
  878. &xorps ($inout1,$inout4);
  879. &movups (&QWP(0,$out),$inout0);
  880. &movups (&QWP(0x10,$out),$inout1);
  881. &jmp (&label("ctr32_ret"));
  882. &set_label("ctr32_three",16);
  883. &call ("_aesni_encrypt3");
  884. &movups ($inout3,&QWP(0,$inp));
  885. &movups ($inout4,&QWP(0x10,$inp));
  886. &xorps ($inout0,$inout3);
  887. &movups ($inout5,&QWP(0x20,$inp));
  888. &xorps ($inout1,$inout4);
  889. &movups (&QWP(0,$out),$inout0);
  890. &xorps ($inout2,$inout5);
  891. &movups (&QWP(0x10,$out),$inout1);
  892. &movups (&QWP(0x20,$out),$inout2);
  893. &jmp (&label("ctr32_ret"));
  894. &set_label("ctr32_four",16);
  895. &call ("_aesni_encrypt4");
  896. &movups ($inout4,&QWP(0,$inp));
  897. &movups ($inout5,&QWP(0x10,$inp));
  898. &movups ($rndkey1,&QWP(0x20,$inp));
  899. &xorps ($inout0,$inout4);
  900. &movups ($rndkey0,&QWP(0x30,$inp));
  901. &xorps ($inout1,$inout5);
  902. &movups (&QWP(0,$out),$inout0);
  903. &xorps ($inout2,$rndkey1);
  904. &movups (&QWP(0x10,$out),$inout1);
  905. &xorps ($inout3,$rndkey0);
  906. &movups (&QWP(0x20,$out),$inout2);
  907. &movups (&QWP(0x30,$out),$inout3);
  # common exit: put back the %esp saved in the prologue
  908. &set_label("ctr32_ret");
  909. &mov ("esp",&DWP(80,"esp"));
  910. &function_end("aesni_ctr32_encrypt_blocks");
  911. ######################################################################
  912. # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
  913. # const AES_KEY *key1, const AES_KEY *key2,
  914. # const unsigned char iv[16]);
  915. #
  916. { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
  # aesni_xts_encrypt
  #
  # Arguments, as read through wparam(): 0 input pointer, 1 output pointer,
  # 2 length in bytes, 3 key1, 4 key2, 5 pointer to the 16-byte clear-text
  # tweak. The tweak is first encrypted with key2; all data blocks are then
  # processed with key1.
  #
  # stack layout (16-byte aligned):
  # 16*0..16*5 tweaks of the blocks currently in flight
  # 16*6 constant used when doubling the tweak: dwords 0x87,0,1,0
  # 16*7+0 saved length (later length%16)
  # 16*7+4 saved %esp
  #
  # Tweak doubling, repeated throughout: pcmpgtd against zero turns the top
  # bit of every dword of $tweak into an all-ones/all-zeros dword; pshufd 0x13
  # moves the dword-3 flag to dword 0 and the dword-1 flag to dword 2; pand
  # with the constant leaves 0x87 and 1 there respectively; paddq doubles both
  # qwords and pxor folds the two corrections in.
  #
  # NOTE(review): a length below 16 is not rejected here; such a call reaches
  # xts_enc_steal, which addresses -16($out). Presumably callers guarantee at
  # least one full block -- confirm.
  917. &function_begin("aesni_xts_encrypt");
  918. &mov ($key,&wparam(4)); # key2
  919. &mov ($inp,&wparam(5)); # clear-text tweak
  920. &mov ($rounds,&DWP(240,$key)); # key2->rounds
  921. &movups ($inout0,&QWP(0,$inp));
  # encrypt the tweak with key2
  922. if ($inline)
  923. { &aesni_inline_generate1("enc"); }
  924. else
  925. { &call ("_aesni_encrypt1"); }
  926. &mov ($inp,&wparam(0));
  927. &mov ($out,&wparam(1));
  928. &mov ($len,&wparam(2));
  929. &mov ($key,&wparam(3)); # key1
  930. &mov ($key_,"esp");
  931. &sub ("esp",16*7+8);
  932. &mov ($rounds,&DWP(240,$key)); # key1->rounds
  933. &and ("esp",-16); # align stack
  934. &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
  935. &mov (&DWP(16*6+4,"esp"),0);
  936. &mov (&DWP(16*6+8,"esp"),1);
  937. &mov (&DWP(16*6+12,"esp"),0);
  938. &mov (&DWP(16*7+0,"esp"),$len); # save original $len
  939. &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
  940. &movdqa ($tweak,$inout0);
  941. &pxor ($twtmp,$twtmp);
  942. &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
  943. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  944. &and ($len,-16);
  945. &mov ($key_,$key); # backup $key
  946. &mov ($rounds_,$rounds); # backup $rounds
  947. &sub ($len,16*6);
  948. &jc (&label("xts_enc_short"));
  # at least six full blocks: the 6x loop works with the halved round count
  949. &shr ($rounds,1);
  950. &mov ($rounds_,$rounds);
  951. &jmp (&label("xts_enc_loop6"));
  952. &set_label("xts_enc_loop6",16);
  # The Perl loop below runs at generation time and emits the sequence four
  # times: store the current tweak in slot $i, then double it.
  953. for ($i=0;$i<4;$i++) {
  954. &pshufd ($twres,$twtmp,0x13);
  955. &pxor ($twtmp,$twtmp);
  956. &movdqa (&QWP(16*$i,"esp"),$tweak);
  957. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  958. &pand ($twres,$twmask); # isolate carry and residue
  959. &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
  960. &pxor ($tweak,$twres);
  961. }
  # fifth tweak goes to slot 4; the sixth is built in $inout5 and saved to
  # slot 5 further down
  962. &pshufd ($inout5,$twtmp,0x13);
  963. &movdqa (&QWP(16*$i++,"esp"),$tweak);
  964. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  965. &$movekey ($rndkey0,&QWP(0,$key_));
  966. &pand ($inout5,$twmask); # isolate carry and residue
  967. &movups ($inout0,&QWP(0,$inp)); # load input
  968. &pxor ($inout5,$tweak);
  969. # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
  # (the loads into $inout1 and $rndkey1 overwrite $twmask and $tweak; both
  # are reloaded from the stack after the call)
  970. &movdqu ($inout1,&QWP(16*1,$inp));
  971. &xorps ($inout0,$rndkey0); # input^=rndkey[0]
  972. &movdqu ($inout2,&QWP(16*2,$inp));
  973. &pxor ($inout1,$rndkey0);
  974. &movdqu ($inout3,&QWP(16*3,$inp));
  975. &pxor ($inout2,$rndkey0);
  976. &movdqu ($inout4,&QWP(16*4,$inp));
  977. &pxor ($inout3,$rndkey0);
  978. &movdqu ($rndkey1,&QWP(16*5,$inp));
  979. &pxor ($inout4,$rndkey0);
  980. &lea ($inp,&DWP(16*6,$inp));
  981. &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
  982. &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
  983. &pxor ($inout5,$rndkey1);
  984. &$movekey ($rndkey1,&QWP(16,$key_));
  985. &lea ($key,&DWP(32,$key_));
  986. &pxor ($inout1,&QWP(16*1,"esp"));
  987. &aesenc ($inout0,$rndkey1);
  988. &pxor ($inout2,&QWP(16*2,"esp"));
  989. &aesenc ($inout1,$rndkey1);
  990. &pxor ($inout3,&QWP(16*3,"esp"));
  991. &dec ($rounds);
  992. &aesenc ($inout2,$rndkey1);
  993. &pxor ($inout4,&QWP(16*4,"esp"));
  994. &aesenc ($inout3,$rndkey1);
  995. &pxor ($inout5,$rndkey0);
  996. &aesenc ($inout4,$rndkey1);
  997. &$movekey ($rndkey0,&QWP(0,$key));
  998. &aesenc ($inout5,$rndkey1);
  999. &call (&label("_aesni_encrypt6_enter"));
  # XOR the six results with their tweaks and store them, then derive the
  # tweak that follows the sixth one for the next pass or for the tail code.
  1000. &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
  1001. &pxor ($twtmp,$twtmp);
  1002. &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
  1003. &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
  1004. &xorps ($inout1,&QWP(16*1,"esp"));
  1005. &movups (&QWP(16*0,$out),$inout0); # write output
  1006. &xorps ($inout2,&QWP(16*2,"esp"));
  1007. &movups (&QWP(16*1,$out),$inout1);
  1008. &xorps ($inout3,&QWP(16*3,"esp"));
  1009. &movups (&QWP(16*2,$out),$inout2);
  1010. &xorps ($inout4,&QWP(16*4,"esp"));
  1011. &movups (&QWP(16*3,$out),$inout3);
  1012. &xorps ($inout5,$tweak);
  1013. &movups (&QWP(16*4,$out),$inout4);
  1014. &pshufd ($twres,$twtmp,0x13);
  1015. &movups (&QWP(16*5,$out),$inout5);
  1016. &lea ($out,&DWP(16*6,$out));
  1017. &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
  1018. &pxor ($twtmp,$twtmp);
  1019. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1020. &pand ($twres,$twmask); # isolate carry and residue
  1021. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1022. &mov ($rounds,$rounds_); # restore $rounds
  1023. &pxor ($tweak,$twres);
  1024. &sub ($len,16*6);
  1025. &jnc (&label("xts_enc_loop6"));
  1026. &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
  1027. &mov ($key,$key_); # restore $key
  1028. &mov ($rounds_,$rounds);
  # Fewer than six full blocks remain. After the bias is undone $len is a
  # multiple of 16 below 16*6; zero means no full block is left.
  1029. &set_label("xts_enc_short");
  1030. &add ($len,16*6);
  1031. &jz (&label("xts_enc_done6x"));
  # Tweaks are produced one at a time; control leaves for the matching handler
  # as soon as enough of them exist for the blocks that remain.
  1032. &movdqa ($inout3,$tweak); # put aside previous tweak
  1033. &cmp ($len,0x20);
  1034. &jb (&label("xts_enc_one"));
  1035. &pshufd ($twres,$twtmp,0x13);
  1036. &pxor ($twtmp,$twtmp);
  1037. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1038. &pand ($twres,$twmask); # isolate carry and residue
  1039. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1040. &pxor ($tweak,$twres);
  1041. &je (&label("xts_enc_two"));
  1042. &pshufd ($twres,$twtmp,0x13);
  1043. &pxor ($twtmp,$twtmp);
  1044. &movdqa ($inout4,$tweak); # put aside previous tweak
  1045. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1046. &pand ($twres,$twmask); # isolate carry and residue
  1047. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1048. &pxor ($tweak,$twres);
  1049. &cmp ($len,0x40);
  1050. &jb (&label("xts_enc_three"));
  1051. &pshufd ($twres,$twtmp,0x13);
  1052. &pxor ($twtmp,$twtmp);
  1053. &movdqa ($inout5,$tweak); # put aside previous tweak
  1054. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1055. &pand ($twres,$twmask); # isolate carry and residue
  1056. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1057. &pxor ($tweak,$twres);
  1058. &movdqa (&QWP(16*0,"esp"),$inout3);
  1059. &movdqa (&QWP(16*1,"esp"),$inout4);
  1060. &je (&label("xts_enc_four"));
  # five blocks: tweaks 0..3 go to the stack, the fifth is built in $inout5
  1061. &movdqa (&QWP(16*2,"esp"),$inout5);
  1062. &pshufd ($inout5,$twtmp,0x13);
  1063. &movdqa (&QWP(16*3,"esp"),$tweak);
  1064. &paddq ($tweak,$tweak); # &psllq($inout0,1);
  1065. &pand ($inout5,$twmask); # isolate carry and residue
  1066. &pxor ($inout5,$tweak);
  1067. &movdqu ($inout0,&QWP(16*0,$inp)); # load input
  1068. &movdqu ($inout1,&QWP(16*1,$inp));
  1069. &movdqu ($inout2,&QWP(16*2,$inp));
  1070. &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
  1071. &movdqu ($inout3,&QWP(16*3,$inp));
  1072. &pxor ($inout1,&QWP(16*1,"esp"));
  1073. &movdqu ($inout4,&QWP(16*4,$inp));
  1074. &pxor ($inout2,&QWP(16*2,"esp"));
  1075. &lea ($inp,&DWP(16*5,$inp));
  1076. &pxor ($inout3,&QWP(16*3,"esp"));
  1077. &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
  1078. &pxor ($inout4,$inout5);
  1079. &call ("_aesni_encrypt6");
  1080. &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
  1081. &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
  1082. &xorps ($inout1,&QWP(16*1,"esp"));
  1083. &xorps ($inout2,&QWP(16*2,"esp"));
  1084. &movups (&QWP(16*0,$out),$inout0); # write output
  1085. &xorps ($inout3,&QWP(16*3,"esp"));
  1086. &movups (&QWP(16*1,$out),$inout1);
  1087. &xorps ($inout4,$tweak);
  1088. &movups (&QWP(16*2,$out),$inout2);
  1089. &movups (&QWP(16*3,$out),$inout3);
  1090. &movups (&QWP(16*4,$out),$inout4);
  1091. &lea ($out,&DWP(16*5,$out));
  1092. &jmp (&label("xts_enc_done"));
  # Each handler below leaves the tweak of its last block in $tweak, which
  # xts_enc_done doubles if a partial block follows.
  1093. &set_label("xts_enc_one",16);
  1094. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1095. &lea ($inp,&DWP(16*1,$inp));
  1096. &xorps ($inout0,$inout3); # input^=tweak
  1097. if ($inline)
  1098. { &aesni_inline_generate1("enc"); }
  1099. else
  1100. { &call ("_aesni_encrypt1"); }
  1101. &xorps ($inout0,$inout3); # output^=tweak
  1102. &movups (&QWP(16*0,$out),$inout0); # write output
  1103. &lea ($out,&DWP(16*1,$out));
  1104. &movdqa ($tweak,$inout3); # last tweak
  1105. &jmp (&label("xts_enc_done"));
  1106. &set_label("xts_enc_two",16);
  1107. &movaps ($inout4,$tweak); # put aside last tweak
  1108. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1109. &movups ($inout1,&QWP(16*1,$inp));
  1110. &lea ($inp,&DWP(16*2,$inp));
  1111. &xorps ($inout0,$inout3); # input^=tweak
  1112. &xorps ($inout1,$inout4);
  # third lane is cleared before the 3-block call; its result is never stored
  1113. &xorps ($inout2,$inout2);
  1114. &call ("_aesni_encrypt3");
  1115. &xorps ($inout0,$inout3); # output^=tweak
  1116. &xorps ($inout1,$inout4);
  1117. &movups (&QWP(16*0,$out),$inout0); # write output
  1118. &movups (&QWP(16*1,$out),$inout1);
  1119. &lea ($out,&DWP(16*2,$out));
  1120. &movdqa ($tweak,$inout4); # last tweak
  1121. &jmp (&label("xts_enc_done"));
  1122. &set_label("xts_enc_three",16);
  1123. &movaps ($inout5,$tweak); # put aside last tweak
  1124. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1125. &movups ($inout1,&QWP(16*1,$inp));
  1126. &movups ($inout2,&QWP(16*2,$inp));
  1127. &lea ($inp,&DWP(16*3,$inp));
  1128. &xorps ($inout0,$inout3); # input^=tweak
  1129. &xorps ($inout1,$inout4);
  1130. &xorps ($inout2,$inout5);
  1131. &call ("_aesni_encrypt3");
  1132. &xorps ($inout0,$inout3); # output^=tweak
  1133. &xorps ($inout1,$inout4);
  1134. &xorps ($inout2,$inout5);
  1135. &movups (&QWP(16*0,$out),$inout0); # write output
  1136. &movups (&QWP(16*1,$out),$inout1);
  1137. &movups (&QWP(16*2,$out),$inout2);
  1138. &lea ($out,&DWP(16*3,$out));
  1139. &movdqa ($tweak,$inout5); # last tweak
  1140. &jmp (&label("xts_enc_done"));
  # four blocks: the first two tweaks were stored to the stack before the
  # jump, the third is in $inout5 and the fourth is copied to $inout4 here
  1141. &set_label("xts_enc_four",16);
  1142. &movaps ($inout4,$tweak); # put aside last tweak
  1143. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1144. &movups ($inout1,&QWP(16*1,$inp));
  1145. &movups ($inout2,&QWP(16*2,$inp));
  1146. &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
  1147. &movups ($inout3,&QWP(16*3,$inp));
  1148. &lea ($inp,&DWP(16*4,$inp));
  1149. &xorps ($inout1,&QWP(16*1,"esp"));
  1150. &xorps ($inout2,$inout5);
  1151. &xorps ($inout3,$inout4);
  1152. &call ("_aesni_encrypt4");
  1153. &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
  1154. &xorps ($inout1,&QWP(16*1,"esp"));
  1155. &xorps ($inout2,$inout5);
  1156. &movups (&QWP(16*0,$out),$inout0); # write output
  1157. &xorps ($inout3,$inout4);
  1158. &movups (&QWP(16*1,$out),$inout1);
  1159. &movups (&QWP(16*2,$out),$inout2);
  1160. &movups (&QWP(16*3,$out),$inout3);
  1161. &lea ($out,&DWP(16*4,$out));
  1162. &movdqa ($tweak,$inout4); # last tweak
  1163. &jmp (&label("xts_enc_done"));
  # The full-block count was a multiple of six, so $tweak already holds the
  # next unused tweak and is taken as is for the partial block.
  1164. &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
  1165. &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
  1166. &and ($len,15);
  1167. &jz (&label("xts_enc_ret"));
  1168. &movdqa ($inout3,$tweak);
  1169. &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
  1170. &jmp (&label("xts_enc_steal"));
  # Reached from the 1..5 block handlers: $tweak is the tweak of the last
  # block written, so it is doubled into $inout3 when a partial block follows.
  1171. &set_label("xts_enc_done",16);
  1172. &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
  1173. &pxor ($twtmp,$twtmp);
  1174. &and ($len,15);
  1175. &jz (&label("xts_enc_ret"));
  1176. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1177. &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
  1178. &pshufd ($inout3,$twtmp,0x13);
  1179. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1180. &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
  1181. &pxor ($inout3,$tweak);
  # Byte loop over the $len%16 trailing bytes: each input byte replaces the
  # corresponding byte of the last full output block, and the byte it
  # displaces becomes the corresponding byte of the partial output.
  # $rounds and $key serve as byte scratch here and are restored afterwards.
  1182. &set_label("xts_enc_steal");
  1183. &movz ($rounds,&BP(0,$inp));
  1184. &movz ($key,&BP(-16,$out));
  1185. &lea ($inp,&DWP(1,$inp));
  1186. &mov (&BP(-16,$out),&LB($rounds));
  1187. &mov (&BP(0,$out),&LB($key));
  1188. &lea ($out,&DWP(1,$out));
  1189. &sub ($len,1);
  1190. &jnz (&label("xts_enc_steal"));
  # encrypt the patched block at -16($out) once more, with the tweak in $inout3
  1191. &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
  1192. &mov ($key,$key_); # restore $key
  1193. &mov ($rounds,$rounds_); # restore $rounds
  1194. &movups ($inout0,&QWP(-16,$out)); # load input
  1195. &xorps ($inout0,$inout3); # input^=tweak
  1196. if ($inline)
  1197. { &aesni_inline_generate1("enc"); }
  1198. else
  1199. { &call ("_aesni_encrypt1"); }
  1200. &xorps ($inout0,$inout3); # output^=tweak
  1201. &movups (&QWP(-16,$out),$inout0); # write output
  1202. &set_label("xts_enc_ret");
  1203. &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
  1204. &function_end("aesni_xts_encrypt");
  1205. &function_begin("aesni_xts_decrypt");
  1206. &mov ($key,&wparam(4)); # key2
  1207. &mov ($inp,&wparam(5)); # clear-text tweak
  1208. &mov ($rounds,&DWP(240,$key)); # key2->rounds
  1209. &movups ($inout0,&QWP(0,$inp));
  1210. if ($inline)
  1211. { &aesni_inline_generate1("enc"); }
  1212. else
  1213. { &call ("_aesni_encrypt1"); }
  1214. &mov ($inp,&wparam(0));
  1215. &mov ($out,&wparam(1));
  1216. &mov ($len,&wparam(2));
  1217. &mov ($key,&wparam(3)); # key1
  1218. &mov ($key_,"esp");
  1219. &sub ("esp",16*7+8);
  1220. &and ("esp",-16); # align stack
  1221. &xor ($rounds_,$rounds_); # if(len%16) len-=16;
  1222. &test ($len,15);
  1223. &setnz (&LB($rounds_));
  1224. &shl ($rounds_,4);
  1225. &sub ($len,$rounds_);
  1226. &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
  1227. &mov (&DWP(16*6+4,"esp"),0);
  1228. &mov (&DWP(16*6+8,"esp"),1);
  1229. &mov (&DWP(16*6+12,"esp"),0);
  1230. &mov (&DWP(16*7+0,"esp"),$len); # save original $len
  1231. &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
  1232. &mov ($rounds,&DWP(240,$key)); # key1->rounds
  1233. &mov ($key_,$key); # backup $key
  1234. &mov ($rounds_,$rounds); # backup $rounds
  1235. &movdqa ($tweak,$inout0);
  1236. &pxor ($twtmp,$twtmp);
  1237. &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
  1238. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1239. &and ($len,-16);
  1240. &sub ($len,16*6);
  1241. &jc (&label("xts_dec_short"));
  1242. &shr ($rounds,1);
  1243. &mov ($rounds_,$rounds);
  1244. &jmp (&label("xts_dec_loop6"));
  1245. &set_label("xts_dec_loop6",16);
  1246. for ($i=0;$i<4;$i++) {
  1247. &pshufd ($twres,$twtmp,0x13);
  1248. &pxor ($twtmp,$twtmp);
  1249. &movdqa (&QWP(16*$i,"esp"),$tweak);
  1250. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1251. &pand ($twres,$twmask); # isolate carry and residue
  1252. &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
  1253. &pxor ($tweak,$twres);
  1254. }
  1255. &pshufd ($inout5,$twtmp,0x13);
  1256. &movdqa (&QWP(16*$i++,"esp"),$tweak);
  1257. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1258. &$movekey ($rndkey0,&QWP(0,$key_));
  1259. &pand ($inout5,$twmask); # isolate carry and residue
  1260. &movups ($inout0,&QWP(0,$inp)); # load input
  1261. &pxor ($inout5,$tweak);
  1262. # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
  1263. &movdqu ($inout1,&QWP(16*1,$inp));
  1264. &xorps ($inout0,$rndkey0); # input^=rndkey[0]
  1265. &movdqu ($inout2,&QWP(16*2,$inp));
  1266. &pxor ($inout1,$rndkey0);
  1267. &movdqu ($inout3,&QWP(16*3,$inp));
  1268. &pxor ($inout2,$rndkey0);
  1269. &movdqu ($inout4,&QWP(16*4,$inp));
  1270. &pxor ($inout3,$rndkey0);
  1271. &movdqu ($rndkey1,&QWP(16*5,$inp));
  1272. &pxor ($inout4,$rndkey0);
  1273. &lea ($inp,&DWP(16*6,$inp));
  1274. &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
  1275. &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
  1276. &pxor ($inout5,$rndkey1);
  1277. &$movekey ($rndkey1,&QWP(16,$key_));
  1278. &lea ($key,&DWP(32,$key_));
  1279. &pxor ($inout1,&QWP(16*1,"esp"));
  1280. &aesdec ($inout0,$rndkey1);
  1281. &pxor ($inout2,&QWP(16*2,"esp"));
  1282. &aesdec ($inout1,$rndkey1);
  1283. &pxor ($inout3,&QWP(16*3,"esp"));
  1284. &dec ($rounds);
  1285. &aesdec ($inout2,$rndkey1);
  1286. &pxor ($inout4,&QWP(16*4,"esp"));
  1287. &aesdec ($inout3,$rndkey1);
  1288. &pxor ($inout5,$rndkey0);
  1289. &aesdec ($inout4,$rndkey1);
  1290. &$movekey ($rndkey0,&QWP(0,$key));
  1291. &aesdec ($inout5,$rndkey1);
  1292. &call (&label("_aesni_decrypt6_enter"));
  1293. &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
  1294. &pxor ($twtmp,$twtmp);
  1295. &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
  1296. &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
  1297. &xorps ($inout1,&QWP(16*1,"esp"));
  1298. &movups (&QWP(16*0,$out),$inout0); # write output
  1299. &xorps ($inout2,&QWP(16*2,"esp"));
  1300. &movups (&QWP(16*1,$out),$inout1);
  1301. &xorps ($inout3,&QWP(16*3,"esp"));
  1302. &movups (&QWP(16*2,$out),$inout2);
  1303. &xorps ($inout4,&QWP(16*4,"esp"));
  1304. &movups (&QWP(16*3,$out),$inout3);
  1305. &xorps ($inout5,$tweak);
  1306. &movups (&QWP(16*4,$out),$inout4);
  1307. &pshufd ($twres,$twtmp,0x13);
  1308. &movups (&QWP(16*5,$out),$inout5);
  1309. &lea ($out,&DWP(16*6,$out));
  1310. &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
  1311. &pxor ($twtmp,$twtmp);
  1312. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1313. &pand ($twres,$twmask); # isolate carry and residue
  1314. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1315. &mov ($rounds,$rounds_); # restore $rounds
  1316. &pxor ($tweak,$twres);
  1317. &sub ($len,16*6);
  1318. &jnc (&label("xts_dec_loop6"));
  1319. &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
  1320. &mov ($key,$key_); # restore $key
  1321. &mov ($rounds_,$rounds);
  1322. &set_label("xts_dec_short");
  1323. &add ($len,16*6);
  1324. &jz (&label("xts_dec_done6x"));
  1325. &movdqa ($inout3,$tweak); # put aside previous tweak
  1326. &cmp ($len,0x20);
  1327. &jb (&label("xts_dec_one"));
  1328. &pshufd ($twres,$twtmp,0x13);
  1329. &pxor ($twtmp,$twtmp);
  1330. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1331. &pand ($twres,$twmask); # isolate carry and residue
  1332. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1333. &pxor ($tweak,$twres);
  1334. &je (&label("xts_dec_two"));
  1335. &pshufd ($twres,$twtmp,0x13);
  1336. &pxor ($twtmp,$twtmp);
  1337. &movdqa ($inout4,$tweak); # put aside previous tweak
  1338. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1339. &pand ($twres,$twmask); # isolate carry and residue
  1340. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1341. &pxor ($tweak,$twres);
  1342. &cmp ($len,0x40);
  1343. &jb (&label("xts_dec_three"));
  1344. &pshufd ($twres,$twtmp,0x13);
  1345. &pxor ($twtmp,$twtmp);
  1346. &movdqa ($inout5,$tweak); # put aside previous tweak
  1347. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1348. &pand ($twres,$twmask); # isolate carry and residue
  1349. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1350. &pxor ($tweak,$twres);
  1351. &movdqa (&QWP(16*0,"esp"),$inout3);
  1352. &movdqa (&QWP(16*1,"esp"),$inout4);
  1353. &je (&label("xts_dec_four"));
  1354. &movdqa (&QWP(16*2,"esp"),$inout5);
  1355. &pshufd ($inout5,$twtmp,0x13);
  1356. &movdqa (&QWP(16*3,"esp"),$tweak);
  1357. &paddq ($tweak,$tweak); # &psllq($inout0,1);
  1358. &pand ($inout5,$twmask); # isolate carry and residue
  1359. &pxor ($inout5,$tweak);
  1360. &movdqu ($inout0,&QWP(16*0,$inp)); # load input
  1361. &movdqu ($inout1,&QWP(16*1,$inp));
  1362. &movdqu ($inout2,&QWP(16*2,$inp));
  1363. &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
  1364. &movdqu ($inout3,&QWP(16*3,$inp));
  1365. &pxor ($inout1,&QWP(16*1,"esp"));
  1366. &movdqu ($inout4,&QWP(16*4,$inp));
  1367. &pxor ($inout2,&QWP(16*2,"esp"));
  1368. &lea ($inp,&DWP(16*5,$inp));
  1369. &pxor ($inout3,&QWP(16*3,"esp"));
  1370. &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
  1371. &pxor ($inout4,$inout5);
  1372. &call ("_aesni_decrypt6");
  1373. &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
  1374. &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
  1375. &xorps ($inout1,&QWP(16*1,"esp"));
  1376. &xorps ($inout2,&QWP(16*2,"esp"));
  1377. &movups (&QWP(16*0,$out),$inout0); # write output
  1378. &xorps ($inout3,&QWP(16*3,"esp"));
  1379. &movups (&QWP(16*1,$out),$inout1);
  1380. &xorps ($inout4,$tweak);
  1381. &movups (&QWP(16*2,$out),$inout2);
  1382. &movups (&QWP(16*3,$out),$inout3);
  1383. &movups (&QWP(16*4,$out),$inout4);
  1384. &lea ($out,&DWP(16*5,$out));
  1385. &jmp (&label("xts_dec_done"));
  1386. &set_label("xts_dec_one",16);
  1387. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1388. &lea ($inp,&DWP(16*1,$inp));
  1389. &xorps ($inout0,$inout3); # input^=tweak
  1390. if ($inline)
  1391. { &aesni_inline_generate1("dec"); }
  1392. else
  1393. { &call ("_aesni_decrypt1"); }
  1394. &xorps ($inout0,$inout3); # output^=tweak
  1395. &movups (&QWP(16*0,$out),$inout0); # write output
  1396. &lea ($out,&DWP(16*1,$out));
  1397. &movdqa ($tweak,$inout3); # last tweak
  1398. &jmp (&label("xts_dec_done"));
  1399. &set_label("xts_dec_two",16);
  1400. &movaps ($inout4,$tweak); # put aside last tweak
  1401. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1402. &movups ($inout1,&QWP(16*1,$inp));
  1403. &lea ($inp,&DWP(16*2,$inp));
  1404. &xorps ($inout0,$inout3); # input^=tweak
  1405. &xorps ($inout1,$inout4);
  1406. &call ("_aesni_decrypt3");
  1407. &xorps ($inout0,$inout3); # output^=tweak
  1408. &xorps ($inout1,$inout4);
  1409. &movups (&QWP(16*0,$out),$inout0); # write output
  1410. &movups (&QWP(16*1,$out),$inout1);
  1411. &lea ($out,&DWP(16*2,$out));
  1412. &movdqa ($tweak,$inout4); # last tweak
  1413. &jmp (&label("xts_dec_done"));
  1414. &set_label("xts_dec_three",16);
  1415. &movaps ($inout5,$tweak); # put aside last tweak
  1416. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1417. &movups ($inout1,&QWP(16*1,$inp));
  1418. &movups ($inout2,&QWP(16*2,$inp));
  1419. &lea ($inp,&DWP(16*3,$inp));
  1420. &xorps ($inout0,$inout3); # input^=tweak
  1421. &xorps ($inout1,$inout4);
  1422. &xorps ($inout2,$inout5);
  1423. &call ("_aesni_decrypt3");
  1424. &xorps ($inout0,$inout3); # output^=tweak
  1425. &xorps ($inout1,$inout4);
  1426. &xorps ($inout2,$inout5);
  1427. &movups (&QWP(16*0,$out),$inout0); # write output
  1428. &movups (&QWP(16*1,$out),$inout1);
  1429. &movups (&QWP(16*2,$out),$inout2);
  1430. &lea ($out,&DWP(16*3,$out));
  1431. &movdqa ($tweak,$inout5); # last tweak
  1432. &jmp (&label("xts_dec_done"));
  1433. &set_label("xts_dec_four",16);
  1434. &movaps ($inout4,$tweak); # put aside last tweak
  1435. &movups ($inout0,&QWP(16*0,$inp)); # load input
  1436. &movups ($inout1,&QWP(16*1,$inp));
  1437. &movups ($inout2,&QWP(16*2,$inp));
  1438. &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
  1439. &movups ($inout3,&QWP(16*3,$inp));
  1440. &lea ($inp,&DWP(16*4,$inp));
  1441. &xorps ($inout1,&QWP(16*1,"esp"));
  1442. &xorps ($inout2,$inout5);
  1443. &xorps ($inout3,$inout4);
  1444. &call ("_aesni_decrypt4");
  1445. &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
  1446. &xorps ($inout1,&QWP(16*1,"esp"));
  1447. &xorps ($inout2,$inout5);
  1448. &movups (&QWP(16*0,$out),$inout0); # write output
  1449. &xorps ($inout3,$inout4);
  1450. &movups (&QWP(16*1,$out),$inout1);
  1451. &movups (&QWP(16*2,$out),$inout2);
  1452. &movups (&QWP(16*3,$out),$inout3);
  1453. &lea ($out,&DWP(16*4,$out));
  1454. &movdqa ($tweak,$inout4); # last tweak
  1455. &jmp (&label("xts_dec_done"));
  1456. &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
  1457. &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
  1458. &and ($len,15);
  1459. &jz (&label("xts_dec_ret"));
  1460. &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
  1461. &jmp (&label("xts_dec_only_one_more"));
  1462. &set_label("xts_dec_done",16);
  1463. &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
  1464. &pxor ($twtmp,$twtmp);
  1465. &and ($len,15);
  1466. &jz (&label("xts_dec_ret"));
  1467. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1468. &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
  1469. &pshufd ($twres,$twtmp,0x13);
  1470. &pxor ($twtmp,$twtmp);
  1471. &movdqa ($twmask,&QWP(16*6,"esp"));
  1472. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1473. &pand ($twres,$twmask); # isolate carry and residue
  1474. &pcmpgtd($twtmp,$tweak); # broadcast upper bits
  1475. &pxor ($tweak,$twres);
  1476. &set_label("xts_dec_only_one_more");
  1477. &pshufd ($inout3,$twtmp,0x13);
  1478. &movdqa ($inout4,$tweak); # put aside previous tweak
  1479. &paddq ($tweak,$tweak); # &psllq($tweak,1);
  1480. &pand ($inout3,$twmask); # isolate carry and residue
  1481. &pxor ($inout3,$tweak);
  1482. &mov ($key,$key_); # restore $key
  1483. &mov ($rounds,$rounds_); # restore $rounds
  1484. &movups ($inout0,&QWP(0,$inp)); # load input
  1485. &xorps ($inout0,$inout3); # input^=tweak
  1486. if ($inline)
  1487. { &aesni_inline_generate1("dec"); }
  1488. else
  1489. { &call ("_aesni_decrypt1"); }
  1490. &xorps ($inout0,$inout3); # output^=tweak
  1491. &movups (&QWP(0,$out),$inout0); # write output
  1492. &set_label("xts_dec_steal");
  1493. &movz ($rounds,&BP(16,$inp));
  1494. &movz ($key,&BP(0,$out));
  1495. &lea ($inp,&DWP(1,$inp));
  1496. &mov (&BP(0,$out),&LB($rounds));
  1497. &mov (&BP(16,$out),&LB($key));
  1498. &lea ($out,&DWP(1,$out));
  1499. &sub ($len,1);
  1500. &jnz (&label("xts_dec_steal"));
  1501. &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
  1502. &mov ($key,$key_); # restore $key
  1503. &mov ($rounds,$rounds_); # restore $rounds
  1504. &movups ($inout0,&QWP(0,$out)); # load input
  1505. &xorps ($inout0,$inout4); # input^=tweak
  1506. if ($inline)
  1507. { &aesni_inline_generate1("dec"); }
  1508. else
  1509. { &call ("_aesni_decrypt1"); }
  1510. &xorps ($inout0,$inout4); # output^=tweak
  1511. &movups (&QWP(0,$out),$inout0); # write output
  1512. &set_label("xts_dec_ret");
  1513. &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
  1514. &function_end("aesni_xts_decrypt");
  1515. }
  1516. }
  1517. ######################################################################
  1518. # void $PREFIX_cbc_encrypt (const void *inp, void *out,
  1519. # size_t length, const AES_KEY *key,
  1520. # unsigned char *ivp,const int enc);
  #
  # Generates the CBC entry point. The six arguments are fetched with
  # wparam(0..5). A zero length jumps straight to cbc_abort, i.e. nothing
  # is processed and the IV is not written back. Otherwise a 16-byte
  # aligned scratch area is carved out below %esp (original %esp is kept
  # at 16(%esp)), the IV is loaded from ivp and the round count is read
  # from offset 240 of the key. enc==0 selects the decrypt path. On the
  # way out (cbc_ret) the current IV is stored back through ivp.
  #
  # $key_ and $rounds_ serve as backups of $key and $rounds, which are
  # restored after every call into the block primitives.
  1521. &function_begin("${PREFIX}_cbc_encrypt");
  1522. &mov ($inp,&wparam(0));
  1523. &mov ($rounds_,"esp");
  1524. &mov ($out,&wparam(1));
  1525. &sub ($rounds_,24);
  1526. &mov ($len,&wparam(2));
  1527. &and ($rounds_,-16);
  1528. &mov ($key,&wparam(3));
  1529. &mov ($key_,&wparam(4));
  1530. &test ($len,$len);
  1531. &jz (&label("cbc_abort"));
  # flags of this compare are consumed by the je to cbc_decrypt below;
  # the instructions in between are xchg/mov/movups only
  1532. &cmp (&wparam(5),0);
  1533. &xchg ($rounds_,"esp"); # alloca
  1534. &movups ($ivec,&QWP(0,$key_)); # load IV
  1535. &mov ($rounds,&DWP(240,$key));
  1536. &mov ($key_,$key); # backup $key
  1537. &mov (&DWP(16,"esp"),$rounds_); # save original %esp
  1538. &mov ($rounds_,$rounds); # backup $rounds
  1539. &je (&label("cbc_decrypt"));
  # ---- encrypt: one block per iteration, $inout0 carries the chaining
  # value (initially the IV) and $ivec is reused to hold the input block
  1540. &movaps ($inout0,$ivec);
  1541. &cmp ($len,16);
  1542. &jb (&label("cbc_enc_tail"));
  1543. &sub ($len,16);
  1544. &jmp (&label("cbc_enc_loop"));
  1545. &set_label("cbc_enc_loop",16);
  1546. &movups ($ivec,&QWP(0,$inp)); # input actually
  1547. &lea ($inp,&DWP(16,$inp));
  1548. if ($inline)
  1549. { &aesni_inline_generate1("enc",$inout0,$ivec); }
  1550. else
  1551. { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
  1552. &mov ($rounds,$rounds_); # restore $rounds
  1553. &mov ($key,$key_); # restore $key
  1554. &movups (&QWP(0,$out),$inout0); # store output
  1555. &lea ($out,&DWP(16,$out));
  1556. &sub ($len,16);
  1557. &jnc (&label("cbc_enc_loop"));
  1558. &add ($len,16);
  1559. &jnz (&label("cbc_enc_tail"));
  1560. &movaps ($ivec,$inout0);
  1561. &jmp (&label("cbc_ret"));
  # partial last block: copy the $len remaining input bytes to $out,
  # zero-fill up to 16 bytes, then run the loop once more in place on
  # that padded block ($len is 0 by then, so the loop exits afterwards)
  1562. &set_label("cbc_enc_tail");
  1563. &mov ("ecx",$len); # zaps $rounds
  1564. &data_word(0xA4F3F689); # rep movsb
  1565. &mov ("ecx",16); # zero tail
  1566. &sub ("ecx",$len);
  1567. &xor ("eax","eax"); # zaps $len
  1568. &data_word(0xAAF3F689); # rep stosb
  1569. &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
  1570. &mov ($rounds,$rounds_); # restore $rounds
  1571. &mov ($inp,$out); # $inp and $out are the same
  1572. &mov ($key,$key_); # restore $key
  1573. &jmp (&label("cbc_enc_loop"));
  1574. ######################################################################
  # ---- decrypt: more than 0x50 bytes are handled six blocks at a time,
  # the remainder (up to five blocks) goes through cbc_dec_tail
  1575. &set_label("cbc_decrypt",16);
  1576. &cmp ($len,0x50);
  1577. &jbe (&label("cbc_dec_tail"));
  1578. &movaps (&QWP(0,"esp"),$ivec); # save IV
  1579. &sub ($len,0x50);
  1580. &jmp (&label("cbc_dec_loop6_enter"));
  # loop re-entry: $rndkey0 holds the last ciphertext block of the previous
  # batch (the next IV) and $inout5 is that batch's sixth output block,
  # whose store was deferred
  1581. &set_label("cbc_dec_loop6",16);
  1582. &movaps (&QWP(0,"esp"),$rndkey0); # save IV
  1583. &movups (&QWP(0,$out),$inout5);
  1584. &lea ($out,&DWP(0x10,$out));
  1585. &set_label("cbc_dec_loop6_enter");
  1586. &movdqu ($inout0,&QWP(0,$inp));
  1587. &movdqu ($inout1,&QWP(0x10,$inp));
  1588. &movdqu ($inout2,&QWP(0x20,$inp));
  1589. &movdqu ($inout3,&QWP(0x30,$inp));
  1590. &movdqu ($inout4,&QWP(0x40,$inp));
  1591. &movdqu ($inout5,&QWP(0x50,$inp));
  1592. &call ("_aesni_decrypt6");
  # ciphertext blocks are re-read from $inp to serve as the XOR operands
  1593. &movups ($rndkey1,&QWP(0,$inp));
  1594. &movups ($rndkey0,&QWP(0x10,$inp));
  1595. &xorps ($inout0,&QWP(0,"esp")); # ^=IV
  1596. &xorps ($inout1,$rndkey1);
  1597. &movups ($rndkey1,&QWP(0x20,$inp));
  1598. &xorps ($inout2,$rndkey0);
  1599. &movups ($rndkey0,&QWP(0x30,$inp));
  1600. &xorps ($inout3,$rndkey1);
  1601. &movups ($rndkey1,&QWP(0x40,$inp));
  1602. &xorps ($inout4,$rndkey0);
  1603. &movups ($rndkey0,&QWP(0x50,$inp)); # IV
  1604. &xorps ($inout5,$rndkey1);
  1605. &movups (&QWP(0,$out),$inout0);
  1606. &movups (&QWP(0x10,$out),$inout1);
  1607. &lea ($inp,&DWP(0x60,$inp));
  1608. &movups (&QWP(0x20,$out),$inout2);
  1609. &mov ($rounds,$rounds_); # restore $rounds
  1610. &movups (&QWP(0x30,$out),$inout3);
  1611. &mov ($key,$key_); # restore $key
  1612. &movups (&QWP(0x40,$out),$inout4);
  1613. &lea ($out,&DWP(0x50,$out));
  1614. &sub ($len,0x60);
  1615. &ja (&label("cbc_dec_loop6"));
  # loop finished: the deferred sixth block moves to $inout0; it is stored
  # here if more input follows, otherwise at cbc_dec_tail_collected
  1616. &movaps ($inout0,$inout5);
  1617. &movaps ($ivec,$rndkey0);
  1618. &add ($len,0x50);
  1619. &jle (&label("cbc_dec_tail_collected"));
  1620. &movups (&QWP(0,$out),$inout0);
  1621. &lea ($out,&DWP(0x10,$out));
  # tail dispatch on the remaining length; blocks are loaded as the
  # comparisons proceed, $in0/$in1 keep copies of the first two ciphertexts
  1622. &set_label("cbc_dec_tail");
  1623. &movups ($inout0,&QWP(0,$inp));
  1624. &movaps ($in0,$inout0);
  1625. &cmp ($len,0x10);
  1626. &jbe (&label("cbc_dec_one"));
  1627. &movups ($inout1,&QWP(0x10,$inp));
  1628. &movaps ($in1,$inout1);
  1629. &cmp ($len,0x20);
  1630. &jbe (&label("cbc_dec_two"));
  1631. &movups ($inout2,&QWP(0x20,$inp));
  1632. &cmp ($len,0x30);
  1633. &jbe (&label("cbc_dec_three"));
  1634. &movups ($inout3,&QWP(0x30,$inp));
  1635. &cmp ($len,0x40);
  1636. &jbe (&label("cbc_dec_four"));
  # five-block case: run the six-block primitive with a zeroed sixth lane
  1637. &movups ($inout4,&QWP(0x40,$inp));
  1638. &movaps (&QWP(0,"esp"),$ivec); # save IV
  1639. &movups ($inout0,&QWP(0,$inp));
  1640. &xorps ($inout5,$inout5);
  1641. &call ("_aesni_decrypt6");
  1642. &movups ($rndkey1,&QWP(0,$inp));
  1643. &movups ($rndkey0,&QWP(0x10,$inp));
  1644. &xorps ($inout0,&QWP(0,"esp")); # ^= IV
  1645. &xorps ($inout1,$rndkey1);
  1646. &movups ($rndkey1,&QWP(0x20,$inp));
  1647. &xorps ($inout2,$rndkey0);
  1648. &movups ($rndkey0,&QWP(0x30,$inp));
  1649. &xorps ($inout3,$rndkey1);
  1650. &movups ($ivec,&QWP(0x40,$inp)); # IV
  1651. &xorps ($inout4,$rndkey0);
  1652. &movups (&QWP(0,$out),$inout0);
  1653. &movups (&QWP(0x10,$out),$inout1);
  1654. &movups (&QWP(0x20,$out),$inout2);
  1655. &movups (&QWP(0x30,$out),$inout3);
  1656. &lea ($out,&DWP(0x40,$out));
  1657. &movaps ($inout0,$inout4);
  1658. &sub ($len,0x50);
  1659. &jmp (&label("cbc_dec_tail_collected"));
  # each of the cases below leaves its last output block in $inout0 and
  # the new IV in $ivec for cbc_dec_tail_collected
  1660. &set_label("cbc_dec_one",16);
  1661. if ($inline)
  1662. { &aesni_inline_generate1("dec"); }
  1663. else
  1664. { &call ("_aesni_decrypt1"); }
  1665. &xorps ($inout0,$ivec);
  1666. &movaps ($ivec,$in0);
  1667. &sub ($len,0x10);
  1668. &jmp (&label("cbc_dec_tail_collected"));
  1669. &set_label("cbc_dec_two",16);
  1670. &xorps ($inout2,$inout2);
  1671. &call ("_aesni_decrypt3");
  1672. &xorps ($inout0,$ivec);
  1673. &xorps ($inout1,$in0);
  1674. &movups (&QWP(0,$out),$inout0);
  1675. &movaps ($inout0,$inout1);
  1676. &lea ($out,&DWP(0x10,$out));
  1677. &movaps ($ivec,$in1);
  1678. &sub ($len,0x20);
  1679. &jmp (&label("cbc_dec_tail_collected"));
  1680. &set_label("cbc_dec_three",16);
  1681. &call ("_aesni_decrypt3");
  1682. &xorps ($inout0,$ivec);
  1683. &xorps ($inout1,$in0);
  1684. &xorps ($inout2,$in1);
  1685. &movups (&QWP(0,$out),$inout0);
  1686. &movaps ($inout0,$inout2);
  1687. &movups (&QWP(0x10,$out),$inout1);
  1688. &lea ($out,&DWP(0x20,$out));
  1689. &movups ($ivec,&QWP(0x20,$inp));
  1690. &sub ($len,0x30);
  1691. &jmp (&label("cbc_dec_tail_collected"));
  1692. &set_label("cbc_dec_four",16);
  1693. &call ("_aesni_decrypt4");
  1694. &movups ($rndkey1,&QWP(0x10,$inp));
  1695. &movups ($rndkey0,&QWP(0x20,$inp));
  1696. &xorps ($inout0,$ivec);
  1697. &movups ($ivec,&QWP(0x30,$inp));
  1698. &xorps ($inout1,$in0);
  1699. &movups (&QWP(0,$out),$inout0);
  1700. &xorps ($inout2,$rndkey1);
  1701. &movups (&QWP(0x10,$out),$inout1);
  1702. &xorps ($inout3,$rndkey0);
  1703. &movups (&QWP(0x20,$out),$inout2);
  1704. &lea ($out,&DWP(0x30,$out));
  1705. &movaps ($inout0,$inout3);
  1706. &sub ($len,0x40);
  # a non-zero $len&15 means the input was not a whole number of blocks
  1707. &set_label("cbc_dec_tail_collected");
  1708. &and ($len,15);
  1709. &jnz (&label("cbc_dec_tail_partial"));
  1710. &movups (&QWP(0,$out),$inout0);
  1711. &jmp (&label("cbc_ret"));
  # spill the last block to the scratch area and copy part of it out with
  # rep movsb; the byte count placed in ecx is 16 minus the masked $len
  # NOTE(review): count is 16-($len&15), not ($len&15) - confirm intended
  1712. &set_label("cbc_dec_tail_partial",16);
  1713. &movaps (&QWP(0,"esp"),$inout0);
  1714. &mov ("ecx",16);
  1715. &mov ($inp,"esp");
  1716. &sub ("ecx",$len);
  1717. &data_word(0xA4F3F689); # rep movsb
  1718. &set_label("cbc_ret");
  1719. &mov ("esp",&DWP(16,"esp")); # pull original %esp
  1720. &mov ($key_,&wparam(4));
  1721. &movups (&QWP(0,$key_),$ivec); # output IV
  1722. &set_label("cbc_abort");
  1723. &function_end("${PREFIX}_cbc_encrypt");
  1724. ######################################################################
  1725. # Mechanical port from aesni-x86_64.pl.
  1726. #
  1727. # _aesni_set_encrypt_key is private interface,
  1728. # input:
  1729. # "eax" const unsigned char *userKey
  1730. # $rounds int bits
  1731. # $key AES_KEY *key
  1732. # output:
  1733. # "eax" return code
  1734. # $rounds rounds-1 (9, 11 or 13)
  #
  # Return code in "eax": 0 on success, -1 if userKey or key is NULL,
  # -2 if bits is not 128, 192 or 256. Round keys are written out with
  # $movekey, 16 bytes apiece, and $rounds is finally stored right behind
  # the 15-slot area, which works out to offset 240 from the start of the
  # schedule in all three cases. The key_* labels are local subroutines
  # reached with call/ret; the *_cold entry points skip the store of the
  # previous round key, which is not available on the first invocation.
  1735. &function_begin_B("_aesni_set_encrypt_key");
  1736. &test ("eax","eax");
  1737. &jz (&label("bad_pointer"));
  1738. &test ($key,$key);
  1739. &jz (&label("bad_pointer"));
  1740. &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
  1741. &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
  1742. &lea ($key,&DWP(16,$key));
  1743. &cmp ($rounds,256);
  1744. &je (&label("14rounds"));
  1745. &cmp ($rounds,192);
  1746. &je (&label("12rounds"));
  1747. &cmp ($rounds,128);
  1748. &jne (&label("bad_keybits"));
  # ---- 128-bit key: one aeskeygenassist + key_128 call per round key
  1749. &set_label("10rounds",16);
  1750. &mov ($rounds,9);
  1751. &$movekey (&QWP(-16,$key),"xmm0"); # round 0
  1752. &aeskeygenassist("xmm1","xmm0",0x01); # round 1
  1753. &call (&label("key_128_cold"));
  1754. &aeskeygenassist("xmm1","xmm0",0x2); # round 2
  1755. &call (&label("key_128"));
  1756. &aeskeygenassist("xmm1","xmm0",0x04); # round 3
  1757. &call (&label("key_128"));
  1758. &aeskeygenassist("xmm1","xmm0",0x08); # round 4
  1759. &call (&label("key_128"));
  1760. &aeskeygenassist("xmm1","xmm0",0x10); # round 5
  1761. &call (&label("key_128"));
  1762. &aeskeygenassist("xmm1","xmm0",0x20); # round 6
  1763. &call (&label("key_128"));
  1764. &aeskeygenassist("xmm1","xmm0",0x40); # round 7
  1765. &call (&label("key_128"));
  1766. &aeskeygenassist("xmm1","xmm0",0x80); # round 8
  1767. &call (&label("key_128"));
  1768. &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
  1769. &call (&label("key_128"));
  1770. &aeskeygenassist("xmm1","xmm0",0x36); # round 10
  1771. &call (&label("key_128"));
  1772. &$movekey (&QWP(0,$key),"xmm0");
  1773. &mov (&DWP(80,$key),$rounds);
  1774. &xor ("eax","eax");
  1775. &ret();
  # key_128: store the round key in xmm0, advance $key, then derive the
  # next round key into xmm0 from xmm0 and the aeskeygenassist result (xmm1)
  1776. &set_label("key_128",16);
  1777. &$movekey (&QWP(0,$key),"xmm0");
  1778. &lea ($key,&DWP(16,$key));
  1779. &set_label("key_128_cold");
  1780. &shufps ("xmm4","xmm0",0b00010000);
  1781. &xorps ("xmm0","xmm4");
  1782. &shufps ("xmm4","xmm0",0b10001100);
  1783. &xorps ("xmm0","xmm4");
  1784. &shufps ("xmm1","xmm1",0b11111111); # critical path
  1785. &xorps ("xmm0","xmm1");
  1786. &ret();
  # ---- 192-bit key: key material is kept in xmm0 plus the low half of
  # xmm2; key_192a/key_192b alternate, key_192b writing two round keys
  1787. &set_label("12rounds",16);
  1788. &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
  1789. &mov ($rounds,11);
  1790. &$movekey (&QWP(-16,$key),"xmm0"); # round 0
  1791. &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
  1792. &call (&label("key_192a_cold"));
  1793. &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
  1794. &call (&label("key_192b"));
  1795. &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
  1796. &call (&label("key_192a"));
  1797. &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
  1798. &call (&label("key_192b"));
  1799. &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
  1800. &call (&label("key_192a"));
  1801. &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
  1802. &call (&label("key_192b"));
  1803. &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
  1804. &call (&label("key_192a"));
  1805. &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
  1806. &call (&label("key_192b"));
  1807. &$movekey (&QWP(0,$key),"xmm0");
  1808. &mov (&DWP(48,$key),$rounds);
  1809. &xor ("eax","eax");
  1810. &ret();
  # key_192a: store xmm0, keep a copy of xmm2 in xmm5 for the following
  # key_192b, then fall into the shared derivation at key_192b_warm
  1811. &set_label("key_192a",16);
  1812. &$movekey (&QWP(0,$key),"xmm0");
  1813. &lea ($key,&DWP(16,$key));
  1814. &set_label("key_192a_cold",16);
  1815. &movaps ("xmm5","xmm2");
  1816. &set_label("key_192b_warm");
  1817. &shufps ("xmm4","xmm0",0b00010000);
  1818. &movdqa ("xmm3","xmm2");
  1819. &xorps ("xmm0","xmm4");
  1820. &shufps ("xmm4","xmm0",0b10001100);
  1821. &pslldq ("xmm3",4);
  1822. &xorps ("xmm0","xmm4");
  1823. &pshufd ("xmm1","xmm1",0b01010101); # critical path
  1824. &pxor ("xmm2","xmm3");
  1825. &pxor ("xmm0","xmm1");
  1826. &pshufd ("xmm3","xmm0",0b11111111);
  1827. &pxor ("xmm2","xmm3");
  1828. &ret();
  # key_192b: assemble two 16-byte round keys from xmm5/xmm0/xmm2, store
  # both, advance $key by 32 and continue at key_192b_warm
  1829. &set_label("key_192b",16);
  1830. &movaps ("xmm3","xmm0");
  1831. &shufps ("xmm5","xmm0",0b01000100);
  1832. &$movekey (&QWP(0,$key),"xmm5");
  1833. &shufps ("xmm3","xmm2",0b01001110);
  1834. &$movekey (&QWP(16,$key),"xmm3");
  1835. &lea ($key,&DWP(32,$key));
  1836. &jmp (&label("key_192b_warm"));
  # ---- 256-bit key: xmm0/xmm2 hold the two halves; key_256a updates
  # xmm0 and key_256b updates xmm2
  1837. &set_label("14rounds",16);
  1838. &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
  1839. &mov ($rounds,13);
  1840. &lea ($key,&DWP(16,$key));
  1841. &$movekey (&QWP(-32,$key),"xmm0"); # round 0
  1842. &$movekey (&QWP(-16,$key),"xmm2"); # round 1
  1843. &aeskeygenassist("xmm1","xmm2",0x01); # round 2
  1844. &call (&label("key_256a_cold"));
  1845. &aeskeygenassist("xmm1","xmm0",0x01); # round 3
  1846. &call (&label("key_256b"));
  1847. &aeskeygenassist("xmm1","xmm2",0x02); # round 4
  1848. &call (&label("key_256a"));
  1849. &aeskeygenassist("xmm1","xmm0",0x02); # round 5
  1850. &call (&label("key_256b"));
  1851. &aeskeygenassist("xmm1","xmm2",0x04); # round 6
  1852. &call (&label("key_256a"));
  1853. &aeskeygenassist("xmm1","xmm0",0x04); # round 7
  1854. &call (&label("key_256b"));
  1855. &aeskeygenassist("xmm1","xmm2",0x08); # round 8
  1856. &call (&label("key_256a"));
  1857. &aeskeygenassist("xmm1","xmm0",0x08); # round 9
  1858. &call (&label("key_256b"));
  1859. &aeskeygenassist("xmm1","xmm2",0x10); # round 10
  1860. &call (&label("key_256a"));
  1861. &aeskeygenassist("xmm1","xmm0",0x10); # round 11
  1862. &call (&label("key_256b"));
  1863. &aeskeygenassist("xmm1","xmm2",0x20); # round 12
  1864. &call (&label("key_256a"));
  1865. &aeskeygenassist("xmm1","xmm0",0x20); # round 13
  1866. &call (&label("key_256b"));
  1867. &aeskeygenassist("xmm1","xmm2",0x40); # round 14
  1868. &call (&label("key_256a"));
  1869. &$movekey (&QWP(0,$key),"xmm0");
  1870. &mov (&DWP(16,$key),$rounds);
  1871. &xor ("eax","eax");
  1872. &ret();
  1873. &set_label("key_256a",16);
  1874. &$movekey (&QWP(0,$key),"xmm2");
  1875. &lea ($key,&DWP(16,$key));
  1876. &set_label("key_256a_cold");
  1877. &shufps ("xmm4","xmm0",0b00010000);
  1878. &xorps ("xmm0","xmm4");
  1879. &shufps ("xmm4","xmm0",0b10001100);
  1880. &xorps ("xmm0","xmm4");
  1881. &shufps ("xmm1","xmm1",0b11111111); # critical path
  1882. &xorps ("xmm0","xmm1");
  1883. &ret();
  1884. &set_label("key_256b",16);
  1885. &$movekey (&QWP(0,$key),"xmm0");
  1886. &lea ($key,&DWP(16,$key));
  1887. &shufps ("xmm4","xmm2",0b00010000);
  1888. &xorps ("xmm2","xmm4");
  1889. &shufps ("xmm4","xmm2",0b10001100);
  1890. &xorps ("xmm2","xmm4");
  1891. &shufps ("xmm1","xmm1",0b10101010); # critical path
  1892. &xorps ("xmm2","xmm1");
  1893. &ret();
  # ---- error exits
  1894. &set_label("bad_pointer",4);
  1895. &mov ("eax",-1);
  1896. &ret ();
  1897. &set_label("bad_keybits",4);
  1898. &mov ("eax",-2);
  1899. &ret ();
  1900. &function_end_B("_aesni_set_encrypt_key");
  1901. # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
  1902. # AES_KEY *key)
  #
  # Public wrapper: moves the three stack arguments into "eax", $rounds
  # and $key, calls _aesni_set_encrypt_key and returns with whatever that
  # routine left in "eax".
  1903. &function_begin_B("${PREFIX}_set_encrypt_key");
  1904. &mov ("eax",&wparam(0));
  1905. &mov ($rounds,&wparam(1));
  1906. &mov ($key,&wparam(2));
  1907. &call ("_aesni_set_encrypt_key");
  1908. &ret ();
  1909. &function_end_B("${PREFIX}_set_encrypt_key");
  1910. # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
  1911. # AES_KEY *key)
  #
  # Builds the encryption schedule with _aesni_set_encrypt_key and, if that
  # returned 0, converts it in place: the order of the round keys is
  # reversed and aesimc is applied to every round key except the first and
  # the last. A non-zero return code from the key setup is passed through
  # unchanged and the schedule is left untouched.
  1912. &function_begin_B("${PREFIX}_set_decrypt_key");
  1913. &mov ("eax",&wparam(0));
  1914. &mov ($rounds,&wparam(1));
  1915. &mov ($key,&wparam(2));
  1916. &call ("_aesni_set_encrypt_key");
  # $key was advanced by the call, so reload it; $rounds scaled by 16
  # becomes the byte offset used to locate the last round key
  1917. &mov ($key,&wparam(2));
  1918. &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
  1919. &test ("eax","eax");
  1920. &jnz (&label("dec_key_ret"));
  1921. &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
  1922. &$movekey ("xmm0",&QWP(0,$key)); # just swap
  1923. &$movekey ("xmm1",&QWP(0,"eax"));
  1924. &$movekey (&QWP(0,"eax"),"xmm0");
  1925. &$movekey (&QWP(0,$key),"xmm1");
  1926. &lea ($key,&DWP(16,$key));
  1927. &lea ("eax",&DWP(-16,"eax"));
  # $key walks up and "eax" walks down; the loop runs until they meet
  1928. &set_label("dec_key_inverse");
  1929. &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
  1930. &$movekey ("xmm1",&QWP(0,"eax"));
  1931. &aesimc ("xmm0","xmm0");
  1932. &aesimc ("xmm1","xmm1");
  1933. &lea ($key,&DWP(16,$key));
  1934. &lea ("eax",&DWP(-16,"eax"));
  1935. &$movekey (&QWP(16,"eax"),"xmm0");
  1936. &$movekey (&QWP(-16,$key),"xmm1");
  1937. &cmp ("eax",$key);
  1938. &ja (&label("dec_key_inverse"));
  1939. &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
  1940. &aesimc ("xmm0","xmm0");
  1941. &$movekey (&QWP(0,$key),"xmm0");
  1942. &xor ("eax","eax"); # return success
  1943. &set_label("dec_key_ret");
  1944. &ret ();
  1945. &function_end_B("${PREFIX}_set_decrypt_key");
  # Module epilogue. asciz/asm_finish are defined outside this chunk:
  # presumably asciz embeds the banner below as a NUL-terminated string in
  # the generated output and asm_finish finalizes/prints the assembly -
  # TODO confirm against the perlasm helper module.
  1946. &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
  1947. &asm_finish();