aes-586.pl 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  5. # project. Rights for redistribution and usage in source and binary
  6. # forms are granted according to the OpenSSL license.
  7. # ====================================================================
  8. #
  9. # Version 3.6.
  10. #
  11. # You might fail to appreciate this module performance from the first
  12. # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
  13. # to be *the* best Intel C compiler without -KPIC, performance appears
  14. # to be virtually identical... But try to re-configure with shared
  15. # library support... Aha! Intel compiler "suddenly" lags behind by 30%
  16. # [on P4, more on others]:-) And if compared to position-independent
  17. # code generated by GNU C, this code performs *more* than *twice* as
  18. # fast! Yes, all this buzz about PIC means that unlike other hand-
  19. # coded implementations, this one was explicitly designed to be safe
  20. # to use even in shared library context... This also means that this
  21. # code isn't necessarily absolutely fastest "ever," because in order
  22. # to achieve position independence an extra register has to be
  23. # off-loaded to stack, which affects the benchmark result.
  24. #
  25. # Special note about instruction choice. Do you recall RC4_INT code
  26. # performing poorly on P4? It might be the time to figure out why.
  27. # RC4_INT code implies effective address calculations in base+offset*4
  28. # form. Trouble is that it seems that offset scaling turned to be
  29. # critical path... At least eliminating scaling resulted in 2.8x RC4
  30. # performance improvement [as you might recall]. As AES code is hungry
  31. # for scaling too, I [try to] avoid the latter by favoring off-by-2
  32. # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
  33. #
  34. # As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
  35. # void. Performance improvement with off-by-2 shifts was observed on
  36. # intermediate implementation, which was spilling yet another register
  37. # to stack... Final offset*4 code below runs just a tad faster on P4,
  38. # but exhibits up to 10% improvement on other cores.
  39. #
  40. # Second version is "monolithic" replacement for aes_core.c, which in
  41. # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
  42. # This made it possible to implement little-endian variant of the
  43. # algorithm without modifying the base C code. Motivating factor for
  44. # the undertaken effort was that it appeared that in tight IA-32
  45. # register window little-endian flavor could achieve slightly higher
  46. # Instruction Level Parallelism, and it indeed resulted in up to 15%
  47. # better performance on most recent µ-archs...
  48. #
  49. # Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
# observed on P4 core, where "overall" improvement coefficient, i.e. if
  52. # compared to PIC generated by GCC and in CBC mode, was observed to be
  53. # as large as 4x:-) CBC performance is virtually identical to ECB now
  54. # and on some platforms even better, e.g. 17.6 "small" cycles/byte on
  55. # Opteron, because certain function prologues and epilogues are
  56. # effectively taken out of the loop...
  57. #
  58. # Version 3.2 implements compressed tables and prefetch of these tables
  59. # in CBC[!] mode. Former means that 3/4 of table references are now
  60. # misaligned, which unfortunately has negative impact on elder IA-32
  61. # implementations, Pentium suffered 30% penalty, PIII - 10%.
  62. #
  63. # Version 3.3 avoids L1 cache aliasing between stack frame and
  64. # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
  65. # latter is achieved by copying the key schedule to controlled place in
  66. # stack. This unfortunately has rather strong impact on small block CBC
  67. # performance, ~2x deterioration on 16-byte block if compared to 3.3.
  68. #
  69. # Version 3.5 checks if there is L1 cache aliasing between user-supplied
  70. # key schedule and S-boxes and abstains from copying the former if
  71. # there is no. This allows end-user to consciously retain small block
  72. # performance by aligning key schedule in specific manner.
  73. #
  74. # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
  75. #
  76. # Current ECB performance numbers for 128-bit key in CPU cycles per
  77. # processed byte [measure commonly used by AES benchmarkers] are:
  78. #
#               small footprint         fully unrolled
# P4            24                      22
# AMD K8        20                      19
# PIII          25                      23
# Pentium       81                      78
# Generator set-up: locate and load the perlasm support code, initialise
# it, then name the registers and tuning knobs used by everything below.

# Add both candidate locations of the perlasm directory to the search path.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
# Arguments: the first command-line argument, this script's name, and a
# flag that is true when the last command-line argument is "386".
# NOTE(review): presumably these select the assembler flavour and a
# 386-only instruction subset -- confirm against x86asm.pl.
&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");
# The four 32-bit words of the AES state live in these registers.
$s0="eax";
$s1="ebx";
$s2="ecx";
$s3="edx";
# Pointer to the current round key; also borrowed as a scratch register
# by the round generators, which reload it from 12(%esp) afterwards.
$key="edi";
# General accumulator/scratch register.
$acc="esi";
$compromise=0; # $compromise=128 abstains from copying key
# schedule to stack when encrypting inputs
# shorter than 128 bytes at the cost of
# risking aliasing with S-boxes. In return
# you get way better, up to +70%, small block
# performance.
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
$vertical_spin=0; # shift "vertically" defaults to 0, because of
# its proof-of-concept status...
  106. # Note that there is no decvert(), as well as last encryption round is
  107. # performed with "horizontal" shifts. This is because this "vertical"
  108. # implementation [one which groups shifts on a given $s[i] to form a
  109. # "column," unlike "horizontal" one, which groups shifts on different
  110. # $s[i] to form a "row"] is work in progress. It was observed to run
  111. # few percents faster on Intel cores, but not AMD. On AMD K8 core it's
  112. # whole 12% slower:-( So we face a trade-off... Shall it be resolved
  113. # some day? Till then the code is considered experimental and by
  114. # default remains dormant...
  115. sub encvert()
  116. { my ($te,@s) = @_;
  117. my $v0 = $acc, $v1 = $key;
  118. &mov ($v0,$s[3]); # copy s3
  119. &mov (&DWP(4,"esp"),$s[2]); # save s2
  120. &mov ($v1,$s[0]); # copy s0
  121. &mov (&DWP(8,"esp"),$s[1]); # save s1
  122. &movz ($s[2],&HB($s[0]));
  123. &and ($s[0],0xFF);
  124. &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
  125. &shr ($v1,16);
  126. &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
  127. &movz ($s[1],&HB($v1));
  128. &and ($v1,0xFF);
  129. &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
  130. &mov ($v1,$v0);
  131. &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
  132. &and ($v0,0xFF);
  133. &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
  134. &movz ($v0,&HB($v1));
  135. &shr ($v1,16);
  136. &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
  137. &movz ($v0,&HB($v1));
  138. &and ($v1,0xFF);
  139. &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
  140. &mov ($v1,&DWP(4,"esp")); # restore s2
  141. &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
  142. &mov ($v0,$v1);
  143. &and ($v1,0xFF);
  144. &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
  145. &movz ($v1,&HB($v0));
  146. &shr ($v0,16);
  147. &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
  148. &movz ($v1,&HB($v0));
  149. &and ($v0,0xFF);
  150. &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
  151. &mov ($v0,&DWP(8,"esp")); # restore s1
  152. &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
  153. &mov ($v1,$v0);
  154. &and ($v0,0xFF);
  155. &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
  156. &movz ($v0,&HB($v1));
  157. &shr ($v1,16);
  158. &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
  159. &movz ($v0,&HB($v1));
  160. &and ($v1,0xFF);
  161. &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
  162. &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key
  163. &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
  164. }
  165. sub encstep()
  166. { my ($i,$te,@s) = @_;
  167. my $tmp = $key;
  168. my $out = $i==3?$s[0]:$acc;
  169. # lines marked with #%e?x[i] denote "reordered" instructions...
  170. if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx
  171. else { &mov ($out,$s[0]);
  172. &and ($out,0xFF); }
  173. if ($i==1) { &shr ($s[0],16); }#%ebx[1]
  174. if ($i==2) { &shr ($s[0],24); }#%ecx[2]
  175. &mov ($out,&DWP(0,$te,$out,8));
  176. if ($i==3) { $tmp=$s[1]; }##%eax
  177. &movz ($tmp,&HB($s[1]));
  178. &xor ($out,&DWP(3,$te,$tmp,8));
  179. if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
  180. else { &mov ($tmp,$s[2]);
  181. &shr ($tmp,16); }
  182. if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
  183. &and ($tmp,0xFF);
  184. &xor ($out,&DWP(2,$te,$tmp,8));
  185. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
  186. elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
  187. else { &mov ($tmp,$s[3]);
  188. &shr ($tmp,24) }
  189. &xor ($out,&DWP(1,$te,$tmp,8));
  190. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  191. if ($i==3) { &mov ($s[3],$acc); }
  192. &comment();
  193. }
  194. sub enclast()
  195. { my ($i,$te,@s)=@_;
  196. my $tmp = $key;
  197. my $out = $i==3?$s[0]:$acc;
  198. if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx
  199. else { &mov ($out,$s[0]); }
  200. &and ($out,0xFF);
  201. if ($i==1) { &shr ($s[0],16); }#%ebx[1]
  202. if ($i==2) { &shr ($s[0],24); }#%ecx[2]
  203. &mov ($out,&DWP(2,$te,$out,8));
  204. &and ($out,0x000000ff);
  205. if ($i==3) { $tmp=$s[1]; }##%eax
  206. &movz ($tmp,&HB($s[1]));
  207. &mov ($tmp,&DWP(0,$te,$tmp,8));
  208. &and ($tmp,0x0000ff00);
  209. &xor ($out,$tmp);
  210. if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
  211. else { mov ($tmp,$s[2]);
  212. &shr ($tmp,16); }
  213. if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
  214. &and ($tmp,0xFF);
  215. &mov ($tmp,&DWP(0,$te,$tmp,8));
  216. &and ($tmp,0x00ff0000);
  217. &xor ($out,$tmp);
  218. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
  219. elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
  220. else { &mov ($tmp,$s[3]);
  221. &shr ($tmp,24); }
  222. &mov ($tmp,&DWP(2,$te,$tmp,8));
  223. &and ($tmp,0xff000000);
  224. &xor ($out,$tmp);
  225. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  226. if ($i==3) { &mov ($s[3],$acc); }
  227. }
  228. sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
# _x86_AES_encrypt -- internal block-encryption routine.
#
# Register contract, as used by the code below:
#   $s0..$s3  the four state words on entry, the result on return
#   $key      pointer to the key schedule; the round count is read from
#             offset 240
#   ebp       address of the Te table (passed to the round generators)
# The caller provides the stack frame: 4(%esp) and 8(%esp) are scratch for
# the round generators, 12(%esp) holds the current round-key pointer and
# 16(%esp) the end-of-schedule pointer (small-footprint build only).
&public_label("AES_Te");
&function_begin_B("_x86_AES_encrypt");
if ($vertical_spin) {
# I need high parts of volatile registers to be accessible...
&exch ($s1="edi",$key="ebx");
&mov ($s2="esi",$acc="ecx");
}
# note that caller is expected to allocate stack frame for me!
&mov (&DWP(12,"esp"),$key); # save key
# Initial AddRoundKey with round key 0.
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
&mov ($acc,&DWP(240,$key)); # load key->rounds
if ($small_footprint) {
# Looped variant: $acc = 2*rounds-2, then $key + 8*$acc, i.e. the address
# of round key number rounds-1.  The loop below advances $key by 16 per
# pass and exits once $key reaches that address, leaving the last round
# key for the final round.
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
&mov (&DWP(16,"esp"),$acc); # end of key schedule
&align (4);
&set_label("loop");
if ($vertical_spin) {
&encvert("ebp",$s0,$s1,$s2,$s3);
} else {
# One round = four steps, with the state registers rotated by one each time.
&encstep(0,"ebp",$s0,$s1,$s2,$s3);
&encstep(1,"ebp",$s1,$s2,$s3,$s0);
&encstep(2,"ebp",$s2,$s3,$s0,$s1);
&encstep(3,"ebp",$s3,$s0,$s1,$s2);
}
&add ($key,16); # advance rd_key
&xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
&cmp ($key,&DWP(16,"esp"));
&mov (&DWP(12,"esp"),$key);
&jb (&label("loop"));
}
else {
# Fully unrolled variant: dispatch on the round count.  The 14-round entry
# runs two extra rounds and falls into the 12-round entry, which runs two
# more and falls into the common nine rounds of the 10-round entry.
&cmp ($acc,10);
&jle (&label("10rounds"));
&cmp ($acc,12);
&jle (&label("12rounds"));
&set_label("14rounds");
for ($i=1;$i<3;$i++) {
if ($vertical_spin) {
&encvert("ebp",$s0,$s1,$s2,$s3);
} else {
&encstep(0,"ebp",$s0,$s1,$s2,$s3);
&encstep(1,"ebp",$s1,$s2,$s3,$s0);
&encstep(2,"ebp",$s2,$s3,$s0,$s1);
&encstep(3,"ebp",$s3,$s0,$s1,$s2);
}
&xor ($s0,&DWP(16*$i+0,$key));
&xor ($s1,&DWP(16*$i+4,$key));
&xor ($s2,&DWP(16*$i+8,$key));
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
&mov (&DWP(12,"esp"),$key); # advance rd_key
&set_label("12rounds");
for ($i=1;$i<3;$i++) {
if ($vertical_spin) {
&encvert("ebp",$s0,$s1,$s2,$s3);
} else {
&encstep(0,"ebp",$s0,$s1,$s2,$s3);
&encstep(1,"ebp",$s1,$s2,$s3,$s0);
&encstep(2,"ebp",$s2,$s3,$s0,$s1);
&encstep(3,"ebp",$s3,$s0,$s1,$s2);
}
&xor ($s0,&DWP(16*$i+0,$key));
&xor ($s1,&DWP(16*$i+4,$key));
&xor ($s2,&DWP(16*$i+8,$key));
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
&mov (&DWP(12,"esp"),$key); # advance rd_key
&set_label("10rounds");
for ($i=1;$i<10;$i++) {
if ($vertical_spin) {
&encvert("ebp",$s0,$s1,$s2,$s3);
} else {
&encstep(0,"ebp",$s0,$s1,$s2,$s3);
&encstep(1,"ebp",$s1,$s2,$s3,$s0);
&encstep(2,"ebp",$s2,$s3,$s0,$s1);
&encstep(3,"ebp",$s3,$s0,$s1,$s2);
}
&xor ($s0,&DWP(16*$i+0,$key));
&xor ($s1,&DWP(16*$i+4,$key));
&xor ($s2,&DWP(16*$i+8,$key));
&xor ($s3,&DWP(16*$i+12,$key));
}
}
if ($vertical_spin) {
# "reincarnate" some registers for "horizontal" spin...
&mov ($s1="ebx",$key="edi");
&mov ($s2="ecx",$acc="esi");
}
# Final round, always generated in the "horizontal" form.
&enclast(0,"ebp",$s0,$s1,$s2,$s3);
&enclast(1,"ebp",$s1,$s2,$s3,$s0);
&enclast(2,"ebp",$s2,$s3,$s0,$s1);
&enclast(3,"ebp",$s3,$s0,$s1,$s2);
# Step to the last round key: one key ahead in the looped build, ten keys
# (160 bytes) ahead of the 10-round entry's base in the unrolled build.
&add ($key,$small_footprint?16:160);
&xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
&ret ();
# AES_Te: the encryption lookup table, 256 entries.  _data_word emits each
# word twice, so entries are 8 bytes apart (2048 bytes in total) and the
# round generators read them at byte offsets 0..3 with an index scale of 8.
# NOTE(review): the second argument to set_label (64) is presumably the
# requested alignment -- confirm against x86asm.pl.
&set_label("AES_Te",64); # Yes! I keep it in the code segment!
&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
# Round constants follow the table directly.  They are emitted with plain
# &data_word, so unlike the Te entries they are NOT duplicated; the list is
# padded with zero words.
#rcon:
&data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
&data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
&data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0);
&function_end_B("_x86_AES_encrypt");
  406. # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
  407. &public_label("AES_Te");
  408. &function_begin("AES_encrypt");
  409. &mov ($acc,&wparam(0)); # load inp
  410. &mov ($key,&wparam(2)); # load key
  411. &mov ($s0,"esp");
  412. &sub ("esp",24);
  413. &and ("esp",-64);
  414. &add ("esp",4);
  415. &mov (&DWP(16,"esp"),$s0);
  416. &call (&label("pic_point")); # make it PIC!
  417. &set_label("pic_point");
  418. &blindpop("ebp");
  419. &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));
  420. &mov ($s0,&DWP(0,$acc)); # load input data
  421. &mov ($s1,&DWP(4,$acc));
  422. &mov ($s2,&DWP(8,$acc));
  423. &mov ($s3,&DWP(12,$acc));
  424. &call ("_x86_AES_encrypt");
  425. &mov ("esp",&DWP(16,"esp"));
  426. &mov ($acc,&wparam(1)); # load out
  427. &mov (&DWP(0,$acc),$s0); # write output data
  428. &mov (&DWP(4,$acc),$s1);
  429. &mov (&DWP(8,$acc),$s2);
  430. &mov (&DWP(12,$acc),$s3);
  431. &function_end("AES_encrypt");
  432. #------------------------------------------------------------------#
  433. sub decstep()
  434. { my ($i,$td,@s) = @_;
  435. my $tmp = $key;
  436. my $out = $i==3?$s[0]:$acc;
  437. # no instructions are reordered, as performance appears
  438. # optimal... or rather that all attempts to reorder didn't
  439. # result in better performance [which by the way is not a
  440. # bit lower than ecryption].
  441. if($i==3) { &mov ($key,&DWP(12,"esp")); }
  442. else { &mov ($out,$s[0]); }
  443. &and ($out,0xFF);
  444. &mov ($out,&DWP(0,$td,$out,8));
  445. if ($i==3) { $tmp=$s[1]; }
  446. &movz ($tmp,&HB($s[1]));
  447. &xor ($out,&DWP(3,$td,$tmp,8));
  448. if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
  449. else { &mov ($tmp,$s[2]); }
  450. &shr ($tmp,16);
  451. &and ($tmp,0xFF);
  452. &xor ($out,&DWP(2,$td,$tmp,8));
  453. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
  454. else { &mov ($tmp,$s[3]); }
  455. &shr ($tmp,24);
  456. &xor ($out,&DWP(1,$td,$tmp,8));
  457. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  458. if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
  459. &comment();
  460. }
  461. sub declast()
  462. { my ($i,$td,@s)=@_;
  463. my $tmp = $key;
  464. my $out = $i==3?$s[0]:$acc;
  465. if($i==3) { &mov ($key,&DWP(12,"esp")); }
  466. else { &mov ($out,$s[0]); }
  467. &and ($out,0xFF);
  468. &movz ($out,&BP(2048,$td,$out,1));
  469. if ($i==3) { $tmp=$s[1]; }
  470. &movz ($tmp,&HB($s[1]));
  471. &movz ($tmp,&BP(2048,$td,$tmp,1));
  472. &shl ($tmp,8);
  473. &xor ($out,$tmp);
  474. if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
  475. else { mov ($tmp,$s[2]); }
  476. &shr ($tmp,16);
  477. &and ($tmp,0xFF);
  478. &movz ($tmp,&BP(2048,$td,$tmp,1));
  479. &shl ($tmp,16);
  480. &xor ($out,$tmp);
  481. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
  482. else { &mov ($tmp,$s[3]); }
  483. &shr ($tmp,24);
  484. &movz ($tmp,&BP(2048,$td,$tmp,1));
  485. &shl ($tmp,24);
  486. &xor ($out,$tmp);
  487. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  488. if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
  489. }
  490. &public_label("AES_Td");
  491. &function_begin_B("_x86_AES_decrypt");
  492. # note that caller is expected to allocate stack frame for me!
  493. &mov (&DWP(12,"esp"),$key); # save key
  494. &xor ($s0,&DWP(0,$key)); # xor with key
  495. &xor ($s1,&DWP(4,$key));
  496. &xor ($s2,&DWP(8,$key));
  497. &xor ($s3,&DWP(12,$key));
  498. &mov ($acc,&DWP(240,$key)); # load key->rounds
  499. if ($small_footprint) {
  500. &lea ($acc,&DWP(-2,$acc,$acc));
  501. &lea ($acc,&DWP(0,$key,$acc,8));
  502. &mov (&DWP(16,"esp"),$acc); # end of key schedule
  503. &align (4);
  504. &set_label("loop");
  505. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  506. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  507. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  508. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  509. &add ($key,16); # advance rd_key
  510. &xor ($s0,&DWP(0,$key));
  511. &xor ($s1,&DWP(4,$key));
  512. &xor ($s2,&DWP(8,$key));
  513. &xor ($s3,&DWP(12,$key));
  514. &cmp ($key,&DWP(16,"esp"));
  515. &mov (&DWP(12,"esp"),$key);
  516. &jb (&label("loop"));
  517. }
  518. else {
  519. &cmp ($acc,10);
  520. &jle (&label("10rounds"));
  521. &cmp ($acc,12);
  522. &jle (&label("12rounds"));
  523. &set_label("14rounds");
  524. for ($i=1;$i<3;$i++) {
  525. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  526. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  527. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  528. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  529. &xor ($s0,&DWP(16*$i+0,$key));
  530. &xor ($s1,&DWP(16*$i+4,$key));
  531. &xor ($s2,&DWP(16*$i+8,$key));
  532. &xor ($s3,&DWP(16*$i+12,$key));
  533. }
  534. &add ($key,32);
  535. &mov (&DWP(12,"esp"),$key); # advance rd_key
  536. &set_label("12rounds");
  537. for ($i=1;$i<3;$i++) {
  538. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  539. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  540. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  541. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  542. &xor ($s0,&DWP(16*$i+0,$key));
  543. &xor ($s1,&DWP(16*$i+4,$key));
  544. &xor ($s2,&DWP(16*$i+8,$key));
  545. &xor ($s3,&DWP(16*$i+12,$key));
  546. }
  547. &add ($key,32);
  548. &mov (&DWP(12,"esp"),$key); # advance rd_key
  549. &set_label("10rounds");
  550. for ($i=1;$i<10;$i++) {
  551. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  552. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  553. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  554. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  555. &xor ($s0,&DWP(16*$i+0,$key));
  556. &xor ($s1,&DWP(16*$i+4,$key));
  557. &xor ($s2,&DWP(16*$i+8,$key));
  558. &xor ($s3,&DWP(16*$i+12,$key));
  559. }
  560. }
  561. &declast(0,"ebp",$s0,$s3,$s2,$s1);
  562. &declast(1,"ebp",$s1,$s0,$s3,$s2);
  563. &declast(2,"ebp",$s2,$s1,$s0,$s3);
  564. &declast(3,"ebp",$s3,$s2,$s1,$s0);
  565. &add ($key,$small_footprint?16:160);
  566. &xor ($s0,&DWP(0,$key));
  567. &xor ($s1,&DWP(4,$key));
  568. &xor ($s2,&DWP(8,$key));
  569. &xor ($s3,&DWP(12,$key));
  570. &ret ();
  571. &set_label("AES_Td",64); # Yes! I keep it in the code segment!
  572. &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
  573. &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
  574. &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
  575. &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
  576. &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
  577. &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
  578. &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
  579. &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
  580. &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
  581. &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
  582. &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
  583. &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
  584. &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
  585. &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
  586. &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
  587. &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
  588. &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
  589. &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
  590. &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
  591. &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
  592. &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
  593. &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
  594. &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
  595. &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
  596. &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
  597. &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
  598. &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
  599. &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
  600. &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
  601. &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
  602. &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
  603. &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
  604. &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
  605. &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
  606. &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
  607. &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
  608. &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
  609. &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
  610. &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
  611. &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
  612. &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
  613. &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
  614. &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
  615. &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
  616. &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
  617. &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
  618. &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
  619. &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
  620. &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
  621. &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
  622. &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
  623. &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
  624. &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
  625. &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
  626. &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
  627. &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
  628. &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
  629. &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
  630. &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
  631. &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
  632. &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
  633. &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
  634. &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
  635. &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
  636. #Td4:
  637. &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
  638. &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
  639. &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
  640. &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
  641. &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
  642. &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
  643. &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
  644. &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
  645. &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
  646. &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
  647. &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
  648. &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
  649. &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
  650. &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
  651. &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
  652. &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
  653. &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
  654. &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
  655. &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
  656. &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
  657. &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
  658. &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
  659. &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
  660. &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
  661. &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
  662. &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
  663. &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
  664. &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
  665. &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
  666. &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
  667. &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
  668. &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
  669. &function_end_B("_x86_AES_decrypt");
  # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
  #
  # Emits the public single-block decrypt entry point. It switches to a
  # 64-byte aligned scratch frame (the caller's %esp is parked at 16(%esp)),
  # points %ebp at AES_Td in a position-independent way, touches the Td4
  # area, runs _x86_AES_decrypt on the four input dwords and writes the
  # four result dwords to out.
  &public_label("AES_Td");
  &function_begin("AES_decrypt");
      &mov    ($acc,&wparam(0));                  # load inp
      &mov    ($key,&wparam(2));                  # load key

      # build the aligned frame and remember the original stack pointer
      &mov    ($s0,"esp");
      &sub    ("esp",24);
      &and    ("esp",-64);
      &add    ("esp",4);
      &mov    (&DWP(16,"esp"),$s0);

      # make it PIC: call/pop yields the address of pic_point in %ebp
      &call   (&label("pic_point"));
      &set_label("pic_point");
      &blindpop("ebp");
      &lea    ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

      # prefetch Td4: one dword read per 32-byte stride over the 256 bytes
      # starting 2048 bytes past AES_Td; %ebp is biased by +128 meanwhile
      &lea    ("ebp",&DWP(2048+128,"ebp"));
      foreach my $i (0..7) {
          &mov    (($s0,$s1,$s2,$s3)[$i%4],&DWP(32*$i-128,"ebp"));
      }
      &lea    ("ebp",&DWP(-2048-128,"ebp"));

      # load input data
      foreach my $i (0..3) {
          &mov    (($s0,$s1,$s2,$s3)[$i],&DWP(4*$i,$acc));
      }

      &call   ("_x86_AES_decrypt");

      # back to the caller's stack before the out argument is fetched
      &mov    ("esp",&DWP(16,"esp"));
      &mov    ($acc,&wparam(1));                  # load out

      # write output data
      foreach my $i (0..3) {
          &mov    (&DWP(4*$i,$acc),($s0,$s1,$s2,$s3)[$i]);
      }
  &function_end("AES_decrypt");
  # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
  #                       size_t length, const AES_KEY *key,
  #                       unsigned char *ivp,const int enc);
  #
  # Emits the CBC routine. A zero length returns immediately. enc==0 takes
  # the DECRYPT path, any other value the encrypt path. Each path
  #   1. computes a 64-byte aligned private frame placed so that it does not
  #      alias the lookup table modulo 4096,
  #   2. saves the arguments in that frame and decides whether to copy the
  #      key schedule into it ($mark stays 0 when no copy was made),
  #   3. prefetches the table, processes the data 16 bytes at a time,
  #   4. writes the final IV back through ivp, zeroes the key schedule copy
  #      if one was made, then restores %esp and the flags.
  {
  # stack frame layout
  # -4(%esp)    0(%esp)         return address
  # 0(%esp)     4(%esp)         tmp1
  # 4(%esp)     8(%esp)         tmp2
  # 8(%esp)     12(%esp)        key
  # 12(%esp)    16(%esp)        end of key schedule
  my $_esp=&DWP(16,"esp");        #saved %esp
  my $_inp=&DWP(20,"esp");        #copy of wparam(0)
  my $_out=&DWP(24,"esp");        #copy of wparam(1)
  my $_len=&DWP(28,"esp");        #copy of wparam(2)
  my $_key=&DWP(32,"esp");        #copy of wparam(3)
  my $_ivp=&DWP(36,"esp");        #copy of wparam(4)
  my $_tmp=&DWP(40,"esp");        #volatile variable
  my $ivec=&DWP(44,"esp");        #ivec[16]
  my $aes_key=&DWP(60,"esp");     #copy of aes_key
  my $mark=&DWP(60+240,"esp");    #copy of aes_key->rounds

  &public_label("AES_Te");
  &public_label("AES_Td");
  &function_begin("AES_cbc_encrypt");
      &mov    ($s2 eq "ecx"? $s2 : "",&wparam(2));    # load len
      &cmp    ($s2,0);
      &je     (&label("enc_out"));

      &call   (&label("pic_point"));          # make it PIC!
      &set_label("pic_point");
      &blindpop("ebp");

      &pushf  ();
      &cld    ();                             # string ops below must run forward

      &cmp    (&wparam(5),0);
      &je     (&label("DECRYPT"));

      &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

      # allocate aligned stack frame...
      &lea    ($key,&DWP(-64-244,"esp"));
      &and    ($key,-64);

      # ... and make sure it doesn't alias with AES_Te modulo 4096
      &mov    ($s0,"ebp");
      &lea    ($s1,&DWP(2048,"ebp"));
      &mov    ($s3,$key);
      &and    ($s0,0xfff);            # s = %ebp&0xfff
      &and    ($s1,0xfff);            # e = (%ebp+2048)&0xfff
      &and    ($s3,0xfff);            # p = %esp&0xfff

      &cmp    ($s3,$s1);              # if (p>=e) %esp -= (p-e);
      &jb     (&label("te_break_out"));
      &sub    ($s3,$s1);
      &sub    ($key,$s3);
      &jmp    (&label("te_ok"));
  &set_label("te_break_out");         # else %esp -= (p-s)&0xfff + framesz;
      &sub    ($s3,$s0);
      &and    ($s3,0xfff);
      &add    ($s3,64+256);
      &sub    ($key,$s3);
      &align  (4);
  &set_label("te_ok");

      &mov    ($s0,&wparam(0));       # load inp
      &mov    ($s1,&wparam(1));       # load out
      &mov    ($s3,&wparam(3));       # load key
      &mov    ($acc,&wparam(4));      # load ivp

      &exch   ("esp",$key);           # switch to the new frame; $key = old %esp
      &add    ("esp",4);              # reserve for return address!
      &mov    ($_esp,$key);           # save %esp

      &mov    ($_inp,$s0);            # save copy of inp
      &mov    ($_out,$s1);            # save copy of out
      &mov    ($_len,$s2);            # save copy of len
      &mov    ($_key,$s3);            # save copy of key
      &mov    ($_ivp,$acc);           # save copy of ivp

      &mov    ($mark,0);              # copy of aes_key->rounds = 0;
      if ($compromise) {
          # inputs shorter than $compromise never get a key schedule copy
          &cmp    ($s2,$compromise);
          &jb     (&label("skip_ecopy"));
      }
      # do we copy key schedule to stack?
      &mov    ($s1 eq "ebx" ? $s1 : "",$s3);
      &mov    ($s2 eq "ecx" ? $s2 : "",244/4);
      &sub    ($s1,"ebp");
      &mov    ("esi",$s3);
      &and    ($s1,0xfff);            # (key - AES_Te) modulo 4096
      &lea    ("edi",$aes_key);
      &cmp    ($s1,2048);
      &jb     (&label("do_ecopy"));
      &cmp    ($s1,4096-244);
      &jb     (&label("skip_ecopy"));
      &align  (4);
  &set_label("do_ecopy");
      &mov    ($_key,"edi");          # use the on-stack copy from now on
      &data_word(0xA5F3F689);         # rep movsd
  &set_label("skip_ecopy");

      &mov    ($acc,$s0);
      &mov    ($key,16);              # 16 iterations x 128 bytes = 2048
      &align  (4);
  &set_label("prefetch_te");
          &mov    ($s0,&DWP(0,"ebp"));
          &mov    ($s1,&DWP(32,"ebp"));
          &mov    ($s2,&DWP(64,"ebp"));
          &mov    ($s3,&DWP(96,"ebp"));
          &lea    ("ebp",&DWP(128,"ebp"));
          &dec    ($key);
      &jnz    (&label("prefetch_te"));
      &sub    ("ebp",2048);           # rewind %ebp to AES_Te

      &mov    ($s2,$_len);
      &mov    ($key,$_ivp);
      &test   ($s2,0xFFFFFFF0);
      &jz     (&label("enc_tail"));           # short input...

      &mov    ($s0,&DWP(0,$key));             # load iv
      &mov    ($s1,&DWP(4,$key));

      &align  (4);
  &set_label("enc_loop");
          # $key points at the chaining block: ivp first, then previous output
          &mov    ($s2,&DWP(8,$key));
          &mov    ($s3,&DWP(12,$key));

          &xor    ($s0,&DWP(0,$acc));     # xor input data
          &xor    ($s1,&DWP(4,$acc));
          &xor    ($s2,&DWP(8,$acc));
          &xor    ($s3,&DWP(12,$acc));

          &mov    ($key,$_key);           # load key
          &call   ("_x86_AES_encrypt");

          &mov    ($acc,$_inp);           # load inp
          &mov    ($key,$_out);           # load out

          &mov    (&DWP(0,$key),$s0);     # save output data
          &mov    (&DWP(4,$key),$s1);
          &mov    (&DWP(8,$key),$s2);
          &mov    (&DWP(12,$key),$s3);

          &mov    ($s2,$_len);            # load len

          &lea    ($acc,&DWP(16,$acc));
          &mov    ($_inp,$acc);           # save inp

          &lea    ($s3,&DWP(16,$key));
          &mov    ($_out,$s3);            # save out

          &sub    ($s2,16);
          &test   ($s2,0xFFFFFFF0);
          &mov    ($_len,$s2);            # save len (mov leaves the flags alone)
      &jnz    (&label("enc_loop"));
      &test   ($s2,15);
      &jnz    (&label("enc_tail"));
      &mov    ($acc,$_ivp);           # load ivp
      &mov    ($s2,&DWP(8,$key));     # restore last dwords
      &mov    ($s3,&DWP(12,$key));
      &mov    (&DWP(0,$acc),$s0);     # save ivec
      &mov    (&DWP(4,$acc),$s1);
      &mov    (&DWP(8,$acc),$s2);
      &mov    (&DWP(12,$acc),$s3);

      &cmp    ($mark,0);              # was the key schedule copied?
      &mov    ("edi",$_key);
      &je     (&label("skip_ezero"));
      # zero copy of key schedule
      &mov    ("ecx",240/4);
      &xor    ("eax","eax");
      &align  (4);
      &data_word(0xABF3F689);         # rep stosd
  &set_label("skip_ezero");
      &mov    ("esp",$_esp);
      &popf   ();
  &set_label("enc_out");
      &function_end_A();
      &pushf  ();                     # kludge, never executed

      &align  (4);
  &set_label("enc_tail");
      # Fewer than 16 bytes remain: build a zero-padded block in the output
      # buffer and send it through enc_loop once more.
      &mov    ($s0,$key eq "edi" ? $key : "");
      &mov    ($key,$_out);                   # load out
      &push   ($s0);                          # push ivp
      &mov    ($s1,16);
      &sub    ($s1,$s2);                      # number of padding bytes
      &cmp    ($key,$acc);                    # compare with inp
      &je     (&label("enc_in_place"));
      &align  (4);
      &data_word(0xA4F3F689); # rep movsb     # copy input
      &jmp    (&label("enc_skip_in_place"));
  &set_label("enc_in_place");
      &lea    ($key,&DWP(0,$key,$s2));        # skip over the bytes already there
  &set_label("enc_skip_in_place");
      &mov    ($s2,$s1);
      &xor    ($s0,$s0);
      &align  (4);
      &data_word(0xAAF3F689); # rep stosb     # zero tail
      &pop    ($key);                         # pop ivp

      &mov    ($acc,$_out);                   # output as input
      &mov    ($s0,&DWP(0,$key));
      &mov    ($s1,&DWP(4,$key));
      &mov    ($_len,16);                     # len=16
      &jmp    (&label("enc_loop"));           # one more spin...

  #----------------------------- DECRYPT -----------------------------#
      &align  (4);
  &set_label("DECRYPT");
      &lea    ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

      # allocate aligned stack frame...
      &lea    ($key,&DWP(-64-244,"esp"));
      &and    ($key,-64);

      # ... and make sure it doesn't alias with AES_Td modulo 4096
      &mov    ($s0,"ebp");
      &lea    ($s1,&DWP(2048+256,"ebp"));
      &mov    ($s3,$key);
      &and    ($s0,0xfff);            # s = %ebp&0xfff
      &and    ($s1,0xfff);            # e = (%ebp+2048+256)&0xfff
      &and    ($s3,0xfff);            # p = %esp&0xfff

      &cmp    ($s3,$s1);              # if (p>=e) %esp -= (p-e);
      &jb     (&label("td_break_out"));
      &sub    ($s3,$s1);
      &sub    ($key,$s3);
      &jmp    (&label("td_ok"));
  &set_label("td_break_out");         # else %esp -= (p-s)&0xfff + framesz;
      &sub    ($s3,$s0);
      &and    ($s3,0xfff);
      &add    ($s3,64+256);
      &sub    ($key,$s3);
      &align  (4);
  &set_label("td_ok");

      &mov    ($s0,&wparam(0));       # load inp
      &mov    ($s1,&wparam(1));       # load out
      &mov    ($s3,&wparam(3));       # load key
      &mov    ($acc,&wparam(4));      # load ivp

      &exch   ("esp",$key);           # switch to the new frame; $key = old %esp
      &add    ("esp",4);              # reserve for return address!
      &mov    ($_esp,$key);           # save %esp

      &mov    ($_inp,$s0);            # save copy of inp
      &mov    ($_out,$s1);            # save copy of out
      &mov    ($_len,$s2);            # save copy of len
      &mov    ($_key,$s3);            # save copy of key
      &mov    ($_ivp,$acc);           # save copy of ivp

      &mov    ($mark,0);              # copy of aes_key->rounds = 0;
      if ($compromise) {
          # inputs shorter than $compromise never get a key schedule copy
          &cmp    ($s2,$compromise);
          &jb     (&label("skip_dcopy"));
      }
      # do we copy key schedule to stack?
      &mov    ($s1 eq "ebx" ? $s1 : "",$s3);
      &mov    ($s2 eq "ecx" ? $s2 : "",244/4);
      &sub    ($s1,"ebp");
      &mov    ("esi",$s3);
      &and    ($s1,0xfff);            # (key - AES_Td) modulo 4096
      &lea    ("edi",$aes_key);
      &cmp    ($s1,2048+256);
      &jb     (&label("do_dcopy"));
      &cmp    ($s1,4096-244);
      &jb     (&label("skip_dcopy"));
      &align  (4);
  &set_label("do_dcopy");
      &mov    ($_key,"edi");          # use the on-stack copy from now on
      &data_word(0xA5F3F689);         # rep movsd
  &set_label("skip_dcopy");

      &mov    ($acc,$s0);
      &mov    ($key,18);              # 18 iterations x 128 bytes = 2048+256
      &align  (4);
  &set_label("prefetch_td");
          &mov    ($s0,&DWP(0,"ebp"));
          &mov    ($s1,&DWP(32,"ebp"));
          &mov    ($s2,&DWP(64,"ebp"));
          &mov    ($s3,&DWP(96,"ebp"));
          &lea    ("ebp",&DWP(128,"ebp"));
          &dec    ($key);
      &jnz    (&label("prefetch_td"));
      &sub    ("ebp",2048+256);       # rewind %ebp to AES_Td

      &cmp    ($acc,$_out);
      &je     (&label("dec_in_place"));       # in-place processing...

      &mov    ($key,$_ivp);           # load ivp
      &mov    ($_tmp,$key);           # $_tmp tracks the current chaining block

      &align  (4);
  &set_label("dec_loop");
          &mov    ($s0,&DWP(0,$acc));     # read input
          &mov    ($s1,&DWP(4,$acc));
          &mov    ($s2,&DWP(8,$acc));
          &mov    ($s3,&DWP(12,$acc));

          &mov    ($key,$_key);           # load key
          &call   ("_x86_AES_decrypt");

          &mov    ($key,$_tmp);           # load ivp
          &mov    ($acc,$_len);           # load len
          &xor    ($s0,&DWP(0,$key));     # xor iv
          &xor    ($s1,&DWP(4,$key));
          &xor    ($s2,&DWP(8,$key));
          &xor    ($s3,&DWP(12,$key));

          &sub    ($acc,16);
          &jc     (&label("dec_partial"));
          &mov    ($_len,$acc);           # save len
          &mov    ($acc,$_inp);           # load inp
          &mov    ($key,$_out);           # load out

          &mov    (&DWP(0,$key),$s0);     # write output
          &mov    (&DWP(4,$key),$s1);
          &mov    (&DWP(8,$key),$s2);
          &mov    (&DWP(12,$key),$s3);

          &mov    ($_tmp,$acc);           # save ivp
          &lea    ($acc,&DWP(16,$acc));
          &mov    ($_inp,$acc);           # save inp

          &lea    ($key,&DWP(16,$key));
          &mov    ($_out,$key);           # save out

      # only mov/lea since the "sub" above, so its zero flag is still valid
      &jnz    (&label("dec_loop"));
      &mov    ($key,$_tmp);           # load temp ivp
  &set_label("dec_end");
      &mov    ($acc,$_ivp);           # load user ivp
      &mov    ($s0,&DWP(0,$key));     # load iv
      &mov    ($s1,&DWP(4,$key));
      &mov    ($s2,&DWP(8,$key));
      &mov    ($s3,&DWP(12,$key));
      &mov    (&DWP(0,$acc),$s0);     # copy back to user
      &mov    (&DWP(4,$acc),$s1);
      &mov    (&DWP(8,$acc),$s2);
      &mov    (&DWP(12,$acc),$s3);
      &jmp    (&label("dec_out"));

      &align  (4);
  &set_label("dec_partial");
      # last block is short: stage the plaintext in $ivec and copy only the
      # remaining len bytes to out
      &lea    ($key,$ivec);
      &mov    (&DWP(0,$key),$s0);     # dump output to stack
      &mov    (&DWP(4,$key),$s1);
      &mov    (&DWP(8,$key),$s2);
      &mov    (&DWP(12,$key),$s3);
      &lea    ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
      &mov    ($acc eq "esi" ? $acc : "",$key);
      &mov    ($key eq "edi" ? $key : "",$_out);      # load out
      &data_word(0xA4F3F689); # rep movsb             # copy output
      &mov    ($key,$_inp);                           # use inp as temp ivp
      &jmp    (&label("dec_end"));

      &align  (4);
  &set_label("dec_in_place");
  &set_label("dec_in_place_loop");
          # inp == out: keep a copy of the ciphertext block in $ivec, because
          # the output store overwrites it and it is the next iv
          &lea    ($key,$ivec);
          &mov    ($s0,&DWP(0,$acc));     # read input
          &mov    ($s1,&DWP(4,$acc));
          &mov    ($s2,&DWP(8,$acc));
          &mov    ($s3,&DWP(12,$acc));

          &mov    (&DWP(0,$key),$s0);     # copy to temp
          &mov    (&DWP(4,$key),$s1);
          &mov    (&DWP(8,$key),$s2);
          &mov    (&DWP(12,$key),$s3);

          &mov    ($key,$_key);           # load key
          &call   ("_x86_AES_decrypt");

          &mov    ($key,$_ivp);           # load ivp
          &mov    ($acc,$_out);           # load out
          &xor    ($s0,&DWP(0,$key));     # xor iv
          &xor    ($s1,&DWP(4,$key));
          &xor    ($s2,&DWP(8,$key));
          &xor    ($s3,&DWP(12,$key));

          &mov    (&DWP(0,$acc),$s0);     # write output
          &mov    (&DWP(4,$acc),$s1);
          &mov    (&DWP(8,$acc),$s2);
          &mov    (&DWP(12,$acc),$s3);

          &lea    ($acc,&DWP(16,$acc));
          &mov    ($_out,$acc);           # save out

          &lea    ($acc,$ivec);
          &mov    ($s0,&DWP(0,$acc));     # read temp
          &mov    ($s1,&DWP(4,$acc));
          &mov    ($s2,&DWP(8,$acc));
          &mov    ($s3,&DWP(12,$acc));

          &mov    (&DWP(0,$key),$s0);     # copy iv
          &mov    (&DWP(4,$key),$s1);
          &mov    (&DWP(8,$key),$s2);
          &mov    (&DWP(12,$key),$s3);

          &mov    ($acc,$_inp);           # load inp

          &lea    ($acc,&DWP(16,$acc));
          &mov    ($_inp,$acc);           # save inp

          &mov    ($s2,$_len);            # load len
          &sub    ($s2,16);
          &jc     (&label("dec_in_place_partial"));
          &mov    ($_len,$s2);            # save len
      &jnz    (&label("dec_in_place_loop"));
      &jmp    (&label("dec_out"));

      &align  (4);
  &set_label("dec_in_place_partial");
      # one can argue if this is actually required...
      &mov    ($key eq "edi" ? $key : "",$_out);
      &lea    ($acc eq "esi" ? $acc : "",$ivec);
      &lea    ($key,&DWP(0,$key,$s2));
      &lea    ($acc,&DWP(16,$acc,$s2));
      &neg    ($s2 eq "ecx" ? $s2 : "");
      &data_word(0xA4F3F689); # rep movsb     # restore tail

      &align  (4);
  &set_label("dec_out");
      &cmp    ($mark,0);              # was the key schedule copied?
      &mov    ("edi",$_key);
      &je     (&label("skip_dzero"));
      # zero copy of key schedule
      &mov    ("ecx",240/4);
      &xor    ("eax","eax");
      &align  (4);
      &data_word(0xABF3F689);         # rep stosd
  &set_label("skip_dzero");
      &mov    ("esp",$_esp);
      &popf   ();
  &function_end("AES_cbc_encrypt");
  }
  1084. #------------------------------------------------------------------#
  # Emits the shared core of one key-expansion step. Each of the four bytes
  # of %edx indexes the table at %ebp (8-byte stride, displacement 0 or 2);
  # one byte of every looked-up dword is masked out and xor-ed into %eax.
  # Finally the round-constant dword at 2048(%ebp,%ecx,4) is xor-ed in.
  # Registers written by the emitted code: %eax, %ebx, %edx, %esi.
  sub enckey()
  {
      my ($rk,$word,$tab,$idx) = ("eax","edx","ebx","esi");

      &movz   ($idx,&LB($word));                  # rk[i]>>0
      &mov    ($tab,&DWP(2,"ebp",$idx,8));
      &movz   ($idx,&HB($word));                  # rk[i]>>8
      &and    ($tab,0xFF000000);
      &xor    ($rk,$tab);

      &mov    ($tab,&DWP(2,"ebp",$idx,8));
      &shr    ($word,16);                         # expose the upper two bytes
      &and    ($tab,0x000000FF);
      &movz   ($idx,&LB($word));                  # rk[i]>>16
      &xor    ($rk,$tab);

      &mov    ($tab,&DWP(0,"ebp",$idx,8));
      &movz   ($idx,&HB($word));                  # rk[i]>>24
      &and    ($tab,0x0000FF00);
      &xor    ($rk,$tab);

      &mov    ($tab,&DWP(0,"ebp",$idx,8));
      &and    ($tab,0x00FF0000);
      &xor    ($rk,$tab);

      &xor    ($rk,&DWP(2048,"ebp","ecx",4));     # rcon
  }
  # int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
  #                         AES_KEY *key)
  #
  # Emits the encryption key-schedule routine. Result in %eax:
  #    0  success
  #   -1  userKey or key is a null pointer
  #   -2  bits is not 128, 192 or 256
  # On success the round count (10, 12 or 14) is stored at offset 240 of
  # the key schedule.
  {
  my @gpr = ("eax","ebx","ecx","edx");

  # Emit loads of consecutive user-key dwords, starting at $off(%esi), into
  # @regs, followed by stores of those registers at the same offsets of %edi.
  my $copy_dwords = sub {
      my ($off,@regs) = @_;
      foreach my $n (0..$#regs) {
          &mov    ($regs[$n],&DWP($off+4*$n,"esi"));
      }
      foreach my $n (0..$#regs) {
          &mov    (&DWP($off+4*$n,"edi"),$regs[$n]);
      }
  };

  # Emit "store %eax at $dst(%edi)", then for every offset in @srcs:
  # "xor %eax with that dword of the schedule, store %eax in the next slot".
  my $xor_chain = sub {
      my ($dst,@srcs) = @_;
      &mov    (&DWP($dst,"edi"),"eax");
      foreach my $src (@srcs) {
          $dst += 4;
          &xor    ("eax",&DWP($src,"edi"));
          &mov    (&DWP($dst,"edi"),"eax");
      }
  };

  &public_label("AES_Te");
  &function_begin("AES_set_encrypt_key");
      &mov    ("esi",&wparam(0));             # user supplied key
      &mov    ("edi",&wparam(2));             # private key schedule

      &test   ("esi",-1);
      &jz     (&label("badpointer"));
      &test   ("edi",-1);
      &jz     (&label("badpointer"));

      # position-independent address of AES_Te in %ebp
      &call   (&label("pic_point"));
      &set_label("pic_point");
      &blindpop("ebp");
      &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

      &mov    ("ecx",&wparam(1));             # number of bits in key
      &cmp    ("ecx",128);
      &je     (&label("10rounds"));
      &cmp    ("ecx",192);
      &je     (&label("12rounds"));
      &cmp    ("ecx",256);
      &je     (&label("14rounds"));
      &mov    ("eax",-2);                     # invalid number of bits
      &jmp    (&label("exit"));

  &set_label("10rounds");
      $copy_dwords->(0,@gpr);                 # copy first 4 dwords

      &xor    ("ecx","ecx");                  # round-constant index
      # first pass enters at the shortcut: rk[0]/rk[3] are still in %eax/%edx
      &jmp    (&label("10shortcut"));

      &align  (4);
  &set_label("10loop");
          &mov    ("eax",&DWP(0,"edi"));          # rk[0]
          &mov    ("edx",&DWP(12,"edi"));         # rk[3]
  &set_label("10shortcut");
          &enckey ();

          $xor_chain->(16, 4,8,12);               # rk[4] .. rk[7]
          &inc    ("ecx");
          &add    ("edi",16);
          &cmp    ("ecx",10);
      &jl     (&label("10loop"));

      # %edi has advanced by 10*16 bytes, so 80(%edi) is offset 240
      &mov    (&DWP(80,"edi"),10);            # setup number of rounds
      &xor    ("eax","eax");
      &jmp    (&label("exit"));

  &set_label("12rounds");
      $copy_dwords->(0,@gpr);                 # copy first 6 dwords
      $copy_dwords->(16,"ecx","edx");

      &xor    ("ecx","ecx");                  # round-constant index
      &jmp    (&label("12shortcut"));

      &align  (4);
  &set_label("12loop");
          &mov    ("eax",&DWP(0,"edi"));          # rk[0]
          &mov    ("edx",&DWP(20,"edi"));         # rk[5]
  &set_label("12shortcut");
          &enckey ();

          $xor_chain->(24, 4,8,12);               # rk[6] .. rk[9]

          &cmp    ("ecx",7);
          &je     (&label("12break"));
          &inc    ("ecx");

          &xor    ("eax",&DWP(16,"edi"));
          &mov    (&DWP(40,"edi"),"eax");         # rk[10]
          &xor    ("eax",&DWP(20,"edi"));
          &mov    (&DWP(44,"edi"),"eax");         # rk[11]

          &add    ("edi",24);
      &jmp    (&label("12loop"));

  &set_label("12break");
      # %edi has advanced by 7*24 bytes, so 72(%edi) is offset 240
      &mov    (&DWP(72,"edi"),12);            # setup number of rounds
      &xor    ("eax","eax");
      &jmp    (&label("exit"));

  &set_label("14rounds");
      $copy_dwords->(0,@gpr);                 # copy first 8 dwords
      $copy_dwords->(16,@gpr);

      &xor    ("ecx","ecx");                  # round-constant index
      &jmp    (&label("14shortcut"));

      &align  (4);
  &set_label("14loop");
          &mov    ("edx",&DWP(28,"edi"));         # rk[7]
  &set_label("14shortcut");
          &mov    ("eax",&DWP(0,"edi"));          # rk[0]

          &enckey ();

          $xor_chain->(32, 4,8,12);               # rk[8] .. rk[11]

          &cmp    ("ecx",6);
          &je     (&label("14break"));
          &inc    ("ecx");

          # second half of the step: table bytes of rk[11] are xor-ed into
          # rk[4]; unlike enckey() there is no round constant here and each
          # byte stays in its own position
          &mov    ("edx","eax");
          &mov    ("eax",&DWP(16,"edi"));         # rk[4]
          &movz   ("esi",&LB("edx"));             # rk[11]>>0
          &mov    ("ebx",&DWP(2,"ebp","esi",8));
          &movz   ("esi",&HB("edx"));             # rk[11]>>8
          &and    ("ebx",0x000000FF);
          &xor    ("eax","ebx");

          &mov    ("ebx",&DWP(0,"ebp","esi",8));
          &shr    ("edx",16);
          &and    ("ebx",0x0000FF00);
          &movz   ("esi",&LB("edx"));             # rk[11]>>16
          &xor    ("eax","ebx");

          &mov    ("ebx",&DWP(0,"ebp","esi",8));
          &movz   ("esi",&HB("edx"));             # rk[11]>>24
          &and    ("ebx",0x00FF0000);
          &xor    ("eax","ebx");

          &mov    ("ebx",&DWP(2,"ebp","esi",8));
          &and    ("ebx",0xFF000000);
          &xor    ("eax","ebx");

          $xor_chain->(48, 20,24,28);             # rk[12] .. rk[15]

          &add    ("edi",32);
      &jmp    (&label("14loop"));

  &set_label("14break");
      # %edi has advanced by 6*32 bytes, so 48(%edi) is offset 240
      &mov    (&DWP(48,"edi"),14);            # setup number of rounds
      &xor    ("eax","eax");
      &jmp    (&label("exit"));

  &set_label("badpointer");
      &mov    ("eax",-1);
  &set_label("exit");
  &function_end("AES_set_encrypt_key");
  }
  # Emits the in-place transformation of one round-key dword for the
  # decryption schedule.
  #   $disp  displacement of the dword from $rkp
  #   $rkp   register pointing into the key schedule
  #   $te    register holding the base of the encryption table
  #   $td    register holding the base of the decryption table
  # Every byte of the dword is mapped through the byte at 2($te,byte,8);
  # the results index $td at displacements 0, 3, 2 and 1, and the four
  # dwords read there are xor-ed together and written back.
  # Registers written by the emitted code: %eax, %ebx, %edx.
  sub deckey()
  {
      my ($disp,$rkp,$te,$td) = @_;
      my ($lo,$hi,$tmp) = ("eax","edx","ebx");

      &mov    ($lo,&DWP($disp,$rkp));
      &mov    ($hi,$lo);
      &movz   ($tmp,&HB($lo));                    # byte 1
      &shr    ($hi,16);                           # bytes 2 and 3
      &and    ($lo,0xFF);                         # byte 0
      &movz   ($lo,&BP(2,$te,$lo,8));
      &movz   ($tmp,&BP(2,$te,$tmp,8));
      &mov    ($lo,&DWP(0,$td,$lo,8));
      &xor    ($lo,&DWP(3,$td,$tmp,8));
      &movz   ($tmp,&HB($hi));                    # byte 3
      &and    ($hi,0xFF);                         # byte 2
      &movz   ($hi,&BP(2,$te,$hi,8));
      &movz   ($tmp,&BP(2,$te,$tmp,8));
      &xor    ($lo,&DWP(2,$td,$hi,8));
      &xor    ($lo,&DWP(1,$td,$tmp,8));
      &mov    (&DWP($disp,$rkp),$lo);
  }
  # int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
  #                         AES_KEY *key)
  #
  # Emits the decryption key-schedule routine:
  #   1. call AES_set_encrypt_key with the same three arguments and return
  #      its result unchanged if it is non-zero,
  #   2. reverse the order of the 16-byte round keys in place,
  #   3. run deckey() over every dword of round keys 1 .. rounds-1,
  #   4. return 0.
  &public_label("AES_Td");
  &public_label("AES_Te");
  &function_begin_B("AES_set_decrypt_key");
      # forward the three arguments to AES_set_encrypt_key
      foreach my $n (0..2) {
          &mov    (("eax","ecx","edx")[$n],&wparam($n));
      }
      &sub    ("esp",12);
      foreach my $n (0..2) {
          &mov    (&DWP(4*$n,"esp"),("eax","ecx","edx")[$n]);
      }
      &call   ("AES_set_encrypt_key");
      &add    ("esp",12);
      &cmp    ("eax",0);
      &je     (&label("proceed"));
      &ret    ();                             # pass the error code through

  &set_label("proceed");
      &push   ("ebp");
      &push   ("ebx");
      &push   ("esi");
      &push   ("edi");

      &mov    ("esi",&wparam(2));
      &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
      &lea    ("ecx",&DWP(0,"","ecx",4));
      &lea    ("edi",&DWP(0,"esi","ecx",4));  # pointer to last chunk

      &align  (4);
  &set_label("invert");                       # invert order of chunks
          # swap the 16-byte chunks at %esi and %edi, eight bytes at a time
          foreach my $o (0,8) {
              &mov    ("eax",&DWP($o,"esi"));
              &mov    ("ebx",&DWP($o+4,"esi"));
              &mov    ("ecx",&DWP($o,"edi"));
              &mov    ("edx",&DWP($o+4,"edi"));
              &mov    (&DWP($o,"edi"),"eax");
              &mov    (&DWP($o+4,"edi"),"ebx");
              &mov    (&DWP($o,"esi"),"ecx");
              &mov    (&DWP($o+4,"esi"),"edx");
          }
          &add    ("esi",16);
          &sub    ("edi",16);
          &cmp    ("esi","edi");              # pointers meet at the middle chunk
      &jne    (&label("invert"));

      # position-independent table addresses: %edi = AES_Td, %ebp = AES_Te
      &call   (&label("pic_point"));
      &set_label("pic_point");
      &blindpop("ebp");
      &lea    ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
      &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

      &mov    ("esi",&wparam(2));
      &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
      &dec    ("ecx");                        # rounds-1 chunks get permuted
      &align  (4);
  &set_label("permute");                      # permute the key schedule
          &add    ("esi",16);                 # first chunk is left untouched
          foreach my $o (0,4,8,12) {
              &deckey ($o,"esi","ebp","edi");
          }
          &dec    ("ecx");
      &jnz    (&label("permute"));

      &xor    ("eax","eax");                  # return success
  &function_end("AES_set_decrypt_key");

  &asm_finish();