aes-586.pl 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  5. # project. Rights for redistribution and usage in source and binary
  6. # forms are granted according to the OpenSSL license.
  7. # ====================================================================
  8. #
  9. # Version 3.5.
  10. #
  11. # You might fail to appreciate this module performance from the first
  12. # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
  13. # to be *the* best Intel C compiler without -KPIC, performance appears
  14. # to be virtually identical... But try to re-configure with shared
  15. # library support... Aha! Intel compiler "suddenly" lags behind by 30%
  16. # [on P4, more on others]:-) And if compared to position-independent
  17. # code generated by GNU C, this code performs *more* than *twice* as
  18. # fast! Yes, all this buzz about PIC means that unlike other hand-
  19. # coded implementations, this one was explicitly designed to be safe
  20. # to use even in shared library context... This also means that this
  21. # code isn't necessarily absolutely fastest "ever," because in order
  22. # to achieve position independence an extra register has to be
  23. # off-loaded to stack, which affects the benchmark result.
  24. #
  25. # Special note about instruction choice. Do you recall RC4_INT code
  26. # performing poorly on P4? It might be the time to figure out why.
  27. # RC4_INT code implies effective address calculations in base+offset*4
  28. # form. Trouble is that it seems that offset scaling turned to be
  29. # critical path... At least eliminating scaling resulted in 2.8x RC4
  30. # performance improvement [as you might recall]. As AES code is hungry
  31. # for scaling too, I [try to] avoid the latter by favoring off-by-2
  32. # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
  33. #
  34. # As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
  35. # void. Performance improvement with off-by-2 shifts was observed on
  36. # intermediate implementation, which was spilling yet another register
  37. # to stack... Final offset*4 code below runs just a tad faster on P4,
  38. # but exhibits up to 10% improvement on other cores.
  39. #
  40. # Second version is "monolithic" replacement for aes_core.c, which in
  41. # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
  42. # This made it possible to implement little-endian variant of the
  43. # algorithm without modifying the base C code. Motivating factor for
  44. # the undertaken effort was that it appeared that in tight IA-32
  45. # register window little-endian flavor could achieve slightly higher
  46. # Instruction Level Parallelism, and it indeed resulted in up to 15%
  47. # better performance on most recent µ-archs...
  48. #
  49. # Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
# observed on P4 core, where "overall" improvement coefficient, i.e. if
  52. # compared to PIC generated by GCC and in CBC mode, was observed to be
  53. # as large as 4x:-) CBC performance is virtually identical to ECB now
  54. # and on some platforms even better, e.g. 17.6 "small" cycles/byte on
  55. # Opteron, because certain function prologues and epilogues are
  56. # effectively taken out of the loop...
  57. #
  58. # Version 3.2 implements compressed tables and prefetch of these tables
  59. # in CBC[!] mode. Former means that 3/4 of table references are now
  60. # misaligned, which unfortunately has negative impact on elder IA-32
  61. # implementations, Pentium suffered 30% penalty, PIII - 10%.
  62. #
  63. # Version 3.3 avoids L1 cache aliasing between stack frame and
  64. # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
  65. # latter is achieved by copying the key schedule to controlled place in
  66. # stack. This unfortunately has rather strong impact on small block CBC
  67. # performance, ~2x deterioration on 16-byte block if compared to 3.3.
  68. #
  69. # Version 3.5 checks if there is L1 cache aliasing between user-supplied
# key schedule and S-boxes and abstains from copying the former if
# there is none. This allows end-user to consciously retain small block
  72. # performance by aligning key schedule in specific manner.
  73. #
  74. # Current ECB performance numbers for 128-bit key in CPU cycles per
  75. # processed byte [measure commonly used by AES benchmarkers] are:
  76. #
  77. # small footprint fully unrolled
  78. # P4 24 22
  79. # AMD K8 20 19
  80. # PIII 25 23
  81. # Pentium 81 78
  82. push(@INC,"perlasm","../../perlasm");
  83. require "x86asm.pl";
  84. &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");
# Register assignment used by every code-emitting helper in this file:
# $s0..$s3 name the registers carrying the four 32-bit state words,
# $key the register pointing at the current round key and $acc a
# scratch/accumulator register.
$s0="eax";
$s1="ebx";
$s2="ecx";
$s3="edx";
$key="edi";
$acc="esi";
$compromise=0; # $compromise=128 abstains from copying key
# schedule to stack when encrypting inputs
# shorter than 128 bytes at the cost of
# risking aliasing with S-boxes. In return
# you get way better, up to +70%, small block
# performance.
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
$vertical_spin=0; # shift "vertically" defaults to 0, because of
# its proof-of-concept status...
# Note that there is no decvert(), as well as last encryption round is
# performed with "horizontal" shifts. This is because this "vertical"
# implementation [one which groups shifts on a given $s[i] to form a
# "column," unlike "horizontal" one, which groups shifts on different
# $s[i] to form a "row"] is work in progress. It was observed to run
# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
# whole 12% slower:-( So we face a trade-off... Shall it be resolved
# some day? Till then the code is considered experimental and by
# default remains dormant...
  113. sub encvert()
  114. { my ($te,@s) = @_;
  115. my $v0 = $acc, $v1 = $key;
  116. &mov ($v0,$s[3]); # copy s3
  117. &mov (&DWP(4,"esp"),$s[2]); # save s2
  118. &mov ($v1,$s[0]); # copy s0
  119. &mov (&DWP(8,"esp"),$s[1]); # save s1
  120. &movz ($s[2],&HB($s[0]));
  121. &and ($s[0],0xFF);
  122. &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
  123. &shr ($v1,16);
  124. &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
  125. &movz ($s[1],&HB($v1));
  126. &and ($v1,0xFF);
  127. &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
  128. &mov ($v1,$v0);
  129. &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
  130. &and ($v0,0xFF);
  131. &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
  132. &movz ($v0,&HB($v1));
  133. &shr ($v1,16);
  134. &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
  135. &movz ($v0,&HB($v1));
  136. &and ($v1,0xFF);
  137. &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
  138. &mov ($v1,&DWP(4,"esp")); # restore s2
  139. &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
  140. &mov ($v0,$v1);
  141. &and ($v1,0xFF);
  142. &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
  143. &movz ($v1,&HB($v0));
  144. &shr ($v0,16);
  145. &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
  146. &movz ($v1,&HB($v0));
  147. &and ($v0,0xFF);
  148. &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
  149. &mov ($v0,&DWP(8,"esp")); # restore s1
  150. &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
  151. &mov ($v1,$v0);
  152. &and ($v0,0xFF);
  153. &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
  154. &movz ($v0,&HB($v1));
  155. &shr ($v1,16);
  156. &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
  157. &movz ($v0,&HB($v1));
  158. &and ($v1,0xFF);
  159. &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
  160. &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key
  161. &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
  162. }
  163. sub encstep()
  164. { my ($i,$te,@s) = @_;
  165. my $tmp = $key;
  166. my $out = $i==3?$s[0]:$acc;
  167. # lines marked with #%e?x[i] denote "reordered" instructions...
  168. if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx
  169. else { &mov ($out,$s[0]);
  170. &and ($out,0xFF); }
  171. if ($i==1) { &shr ($s[0],16); }#%ebx[1]
  172. if ($i==2) { &shr ($s[0],24); }#%ecx[2]
  173. &mov ($out,&DWP(0,$te,$out,8));
  174. if ($i==3) { $tmp=$s[1]; }##%eax
  175. &movz ($tmp,&HB($s[1]));
  176. &xor ($out,&DWP(3,$te,$tmp,8));
  177. if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
  178. else { &mov ($tmp,$s[2]);
  179. &shr ($tmp,16); }
  180. if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
  181. &and ($tmp,0xFF);
  182. &xor ($out,&DWP(2,$te,$tmp,8));
  183. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
  184. elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
  185. else { &mov ($tmp,$s[3]);
  186. &shr ($tmp,24) }
  187. &xor ($out,&DWP(1,$te,$tmp,8));
  188. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  189. if ($i==3) { &mov ($s[3],$acc); }
  190. &comment();
  191. }
  192. sub enclast()
  193. { my ($i,$te,@s)=@_;
  194. my $tmp = $key;
  195. my $out = $i==3?$s[0]:$acc;
  196. if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx
  197. else { &mov ($out,$s[0]); }
  198. &and ($out,0xFF);
  199. if ($i==1) { &shr ($s[0],16); }#%ebx[1]
  200. if ($i==2) { &shr ($s[0],24); }#%ecx[2]
  201. &mov ($out,&DWP(2,$te,$out,8));
  202. &and ($out,0x000000ff);
  203. if ($i==3) { $tmp=$s[1]; }##%eax
  204. &movz ($tmp,&HB($s[1]));
  205. &mov ($tmp,&DWP(0,$te,$tmp,8));
  206. &and ($tmp,0x0000ff00);
  207. &xor ($out,$tmp);
  208. if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
  209. else { mov ($tmp,$s[2]);
  210. &shr ($tmp,16); }
  211. if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
  212. &and ($tmp,0xFF);
  213. &mov ($tmp,&DWP(0,$te,$tmp,8));
  214. &and ($tmp,0x00ff0000);
  215. &xor ($out,$tmp);
  216. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
  217. elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
  218. else { &mov ($tmp,$s[3]);
  219. &shr ($tmp,24); }
  220. &mov ($tmp,&DWP(2,$te,$tmp,8));
  221. &and ($tmp,0xff000000);
  222. &xor ($out,$tmp);
  223. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  224. if ($i==3) { &mov ($s[3],$acc); }
  225. }
  226. sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
  227. &public_label("AES_Te");
  228. &function_begin_B("_x86_AES_encrypt");
  229. if ($vertical_spin) {
  230. # I need high parts of volatile registers to be accessible...
  231. &exch ($s1="edi",$key="ebx");
  232. &mov ($s2="esi",$acc="ecx");
  233. }
  234. # note that caller is expected to allocate stack frame for me!
  235. &mov (&DWP(12,"esp"),$key); # save key
  236. &xor ($s0,&DWP(0,$key)); # xor with key
  237. &xor ($s1,&DWP(4,$key));
  238. &xor ($s2,&DWP(8,$key));
  239. &xor ($s3,&DWP(12,$key));
  240. &mov ($acc,&DWP(240,$key)); # load key->rounds
  241. if ($small_footprint) {
  242. &lea ($acc,&DWP(-2,$acc,$acc));
  243. &lea ($acc,&DWP(0,$key,$acc,8));
  244. &mov (&DWP(16,"esp"),$acc); # end of key schedule
  245. &align (4);
  246. &set_label("loop");
  247. if ($vertical_spin) {
  248. &encvert("ebp",$s0,$s1,$s2,$s3);
  249. } else {
  250. &encstep(0,"ebp",$s0,$s1,$s2,$s3);
  251. &encstep(1,"ebp",$s1,$s2,$s3,$s0);
  252. &encstep(2,"ebp",$s2,$s3,$s0,$s1);
  253. &encstep(3,"ebp",$s3,$s0,$s1,$s2);
  254. }
  255. &add ($key,16); # advance rd_key
  256. &xor ($s0,&DWP(0,$key));
  257. &xor ($s1,&DWP(4,$key));
  258. &xor ($s2,&DWP(8,$key));
  259. &xor ($s3,&DWP(12,$key));
  260. &cmp ($key,&DWP(16,"esp"));
  261. &mov (&DWP(12,"esp"),$key);
  262. &jb (&label("loop"));
  263. }
  264. else {
  265. &cmp ($acc,10);
  266. &jle (&label("10rounds"));
  267. &cmp ($acc,12);
  268. &jle (&label("12rounds"));
  269. &set_label("14rounds");
  270. for ($i=1;$i<3;$i++) {
  271. if ($vertical_spin) {
  272. &encvert("ebp",$s0,$s1,$s2,$s3);
  273. } else {
  274. &encstep(0,"ebp",$s0,$s1,$s2,$s3);
  275. &encstep(1,"ebp",$s1,$s2,$s3,$s0);
  276. &encstep(2,"ebp",$s2,$s3,$s0,$s1);
  277. &encstep(3,"ebp",$s3,$s0,$s1,$s2);
  278. }
  279. &xor ($s0,&DWP(16*$i+0,$key));
  280. &xor ($s1,&DWP(16*$i+4,$key));
  281. &xor ($s2,&DWP(16*$i+8,$key));
  282. &xor ($s3,&DWP(16*$i+12,$key));
  283. }
  284. &add ($key,32);
  285. &mov (&DWP(12,"esp"),$key); # advance rd_key
  286. &set_label("12rounds");
  287. for ($i=1;$i<3;$i++) {
  288. if ($vertical_spin) {
  289. &encvert("ebp",$s0,$s1,$s2,$s3);
  290. } else {
  291. &encstep(0,"ebp",$s0,$s1,$s2,$s3);
  292. &encstep(1,"ebp",$s1,$s2,$s3,$s0);
  293. &encstep(2,"ebp",$s2,$s3,$s0,$s1);
  294. &encstep(3,"ebp",$s3,$s0,$s1,$s2);
  295. }
  296. &xor ($s0,&DWP(16*$i+0,$key));
  297. &xor ($s1,&DWP(16*$i+4,$key));
  298. &xor ($s2,&DWP(16*$i+8,$key));
  299. &xor ($s3,&DWP(16*$i+12,$key));
  300. }
  301. &add ($key,32);
  302. &mov (&DWP(12,"esp"),$key); # advance rd_key
  303. &set_label("10rounds");
  304. for ($i=1;$i<10;$i++) {
  305. if ($vertical_spin) {
  306. &encvert("ebp",$s0,$s1,$s2,$s3);
  307. } else {
  308. &encstep(0,"ebp",$s0,$s1,$s2,$s3);
  309. &encstep(1,"ebp",$s1,$s2,$s3,$s0);
  310. &encstep(2,"ebp",$s2,$s3,$s0,$s1);
  311. &encstep(3,"ebp",$s3,$s0,$s1,$s2);
  312. }
  313. &xor ($s0,&DWP(16*$i+0,$key));
  314. &xor ($s1,&DWP(16*$i+4,$key));
  315. &xor ($s2,&DWP(16*$i+8,$key));
  316. &xor ($s3,&DWP(16*$i+12,$key));
  317. }
  318. }
  319. if ($vertical_spin) {
  320. # "reincarnate" some registers for "horizontal" spin...
  321. &mov ($s1="ebx",$key="edi");
  322. &mov ($s2="ecx",$acc="esi");
  323. }
  324. &enclast(0,"ebp",$s0,$s1,$s2,$s3);
  325. &enclast(1,"ebp",$s1,$s2,$s3,$s0);
  326. &enclast(2,"ebp",$s2,$s3,$s0,$s1);
  327. &enclast(3,"ebp",$s3,$s0,$s1,$s2);
  328. &add ($key,$small_footprint?16:160);
  329. &xor ($s0,&DWP(0,$key));
  330. &xor ($s1,&DWP(4,$key));
  331. &xor ($s2,&DWP(8,$key));
  332. &xor ($s3,&DWP(12,$key));
  333. &ret ();
# Encryption lookup table. The 64 &_data_word() calls below supply 256
# 32-bit words, and &_data_word() emits each of them twice, so one
# table entry occupies 8 bytes. The round code accordingly indexes the
# table with a scale of 8 and byte displacements 0..3.
# NOTE(review): the second argument of &set_label() is presumably the
# requested alignment -- confirm against x86asm.pl.
&set_label("AES_Te",64); # Yes! I keep it in the code segment!
&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
#rcon:
# Ten non-zero words followed by six zero words. They are handed to
# &data_word() directly, i.e. without the doubling that &_data_word()
# applies to the table above.
&data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
&data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
&data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0);
&function_end_B("_x86_AES_encrypt");
  404. # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
  405. &public_label("AES_Te");
  406. &function_begin("AES_encrypt");
  407. &mov ($acc,&wparam(0)); # load inp
  408. &mov ($key,&wparam(2)); # load key
  409. &mov ($s0,"esp");
  410. &sub ("esp",24);
  411. &and ("esp",-64);
  412. &add ("esp",4);
  413. &mov (&DWP(16,"esp"),$s0);
  414. &call (&label("pic_point")); # make it PIC!
  415. &set_label("pic_point");
  416. &blindpop("ebp");
  417. &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));
  418. &mov ($s0,&DWP(0,$acc)); # load input data
  419. &mov ($s1,&DWP(4,$acc));
  420. &mov ($s2,&DWP(8,$acc));
  421. &mov ($s3,&DWP(12,$acc));
  422. &call ("_x86_AES_encrypt");
  423. &mov ("esp",&DWP(16,"esp"));
  424. &mov ($acc,&wparam(1)); # load out
  425. &mov (&DWP(0,$acc),$s0); # write output data
  426. &mov (&DWP(4,$acc),$s1);
  427. &mov (&DWP(8,$acc),$s2);
  428. &mov (&DWP(12,$acc),$s3);
  429. &function_end("AES_encrypt");
  430. #------------------------------------------------------------------#
  431. sub decstep()
  432. { my ($i,$td,@s) = @_;
  433. my $tmp = $key;
  434. my $out = $i==3?$s[0]:$acc;
  435. # no instructions are reordered, as performance appears
  436. # optimal... or rather that all attempts to reorder didn't
  437. # result in better performance [which by the way is not a
  438. # bit lower than ecryption].
  439. if($i==3) { &mov ($key,&DWP(12,"esp")); }
  440. else { &mov ($out,$s[0]); }
  441. &and ($out,0xFF);
  442. &mov ($out,&DWP(0,$td,$out,8));
  443. if ($i==3) { $tmp=$s[1]; }
  444. &movz ($tmp,&HB($s[1]));
  445. &xor ($out,&DWP(3,$td,$tmp,8));
  446. if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
  447. else { &mov ($tmp,$s[2]); }
  448. &shr ($tmp,16);
  449. &and ($tmp,0xFF);
  450. &xor ($out,&DWP(2,$td,$tmp,8));
  451. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
  452. else { &mov ($tmp,$s[3]); }
  453. &shr ($tmp,24);
  454. &xor ($out,&DWP(1,$td,$tmp,8));
  455. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  456. if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
  457. &comment();
  458. }
  459. sub declast()
  460. { my ($i,$td,@s)=@_;
  461. my $tmp = $key;
  462. my $out = $i==3?$s[0]:$acc;
  463. if($i==3) { &mov ($key,&DWP(12,"esp")); }
  464. else { &mov ($out,$s[0]); }
  465. &and ($out,0xFF);
  466. &mov ($out,&DWP(2048,$td,$out,4));
  467. &and ($out,0x000000ff);
  468. if ($i==3) { $tmp=$s[1]; }
  469. &movz ($tmp,&HB($s[1]));
  470. &mov ($tmp,&DWP(2048,$td,$tmp,4));
  471. &and ($tmp,0x0000ff00);
  472. &xor ($out,$tmp);
  473. if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
  474. else { mov ($tmp,$s[2]); }
  475. &shr ($tmp,16);
  476. &and ($tmp,0xFF);
  477. &mov ($tmp,&DWP(2048,$td,$tmp,4));
  478. &and ($tmp,0x00ff0000);
  479. &xor ($out,$tmp);
  480. if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
  481. else { &mov ($tmp,$s[3]); }
  482. &shr ($tmp,24);
  483. &mov ($tmp,&DWP(2048,$td,$tmp,4));
  484. &and ($tmp,0xff000000);
  485. &xor ($out,$tmp);
  486. if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
  487. if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
  488. }
  489. &public_label("AES_Td");
  490. &function_begin_B("_x86_AES_decrypt");
  491. # note that caller is expected to allocate stack frame for me!
  492. &mov (&DWP(12,"esp"),$key); # save key
  493. &xor ($s0,&DWP(0,$key)); # xor with key
  494. &xor ($s1,&DWP(4,$key));
  495. &xor ($s2,&DWP(8,$key));
  496. &xor ($s3,&DWP(12,$key));
  497. &mov ($acc,&DWP(240,$key)); # load key->rounds
  498. if ($small_footprint) {
  499. &lea ($acc,&DWP(-2,$acc,$acc));
  500. &lea ($acc,&DWP(0,$key,$acc,8));
  501. &mov (&DWP(16,"esp"),$acc); # end of key schedule
  502. &align (4);
  503. &set_label("loop");
  504. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  505. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  506. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  507. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  508. &add ($key,16); # advance rd_key
  509. &xor ($s0,&DWP(0,$key));
  510. &xor ($s1,&DWP(4,$key));
  511. &xor ($s2,&DWP(8,$key));
  512. &xor ($s3,&DWP(12,$key));
  513. &cmp ($key,&DWP(16,"esp"));
  514. &mov (&DWP(12,"esp"),$key);
  515. &jb (&label("loop"));
  516. }
  517. else {
  518. &cmp ($acc,10);
  519. &jle (&label("10rounds"));
  520. &cmp ($acc,12);
  521. &jle (&label("12rounds"));
  522. &set_label("14rounds");
  523. for ($i=1;$i<3;$i++) {
  524. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  525. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  526. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  527. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  528. &xor ($s0,&DWP(16*$i+0,$key));
  529. &xor ($s1,&DWP(16*$i+4,$key));
  530. &xor ($s2,&DWP(16*$i+8,$key));
  531. &xor ($s3,&DWP(16*$i+12,$key));
  532. }
  533. &add ($key,32);
  534. &mov (&DWP(12,"esp"),$key); # advance rd_key
  535. &set_label("12rounds");
  536. for ($i=1;$i<3;$i++) {
  537. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  538. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  539. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  540. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  541. &xor ($s0,&DWP(16*$i+0,$key));
  542. &xor ($s1,&DWP(16*$i+4,$key));
  543. &xor ($s2,&DWP(16*$i+8,$key));
  544. &xor ($s3,&DWP(16*$i+12,$key));
  545. }
  546. &add ($key,32);
  547. &mov (&DWP(12,"esp"),$key); # advance rd_key
  548. &set_label("10rounds");
  549. for ($i=1;$i<10;$i++) {
  550. &decstep(0,"ebp",$s0,$s3,$s2,$s1);
  551. &decstep(1,"ebp",$s1,$s0,$s3,$s2);
  552. &decstep(2,"ebp",$s2,$s1,$s0,$s3);
  553. &decstep(3,"ebp",$s3,$s2,$s1,$s0);
  554. &xor ($s0,&DWP(16*$i+0,$key));
  555. &xor ($s1,&DWP(16*$i+4,$key));
  556. &xor ($s2,&DWP(16*$i+8,$key));
  557. &xor ($s3,&DWP(16*$i+12,$key));
  558. }
  559. }
  560. &declast(0,"ebp",$s0,$s3,$s2,$s1);
  561. &declast(1,"ebp",$s1,$s0,$s3,$s2);
  562. &declast(2,"ebp",$s2,$s1,$s0,$s3);
  563. &declast(3,"ebp",$s3,$s2,$s1,$s0);
  564. &add ($key,$small_footprint?16:160);
  565. &xor ($s0,&DWP(0,$key));
  566. &xor ($s1,&DWP(4,$key));
  567. &xor ($s2,&DWP(8,$key));
  568. &xor ($s3,&DWP(12,$key));
  569. &ret ();
  570. &set_label("AES_Td",64); # Yes! I keep it in the code segment!
  571. &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
  572. &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
  573. &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
  574. &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
  575. &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
  576. &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
  577. &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
  578. &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
  579. &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
  580. &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
  581. &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
  582. &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
  583. &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
  584. &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
  585. &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
  586. &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
  587. &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
  588. &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
  589. &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
  590. &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
  591. &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
  592. &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
  593. &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
  594. &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
  595. &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
  596. &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
  597. &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
  598. &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
  599. &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
  600. &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
  601. &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
  602. &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
  603. &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
  604. &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
  605. &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
  606. &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
  607. &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
  608. &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
  609. &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
  610. &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
  611. &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
  612. &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
  613. &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
  614. &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
  615. &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
  616. &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
  617. &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
  618. &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
  619. &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
  620. &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
  621. &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
  622. &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
  623. &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
  624. &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
  625. &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
  626. &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
  627. &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
  628. &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
  629. &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
  630. &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
  631. &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
  632. &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
  633. &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
  634. &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
  635. #Td4:
  636. &data_word(0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5);
  637. &data_word(0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838);
  638. &data_word(0xbfbfbfbf, 0x40404040, 0xa3a3a3a3, 0x9e9e9e9e);
  639. &data_word(0x81818181, 0xf3f3f3f3, 0xd7d7d7d7, 0xfbfbfbfb);
  640. &data_word(0x7c7c7c7c, 0xe3e3e3e3, 0x39393939, 0x82828282);
  641. &data_word(0x9b9b9b9b, 0x2f2f2f2f, 0xffffffff, 0x87878787);
  642. &data_word(0x34343434, 0x8e8e8e8e, 0x43434343, 0x44444444);
  643. &data_word(0xc4c4c4c4, 0xdededede, 0xe9e9e9e9, 0xcbcbcbcb);
  644. &data_word(0x54545454, 0x7b7b7b7b, 0x94949494, 0x32323232);
  645. &data_word(0xa6a6a6a6, 0xc2c2c2c2, 0x23232323, 0x3d3d3d3d);
  646. &data_word(0xeeeeeeee, 0x4c4c4c4c, 0x95959595, 0x0b0b0b0b);
  647. &data_word(0x42424242, 0xfafafafa, 0xc3c3c3c3, 0x4e4e4e4e);
  648. &data_word(0x08080808, 0x2e2e2e2e, 0xa1a1a1a1, 0x66666666);
  649. &data_word(0x28282828, 0xd9d9d9d9, 0x24242424, 0xb2b2b2b2);
  650. &data_word(0x76767676, 0x5b5b5b5b, 0xa2a2a2a2, 0x49494949);
  651. &data_word(0x6d6d6d6d, 0x8b8b8b8b, 0xd1d1d1d1, 0x25252525);
  652. &data_word(0x72727272, 0xf8f8f8f8, 0xf6f6f6f6, 0x64646464);
  653. &data_word(0x86868686, 0x68686868, 0x98989898, 0x16161616);
  654. &data_word(0xd4d4d4d4, 0xa4a4a4a4, 0x5c5c5c5c, 0xcccccccc);
  655. &data_word(0x5d5d5d5d, 0x65656565, 0xb6b6b6b6, 0x92929292);
  656. &data_word(0x6c6c6c6c, 0x70707070, 0x48484848, 0x50505050);
  657. &data_word(0xfdfdfdfd, 0xedededed, 0xb9b9b9b9, 0xdadadada);
  658. &data_word(0x5e5e5e5e, 0x15151515, 0x46464646, 0x57575757);
  659. &data_word(0xa7a7a7a7, 0x8d8d8d8d, 0x9d9d9d9d, 0x84848484);
  660. &data_word(0x90909090, 0xd8d8d8d8, 0xabababab, 0x00000000);
  661. &data_word(0x8c8c8c8c, 0xbcbcbcbc, 0xd3d3d3d3, 0x0a0a0a0a);
  662. &data_word(0xf7f7f7f7, 0xe4e4e4e4, 0x58585858, 0x05050505);
  663. &data_word(0xb8b8b8b8, 0xb3b3b3b3, 0x45454545, 0x06060606);
  664. &data_word(0xd0d0d0d0, 0x2c2c2c2c, 0x1e1e1e1e, 0x8f8f8f8f);
  665. &data_word(0xcacacaca, 0x3f3f3f3f, 0x0f0f0f0f, 0x02020202);
  666. &data_word(0xc1c1c1c1, 0xafafafaf, 0xbdbdbdbd, 0x03030303);
  667. &data_word(0x01010101, 0x13131313, 0x8a8a8a8a, 0x6b6b6b6b);
  668. &data_word(0x3a3a3a3a, 0x91919191, 0x11111111, 0x41414141);
  669. &data_word(0x4f4f4f4f, 0x67676767, 0xdcdcdcdc, 0xeaeaeaea);
  670. &data_word(0x97979797, 0xf2f2f2f2, 0xcfcfcfcf, 0xcececece);
  671. &data_word(0xf0f0f0f0, 0xb4b4b4b4, 0xe6e6e6e6, 0x73737373);
  672. &data_word(0x96969696, 0xacacacac, 0x74747474, 0x22222222);
  673. &data_word(0xe7e7e7e7, 0xadadadad, 0x35353535, 0x85858585);
  674. &data_word(0xe2e2e2e2, 0xf9f9f9f9, 0x37373737, 0xe8e8e8e8);
  675. &data_word(0x1c1c1c1c, 0x75757575, 0xdfdfdfdf, 0x6e6e6e6e);
  676. &data_word(0x47474747, 0xf1f1f1f1, 0x1a1a1a1a, 0x71717171);
  677. &data_word(0x1d1d1d1d, 0x29292929, 0xc5c5c5c5, 0x89898989);
  678. &data_word(0x6f6f6f6f, 0xb7b7b7b7, 0x62626262, 0x0e0e0e0e);
  679. &data_word(0xaaaaaaaa, 0x18181818, 0xbebebebe, 0x1b1b1b1b);
  680. &data_word(0xfcfcfcfc, 0x56565656, 0x3e3e3e3e, 0x4b4b4b4b);
  681. &data_word(0xc6c6c6c6, 0xd2d2d2d2, 0x79797979, 0x20202020);
  682. &data_word(0x9a9a9a9a, 0xdbdbdbdb, 0xc0c0c0c0, 0xfefefefe);
  683. &data_word(0x78787878, 0xcdcdcdcd, 0x5a5a5a5a, 0xf4f4f4f4);
  684. &data_word(0x1f1f1f1f, 0xdddddddd, 0xa8a8a8a8, 0x33333333);
  685. &data_word(0x88888888, 0x07070707, 0xc7c7c7c7, 0x31313131);
  686. &data_word(0xb1b1b1b1, 0x12121212, 0x10101010, 0x59595959);
  687. &data_word(0x27272727, 0x80808080, 0xecececec, 0x5f5f5f5f);
  688. &data_word(0x60606060, 0x51515151, 0x7f7f7f7f, 0xa9a9a9a9);
  689. &data_word(0x19191919, 0xb5b5b5b5, 0x4a4a4a4a, 0x0d0d0d0d);
  690. &data_word(0x2d2d2d2d, 0xe5e5e5e5, 0x7a7a7a7a, 0x9f9f9f9f);
  691. &data_word(0x93939393, 0xc9c9c9c9, 0x9c9c9c9c, 0xefefefef);
  692. &data_word(0xa0a0a0a0, 0xe0e0e0e0, 0x3b3b3b3b, 0x4d4d4d4d);
  693. &data_word(0xaeaeaeae, 0x2a2a2a2a, 0xf5f5f5f5, 0xb0b0b0b0);
  694. &data_word(0xc8c8c8c8, 0xebebebeb, 0xbbbbbbbb, 0x3c3c3c3c);
  695. &data_word(0x83838383, 0x53535353, 0x99999999, 0x61616161);
  696. &data_word(0x17171717, 0x2b2b2b2b, 0x04040404, 0x7e7e7e7e);
  697. &data_word(0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626);
  698. &data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
  699. &data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
  700. &function_end_B("_x86_AES_decrypt");
  # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
  #
  # Public single-block decryption entry point.  Loads the four state words
  # from inp, runs _x86_AES_decrypt on them with the caller's key schedule,
  # and stores the four resulting words to out.  The call is made on a
  # private, realigned stack frame; the caller's %esp is kept in the frame
  # at 16(%esp) and restored right after the call.
  &public_label("AES_Td");
  &function_begin("AES_decrypt");
  {
      my @state = ($s0, $s1, $s2, $s3);       # the four 32-bit state words

      &mov    ($acc,&wparam(0));              # inp
      &mov    ($key,&wparam(2));              # key schedule

      # Switch to a scratch frame: drop 24 bytes, round %esp down to a
      # 64-byte boundary, then step up by 4; old %esp goes to 16(%esp).
      &mov    ($s0,"esp");
      &sub    ("esp",24);
      &and    ("esp",-64);
      &add    ("esp",4);
      &mov    (&DWP(16,"esp"),$s0);

      # Position-independent computation of the AES_Td address into %ebp.
      &call   (&label("pic_point"));
      &set_label("pic_point");
      &blindpop("ebp");
      &lea    ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

      # state = inp[0..15]
      for my $i (0 .. $#state) {
          &mov    ($state[$i],&DWP(4*$i,$acc));
      }

      &call   ("_x86_AES_decrypt");

      &mov    ("esp",&DWP(16,"esp"));         # back onto the caller's stack
      &mov    ($acc,&wparam(1));              # out

      # out[0..15] = state
      for my $i (0 .. $#state) {
          &mov    (&DWP(4*$i,$acc),$state[$i]);
      }
  }
  &function_end("AES_decrypt");
  # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
  #                       size_t length, const AES_KEY *key,
  #                       unsigned char *ivp,const int enc);
  #
  # CBC driver around _x86_AES_encrypt/_x86_AES_decrypt.
  #
  #  * length == 0 returns immediately (before any flags/stack changes).
  #  * enc != 0 selects the encrypt path, enc == 0 jumps to DECRYPT.
  #  * Both paths build a private 64-byte aligned frame (see the $_xxx
  #    operands below), nudge it so that it does not alias the lookup
  #    table modulo 4096, optionally copy the 244-byte key schedule into
  #    the frame, and touch the whole table before entering the loop.
  #  * On exit the stack copy of the key schedule (if one was made) is
  #    wiped, EFLAGS are restored with popf and the caller's %esp is
  #    reinstated from the frame.
  #
  # IMPORTANT: every $_xxx operand below is a fixed "%esp + constant"
  # memory reference computed once, here, at generation time.  They are
  # only valid while %esp sits at the frame base, i.e. never between a
  # push and its matching pop.
  {
  # stack frame layout
  # (left column: offsets at the call site; right column: the same slots
  #  once the call has pushed the return address)
  #       -4(%esp)        0(%esp)         return address
  #       0(%esp)         4(%esp)         tmp1
  #       4(%esp)         8(%esp)         tmp2
  #       8(%esp)         12(%esp)        key
  #       12(%esp)        16(%esp)        end of key schedule
  my $_esp=&DWP(16,"esp");        #saved %esp
  my $_inp=&DWP(20,"esp");        #copy of wparam(0)
  my $_out=&DWP(24,"esp");        #copy of wparam(1)
  my $_len=&DWP(28,"esp");        #copy of wparam(2)
  my $_key=&DWP(32,"esp");        #copy of wparam(3)
  my $_ivp=&DWP(36,"esp");        #copy of wparam(4)
  my $_tmp=&DWP(40,"esp");        #volatile variable
  my $ivec=&DWP(44,"esp");        #ivec[16]
  my $aes_key=&DWP(60,"esp");     #copy of aes_key
  my $mark=&DWP(60+240,"esp");    #copy of aes_key->rounds

  &public_label("AES_Te");
  &public_label("AES_Td");
  &function_begin("AES_cbc_encrypt");
      &mov    ($s2 eq "ecx"? $s2 : "",&wparam(2));    # load len
      &cmp    ($s2,0);
      &je     (&label("enc_out"));                    # nothing to do

      &call   (&label("pic_point"));                  # make it PIC!
      &set_label("pic_point");
      &blindpop("ebp");

      &pushf  ();                                     # matched by popf on exit
      &cld    ();                                     # string ops go forward

      &cmp    (&wparam(5),0);
      &je     (&label("DECRYPT"));

      &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

      # allocate aligned stack frame...
      &lea    ($key,&DWP(-64-244,"esp"));
      &and    ($key,-64);

      # ... and make sure it doesn't alias with AES_Te modulo 4096
      &mov    ($s0,"ebp");
      &lea    ($s1,&DWP(2048,"ebp"));
      &mov    ($s3,$key);
      &and    ($s0,0xfff);            # s = %ebp&0xfff
      &and    ($s1,0xfff);            # e = (%ebp+2048)&0xfff
      &and    ($s3,0xfff);            # p = %esp&0xfff

      &cmp    ($s3,$s1);              # if (p>=e) %esp -= (p-e);
      &jb     (&label("te_break_out"));
      &sub    ($s3,$s1);
      &sub    ($key,$s3);
      &jmp    (&label("te_ok"));
    &set_label("te_break_out");       # else %esp -= (p-s)&0xfff + framesz;
      &sub    ($s3,$s0);
      &and    ($s3,0xfff);
      &add    ($s3,64+256);
      &sub    ($key,$s3);
      &align  (4);
    &set_label("te_ok");

      &mov    ($s0,&wparam(0));       # load inp
      &mov    ($s1,&wparam(1));       # load out
      &mov    ($s3,&wparam(3));       # load key
      &mov    ($acc,&wparam(4));      # load ivp

      &exch   ("esp",$key);           # switch to the new frame
      &add    ("esp",4);              # reserve for return address!
      &mov    ($_esp,$key);           # save %esp

      &mov    ($_inp,$s0);            # save copy of inp
      &mov    ($_out,$s1);            # save copy of out
      &mov    ($_len,$s2);            # save copy of len
      &mov    ($_key,$s3);            # save copy of key
      &mov    ($_ivp,$acc);           # save copy of ivp

      &mov    ($mark,0);              # copy of aes_key->rounds = 0;
                                      # (stays 0 unless the schedule is copied)
      if ($compromise) {
          &cmp    ($s2,$compromise);
          &jb     (&label("skip_ecopy"));
      }
      # do we copy key schedule to stack?
      &mov    ($s1 eq "ebx" ? $s1 : "",$s3);
      &mov    ($s2 eq "ecx" ? $s2 : "",244/4);
      &sub    ($s1,"ebp");
      &mov    ("esi",$s3);
      &and    ($s1,0xfff);
      &lea    ("edi",$aes_key);
      &cmp    ($s1,2048);
      &jb     (&label("do_ecopy"));
      &cmp    ($s1,4096-244);
      &jb     (&label("skip_ecopy"));
      &align  (4);
    &set_label("do_ecopy");
      &mov    ($_key,"edi");          # from now on use the stack copy
      &data_word(0xA5F3F689);         # rep movsd
    &set_label("skip_ecopy");

      &mov    ($acc,$s0);             # inp
      &mov    ($key,16);              # 16 iterations x 128 bytes = 2048
      &align  (4);
    &set_label("prefetch_te");
      &mov    ($s0,&DWP(0,"ebp"));
      &mov    ($s1,&DWP(32,"ebp"));
      &mov    ($s2,&DWP(64,"ebp"));
      &mov    ($s3,&DWP(96,"ebp"));
      &lea    ("ebp",&DWP(128,"ebp"));
      &dec    ($key);
      &jnz    (&label("prefetch_te"));
      &sub    ("ebp",2048);           # rewind table pointer

      &mov    ($s2,$_len);
      &mov    ($key,$_ivp);
      &test   ($s2,0xFFFFFFF0);
      &jz     (&label("enc_tail"));   # short input...

      &mov    ($s0,&DWP(0,$key));     # load iv
      &mov    ($s1,&DWP(4,$key));

      &align  (4);
    &set_label("enc_loop");
      # $key points at the chaining block (ivp on entry, the previous
      # output block afterwards); $s0/$s1 already hold its first two words.
      &mov    ($s2,&DWP(8,$key));
      &mov    ($s3,&DWP(12,$key));

      &xor    ($s0,&DWP(0,$acc));     # xor input data
      &xor    ($s1,&DWP(4,$acc));
      &xor    ($s2,&DWP(8,$acc));
      &xor    ($s3,&DWP(12,$acc));

      &mov    ($key,$_key);           # load key
      &call   ("_x86_AES_encrypt");

      &mov    ($acc,$_inp);           # load inp
      &mov    ($key,$_out);           # load out

      &mov    (&DWP(0,$key),$s0);     # save output data
      &mov    (&DWP(4,$key),$s1);
      &mov    (&DWP(8,$key),$s2);
      &mov    (&DWP(12,$key),$s3);

      &mov    ($s2,$_len);            # load len

      &lea    ($acc,&DWP(16,$acc));
      &mov    ($_inp,$acc);           # save inp

      &lea    ($s3,&DWP(16,$key));
      &mov    ($_out,$s3);            # save out

      &sub    ($s2,16);
      &test   ($s2,0xFFFFFFF0);
      &mov    ($_len,$s2);            # save len
      &jnz    (&label("enc_loop"));
      &test   ($s2,15);
      &jnz    (&label("enc_tail"));
      &mov    ($acc,$_ivp);           # load ivp
      &mov    ($s2,&DWP(8,$key));     # restore last dwords
      &mov    ($s3,&DWP(12,$key));
      &mov    (&DWP(0,$acc),$s0);     # save ivec
      &mov    (&DWP(4,$acc),$s1);
      &mov    (&DWP(8,$acc),$s2);
      &mov    (&DWP(12,$acc),$s3);

      &cmp    ($mark,0);              # was the key schedule copied?
      &mov    ("edi",$_key);          # (mov does not disturb the flags)
      &mov    ("esp",$_esp);
      &je     (&label("skip_ezero"));
      # zero copy of key schedule
      &mov    ("ecx",240/4);
      &xor    ("eax","eax");
      &align  (4);
      &data_word(0xABF3F689);         # rep stosd
    &set_label("skip_ezero");
      &popf   ();
    &set_label("enc_out");
      &function_end_A();
      &pushf  ();                     # kludge, never executed

      &align  (4);
    &set_label("enc_tail");
      # Entered with $key = chaining block pointer, $acc = inp, $s2 = number
      # of remaining bytes (1..15).  The output pointer must be fetched
      # *before* the push: $_out is an %esp-relative operand and would
      # address the neighbouring frame slot once %esp has moved.  $s0 is
      # free to use as a temporary, it is zeroed and reloaded below.
      &mov    ($s0,$key eq "edi" ? $key : "");
      &mov    ($key,$_out);           # load out
      &push   ($s0);                  # push ivp
      &mov    ($s1,16);
      &sub    ($s1,$s2);              # $s1 = number of padding bytes
      &cmp    ($key,$acc);            # compare with inp
      &je     (&label("enc_in_place"));
      &align  (4);
      &data_word(0xA4F3F689);         # rep movsb     # copy input
      &jmp    (&label("enc_skip_in_place"));
    &set_label("enc_in_place");
      &lea    ($key,&DWP(0,$key,$s2));
    &set_label("enc_skip_in_place");
      &mov    ($s2,$s1);
      &xor    ($s0,$s0);
      &align  (4);
      &data_word(0xAAF3F689);         # rep stosb     # zero tail
      &pop    ($key);                 # pop ivp

      &mov    ($acc,$_out);           # output as input
      &mov    ($s0,&DWP(0,$key));
      &mov    ($s1,&DWP(4,$key));
      &mov    ($_len,16);             # len=16
      &jmp    (&label("enc_loop"));   # one more spin...

  #----------------------------- DECRYPT -----------------------------#
      &align  (4);
    &set_label("DECRYPT");
      &lea    ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

      # allocate aligned stack frame...
      &lea    ($key,&DWP(-64-244,"esp"));
      &and    ($key,-64);

      # ... and make sure it doesn't alias with AES_Td modulo 4096
      &mov    ($s0,"ebp");
      &lea    ($s1,&DWP(3072,"ebp"));
      &mov    ($s3,$key);
      &and    ($s0,0xfff);            # s = %ebp&0xfff
      &and    ($s1,0xfff);            # e = (%ebp+3072)&0xfff
      &and    ($s3,0xfff);            # p = %esp&0xfff

      &cmp    ($s3,$s1);              # if (p>=e) %esp -= (p-e);
      &jb     (&label("td_break_out"));
      &sub    ($s3,$s1);
      &sub    ($key,$s3);
      &jmp    (&label("td_ok"));
    &set_label("td_break_out");       # else %esp -= (p-s)&0xfff + framesz;
      &sub    ($s3,$s0);
      &and    ($s3,0xfff);
      &add    ($s3,64+256);
      &sub    ($key,$s3);
      &align  (4);
    &set_label("td_ok");

      &mov    ($s0,&wparam(0));       # load inp
      &mov    ($s1,&wparam(1));       # load out
      &mov    ($s3,&wparam(3));       # load key
      &mov    ($acc,&wparam(4));      # load ivp

      &exch   ("esp",$key);           # switch to the new frame
      &add    ("esp",4);              # reserve for return address!
      &mov    ($_esp,$key);           # save %esp

      &mov    ($_inp,$s0);            # save copy of inp
      &mov    ($_out,$s1);            # save copy of out
      &mov    ($_len,$s2);            # save copy of len
      &mov    ($_key,$s3);            # save copy of key
      &mov    ($_ivp,$acc);           # save copy of ivp

      &mov    ($mark,0);              # copy of aes_key->rounds = 0;
                                      # (stays 0 unless the schedule is copied)
      if ($compromise) {
          &cmp    ($s2,$compromise);
          &jb     (&label("skip_dcopy"));
      }
      # do we copy key schedule to stack?
      &mov    ($s1 eq "ebx" ? $s1 : "",$s3);
      &mov    ($s2 eq "ecx" ? $s2 : "",244/4);
      &sub    ($s1,"ebp");
      &mov    ("esi",$s3);
      &and    ($s1,0xfff);
      &lea    ("edi",$aes_key);
      &cmp    ($s1,3072);
      &jb     (&label("do_dcopy"));
      &cmp    ($s1,4096-244);
      &jb     (&label("skip_dcopy"));
      &align  (4);
    &set_label("do_dcopy");
      &mov    ($_key,"edi");          # from now on use the stack copy
      &data_word(0xA5F3F689);         # rep movsd
    &set_label("skip_dcopy");

      &mov    ($acc,$s0);             # inp
      &mov    ($key,24);              # 24 iterations x 128 bytes = 3072
      &align  (4);
    &set_label("prefetch_td");
      &mov    ($s0,&DWP(0,"ebp"));
      &mov    ($s1,&DWP(32,"ebp"));
      &mov    ($s2,&DWP(64,"ebp"));
      &mov    ($s3,&DWP(96,"ebp"));
      &lea    ("ebp",&DWP(128,"ebp"));
      &dec    ($key);
      &jnz    (&label("prefetch_td"));
      &sub    ("ebp",3072);           # rewind table pointer

      &cmp    ($acc,$_out);
      &je     (&label("dec_in_place"));       # in-place processing...

      &mov    ($key,$_ivp);           # load ivp
      &mov    ($_tmp,$key);           # $_tmp tracks the current chaining block

      &align  (4);
    &set_label("dec_loop");
      &mov    ($s0,&DWP(0,$acc));     # read input
      &mov    ($s1,&DWP(4,$acc));
      &mov    ($s2,&DWP(8,$acc));
      &mov    ($s3,&DWP(12,$acc));

      &mov    ($key,$_key);           # load key
      &call   ("_x86_AES_decrypt");

      &mov    ($key,$_tmp);           # load ivp
      &mov    ($acc,$_len);           # load len
      &xor    ($s0,&DWP(0,$key));     # xor iv
      &xor    ($s1,&DWP(4,$key));
      &xor    ($s2,&DWP(8,$key));
      &xor    ($s3,&DWP(12,$key));

      &sub    ($acc,16);
      &jc     (&label("dec_partial"));        # fewer than 16 bytes were left
      &mov    ($_len,$acc);           # save len
      &mov    ($acc,$_inp);           # load inp
      &mov    ($key,$_out);           # load out

      &mov    (&DWP(0,$key),$s0);     # write output
      &mov    (&DWP(4,$key),$s1);
      &mov    (&DWP(8,$key),$s2);
      &mov    (&DWP(12,$key),$s3);

      &mov    ($_tmp,$acc);           # save ivp
      &lea    ($acc,&DWP(16,$acc));
      &mov    ($_inp,$acc);           # save inp

      &lea    ($key,&DWP(16,$key));
      &mov    ($_out,$key);           # save out

      &jnz    (&label("dec_loop"));   # flags still from the "sub" above
      &mov    ($key,$_tmp);           # load temp ivp
    &set_label("dec_end");
      &mov    ($acc,$_ivp);           # load user ivp
      &mov    ($s0,&DWP(0,$key));     # load iv
      &mov    ($s1,&DWP(4,$key));
      &mov    ($s2,&DWP(8,$key));
      &mov    ($s3,&DWP(12,$key));
      &mov    (&DWP(0,$acc),$s0);     # copy back to user
      &mov    (&DWP(4,$acc),$s1);
      &mov    (&DWP(8,$acc),$s2);
      &mov    (&DWP(12,$acc),$s3);
      &jmp    (&label("dec_out"));

      &align  (4);
    &set_label("dec_partial");
      &lea    ($key,$ivec);
      &mov    (&DWP(0,$key),$s0);     # dump output to stack
      &mov    (&DWP(4,$key),$s1);
      &mov    (&DWP(8,$key),$s2);
      &mov    (&DWP(12,$key),$s3);
      &lea    ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));        # ecx = bytes left
      &mov    ($acc eq "esi" ? $acc : "",$key);
      &mov    ($key eq "edi" ? $key : "",$_out);      # load out
      &data_word(0xA4F3F689);         # rep movsb     # copy output
      &mov    ($key,$_inp);           # use inp as temp ivp
      &jmp    (&label("dec_end"));

      &align  (4);
    &set_label("dec_in_place");
    &set_label("dec_in_place_loop");
      &lea    ($key,$ivec);
      &mov    ($s0,&DWP(0,$acc));     # read input
      &mov    ($s1,&DWP(4,$acc));
      &mov    ($s2,&DWP(8,$acc));
      &mov    ($s3,&DWP(12,$acc));

      &mov    (&DWP(0,$key),$s0);     # copy to temp
      &mov    (&DWP(4,$key),$s1);
      &mov    (&DWP(8,$key),$s2);
      &mov    (&DWP(12,$key),$s3);

      &mov    ($key,$_key);           # load key
      &call   ("_x86_AES_decrypt");

      &mov    ($key,$_ivp);           # load ivp
      &mov    ($acc,$_out);           # load out
      &xor    ($s0,&DWP(0,$key));     # xor iv
      &xor    ($s1,&DWP(4,$key));
      &xor    ($s2,&DWP(8,$key));
      &xor    ($s3,&DWP(12,$key));

      &mov    (&DWP(0,$acc),$s0);     # write output
      &mov    (&DWP(4,$acc),$s1);
      &mov    (&DWP(8,$acc),$s2);
      &mov    (&DWP(12,$acc),$s3);

      &lea    ($acc,&DWP(16,$acc));
      &mov    ($_out,$acc);           # save out

      &lea    ($acc,$ivec);
      &mov    ($s0,&DWP(0,$acc));     # read temp
      &mov    ($s1,&DWP(4,$acc));
      &mov    ($s2,&DWP(8,$acc));
      &mov    ($s3,&DWP(12,$acc));

      &mov    (&DWP(0,$key),$s0);     # copy iv
      &mov    (&DWP(4,$key),$s1);
      &mov    (&DWP(8,$key),$s2);
      &mov    (&DWP(12,$key),$s3);

      &mov    ($acc,$_inp);           # load inp

      &lea    ($acc,&DWP(16,$acc));
      &mov    ($_inp,$acc);           # save inp

      &mov    ($s2,$_len);            # load len
      &sub    ($s2,16);
      &jc     (&label("dec_in_place_partial"));
      &mov    ($_len,$s2);            # save len
      &jnz    (&label("dec_in_place_loop"));
      &jmp    (&label("dec_out"));

      &align  (4);
    &set_label("dec_in_place_partial");
      # one can argue if this is actually required...
      # ($s2 = len-16 is negative here; -$s2 bytes of the saved input
      #  block are copied back over the end of the output block)
      &mov    ($key eq "edi" ? $key : "",$_out);
      &lea    ($acc eq "esi" ? $acc : "",$ivec);
      &lea    ($key,&DWP(0,$key,$s2));
      &lea    ($acc,&DWP(16,$acc,$s2));
      &neg    ($s2 eq "ecx" ? $s2 : "");
      &data_word(0xA4F3F689);         # rep movsb     # restore tail

      &align  (4);
    &set_label("dec_out");
      &cmp    ($mark,0);              # was the key schedule copied?
      &mov    ("edi",$_key);          # (mov does not disturb the flags)
      &mov    ("esp",$_esp);
      &je     (&label("skip_dzero"));
      # zero copy of key schedule
      &mov    ("ecx",240/4);
      &xor    ("eax","eax");
      &align  (4);
      &data_word(0xABF3F689);         # rep stosd
    &set_label("skip_dzero");
      &popf   ();
  &function_end("AES_cbc_encrypt");
  }
  1103. #------------------------------------------------------------------#
  sub enckey()
  {
      # Emits the word-mixing step shared by all three key-expansion loops
      # of AES_set_encrypt_key.  Each byte of %edx indexes the table at
      # %ebp (8-byte stride, displacement 0 or 2); one byte lane of each
      # looked-up dword is kept and xor-ed into %eax.  Finally the dword at
      # 2048(%ebp,%ecx,4) -- "rcon" per the original comment -- is xor-ed in.
      #
      # In:       eax = word to update, edx = source word,
      #           ebp = table base,     ecx = rcon index
      # Out:      eax
      # Clobbers: ebx, edx, esi, flags
      my ($word, $src)  = ("eax", "edx");
      my ($pick, $idx)  = ("ebx", "esi");
      my ($tbl,  $iter) = ("ebp", "ecx");

      &movz   ($idx,&LB($src));               # rk[i]>>0
      &mov    ($pick,&DWP(2,$tbl,$idx,8));
      &movz   ($idx,&HB($src));               # rk[i]>>8
      &and    ($pick,0xFF000000);
      &xor    ($word,$pick);

      &mov    ($pick,&DWP(2,$tbl,$idx,8));
      &shr    ($src,16);
      &and    ($pick,0x000000FF);
      &movz   ($idx,&LB($src));               # rk[i]>>16
      &xor    ($word,$pick);

      &mov    ($pick,&DWP(0,$tbl,$idx,8));
      &movz   ($idx,&HB($src));               # rk[i]>>24
      &and    ($pick,0x0000FF00);
      &xor    ($word,$pick);

      &mov    ($pick,&DWP(0,$tbl,$idx,8));
      &and    ($pick,0x00FF0000);
      &xor    ($word,$pick);

      &xor    ($word,&DWP(2048,$tbl,$iter,4));        # rcon
  }
  # int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
  #                         AES_KEY *key)
  #
  # Expands userKey into the schedule at key.  Value left in %eax:
  #    0  success (number of rounds stored at byte offset 240 of key)
  #   -1  userKey or key is NULL
  #   -2  bits is not 128, 192 or 256
  {
      my @quad = ("eax", "ebx", "ecx", "edx");

      # Copy four dwords from $off(%esi) to $off(%edi): all loads first,
      # then all stores, going through eax/ebx/ecx/edx.
      my $copy_quad = sub {
          my ($off) = @_;
          for my $i (0 .. 3) { &mov ($quad[$i],&DWP($off+4*$i,"esi")); }
          for my $i (0 .. 3) { &mov (&DWP($off+4*$i,"edi"),$quad[$i]); }
      };

      # Store %eax at $dst(%edi); then for every offset in @src xor the
      # dword at that offset of %edi into %eax and store the result into
      # the next consecutive dword slot after $dst.
      my $xor_chain = sub {
          my ($dst, @src) = @_;
          &mov    (&DWP($dst,"edi"),"eax");
          for my $off (@src) {
              $dst += 4;
              &xor    ("eax",&DWP($off,"edi"));
              &mov    (&DWP($dst,"edi"),"eax");
          }
      };

  &public_label("AES_Te");
  &function_begin("AES_set_encrypt_key");
      &mov    ("esi",&wparam(0));             # user supplied key
      &mov    ("edi",&wparam(2));             # private key schedule

      &test   ("esi",-1);
      &jz     (&label("badpointer"));
      &test   ("edi",-1);
      &jz     (&label("badpointer"));

      &call   (&label("pic_point"));
      &set_label("pic_point");
      &blindpop("ebp");
      &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

      &mov    ("ecx",&wparam(1));             # number of bits in key
      &cmp    ("ecx",128);
      &je     (&label("10rounds"));
      &cmp    ("ecx",192);
      &je     (&label("12rounds"));
      &cmp    ("ecx",256);
      &je     (&label("14rounds"));
      &mov    ("eax",-2);                     # invalid number of bits
      &jmp    (&label("exit"));

      # ---------------- 128-bit key ----------------
    &set_label("10rounds");
      $copy_quad->(0);                        # copy first 4 dwords
                                              # (eax=rk[0], edx=rk[3] stay live)
      &xor    ("ecx","ecx");                  # round counter
      &jmp    (&label("10shortcut"));

      &align  (4);
    &set_label("10loop");
      &mov    ("eax",&DWP(0,"edi"));          # rk[0]
      &mov    ("edx",&DWP(12,"edi"));         # rk[3]
    &set_label("10shortcut");
      &enckey ();

      $xor_chain->(16, 4, 8, 12);             # rk[4..7]
      &inc    ("ecx");
      &add    ("edi",16);
      &cmp    ("ecx",10);
      &jl     (&label("10loop"));

      &mov    (&DWP(80,"edi"),10);            # setup number of rounds
      &xor    ("eax","eax");
      &jmp    (&label("exit"));

      # ---------------- 192-bit key ----------------
    &set_label("12rounds");
      $copy_quad->(0);                        # copy first 6 dwords
      &mov    ("ecx",&DWP(16,"esi"));
      &mov    ("edx",&DWP(20,"esi"));
      &mov    (&DWP(16,"edi"),"ecx");
      &mov    (&DWP(20,"edi"),"edx");         # (eax=rk[0], edx=rk[5] stay live)

      &xor    ("ecx","ecx");                  # round counter
      &jmp    (&label("12shortcut"));

      &align  (4);
    &set_label("12loop");
      &mov    ("eax",&DWP(0,"edi"));          # rk[0]
      &mov    ("edx",&DWP(20,"edi"));         # rk[5]
    &set_label("12shortcut");
      &enckey ();

      $xor_chain->(24, 4, 8, 12);             # rk[6..9]

      &cmp    ("ecx",7);
      &je     (&label("12break"));
      &inc    ("ecx");

      for my $off (16, 20) {                  # rk[10], rk[11]
          &xor    ("eax",&DWP($off,"edi"));
          &mov    (&DWP($off+24,"edi"),"eax");
      }

      &add    ("edi",24);
      &jmp    (&label("12loop"));

    &set_label("12break");
      &mov    (&DWP(72,"edi"),12);            # setup number of rounds
      &xor    ("eax","eax");
      &jmp    (&label("exit"));

      # ---------------- 256-bit key ----------------
    &set_label("14rounds");
      $copy_quad->(0);                        # copy first 8 dwords
      $copy_quad->(16);                       # (edx=rk[7] stays live)

      &xor    ("ecx","ecx");                  # round counter
      &jmp    (&label("14shortcut"));

      &align  (4);
    &set_label("14loop");
      &mov    ("edx",&DWP(28,"edi"));         # rk[7]
    &set_label("14shortcut");
      &mov    ("eax",&DWP(0,"edi"));          # rk[0]

      &enckey ();

      $xor_chain->(32, 4, 8, 12);             # rk[8..11]

      &cmp    ("ecx",6);
      &je     (&label("14break"));
      &inc    ("ecx");

      # Second half of the 256-bit step: byte-wise table substitution of
      # rk[11] (each byte stays in its own lane) xor-ed into rk[4].
      &mov    ("edx","eax");
      &mov    ("eax",&DWP(16,"edi"));         # rk[4]
      &movz   ("esi",&LB("edx"));             # rk[11]>>0
      &mov    ("ebx",&DWP(2,"ebp","esi",8));
      &movz   ("esi",&HB("edx"));             # rk[11]>>8
      &and    ("ebx",0x000000FF);
      &xor    ("eax","ebx");

      &mov    ("ebx",&DWP(0,"ebp","esi",8));
      &shr    ("edx",16);
      &and    ("ebx",0x0000FF00);
      &movz   ("esi",&LB("edx"));             # rk[11]>>16
      &xor    ("eax","ebx");

      &mov    ("ebx",&DWP(0,"ebp","esi",8));
      &movz   ("esi",&HB("edx"));             # rk[11]>>24
      &and    ("ebx",0x00FF0000);
      &xor    ("eax","ebx");

      &mov    ("ebx",&DWP(2,"ebp","esi",8));
      &and    ("ebx",0xFF000000);
      &xor    ("eax","ebx");

      $xor_chain->(48, 20, 24, 28);           # rk[12..15]

      &add    ("edi",32);
      &jmp    (&label("14loop"));

    &set_label("14break");
      &mov    (&DWP(48,"edi"),14);            # setup number of rounds
      &xor    ("eax","eax");
      &jmp    (&label("exit"));

    &set_label("badpointer");
      &mov    ("eax",-1);
    &set_label("exit");
  &function_end("AES_set_encrypt_key");
  }
  sub deckey()
  {
      # Emits the in-place transformation of one key-schedule dword for
      # AES_set_decrypt_key.
      #
      #   $i   byte offset of the dword, relative to $ptr
      #   $ptr register pointing into the key schedule
      #   $te  register holding the base of the encryption table
      #   $td  register holding the base of the decryption table
      #
      # Every byte of the dword is first mapped through the byte found at
      # displacement 2 of the $te entry it selects (8-byte stride); the
      # mapped byte then selects a $td entry, read as a dword at
      # displacement 0, 3, 2 and 1 for source bytes 0..3 respectively.
      # The four dwords are xor-ed together and written back.
      #
      # Clobbers: eax, ebx, edx, flags
      my ($i,$ptr,$te,$td) = @_;
      my ($out, $byte, $upper) = ("eax", "ebx", "edx");

      &mov    ($out,&DWP($i,$ptr));
      &mov    ($upper,$out);
      &movz   ($byte,&HB($out));              # byte 1
      &shr    ($upper,16);                    # bytes 2 and 3
      &and    ($out,0xFF);                    # byte 0
      &movz   ($out,&BP(2,$te,$out,8));
      &movz   ($byte,&BP(2,$te,$byte,8));
      &mov    ($out,&DWP(0,$td,$out,8));
      &xor    ($out,&DWP(3,$td,$byte,8));

      &movz   ($byte,&HB($upper));            # byte 3
      &and    ($upper,0xFF);                  # byte 2
      &movz   ($upper,&BP(2,$te,$upper,8));
      &movz   ($byte,&BP(2,$te,$byte,8));
      &xor    ($out,&DWP(2,$td,$upper,8));
      &xor    ($out,&DWP(1,$td,$byte,8));

      &mov    (&DWP($i,$ptr),$out);
  }
  # int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
  #                         AES_KEY *key)
  #
  # Builds a decryption key schedule in three steps:
  #   1. call AES_set_encrypt_key with the same three arguments; a non-zero
  #      result is returned to the caller unchanged;
  #   2. reverse the order of the 16-byte chunks of the schedule;
  #   3. run every dword of chunks 1 .. rounds-1 through deckey().
  # %eax is 0 on success.
  &public_label("AES_Td");
  &public_label("AES_Te");
  &function_begin_B("AES_set_decrypt_key");
      # forward our arguments to AES_set_encrypt_key
      &mov    ("eax",&wparam(0));
      &mov    ("ecx",&wparam(1));
      &mov    ("edx",&wparam(2));
      &sub    ("esp",12);
      &mov    (&DWP(0,"esp"),"eax");
      &mov    (&DWP(4,"esp"),"ecx");
      &mov    (&DWP(8,"esp"),"edx");
      &call   ("AES_set_encrypt_key");
      &add    ("esp",12);
      &cmp    ("eax",0);
      &je     (&label("proceed"));
      &ret    ();                             # propagate the error code

    &set_label("proceed");
      # saved here by hand; &function_end below emits the matching epilogue
      &push   ("ebp");
      &push   ("ebx");
      &push   ("esi");
      &push   ("edi");

      &mov    ("esi",&wparam(2));
      &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
      &lea    ("ecx",&DWP(0,"","ecx",4));
      &lea    ("edi",&DWP(0,"esi","ecx",4));  # pointer to last chunk
                                              # (= esi + 16*rounds)
      &align  (4);
    &set_label("invert");                     # invert order of chunks
      # swap the 16-byte chunks at %esi and %edi, eight bytes at a time
      for my $off (0, 8) {
          &mov    ("eax",&DWP($off,  "esi"));
          &mov    ("ebx",&DWP($off+4,"esi"));
          &mov    ("ecx",&DWP($off,  "edi"));
          &mov    ("edx",&DWP($off+4,"edi"));
          &mov    (&DWP($off,  "edi"),"eax");
          &mov    (&DWP($off+4,"edi"),"ebx");
          &mov    (&DWP($off,  "esi"),"ecx");
          &mov    (&DWP($off+4,"esi"),"edx");
      }
      &add    ("esi",16);
      &sub    ("edi",16);
      &cmp    ("esi","edi");                  # pointers meet in the middle
      &jne    (&label("invert"));

      &call   (&label("pic_point"));
      &set_label("pic_point");
      &blindpop("ebp");                       # same call form as every other
                                              # perlasm helper in this file
      &lea    ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
      &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

      &mov    ("esi",&wparam(2));
      &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
      &dec    ("ecx");                        # chunks 1 .. rounds-1 only
      &align  (4);
    &set_label("permute");                    # permute the key schedule
      &add    ("esi",16);
      for my $off (0, 4, 8, 12) {
          &deckey ($off,"esi","ebp","edi");
      }
      &dec    ("ecx");
      &jnz    (&label("permute"));

      &xor    ("eax","eax");                  # return success
  &function_end("AES_set_decrypt_key");

  &asm_finish();