2
0

bn-586.pl 16 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774
  #!/usr/local/bin/perl
  #
  # Driver for the x86 bignum word primitives.  It loads the perlasm
  # framework (x86asm.pl), initialises it from the command line and then
  # emits each routine in turn.

  # Make the script's own directory and the shared perlasm directory
  # (two levels up) searchable for x86asm.pl.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/;
  $dir = $1;
  push(@INC, "${dir}", "${dir}../../perlasm");
  require "x86asm.pl";

  &asm_init($ARGV[0], $0);

  # $sse2 is a file-wide switch consulted by the generator subs below; it is
  # enabled when any command-line argument mentions -DOPENSSL_IA32_SSE2.
  $sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

  &external_label("OPENSSL_ia32cap_P") if ($sse2);

  # Every generator sub takes the name of the routine it emits, which here
  # is always the same as the sub's own name.
  foreach my $routine (qw(
      bn_mul_add_words
      bn_mul_words
      bn_sqr_words
      bn_div_words
      bn_add_words
      bn_sub_words
      bn_sub_part_words
  )) {
      my $generator = \&{$routine};
      $generator->($routine);
  }

  &asm_finish();
  # bn_mul_add_words(name)
  #
  # Emits the x86 routine <name>, whose stack arguments are read as
  #   wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w.
  # For each of the num 32-bit words it performs r[i] += a[i]*w, propagating
  # a one-word carry, and returns the final carry word in eax.
  #
  # When $sse2 is set, the routine starts with a run-time test of bit 26 of
  # the first dword of OPENSSL_ia32cap_P (presumably the CPUID SSE2 flag);
  # if the bit is set an MMX/SSE2 implementation (pmuludq/paddq) is used,
  # otherwise control continues at the integer mul/adc implementation.
  #
  # Both paths return 0 without touching memory when num is 0.
  sub bn_mul_add_words
  {
      local($name)=@_;

      &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

      # Register roles for the SSE2 path.  Only eax/edx/ecx and MMX
      # registers are used there, so nothing is pushed on the stack.
      $r="eax";       # result pointer
      $a="edx";       # source pointer
      $c="ecx";       # remaining word count (num)

      if ($sse2) {
          &picmeup("eax","OPENSSL_ia32cap_P");
          &bt(&DWP(0,"eax"),26);
          &jnc(&label("maw_non_sse2"));

          &mov($r,&wparam(0));
          &mov($a,&wparam(1));
          &mov($c,&wparam(2));
          &movd("mm0",&wparam(3));        # mm0 = w
          &pxor("mm1","mm1");             # mm1 = carry_in

          # num == 0: leave immediately with carry 0.  Without this check
          # control would fall into maw_sse2_loop, touch r[0]/a[0] and wrap
          # the counter with its "sub 1; jnz" termination test.
          &test($c,$c);
          &jz(&label("maw_sse2_exit"));
          &jmp(&label("maw_sse2_entry"));

      # Eight words per iteration.  The instruction order below is
      # hand-scheduled; mm1 always carries the running 64-bit sum whose
      # high half is the carry into the next word.
      &set_label("maw_sse2_unrolled",16);
          &movd("mm3",&DWP(0,$r,"",0));   # mm3 = r[0]
          &paddq("mm1","mm3");            # mm1 = carry_in + r[0]
          &movd("mm2",&DWP(0,$a,"",0));   # mm2 = a[0]
          &pmuludq("mm2","mm0");          # mm2 = w*a[0]
          &movd("mm4",&DWP(4,$a,"",0));   # mm4 = a[1]
          &pmuludq("mm4","mm0");          # mm4 = w*a[1]
          &movd("mm6",&DWP(8,$a,"",0));   # mm6 = a[2]
          &pmuludq("mm6","mm0");          # mm6 = w*a[2]
          &movd("mm7",&DWP(12,$a,"",0));  # mm7 = a[3]
          &pmuludq("mm7","mm0");          # mm7 = w*a[3]
          &paddq("mm1","mm2");            # mm1 = carry_in + r[0] + w*a[0]
          &movd("mm3",&DWP(4,$r,"",0));   # mm3 = r[1]
          &paddq("mm3","mm4");            # mm3 = r[1] + w*a[1]
          &movd("mm5",&DWP(8,$r,"",0));   # mm5 = r[2]
          &paddq("mm5","mm6");            # mm5 = r[2] + w*a[2]
          &movd("mm4",&DWP(12,$r,"",0));  # mm4 = r[3]
          &paddq("mm7","mm4");            # mm7 = r[3] + w*a[3]
          &movd(&DWP(0,$r,"",0),"mm1");   # r[0] = low word of the sum
          &movd("mm2",&DWP(16,$a,"",0));  # mm2 = a[4]
          &pmuludq("mm2","mm0");          # mm2 = w*a[4]
          &psrlq("mm1",32);               # mm1 = carry0
          &movd("mm4",&DWP(20,$a,"",0));  # mm4 = a[5]
          &pmuludq("mm4","mm0");          # mm4 = w*a[5]
          &paddq("mm1","mm3");            # mm1 = carry0 + r[1] + w*a[1]
          &movd("mm6",&DWP(24,$a,"",0));  # mm6 = a[6]
          &pmuludq("mm6","mm0");          # mm6 = w*a[6]
          &movd(&DWP(4,$r,"",0),"mm1");   # r[1]
          &psrlq("mm1",32);               # mm1 = carry1
          &movd("mm3",&DWP(28,$a,"",0));  # mm3 = a[7]
          &add($a,32);                    # a += 8 words (all a[] loads done)
          &pmuludq("mm3","mm0");          # mm3 = w*a[7]
          &paddq("mm1","mm5");            # mm1 = carry1 + r[2] + w*a[2]
          &movd("mm5",&DWP(16,$r,"",0));  # mm5 = r[4]
          &paddq("mm2","mm5");            # mm2 = r[4] + w*a[4]
          &movd(&DWP(8,$r,"",0),"mm1");   # r[2]
          &psrlq("mm1",32);               # mm1 = carry2
          &paddq("mm1","mm7");            # mm1 = carry2 + r[3] + w*a[3]
          &movd("mm5",&DWP(20,$r,"",0));  # mm5 = r[5]
          &paddq("mm4","mm5");            # mm4 = r[5] + w*a[5]
          &movd(&DWP(12,$r,"",0),"mm1");  # r[3]
          &psrlq("mm1",32);               # mm1 = carry3
          &paddq("mm1","mm2");            # mm1 = carry3 + r[4] + w*a[4]
          &movd("mm5",&DWP(24,$r,"",0));  # mm5 = r[6]
          &paddq("mm6","mm5");            # mm6 = r[6] + w*a[6]
          &movd(&DWP(16,$r,"",0),"mm1");  # r[4]
          &psrlq("mm1",32);               # mm1 = carry4
          &paddq("mm1","mm4");            # mm1 = carry4 + r[5] + w*a[5]
          &movd("mm5",&DWP(28,$r,"",0));  # mm5 = r[7]
          &paddq("mm3","mm5");            # mm3 = r[7] + w*a[7]
          &movd(&DWP(20,$r,"",0),"mm1");  # r[5]
          &psrlq("mm1",32);               # mm1 = carry5
          &paddq("mm1","mm6");            # mm1 = carry5 + r[6] + w*a[6]
          &movd(&DWP(24,$r,"",0),"mm1");  # r[6]
          &psrlq("mm1",32);               # mm1 = carry6
          &paddq("mm1","mm3");            # mm1 = carry6 + r[7] + w*a[7]
          &movd(&DWP(28,$r,"",0),"mm1");  # r[7]
          &lea($r,&DWP(32,$r));           # r += 8 words (lea keeps flags)
          &psrlq("mm1",32);               # mm1 = carry_out
          &sub($c,8);
          &jz(&label("maw_sse2_exit"));

      # Loop dispatch: take the unrolled body while at least 8 words remain.
      &set_label("maw_sse2_entry");
          &test($c,0xfffffff8);
          &jnz(&label("maw_sse2_unrolled"));

      # One word per iteration; entered with 1..7 words remaining.
      &set_label("maw_sse2_loop",4);
          &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
          &movd("mm3",&DWP(0,$r));        # mm3 = r[i]
          &pmuludq("mm2","mm0");          # a[i] *= w
          &lea($a,&DWP(4,$a));
          &paddq("mm1","mm3");            # carry += r[i]
          &paddq("mm1","mm2");            # carry += a[i]*w
          &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
          &sub($c,1);
          &psrlq("mm1",32);               # carry = carry_high
          &lea($r,&DWP(4,$r));            # lea leaves the flags of sub intact
          &jnz(&label("maw_sse2_loop"));

      &set_label("maw_sse2_exit");
          &movd("eax","mm1");             # c = carry_out
          &emms();
          &ret();

      &set_label("maw_non_sse2",16);
      }

      # function_begin prologue (done by hand because the SSE2 path above
      # must run before any register is pushed).
      &push("ebp");
      &push("ebx");
      &push("esi");
      &push("edi");

      &comment("");
      # Register roles for the integer path.  $Low/$High are not referenced
      # again in this sub; mul leaves its result in edx:eax.
      $Low="eax";
      $High="edx";
      $a="ebx";
      $w="ebp";
      $r="edi";
      $c="esi";

      &xor($c,$c);                # clear carry
      &mov($r,&wparam(0));
      &mov("ecx",&wparam(2));
      &mov($a,&wparam(1));
      &and("ecx",0xfffffff8);     # ecx = num & ~7: words for the unrolled loop
      &mov($w,&wparam(3));        # mov/push do not disturb ZF from the and
      &push("ecx");               # extra stack slot; popped again before return
      &jz(&label("maw_finish"));

      &set_label("maw_loop",16);
      for ($i=0; $i<32; $i+=4)
      {
          &comment("Round $i");
          &mov("eax",&DWP($i,$a));    # *a
          &mul($w);                   # *a * w
          &add("eax",$c);             # L(t)+= c
          &adc("edx",0);              # H(t)+=carry
          &add("eax",&DWP($i,$r));    # L(t)+= *r
          &adc("edx",0);              # H(t)+=carry
          &mov(&DWP($i,$r),"eax");    # *r= L(t);
          &mov($c,"edx");             # c= H(t);
      }

      &comment("");
      &sub("ecx",8);
      &lea($a,&DWP(32,$a));           # lea keeps the flags of the sub
      &lea($r,&DWP(32,$r));
      &jnz(&label("maw_loop"));

      &set_label("maw_finish",0);
      &mov("ecx",&wparam(2));         # get num
      &and("ecx",7);                  # remaining 0..7 words
      &jnz(&label("maw_finish2"));    # helps branch prediction
      &jmp(&label("maw_end"));

      &set_label("maw_finish2",1);
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov("eax",&DWP($i*4,$a));  # *a
          &mul($w);                   # *a * w
          &add("eax",$c);             # L(t)+=c
          &adc("edx",0);              # H(t)+=carry
          &add("eax",&DWP($i*4,$r));  # L(t)+= *r
          &adc("edx",0);              # H(t)+=carry
          &dec("ecx") if ($i != 7-1); # sets ZF for the jz below; the two
                                      # movs in between leave flags alone
          &mov(&DWP($i*4,$r),"eax");  # *r= L(t);
          &mov($c,"edx");             # c= H(t);
          &jz(&label("maw_end")) if ($i != 7-1);
      }

      &set_label("maw_end",0);
      &mov("eax",$c);                 # return the carry word
      &pop("ecx");                    # drop the extra stack slot

      &function_end($name);
  }
  # bn_mul_words(name)
  #
  # Emits the x86 routine <name>, whose stack arguments are read as
  #   wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w.
  # For each of the num 32-bit words it stores the low word of
  # a[i]*w + carry into r[i] and keeps the high word as the next carry.
  # The final carry word is returned in eax.
  #
  # When $sse2 is set, bit 26 of the first dword of OPENSSL_ia32cap_P
  # (presumably the CPUID SSE2 flag) is tested at run time to choose between
  # an MMX/SSE2 loop and the integer mul/adc implementation.
  #
  # Both paths return 0 without touching memory when num is 0.
  sub bn_mul_words
  {
      local($name)=@_;

      &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

      # Register roles for the SSE2 path (nothing is pushed there).
      $r="eax";       # result pointer
      $a="edx";       # source pointer
      $c="ecx";       # remaining word count (num)

      if ($sse2) {
          &picmeup("eax","OPENSSL_ia32cap_P");
          &bt(&DWP(0,"eax"),26);
          &jnc(&label("mw_non_sse2"));

          &mov($r,&wparam(0));
          &mov($a,&wparam(1));
          &mov($c,&wparam(2));
          &movd("mm0",&wparam(3));        # mm0 = w
          &pxor("mm1","mm1");             # mm1 = carry = 0

          # The loop below tests its counter only after the first
          # iteration, so num == 0 must be filtered out here; otherwise it
          # would store to r[0] and wrap the counter.
          &test($c,$c);
          &jz(&label("mw_sse2_exit"));

      &set_label("mw_sse2_loop",16);
          &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
          &pmuludq("mm2","mm0");          # a[i] *= w
          &lea($a,&DWP(4,$a));
          &paddq("mm1","mm2");            # carry += a[i]*w
          &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
          &sub($c,1);
          &psrlq("mm1",32);               # carry = carry_high
          &lea($r,&DWP(4,$r));            # lea leaves the flags of sub intact
          &jnz(&label("mw_sse2_loop"));

      &set_label("mw_sse2_exit");
          &movd("eax","mm1");             # return carry
          &emms();
          &ret();

      &set_label("mw_non_sse2",16);
      }

      # function_begin prologue (done by hand because the SSE2 path above
      # must run before any register is pushed).
      &push("ebp");
      &push("ebx");
      &push("esi");
      &push("edi");

      &comment("");
      # Register roles for the integer path.  $Low/$High are not referenced
      # again in this sub; mul leaves its result in edx:eax.
      $Low="eax";
      $High="edx";
      $a="ebx";
      $w="ecx";
      $r="edi";
      $c="esi";
      $num="ebp";

      &xor($c,$c);                # clear carry
      &mov($r,&wparam(0));
      &mov($a,&wparam(1));
      &mov($num,&wparam(2));
      &mov($w,&wparam(3));
      &and($num,0xfffffff8);      # num & ~7: words for the unrolled loop
      &jz(&label("mw_finish"));

      &set_label("mw_loop",0);
      for ($i=0; $i<32; $i+=4)
      {
          &comment("Round $i");
          &mov("eax",&DWP($i,$a,"",0));   # *a
          &mul($w);                       # *a * w
          &add("eax",$c);                 # L(t)+=c
          # (no "+= *r" step here, unlike bn_mul_add_words)
          &adc("edx",0);                  # H(t)+=carry
          &mov(&DWP($i,$r,"",0),"eax");   # *r= L(t);
          &mov($c,"edx");                 # c= H(t);
      }

      &comment("");
      &add($a,32);
      &add($r,32);
      &sub($num,8);
      # Loop while words remain; mw_finish follows directly, so a single
      # conditional branch replaces the former "jz mw_finish; jmp mw_loop".
      &jnz(&label("mw_loop"));

      &set_label("mw_finish",0);
      &mov($num,&wparam(2));      # get num
      &and($num,7);               # remaining 0..7 words
      &jnz(&label("mw_finish2"));
      &jmp(&label("mw_end"));

      &set_label("mw_finish2",1);
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov("eax",&DWP($i*4,$a,"",0)); # *a
          &mul($w);                       # *a * w
          &add("eax",$c);                 # L(t)+=c
          # (no "+= *r" step here, unlike bn_mul_add_words)
          &adc("edx",0);                  # H(t)+=carry
          &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
          &mov($c,"edx");                 # c= H(t);
          &dec($num) if ($i != 7-1);
          &jz(&label("mw_end")) if ($i != 7-1);
      }

      &set_label("mw_end",0);
      &mov("eax",$c);             # return the carry word

      &function_end($name);
  }
  # bn_sqr_words(name)
  #
  # Emits the x86 routine <name>, whose stack arguments are read as
  #   wparam(0) = r, wparam(1) = a, wparam(2) = num.
  # For each of the num 32-bit words of a it stores the 64-bit square
  # a[i]*a[i] into the word pair r[2*i] (low) and r[2*i+1] (high).
  # The emitted code does not set up a return value.
  #
  # When $sse2 is set, bit 26 of the first dword of OPENSSL_ia32cap_P
  # (presumably the CPUID SSE2 flag) is tested at run time to choose between
  # an MMX/SSE2 loop and the integer mul implementation.
  #
  # Both paths store nothing when num is 0.
  sub bn_sqr_words
  {
      local($name)=@_;

      &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

      # Register roles for the SSE2 path (nothing is pushed there).
      $r="eax";       # result pointer
      $a="edx";       # source pointer
      $c="ecx";       # remaining word count (num)

      if ($sse2) {
          &picmeup("eax","OPENSSL_ia32cap_P");
          &bt(&DWP(0,"eax"),26);
          &jnc(&label("sqr_non_sse2"));

          &mov($r,&wparam(0));
          &mov($a,&wparam(1));
          &mov($c,&wparam(2));

          # The loop below tests its counter only after the first
          # iteration, so num == 0 must be filtered out here; otherwise it
          # would store to r[0..1] and wrap the counter.
          &test($c,$c);
          &jz(&label("sqr_sse2_exit"));

      &set_label("sqr_sse2_loop",16);
          &movd("mm0",&DWP(0,$a));        # mm0 = a[i]
          &pmuludq("mm0","mm0");          # a[i] *= a[i]
          &lea($a,&DWP(4,$a));            # a++
          &movq(&QWP(0,$r),"mm0");        # r[i] = a[i]*a[i]
          &sub($c,1);
          &lea($r,&DWP(8,$r));            # r += 2 (lea keeps the flags of sub)
          &jnz(&label("sqr_sse2_loop"));

      &set_label("sqr_sse2_exit");
          &emms();
          &ret();

      &set_label("sqr_non_sse2",16);
      }

      # function_begin prologue (done by hand because the SSE2 path above
      # must run before any register is pushed).
      &push("ebp");
      &push("ebx");
      &push("esi");
      &push("edi");

      &comment("");
      # Register roles for the integer path.
      $r="esi";
      $a="edi";
      $num="ebx";

      &mov($r,&wparam(0));
      &mov($a,&wparam(1));
      &mov($num,&wparam(2));
      &and($num,0xfffffff8);      # num & ~7: words for the unrolled loop
      &jz(&label("sw_finish"));

      &set_label("sw_loop",0);
      for ($i=0; $i<32; $i+=4)
      {
          &comment("Round $i");
          &mov("eax",&DWP($i,$a,"",0));       # *a
          &mul("eax");                        # edx:eax = *a * *a
          &mov(&DWP($i*2,$r,"",0),"eax");     # low word
          &mov(&DWP($i*2+4,$r,"",0),"edx");   # high word
      }

      &comment("");
      &add($a,32);                # 8 source words consumed
      &add($r,64);                # 16 result words produced
      &sub($num,8);
      &jnz(&label("sw_loop"));

      &set_label("sw_finish",0);
      &mov($num,&wparam(2));      # get num
      &and($num,7);               # remaining 0..7 words
      &jz(&label("sw_end"));

      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov("eax",&DWP($i*4,$a,"",0));     # *a
          &mul("eax");                        # edx:eax = *a * *a
          &mov(&DWP($i*8,$r,"",0),"eax");     # low word
          &dec($num) if ($i != 7-1);          # ZF survives the mov below
          &mov(&DWP($i*8+4,$r,"",0),"edx");   # high word
          &jz(&label("sw_end")) if ($i != 7-1);
      }
      &set_label("sw_end",0);

      &function_end($name);
  }
  # bn_div_words(name)
  #
  # Emits the x86 routine <name>: it loads wparam(0) into edx, wparam(1)
  # into eax and wparam(2) into ecx, then executes "div ecx", i.e. an
  # unsigned division of the 64-bit value edx:eax by ecx.  The quotient is
  # left in eax when the routine returns.
  #
  # NOTE(review): no range check is emitted; x86 div raises a divide error
  # when the divisor is 0 or the quotient does not fit in 32 bits, so
  # callers presumably guarantee wparam(0) < wparam(2) -- confirm.
  sub bn_div_words
  {
      local($name)=@_;

      &function_begin_B($name,"");

      # Stack argument k goes to the k-th register of this list.
      my @operand_regs = ("edx", "eax", "ecx");
      foreach my $slot (0 .. $#operand_regs) {
          &mov($operand_regs[$slot], &wparam($slot));
      }

      &div("ecx");
      &ret();

      &function_end_B($name);
  }
  # bn_add_words(name)
  #
  # Emits the x86 routine <name>, whose stack arguments are read as
  #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num.
  # It stores r[i] = a[i] + b[i] + carry for num 32-bit words and leaves
  # the final carry (0 or 1) in eax.  Words are handled eight at a time,
  # followed by an unrolled tail of up to seven words.
  sub bn_add_words
  {
      local($name)=@_;

      &function_begin($name,"");

      &comment("");
      # Register assignment.
      ($r, $a, $b, $num)   = ("ebx", "esi", "edi", "ebp");
      ($c, $tmp1, $tmp2)   = ("eax", "ecx", "edx");

      # Emits the add-with-carry core for the word at byte offset $disp:
      #   tmp1 = a[] + c + b[],  c = carry out of that sum.
      # The carry is rebuilt with "mov c,0; adc c,c" because mov leaves CF
      # from the preceding add untouched.
      my $emit_sum = sub {
          my ($disp) = @_;
          &mov($tmp1,&DWP($disp,$a,"",0));    # *a
          &mov($tmp2,&DWP($disp,$b,"",0));    # *b
          &add($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &add($tmp1,$tmp2);
          &adc($c,0);
      };

      &mov($r,&wparam(0));        # get r
      &mov($a,&wparam(1));        # get a
      &mov($b,&wparam(2));        # get b
      &mov($num,&wparam(3));      # get num
      &xor($c,$c);                # clear carry
      &and($num,0xfffffff8);      # num & ~7: words for the unrolled loop
      &jz(&label("aw_finish"));

      &set_label("aw_loop",0);
      foreach my $word (0 .. 7) {
          my $disp = 4*$word;
          &comment("Round $word");
          $emit_sum->($disp);
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
      }

      &comment("");
      foreach my $ptr ($a, $b, $r) {
          &add($ptr,32);
      }
      &sub($num,8);
      &jnz(&label("aw_loop"));

      &set_label("aw_finish",0);
      &mov($num,&wparam(3));      # get num
      &and($num,7);               # remaining 0..7 words
      &jz(&label("aw_end"));

      foreach my $word (0 .. 6) {
          my $disp      = 4*$word;
          my $not_final = ($word != 6);   # the 7th tail word needs no test

          &comment("Tail Round $word");
          $emit_sum->($disp);
          &dec($num) if ($not_final);         # ZF survives the store below
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          &jz(&label("aw_end")) if ($not_final);
      }
      &set_label("aw_end",0);

      # The carry is already in eax ($c), so no final move is needed.
      &function_end($name);
  }
  # bn_sub_words(name)
  #
  # Emits the x86 routine <name>, whose stack arguments are read as
  #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num.
  # It stores r[i] = a[i] - b[i] - borrow for num 32-bit words and leaves
  # the final borrow (0 or 1) in eax.  Words are handled eight at a time,
  # followed by an unrolled tail of up to seven words.
  #
  # The label names ("aw_...") are shared with bn_add_words; they are kept
  # as they are because they end up in the generated output.
  sub bn_sub_words
  {
      local($name)=@_;

      &function_begin($name,"");

      &comment("");
      # Register assignment.
      ($a, $b, $r)         = ("esi", "edi", "ebx");
      ($tmp1, $tmp2)       = ("ecx", "edx");
      ($c, $num)           = ("eax", "ebp");

      # Emits the subtract-with-borrow core for the word at byte offset
      # $disp:  tmp1 = a[] - c - b[],  c = borrow out of that difference.
      # "mov c,0" does not touch CF, so "adc c,c" captures the borrow of
      # the first sub and "adc c,0" adds the borrow of the second.
      my $emit_diff = sub {
          my ($disp) = @_;
          &mov($tmp1,&DWP($disp,$a,"",0));    # *a
          &mov($tmp2,&DWP($disp,$b,"",0));    # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
      };

      &mov($r,&wparam(0));        # get r
      &mov($a,&wparam(1));        # get a
      &mov($b,&wparam(2));        # get b
      &mov($num,&wparam(3));      # get num
      &xor($c,$c);                # clear borrow
      &and($num,0xfffffff8);      # num & ~7: words for the unrolled loop
      &jz(&label("aw_finish"));

      &set_label("aw_loop",0);
      my $round = 0;
      while ($round < 8) {
          my $disp = $round*4;
          &comment("Round $round");
          $emit_diff->($disp);
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          $round++;
      }

      &comment("");
      &add($a,32);
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("aw_loop"));

      &set_label("aw_finish",0);
      &mov($num,&wparam(3));      # get num
      &and($num,7);               # remaining 0..7 words
      &jz(&label("aw_end"));

      my $last_tail = 6;
      foreach my $tail (0 .. $last_tail) {
          my $disp = $tail*4;
          &comment("Tail Round $tail");
          $emit_diff->($disp);
          if ($tail != $last_tail) {
              &dec($num);                         # ZF survives the store below
              &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
              &jz(&label("aw_end"));
          } else {
              # Seventh tail word: nothing can follow, so no count test.
              &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          }
      }
      &set_label("aw_end",0);

      # The borrow is already in eax ($c), so no final move is needed.
      &function_end($name);
  }
  # bn_sub_part_words(name)
  #
  # Emits the x86 routine <name>, whose stack arguments are read as
  #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num,
  #   wparam(4) = dl.
  #
  # Phase 1 stores r[i] = a[i] - b[i] - borrow for num words, advancing
  # r, a and b past those words.  Phase 2 depends on the sign of dl:
  #   dl == 0 : nothing more to do;
  #   dl <  0 : -dl further words are stored as 0 - b[i] - borrow;
  #   dl >  0 :  dl further words are stored as a[i] - borrow; as soon as
  #              one of them produces no borrow, the remaining words are
  #              plain copies of a[] and the result borrow is 0.
  # The final borrow (0 or 1) is left in eax.
  #
  # Labels and comment strings are emitted into the generated output and
  # are therefore kept exactly as they were (including the "aw_" prefix).
  sub bn_sub_part_words
  {
      local($name)=@_;

      &function_begin($name,"");

      &comment("");
      # Register assignment.
      ($a, $b, $r)         = ("esi", "edi", "ebx");
      ($tmp1, $tmp2)       = ("ecx", "edx");
      ($c, $num)           = ("eax", "ebp");

      # Emits one subtract-with-borrow step:
      #   tmp1 = <minuend> - c - b[$b_disp],  c = borrow out.
      # <minuend> is whatever operand the caller wants loaded into tmp1
      # (a memory reference into a[], or the constant 0).  "mov c,0" does
      # not touch CF, so the adc pair collects the borrows of both subs.
      my $emit_borrow_step = sub {
          my ($minuend, $b_disp) = @_;
          &mov($tmp1,$minuend);
          &mov($tmp2,&DWP($b_disp,$b,"",0));  # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
      };

      my $last_tail = 6;          # index of the 7th (final) tail round

      # ---- Phase 1: r = a - b over num words ---------------------------
      &mov($r,&wparam(0));        # get r
      &mov($a,&wparam(1));        # get a
      &mov($b,&wparam(2));        # get b
      &mov($num,&wparam(3));      # get num
      &xor($c,$c);                # clear borrow
      &and($num,0xfffffff8);      # num & ~7: words for the unrolled loop
      &jz(&label("aw_finish"));

      &set_label("aw_loop",0);
      foreach my $k (0 .. 7) {
          my $disp = $k*4;
          &comment("Round $k");
          $emit_borrow_step->(&DWP($disp,$a,"",0), $disp);
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
      }

      &comment("");
      &add($a,32);
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("aw_loop"));

      &set_label("aw_finish",0);
      &mov($num,&wparam(3));      # get num
      &and($num,7);               # remaining 0..7 words
      &jz(&label("aw_end"));

      # Unlike the other tails in this sub, this one bumps the pointers
      # after every word, so a, b and r point just past the processed
      # words when phase 2 starts.
      foreach my $k (0 .. $last_tail) {
          &comment("Tail Round $k");
          $emit_borrow_step->(&DWP(0,$a,"",0), 0);
          &mov(&DWP(0,$r,"",0),$tmp1);        # *r
          &add($a, 4);
          &add($b, 4);
          &add($r, 4);
          if ($k != $last_tail) {
              &dec($num);
              &jz(&label("aw_end"));
          }
      }

      &set_label("aw_end",0);

      # ---- Phase 2 dispatch on dl --------------------------------------
      &cmp(&wparam(4),0);
      &je(&label("pw_end"));

      &mov($num,&wparam(4));      # get dl
      &cmp($num,0);
      &je(&label("pw_end"));
      &jge(&label("pw_pos"));

      # ---- dl < 0: r = 0 - b - borrow over -dl words -------------------
      &comment("pw_neg");
      &mov($tmp2,0);
      &sub($tmp2,$num);
      &mov($num,$tmp2);           # num = -dl
      &and($num,0xfffffff8);      # (-dl) & ~7
      &jz(&label("pw_neg_finish"));

      &set_label("pw_neg_loop",0);
      foreach my $k (0 .. 7) {
          my $disp = $k*4;
          &comment("dl<0 Round $k");
          $emit_borrow_step->(0, $disp);
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
      }

      &comment("");
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("pw_neg_loop"));

      &set_label("pw_neg_finish",0);
      &mov($tmp2,&wparam(4));     # get dl
      &mov($num,0);
      &sub($num,$tmp2);           # num = -dl
      &and($num,7);               # remaining 0..7 words
      &jz(&label("pw_end"));

      foreach my $k (0 .. $last_tail) {
          my $disp      = $k*4;
          my $not_final = ($k != $last_tail);
          &comment("dl<0 Tail Round $k");
          $emit_borrow_step->(0, $disp);
          &dec($num) if ($not_final);         # ZF survives the store below
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          &jz(&label("pw_end")) if ($not_final);
      }

      &jmp(&label("pw_end"));

      # ---- dl > 0: r = a - borrow over dl words ------------------------
      &set_label("pw_pos",0);

      &and($num,0xfffffff8);      # dl & ~7
      &jz(&label("pw_pos_finish"));

      &set_label("pw_pos_loop",0);
      foreach my $k (0 .. 7) {
          my $disp = $k*4;
          &comment("dl>0 Round $k");
          &mov($tmp1,&DWP($disp,$a,"",0));    # *a
          &sub($tmp1,$c);
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          # No borrow out: continue in the copy loop right after word $k.
          &jnc(&label("pw_nc".$k));
      }

      &comment("");
      &add($a,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("pw_pos_loop"));

      &set_label("pw_pos_finish",0);
      &mov($num,&wparam(4));      # get dl
      &and($num,7);               # remaining 0..7 words
      &jz(&label("pw_end"));

      foreach my $k (0 .. $last_tail) {
          my $disp = $k*4;
          &comment("dl>0 Tail Round $k");
          &mov($tmp1,&DWP($disp,$a,"",0));    # *a
          &sub($tmp1,$c);
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          # No borrow out: continue in the copy tail right after word $k.
          &jnc(&label("pw_tail_nc".$k));
          if ($k != $last_tail) {
              &dec($num);
              &jz(&label("pw_end"));
          }
      }
      &mov($c,1);                 # the borrow ran through every tail word
      &jmp(&label("pw_end"));

      # ---- dl > 0, borrow resolved: plain copy of the remaining a[] ----
      # The pw_nc<k> labels sit after the copy of word k, because that
      # word has already been stored by the subtracting loop above.
      &set_label("pw_nc_loop",0);
      foreach my $k (0 .. 7) {
          my $disp = $k*4;
          &mov($tmp1,&DWP($disp,$a,"",0));    # *a
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          &set_label("pw_nc".$k,0);
      }

      &comment("");
      &add($a,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("pw_nc_loop"));

      &mov($num,&wparam(4));      # get dl
      &and($num,7);               # remaining 0..7 words
      &jz(&label("pw_nc_end"));

      foreach my $k (0 .. $last_tail) {
          my $disp = $k*4;
          &mov($tmp1,&DWP($disp,$a,"",0));    # *a
          &mov(&DWP($disp,$r,"",0),$tmp1);    # *r
          &set_label("pw_tail_nc".$k,0);
          if ($k != $last_tail) {
              &dec($num);
              &jz(&label("pw_nc_end"));
          }
      }

      &set_label("pw_nc_end",0);
      &mov($c,0);                 # copy path always ends without borrow

      &set_label("pw_end",0);

      # The borrow is already in eax ($c), so no final move is needed.
      &function_end($name);
  }