/* x86_64-gcc.c */
  1. /*
  2. * x86_64 BIGNUM accelerator version 0.1, December 2002.
  3. *
  4. * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  5. * project.
  6. *
  7. * Rights for redistribution and usage in source and binary forms are
  8. * granted according to the OpenSSL license. Warranty of any kind is
  9. * disclaimed.
  10. *
  11. * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
  12. * versions, like 1.0...
  13. * A. Well, that's because this code is basically a quick-n-dirty
  14. * proof-of-concept hack. As you can see it's implemented with
  15. * inline assembler, which means that you're bound to GCC and that
  16. * there might be enough room for further improvement.
  17. *
  18. * Q. Why inline assembler?
  19. * A. x86_64 features own ABI which I'm not familiar with. This is
  20. * why I decided to let the compiler take care of subroutine
  21. * prologue/epilogue as well as register allocation. For reference.
  22. * Win64 implements different ABI for AMD64, different from Linux.
  23. *
  24. * Q. How much faster does it get?
  25. * A. 'apps/openssl speed rsa dsa' output with no-asm:
  26. *
  27. * sign verify sign/s verify/s
  28. * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
  29. * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
  30. * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
  31. * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
  32. * sign verify sign/s verify/s
  33. * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
  34. * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
  35. * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
  36. *
  37. * 'apps/openssl speed rsa dsa' output with this module:
  38. *
  39. * sign verify sign/s verify/s
  40. * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
  41. * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
  42. * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
  43. * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
  44. * sign verify sign/s verify/s
  45. * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
  46. * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
  47. * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
  48. *
  49. * For the reference. IA-32 assembler implementation performs
  50. * very much like 64-bit code compiled with no-asm on the same
  51. * machine.
  52. */
#define BN_ULONG unsigned long
/*
 * Inline-assembler constraint notes:
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep the
 * value of zero.
 */
/*
 * mul_add(r,a,word,carry):
 *   {high:low} = a * word;  r += low + carry;  carry = high + carry-outs.
 * One limb of r[] += a[] * word with carry propagation; 'carry' is
 * updated in place for the next limb.
 */
#define mul_add(r,a,word,carry) do { \
    register BN_ULONG high,low; \
    asm ("mulq %3" \
        : "=a"(low),"=d"(high) \
        : "a"(word),"m"(a) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(carry),"+d"(high)\
        : "a"(low),"g"(0) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+m"(r),"+d"(high) \
        : "r"(carry),"g"(0) \
        : "cc"); \
    carry=high; \
    } while (0)
/*
 * mul(r,a,word,carry):
 *   {high:low} = a * word;  r = low + carry;  carry = high + carry-out.
 * One limb of r[] = a[] * word (no accumulation into r).
 */
#define mul(r,a,word,carry) do { \
    register BN_ULONG high,low; \
    asm ("mulq %3" \
        : "=a"(low),"=d"(high) \
        : "a"(word),"g"(a) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(carry),"+d"(high)\
        : "a"(low),"g"(0) \
        : "cc"); \
    (r)=carry, carry=high; \
    } while (0)
/*
 * sqr(r0,r1,a): {r1:r0} = a * a.
 * NOTE: the trailing semicolon is part of the macro, so invocations
 * written as sqr(...); expand to an extra empty statement (harmless).
 */
#define sqr(r0,r1,a) \
    asm ("mulq %2" \
        : "=a"(r0),"=d"(r1) \
        : "a"(a) \
        : "cc");
  92. BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  93. {
  94. BN_ULONG c1=0;
  95. if (num <= 0) return(c1);
  96. while (num&~3)
  97. {
  98. mul_add(rp[0],ap[0],w,c1);
  99. mul_add(rp[1],ap[1],w,c1);
  100. mul_add(rp[2],ap[2],w,c1);
  101. mul_add(rp[3],ap[3],w,c1);
  102. ap+=4; rp+=4; num-=4;
  103. }
  104. if (num)
  105. {
  106. mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
  107. mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
  108. mul_add(rp[2],ap[2],w,c1); return c1;
  109. }
  110. return(c1);
  111. }
  112. BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  113. {
  114. BN_ULONG c1=0;
  115. if (num <= 0) return(c1);
  116. while (num&~3)
  117. {
  118. mul(rp[0],ap[0],w,c1);
  119. mul(rp[1],ap[1],w,c1);
  120. mul(rp[2],ap[2],w,c1);
  121. mul(rp[3],ap[3],w,c1);
  122. ap+=4; rp+=4; num-=4;
  123. }
  124. if (num)
  125. {
  126. mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
  127. mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
  128. mul(rp[2],ap[2],w,c1);
  129. }
  130. return(c1);
  131. }
  132. void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
  133. {
  134. if (n <= 0) return;
  135. while (n&~3)
  136. {
  137. sqr(r[0],r[1],a[0]);
  138. sqr(r[2],r[3],a[1]);
  139. sqr(r[4],r[5],a[2]);
  140. sqr(r[6],r[7],a[3]);
  141. a+=4; r+=8; n-=4;
  142. }
  143. if (n)
  144. {
  145. sqr(r[0],r[1],a[0]); if (--n == 0) return;
  146. sqr(r[2],r[3],a[1]); if (--n == 0) return;
  147. sqr(r[4],r[5],a[2]);
  148. }
  149. }
/*
 * Returns the quotient of the 128-bit value {h:l} divided by d.
 * The remainder (rdx after divq) is discarded via 'waste'.
 * NOTE(review): divq raises #DE when d == 0 or when the quotient does
 * not fit in 64 bits -- callers presumably guarantee h < d; confirm at
 * the call sites.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{ BN_ULONG ret,waste;
  asm ("divq %4"
        : "=a"(ret),"=d"(waste)   /* quotient in rax, remainder ignored */
        : "a"(l),"d"(h),"g"(d)
        : "cc");
  return ret;
}
/*
 * rp[i] = ap[i] + bp[i] for i in [0, n); returns the final carry (0/1).
 * 'i' is only a scratch output that reserves a register for the loop
 * index (%2).  The initial subq zeroes that index AND clears CF, so the
 * first adcq adds without a stale carry.  leaq advances the index
 * without touching the flags, and 'loop' decrements rcx (= n) without
 * touching CF, so the carry chain survives across iterations.  The
 * final sbbq turns CF into 0/-1, masked to one bit on return.
 */
BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
  if (n <= 0) return 0;
  asm (
  " subq %2,%2 \n"             /* i = 0, CF = 0 */
  ".align 16 \n"
  "1: movq (%4,%2,8),%0 \n"    /* ret = ap[i] */
  " adcq (%5,%2,8),%0 \n"      /* ret += bp[i] + CF */
  " movq %0,(%3,%2,8) \n"      /* rp[i] = ret */
  " leaq 1(%2),%2 \n"          /* i++, flags preserved */
  " loop 1b \n"                /* --n, branch while n != 0 */
  " sbbq %0,%0 \n"             /* ret = CF ? -1 : 0 */
  : "=&a"(ret),"+c"(n),"=&r"(i)
  : "r"(rp),"r"(ap),"r"(bp)
  : "cc"
  );
  return ret&1;
}
  176. #ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] for i in [0, n); returns the final borrow (0/1).
 * Structurally identical to bn_add_words above, with sbbq in place of
 * adcq: subq clears both the index and CF, leaq/loop preserve the
 * borrow chain between iterations, and the trailing sbbq materialises
 * the final borrow as 0/-1 before it is masked to one bit.
 */
BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
  if (n <= 0) return 0;
  asm (
  " subq %2,%2 \n"             /* i = 0, CF = 0 */
  ".align 16 \n"
  "1: movq (%4,%2,8),%0 \n"    /* ret = ap[i] */
  " sbbq (%5,%2,8),%0 \n"      /* ret -= bp[i] + CF */
  " movq %0,(%3,%2,8) \n"      /* rp[i] = ret */
  " leaq 1(%2),%2 \n"          /* i++, flags preserved */
  " loop 1b \n"                /* --n, branch while n != 0 */
  " sbbq %0,%0 \n"             /* ret = CF ? -1 : 0 */
  : "=&a"(ret),"+c"(n),"=&r"(i)
  : "r"(rp),"r"(ap),"r"(bp)
  : "cc"
  );
  return ret&1;
}
  195. #else
  196. /* Simics 1.4<7 has buggy sbbq:-( */
  197. #define BN_MASK2 0xffffffffffffffffL
  198. BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  199. {
  200. BN_ULONG t1,t2;
  201. int c=0;
  202. if (n <= 0) return((BN_ULONG)0);
  203. for (;;)
  204. {
  205. t1=a[0]; t2=b[0];
  206. r[0]=(t1-t2-c)&BN_MASK2;
  207. if (t1 != t2) c=(t1 < t2);
  208. if (--n <= 0) break;
  209. t1=a[1]; t2=b[1];
  210. r[1]=(t1-t2-c)&BN_MASK2;
  211. if (t1 != t2) c=(t1 < t2);
  212. if (--n <= 0) break;
  213. t1=a[2]; t2=b[2];
  214. r[2]=(t1-t2-c)&BN_MASK2;
  215. if (t1 != t2) c=(t1 < t2);
  216. if (--n <= 0) break;
  217. t1=a[3]; t2=b[3];
  218. r[3]=(t1-t2-c)&BN_MASK2;
  219. if (t1 != t2) c=(t1 < t2);
  220. if (--n <= 0) break;
  221. a+=4;
  222. b+=4;
  223. r+=4;
  224. }
  225. return(c);
  226. }
  227. #endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
/*
 * All of these expect BN_ULONG scratch variables t1 (low product word)
 * and t2 (high product word) to be declared in the enclosing scope.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) { \
    BN_ULONG ta=(a),tb=(b); \
    t1 = ta * tb; \
    t2 = BN_UMULT_HIGH(ta,tb); \
    c0 += t1; t2 += (c0<t1)?1:0; \
    c1 += t2; c2 += (c1<t2)?1:0; \
    }
#define mul_add_c2(a,b,c0,c1,c2) { \
    BN_ULONG ta=(a),tb=(b),t0; \
    t1 = BN_UMULT_HIGH(ta,tb); \
    t0 = ta * tb; \
    t2 = t1+t1; c2 += (t2<t1)?1:0; \
    t1 = t0+t0; t2 += (t1<t0)?1:0; \
    c0 += t1; t2 += (c0<t1)?1:0; \
    c1 += t2; c2 += (c1<t2)?1:0; \
    }
#else
/*
 * mul_add_c: {t2:t1} = a*b, then c0 += t1 (carry into t2), then
 * c1 += t2 (carry into c2).
 */
#define mul_add_c(a,b,c0,c1,c2) do { \
    asm ("mulq %3" \
        : "=a"(t1),"=d"(t2) \
        : "a"(a),"m"(b) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(c0),"+d"(t2) \
        : "a"(t1),"g"(0) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(c1),"+r"(c2) \
        : "d"(t2),"g"(0) \
        : "cc"); \
    } while (0)
/*
 * sqr_add_c: same accumulation as mul_add_c, with {t2:t1} = a[i]^2.
 */
#define sqr_add_c(a,i,c0,c1,c2) do { \
    asm ("mulq %2" \
        : "=a"(t1),"=d"(t2) \
        : "a"(a[i]) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(c0),"+d"(t2) \
        : "a"(t1),"g"(0) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(c1),"+r"(c2) \
        : "d"(t2),"g"(0) \
        : "cc"); \
    } while (0)
/*
 * mul_add_c2: {t2:t1} = a*b doubled before accumulation.  The second
 * asm doubles the high word (carry-out straight into c2), the third
 * doubles the low word (carry-out into t2), and the last two fold
 * {t2:t1} into (c2,c1,c0) exactly as in mul_add_c.
 */
#define mul_add_c2(a,b,c0,c1,c2) do { \
    asm ("mulq %3" \
        : "=a"(t1),"=d"(t2) \
        : "a"(a),"m"(b) \
        : "cc"); \
    asm ("addq %0,%0; adcq %2,%1" \
        : "+d"(t2),"+r"(c2) \
        : "g"(0) \
        : "cc"); \
    asm ("addq %0,%0; adcq %2,%1" \
        : "+a"(t1),"+d"(t2) \
        : "g"(0) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(c0),"+d"(t2) \
        : "a"(t1),"g"(0) \
        : "cc"); \
    asm ("addq %2,%0; adcq %3,%1" \
        : "+r"(c1),"+r"(c2) \
        : "d"(t2),"g"(0) \
        : "cc"); \
    } while (0)
#endif
/* sqr_add_c2 delegates to mul_add_c2 with the two array elements. */
#define sqr_add_c2(a,i,j,c0,c1,c2) \
    mul_add_c2((a)[i],(a)[j],c0,c1,c2)
/*
 * r[0..15] = a[0..7] * b[0..7], computed column by column (Comba
 * multiplication).  Column k accumulates every product a[i]*b[k-i]
 * into a three-word accumulator; the variables c1,c2,c3 rotate through
 * the roles of (low, middle, high) word of that accumulator.  After a
 * column's low word is stored into r[k] it is reset to 0 and becomes
 * the high word of the next column.  The order of the macro calls and
 * of the c-arguments is therefore significant and must not be changed.
 * t1,t2 are scratch words required by the mul_add_c macro.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1,t2;
    BN_ULONG c1,c2,c3;
    c1=0;
    c2=0;
    c3=0;
    mul_add_c(a[0],b[0],c1,c2,c3);
    r[0]=c1;
    c1=0;
    mul_add_c(a[0],b[1],c2,c3,c1);
    mul_add_c(a[1],b[0],c2,c3,c1);
    r[1]=c2;
    c2=0;
    mul_add_c(a[2],b[0],c3,c1,c2);
    mul_add_c(a[1],b[1],c3,c1,c2);
    mul_add_c(a[0],b[2],c3,c1,c2);
    r[2]=c3;
    c3=0;
    mul_add_c(a[0],b[3],c1,c2,c3);
    mul_add_c(a[1],b[2],c1,c2,c3);
    mul_add_c(a[2],b[1],c1,c2,c3);
    mul_add_c(a[3],b[0],c1,c2,c3);
    r[3]=c1;
    c1=0;
    mul_add_c(a[4],b[0],c2,c3,c1);
    mul_add_c(a[3],b[1],c2,c3,c1);
    mul_add_c(a[2],b[2],c2,c3,c1);
    mul_add_c(a[1],b[3],c2,c3,c1);
    mul_add_c(a[0],b[4],c2,c3,c1);
    r[4]=c2;
    c2=0;
    mul_add_c(a[0],b[5],c3,c1,c2);
    mul_add_c(a[1],b[4],c3,c1,c2);
    mul_add_c(a[2],b[3],c3,c1,c2);
    mul_add_c(a[3],b[2],c3,c1,c2);
    mul_add_c(a[4],b[1],c3,c1,c2);
    mul_add_c(a[5],b[0],c3,c1,c2);
    r[5]=c3;
    c3=0;
    mul_add_c(a[6],b[0],c1,c2,c3);
    mul_add_c(a[5],b[1],c1,c2,c3);
    mul_add_c(a[4],b[2],c1,c2,c3);
    mul_add_c(a[3],b[3],c1,c2,c3);
    mul_add_c(a[2],b[4],c1,c2,c3);
    mul_add_c(a[1],b[5],c1,c2,c3);
    mul_add_c(a[0],b[6],c1,c2,c3);
    r[6]=c1;
    c1=0;
    mul_add_c(a[0],b[7],c2,c3,c1);
    mul_add_c(a[1],b[6],c2,c3,c1);
    mul_add_c(a[2],b[5],c2,c3,c1);
    mul_add_c(a[3],b[4],c2,c3,c1);
    mul_add_c(a[4],b[3],c2,c3,c1);
    mul_add_c(a[5],b[2],c2,c3,c1);
    mul_add_c(a[6],b[1],c2,c3,c1);
    mul_add_c(a[7],b[0],c2,c3,c1);
    r[7]=c2;
    c2=0;
    /* from column 8 on, fewer products exist per column */
    mul_add_c(a[7],b[1],c3,c1,c2);
    mul_add_c(a[6],b[2],c3,c1,c2);
    mul_add_c(a[5],b[3],c3,c1,c2);
    mul_add_c(a[4],b[4],c3,c1,c2);
    mul_add_c(a[3],b[5],c3,c1,c2);
    mul_add_c(a[2],b[6],c3,c1,c2);
    mul_add_c(a[1],b[7],c3,c1,c2);
    r[8]=c3;
    c3=0;
    mul_add_c(a[2],b[7],c1,c2,c3);
    mul_add_c(a[3],b[6],c1,c2,c3);
    mul_add_c(a[4],b[5],c1,c2,c3);
    mul_add_c(a[5],b[4],c1,c2,c3);
    mul_add_c(a[6],b[3],c1,c2,c3);
    mul_add_c(a[7],b[2],c1,c2,c3);
    r[9]=c1;
    c1=0;
    mul_add_c(a[7],b[3],c2,c3,c1);
    mul_add_c(a[6],b[4],c2,c3,c1);
    mul_add_c(a[5],b[5],c2,c3,c1);
    mul_add_c(a[4],b[6],c2,c3,c1);
    mul_add_c(a[3],b[7],c2,c3,c1);
    r[10]=c2;
    c2=0;
    mul_add_c(a[4],b[7],c3,c1,c2);
    mul_add_c(a[5],b[6],c3,c1,c2);
    mul_add_c(a[6],b[5],c3,c1,c2);
    mul_add_c(a[7],b[4],c3,c1,c2);
    r[11]=c3;
    c3=0;
    mul_add_c(a[7],b[5],c1,c2,c3);
    mul_add_c(a[6],b[6],c1,c2,c3);
    mul_add_c(a[5],b[7],c1,c2,c3);
    r[12]=c1;
    c1=0;
    mul_add_c(a[6],b[7],c2,c3,c1);
    mul_add_c(a[7],b[6],c2,c3,c1);
    r[13]=c2;
    c2=0;
    mul_add_c(a[7],b[7],c3,c1,c2);
    r[14]=c3;
    /* c1 holds the final carry word of the last column */
    r[15]=c1;
}
/*
 * r[0..7] = a[0..3] * b[0..3]; 4-limb Comba multiplication.  See
 * bn_mul_comba8 for the accumulator-rotation scheme: c1,c2,c3 cycle
 * through the (low, middle, high) roles of the column accumulator, and
 * the call order is significant.  t1,t2 are scratch for mul_add_c.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1,t2;
    BN_ULONG c1,c2,c3;
    c1=0;
    c2=0;
    c3=0;
    mul_add_c(a[0],b[0],c1,c2,c3);
    r[0]=c1;
    c1=0;
    mul_add_c(a[0],b[1],c2,c3,c1);
    mul_add_c(a[1],b[0],c2,c3,c1);
    r[1]=c2;
    c2=0;
    mul_add_c(a[2],b[0],c3,c1,c2);
    mul_add_c(a[1],b[1],c3,c1,c2);
    mul_add_c(a[0],b[2],c3,c1,c2);
    r[2]=c3;
    c3=0;
    mul_add_c(a[0],b[3],c1,c2,c3);
    mul_add_c(a[1],b[2],c1,c2,c3);
    mul_add_c(a[2],b[1],c1,c2,c3);
    mul_add_c(a[3],b[0],c1,c2,c3);
    r[3]=c1;
    c1=0;
    mul_add_c(a[3],b[1],c2,c3,c1);
    mul_add_c(a[2],b[2],c2,c3,c1);
    mul_add_c(a[1],b[3],c2,c3,c1);
    r[4]=c2;
    c2=0;
    mul_add_c(a[2],b[3],c3,c1,c2);
    mul_add_c(a[3],b[2],c3,c1,c2);
    r[5]=c3;
    c3=0;
    mul_add_c(a[3],b[3],c1,c2,c3);
    r[6]=c1;
    /* c2 holds the final carry word of the last column */
    r[7]=c2;
}
/*
 * r[0..15] = a[0..7]^2; 8-limb Comba squaring.  Off-diagonal products
 * a[i]*a[j] (i != j) appear twice in a square, so they are accumulated
 * via sqr_add_c2 (which doubles the product); the diagonal terms
 * a[i]^2 use sqr_add_c.  c1,c2,c3 rotate through the (low, middle,
 * high) roles of the column accumulator exactly as in bn_mul_comba8;
 * call order is significant.  t1,t2 are scratch for the macros.
 */
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
{
    BN_ULONG t1,t2;
    BN_ULONG c1,c2,c3;
    c1=0;
    c2=0;
    c3=0;
    sqr_add_c(a,0,c1,c2,c3);
    r[0]=c1;
    c1=0;
    sqr_add_c2(a,1,0,c2,c3,c1);
    r[1]=c2;
    c2=0;
    sqr_add_c(a,1,c3,c1,c2);
    sqr_add_c2(a,2,0,c3,c1,c2);
    r[2]=c3;
    c3=0;
    sqr_add_c2(a,3,0,c1,c2,c3);
    sqr_add_c2(a,2,1,c1,c2,c3);
    r[3]=c1;
    c1=0;
    sqr_add_c(a,2,c2,c3,c1);
    sqr_add_c2(a,3,1,c2,c3,c1);
    sqr_add_c2(a,4,0,c2,c3,c1);
    r[4]=c2;
    c2=0;
    sqr_add_c2(a,5,0,c3,c1,c2);
    sqr_add_c2(a,4,1,c3,c1,c2);
    sqr_add_c2(a,3,2,c3,c1,c2);
    r[5]=c3;
    c3=0;
    sqr_add_c(a,3,c1,c2,c3);
    sqr_add_c2(a,4,2,c1,c2,c3);
    sqr_add_c2(a,5,1,c1,c2,c3);
    sqr_add_c2(a,6,0,c1,c2,c3);
    r[6]=c1;
    c1=0;
    sqr_add_c2(a,7,0,c2,c3,c1);
    sqr_add_c2(a,6,1,c2,c3,c1);
    sqr_add_c2(a,5,2,c2,c3,c1);
    sqr_add_c2(a,4,3,c2,c3,c1);
    r[7]=c2;
    c2=0;
    sqr_add_c(a,4,c3,c1,c2);
    sqr_add_c2(a,5,3,c3,c1,c2);
    sqr_add_c2(a,6,2,c3,c1,c2);
    sqr_add_c2(a,7,1,c3,c1,c2);
    r[8]=c3;
    c3=0;
    sqr_add_c2(a,7,2,c1,c2,c3);
    sqr_add_c2(a,6,3,c1,c2,c3);
    sqr_add_c2(a,5,4,c1,c2,c3);
    r[9]=c1;
    c1=0;
    sqr_add_c(a,5,c2,c3,c1);
    sqr_add_c2(a,6,4,c2,c3,c1);
    sqr_add_c2(a,7,3,c2,c3,c1);
    r[10]=c2;
    c2=0;
    sqr_add_c2(a,7,4,c3,c1,c2);
    sqr_add_c2(a,6,5,c3,c1,c2);
    r[11]=c3;
    c3=0;
    sqr_add_c(a,6,c1,c2,c3);
    sqr_add_c2(a,7,5,c1,c2,c3);
    r[12]=c1;
    c1=0;
    sqr_add_c2(a,7,6,c2,c3,c1);
    r[13]=c2;
    c2=0;
    sqr_add_c(a,7,c3,c1,c2);
    r[14]=c3;
    /* c1 holds the final carry word of the last column */
    r[15]=c1;
}
/*
 * r[0..7] = a[0..3]^2; 4-limb Comba squaring.  Doubled off-diagonal
 * products via sqr_add_c2, diagonal squares via sqr_add_c; c1,c2,c3
 * rotate through the column-accumulator roles as in bn_sqr_comba8.
 * Call order is significant.  t1,t2 are scratch for the macros.
 */
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
{
    BN_ULONG t1,t2;
    BN_ULONG c1,c2,c3;
    c1=0;
    c2=0;
    c3=0;
    sqr_add_c(a,0,c1,c2,c3);
    r[0]=c1;
    c1=0;
    sqr_add_c2(a,1,0,c2,c3,c1);
    r[1]=c2;
    c2=0;
    sqr_add_c(a,1,c3,c1,c2);
    sqr_add_c2(a,2,0,c3,c1,c2);
    r[2]=c3;
    c3=0;
    sqr_add_c2(a,3,0,c1,c2,c3);
    sqr_add_c2(a,2,1,c1,c2,c3);
    r[3]=c1;
    c1=0;
    sqr_add_c(a,2,c2,c3,c1);
    sqr_add_c2(a,3,1,c2,c3,c1);
    r[4]=c2;
    c2=0;
    sqr_add_c2(a,3,2,c3,c1,c2);
    r[5]=c3;
    c3=0;
    sqr_add_c(a,3,c1,c2,c3);
    r[6]=c1;
    /* c2 holds the final carry word of the last column */
    r[7]=c2;
}