x86_64-gcc.c

#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
#else
/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy; he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                         sign    verify    sign/s verify/s
 *    rsa  512 bits    0.0006s   0.0001s    1683.8  18456.2
 *    rsa 1024 bits    0.0028s   0.0002s     356.0   6407.0
 *    rsa 2048 bits    0.0172s   0.0005s      58.0   1957.8
 *    rsa 4096 bits    0.1155s   0.0018s       8.7    555.6
 *                         sign    verify    sign/s verify/s
 *    dsa  512 bits    0.0005s   0.0006s    2100.8   1768.3
 *    dsa 1024 bits    0.0014s   0.0018s     692.3    559.2
 *    dsa 2048 bits    0.0049s   0.0061s     204.7    165.0
 *
 * 'apps/openssl speed rsa dsa' output with this module:
 *
 *                         sign    verify    sign/s verify/s
 *    rsa  512 bits    0.0004s   0.0000s    2767.1  33297.9
 *    rsa 1024 bits    0.0012s   0.0001s     867.4  14674.7
 *    rsa 2048 bits    0.0061s   0.0002s     164.0   5270.0
 *    rsa 4096 bits    0.0384s   0.0006s      26.1   1650.8
 *                         sign    verify    sign/s verify/s
 *    dsa  512 bits    0.0002s   0.0003s    4442.2   3786.3
 *    dsa 1024 bits    0.0005s   0.0007s    1835.1   1497.4
 *    dsa 2048 bits    0.0016s   0.0020s     620.4    504.6
 *
 * For reference, the IA-32 assembler implementation performs
 * very much like 64-bit code compiled with no-asm on the same
 * machine.
 */

#ifdef _WIN64
# define BN_ULONG unsigned long long
#else
# define BN_ULONG unsigned long
#endif

#undef mul
#undef mul_add
/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero.
 */
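
/*
 * mul_add(r,a,word,carry): (carry,r) := a*word + r + carry.  The 128-bit
 * product a*word is added to r together with the incoming carry word; the
 * low word is stored back into r and the high word becomes the new carry.
 */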
#define mul_add(r,a,word,carry) do {    \
    register BN_ULONG high, low;        \
    asm ("mulq %3"                      \
        : "=a"(low),"=d"(high)          \
        : "a"(word),"m"(a)              \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(carry),"+d"(high)        \
        : "a"(low),"g"(0)               \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+m"(r),"+d"(high)            \
        : "r"(carry),"g"(0)             \
        : "cc");                        \
    carry = high;                       \
} while (0)
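
/*
 * mul(r,a,word,carry): (carry,r) := a*word + carry.  The low word of
 * a*word plus the incoming carry is stored in r; the high word becomes
 * the new carry.
 */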
#define mul(r,a,word,carry) do {        \
    register BN_ULONG high, low;        \
    asm ("mulq %3"                      \
        : "=a"(low),"=d"(high)          \
        : "a"(word),"g"(a)              \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(carry),"+d"(high)        \
        : "a"(low),"g"(0)               \
        : "cc");                        \
    (r) = carry, carry = high;          \
} while (0)
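
/* sqr(r0,r1,a): (r1,r0) := a*a; r0 receives the low word, r1 the high word. */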
#undef sqr
#define sqr(r0,r1,a)                    \
    asm ("mulq %2"                      \
        : "=a"(r0),"=d"(r1)             \
        : "a"(a)                        \
        : "cc");
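
/*
 * bn_mul_add_words: rp[i] += ap[i]*w for 0 <= i < num, with carry
 * propagation between words; returns the final carry.
 */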
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0) return c1;

    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4; rp += 4; num -= 4;
    }
    if (num) {
        mul_add(rp[0], ap[0], w, c1); if (--num == 0) return c1;
        mul_add(rp[1], ap[1], w, c1); if (--num == 0) return c1;
        mul_add(rp[2], ap[2], w, c1); return c1;
    }
    return c1;
}
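
/*
 * bn_mul_words: rp = ap*w; the low num words of the product are stored
 * in rp and the final carry word is returned.
 */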
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0) return c1;

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4; rp += 4; num -= 4;
    }
    if (num) {
        mul(rp[0], ap[0], w, c1); if (--num == 0) return c1;
        mul(rp[1], ap[1], w, c1); if (--num == 0) return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return c1;
}
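
/*
 * bn_sqr_words: r[2*i] and r[2*i+1] receive the low and high words of
 * a[i]^2 for 0 <= i < n.
 */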
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0) return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4; r += 8; n -= 4;
    }
    if (n) {
        sqr(r[0], r[1], a[0]); if (--n == 0) return;
        sqr(r[2], r[3], a[1]); if (--n == 0) return;
        sqr(r[4], r[5], a[2]);
    }
}

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;
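
    /*
     * divq divides the 128-bit value h:l by d.  The caller is expected
     * to ensure h < d; otherwise the quotient does not fit in 64 bits
     * and the CPU raises a divide-error exception (#DE).
     */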
    asm ("divq %4"
         : "=a"(ret), "=d"(waste)
         : "a"(l), "d"(h), "g"(d)
         : "cc");
    return ret;
}

BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0) return 0;
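
    /*
     * "subq %2,%2" clears both the index i and the carry flag; the adcq
     * in the loop then adds with the carry left over from the previous
     * iteration, which survives because neither leaq nor loop modifies
     * the flags.  The final "sbbq %0,%0" turns the last carry into 0 or
     * ~0, masked to 0/1 on return.  bn_sub_words below follows the same
     * pattern, using sbbq to propagate the borrow.
     */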
    asm (
        "       subq    %2,%2           \n"
        ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n"
        "       adcq    (%5,%2,8),%0    \n"
        "       movq    %0,(%3,%2,8)    \n"
        "       leaq    1(%2),%2        \n"
        "       loop    1b              \n"
        "       sbbq    %0,%0           \n"
        : "=&a"(ret), "+c"(n), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc"
    );
    return ret & 1;
}

#ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n)
{
    BN_ULONG ret = 0, i = 0;

    if (n <= 0) return 0;

    asm (
        "       subq    %2,%2           \n"
        ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n"
        "       sbbq    (%5,%2,8),%0    \n"
        "       movq    %0,(%3,%2,8)    \n"
        "       leaq    1(%2),%2        \n"
        "       loop    1b              \n"
        "       sbbq    %0,%0           \n"
        : "=&a"(ret), "+c"(n), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc"
    );
    return ret & 1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0) return (BN_ULONG)0;

    for (;;) {
        t1 = a[0]; t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[1]; t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[2]; t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[3]; t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        a += 4;
        b += 4;
        r += 4;
    }
    return c;
}
#endif

/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b          for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b        for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2       for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j]  for three word number c=(c2,c1,c0) */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {       \
    BN_ULONG ta=(a),tb=(b);             \
    t1 = ta * tb;                       \
    t2 = BN_UMULT_HIGH(ta,tb);          \
    c0 += t1; t2 += (c0<t1)?1:0;        \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    }
#define mul_add_c2(a,b,c0,c1,c2) {      \
    BN_ULONG ta=(a),tb=(b),t0;          \
    t1 = BN_UMULT_HIGH(ta,tb);          \
    t0 = ta * tb;                       \
    t2 = t1+t1; c2 += (t2<t1)?1:0;      \
    t1 = t0+t0; t2 += (t1<t0)?1:0;      \
    c0 += t1; t2 += (c0<t1)?1:0;        \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    }
#else
#define mul_add_c(a,b,c0,c1,c2) do {    \
    asm ("mulq %3"                      \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a),"m"(b)                 \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
} while (0)

#define sqr_add_c(a,i,c0,c1,c2) do {    \
    asm ("mulq %2"                      \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a[i])                     \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
} while (0)

#define mul_add_c2(a,b,c0,c1,c2) do {   \
    asm ("mulq %3"                      \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a),"m"(b)                 \
        : "cc");                        \
    asm ("addq %0,%0; adcq %2,%1"       \
        : "+d"(t2),"+r"(c2)             \
        : "g"(0)                        \
        : "cc");                        \
    asm ("addq %0,%0; adcq %2,%1"       \
        : "+a"(t1),"+d"(t2)             \
        : "g"(0)                        \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
} while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2)      \
    mul_add_c2((a)[i],(a)[j],c0,c1,c2)
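
/*
 * The Comba routines below compute the product column by column: for
 * result word k, every partial product a[i]*b[j] with i+j == k is
 * accumulated into the three-word accumulator, the low word is stored
 * to r[k] and cleared, and the roles of the three accumulator
 * variables rotate for the next column.
 */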
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
#endif