/* x86_64-gcc.c */
  1. #ifdef __SUNPRO_C
  2. # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
  3. #else
  4. /*
  5. * x86_64 BIGNUM accelerator version 0.1, December 2002.
  6. *
  7. * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  8. * project.
  9. *
  10. * Rights for redistribution and usage in source and binary forms are
  11. * granted according to the OpenSSL license. Warranty of any kind is
  12. * disclaimed.
  13. *
  14. * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
  15. * versions, like 1.0...
  16. * A. Well, that's because this code is basically a quick-n-dirty
  17. * proof-of-concept hack. As you can see it's implemented with
  18. * inline assembler, which means that you're bound to GCC and that
  19. * there might be enough room for further improvement.
  20. *
  21. * Q. Why inline assembler?
  22. * A. x86_64 features own ABI which I'm not familiar with. This is
  23. * why I decided to let the compiler take care of subroutine
  24. * prologue/epilogue as well as register allocation. For reference.
  25. * Win64 implements different ABI for AMD64, different from Linux.
  26. *
  27. * Q. How much faster does it get?
  28. * A. 'apps/openssl speed rsa dsa' output with no-asm:
  29. *
  30. * sign verify sign/s verify/s
  31. * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
  32. * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
  33. * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
  34. * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
  35. * sign verify sign/s verify/s
  36. * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
  37. * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
  38. * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
  39. *
  40. * 'apps/openssl speed rsa dsa' output with this module:
  41. *
  42. * sign verify sign/s verify/s
  43. * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
  44. * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
  45. * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
  46. * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
  47. * sign verify sign/s verify/s
  48. * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
  49. * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
  50. * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
  51. *
  52. * For the reference. IA-32 assembler implementation performs
  53. * very much like 64-bit code compiled with no-asm on the same
  54. * machine.
  55. */
/* One bignum limb: a 64-bit machine word on x86_64. */
#define BN_ULONG unsigned long
/*
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
/*
 * mul_add(r,a,word,carry): multiply-accumulate one limb:
 *   (carry, r) = a*word + r + carry
 * First asm: mulq leaves the 128-bit product a*word in rdx:rax
 * (high half in `high`, low half in `low`).  Second asm: add the
 * incoming carry to the product; the carry-out propagates into
 * `high` via adcq of 0.  Third asm: accumulate the low half into r
 * (in memory), again propagating the carry-out into `high`, which
 * becomes the new carry.
 */
#define mul_add(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"m"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+m"(r),"+d"(high) \
: "r"(carry),"g"(0) \
: "cc"); \
carry=high; \
} while (0)
/*
 * mul(r,a,word,carry): like mul_add, but r is overwritten rather than
 * accumulated into:  (carry, r) = a*word + carry.
 */
#define mul(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"g"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
(r)=carry, carry=high; \
} while (0)
  90. #define sqr(r0,r1,a) \
  91. asm ("mulq %2" \
  92. : "=a"(r0),"=d"(r1) \
  93. : "a"(a) \
  94. : "cc");
  95. BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  96. {
  97. BN_ULONG c1=0;
  98. if (num <= 0) return(c1);
  99. while (num&~3)
  100. {
  101. mul_add(rp[0],ap[0],w,c1);
  102. mul_add(rp[1],ap[1],w,c1);
  103. mul_add(rp[2],ap[2],w,c1);
  104. mul_add(rp[3],ap[3],w,c1);
  105. ap+=4; rp+=4; num-=4;
  106. }
  107. if (num)
  108. {
  109. mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
  110. mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
  111. mul_add(rp[2],ap[2],w,c1); return c1;
  112. }
  113. return(c1);
  114. }
  115. BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  116. {
  117. BN_ULONG c1=0;
  118. if (num <= 0) return(c1);
  119. while (num&~3)
  120. {
  121. mul(rp[0],ap[0],w,c1);
  122. mul(rp[1],ap[1],w,c1);
  123. mul(rp[2],ap[2],w,c1);
  124. mul(rp[3],ap[3],w,c1);
  125. ap+=4; rp+=4; num-=4;
  126. }
  127. if (num)
  128. {
  129. mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
  130. mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
  131. mul(rp[2],ap[2],w,c1);
  132. }
  133. return(c1);
  134. }
  135. void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
  136. {
  137. if (n <= 0) return;
  138. while (n&~3)
  139. {
  140. sqr(r[0],r[1],a[0]);
  141. sqr(r[2],r[3],a[1]);
  142. sqr(r[4],r[5],a[2]);
  143. sqr(r[6],r[7],a[3]);
  144. a+=4; r+=8; n-=4;
  145. }
  146. if (n)
  147. {
  148. sqr(r[0],r[1],a[0]); if (--n == 0) return;
  149. sqr(r[2],r[3],a[1]); if (--n == 0) return;
  150. sqr(r[4],r[5],a[2]);
  151. }
  152. }
/*
 * bn_div_words: 128-by-64-bit division -- returns the 64-bit quotient
 * of (h:l) / d.  Maps directly onto divq: dividend in rdx:rax (h in
 * rdx, l in rax), quotient returned in rax (`ret`); the remainder in
 * rdx is discarded via `waste`.
 * NOTE(review): divq raises #DE when d == 0 or when the quotient does
 * not fit in 64 bits (i.e. h >= d); callers are presumably required
 * to guarantee h < d -- TODO confirm against callers.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{ BN_ULONG ret,waste;
asm ("divq %4"
: "=a"(ret),"=d"(waste)
: "a"(l),"d"(h),"g"(d)
: "cc");
return ret;
}
/*
 * bn_add_words: rp[i] = ap[i] + bp[i] for i in [0, n), with carry
 * propagated across limbs; returns the final carry (0 or 1).
 * The whole loop lives in one asm block because the carry flag must
 * survive from one iteration to the next:
 *   subq %2,%2  -- zero index i AND clear CF for the first adcq
 *   movq/adcq   -- rp[i] = ap[i] + bp[i] + CF
 *   leaq 1(%2)  -- i++ without touching the flags (unlike incq)
 *   loop 1b     -- decrement rcx (n) and branch; leaves CF intact
 *   sbbq %0,%0  -- ret = CF ? all-ones : 0
 * `ret&1` then reduces that mask to the 0/1 carry result.
 * n <= 0 is a no-op that returns 0 (also required: `loop` would
 * otherwise iterate 2^64 times on n == 0).
 */
BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
".align 16 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" leaq 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n"
: "=&a"(ret),"+c"(n),"=&r"(i)
: "r"(rp),"r"(ap),"r"(bp)
: "cc"
);
return ret&1;
}
#ifndef SIMICS
/*
 * bn_sub_words: rp[i] = ap[i] - bp[i] for i in [0, n), with borrow
 * propagated across limbs; returns the final borrow (0 or 1).
 * Structurally identical to bn_add_words above, with adcq replaced by
 * sbbq; the leading `subq %2,%2` zeroes the index and clears CF
 * (the borrow) before the first sbbq.
 */
BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
".align 16 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" leaq 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n"
: "=&a"(ret),"+c"(n),"=&r"(i)
: "r"(rp),"r"(ap),"r"(bp)
: "cc"
);
return ret&1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
/*
 * Portable C fallback with hand-carried borrow `c`, unrolled four
 * limbs per pass.  The borrow update `if (t1 != t2) c=(t1 < t2);`
 * relies on t1 == t2 leaving the incoming borrow unchanged
 * (t1-t2-c then borrows exactly when c was already 1).
 */
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
BN_ULONG t1,t2;
int c=0;
if (n <= 0) return((BN_ULONG)0);
for (;;)
{
t1=a[0]; t2=b[0];
r[0]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
t1=a[1]; t2=b[1];
r[1]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
t1=a[2]; t2=b[2];
r[2]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
t1=a[3]; t2=b[3];
r[3]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
a+=4;
b+=4;
r+=4;
}
return(c);
}
#endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) { \
BN_ULONG ta=(a),tb=(b); \
t1 = ta * tb; \
t2 = BN_UMULT_HIGH(ta,tb); \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#define mul_add_c2(a,b,c0,c1,c2) { \
BN_ULONG ta=(a),tb=(b),t0; \
t1 = BN_UMULT_HIGH(ta,tb); \
t0 = ta * tb; \
t2 = t1+t1; c2 += (t2<t1)?1:0; \
t1 = t0+t0; t2 += (t1<t0)?1:0; \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#else
/*
 * Inline-asm versions of the macros above.  They rely on t1/t2 being
 * BN_ULONG scratch locals declared by the enclosing function.
 * mul_add_c: mulq puts a*b in t2:t1 (rdx:rax); the second asm adds
 * t1 into c0 and folds the carry into t2; the third adds t2 into c1
 * and any carry-out into c2.
 */
#define mul_add_c(a,b,c0,c1,c2) do { \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c0),"+d"(t2) \
: "a"(t1),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c1),"+r"(c2) \
: "d"(t2),"g"(0) \
: "cc"); \
} while (0)
/* sqr_add_c: same as mul_add_c with a[i]*a[i] as the product. */
#define sqr_add_c(a,i,c0,c1,c2) do { \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c0),"+d"(t2) \
: "a"(t1),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c1),"+r"(c2) \
: "d"(t2),"g"(0) \
: "cc"); \
} while (0)
/*
 * mul_add_c2: adds 2*a*b.  The product t2:t1 is doubled in place
 * first -- the second asm doubles t2 (high) sending its carry-out
 * into c2, the third doubles t1 (low) sending its carry-out into t2
 * -- and the doubled value is then accumulated exactly as in
 * mul_add_c.
 */
#define mul_add_c2(a,b,c0,c1,c2) do { \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %0,%0; adcq %2,%1" \
: "+d"(t2),"+r"(c2) \
: "g"(0) \
: "cc"); \
asm ("addq %0,%0; adcq %2,%1" \
: "+a"(t1),"+d"(t2) \
: "g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c0),"+d"(t2) \
: "a"(t1),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c1),"+r"(c2) \
: "d"(t2),"g"(0) \
: "cc"); \
} while (0)
#endif
/* sqr_add_c2(a,i,j,...): c += 2*a[i]*a[j], expressed via mul_add_c2. */
#define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
/*
 * bn_mul_comba8: r[0..15] = a[0..7] * b[0..7], schoolbook product in
 * Comba (column) order.  Column k sums every partial product
 * a[i]*b[j] with i+j == k into a three-limb accumulator; the roles of
 * (low, mid, high) rotate through c1/c2/c3 from one column to the
 * next, the finished low limb is stored into r[k], and the vacated
 * variable is zeroed to become the next column's high limb.
 * t1,t2 are scratch limbs used by the mul_add_c macro.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
mul_add_c(a[0],b[0],c1,c2,c3);
r[0]=c1;
c1=0;
mul_add_c(a[0],b[1],c2,c3,c1);
mul_add_c(a[1],b[0],c2,c3,c1);
r[1]=c2;
c2=0;
mul_add_c(a[2],b[0],c3,c1,c2);
mul_add_c(a[1],b[1],c3,c1,c2);
mul_add_c(a[0],b[2],c3,c1,c2);
r[2]=c3;
c3=0;
mul_add_c(a[0],b[3],c1,c2,c3);
mul_add_c(a[1],b[2],c1,c2,c3);
mul_add_c(a[2],b[1],c1,c2,c3);
mul_add_c(a[3],b[0],c1,c2,c3);
r[3]=c1;
c1=0;
mul_add_c(a[4],b[0],c2,c3,c1);
mul_add_c(a[3],b[1],c2,c3,c1);
mul_add_c(a[2],b[2],c2,c3,c1);
mul_add_c(a[1],b[3],c2,c3,c1);
mul_add_c(a[0],b[4],c2,c3,c1);
r[4]=c2;
c2=0;
mul_add_c(a[0],b[5],c3,c1,c2);
mul_add_c(a[1],b[4],c3,c1,c2);
mul_add_c(a[2],b[3],c3,c1,c2);
mul_add_c(a[3],b[2],c3,c1,c2);
mul_add_c(a[4],b[1],c3,c1,c2);
mul_add_c(a[5],b[0],c3,c1,c2);
r[5]=c3;
c3=0;
mul_add_c(a[6],b[0],c1,c2,c3);
mul_add_c(a[5],b[1],c1,c2,c3);
mul_add_c(a[4],b[2],c1,c2,c3);
mul_add_c(a[3],b[3],c1,c2,c3);
mul_add_c(a[2],b[4],c1,c2,c3);
mul_add_c(a[1],b[5],c1,c2,c3);
mul_add_c(a[0],b[6],c1,c2,c3);
r[6]=c1;
c1=0;
mul_add_c(a[0],b[7],c2,c3,c1);
mul_add_c(a[1],b[6],c2,c3,c1);
mul_add_c(a[2],b[5],c2,c3,c1);
mul_add_c(a[3],b[4],c2,c3,c1);
mul_add_c(a[4],b[3],c2,c3,c1);
mul_add_c(a[5],b[2],c2,c3,c1);
mul_add_c(a[6],b[1],c2,c3,c1);
mul_add_c(a[7],b[0],c2,c3,c1);
r[7]=c2;
c2=0;
mul_add_c(a[7],b[1],c3,c1,c2);
mul_add_c(a[6],b[2],c3,c1,c2);
mul_add_c(a[5],b[3],c3,c1,c2);
mul_add_c(a[4],b[4],c3,c1,c2);
mul_add_c(a[3],b[5],c3,c1,c2);
mul_add_c(a[2],b[6],c3,c1,c2);
mul_add_c(a[1],b[7],c3,c1,c2);
r[8]=c3;
c3=0;
mul_add_c(a[2],b[7],c1,c2,c3);
mul_add_c(a[3],b[6],c1,c2,c3);
mul_add_c(a[4],b[5],c1,c2,c3);
mul_add_c(a[5],b[4],c1,c2,c3);
mul_add_c(a[6],b[3],c1,c2,c3);
mul_add_c(a[7],b[2],c1,c2,c3);
r[9]=c1;
c1=0;
mul_add_c(a[7],b[3],c2,c3,c1);
mul_add_c(a[6],b[4],c2,c3,c1);
mul_add_c(a[5],b[5],c2,c3,c1);
mul_add_c(a[4],b[6],c2,c3,c1);
mul_add_c(a[3],b[7],c2,c3,c1);
r[10]=c2;
c2=0;
mul_add_c(a[4],b[7],c3,c1,c2);
mul_add_c(a[5],b[6],c3,c1,c2);
mul_add_c(a[6],b[5],c3,c1,c2);
mul_add_c(a[7],b[4],c3,c1,c2);
r[11]=c3;
c3=0;
mul_add_c(a[7],b[5],c1,c2,c3);
mul_add_c(a[6],b[6],c1,c2,c3);
mul_add_c(a[5],b[7],c1,c2,c3);
r[12]=c1;
c1=0;
mul_add_c(a[6],b[7],c2,c3,c1);
mul_add_c(a[7],b[6],c2,c3,c1);
r[13]=c2;
c2=0;
mul_add_c(a[7],b[7],c3,c1,c2);
r[14]=c3;
/* The top column's mid limb is the final carry into r[15]. */
r[15]=c1;
}
/*
 * bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], schoolbook product in
 * Comba (column) order.  Column k accumulates all a[i]*b[j] with
 * i+j == k into a rotating three-limb accumulator (c1/c2/c3) and
 * stores the finished low limb into r[k]; see bn_mul_comba8.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
mul_add_c(a[0],b[0],c1,c2,c3);
r[0]=c1;
c1=0;
mul_add_c(a[0],b[1],c2,c3,c1);
mul_add_c(a[1],b[0],c2,c3,c1);
r[1]=c2;
c2=0;
mul_add_c(a[2],b[0],c3,c1,c2);
mul_add_c(a[1],b[1],c3,c1,c2);
mul_add_c(a[0],b[2],c3,c1,c2);
r[2]=c3;
c3=0;
mul_add_c(a[0],b[3],c1,c2,c3);
mul_add_c(a[1],b[2],c1,c2,c3);
mul_add_c(a[2],b[1],c1,c2,c3);
mul_add_c(a[3],b[0],c1,c2,c3);
r[3]=c1;
c1=0;
mul_add_c(a[3],b[1],c2,c3,c1);
mul_add_c(a[2],b[2],c2,c3,c1);
mul_add_c(a[1],b[3],c2,c3,c1);
r[4]=c2;
c2=0;
mul_add_c(a[2],b[3],c3,c1,c2);
mul_add_c(a[3],b[2],c3,c1,c2);
r[5]=c3;
c3=0;
mul_add_c(a[3],b[3],c1,c2,c3);
r[6]=c1;
/* Carry out of the top column. */
r[7]=c2;
}
/*
 * bn_sqr_comba8: r[0..15] = a[0..7]^2 in Comba (column) order.
 * Column k needs every a[i]*a[j] with i+j == k; off-diagonal pairs
 * appear twice in a square, so each unordered pair {i,j}, i != j, is
 * added once as 2*a[i]*a[j] via sqr_add_c2, while diagonal terms
 * a[i]^2 (even k only) are added once via sqr_add_c.  The three-limb
 * accumulator rotates through c1/c2/c3 exactly as in bn_mul_comba8.
 */
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
sqr_add_c(a,0,c1,c2,c3);
r[0]=c1;
c1=0;
sqr_add_c2(a,1,0,c2,c3,c1);
r[1]=c2;
c2=0;
sqr_add_c(a,1,c3,c1,c2);
sqr_add_c2(a,2,0,c3,c1,c2);
r[2]=c3;
c3=0;
sqr_add_c2(a,3,0,c1,c2,c3);
sqr_add_c2(a,2,1,c1,c2,c3);
r[3]=c1;
c1=0;
sqr_add_c(a,2,c2,c3,c1);
sqr_add_c2(a,3,1,c2,c3,c1);
sqr_add_c2(a,4,0,c2,c3,c1);
r[4]=c2;
c2=0;
sqr_add_c2(a,5,0,c3,c1,c2);
sqr_add_c2(a,4,1,c3,c1,c2);
sqr_add_c2(a,3,2,c3,c1,c2);
r[5]=c3;
c3=0;
sqr_add_c(a,3,c1,c2,c3);
sqr_add_c2(a,4,2,c1,c2,c3);
sqr_add_c2(a,5,1,c1,c2,c3);
sqr_add_c2(a,6,0,c1,c2,c3);
r[6]=c1;
c1=0;
sqr_add_c2(a,7,0,c2,c3,c1);
sqr_add_c2(a,6,1,c2,c3,c1);
sqr_add_c2(a,5,2,c2,c3,c1);
sqr_add_c2(a,4,3,c2,c3,c1);
r[7]=c2;
c2=0;
sqr_add_c(a,4,c3,c1,c2);
sqr_add_c2(a,5,3,c3,c1,c2);
sqr_add_c2(a,6,2,c3,c1,c2);
sqr_add_c2(a,7,1,c3,c1,c2);
r[8]=c3;
c3=0;
sqr_add_c2(a,7,2,c1,c2,c3);
sqr_add_c2(a,6,3,c1,c2,c3);
sqr_add_c2(a,5,4,c1,c2,c3);
r[9]=c1;
c1=0;
sqr_add_c(a,5,c2,c3,c1);
sqr_add_c2(a,6,4,c2,c3,c1);
sqr_add_c2(a,7,3,c2,c3,c1);
r[10]=c2;
c2=0;
sqr_add_c2(a,7,4,c3,c1,c2);
sqr_add_c2(a,6,5,c3,c1,c2);
r[11]=c3;
c3=0;
sqr_add_c(a,6,c1,c2,c3);
sqr_add_c2(a,7,5,c1,c2,c3);
r[12]=c1;
c1=0;
sqr_add_c2(a,7,6,c2,c3,c1);
r[13]=c2;
c2=0;
sqr_add_c(a,7,c3,c1,c2);
r[14]=c3;
/* Carry out of the top column. */
r[15]=c1;
}
/*
 * bn_sqr_comba4: r[0..7] = a[0..3]^2 in Comba (column) order.
 * Off-diagonal pairs are added once as 2*a[i]*a[j] (sqr_add_c2),
 * diagonal terms a[i]^2 once via sqr_add_c; the three-limb
 * accumulator rotates through c1/c2/c3 as in bn_sqr_comba8.
 */
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
sqr_add_c(a,0,c1,c2,c3);
r[0]=c1;
c1=0;
sqr_add_c2(a,1,0,c2,c3,c1);
r[1]=c2;
c2=0;
sqr_add_c(a,1,c3,c1,c2);
sqr_add_c2(a,2,0,c3,c1,c2);
r[2]=c3;
c3=0;
sqr_add_c2(a,3,0,c1,c2,c3);
sqr_add_c2(a,2,1,c1,c2,c3);
r[3]=c1;
c1=0;
sqr_add_c(a,2,c2,c3,c1);
sqr_add_c2(a,3,1,c2,c3,c1);
r[4]=c2;
c2=0;
sqr_add_c2(a,3,2,c3,c1,c2);
r[5]=c3;
c3=0;
sqr_add_c(a,3,c1,c2,c3);
r[6]=c1;
/* Carry out of the top column. */
r[7]=c2;
}
  553. #endif