bn_asm.c — OpenSSL big-number word-arithmetic routines (portable C reference implementation).
  1. /* crypto/bn/bn_asm.c */
  2. /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
  3. * All rights reserved.
  4. *
  5. * This package is an SSL implementation written
  6. * by Eric Young (eay@cryptsoft.com).
  7. * The implementation was written so as to conform with Netscapes SSL.
  8. *
  9. * This library is free for commercial and non-commercial use as long as
  10. * the following conditions are aheared to. The following conditions
  11. * apply to all code found in this distribution, be it the RC4, RSA,
  12. * lhash, DES, etc., code; not just the SSL code. The SSL documentation
  13. * included with this distribution is covered by the same copyright terms
  14. * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  15. *
  16. * Copyright remains Eric Young's, and as such any Copyright notices in
  17. * the code are not to be removed.
  18. * If this package is used in a product, Eric Young should be given attribution
  19. * as the author of the parts of the library used.
  20. * This can be in the form of a textual message at program startup or
  21. * in documentation (online or textual) provided with the package.
  22. *
  23. * Redistribution and use in source and binary forms, with or without
  24. * modification, are permitted provided that the following conditions
  25. * are met:
  26. * 1. Redistributions of source code must retain the copyright
  27. * notice, this list of conditions and the following disclaimer.
  28. * 2. Redistributions in binary form must reproduce the above copyright
  29. * notice, this list of conditions and the following disclaimer in the
  30. * documentation and/or other materials provided with the distribution.
  31. * 3. All advertising materials mentioning features or use of this software
  32. * must display the following acknowledgement:
  33. * "This product includes cryptographic software written by
  34. * Eric Young (eay@cryptsoft.com)"
  35. * The word 'cryptographic' can be left out if the rouines from the library
  36. * being used are not cryptographic related :-).
  37. * 4. If you include any Windows specific code (or a derivative thereof) from
  38. * the apps directory (application code) you must include an acknowledgement:
  39. * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  40. *
  41. * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  42. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  43. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  44. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  45. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  46. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  47. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  48. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  49. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  50. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51. * SUCH DAMAGE.
  52. *
  53. * The licence and distribution terms for any publically available version or
  54. * derivative of this code cannot be changed. i.e. this code cannot simply be
  55. * copied and put under another distribution licence
  56. * [including the GNU Public Licence.]
  57. */
  58. #ifndef BN_DEBUG
  59. # undef NDEBUG /* avoid conflicting definitions */
  60. # define NDEBUG
  61. #endif
  62. #include <stdio.h>
  63. #include <assert.h>
  64. #include "cryptlib.h"
  65. #include "bn_lcl.h"
  66. #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  67. BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  68. BN_ULONG w)
  69. {
  70. BN_ULONG c1 = 0;
  71. assert(num >= 0);
  72. if (num <= 0)
  73. return (c1);
  74. # ifndef OPENSSL_SMALL_FOOTPRINT
  75. while (num & ~3) {
  76. mul_add(rp[0], ap[0], w, c1);
  77. mul_add(rp[1], ap[1], w, c1);
  78. mul_add(rp[2], ap[2], w, c1);
  79. mul_add(rp[3], ap[3], w, c1);
  80. ap += 4;
  81. rp += 4;
  82. num -= 4;
  83. }
  84. # endif
  85. while (num) {
  86. mul_add(rp[0], ap[0], w, c1);
  87. ap++;
  88. rp++;
  89. num--;
  90. }
  91. return (c1);
  92. }
  93. BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  94. {
  95. BN_ULONG c1 = 0;
  96. assert(num >= 0);
  97. if (num <= 0)
  98. return (c1);
  99. # ifndef OPENSSL_SMALL_FOOTPRINT
  100. while (num & ~3) {
  101. mul(rp[0], ap[0], w, c1);
  102. mul(rp[1], ap[1], w, c1);
  103. mul(rp[2], ap[2], w, c1);
  104. mul(rp[3], ap[3], w, c1);
  105. ap += 4;
  106. rp += 4;
  107. num -= 4;
  108. }
  109. # endif
  110. while (num) {
  111. mul(rp[0], ap[0], w, c1);
  112. ap++;
  113. rp++;
  114. num--;
  115. }
  116. return (c1);
  117. }
  118. void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  119. {
  120. assert(n >= 0);
  121. if (n <= 0)
  122. return;
  123. # ifndef OPENSSL_SMALL_FOOTPRINT
  124. while (n & ~3) {
  125. sqr(r[0], r[1], a[0]);
  126. sqr(r[2], r[3], a[1]);
  127. sqr(r[4], r[5], a[2]);
  128. sqr(r[6], r[7], a[3]);
  129. a += 4;
  130. r += 8;
  131. n -= 4;
  132. }
  133. # endif
  134. while (n) {
  135. sqr(r[0], r[1], a[0]);
  136. a++;
  137. r += 2;
  138. n--;
  139. }
  140. }
  141. #else /* !(defined(BN_LLONG) ||
  142. * defined(BN_UMULT_HIGH)) */
  143. BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  144. BN_ULONG w)
  145. {
  146. BN_ULONG c = 0;
  147. BN_ULONG bl, bh;
  148. assert(num >= 0);
  149. if (num <= 0)
  150. return ((BN_ULONG)0);
  151. bl = LBITS(w);
  152. bh = HBITS(w);
  153. # ifndef OPENSSL_SMALL_FOOTPRINT
  154. while (num & ~3) {
  155. mul_add(rp[0], ap[0], bl, bh, c);
  156. mul_add(rp[1], ap[1], bl, bh, c);
  157. mul_add(rp[2], ap[2], bl, bh, c);
  158. mul_add(rp[3], ap[3], bl, bh, c);
  159. ap += 4;
  160. rp += 4;
  161. num -= 4;
  162. }
  163. # endif
  164. while (num) {
  165. mul_add(rp[0], ap[0], bl, bh, c);
  166. ap++;
  167. rp++;
  168. num--;
  169. }
  170. return (c);
  171. }
  172. BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  173. {
  174. BN_ULONG carry = 0;
  175. BN_ULONG bl, bh;
  176. assert(num >= 0);
  177. if (num <= 0)
  178. return ((BN_ULONG)0);
  179. bl = LBITS(w);
  180. bh = HBITS(w);
  181. # ifndef OPENSSL_SMALL_FOOTPRINT
  182. while (num & ~3) {
  183. mul(rp[0], ap[0], bl, bh, carry);
  184. mul(rp[1], ap[1], bl, bh, carry);
  185. mul(rp[2], ap[2], bl, bh, carry);
  186. mul(rp[3], ap[3], bl, bh, carry);
  187. ap += 4;
  188. rp += 4;
  189. num -= 4;
  190. }
  191. # endif
  192. while (num) {
  193. mul(rp[0], ap[0], bl, bh, carry);
  194. ap++;
  195. rp++;
  196. num--;
  197. }
  198. return (carry);
  199. }
  200. void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  201. {
  202. assert(n >= 0);
  203. if (n <= 0)
  204. return;
  205. # ifndef OPENSSL_SMALL_FOOTPRINT
  206. while (n & ~3) {
  207. sqr64(r[0], r[1], a[0]);
  208. sqr64(r[2], r[3], a[1]);
  209. sqr64(r[4], r[5], a[2]);
  210. sqr64(r[6], r[7], a[3]);
  211. a += 4;
  212. r += 8;
  213. n -= 4;
  214. }
  215. # endif
  216. while (n) {
  217. sqr64(r[0], r[1], a[0]);
  218. a++;
  219. r += 2;
  220. n--;
  221. }
  222. }
  223. #endif /* !(defined(BN_LLONG) ||
  224. * defined(BN_UMULT_HIGH)) */
  225. #if defined(BN_LLONG) && defined(BN_DIV2W)
  226. BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
  227. {
  228. return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
  229. }
  230. #else
  231. /* Divide h,l by d and return the result. */
  232. /* I need to test this some more :-( */
  233. BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
  234. {
  235. BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  236. int i, count = 2;
  237. if (d == 0)
  238. return (BN_MASK2);
  239. i = BN_num_bits_word(d);
  240. assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
  241. i = BN_BITS2 - i;
  242. if (h >= d)
  243. h -= d;
  244. if (i) {
  245. d <<= i;
  246. h = (h << i) | (l >> (BN_BITS2 - i));
  247. l <<= i;
  248. }
  249. dh = (d & BN_MASK2h) >> BN_BITS4;
  250. dl = (d & BN_MASK2l);
  251. for (;;) {
  252. if ((h >> BN_BITS4) == dh)
  253. q = BN_MASK2l;
  254. else
  255. q = h / dh;
  256. th = q * dh;
  257. tl = dl * q;
  258. for (;;) {
  259. t = h - th;
  260. if ((t & BN_MASK2h) ||
  261. ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
  262. break;
  263. q--;
  264. th -= dh;
  265. tl -= dl;
  266. }
  267. t = (tl >> BN_BITS4);
  268. tl = (tl << BN_BITS4) & BN_MASK2h;
  269. th += t;
  270. if (l < tl)
  271. th++;
  272. l -= tl;
  273. if (h < th) {
  274. h += d;
  275. q--;
  276. }
  277. h -= th;
  278. if (--count == 0)
  279. break;
  280. ret = q << BN_BITS4;
  281. h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
  282. l = (l & BN_MASK2l) << BN_BITS4;
  283. }
  284. ret |= q;
  285. return (ret);
  286. }
  287. #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
  288. #ifdef BN_LLONG
  289. BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  290. int n)
  291. {
  292. BN_ULLONG ll = 0;
  293. assert(n >= 0);
  294. if (n <= 0)
  295. return ((BN_ULONG)0);
  296. # ifndef OPENSSL_SMALL_FOOTPRINT
  297. while (n & ~3) {
  298. ll += (BN_ULLONG) a[0] + b[0];
  299. r[0] = (BN_ULONG)ll & BN_MASK2;
  300. ll >>= BN_BITS2;
  301. ll += (BN_ULLONG) a[1] + b[1];
  302. r[1] = (BN_ULONG)ll & BN_MASK2;
  303. ll >>= BN_BITS2;
  304. ll += (BN_ULLONG) a[2] + b[2];
  305. r[2] = (BN_ULONG)ll & BN_MASK2;
  306. ll >>= BN_BITS2;
  307. ll += (BN_ULLONG) a[3] + b[3];
  308. r[3] = (BN_ULONG)ll & BN_MASK2;
  309. ll >>= BN_BITS2;
  310. a += 4;
  311. b += 4;
  312. r += 4;
  313. n -= 4;
  314. }
  315. # endif
  316. while (n) {
  317. ll += (BN_ULLONG) a[0] + b[0];
  318. r[0] = (BN_ULONG)ll & BN_MASK2;
  319. ll >>= BN_BITS2;
  320. a++;
  321. b++;
  322. r++;
  323. n--;
  324. }
  325. return ((BN_ULONG)ll);
  326. }
  327. #else /* !BN_LLONG */
  328. BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  329. int n)
  330. {
  331. BN_ULONG c, l, t;
  332. assert(n >= 0);
  333. if (n <= 0)
  334. return ((BN_ULONG)0);
  335. c = 0;
  336. # ifndef OPENSSL_SMALL_FOOTPRINT
  337. while (n & ~3) {
  338. t = a[0];
  339. t = (t + c) & BN_MASK2;
  340. c = (t < c);
  341. l = (t + b[0]) & BN_MASK2;
  342. c += (l < t);
  343. r[0] = l;
  344. t = a[1];
  345. t = (t + c) & BN_MASK2;
  346. c = (t < c);
  347. l = (t + b[1]) & BN_MASK2;
  348. c += (l < t);
  349. r[1] = l;
  350. t = a[2];
  351. t = (t + c) & BN_MASK2;
  352. c = (t < c);
  353. l = (t + b[2]) & BN_MASK2;
  354. c += (l < t);
  355. r[2] = l;
  356. t = a[3];
  357. t = (t + c) & BN_MASK2;
  358. c = (t < c);
  359. l = (t + b[3]) & BN_MASK2;
  360. c += (l < t);
  361. r[3] = l;
  362. a += 4;
  363. b += 4;
  364. r += 4;
  365. n -= 4;
  366. }
  367. # endif
  368. while (n) {
  369. t = a[0];
  370. t = (t + c) & BN_MASK2;
  371. c = (t < c);
  372. l = (t + b[0]) & BN_MASK2;
  373. c += (l < t);
  374. r[0] = l;
  375. a++;
  376. b++;
  377. r++;
  378. n--;
  379. }
  380. return ((BN_ULONG)c);
  381. }
  382. #endif /* !BN_LLONG */
  383. BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  384. int n)
  385. {
  386. BN_ULONG t1, t2;
  387. int c = 0;
  388. assert(n >= 0);
  389. if (n <= 0)
  390. return ((BN_ULONG)0);
  391. #ifndef OPENSSL_SMALL_FOOTPRINT
  392. while (n & ~3) {
  393. t1 = a[0];
  394. t2 = b[0];
  395. r[0] = (t1 - t2 - c) & BN_MASK2;
  396. if (t1 != t2)
  397. c = (t1 < t2);
  398. t1 = a[1];
  399. t2 = b[1];
  400. r[1] = (t1 - t2 - c) & BN_MASK2;
  401. if (t1 != t2)
  402. c = (t1 < t2);
  403. t1 = a[2];
  404. t2 = b[2];
  405. r[2] = (t1 - t2 - c) & BN_MASK2;
  406. if (t1 != t2)
  407. c = (t1 < t2);
  408. t1 = a[3];
  409. t2 = b[3];
  410. r[3] = (t1 - t2 - c) & BN_MASK2;
  411. if (t1 != t2)
  412. c = (t1 < t2);
  413. a += 4;
  414. b += 4;
  415. r += 4;
  416. n -= 4;
  417. }
  418. #endif
  419. while (n) {
  420. t1 = a[0];
  421. t2 = b[0];
  422. r[0] = (t1 - t2 - c) & BN_MASK2;
  423. if (t1 != t2)
  424. c = (t1 < t2);
  425. a++;
  426. b++;
  427. r++;
  428. n--;
  429. }
  430. return (c);
  431. }
  432. #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
  433. # undef bn_mul_comba8
  434. # undef bn_mul_comba4
  435. # undef bn_sqr_comba8
  436. # undef bn_sqr_comba4
  437. /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
  438. /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
  439. /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
  440. /*
  441. * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
  442. * c=(c2,c1,c0)
  443. */
  444. # ifdef BN_LLONG
  445. /*
  446. * Keep in mind that additions to multiplication result can not
  447. * overflow, because its high half cannot be all-ones.
  448. */
  449. # define mul_add_c(a,b,c0,c1,c2) do { \
  450. BN_ULONG hi; \
  451. BN_ULLONG t = (BN_ULLONG)(a)*(b); \
  452. t += c0; /* no carry */ \
  453. c0 = (BN_ULONG)Lw(t); \
  454. hi = (BN_ULONG)Hw(t); \
  455. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  456. } while(0)
  457. # define mul_add_c2(a,b,c0,c1,c2) do { \
  458. BN_ULONG hi; \
  459. BN_ULLONG t = (BN_ULLONG)(a)*(b); \
  460. BN_ULLONG tt = t+c0; /* no carry */ \
  461. c0 = (BN_ULONG)Lw(tt); \
  462. hi = (BN_ULONG)Hw(tt); \
  463. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  464. t += c0; /* no carry */ \
  465. c0 = (BN_ULONG)Lw(t); \
  466. hi = (BN_ULONG)Hw(t); \
  467. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  468. } while(0)
  469. # define sqr_add_c(a,i,c0,c1,c2) do { \
  470. BN_ULONG hi; \
  471. BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
  472. t += c0; /* no carry */ \
  473. c0 = (BN_ULONG)Lw(t); \
  474. hi = (BN_ULONG)Hw(t); \
  475. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  476. } while(0)
  477. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  478. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  479. # elif defined(BN_UMULT_LOHI)
  480. /*
  481. * Keep in mind that additions to hi can not overflow, because
  482. * the high word of a multiplication result cannot be all-ones.
  483. */
  484. # define mul_add_c(a,b,c0,c1,c2) do { \
  485. BN_ULONG ta = (a), tb = (b); \
  486. BN_ULONG lo, hi; \
  487. BN_UMULT_LOHI(lo,hi,ta,tb); \
  488. c0 += lo; hi += (c0<lo)?1:0; \
  489. c1 += hi; c2 += (c1<hi)?1:0; \
  490. } while(0)
  491. # define mul_add_c2(a,b,c0,c1,c2) do { \
  492. BN_ULONG ta = (a), tb = (b); \
  493. BN_ULONG lo, hi, tt; \
  494. BN_UMULT_LOHI(lo,hi,ta,tb); \
  495. c0 += lo; tt = hi+((c0<lo)?1:0); \
  496. c1 += tt; c2 += (c1<tt)?1:0; \
  497. c0 += lo; hi += (c0<lo)?1:0; \
  498. c1 += hi; c2 += (c1<hi)?1:0; \
  499. } while(0)
  500. # define sqr_add_c(a,i,c0,c1,c2) do { \
  501. BN_ULONG ta = (a)[i]; \
  502. BN_ULONG lo, hi; \
  503. BN_UMULT_LOHI(lo,hi,ta,ta); \
  504. c0 += lo; hi += (c0<lo)?1:0; \
  505. c1 += hi; c2 += (c1<hi)?1:0; \
  506. } while(0)
  507. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  508. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  509. # elif defined(BN_UMULT_HIGH)
  510. /*
  511. * Keep in mind that additions to hi can not overflow, because
  512. * the high word of a multiplication result cannot be all-ones.
  513. */
  514. # define mul_add_c(a,b,c0,c1,c2) do { \
  515. BN_ULONG ta = (a), tb = (b); \
  516. BN_ULONG lo = ta * tb; \
  517. BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
  518. c0 += lo; hi += (c0<lo)?1:0; \
  519. c1 += hi; c2 += (c1<hi)?1:0; \
  520. } while(0)
  521. # define mul_add_c2(a,b,c0,c1,c2) do { \
  522. BN_ULONG ta = (a), tb = (b), tt; \
  523. BN_ULONG lo = ta * tb; \
  524. BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
  525. c0 += lo; tt = hi + ((c0<lo)?1:0); \
  526. c1 += tt; c2 += (c1<tt)?1:0; \
  527. c0 += lo; hi += (c0<lo)?1:0; \
  528. c1 += hi; c2 += (c1<hi)?1:0; \
  529. } while(0)
  530. # define sqr_add_c(a,i,c0,c1,c2) do { \
  531. BN_ULONG ta = (a)[i]; \
  532. BN_ULONG lo = ta * ta; \
  533. BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
  534. c0 += lo; hi += (c0<lo)?1:0; \
  535. c1 += hi; c2 += (c1<hi)?1:0; \
  536. } while(0)
  537. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  538. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  539. # else /* !BN_LLONG */
  540. /*
  541. * Keep in mind that additions to hi can not overflow, because
  542. * the high word of a multiplication result cannot be all-ones.
  543. */
  544. # define mul_add_c(a,b,c0,c1,c2) do { \
  545. BN_ULONG lo = LBITS(a), hi = HBITS(a); \
  546. BN_ULONG bl = LBITS(b), bh = HBITS(b); \
  547. mul64(lo,hi,bl,bh); \
  548. c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
  549. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  550. } while(0)
  551. # define mul_add_c2(a,b,c0,c1,c2) do { \
  552. BN_ULONG tt; \
  553. BN_ULONG lo = LBITS(a), hi = HBITS(a); \
  554. BN_ULONG bl = LBITS(b), bh = HBITS(b); \
  555. mul64(lo,hi,bl,bh); \
  556. tt = hi; \
  557. c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
  558. c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
  559. c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
  560. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  561. } while(0)
  562. # define sqr_add_c(a,i,c0,c1,c2) do { \
  563. BN_ULONG lo, hi; \
  564. sqr64(lo,hi,(a)[i]); \
  565. c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
  566. c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
  567. } while(0)
  568. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  569. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  570. # endif /* !BN_LLONG */
  /*
   * Comba 8x8 multiplication: r[0..15] = a[0..7] * b[0..7].
   * Each r[k] is the column sum of all a[i]*b[j] with i+j == k; the three
   * words (c1,c2,c3) rotate roles as the running column accumulator and
   * its two carry words, via mul_add_c().
   */
  571. void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  572. {
  573. BN_ULONG c1, c2, c3;
  574. c1 = 0;
  575. c2 = 0;
  576. c3 = 0;
  /* column 0 */
  577. mul_add_c(a[0], b[0], c1, c2, c3);
  578. r[0] = c1;
  579. c1 = 0;
  /* column 1 */
  580. mul_add_c(a[0], b[1], c2, c3, c1);
  581. mul_add_c(a[1], b[0], c2, c3, c1);
  582. r[1] = c2;
  583. c2 = 0;
  /* column 2 */
  584. mul_add_c(a[2], b[0], c3, c1, c2);
  585. mul_add_c(a[1], b[1], c3, c1, c2);
  586. mul_add_c(a[0], b[2], c3, c1, c2);
  587. r[2] = c3;
  588. c3 = 0;
  /* column 3 */
  589. mul_add_c(a[0], b[3], c1, c2, c3);
  590. mul_add_c(a[1], b[2], c1, c2, c3);
  591. mul_add_c(a[2], b[1], c1, c2, c3);
  592. mul_add_c(a[3], b[0], c1, c2, c3);
  593. r[3] = c1;
  594. c1 = 0;
  /* column 4 */
  595. mul_add_c(a[4], b[0], c2, c3, c1);
  596. mul_add_c(a[3], b[1], c2, c3, c1);
  597. mul_add_c(a[2], b[2], c2, c3, c1);
  598. mul_add_c(a[1], b[3], c2, c3, c1);
  599. mul_add_c(a[0], b[4], c2, c3, c1);
  600. r[4] = c2;
  601. c2 = 0;
  /* column 5 */
  602. mul_add_c(a[0], b[5], c3, c1, c2);
  603. mul_add_c(a[1], b[4], c3, c1, c2);
  604. mul_add_c(a[2], b[3], c3, c1, c2);
  605. mul_add_c(a[3], b[2], c3, c1, c2);
  606. mul_add_c(a[4], b[1], c3, c1, c2);
  607. mul_add_c(a[5], b[0], c3, c1, c2);
  608. r[5] = c3;
  609. c3 = 0;
  /* column 6 */
  610. mul_add_c(a[6], b[0], c1, c2, c3);
  611. mul_add_c(a[5], b[1], c1, c2, c3);
  612. mul_add_c(a[4], b[2], c1, c2, c3);
  613. mul_add_c(a[3], b[3], c1, c2, c3);
  614. mul_add_c(a[2], b[4], c1, c2, c3);
  615. mul_add_c(a[1], b[5], c1, c2, c3);
  616. mul_add_c(a[0], b[6], c1, c2, c3);
  617. r[6] = c1;
  618. c1 = 0;
  /* column 7 (widest: eight products) */
  619. mul_add_c(a[0], b[7], c2, c3, c1);
  620. mul_add_c(a[1], b[6], c2, c3, c1);
  621. mul_add_c(a[2], b[5], c2, c3, c1);
  622. mul_add_c(a[3], b[4], c2, c3, c1);
  623. mul_add_c(a[4], b[3], c2, c3, c1);
  624. mul_add_c(a[5], b[2], c2, c3, c1);
  625. mul_add_c(a[6], b[1], c2, c3, c1);
  626. mul_add_c(a[7], b[0], c2, c3, c1);
  627. r[7] = c2;
  628. c2 = 0;
  /* column 8 — columns narrow again from here */
  629. mul_add_c(a[7], b[1], c3, c1, c2);
  630. mul_add_c(a[6], b[2], c3, c1, c2);
  631. mul_add_c(a[5], b[3], c3, c1, c2);
  632. mul_add_c(a[4], b[4], c3, c1, c2);
  633. mul_add_c(a[3], b[5], c3, c1, c2);
  634. mul_add_c(a[2], b[6], c3, c1, c2);
  635. mul_add_c(a[1], b[7], c3, c1, c2);
  636. r[8] = c3;
  637. c3 = 0;
  /* column 9 */
  638. mul_add_c(a[2], b[7], c1, c2, c3);
  639. mul_add_c(a[3], b[6], c1, c2, c3);
  640. mul_add_c(a[4], b[5], c1, c2, c3);
  641. mul_add_c(a[5], b[4], c1, c2, c3);
  642. mul_add_c(a[6], b[3], c1, c2, c3);
  643. mul_add_c(a[7], b[2], c1, c2, c3);
  644. r[9] = c1;
  645. c1 = 0;
  /* column 10 */
  646. mul_add_c(a[7], b[3], c2, c3, c1);
  647. mul_add_c(a[6], b[4], c2, c3, c1);
  648. mul_add_c(a[5], b[5], c2, c3, c1);
  649. mul_add_c(a[4], b[6], c2, c3, c1);
  650. mul_add_c(a[3], b[7], c2, c3, c1);
  651. r[10] = c2;
  652. c2 = 0;
  /* column 11 */
  653. mul_add_c(a[4], b[7], c3, c1, c2);
  654. mul_add_c(a[5], b[6], c3, c1, c2);
  655. mul_add_c(a[6], b[5], c3, c1, c2);
  656. mul_add_c(a[7], b[4], c3, c1, c2);
  657. r[11] = c3;
  658. c3 = 0;
  /* column 12 */
  659. mul_add_c(a[7], b[5], c1, c2, c3);
  660. mul_add_c(a[6], b[6], c1, c2, c3);
  661. mul_add_c(a[5], b[7], c1, c2, c3);
  662. r[12] = c1;
  663. c1 = 0;
  /* column 13 */
  664. mul_add_c(a[6], b[7], c2, c3, c1);
  665. mul_add_c(a[7], b[6], c2, c3, c1);
  666. r[13] = c2;
  667. c2 = 0;
  /* column 14; r[15] gets the final carry */
  668. mul_add_c(a[7], b[7], c3, c1, c2);
  669. r[14] = c3;
  670. r[15] = c1;
  671. }
  672. void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  673. {
  674. BN_ULONG c1, c2, c3;
  675. c1 = 0;
  676. c2 = 0;
  677. c3 = 0;
  678. mul_add_c(a[0], b[0], c1, c2, c3);
  679. r[0] = c1;
  680. c1 = 0;
  681. mul_add_c(a[0], b[1], c2, c3, c1);
  682. mul_add_c(a[1], b[0], c2, c3, c1);
  683. r[1] = c2;
  684. c2 = 0;
  685. mul_add_c(a[2], b[0], c3, c1, c2);
  686. mul_add_c(a[1], b[1], c3, c1, c2);
  687. mul_add_c(a[0], b[2], c3, c1, c2);
  688. r[2] = c3;
  689. c3 = 0;
  690. mul_add_c(a[0], b[3], c1, c2, c3);
  691. mul_add_c(a[1], b[2], c1, c2, c3);
  692. mul_add_c(a[2], b[1], c1, c2, c3);
  693. mul_add_c(a[3], b[0], c1, c2, c3);
  694. r[3] = c1;
  695. c1 = 0;
  696. mul_add_c(a[3], b[1], c2, c3, c1);
  697. mul_add_c(a[2], b[2], c2, c3, c1);
  698. mul_add_c(a[1], b[3], c2, c3, c1);
  699. r[4] = c2;
  700. c2 = 0;
  701. mul_add_c(a[2], b[3], c3, c1, c2);
  702. mul_add_c(a[3], b[2], c3, c1, c2);
  703. r[5] = c3;
  704. c3 = 0;
  705. mul_add_c(a[3], b[3], c1, c2, c3);
  706. r[6] = c1;
  707. r[7] = c2;
  708. }
  /*
   * Comba 8-word squaring: r[0..15] = a[0..7]^2.
   * Off-diagonal products appear twice in a square, so sqr_add_c2() adds
   * 2*a[i]*a[j] (i > j) while sqr_add_c() adds the diagonal a[i]^2 terms.
   * (c1,c2,c3) rotate as the three-word column accumulator.
   */
  709. void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
  710. {
  711. BN_ULONG c1, c2, c3;
  712. c1 = 0;
  713. c2 = 0;
  714. c3 = 0;
  /* column 0 */
  715. sqr_add_c(a, 0, c1, c2, c3);
  716. r[0] = c1;
  717. c1 = 0;
  /* column 1 */
  718. sqr_add_c2(a, 1, 0, c2, c3, c1);
  719. r[1] = c2;
  720. c2 = 0;
  /* column 2 */
  721. sqr_add_c(a, 1, c3, c1, c2);
  722. sqr_add_c2(a, 2, 0, c3, c1, c2);
  723. r[2] = c3;
  724. c3 = 0;
  /* column 3 */
  725. sqr_add_c2(a, 3, 0, c1, c2, c3);
  726. sqr_add_c2(a, 2, 1, c1, c2, c3);
  727. r[3] = c1;
  728. c1 = 0;
  /* column 4 */
  729. sqr_add_c(a, 2, c2, c3, c1);
  730. sqr_add_c2(a, 3, 1, c2, c3, c1);
  731. sqr_add_c2(a, 4, 0, c2, c3, c1);
  732. r[4] = c2;
  733. c2 = 0;
  /* column 5 */
  734. sqr_add_c2(a, 5, 0, c3, c1, c2);
  735. sqr_add_c2(a, 4, 1, c3, c1, c2);
  736. sqr_add_c2(a, 3, 2, c3, c1, c2);
  737. r[5] = c3;
  738. c3 = 0;
  /* column 6 */
  739. sqr_add_c(a, 3, c1, c2, c3);
  740. sqr_add_c2(a, 4, 2, c1, c2, c3);
  741. sqr_add_c2(a, 5, 1, c1, c2, c3);
  742. sqr_add_c2(a, 6, 0, c1, c2, c3);
  743. r[6] = c1;
  744. c1 = 0;
  /* column 7 */
  745. sqr_add_c2(a, 7, 0, c2, c3, c1);
  746. sqr_add_c2(a, 6, 1, c2, c3, c1);
  747. sqr_add_c2(a, 5, 2, c2, c3, c1);
  748. sqr_add_c2(a, 4, 3, c2, c3, c1);
  749. r[7] = c2;
  750. c2 = 0;
  /* column 8 */
  751. sqr_add_c(a, 4, c3, c1, c2);
  752. sqr_add_c2(a, 5, 3, c3, c1, c2);
  753. sqr_add_c2(a, 6, 2, c3, c1, c2);
  754. sqr_add_c2(a, 7, 1, c3, c1, c2);
  755. r[8] = c3;
  756. c3 = 0;
  /* column 9 */
  757. sqr_add_c2(a, 7, 2, c1, c2, c3);
  758. sqr_add_c2(a, 6, 3, c1, c2, c3);
  759. sqr_add_c2(a, 5, 4, c1, c2, c3);
  760. r[9] = c1;
  761. c1 = 0;
  /* column 10 */
  762. sqr_add_c(a, 5, c2, c3, c1);
  763. sqr_add_c2(a, 6, 4, c2, c3, c1);
  764. sqr_add_c2(a, 7, 3, c2, c3, c1);
  765. r[10] = c2;
  766. c2 = 0;
  /* column 11 */
  767. sqr_add_c2(a, 7, 4, c3, c1, c2);
  768. sqr_add_c2(a, 6, 5, c3, c1, c2);
  769. r[11] = c3;
  770. c3 = 0;
  /* column 12 */
  771. sqr_add_c(a, 6, c1, c2, c3);
  772. sqr_add_c2(a, 7, 5, c1, c2, c3);
  773. r[12] = c1;
  774. c1 = 0;
  /* column 13 */
  775. sqr_add_c2(a, 7, 6, c2, c3, c1);
  776. r[13] = c2;
  777. c2 = 0;
  /* column 14; r[15] gets the final carry */
  778. sqr_add_c(a, 7, c3, c1, c2);
  779. r[14] = c3;
  780. r[15] = c1;
  781. }
  782. void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
  783. {
  784. BN_ULONG c1, c2, c3;
  785. c1 = 0;
  786. c2 = 0;
  787. c3 = 0;
  788. sqr_add_c(a, 0, c1, c2, c3);
  789. r[0] = c1;
  790. c1 = 0;
  791. sqr_add_c2(a, 1, 0, c2, c3, c1);
  792. r[1] = c2;
  793. c2 = 0;
  794. sqr_add_c(a, 1, c3, c1, c2);
  795. sqr_add_c2(a, 2, 0, c3, c1, c2);
  796. r[2] = c3;
  797. c3 = 0;
  798. sqr_add_c2(a, 3, 0, c1, c2, c3);
  799. sqr_add_c2(a, 2, 1, c1, c2, c3);
  800. r[3] = c1;
  801. c1 = 0;
  802. sqr_add_c(a, 2, c2, c3, c1);
  803. sqr_add_c2(a, 3, 1, c2, c3, c1);
  804. r[4] = c2;
  805. c2 = 0;
  806. sqr_add_c2(a, 3, 2, c3, c1, c2);
  807. r[5] = c3;
  808. c3 = 0;
  809. sqr_add_c(a, 3, c1, c2, c3);
  810. r[6] = c1;
  811. r[7] = c2;
  812. }
  813. # ifdef OPENSSL_NO_ASM
  814. # ifdef OPENSSL_BN_ASM_MONT
  815. # include <alloca.h>
  816. /*
  817. * This is essentially reference implementation, which may or may not
  818. * result in performance improvement. E.g. on IA-32 this routine was
  819. * observed to give 40% faster rsa1024 private key operations and 10%
  820. * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
  821. * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
  822. * reference implementation, one to be used as starting point for
  823. * platform-specific assembler. Mentioned numbers apply to compiler
  824. * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
  825. * can vary not only from platform to platform, but even for compiler
  826. * versions. Assembler vs. assembler improvement coefficients can
  827. * [and are known to] differ and are to be documented elsewhere.
  828. */
  /*
   * Reference Montgomery multiplication: interleaves the multiply by bp[i]
   * with a reduction pass using n0 (= -np^-1 mod 2^BN_BITS2, per the usual
   * Montgomery contract -- TODO confirm against the caller in bn_mont.c).
   * Work area tp[] holds num+2 words on the stack.
   *
   * NOTE(review): alloca() is unchecked -- the caller must keep num small
   * enough not to overflow the stack.
   *
   * vp aliases tp through a volatile pointer so the zeroization of the
   * scratch area before returning cannot be optimized away.
   */
  829. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  830. const BN_ULONG *np, const BN_ULONG *n0p, int num)
  831. {
  832. BN_ULONG c0, c1, ml, *tp, n0;
  833. # ifdef mul64
  834. BN_ULONG mh;
  835. # endif
  836. volatile BN_ULONG *vp;
  837. int i = 0, j;
  838. # if 0 /* template for platform-specific
  839. * implementation */
  840. if (ap == bp)
  841. return bn_sqr_mont(rp, ap, np, n0p, num);
  842. # endif
  843. vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
  844. n0 = *n0p;
  /* Iteration i == 0 done out of line: tp = ap * bp[0], then jump into
   * the loop at "enter" to share the reduction code. */
  845. c0 = 0;
  846. ml = bp[0];
  847. # ifdef mul64
  848. mh = HBITS(ml);
  849. ml = LBITS(ml);
  850. for (j = 0; j < num; ++j)
  851. mul(tp[j], ap[j], ml, mh, c0);
  852. # else
  853. for (j = 0; j < num; ++j)
  854. mul(tp[j], ap[j], ml, c0);
  855. # endif
  856. tp[num] = c0;
  857. tp[num + 1] = 0;
  858. goto enter;
  859. for (i = 0; i < num; i++) {
  /* tp += ap * bp[i]; carry goes into tp[num], overflow into tp[num+1]. */
  860. c0 = 0;
  861. ml = bp[i];
  862. # ifdef mul64
  863. mh = HBITS(ml);
  864. ml = LBITS(ml);
  865. for (j = 0; j < num; ++j)
  866. mul_add(tp[j], ap[j], ml, mh, c0);
  867. # else
  868. for (j = 0; j < num; ++j)
  869. mul_add(tp[j], ap[j], ml, c0);
  870. # endif
  871. c1 = (tp[num] + c0) & BN_MASK2;
  872. tp[num] = c1;
  873. tp[num + 1] = (c1 < c0 ? 1 : 0);
  874. enter:
  /* Reduction: add (tp[0]*n0 mod 2^w) * np, which zeroes tp[0]; the
   * word shift is folded into writing results to tp[j-1]. */
  875. c1 = tp[0];
  876. ml = (c1 * n0) & BN_MASK2;
  877. c0 = 0;
  878. # ifdef mul64
  879. mh = HBITS(ml);
  880. ml = LBITS(ml);
  881. mul_add(c1, np[0], ml, mh, c0);
  882. # else
  883. mul_add(c1, ml, np[0], c0);
  884. # endif
  885. for (j = 1; j < num; j++) {
  886. c1 = tp[j];
  887. # ifdef mul64
  888. mul_add(c1, np[j], ml, mh, c0);
  889. # else
  890. mul_add(c1, ml, np[j], c0);
  891. # endif
  892. tp[j - 1] = c1 & BN_MASK2;
  893. }
  894. c1 = (tp[num] + c0) & BN_MASK2;
  895. tp[num - 1] = c1;
  896. tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  897. }
  /* Conditional final subtraction: if tp >= np, return tp - np. */
  898. if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
  899. c0 = bn_sub_words(rp, tp, np, num);
  900. if (tp[num] != 0 || c0 == 0) {
  /* Wipe the scratch area (volatile vp keeps the stores). */
  901. for (i = 0; i < num + 2; i++)
  902. vp[i] = 0;
  903. return 1;
  904. }
  905. }
  /* Otherwise copy tp out, wiping each scratch word as we go. */
  906. for (i = 0; i < num; i++)
  907. rp[i] = tp[i], vp[i] = 0;
  908. vp[num] = 0;
  909. vp[num + 1] = 0;
  910. return 1;
  911. }
  912. # else
  913. /*
  914. * Return value of 0 indicates that multiplication/convolution was not
  915. * performed to signal the caller to fall down to alternative/original
  916. * code-path.
  917. */
  918. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  919. const BN_ULONG *np, const BN_ULONG *n0, int num)
  920. {
  921. return 0;
  922. }
  923. # endif /* OPENSSL_BN_ASM_MONT */
  924. # endif
  925. #else /* !BN_MUL_COMBA */
  926. /* hmm... is it faster just to do a multiply? */
  927. # undef bn_sqr_comba4
  928. void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
  929. {
  930. BN_ULONG t[8];
  931. bn_sqr_normal(r, a, 4, t);
  932. }
  933. # undef bn_sqr_comba8
  934. void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
  935. {
  936. BN_ULONG t[16];
  937. bn_sqr_normal(r, a, 8, t);
  938. }
  939. void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  940. {
  941. r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
  942. r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
  943. r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
  944. r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
  945. }
  946. void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  947. {
  948. r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
  949. r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
  950. r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
  951. r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
  952. r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
  953. r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
  954. r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
  955. r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
  956. }
  957. # ifdef OPENSSL_NO_ASM
  958. # ifdef OPENSSL_BN_ASM_MONT
  959. # include <alloca.h>
  /*
   * Montgomery multiplication built on the word routines: each iteration
   * accumulates ap * bp[i], then adds (tp[0] * n0 mod 2^w) * np to clear
   * the bottom word, and shifts tp down one word.
   *
   * NOTE(review): alloca() is unchecked -- the caller must keep num small
   * enough not to overflow the stack.
   *
   * vp aliases tp through a volatile pointer so the final zeroization of
   * the scratch area cannot be optimized away.
   */
  960. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  961. const BN_ULONG *np, const BN_ULONG *n0p, int num)
  962. {
  963. BN_ULONG c0, c1, *tp, n0 = *n0p;
  964. volatile BN_ULONG *vp;
  965. int i = 0, j;
  966. vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
  /* Clear tp[0..num]; tp[num+1] is assigned before first use below. */
  967. for (i = 0; i <= num; i++)
  968. tp[i] = 0;
  969. for (i = 0; i < num; i++) {
  /* tp += ap * bp[i]; carry into tp[num], overflow bit into tp[num+1]. */
  970. c0 = bn_mul_add_words(tp, ap, num, bp[i]);
  971. c1 = (tp[num] + c0) & BN_MASK2;
  972. tp[num] = c1;
  973. tp[num + 1] = (c1 < c0 ? 1 : 0);
  /* Reduction step: makes tp[0] zero so the shift below loses nothing. */
  974. c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
  975. c1 = (tp[num] + c0) & BN_MASK2;
  976. tp[num] = c1;
  977. tp[num + 1] += (c1 < c0 ? 1 : 0);
  /* Divide by 2^BN_BITS2: shift the whole work area down one word. */
  978. for (j = 0; j <= num; j++)
  979. tp[j] = tp[j + 1];
  980. }
  /* Conditional final subtraction: if tp >= np, return tp - np. */
  981. if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
  982. c0 = bn_sub_words(rp, tp, np, num);
  983. if (tp[num] != 0 || c0 == 0) {
  /* Wipe the scratch area (volatile vp keeps the stores). */
  984. for (i = 0; i < num + 2; i++)
  985. vp[i] = 0;
  986. return 1;
  987. }
  988. }
  /* Otherwise copy tp out, wiping each scratch word as we go. */
  989. for (i = 0; i < num; i++)
  990. rp[i] = tp[i], vp[i] = 0;
  991. vp[num] = 0;
  992. vp[num + 1] = 0;
  993. return 1;
  994. }
  995. # else
  996. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  997. const BN_ULONG *np, const BN_ULONG *n0, int num)
  998. {
  999. return 0;
  1000. }
  1001. # endif /* OPENSSL_BN_ASM_MONT */
  1002. # endif
  1003. #endif /* !BN_MUL_COMBA */