3
0

tls_pstm_sqr_comba.c 43 KB


  1. /*
  2. * Copyright (C) 2017 Denys Vlasenko
  3. *
  4. * Licensed under GPLv2, see file LICENSE in this source tree.
  5. */
  6. #include "tls.h"
  7. /* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
  8. * Changes are flagged with //bbox
  9. */
  10. /**
  11. * @file pstm_sqr_comba.c
  12. * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
  13. *
  14. * Multiprecision Squaring with Comba technique.
  15. */
  16. /*
  17. * Copyright (c) 2013-2015 INSIDE Secure Corporation
  18. * Copyright (c) PeerSec Networks, 2002-2011
  19. * All Rights Reserved
  20. *
  21. * The latest version of this code is available at http://www.matrixssl.org
  22. *
  23. * This software is open source; you can redistribute it and/or modify
  24. * it under the terms of the GNU General Public License as published by
  25. * the Free Software Foundation; either version 2 of the License, or
  26. * (at your option) any later version.
  27. *
  28. * This General Public License does NOT permit incorporating this software
  29. * into proprietary programs. If you are unable to comply with the GPL, a
  30. * commercial license for this software may be purchased from INSIDE at
  31. * http://www.insidesecure.com/eng/Company/Locations
  32. *
  33. * This program is distributed in WITHOUT ANY WARRANTY; without even the
  34. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  35. * See the GNU General Public License for more details.
  36. *
  37. * You should have received a copy of the GNU General Public License
  38. * along with this program; if not, write to the Free Software
  39. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  40. * http://www.gnu.org/copyleft/gpl.html
  41. */
  42. /******************************************************************************/
  43. //bbox
  44. //#include "../cryptoApi.h"
  45. #ifndef DISABLE_PSTM
  46. /******************************************************************************/
  /*
   * Column-accumulator primitives for the Comba squarer.
   *
   * Each variant below (x86-32, x86-64, ARM or MIPS32 inline asm, or the
   * portable ISO C fallback) defines the same macro set. The macros work on
   * variables that must already be in scope where they are used:
   *   c0, c1, c2     three-digit accumulator for the current output column,
   *                  c0 being the least significant digit
   *   sc0, sc1, sc2  second three-digit accumulator, used only by the
   *                  SQRADDSC / SQRADDAC / SQRADDDB trio
   *   tt             pstm_word scratch, used by SQRADD2 in the ISO C variant
   *                  only (users declare it under #ifdef PSTM_ISO)
   *
   *   COMBA_START      expands to nothing
   *   COMBA_FINI       expands to nothing
   *   CLEAR_CARRY      c0 = c1 = c2 = 0
   *   CARRY_FORWARD    step to the next column: c0 = c1, c1 = c2, c2 = 0
   *   COMBA_STORE(x)   x = c0
   *   COMBA_STORE2(x)  x = c1
   *   SQRADD(i, j)     (c2:c1:c0) += i * j. The asm variants multiply i by
   *                    itself and never read j, so i and j must be the same
   *                    digit.
   *   SQRADD2(i, j)    (c2:c1:c0) += 2 * i * j (the product is added twice)
   *   SQRADDSC(i, j)   (sc2:sc1:sc0) = i * j, with sc2 cleared
   *   SQRADDAC(i, j)   (sc2:sc1:sc0) += i * j
   *   SQRADDDB         (c2:c1:c0) += 2 * (sc2:sc1:sc0)
   *
   * The macro bodies end in their own ';'. Invoking them as "MACRO(...);"
   * therefore produces an extra empty statement, so they must not be used as
   * the unbraced body of an if/else.
   */
  #if defined(PSTM_X86)
  /* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
  #if !defined(__GNUC__) || !defined(__i386__)
  #error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
  #endif
  //#pragma message ("Using 32 bit x86 Assembly Optimizations")
  #define COMBA_START
  #define CLEAR_CARRY \
      c0 = c1 = c2 = 0;
  #define COMBA_STORE(x) \
      x = c0;
  #define COMBA_STORE2(x) \
      x = c1;
  #define CARRY_FORWARD \
      do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  #define COMBA_FINI
  /* loads i into %eax and multiplies %eax by itself; j is not referenced */
  #define SQRADD(i, j) \
  asm( \
  "movl %6,%%eax \n\t" \
  "mull %%eax \n\t" \
  "addl %%eax,%0 \n\t" \
  "adcl %%edx,%1 \n\t" \
  "adcl $0,%2 \n\t" \
  :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
  //bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build
  /* one multiply, then the %edx:%eax product is added to the accumulator twice */
  #define SQRADD2(i, j) \
  asm( \
  "movl %6,%%eax \n\t" \
  "mull %7 \n\t" \
  "addl %%eax,%0 \n\t" \
  "adcl %%edx,%1 \n\t" \
  "adcl $0,%2 \n\t" \
  "addl %%eax,%0 \n\t" \
  "adcl %%edx,%1 \n\t" \
  "adcl $0,%2 \n\t" \
  :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
  //bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build
  #define SQRADDSC(i, j) \
  asm( \
  "movl %6,%%eax \n\t" \
  "mull %7 \n\t" \
  "movl %%eax,%0 \n\t" \
  "movl %%edx,%1 \n\t" \
  "xorl %2,%2 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
  #define SQRADDAC(i, j) \
  asm( \
  "movl %6,%%eax \n\t" \
  "mull %7 \n\t" \
  "addl %%eax,%0 \n\t" \
  "adcl %%edx,%1 \n\t" \
  "adcl $0,%2 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
  #define SQRADDDB \
  asm( \
  "addl %6,%0 \n\t" \
  "adcl %7,%1 \n\t" \
  "adcl %8,%2 \n\t" \
  "addl %6,%0 \n\t" \
  "adcl %7,%1 \n\t" \
  "adcl %8,%2 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
  /******************************************************************************/
  #elif defined(PSTM_X86_64)
  /* x86-64 optimized */
  #if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
  #error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
  #endif
  //#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
  #define COMBA_START
  #define CLEAR_CARRY \
      c0 = c1 = c2 = 0;
  #define COMBA_STORE(x) \
      x = c0;
  #define COMBA_STORE2(x) \
      x = c1;
  #define CARRY_FORWARD \
      do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  #define COMBA_FINI
  /* loads i into %rax and multiplies %rax by itself; j is not referenced */
  #define SQRADD(i, j) \
  asm( \
  "movq %6,%%rax \n\t" \
  "mulq %%rax \n\t" \
  "addq %%rax,%0 \n\t" \
  "adcq %%rdx,%1 \n\t" \
  "adcq $0,%2 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
  /* one multiply, then the %rdx:%rax product is added to the accumulator twice */
  #define SQRADD2(i, j) \
  asm( \
  "movq %6,%%rax \n\t" \
  "mulq %7 \n\t" \
  "addq %%rax,%0 \n\t" \
  "adcq %%rdx,%1 \n\t" \
  "adcq $0,%2 \n\t" \
  "addq %%rax,%0 \n\t" \
  "adcq %%rdx,%1 \n\t" \
  "adcq $0,%2 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
  #define SQRADDSC(i, j) \
  asm( \
  "movq %6,%%rax \n\t" \
  "mulq %7 \n\t" \
  "movq %%rax,%0 \n\t" \
  "movq %%rdx,%1 \n\t" \
  "xorq %2,%2 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
  #define SQRADDAC(i, j) \
  asm( \
  "movq %6,%%rax \n\t" \
  "mulq %7 \n\t" \
  "addq %%rax,%0 \n\t" \
  "adcq %%rdx,%1 \n\t" \
  "adcq $0,%2 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
  #define SQRADDDB \
  asm( \
  "addq %6,%0 \n\t" \
  "adcq %7,%1 \n\t" \
  "adcq %8,%2 \n\t" \
  "addq %6,%0 \n\t" \
  "adcq %7,%1 \n\t" \
  "adcq %8,%2 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
  /******************************************************************************/
  #elif defined(PSTM_ARM)
  /* ARM code */
  //#pragma message ("Using 32 bit ARM Assembly Optimizations")
  #define COMBA_START
  #define CLEAR_CARRY \
      c0 = c1 = c2 = 0;
  #define COMBA_STORE(x) \
      x = c0;
  #define COMBA_STORE2(x) \
      x = c1;
  #define CARRY_FORWARD \
      do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  #define COMBA_FINI
  /* adds i * i to (c2:c1:c0); r0/r1 hold the product, j is not referenced */
  #define SQRADD(i, j) \
  asm( \
  " UMULL r0,r1,%6,%6 \n\t" \
  " ADDS %0,%0,r0 \n\t" \
  " ADCS %1,%1,r1 \n\t" \
  " ADC %2,%2,#0 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
  /* for squaring some of the terms are doubled: i * j is added twice */
  #define SQRADD2(i, j) \
  asm( \
  " UMULL r0,r1,%6,%7 \n\t" \
  " ADDS %0,%0,r0 \n\t" \
  " ADCS %1,%1,r1 \n\t" \
  " ADC %2,%2,#0 \n\t" \
  " ADDS %0,%0,r0 \n\t" \
  " ADCS %1,%1,r1 \n\t" \
  " ADC %2,%2,#0 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
  #define SQRADDSC(i, j) \
  asm( \
  " UMULL %0,%1,%6,%7 \n\t" \
  " SUB %2,%2,%2 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");
  #define SQRADDAC(i, j) \
  asm( \
  " UMULL r0,r1,%6,%7 \n\t" \
  " ADDS %0,%0,r0 \n\t" \
  " ADCS %1,%1,r1 \n\t" \
  " ADC %2,%2,#0 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
  #define SQRADDDB \
  asm( \
  " ADDS %0,%0,%3 \n\t" \
  " ADCS %1,%1,%4 \n\t" \
  " ADC %2,%2,%5 \n\t" \
  " ADDS %0,%0,%3 \n\t" \
  " ADCS %1,%1,%4 \n\t" \
  " ADC %2,%2,%5 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
  /******************************************************************************/
  #elif defined(PSTM_MIPS)
  /* MIPS32 */
  //#pragma message ("Using 32 bit MIPS Assembly Optimizations")
  #define COMBA_START
  #define CLEAR_CARRY \
      c0 = c1 = c2 = 0;
  #define COMBA_STORE(x) \
      x = c0;
  #define COMBA_STORE2(x) \
      x = c1;
  #define CARRY_FORWARD \
      do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  #define COMBA_FINI
  /*
   * adds i * i to (c2:c1:c0); j is not referenced. Each sltu after an addu
   * recovers the carry out of that addition.
   */
  #define SQRADD(i, j) \
  asm( \
  " multu %6,%6 \n\t" \
  " mflo $12 \n\t" \
  " mfhi $13 \n\t" \
  " addu %0,%0,$12 \n\t" \
  " sltu $12,%0,$12 \n\t" \
  " addu %1,%1,$13 \n\t" \
  " sltu $13,%1,$13 \n\t" \
  " addu %1,%1,$12 \n\t" \
  " sltu $12,%1,$12 \n\t" \
  " addu %2,%2,$13 \n\t" \
  " addu %2,%2,$12 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
  /* for squaring some of the terms are doubled: i * j is added twice */
  #define SQRADD2(i, j) \
  asm( \
  " multu %6,%7 \n\t" \
  " mflo $12 \n\t" \
  " mfhi $13 \n\t" \
  \
  " addu %0,%0,$12 \n\t" \
  " sltu $14,%0,$12 \n\t" \
  " addu %1,%1,$13 \n\t" \
  " sltu $15,%1,$13 \n\t" \
  " addu %1,%1,$14 \n\t" \
  " sltu $14,%1,$14 \n\t" \
  " addu %2,%2,$15 \n\t" \
  " addu %2,%2,$14 \n\t" \
  \
  " addu %0,%0,$12 \n\t" \
  " sltu $14,%0,$12 \n\t" \
  " addu %1,%1,$13 \n\t" \
  " sltu $15,%1,$13 \n\t" \
  " addu %1,%1,$14 \n\t" \
  " sltu $14,%1,$14 \n\t" \
  " addu %2,%2,$15 \n\t" \
  " addu %2,%2,$14 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
  #define SQRADDSC(i, j) \
  asm( \
  " multu %6,%7 \n\t" \
  " mflo %0 \n\t" \
  " mfhi %1 \n\t" \
  " xor %2,%2,%2 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
  #define SQRADDAC(i, j) \
  asm( \
  " multu %6,%7 \n\t" \
  " mflo $12 \n\t" \
  " mfhi $13 \n\t" \
  " addu %0,%0,$12 \n\t" \
  " sltu $12,%0,$12 \n\t" \
  " addu %1,%1,$13 \n\t" \
  " sltu $13,%1,$13 \n\t" \
  " addu %1,%1,$12 \n\t" \
  " sltu $12,%1,$12 \n\t" \
  " addu %2,%2,$13 \n\t" \
  " addu %2,%2,$12 \n\t" \
  :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
  #define SQRADDDB \
  asm( \
  " addu %0,%0,%3 \n\t" \
  " sltu $10,%0,%3 \n\t" \
  " addu %1,%1,$10 \n\t" \
  " sltu $10,%1,$10 \n\t" \
  " addu %1,%1,%4 \n\t" \
  " sltu $11,%1,%4 \n\t" \
  " addu %2,%2,$10 \n\t" \
  " addu %2,%2,$11 \n\t" \
  " addu %2,%2,%5 \n\t" \
  \
  " addu %0,%0,%3 \n\t" \
  " sltu $10,%0,%3 \n\t" \
  " addu %1,%1,$10 \n\t" \
  " sltu $10,%1,$10 \n\t" \
  " addu %1,%1,%4 \n\t" \
  " sltu $11,%1,%4 \n\t" \
  " addu %2,%2,$10 \n\t" \
  " addu %2,%2,$11 \n\t" \
  " addu %2,%2,%5 \n\t" \
  :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
  #else
  /******************************************************************************/
  #define PSTM_ISO
  /*
   * ISO C portable code.
   *
   * Products are formed in pstm_word and split with DIGIT_BIT. The i and j
   * arguments are parenthesized before the cast so that a compound argument
   * expression is evaluated as a whole first and only then widened; each
   * argument is still evaluated exactly once.
   */
  #define COMBA_START
  #define CLEAR_CARRY \
      c0 = c1 = c2 = 0;
  #define COMBA_STORE(x) \
      x = c0;
  #define COMBA_STORE2(x) \
      x = c1;
  #define CARRY_FORWARD \
      do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  #define COMBA_FINI
  /* adds i * j to (c2:c1:c0), propagating the carry through c1 into c2 */
  #define SQRADD(i, j) \
      do { pstm_word t; \
          t = c0 + ((pstm_word)(i)) * ((pstm_word)(j)); c0 = (pstm_digit)t; \
          t = c1 + (t >> DIGIT_BIT); \
          c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \
      } while (0);
  /* for squaring some of the terms are doubled: i * j is added twice (uses tt) */
  #define SQRADD2(i, j) \
      do { pstm_word t; \
          t = ((pstm_word)(i)) * ((pstm_word)(j)); \
          tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
          tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
          c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
          tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
          tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
          c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
      } while (0);
  /* (sc2:sc1:sc0) = i * j */
  #define SQRADDSC(i, j) \
      do { pstm_word t; \
          t = ((pstm_word)(i)) * ((pstm_word)(j)); \
          sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0; \
      } while (0);
  /* (sc2:sc1:sc0) += i * j */
  #define SQRADDAC(i, j) \
      do { pstm_word t; \
          t = ((pstm_word)sc0) + ((pstm_word)(i)) * ((pstm_word)(j)); \
          sc0 = (pstm_digit)t; \
          t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t; \
          sc2 += (pstm_digit)(t >> DIGIT_BIT); \
      } while (0);
  /* (c2:c1:c0) += 2 * (sc2:sc1:sc0) */
  #define SQRADDDB \
      do { pstm_word t; \
          t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0); \
          c0 = (pstm_digit)t; \
          t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT); \
          c1 = (pstm_digit)t; \
          c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT); \
      } while (0);
  #endif /* ISO_C */
  374. /******************************************************************************/
  375. /*
  376. Non-unrolled comba squarer
  377. */
  378. //bbox: pool unused
  379. #define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
  380. pstm_sqr_comba_gen( A, B, paD, paDlen)
  381. static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
  382. pstm_digit *paD, uint32 paDlen)
  383. {
  384. int paDfail, pa; //bbox: was int16
  385. int32 ix, iz;
  386. pstm_digit c0, c1, c2, *dst;
  387. #ifdef PSTM_ISO
  388. pstm_word tt;
  389. #endif
  390. paDfail = 0;
  391. /* get size of output and trim */
  392. pa = A->used + A->used;
  393. /* number of output digits to produce */
  394. COMBA_START;
  395. CLEAR_CARRY;
  396. /*
  397. If b is not large enough grow it and continue
  398. */
  399. if (B->alloc < pa) {
  400. if (pstm_grow(B, pa) != PSTM_OKAY) {
  401. return PS_MEM_FAIL;
  402. }
  403. }
  404. if (paD != NULL) {
  405. if (paDlen < (sizeof(pstm_digit) * pa)) {
  406. paDfail = 1; /* have a paD, but it's not big enough */
  407. dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
  408. } else {
  409. dst = paD;
  410. memset(dst, 0x0, paDlen);
  411. }
  412. } else {
  413. dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
  414. }
  415. for (ix = 0; ix < pa; ix++) {
  416. int32 tx, ty, iy;
  417. pstm_digit *tmpy, *tmpx;
  418. /* get offsets into the two bignums */
  419. ty = min(A->used-1, ix);
  420. tx = ix - ty;
  421. /* setup temp aliases */
  422. tmpx = A->dp + tx;
  423. tmpy = A->dp + ty;
  424. /*
  425. This is the number of times the loop will iterate,
  426. while (tx++ < a->used && ty-- >= 0) { ... }
  427. */
  428. iy = min(A->used-tx, ty+1);
  429. /*
  430. now for squaring tx can never equal ty. We halve the distance since
  431. they approach at a rate of 2x and we have to round because odd cases
  432. need to be executed
  433. */
  434. iy = min(iy, (ty-tx+1)>>1);
  435. /* forward carries */
  436. CARRY_FORWARD;
  437. /* execute loop */
  438. for (iz = 0; iz < iy; iz++) {
  439. SQRADD2(*tmpx++, *tmpy--);
  440. }
  441. /* even columns have the square term in them */
  442. if ((ix&1) == 0) {
  443. SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
  444. }
  445. /* store it */
  446. COMBA_STORE(dst[ix]);
  447. }
  448. COMBA_FINI;
  449. /*
  450. setup dest
  451. */
  452. iz = B->used;
  453. B->used = pa;
  454. {
  455. pstm_digit *tmpc;
  456. tmpc = B->dp;
  457. for (ix = 0; ix < pa; ix++) {
  458. *tmpc++ = dst[ix];
  459. }
  460. /* clear unused digits (that existed in the old copy of c) */
  461. for (; ix < iz; ix++) {
  462. *tmpc++ = 0;
  463. }
  464. }
  465. pstm_clamp(B);
  466. if ((paD == NULL) || paDfail == 1) {
  467. psFree(dst, pool);
  468. }
  469. return PS_SUCCESS;
  470. }
  471. /******************************************************************************/
  472. /*
  473. Unrolled Comba loop for 1024 bit keys
  474. */
  475. #ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
  476. static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
  477. {
  478. pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
  479. #ifdef PSTM_ISO
  480. pstm_word tt;
  481. #endif
  482. if (B->alloc < 32) {
  483. if (pstm_grow(B, 32) != PSTM_OKAY) {
  484. return PS_MEM_FAIL;
  485. }
  486. }
  487. a = A->dp;
  488. sc0 = sc1 = sc2 = 0;
  489. COMBA_START;
  490. /* clear carries */
  491. CLEAR_CARRY;
  492. /* output 0 */
  493. SQRADD(a[0],a[0]);
  494. COMBA_STORE(b[0]);
  495. /* output 1 */
  496. CARRY_FORWARD;
  497. SQRADD2(a[0], a[1]);
  498. COMBA_STORE(b[1]);
  499. /* output 2 */
  500. CARRY_FORWARD;
  501. SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
  502. COMBA_STORE(b[2]);
  503. /* output 3 */
  504. CARRY_FORWARD;
  505. SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
  506. COMBA_STORE(b[3]);
  507. /* output 4 */
  508. CARRY_FORWARD;
  509. SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
  510. COMBA_STORE(b[4]);
  511. /* output 5 */
  512. CARRY_FORWARD;
  513. SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
  514. COMBA_STORE(b[5]);
  515. /* output 6 */
  516. CARRY_FORWARD;
  517. SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
  518. COMBA_STORE(b[6]);
  519. /* output 7 */
  520. CARRY_FORWARD;
  521. SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
  522. COMBA_STORE(b[7]);
  523. /* output 8 */
  524. CARRY_FORWARD;
  525. SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
  526. COMBA_STORE(b[8]);
  527. /* output 9 */
  528. CARRY_FORWARD;
  529. SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
  530. COMBA_STORE(b[9]);
  531. /* output 10 */
  532. CARRY_FORWARD;
  533. SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
  534. COMBA_STORE(b[10]);
  535. /* output 11 */
  536. CARRY_FORWARD;
  537. SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
  538. COMBA_STORE(b[11]);
  539. /* output 12 */
  540. CARRY_FORWARD;
  541. SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
  542. COMBA_STORE(b[12]);
  543. /* output 13 */
  544. CARRY_FORWARD;
  545. SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
  546. COMBA_STORE(b[13]);
  547. /* output 14 */
  548. CARRY_FORWARD;
  549. SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
  550. COMBA_STORE(b[14]);
  551. /* output 15 */
  552. CARRY_FORWARD;
  553. SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
  554. COMBA_STORE(b[15]);
  555. /* output 16 */
  556. CARRY_FORWARD;
  557. SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
  558. COMBA_STORE(b[16]);
  559. /* output 17 */
  560. CARRY_FORWARD;
  561. SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
  562. COMBA_STORE(b[17]);
  563. /* output 18 */
  564. CARRY_FORWARD;
  565. SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
  566. COMBA_STORE(b[18]);
  567. /* output 19 */
  568. CARRY_FORWARD;
  569. SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
  570. COMBA_STORE(b[19]);
  571. /* output 20 */
  572. CARRY_FORWARD;
  573. SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
  574. COMBA_STORE(b[20]);
  575. /* output 21 */
  576. CARRY_FORWARD;
  577. SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
  578. COMBA_STORE(b[21]);
  579. /* output 22 */
  580. CARRY_FORWARD;
  581. SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
  582. COMBA_STORE(b[22]);
  583. /* output 23 */
  584. CARRY_FORWARD;
  585. SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
  586. COMBA_STORE(b[23]);
  587. /* output 24 */
  588. CARRY_FORWARD;
  589. SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
  590. COMBA_STORE(b[24]);
  591. /* output 25 */
  592. CARRY_FORWARD;
  593. SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
  594. COMBA_STORE(b[25]);
  595. /* output 26 */
  596. CARRY_FORWARD;
  597. SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
  598. COMBA_STORE(b[26]);
  599. /* output 27 */
  600. CARRY_FORWARD;
  601. SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
  602. COMBA_STORE(b[27]);
  603. /* output 28 */
  604. CARRY_FORWARD;
  605. SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
  606. COMBA_STORE(b[28]);
  607. /* output 29 */
  608. CARRY_FORWARD;
  609. SQRADD2(a[14], a[15]);
  610. COMBA_STORE(b[29]);
  611. /* output 30 */
  612. CARRY_FORWARD;
  613. SQRADD(a[15], a[15]);
  614. COMBA_STORE(b[30]);
  615. COMBA_STORE2(b[31]);
  616. COMBA_FINI;
  617. B->used = 32;
  618. B->sign = PSTM_ZPOS;
  619. memcpy(B->dp, b, 32 * sizeof(pstm_digit));
  620. pstm_clamp(B);
  621. return PSTM_OKAY;
  622. }
  623. #endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
  624. #ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
  625. static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
  626. {
  627. pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
  628. #ifdef PSTM_ISO
  629. pstm_word tt;
  630. #endif
  631. if (B->alloc < 64) {
  632. if (pstm_grow(B, 64) != PSTM_OKAY) {
  633. return PS_MEM_FAIL;
  634. }
  635. }
  636. sc0 = sc1 = sc2 = 0;
  637. a = A->dp;
  638. COMBA_START;
  639. /* clear carries */
  640. CLEAR_CARRY;
  641. /* output 0 */
  642. SQRADD(a[0],a[0]);
  643. COMBA_STORE(b[0]);
  644. /* output 1 */
  645. CARRY_FORWARD;
  646. SQRADD2(a[0], a[1]);
  647. COMBA_STORE(b[1]);
  648. /* output 2 */
  649. CARRY_FORWARD;
  650. SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
  651. COMBA_STORE(b[2]);
  652. /* output 3 */
  653. CARRY_FORWARD;
  654. SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
  655. COMBA_STORE(b[3]);
  656. /* output 4 */
  657. CARRY_FORWARD;
  658. SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
  659. COMBA_STORE(b[4]);
  660. /* output 5 */
  661. CARRY_FORWARD;
  662. SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
  663. COMBA_STORE(b[5]);
  664. /* output 6 */
  665. CARRY_FORWARD;
  666. SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
  667. COMBA_STORE(b[6]);
  668. /* output 7 */
  669. CARRY_FORWARD;
  670. SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
  671. COMBA_STORE(b[7]);
  672. /* output 8 */
  673. CARRY_FORWARD;
  674. SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
  675. COMBA_STORE(b[8]);
  676. /* output 9 */
  677. CARRY_FORWARD;
  678. SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
  679. COMBA_STORE(b[9]);
  680. /* output 10 */
  681. CARRY_FORWARD;
  682. SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
  683. COMBA_STORE(b[10]);
  684. /* output 11 */
  685. CARRY_FORWARD;
  686. SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
  687. COMBA_STORE(b[11]);
  688. /* output 12 */
  689. CARRY_FORWARD;
  690. SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
  691. COMBA_STORE(b[12]);
  692. /* output 13 */
  693. CARRY_FORWARD;
  694. SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
  695. COMBA_STORE(b[13]);
  696. /* output 14 */
  697. CARRY_FORWARD;
  698. SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
  699. COMBA_STORE(b[14]);
  700. /* output 15 */
  701. CARRY_FORWARD;
  702. SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
  703. COMBA_STORE(b[15]);
  704. /* output 16 */
  705. CARRY_FORWARD;
  706. SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
  707. COMBA_STORE(b[16]);
  708. /* output 17 */
  709. CARRY_FORWARD;
  710. SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
  711. COMBA_STORE(b[17]);
  712. /* output 18 */
  713. CARRY_FORWARD;
  714. SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
  715. COMBA_STORE(b[18]);
  716. /* output 19 */
  717. CARRY_FORWARD;
  718. SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
  719. COMBA_STORE(b[19]);
  720. /* output 20 */
  721. CARRY_FORWARD;
  722. SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
  723. COMBA_STORE(b[20]);
  724. /* output 21 */
  725. CARRY_FORWARD;
  726. SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
  727. COMBA_STORE(b[21]);
  728. /* output 22 */
  729. CARRY_FORWARD;
  730. SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
  731. COMBA_STORE(b[22]);
  732. /* output 23 */
  733. CARRY_FORWARD;
  734. SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
  735. COMBA_STORE(b[23]);
  736. /* output 24 */
  737. CARRY_FORWARD;
  738. SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
  739. COMBA_STORE(b[24]);
  740. /* output 25 */
  741. CARRY_FORWARD;
  742. SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
  743. COMBA_STORE(b[25]);
  744. /* output 26 */
  745. CARRY_FORWARD;
  746. SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
  747. COMBA_STORE(b[26]);
  748. /* output 27 */
  749. CARRY_FORWARD;
  750. SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
  751. COMBA_STORE(b[27]);
  752. /* output 28 */
  753. CARRY_FORWARD;
  754. SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
  755. COMBA_STORE(b[28]);
  756. /* output 29 */
  757. CARRY_FORWARD;
  758. SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
  759. COMBA_STORE(b[29]);
  760. /* output 30 */
  761. CARRY_FORWARD;
  762. SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
  763. COMBA_STORE(b[30]);
  764. /* output 31 */
  765. CARRY_FORWARD;
  766. SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
  767. COMBA_STORE(b[31]);
  768. /* output 32 */
  769. CARRY_FORWARD;
  770. SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
  771. COMBA_STORE(b[32]);
  772. /* output 33 */
  773. CARRY_FORWARD;
  774. SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
  775. COMBA_STORE(b[33]);
  776. /* output 34 */
  777. CARRY_FORWARD;
  778. SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
  779. COMBA_STORE(b[34]);
  780. /* output 35 */
  781. CARRY_FORWARD;
  782. SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
  783. COMBA_STORE(b[35]);
  784. /* output 36 */
  785. CARRY_FORWARD;
  786. SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
  787. COMBA_STORE(b[36]);
  788. /* output 37 */
  789. CARRY_FORWARD;
  790. SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
  791. COMBA_STORE(b[37]);
  792. /* output 38 */
  793. CARRY_FORWARD;
  794. SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
  795. COMBA_STORE(b[38]);
  796. /* output 39 */
  797. CARRY_FORWARD;
  798. SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
  799. COMBA_STORE(b[39]);
  800. /* output 40 */
  801. CARRY_FORWARD;
  802. SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
  803. COMBA_STORE(b[40]);
  804. /* output 41 */
  805. CARRY_FORWARD;
  806. SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
  807. COMBA_STORE(b[41]);
  808. /* output 42 */
  809. CARRY_FORWARD;
  810. SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
  811. COMBA_STORE(b[42]);
  812. /* output 43 */
  813. CARRY_FORWARD;
  814. SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
  815. COMBA_STORE(b[43]);
  816. /* output 44 */
  817. CARRY_FORWARD;
  818. SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
  819. COMBA_STORE(b[44]);
  820. /* output 45 */
  821. CARRY_FORWARD;
  822. SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
  823. COMBA_STORE(b[45]);
  824. /* output 46 */
  825. CARRY_FORWARD;
  826. SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
  827. COMBA_STORE(b[46]);
  828. /* output 47 */
  829. CARRY_FORWARD;
  830. SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
  831. COMBA_STORE(b[47]);
  832. /* output 48 */
  833. CARRY_FORWARD;
  834. SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
  835. COMBA_STORE(b[48]);
  836. /* output 49 */
  837. CARRY_FORWARD;
  838. SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
  839. COMBA_STORE(b[49]);
  840. /* output 50 */
  841. CARRY_FORWARD;
  842. SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
  843. COMBA_STORE(b[50]);
  844. /* output 51 */
  845. CARRY_FORWARD;
  846. SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
  847. COMBA_STORE(b[51]);
  848. /* output 52 */
  849. CARRY_FORWARD;
  850. SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
  851. COMBA_STORE(b[52]);
  852. /* output 53 */
  853. CARRY_FORWARD;
  854. SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
  855. COMBA_STORE(b[53]);
  856. /* output 54 */
  857. CARRY_FORWARD;
  858. SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
  859. COMBA_STORE(b[54]);
  860. /* output 55 */
  861. CARRY_FORWARD;
  862. SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
  863. COMBA_STORE(b[55]);
  864. /* output 56 */
  865. CARRY_FORWARD;
  866. SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
  867. COMBA_STORE(b[56]);
  868. /* output 57 */
  869. CARRY_FORWARD;
  870. SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
  871. COMBA_STORE(b[57]);
  872. /* output 58 */
  873. CARRY_FORWARD;
  874. SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
  875. COMBA_STORE(b[58]);
  876. /* output 59 */
  877. CARRY_FORWARD;
  878. SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
  879. COMBA_STORE(b[59]);
  880. /* output 60 */
  881. CARRY_FORWARD;
  882. SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
  883. COMBA_STORE(b[60]);
  884. /* output 61 */
  885. CARRY_FORWARD;
  886. SQRADD2(a[30], a[31]);
  887. COMBA_STORE(b[61]);
  888. /* output 62 */
  889. CARRY_FORWARD;
  890. SQRADD(a[31], a[31]);
  891. COMBA_STORE(b[62]);
  892. COMBA_STORE2(b[63]);
  893. COMBA_FINI;
  894. B->used = 64;
  895. B->sign = PSTM_ZPOS;
  896. memcpy(B->dp, b, 64 * sizeof(pstm_digit));
  897. pstm_clamp(B);
  898. return PSTM_OKAY;
  899. }
  900. #endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
  901. /******************************************************************************/
  902. /*
  903. */
  904. int32 FAST_FUNC pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
  905. uint32 paDlen)
  906. {
  907. #ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
  908. if (A->used == 16) {
  909. return pstm_sqr_comba16(A, B);
  910. } else {
  911. #ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
  912. if (A->used == 32) {
  913. return pstm_sqr_comba32(A, B);
  914. }
  915. #endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
  916. return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
  917. }
  918. #else
  919. #ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
  920. if (A->used == 32) {
  921. return pstm_sqr_comba32(A, B);
  922. }
  923. #endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
  924. return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
  925. #endif
  926. }
  927. #endif /* DISABLE_PSTM */
  928. /******************************************************************************/