/* tls_pstm_montgomery_reduce.c */
  1. /*
  2. * Copyright (C) 2017 Denys Vlasenko
  3. *
  4. * Licensed under GPLv2, see file LICENSE in this source tree.
  5. */
  6. #include "tls.h"
  7. /* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
  8. * Changes are flagged with //bbox
  9. */
  10. /**
  11. * @file pstm_montgomery_reduce.c
  12. * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
  13. *
  14. * Multiprecision Montgomery Reduction.
  15. */
  16. /*
  17. * Copyright (c) 2013-2015 INSIDE Secure Corporation
  18. * Copyright (c) PeerSec Networks, 2002-2011
  19. * All Rights Reserved
  20. *
  21. * The latest version of this code is available at http://www.matrixssl.org
  22. *
  23. * This software is open source; you can redistribute it and/or modify
  24. * it under the terms of the GNU General Public License as published by
  25. * the Free Software Foundation; either version 2 of the License, or
  26. * (at your option) any later version.
  27. *
  28. * This General Public License does NOT permit incorporating this software
  29. * into proprietary programs. If you are unable to comply with the GPL, a
  30. * commercial license for this software may be purchased from INSIDE at
  31. * http://www.insidesecure.com/eng/Company/Locations
  32. *
  33. * This program is distributed in WITHOUT ANY WARRANTY; without even the
  34. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  35. * See the GNU General Public License for more details.
  36. *
  37. * You should have received a copy of the GNU General Public License
  38. * along with this program; if not, write to the Free Software
  39. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  40. * http://www.gnu.org/copyleft/gpl.html
  41. */
  42. /******************************************************************************/
  43. //bbox
  44. //#include "../cryptoApi.h"
  45. #ifndef DISABLE_PSTM
  46. /******************************************************************************/
#if defined(PSTM_X86)
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
#endif
//#pragma message ("Using 32 bit x86 Assembly Optimizations")

/* This backend needs no per-reduction setup, teardown or loop epilogue */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Montgomery factor for this round: low digit of c[x] * mp */
#define LOOP_START \
	mu = c[x] * mp
/* One inner-product step: _c[LO] += mu * (*tmpm++) + cy, high word -> cy.
 * Same contract as the ISO C INNERMUL fallback at the end of this #if chain. */
#define INNERMUL \
asm( \
	"movl %5,%%eax \n\t" \
	"mull %4 \n\t" \
	"addl %1,%%eax \n\t" \
	"adcl $0,%%edx \n\t" \
	"addl %%eax,%0 \n\t" \
	"adcl $0,%%edx \n\t" \
	"movl %%edx,%1 \n\t" \
	:"=g"(_c[LO]), "=r"(cy) \
	:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
	: "%eax", "%edx", "%cc")
/* Add the pending carry into _c[LO]; setb/movzbl leave the new 0/1 carry in cy */
#define PROPCARRY \
asm( \
	"addl %1,%0 \n\t" \
	"setb %%al \n\t" \
	"movzbl %%al,%1 \n\t" \
	:"=g"(_c[LO]), "=r"(cy) \
	:"0"(_c[LO]), "1"(cy) \
	: "%eax", "%cc")
#elif defined(PSTM_X86_64)
/* x86-64 optimized */
#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
#endif
//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")

/* This backend needs no per-reduction setup, teardown or loop epilogue */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Montgomery factor for this round: low digit of c[x] * mp */
#define LOOP_START \
	mu = c[x] * mp
/* One inner-product step: _c[LO] += mu * (*tmpm++) + cy, high word -> cy.
 * Same contract as the ISO C INNERMUL fallback at the end of this #if chain. */
#define INNERMUL \
asm( \
	"movq %5,%%rax \n\t" \
	"mulq %4 \n\t" \
	"addq %1,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"addq %%rax,%0 \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rdx,%1 \n\t" \
	:"=g"(_c[LO]), "=r"(cy) \
	:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
	: "%rax", "%rdx", "cc")
/* Eight unrolled INNERMUL steps over tmpm[0..7] / _c[0..7]; the carry is
 * chained through %1 (cy) and the loads of _c[i+1]/tmpm[i+1] are pipelined
 * in r10/r11 ahead of each multiply.  Caller advances _c and tmpm by 8. */
#define INNERMUL8 \
asm( \
	"movq 0(%5),%%rax \n\t" \
	"movq 0(%2),%%r10 \n\t" \
	"movq 0x8(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x8(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"movq 0x10(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x10(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x8(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"movq 0x18(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x18(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x10(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"movq 0x20(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x20(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x18(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"movq 0x28(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x28(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x20(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"movq 0x30(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x30(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x28(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"movq 0x38(%5),%%r11 \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq 0x38(%2),%%r10 \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x30(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	"movq %%r11,%%rax \n\t" \
	"mulq %4 \n\t" \
	"addq %%r10,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"addq %3,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rax,0x38(%0) \n\t" \
	"movq %%rdx,%1 \n\t" \
\
	:"=r"(_c), "=r"(cy) \
	: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
	: "%rax", "%rdx", "%r10", "%r11", "cc")
/* Add the pending carry into _c[LO]; setb/movzbq leave the new 0/1 carry in cy */
#define PROPCARRY \
asm( \
	"addq %1,%0 \n\t" \
	"setb %%al \n\t" \
	"movzbq %%al,%1 \n\t" \
	:"=g"(_c[LO]), "=r"(cy) \
	:"0"(_c[LO]), "1"(cy) \
	: "%rax", "cc")
#elif defined(PSTM_ARM)
/* This backend needs no per-reduction setup, teardown or loop epilogue */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Montgomery factor for this round: low digit of c[x] * mp */
#define LOOP_START \
	mu = c[x] * mp
#ifdef __thumb2__
//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
/* One inner-product step: _c[0] += mu * (*tmpm++) + cy, high word -> cy
 * (ADDS folds in the old carry, UMLAL accumulates the 64-bit product).
 * Thumb2 requires the IT block before the conditional MOVs. */
#define INNERMUL \
asm( \
	" LDR r0,%1 \n\t" \
	" ADDS r0,r0,%0 \n\t" \
	" ITE CS \n\t" \
	" MOVCS %0,#1 \n\t" \
	" MOVCC %0,#0 \n\t" \
	" UMLAL r0,%0,%3,%4 \n\t" \
	" STR r0,%1 \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
	:"r0","%cc");
/* Add the pending carry into _c[0]; new 0/1 carry -> cy */
#define PROPCARRY \
asm( \
	" LDR r0,%1 \n\t" \
	" ADDS r0,r0,%0 \n\t" \
	" STR r0,%1 \n\t" \
	" ITE CS \n\t" \
	" MOVCS %0,#1 \n\t" \
	" MOVCC %0,#0 \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"m"(_c[0])\
	:"r0","%cc");
#else /* Non-Thumb2 code */
//#pragma message ("Using 32 bit ARM Assembly Optimizations")
/* Same as the Thumb2 variant above, without the IT block (classic ARM
 * allows bare conditional MOVCS/MOVCC) */
#define INNERMUL \
asm( \
	" LDR r0,%1 \n\t" \
	" ADDS r0,r0,%0 \n\t" \
	" MOVCS %0,#1 \n\t" \
	" MOVCC %0,#0 \n\t" \
	" UMLAL r0,%0,%3,%4 \n\t" \
	" STR r0,%1 \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
	:"r0","%cc");
/* Add the pending carry into _c[0]; new 0/1 carry -> cy */
#define PROPCARRY \
asm( \
	" LDR r0,%1 \n\t" \
	" ADDS r0,r0,%0 \n\t" \
	" STR r0,%1 \n\t" \
	" MOVCS %0,#1 \n\t" \
	" MOVCC %0,#0 \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"m"(_c[0])\
	:"r0","%cc");
#endif /* __thumb2__ */
#elif defined(PSTM_MIPS)
/* MIPS32 */
//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
/* This backend needs no per-reduction setup, teardown or loop epilogue */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Montgomery factor for this round: low digit of c[x] * mp */
#define LOOP_START \
	mu = c[x] * mp
/* One inner-product step: _c[0] += mu * tmpm[0] + cy, high word -> cy.
 * multu/mflo/mfhi form the 64-bit product; the sltu instructions recover
 * the carries of the two 32-bit additions.  tmpm is advanced by the
 * trailing statement, since the asm reads tmpm[0] directly. */
#define INNERMUL \
asm( \
	" multu %3,%4 \n\t" \
	" mflo $12 \n\t" \
	" mfhi $13 \n\t" \
	" addu $12,$12,%0 \n\t" \
	" sltu $10,$12,%0 \n\t" \
	" addu $13,$13,$10 \n\t" \
	" lw $10,%1 \n\t" \
	" addu $12,$12,$10 \n\t" \
	" sltu $10,$12,$10 \n\t" \
	" addu %0,$13,$10 \n\t" \
	" sw $12,%1 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
:"$10","$12","$13")\
; ++tmpm;
/* Add the pending carry into _c[0]; sltu leaves the new 0/1 carry in cy */
#define PROPCARRY \
asm( \
	" lw $10,%1 \n\t" \
	" addu $10,$10,%0 \n\t" \
	" sw $10,%1 \n\t" \
	" sltu %0,$10,%0 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"r"(cy),"r"(_c[0])\
:"$10");
#else
/* ISO C code */
/* Portable fallback: no per-reduction setup, teardown or loop epilogue */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Montgomery factor for this round: low digit of c[x] * mp */
#define LOOP_START \
   mu = c[x] * mp
/* One inner-product step, done in double-width pstm_word arithmetic:
 * t = _c[0] + cy + mu * (*tmpm++); low digit stays in _c[0], the
 * high part (t >> DIGIT_BIT) becomes the new carry cy */
#define INNERMUL \
   do { pstm_word t; \
   t = ((pstm_word)_c[0] + (pstm_word)cy) + \
   (((pstm_word)mu) * ((pstm_word)*tmpm++)); \
   _c[0] = (pstm_digit)t; \
   cy = (pstm_digit)(t >> DIGIT_BIT); \
   } while (0)
/* Add the pending carry into _c[0]; (t < cy) detects unsigned wraparound
 * and becomes the new 0/1 carry */
#define PROPCARRY \
   do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
#endif
/******************************************************************************/

/* Digit index used by the INNERMUL/PROPCARRY macros to address the
 * current position through the _c cursor */
#define LO 0
  313. /* computes x/R == x (mod N) via Montgomery Reduction */
  314. int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
  315. pstm_digit mp, pstm_digit *paD, uint32 paDlen)
  316. {
  317. pstm_digit *c, *_c, *tmpm, mu;
  318. int32 oldused, x, y;
  319. int16 pa;
  320. pa = m->used;
  321. if (pa > a->alloc) {
  322. /* Sanity test for bad numbers. This will confirm no buffer overruns */
  323. return PS_LIMIT_FAIL;
  324. }
  325. if (paD && paDlen >= (uint32)2*pa+1) {
  326. c = paD;
  327. memset(c, 0x0, paDlen);
  328. } else {
  329. c = xzalloc(2*pa+1);//bbox
  330. }
  331. /* copy the input */
  332. oldused = a->used;
  333. for (x = 0; x < oldused; x++) {
  334. c[x] = a->dp[x];
  335. }
  336. MONT_START;
  337. for (x = 0; x < pa; x++) {
  338. pstm_digit cy = 0;
  339. /* get Mu for this round */
  340. LOOP_START;
  341. _c = c + x;
  342. tmpm = m->dp;
  343. y = 0;
  344. #ifdef PSTM_X86_64
  345. for (; y < (pa & ~7); y += 8) {
  346. INNERMUL8;
  347. _c += 8;
  348. tmpm += 8;
  349. }
  350. #endif /* PSTM_X86_64 */
  351. for (; y < pa; y++) {
  352. INNERMUL;
  353. ++_c;
  354. }
  355. LOOP_END;
  356. while (cy) {
  357. PROPCARRY;
  358. ++_c;
  359. }
  360. }
  361. /* now copy out */
  362. _c = c + pa;
  363. tmpm = a->dp;
  364. for (x = 0; x < pa+1; x++) {
  365. *tmpm++ = *_c++;
  366. }
  367. for (; x < oldused; x++) {
  368. *tmpm++ = 0;
  369. }
  370. MONT_FINI;
  371. a->used = pa+1;
  372. pstm_clamp(a);
  373. /* reuse x as return code */
  374. x = PSTM_OKAY;
  375. /* if A >= m then A = A - m */
  376. if (pstm_cmp_mag (a, m) != PSTM_LT) {
  377. if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
  378. x = PS_MEM_FAIL;
  379. }
  380. }
  381. if (paDlen < (uint32)2*pa+1) {
  382. psFree(c, pool);
  383. }
  384. return x;
  385. }
  386. #endif /* !DISABLE_PSTM */
  387. /******************************************************************************/