tls_sp_c32.c

/*
 * Copyright (C) 2021 Denys Vlasenko
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
#include "tls.h"

#define SP_DEBUG          0
#define FIXED_SECRET      0
#define FIXED_PEER_PUBKEY 0

#define ALLOW_ASM         1

#if SP_DEBUG
# define dbg(...) fprintf(stderr, __VA_ARGS__)
static void dump_hex(const char *fmt, const void *vp, int len)
{
	char hexbuf[32 * 1024 + 4];
	const uint8_t *p = vp;

	bin2hex(hexbuf, (void*)p, len)[0] = '\0';
	dbg(fmt, hexbuf);
}
#else
# define dbg(...) ((void)0)
# define dump_hex(...) ((void)0)
#endif

typedef uint32_t sp_digit;
typedef int32_t signed_sp_digit;

/* 64-bit optimizations:
 * if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff,
 * then loads and stores can be done in 64-bit chunks.
 *
 * A narrower case is when arch is also little-endian (such as x86_64),
 * then "LSW first", uint32[8] and uint64[4] representations are equivalent,
 * and arithmetic can be done in 64 bits too.
 */
#if defined(__GNUC__) && defined(__x86_64__)
# define UNALIGNED_LE_64BIT 1
#else
# define UNALIGNED_LE_64BIT 0
#endif
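
/* A minimal illustration of the claim above (illustrative only, not used
 * by the code below): on a little-endian machine the "LSW first"
 * uint32[8] layout and the uint64[4] layout alias the same bytes,
 * so a suitably aligned sp_digit array can be read through a uint64_t
 * pointer, which is what the 64-bit code paths below rely on.
 */
#if 0
static int le64_view_matches(void)
{
	sp_digit a32[8] ALIGNED(8) = { 1, 2, 3, 4, 5, 6, 7, 8 };
	const uint64_t *a64 = (const void*)a32;
	/* On little-endian, a64[0] == 0x0000000200000001 */
	return a64[0] == (((uint64_t)a32[1] << 32) | a32[0]);
}
#endif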
/* The code below is taken from parts of
 * wolfssl-3.15.3/wolfcrypt/src/sp_c32.c
 * and heavily modified.
 */

typedef struct sp_point {
	sp_digit x[8]
#if ULONG_MAX > 0xffffffff
		/* Make sp_point[] arrays to not be 64-bit misaligned */
		ALIGNED(8)
#endif
	;
	sp_digit y[8];
	sp_digit z[8];
	int infinity;
} sp_point;

/* The modulus (prime) of the curve P256. */
static const sp_digit p256_mod[8] ALIGNED(8) = {
	0xffffffff,0xffffffff,0xffffffff,0x00000000,
	0x00000000,0x00000000,0x00000001,0xffffffff,
};

#define p256_mp_mod ((sp_digit)0x000001)
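
/* Note on p256_mp_mod: Montgomery reduction needs mp = -1/m mod 2^32.
 * The lowest word of p256_mod is 0xffffffff, i.e. m == -1 (mod 2^32),
 * so -1/m == 1 (mod 2^32) and the constant is simply 1.
 * Illustrative sketch (not used by this file) of how such a constant
 * could be computed for any odd modulus word, via a Newton iteration
 * that doubles the number of correct low bits each step:
 */
#if 0
static sp_digit calc_mp_mod(sp_digit m0)
{
	sp_digit inv = m0;      /* inverse of m0 mod 2^3 (m0 is odd) */
	inv *= 2 - m0 * inv;    /* mod 2^6 */
	inv *= 2 - m0 * inv;    /* mod 2^12 */
	inv *= 2 - m0 * inv;    /* mod 2^24 */
	inv *= 2 - m0 * inv;    /* mod 2^48, i.e. all 32 bits correct */
	return -inv;            /* -1/m0 mod 2^32 */
}
/* calc_mp_mod(0xffffffff) == 1 == p256_mp_mod */
#endif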
/* Normalize the values in each word to 32 bits - NOP */
#define sp_256_norm_8(a) ((void)0)

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 32
 *
 * r  A single precision integer.
 * a  Byte array.
 */
#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
static void sp_256_to_bin_8(const sp_digit* rr, uint8_t* a)
{
	int i;
	const uint64_t* r = (void*)rr;

	sp_256_norm_8(rr);
	r += 4;
	for (i = 0; i < 4; i++) {
		r--;
		move_to_unaligned64(a, SWAP_BE64(*r));
		a += 8;
	}
}
#else
static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
{
	int i;

	sp_256_norm_8(r);
	r += 8;
	for (i = 0; i < 8; i++) {
		r--;
		move_to_unaligned32(a, SWAP_BE32(*r));
		a += 4;
	}
}
#endif

/* Read big endian unsigned byte array into r.
 *
 * r  A single precision integer.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
static void sp_256_from_bin_8(sp_digit* rr, const uint8_t* a)
{
	int i;
	uint64_t* r = (void*)rr;

	r += 4;
	for (i = 0; i < 4; i++) {
		uint64_t v;
		move_from_unaligned64(v, a);
		*--r = SWAP_BE64(v);
		a += 8;
	}
}
#else
static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a)
{
	int i;

	r += 8;
	for (i = 0; i < 8; i++) {
		sp_digit v;
		move_from_unaligned32(v, a);
		*--r = SWAP_BE32(v);
		a += 4;
	}
}
#endif

#if SP_DEBUG
static void dump_256(const char *fmt, const sp_digit* r)
{
	uint8_t b32[32];
	sp_256_to_bin_8(r, b32);
	dump_hex(fmt, b32, 32);
}
static void dump_512(const char *fmt, const sp_digit* r)
{
	uint8_t b64[64];
	sp_256_to_bin_8(r, b64 + 32);
	sp_256_to_bin_8(r+8, b64);
	dump_hex(fmt, b64, 64);
}
#else
# define dump_256(...) ((void)0)
# define dump_512(...) ((void)0)
#endif

/* Convert a point of big-endian 32-byte x,y pair to type sp_point. */
static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32)
{
	memset(p, 0, sizeof(*p));
	/*p->infinity = 0;*/
	sp_256_from_bin_8(p->x, bin2x32);
	sp_256_from_bin_8(p->y, bin2x32 + 32);
	p->z[0] = 1; /* p->z = 1 */
}

/* Compare a with b.
 *
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
#if UNALIGNED_LE_64BIT
static signed_sp_digit sp_256_cmp_8(const sp_digit* aa, const sp_digit* bb)
{
	const uint64_t* a = (void*)aa;
	const uint64_t* b = (void*)bb;
	int i;
	for (i = 3; i >= 0; i--) {
		if (a[i] == b[i])
			continue;
		return (a[i] > b[i]) * 2 - 1;
	}
	return 0;
}
#else
static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
{
	int i;
	for (i = 7; i >= 0; i--) {
		/* signed_sp_digit r = a[i] - b[i];
		 * if (r != 0)
		 *	return r;
		 * does not work: think about a[i]=0, b[i]=0xffffffff
		 */
		if (a[i] == b[i])
			continue;
		return (a[i] > b[i]) * 2 - 1;
	}
	return 0;
}
#endif

/* Compare two numbers to determine if they are equal.
 *
 * return 1 when equal and 0 otherwise.
 */
static int sp_256_cmp_equal_8(const sp_digit* a, const sp_digit* b)
{
	return sp_256_cmp_8(a, b) == 0;
}

/* Add b to a into r. (r = a + b). Return !0 on overflow */
static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
	sp_digit reg;
	asm volatile (
		"\n movl (%0), %3"
		"\n addl (%1), %3"
		"\n movl %3, (%2)"
		"\n"
		"\n movl 1*4(%0), %3"
		"\n adcl 1*4(%1), %3"
		"\n movl %3, 1*4(%2)"
		"\n"
		"\n movl 2*4(%0), %3"
		"\n adcl 2*4(%1), %3"
		"\n movl %3, 2*4(%2)"
		"\n"
		"\n movl 3*4(%0), %3"
		"\n adcl 3*4(%1), %3"
		"\n movl %3, 3*4(%2)"
		"\n"
		"\n movl 4*4(%0), %3"
		"\n adcl 4*4(%1), %3"
		"\n movl %3, 4*4(%2)"
		"\n"
		"\n movl 5*4(%0), %3"
		"\n adcl 5*4(%1), %3"
		"\n movl %3, 5*4(%2)"
		"\n"
		"\n movl 6*4(%0), %3"
		"\n adcl 6*4(%1), %3"
		"\n movl %3, 6*4(%2)"
		"\n"
		"\n movl 7*4(%0), %3"
		"\n adcl 7*4(%1), %3"
		"\n movl %3, 7*4(%2)"
		"\n"
		"\n sbbl %3, %3"
		"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
	uint64_t reg;
	asm volatile (
		"\n movq (%0), %3"
		"\n addq (%1), %3"
		"\n movq %3, (%2)"
		"\n"
		"\n movq 1*8(%0), %3"
		"\n adcq 1*8(%1), %3"
		"\n movq %3, 1*8(%2)"
		"\n"
		"\n movq 2*8(%0), %3"
		"\n adcq 2*8(%1), %3"
		"\n movq %3, 2*8(%2)"
		"\n"
		"\n movq 3*8(%0), %3"
		"\n adcq 3*8(%1), %3"
		"\n movq %3, 3*8(%2)"
		"\n"
		"\n sbbq %3, %3"
		"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#else
	int i;
	sp_digit carry;

	carry = 0;
	for (i = 0; i < 8; i++) {
		sp_digit w, v;
		w = b[i] + carry;
		v = a[i];
		if (w != 0) {
			v = a[i] + w;
			carry = (v < a[i]);
			/* hope compiler detects above as "carry flag set" */
		}
		/* else: b + carry == 0, two cases:
		 * b:ffffffff, carry:1
		 * b:00000000, carry:0
		 * in either case, r[i] = a[i] and carry remains unchanged
		 */
		r[i] = v;
	}
	return carry;
#endif
}
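
/* Example of the corner case handled by the "w != 0" test in the generic
 * loop above (the subtraction loop below uses the same trick):
 * a[i] = 0x00000005, b[i] = 0xffffffff, carry = 1:
 * w = 0xffffffff + 1 wraps to 0, so r[i] = a[i] = 5 and carry stays 1,
 * which matches 0x5 + 0xffffffff + 1 = 0x1_00000005.
 * Computing v = a[i] + w and carry = (v < a[i]) in that case would
 * wrongly clear the carry.
 */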
/* Sub b from a into r. (r = a - b). Return !0 on underflow */
static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
	sp_digit reg;
	asm volatile (
		"\n movl (%0), %3"
		"\n subl (%1), %3"
		"\n movl %3, (%2)"
		"\n"
		"\n movl 1*4(%0), %3"
		"\n sbbl 1*4(%1), %3"
		"\n movl %3, 1*4(%2)"
		"\n"
		"\n movl 2*4(%0), %3"
		"\n sbbl 2*4(%1), %3"
		"\n movl %3, 2*4(%2)"
		"\n"
		"\n movl 3*4(%0), %3"
		"\n sbbl 3*4(%1), %3"
		"\n movl %3, 3*4(%2)"
		"\n"
		"\n movl 4*4(%0), %3"
		"\n sbbl 4*4(%1), %3"
		"\n movl %3, 4*4(%2)"
		"\n"
		"\n movl 5*4(%0), %3"
		"\n sbbl 5*4(%1), %3"
		"\n movl %3, 5*4(%2)"
		"\n"
		"\n movl 6*4(%0), %3"
		"\n sbbl 6*4(%1), %3"
		"\n movl %3, 6*4(%2)"
		"\n"
		"\n movl 7*4(%0), %3"
		"\n sbbl 7*4(%1), %3"
		"\n movl %3, 7*4(%2)"
		"\n"
		"\n sbbl %3, %3"
		"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
	uint64_t reg;
	asm volatile (
		"\n movq (%0), %3"
		"\n subq (%1), %3"
		"\n movq %3, (%2)"
		"\n"
		"\n movq 1*8(%0), %3"
		"\n sbbq 1*8(%1), %3"
		"\n movq %3, 1*8(%2)"
		"\n"
		"\n movq 2*8(%0), %3"
		"\n sbbq 2*8(%1), %3"
		"\n movq %3, 2*8(%2)"
		"\n"
		"\n movq 3*8(%0), %3"
		"\n sbbq 3*8(%1), %3"
		"\n movq %3, 3*8(%2)"
		"\n"
		"\n sbbq %3, %3"
		"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#else
	int i;
	sp_digit borrow;

	borrow = 0;
	for (i = 0; i < 8; i++) {
		sp_digit w, v;
		w = b[i] + borrow;
		v = a[i];
		if (w != 0) {
			v = a[i] - w;
			borrow = (v > a[i]);
			/* hope compiler detects above as "carry flag set" */
		}
		/* else: b + borrow == 0, two cases:
		 * b:ffffffff, borrow:1
		 * b:00000000, borrow:0
		 * in either case, r[i] = a[i] and borrow remains unchanged
		 */
		r[i] = v;
	}
	return borrow;
#endif
}

/* Sub p256_mod from r. (r = r - p256_mod). */
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
static void sp_256_sub_8_p256_mod(sp_digit* r)
{
	//p256_mod[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
	asm volatile (
		"\n subl $0xffffffff, (%0)"
		"\n sbbl $0xffffffff, 1*4(%0)"
		"\n sbbl $0xffffffff, 2*4(%0)"
		"\n sbbl $0, 3*4(%0)"
		"\n sbbl $0, 4*4(%0)"
		"\n sbbl $0, 5*4(%0)"
		"\n sbbl $1, 6*4(%0)"
		"\n sbbl $0xffffffff, 7*4(%0)"
		"\n"
		: "=r" (r)
		: "0" (r)
		: "memory"
	);
}
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
static void sp_256_sub_8_p256_mod(sp_digit* r)
{
	uint64_t reg;
	uint64_t ooff;
	//p256_mod[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
	asm volatile (
		"\n addq $1, (%0)" // adding 1 is the same as subtracting ffffffffffffffff
		"\n cmc"           // only carry bit needs inverting
		"\n"
		"\n sbbq %1, 1*8(%0)" // %1 holds 00000000ffffffff
		"\n"
		"\n sbbq $0, 2*8(%0)"
		"\n"
		"\n movq 3*8(%0), %2"
		"\n sbbq $0, %2"   // adding 00000000ffffffff (in %1)
		"\n addq %1, %2"   // is the same as subtracting ffffffff00000001
		"\n movq %2, 3*8(%0)"
		"\n"
		: "=r" (r), "=r" (ooff), "=r" (reg)
		: "0" (r), "1" (0x00000000ffffffff)
		: "memory"
	);
}
#else
static void sp_256_sub_8_p256_mod(sp_digit* r)
{
	sp_256_sub_8(r, r, p256_mod);
}
#endif

/* Multiply a and b into r. (r = a * b)
 * r should be [16] array (512 bits), and must not coincide with a or b.
 */
static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
	int k;
	uint32_t accl;
	uint32_t acch;

	acch = accl = 0;
	for (k = 0; k < 15; k++) {
		int i, j;
		uint32_t acc_hi;
		i = k - 7;
		if (i < 0)
			i = 0;
		j = k - i;
		acc_hi = 0;
		do {
			////////////////////////
			// uint64_t m = ((uint64_t)a[i]) * b[j];
			// acc_hi:acch:accl += m;
			asm volatile (
				// a[i] is already loaded in %%eax
				"\n mull %7"
				"\n addl %%eax, %0"
				"\n adcl %%edx, %1"
				"\n adcl $0, %2"
				: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
				: "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j])
				: "cc", "dx"
			);
			////////////////////////
			j--;
			i++;
		} while (i != 8 && i <= k);
		r[k] = accl;
		accl = acch;
		acch = acc_hi;
	}
	r[15] = accl;
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
	const uint64_t* aa = (const void*)a;
	const uint64_t* bb = (const void*)b;
	uint64_t* rr = (void*)r;
	int k;
	uint64_t accl;
	uint64_t acch;

	acch = accl = 0;
	for (k = 0; k < 7; k++) {
		int i, j;
		uint64_t acc_hi;
		i = k - 3;
		if (i < 0)
			i = 0;
		j = k - i;
		acc_hi = 0;
		do {
			////////////////////////
			// uint128_t m = ((uint128_t)a[i]) * b[j];
			// acc_hi:acch:accl += m;
			asm volatile (
				// aa[i] is already loaded in %%rax
				"\n mulq %7"
				"\n addq %%rax, %0"
				"\n adcq %%rdx, %1"
				"\n adcq $0, %2"
				: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
				: "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
				: "cc", "dx"
			);
			////////////////////////
			j--;
			i++;
		} while (i != 4 && i <= k);
		rr[k] = accl;
		accl = acch;
		acch = acc_hi;
	}
	rr[7] = accl;
#elif 0
	//TODO: arm assembly (untested)
	asm volatile (
		"\n mov r5, #0"
		"\n mov r6, #0"
		"\n mov r7, #0"
		"\n mov r8, #0"
		"\n 1:"
		"\n subs r3, r5, #28"
		"\n movcc r3, #0"
		"\n sub r4, r5, r3"
		"\n 2:"
		"\n ldr r14, [%[a], r3]"
		"\n ldr r12, [%[b], r4]"
		"\n umull r9, r10, r14, r12"
		"\n adds r6, r6, r9"
		"\n adcs r7, r7, r10"
		"\n adc r8, r8, #0"
		"\n add r3, r3, #4"
		"\n sub r4, r4, #4"
		"\n cmp r3, #32"
		"\n beq 3f"
		"\n cmp r3, r5"
		"\n ble 2b"
		"\n 3:"
		"\n str r6, [%[r], r5]"
		"\n mov r6, r7"
		"\n mov r7, r8"
		"\n mov r8, #0"
		"\n add r5, r5, #4"
		"\n cmp r5, #56"
		"\n ble 1b"
		"\n str r6, [%[r], r5]"
		: [r] "r" (r), [a] "r" (a), [b] "r" (b)
		: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
	);
#else
	int i, j, k;
	uint64_t acc;

	acc = 0;
	for (k = 0; k < 15; k++) {
		uint32_t acc_hi;
		i = k - 7;
		if (i < 0)
			i = 0;
		j = k - i;
		acc_hi = 0;
		do {
			uint64_t m = ((uint64_t)a[i]) * b[j];
			acc += m;
			if (acc < m)
				acc_hi++;
			j--;
			i++;
		} while (i != 8 && i <= k);
		r[k] = acc;
		acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
	}
	r[15] = acc;
#endif
}
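
/* The loops above are a column-wise ("comba") schoolbook multiplication:
 * result word k sums all products a[i]*b[j] with i+j == k, carrying the
 * excess into the next column through the wide accumulator. A 2x2-word
 * sketch of the same scheme, with a hypothetical helper name
 * (illustrative only, not used by this file):
 */
#if 0
static void mul_2x2_columns(uint32_t r[4], const uint32_t a[2], const uint32_t b[2])
{
	uint64_t acc, m;
	uint32_t acc_hi;

	acc = (uint64_t)a[0] * b[0];            /* column k=0 */
	r[0] = (uint32_t)acc;
	acc >>= 32;

	acc_hi = 0;                             /* column k=1: a0*b1 + a1*b0 */
	m = (uint64_t)a[0] * b[1];
	acc += m;
	if (acc < m)
		acc_hi++;
	m = (uint64_t)a[1] * b[0];
	acc += m;
	if (acc < m)
		acc_hi++;
	r[1] = (uint32_t)acc;
	acc = (acc >> 32) | ((uint64_t)acc_hi << 32);

	acc += (uint64_t)a[1] * b[1];           /* column k=2 */
	r[2] = (uint32_t)acc;
	r[3] = (uint32_t)(acc >> 32);           /* column k=3 is pure carry */
}
#endif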
/* Shift number right one bit. Bottom bit is lost. */
#if UNALIGNED_LE_64BIT
static void sp_256_rshift1_8(sp_digit* rr, uint64_t carry)
{
	uint64_t *r = (void*)rr;
	int i;

	carry = (((uint64_t)!!carry) << 63);
	for (i = 3; i >= 0; i--) {
		uint64_t c = r[i] << 63;
		r[i] = (r[i] >> 1) | carry;
		carry = c;
	}
}
#else
static void sp_256_rshift1_8(sp_digit* r, sp_digit carry)
{
	int i;

	carry = (((sp_digit)!!carry) << 31);
	for (i = 7; i >= 0; i--) {
		sp_digit c = r[i] << 31;
		r[i] = (r[i] >> 1) | carry;
		carry = c;
	}
}
#endif

/* Divide the number by 2 mod the modulus (prime). (r = (r / 2) % m) */
static void sp_256_div2_8(sp_digit* r /*, const sp_digit* m*/)
{
	const sp_digit* m = p256_mod;
	int carry = 0;
	if (r[0] & 1)
		carry = sp_256_add_8(r, r, m);
	sp_256_norm_8(r);
	sp_256_rshift1_8(r, carry);
}
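
/* Why the halving above works, in tiny numbers with m = 7:
 * r = 3 (odd):  3 + 7 = 10, 10 >> 1 = 5, and 2*5 mod 7 = 3;
 * r = 4 (even): 4 >> 1 = 2, and 2*2 mod 7 = 4.
 * Adding the odd modulus turns an odd value into an even one without
 * changing it mod m, so the shifted-out bit is always zero.
 */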
/* Add two Montgomery form numbers (r = a + b % m) */
static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b
		/*, const sp_digit* m*/)
{
//	const sp_digit* m = p256_mod;

	int carry = sp_256_add_8(r, a, b);
	sp_256_norm_8(r);
	if (carry) {
		sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
}

/* Subtract two Montgomery form numbers (r = a - b % m) */
static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b
		/*, const sp_digit* m*/)
{
	const sp_digit* m = p256_mod;
	int borrow;

	borrow = sp_256_sub_8(r, a, b);
	sp_256_norm_8(r);
	if (borrow) {
		sp_256_add_8(r, r, m);
		sp_256_norm_8(r);
	}
}

/* Double a Montgomery form number (r = a + a % m) */
static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
{
//	const sp_digit* m = p256_mod;

	int carry = sp_256_add_8(r, a, a);
	sp_256_norm_8(r);
	if (carry)
		sp_256_sub_8_p256_mod(r);
	sp_256_norm_8(r);
}

/* Triple a Montgomery form number (r = a + a + a % m) */
static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
{
//	const sp_digit* m = p256_mod;

	int carry = sp_256_add_8(r, a, a);
	sp_256_norm_8(r);
	if (carry) {
		sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
	carry = sp_256_add_8(r, r, a);
	sp_256_norm_8(r);
	if (carry) {
		sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
}

/* Shift the result in the high 256 bits down to the bottom. */
static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
{
	memcpy(r, a + 8, sizeof(*r) * 8);
}

#if UNALIGNED_LE_64BIT
/* 64-bit little-endian optimized version.
 * See generic 32-bit version below for explanation.
 * The benefit of this version is: even though r[3] calculation is atrocious,
 * we call sp_256_mul_add_4() four times, not 8.
 * Measured run time improvement of curve_P256_compute_pubkey_and_premaster()
 * call on x86-64: from ~1500us to ~900us. Code size +32 bytes.
 */
static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
{
	uint64_t b = r[0];

# if 0
	const uint64_t* a = (const void*)p256_mod;
	//a[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
	uint128_t t;
	int i;
	t = 0;
	for (i = 0; i < 4; i++) {
		uint32_t t_hi;
		uint128_t m = ((uint128_t)b * a[i]) + r[i];
		t += m;
		t_hi = (t < m);
		r[i] = (uint64_t)t;
		t = (t >> 64) | ((uint128_t)t_hi << 64);
	}
	r[4] += (uint64_t)t;
	return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
# else
	// Unroll, then optimize the above loop:
	//uint32_t t_hi;
	//uint128_t m;
	uint64_t t64, t64u;

	//m = ((uint128_t)b * a[0]) + r[0];
	// Since b is r[0] and a[0] is ffffffffffffffff, the above optimizes to:
	// m = r[0] * ffffffffffffffff + r[0] = (r[0] << 64 - r[0]) + r[0] = r[0] << 64;
	//t += m;
	// t = r[0] << 64 = b << 64;
	//t_hi = (t < m);
	// t_hi = 0;
	//r[0] = (uint64_t)t;
	// r[0] = 0;
	//the store can be eliminated since caller won't look at lower 256 bits of the result
	//t = (t >> 64) | ((uint128_t)t_hi << 64);
	// t = b;

	//m = ((uint128_t)b * a[1]) + r[1];
	// Since a[1] is 00000000ffffffff, the above optimizes to:
	// m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
	//t += m;
	// t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
	//t_hi = (t < m);
	// t_hi = 0;
	//r[1] = (uint64_t)t;
	r[1] += (b << 32);
	//t = (t >> 64) | ((uint128_t)t_hi << 64);
	t64 = (r[1] < (b << 32));
	t64 += (b >> 32);

	//m = ((uint128_t)b * a[2]) + r[2];
	// Since a[2] is 0000000000000000, the above optimizes to:
	// m = b * 0 + r[2] = r[2];
	//t += m;
	// t = t64 + r[2];
	//t_hi = (t < m);
	// t_hi = 0;
	//r[2] = (uint64_t)t;
	r[2] += t64;
	//t = (t >> 64) | ((uint128_t)t_hi << 64);
	t64 = (r[2] < t64);

	//m = ((uint128_t)b * a[3]) + r[3];
	// Since a[3] is ffffffff00000001, the above optimizes to:
	// m = b * ffffffff00000001 + r[3];
	// m = b + b*ffffffff00000000 + r[3]
	// m = b + (b*ffffffff << 32) + r[3]
	// m = b + (((b<<32) - b) << 32) + r[3]
	//t += m;
	// t = t64 + (uint128_t)b + ((((uint128_t)b << 32) - b) << 32) + r[3];
	t64 += b;
	t64u = (t64 < b);
	t64 += r[3];
	t64u += (t64 < r[3]);
	{ // add ((((uint128_t)b << 32) - b) << 32):
		uint64_t lo, hi;
		//lo = (((b << 32) - b) << 32
		//hi = (((uint128_t)b << 32) - b) >> 32
		//but without uint128_t:
		hi = (b << 32) - b; /* make lower 32 bits of "hi", part 1 */
		b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */
		lo = hi << 32;      /* (use "hi" value to calculate "lo",... */
		t64 += lo;          /* ...consume... */
		t64u += (t64 < lo); /* ..."lo") */
		hi >>= 32;          /* make lower 32 bits of "hi", part 2 */
		hi |= (b << 32);    /* combine lower and upper 32 bits */
		t64u += hi;         /* consume "hi" */
	}
	//t_hi = (t < m);
	// t_hi = 0;
	//r[3] = (uint64_t)t;
	r[3] = t64;
	//t = (t >> 64) | ((uint128_t)t_hi << 64);
	// t = t64u;

	r[4] += t64u;
	return (r[4] < t64u); /* 1 if addition overflowed */
# endif
}

static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* aa/*, const sp_digit* m, sp_digit mp*/)
{
//	const sp_digit* m = p256_mod;
	int i;
	uint64_t *a = (void*)aa;

	sp_digit carry = 0;
	for (i = 0; i < 4; i++) {
//		mu = a[i];
		if (sp_256_mul_add_4(a+i /*, m, mu*/)) {
			int j = i + 4;
 inc_next_word:
			if (++j > 7) { /* a[8] array has no more words? */
				carry++;
				continue;
			}
			if (++a[j] == 0) /* did this overflow too? */
				goto inc_next_word;
		}
	}
	sp_512to256_mont_shift_8(r, aa);
	if (carry != 0)
		sp_256_sub_8_p256_mod(r);
	sp_256_norm_8(r);
}

#else /* Generic 32-bit version */

/* Mul a by scalar b and add into r. (r += a * b)
 * a = p256_mod
 * b = r[0]
 */
static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
{
	sp_digit b = r[0];
	uint64_t t;

# if 0
	const sp_digit* a = p256_mod;
	//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
	int i;
	t = 0;
	for (i = 0; i < 8; i++) {
		uint32_t t_hi;
		uint64_t m = ((uint64_t)b * a[i]) + r[i];
		t += m;
		t_hi = (t < m);
		r[i] = (sp_digit)t;
		t = (t >> 32) | ((uint64_t)t_hi << 32);
	}
	r[8] += (sp_digit)t;
	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
# else
	// Unroll, then optimize the above loop:
	//uint32_t t_hi;
	uint64_t m;
	uint32_t t32;

	//m = ((uint64_t)b * a[0]) + r[0];
	// Since b is r[0] and a[0] is ffffffff, the above optimizes to:
	// m = r[0] * ffffffff + r[0] = (r[0] * 100000000 - r[0]) + r[0] = r[0] << 32;
	//t += m;
	// t = r[0] << 32 = b << 32;
	//t_hi = (t < m);
	// t_hi = 0;
	//r[0] = (sp_digit)t;
	// r[0] = 0;
	//the store can be eliminated since caller won't look at lower 256 bits of the result
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	// t = b;

	//m = ((uint64_t)b * a[1]) + r[1];
	// Since a[1] is ffffffff, the above optimizes to:
	// m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
	//t += m;
	// t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
	//t_hi = (t < m);
	// t_hi = 0;
	//r[1] = (sp_digit)t;
	// r[1] = r[1];
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	// t = b;

	//m = ((uint64_t)b * a[2]) + r[2];
	// Since a[2] is ffffffff, the above optimizes to:
	// m = b * ffffffff + r[2] = (b * 100000000 - b) + r[2] = (b << 32) - b + r[2];
	//t += m;
	// t = b + (b << 32) - b + r[2] = (b << 32) + r[2]
	//t_hi = (t < m);
	// t_hi = 0;
	//r[2] = (sp_digit)t;
	// r[2] = r[2];
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	// t = b;

	//m = ((uint64_t)b * a[3]) + r[3];
	// Since a[3] is 00000000, the above optimizes to:
	// m = b * 0 + r[3] = r[3];
	//t += m;
	// t = b + r[3];
	//t_hi = (t < m);
	// t_hi = 0;
	//r[3] = (sp_digit)t;
	r[3] = r[3] + b;
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	t32 = (r[3] < b); // 0 or 1

	//m = ((uint64_t)b * a[4]) + r[4];
	// Since a[4] is 00000000, the above optimizes to:
	// m = b * 0 + r[4] = r[4];
	//t += m;
	// t = t32 + r[4];
	//t_hi = (t < m);
	// t_hi = 0;
	//r[4] = (sp_digit)t;
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	if (t32 != 0) {
		r[4]++;
		t32 = (r[4] == 0); // 0 or 1

		//m = ((uint64_t)b * a[5]) + r[5];
		// Since a[5] is 00000000, the above optimizes to:
		// m = b * 0 + r[5] = r[5];
		//t += m;
		// t = t32 + r[5]; (t32 is 0 or 1)
		//t_hi = (t < m);
		// t_hi = 0;
		//r[5] = (sp_digit)t;
		//t = (t >> 32) | ((uint64_t)t_hi << 32);
		if (t32 != 0) {
			r[5]++;
			t32 = (r[5] == 0); // 0 or 1
		}
	}

	//m = ((uint64_t)b * a[6]) + r[6];
	// Since a[6] is 00000001, the above optimizes to:
	// m = (uint64_t)b + r[6]; // 33 bits at most
	//t += m;
	t = t32 + (uint64_t)b + r[6];
	//t_hi = (t < m);
	// t_hi = 0;
	r[6] = (sp_digit)t;
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	t = (t >> 32);

	//m = ((uint64_t)b * a[7]) + r[7];
	// Since a[7] is ffffffff, the above optimizes to:
	// m = b * ffffffff + r[7] = (b * 100000000 - b) + r[7]
	m = ((uint64_t)b << 32) - b + r[7];
	t += m;
	//t_hi = (t < m);
	// t_hi in fact is always 0 here (256bit * 32bit can't have more than 32 bits of overflow)
	r[7] = (sp_digit)t;
	//t = (t >> 32) | ((uint64_t)t_hi << 32);
	t = (t >> 32);

	r[8] += (sp_digit)t;
	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
# endif
}

/* Reduce the number back to 256 bits using Montgomery reduction.
 * Note: the result is NOT guaranteed to be less than p256_mod!
 * (it is only guaranteed to fit into 256 bits).
 *
 * r   Result.
 * a   Double-wide number to reduce. Clobbered.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * Montgomery reduction on multiprecision integers:
 * Montgomery reduction requires products modulo R.
 * When R is a power of B [in our case R=2^256, B=2^32], there is a variant
 * of Montgomery reduction which requires products only of machine word sized
 * integers. T is stored as a little-endian word array a[0..n]. The algorithm
 * reduces it one word at a time. First an appropriate multiple of modulus
 * is added to make T divisible by B. [In our case, it is p256_mp_mod * a[0].]
 * Then a multiple of modulus is added to make T divisible by B^2.
 * [In our case, it is (p256_mp_mod * a[1]) << 32.]
 * And so on. Eventually T is divisible by R, and after division by R
 * the algorithm is in the same place as the usual Montgomery reduction.
 *
 * TODO: Can conditionally use 64-bit logic (on 64-bit little-endian arches)?
 */
static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
{
//	const sp_digit* m = p256_mod;
	sp_digit mp = p256_mp_mod;
	int i;
//	sp_digit mu;

	if (mp != 1) {
		sp_digit word16th = 0;
		for (i = 0; i < 8; i++) {
//			mu = (sp_digit)(a[i] * mp);
			if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
				int j = i + 8;
 inc_next_word0:
				if (++j > 15) { /* a[16] array has no more words? */
					word16th++;
					continue;
				}
				if (++a[j] == 0) /* did this overflow too? */
					goto inc_next_word0;
			}
		}
		sp_512to256_mont_shift_8(r, a);
		if (word16th != 0)
			sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
		sp_digit word16th = 0;
		for (i = 0; i < 8; i++) {
//			mu = a[i];
			if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
				int j = i + 8;
 inc_next_word:
				if (++j > 15) { /* a[16] array has no more words? */
					word16th++;
					continue;
				}
				if (++a[j] == 0) /* did this overflow too? */
					goto inc_next_word;
			}
		}
		sp_512to256_mont_shift_8(r, a);
		if (word16th != 0)
			sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
}
#endif
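
/* A tiny worked example of the word-by-word reduction described above,
 * in base B = 10 with R = B^2 = 100, m = 13, mp = -1/m mod 10 = 3
 * (13*3 = 39 == -1 mod 10). Reducing T = 250:
 *   digit 0 is 0: mu = 0*3 mod 10 = 0, add 0*m        -> T = 250
 *   digit 1 is 5: mu = 5*3 mod 10 = 5, add 5*m*10     -> T = 900
 * T is now divisible by R, and T/R = 9, which indeed equals
 * 250 * (1/100) mod 13 = 3 * 3 mod 13.
 * For P256, B = 2^32, R = 2^256 and mp = p256_mp_mod = 1, so mu is
 * simply the current low word a[i], as the code above uses.
 */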
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
		/*, const sp_digit* m, sp_digit mp*/)
{
	//const sp_digit* m = p256_mod;
	//sp_digit mp = p256_mp_mod;
	sp_digit t[2 * 8];

	sp_256to512_mul_8(t, a, b);
	sp_512to256_mont_reduce_8(r, t /*, m, mp*/);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
		/*, const sp_digit* m, sp_digit mp*/)
{
	//const sp_digit* m = p256_mod;
	//sp_digit mp = p256_mp_mod;
	sp_256_mont_mul_8(r, a, a /*, m, mp*/);
}

static NOINLINE void sp_256_mont_mul_and_reduce_8(sp_digit* r,
		const sp_digit* a, const sp_digit* b
		/*, const sp_digit* m, sp_digit mp*/)
{
	sp_digit rr[2 * 8];

	sp_256_mont_mul_8(rr, a, b /*, p256_mod, p256_mp_mod*/);
	memset(rr + 8, 0, sizeof(rr) / 2);
	sp_512to256_mont_reduce_8(r, rr /*, p256_mod, p256_mp_mod*/);
}

/* Invert the number, in Montgomery form, modulo the modulus (prime) of the
 * P256 curve. (r = 1 / a mod m)
 *
 * r  Inverse result. Must not coincide with a.
 * a  Number to invert.
 */
static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
{
	int i;

	memcpy(r, a, sizeof(sp_digit) * 8);
	for (i = 254; i >= 0; i--) {
		sp_256_mont_sqr_8(r, r /*, p256_mod, p256_mp_mod*/);
		/* p256_mod - 2:
		 * ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff - 2
		 * Bit pattern:
		 * 2    2           2         2         1...1
		 * 5    5           4         3         2...0         9...1
		 * 543210987654321098765432109876543210987654321098765432109876543210...09876543210...09876543210
		 * 111111111111111111111111111111110000000000000000000000000000000100...00000111111...11111111101
		 */
		/*if (p256_mod_minus_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
			sp_256_mont_mul_8(r, r, a /*, p256_mod, p256_mp_mod*/);
	}
}
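
/* Why the loop above inverts: p256_mod is prime, so by Fermat's little
 * theorem a^(p-1) == 1 (mod p) and therefore a^(p-2) == 1/a (mod p).
 * The loop is left-to-right square-and-multiply exponentiation by
 * p256_mod - 2; the "if" condition reproduces the exponent's bit
 * pattern (drawn above) instead of reading a stored constant.
 * Since a is in Montgomery form and the multiplies are Montgomery
 * multiplies, the result is the Montgomery form of the inverse.
 */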
/* Multiply a number by Montgomery normalizer mod modulus (prime).
 *
 * r  The resulting Montgomery form number.
 * a  The number to convert.
 */
static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
{
	int64_t t[8];
	int32_t o;

#define A(n) ((uint64_t)a[n])
	/*  1  1  0 -1 -1 -1 -1  0 */
	t[0] = 0 + A(0) + A(1) - A(3) - A(4) - A(5) - A(6);
	/*  0  1  1  0 -1 -1 -1 -1 */
	t[1] = 0 + A(1) + A(2) - A(4) - A(5) - A(6) - A(7);
	/*  0  0  1  1  0 -1 -1 -1 */
	t[2] = 0 + A(2) + A(3) - A(5) - A(6) - A(7);
	/* -1 -1  0  2  2  1  0 -1 */
	t[3] = 0 - A(0) - A(1) + 2 * A(3) + 2 * A(4) + A(5) - A(7);
	/*  0 -1 -1  0  2  2  1  0 */
	t[4] = 0 - A(1) - A(2) + 2 * A(4) + 2 * A(5) + A(6);
	/*  0  0 -1 -1  0  2  2  1 */
	t[5] = 0 - A(2) - A(3) + 2 * A(5) + 2 * A(6) + A(7);
	/* -1 -1  0  0  0  1  3  2 */
	t[6] = 0 - A(0) - A(1) + A(5) + 3 * A(6) + 2 * A(7);
	/*  1  0 -1 -1 -1 -1  0  3 */
	t[7] = 0 + A(0) - A(2) - A(3) - A(4) - A(5) + 3 * A(7);
#undef A

	t[1] += t[0] >> 32; t[0] &= 0xffffffff;
	t[2] += t[1] >> 32; t[1] &= 0xffffffff;
	t[3] += t[2] >> 32; t[2] &= 0xffffffff;
	t[4] += t[3] >> 32; t[3] &= 0xffffffff;
	t[5] += t[4] >> 32; t[4] &= 0xffffffff;
	t[6] += t[5] >> 32; t[5] &= 0xffffffff;
	t[7] += t[6] >> 32; t[6] &= 0xffffffff;
	o = t[7] >> 32; //t[7] &= 0xffffffff;
	t[0] += o;
	t[3] -= o;
	t[6] -= o;
	t[7] += o;
	r[0] = (sp_digit)t[0];
	t[1] += t[0] >> 32;
	r[1] = (sp_digit)t[1];
	t[2] += t[1] >> 32;
	r[2] = (sp_digit)t[2];
	t[3] += t[2] >> 32;
	r[3] = (sp_digit)t[3];
	t[4] += t[3] >> 32;
	r[4] = (sp_digit)t[4];
	t[5] += t[4] >> 32;
	r[5] = (sp_digit)t[5];
	t[6] += t[5] >> 32;
	r[6] = (sp_digit)t[6];
//	t[7] += t[6] >> 32;
//	r[7] = (sp_digit)t[7];
	r[7] = (sp_digit)t[7] + (sp_digit)(t[6] >> 32);
}
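
/* What the coefficient rows above encode: the function returns
 * a * 2^256 mod p256_mod, i.e. it converts a into Montgomery form
 * (R = 2^256). The signed sums appear to follow the usual NIST P-256
 * fast-reduction identities applied to the value a << 256; the final
 * "o" adjustment folds the word-7 overflow back in using
 * 2^256 == 2^224 - 2^192 - 2^96 + 1 (mod p256_mod).
 */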
/* Map the Montgomery form projective co-ordinate point to an affine point.
 *
 * r  Resulting affine co-ordinate point.
 * p  Montgomery form projective co-ordinate point.
 */
static void sp_256_map_8(sp_point* r, sp_point* p)
{
	sp_digit t1[8];
	sp_digit t2[8];

	sp_256_mont_inv_8(t1, p->z);

	sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);

	/* x /= z^2 */
	sp_256_mont_mul_and_reduce_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
	/* Reduce x to less than modulus */
	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
		sp_256_sub_8_p256_mod(r->x);
	sp_256_norm_8(r->x);

	/* y /= z^3 */
	sp_256_mont_mul_and_reduce_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
	/* Reduce y to less than modulus */
	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
		sp_256_sub_8_p256_mod(r->y);
	sp_256_norm_8(r->y);

	memset(r->z, 0, sizeof(r->z));
	r->z[0] = 1;
}

/* Double the Montgomery form projective point p.
 *
 * r  Result of doubling point.
 * p  Point to double.
 */
static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
{
	sp_digit t1[8];
	sp_digit t2[8];

	/* Put point to double into result */
	if (r != p)
		*r = *p; /* struct copy */

	if (r->infinity)
		return;

	/* T1 = Z * Z */
	sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
	/* Z = Y * Z */
	sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
	/* Z = 2Z */
	sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
	/* T2 = X - T1 */
	sp_256_mont_sub_8(t2, r->x, t1 /*, p256_mod*/);
	/* T1 = X + T1 */
	sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
	/* T2 = T1 * T2 */
	sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
	/* T1 = 3T2 */
	sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
	/* Y = 2Y */
	sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
	/* Y = Y * Y */
	sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
	/* T2 = Y * Y */
	sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
	/* T2 = T2/2 */
	sp_256_div2_8(t2 /*, p256_mod*/);
	/* Y = Y * X */
	sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
	/* X = T1 * T1 */
	sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
	/* X = X - Y */
	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
	/* X = X - Y */
	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
	/* Y = Y - X */
	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
	/* Y = Y * T1 */
	sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
	/* Y = Y - T2 */
	sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
	dump_512("y2 %s\n", r->y);
}

/* Add two Montgomery form projective points.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 */
static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
{
	sp_digit t1[8];
	sp_digit t2[8];
	sp_digit t3[8];
	sp_digit t4[8];
	sp_digit t5[8];

	/* Ensure only the first point is the same as the result. */
	if (q == r) {
		sp_point* a = p;
		p = q;
		q = a;
	}

	/* Check double */
	sp_256_sub_8(t1, p256_mod, q->y);
	sp_256_norm_8(t1);
	if (sp_256_cmp_equal_8(p->x, q->x)
	 && sp_256_cmp_equal_8(p->z, q->z)
	 && (sp_256_cmp_equal_8(p->y, q->y) || sp_256_cmp_equal_8(p->y, t1))
	) {
		sp_256_proj_point_dbl_8(r, p);
		return;
	}

	if (p->infinity || q->infinity) {
		*r = p->infinity ? *q : *p; /* struct copy */
		return;
	}

	/* U1 = X1*Z2^2 */
	sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
	/* U2 = X2*Z1^2 */
	sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
	/* S1 = Y1*Z2^3 */
	sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
	/* S2 = Y2*Z1^3 */
	sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
	/* H = U2 - U1 */
	sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
	/* R = S2 - S1 */
	sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
	/* Z3 = H*Z1*Z2 */
	sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
	/* X3 = R^2 - H^3 - 2*U1*H^2 */
	sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
	sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
	sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
	sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
	sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
}

/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * r    Resulting point.
 * g    Point to multiply.
 * k    Scalar to multiply by.
 * map  Indicates whether to convert result to affine.
 */
static void sp_256_ecc_mulmod_8(sp_point* r, const sp_point* g, const sp_digit* k /*, int map*/)
{
	enum { map = 1 }; /* we always convert result to affine coordinates */
	sp_point t[3];
	sp_digit n = n; /* for compiler */
	int c, y;

	memset(t, 0, sizeof(t));

	/* t[0] = {0, 0, 1} * norm */
	t[0].infinity = 1;
	/* t[1] = {g->x, g->y, g->z} * norm */
	sp_256_mod_mul_norm_8(t[1].x, g->x);
	sp_256_mod_mul_norm_8(t[1].y, g->y);
	sp_256_mod_mul_norm_8(t[1].z, g->z);

	/* For every bit, starting from most significant... */
	k += 7;
	c = 256;
	for (;;) {
		if ((c & 0x1f) == 0) {
			if (c == 0)
				break;
			n = *k--;
		}

		y = (n >> 31);
		dbg("y:%d t[%d] = t[0]+t[1]\n", y, y^1);
		sp_256_proj_point_add_8(&t[y^1], &t[0], &t[1]);
		dump_512("t[0].x %s\n", t[0].x);
		dump_512("t[0].y %s\n", t[0].y);
		dump_512("t[0].z %s\n", t[0].z);
		dump_512("t[1].x %s\n", t[1].x);
		dump_512("t[1].y %s\n", t[1].y);
		dump_512("t[1].z %s\n", t[1].z);
		dbg("t[2] = t[%d]\n", y);
		t[2] = t[y]; /* struct copy */
		dbg("t[2] *= 2\n");
		sp_256_proj_point_dbl_8(&t[2], &t[2]);
		dump_512("t[2].x %s\n", t[2].x);
		dump_512("t[2].y %s\n", t[2].y);
		dump_512("t[2].z %s\n", t[2].z);
		t[y] = t[2]; /* struct copy */

		n <<= 1;
		c--;
	}

	if (map)
		sp_256_map_8(r, &t[0]);
	else
		*r = t[0]; /* struct copy */

	memset(t, 0, sizeof(t)); //paranoia
}
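
/* Notes on the loop above: it is a Montgomery-ladder style scalar
 * multiplication. t[0] starts at infinity and t[1] at G (the input
 * point converted to Montgomery form), and the invariant
 * t[1] == t[0] + G holds on every iteration. For each key bit y
 * (most significant first) the code sets t[y^1] = t[0] + t[1] and
 * doubles t[y], so after all 256 bits t[0] == k*G. Every bit costs
 * one point addition plus one point doubling regardless of its value.
 */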
/* Multiply the base point of P256 by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * r    Resulting point.
 * k    Scalar to multiply by.
 * map  Indicates whether to convert result to affine.
 */
static void sp_256_ecc_mulmod_base_8(sp_point* r, sp_digit* k /*, int map*/)
{
	/* Since this function is called only once, save space:
	 * don't have "static const sp_point p256_base = {...}",
	 * it would have more zeros than data.
	 */
	static const uint8_t p256_base_bin[] = {
		/* x (big-endian) */
		0x6b,0x17,0xd1,0xf2,0xe1,0x2c,0x42,0x47,0xf8,0xbc,0xe6,0xe5,0x63,0xa4,0x40,0xf2,0x77,0x03,0x7d,0x81,0x2d,0xeb,0x33,0xa0,0xf4,0xa1,0x39,0x45,0xd8,0x98,0xc2,0x96,
		/* y */
		0x4f,0xe3,0x42,0xe2,0xfe,0x1a,0x7f,0x9b,0x8e,0xe7,0xeb,0x4a,0x7c,0x0f,0x9e,0x16,0x2b,0xce,0x33,0x57,0x6b,0x31,0x5e,0xce,0xcb,0xb6,0x40,0x68,0x37,0xbf,0x51,0xf5,
		/* z will be set to 1, infinity flag to "false" */
	};
	sp_point p256_base;

	sp_256_point_from_bin2x32(&p256_base, p256_base_bin);

	sp_256_ecc_mulmod_8(r, &p256_base, k /*, map*/);
}

/* Multiply the point by the scalar and serialize the X ordinate.
 * The number is 0 padded to maximum size on output.
 *
 * priv     Scalar to multiply the point by.
 * pub2x32  Point to multiply.
 * out32    Buffer to hold X ordinate.
 */
static void sp_ecc_secret_gen_256(const sp_digit priv[8], const uint8_t *pub2x32, uint8_t* out32)
{
	sp_point point[1];

#if FIXED_PEER_PUBKEY
	memset((void*)pub2x32, 0x55, 64);
#endif
	dump_hex("peerkey %s\n", pub2x32, 32); /* in TLS, this is peer's public key */
	dump_hex("        %s\n", pub2x32 + 32, 32);

	sp_256_point_from_bin2x32(point, pub2x32);
	dump_512("point->x %s\n", point->x);
	dump_512("point->y %s\n", point->y);

	sp_256_ecc_mulmod_8(point, point, priv);

	sp_256_to_bin_8(point->x, out32);
	dump_hex("out32: %s\n", out32, 32);
}

/* Generates a random scalar in [1..order-1] range. */
static void sp_256_ecc_gen_k_8(sp_digit k[8])
{
	/* Since 32-bit words are "dense", no need to use
	 * sp_256_from_bin_8(k, buf) to convert random stream
	 * to sp_digit array - just store random bits there directly.
	 */
	tls_get_random(k, 8 * sizeof(k[0]));
#if FIXED_SECRET
	memset(k, 0x77, 8 * sizeof(k[0]));
#endif

// If scalar is too large, try again (pseudo-code)
//	if (k >= 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 - 1) // order of P256
//		goto pick_another_random;
//	k++; // ensure non-zero
	/* Simpler alternative, at the cost of not choosing some valid
	 * random values, and slightly non-uniform distribution */
	if (k[0] == 0)
		k[0] = 1;
	if (k[7] >= 0xffffffff)
		k[7] = 0xfffffffe;
}

/* Makes a random EC key pair. */
static void sp_ecc_make_key_256(sp_digit privkey[8], uint8_t *pubkey)
{
	sp_point point[1];

	sp_256_ecc_gen_k_8(privkey);
	dump_256("privkey %s\n", privkey);
	sp_256_ecc_mulmod_base_8(point, privkey);
	dump_512("point->x %s\n", point->x);
	dump_512("point->y %s\n", point->y);
	sp_256_to_bin_8(point->x, pubkey);
	sp_256_to_bin_8(point->y, pubkey + 32);

	memset(point, 0, sizeof(point)); //paranoia
}

void FAST_FUNC curve_P256_compute_pubkey_and_premaster(
		uint8_t *pubkey2x32, uint8_t *premaster32,
		const uint8_t *peerkey2x32)
{
	sp_digit privkey[8];

	dump_hex("peerkey2x32: %s\n", peerkey2x32, 64);
	sp_ecc_make_key_256(privkey, pubkey2x32);
	dump_hex("pubkey: %s\n", pubkey2x32, 32);
	dump_hex("        %s\n", pubkey2x32 + 32, 32);

	/* Combine our privkey and peer's public key to generate premaster */
	sp_ecc_secret_gen_256(privkey, /*x,y:*/peerkey2x32, premaster32);
	dump_hex("premaster: %s\n", premaster32, 32);
}
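
/* Usage sketch (illustrative only, not part of this file): the public
 * entry point above is what a TLS client uses for an ECDHE-P256
 * exchange. Both sides arrive at the same premaster because
 * priv_A * (priv_B * G) == priv_B * (priv_A * G).
 */
#if 0
void example_p256_ecdhe(const uint8_t server_pubkey2x32[64])
{
	uint8_t our_pubkey2x32[64]; /* sent to the peer */
	uint8_t premaster32[32];    /* fed into the TLS key derivation */

	curve_P256_compute_pubkey_and_premaster(our_pubkey2x32, premaster32,
			server_pubkey2x32);
}
#endif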