poly1305.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919
  1. /* poly1305.c
  2. *
  3. * Copyright (C) 2006-2020 wolfSSL Inc.
  4. *
  5. * This file is part of wolfSSL.
  6. *
  7. * wolfSSL is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * wolfSSL is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
  20. */
/*
DESCRIPTION
This library contains an implementation of the Poly1305 authenticator.
It is based on the public domain implementations by Andrew Moon
and Daniel J. Bernstein.
*/
  27. #ifdef HAVE_CONFIG_H
  28. #include <config.h>
  29. #endif
  30. #include <wolfssl/wolfcrypt/settings.h>
  31. #ifdef HAVE_POLY1305
  32. #include <wolfssl/wolfcrypt/poly1305.h>
  33. #include <wolfssl/wolfcrypt/error-crypt.h>
  34. #include <wolfssl/wolfcrypt/logging.h>
  35. #include <wolfssl/wolfcrypt/cpuid.h>
  36. #ifdef NO_INLINE
  37. #include <wolfssl/wolfcrypt/misc.h>
  38. #else
  39. #define WOLFSSL_MISC_INCLUDED
  40. #include <wolfcrypt/src/misc.c>
  41. #endif
  42. #ifdef CHACHA_AEAD_TEST
  43. #include <stdio.h>
  44. #endif
  45. #ifdef _MSC_VER
  46. /* 4127 warning constant while(1) */
  47. #pragma warning(disable: 4127)
  48. #endif
  49. #ifdef USE_INTEL_SPEEDUP
  50. #include <emmintrin.h>
  51. #include <immintrin.h>
  52. #if defined(__GNUC__) && ((__GNUC__ < 4) || \
  53. (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
  54. #undef NO_AVX2_SUPPORT
  55. #define NO_AVX2_SUPPORT
  56. #endif
  57. #if defined(__clang__) && ((__clang_major__ < 3) || \
  58. (__clang_major__ == 3 && __clang_minor__ <= 5))
  59. #define NO_AVX2_SUPPORT
  60. #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
  61. #undef NO_AVX2_SUPPORT
  62. #endif
  63. #define HAVE_INTEL_AVX1
  64. #ifndef NO_AVX2_SUPPORT
  65. #define HAVE_INTEL_AVX2
  66. #endif
  67. #endif
  68. #ifdef USE_INTEL_SPEEDUP
  69. static word32 intel_flags = 0;
  70. static word32 cpu_flags_set = 0;
  71. #endif
  72. #if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
  73. #if defined(_MSC_VER)
  74. #define POLY1305_NOINLINE __declspec(noinline)
  75. #elif defined(__GNUC__)
  76. #define POLY1305_NOINLINE __attribute__((noinline))
  77. #else
  78. #define POLY1305_NOINLINE
  79. #endif
  80. #if defined(_MSC_VER)
  81. #include <intrin.h>
  82. typedef struct word128 {
  83. word64 lo;
  84. word64 hi;
  85. } word128;
  86. #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
  87. #define ADD(out, in) { word64 t = out.lo; out.lo += in.lo; \
  88. out.hi += (out.lo < t) + in.hi; }
  89. #define ADDLO(out, in) { word64 t = out.lo; out.lo += in; \
  90. out.hi += (out.lo < t); }
  91. #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
  92. #define LO(in) (in.lo)
  93. #elif defined(__GNUC__)
  94. #if defined(__SIZEOF_INT128__)
  95. typedef unsigned __int128 word128;
  96. #else
  97. typedef unsigned word128 __attribute__((mode(TI)));
  98. #endif
  99. #define MUL(out, x, y) out = ((word128)x * y)
  100. #define ADD(out, in) out += in
  101. #define ADDLO(out, in) out += in
  102. #define SHR(in, shift) (word64)(in >> (shift))
  103. #define LO(in) (word64)(in)
  104. #endif
  105. #endif
  106. #ifdef USE_INTEL_SPEEDUP
  107. #ifdef __cplusplus
  108. extern "C" {
  109. #endif
  110. #ifdef HAVE_INTEL_AVX1
  111. /* Process one block (16 bytes) of data.
  112. *
  113. * ctx Poly1305 context.
  114. * m One block of message data.
  115. */
  116. extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
  117. /* Process multiple blocks (n * 16 bytes) of data.
  118. *
  119. * ctx Poly1305 context.
  120. * m Blocks of message data.
  121. * bytes The number of bytes to process.
  122. */
  123. extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
  124. size_t bytes);
  125. /* Set the key to use when processing data.
  126. * Initialize the context.
  127. *
  128. * ctx Poly1305 context.
  129. * key The key data (16 bytes).
  130. */
  131. extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
  132. /* Calculate the final result - authentication data.
  133. * Zeros out the private data in the context.
  134. *
  135. * ctx Poly1305 context.
  136. * mac Buffer to hold 16 bytes.
  137. */
  138. extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
  139. #endif
  140. #ifdef HAVE_INTEL_AVX2
  141. /* Process multiple blocks (n * 16 bytes) of data.
  142. *
  143. * ctx Poly1305 context.
  144. * m Blocks of message data.
  145. * bytes The number of bytes to process.
  146. */
  147. extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
  148. size_t bytes);
  149. /* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
  150. *
  151. * ctx Poly1305 context.
  152. */
  153. extern void poly1305_calc_powers_avx2(Poly1305* ctx);
  154. /* Set the key to use when processing data.
  155. * Initialize the context.
  156. * Calls AVX set key function as final function calls AVX code.
  157. *
  158. * ctx Poly1305 context.
  159. * key The key data (16 bytes).
  160. */
  161. extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
  162. /* Calculate the final result - authentication data.
  163. * Zeros out the private data in the context.
  164. * Calls AVX final function to quickly process last blocks.
  165. *
  166. * ctx Poly1305 context.
  167. * mac Buffer to hold 16 bytes - authentication data.
  168. */
  169. extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
  170. #endif
  171. #ifdef __cplusplus
  172. } /* extern "C" */
  173. #endif
  174. #elif defined(POLY130564)
  175. #ifndef WOLFSSL_ARMASM
  176. static word64 U8TO64(const byte* p)
  177. {
  178. return
  179. (((word64)(p[0] & 0xff) ) |
  180. ((word64)(p[1] & 0xff) << 8) |
  181. ((word64)(p[2] & 0xff) << 16) |
  182. ((word64)(p[3] & 0xff) << 24) |
  183. ((word64)(p[4] & 0xff) << 32) |
  184. ((word64)(p[5] & 0xff) << 40) |
  185. ((word64)(p[6] & 0xff) << 48) |
  186. ((word64)(p[7] & 0xff) << 56));
  187. }
  188. static void U64TO8(byte* p, word64 v) {
  189. p[0] = (v ) & 0xff;
  190. p[1] = (v >> 8) & 0xff;
  191. p[2] = (v >> 16) & 0xff;
  192. p[3] = (v >> 24) & 0xff;
  193. p[4] = (v >> 32) & 0xff;
  194. p[5] = (v >> 40) & 0xff;
  195. p[6] = (v >> 48) & 0xff;
  196. p[7] = (v >> 56) & 0xff;
  197. }
  198. #endif/* WOLFSSL_ARMASM */
  199. #else /* if not 64 bit then use 32 bit */
  200. static word32 U8TO32(const byte *p)
  201. {
  202. return
  203. (((word32)(p[0] & 0xff) ) |
  204. ((word32)(p[1] & 0xff) << 8) |
  205. ((word32)(p[2] & 0xff) << 16) |
  206. ((word32)(p[3] & 0xff) << 24));
  207. }
  208. static void U32TO8(byte *p, word32 v) {
  209. p[0] = (byte)((v ) & 0xff);
  210. p[1] = (byte)((v >> 8) & 0xff);
  211. p[2] = (byte)((v >> 16) & 0xff);
  212. p[3] = (byte)((v >> 24) & 0xff);
  213. }
  214. #endif
  215. /* convert 32-bit unsigned to little endian 64 bit type as byte array */
  216. static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
  217. {
  218. #ifndef WOLFSSL_X86_64_BUILD
  219. outLe64[0] = (byte)(inLe32 & 0x000000FF);
  220. outLe64[1] = (byte)((inLe32 & 0x0000FF00) >> 8);
  221. outLe64[2] = (byte)((inLe32 & 0x00FF0000) >> 16);
  222. outLe64[3] = (byte)((inLe32 & 0xFF000000) >> 24);
  223. outLe64[4] = 0;
  224. outLe64[5] = 0;
  225. outLe64[6] = 0;
  226. outLe64[7] = 0;
  227. #else
  228. *(word64*)outLe64 = inLe32;
  229. #endif
  230. }
  231. #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
  232. /*
  233. This local function operates on a message with a given number of bytes
  234. with a given ctx pointer to a Poly1305 structure.
  235. */
  236. static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
  237. size_t bytes)
  238. {
  239. #ifdef USE_INTEL_SPEEDUP
  240. /* AVX2 is handled in wc_Poly1305Update. */
  241. SAVE_VECTOR_REGISTERS();
  242. poly1305_blocks_avx(ctx, m, bytes);
  243. RESTORE_VECTOR_REGISTERS();
  244. #elif defined(POLY130564)
  245. const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
  246. word64 r0,r1,r2;
  247. word64 s1,s2;
  248. word64 h0,h1,h2;
  249. word64 c;
  250. word128 d0,d1,d2,d;
  251. r0 = ctx->r[0];
  252. r1 = ctx->r[1];
  253. r2 = ctx->r[2];
  254. h0 = ctx->h[0];
  255. h1 = ctx->h[1];
  256. h2 = ctx->h[2];
  257. s1 = r1 * (5 << 2);
  258. s2 = r2 * (5 << 2);
  259. while (bytes >= POLY1305_BLOCK_SIZE) {
  260. word64 t0,t1;
  261. /* h += m[i] */
  262. t0 = U8TO64(&m[0]);
  263. t1 = U8TO64(&m[8]);
  264. h0 += (( t0 ) & 0xfffffffffff);
  265. h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
  266. h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;
  267. /* h *= r */
  268. MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
  269. MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
  270. MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
  271. /* (partial) h %= p */
  272. c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
  273. ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
  274. ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
  275. h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
  276. h1 += c;
  277. m += POLY1305_BLOCK_SIZE;
  278. bytes -= POLY1305_BLOCK_SIZE;
  279. }
  280. ctx->h[0] = h0;
  281. ctx->h[1] = h1;
  282. ctx->h[2] = h2;
  283. #else /* if not 64 bit then use 32 bit */
  284. const word32 hibit = (ctx->finished) ? 0 : ((word32)1 << 24); /* 1 << 128 */
  285. word32 r0,r1,r2,r3,r4;
  286. word32 s1,s2,s3,s4;
  287. word32 h0,h1,h2,h3,h4;
  288. word64 d0,d1,d2,d3,d4;
  289. word32 c;
  290. r0 = ctx->r[0];
  291. r1 = ctx->r[1];
  292. r2 = ctx->r[2];
  293. r3 = ctx->r[3];
  294. r4 = ctx->r[4];
  295. s1 = r1 * 5;
  296. s2 = r2 * 5;
  297. s3 = r3 * 5;
  298. s4 = r4 * 5;
  299. h0 = ctx->h[0];
  300. h1 = ctx->h[1];
  301. h2 = ctx->h[2];
  302. h3 = ctx->h[3];
  303. h4 = ctx->h[4];
  304. while (bytes >= POLY1305_BLOCK_SIZE) {
  305. /* h += m[i] */
  306. h0 += (U8TO32(m+ 0) ) & 0x3ffffff;
  307. h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
  308. h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
  309. h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
  310. h4 += (U8TO32(m+12) >> 8) | hibit;
  311. /* h *= r */
  312. d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
  313. ((word64)h3 * s2) + ((word64)h4 * s1);
  314. d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
  315. ((word64)h3 * s3) + ((word64)h4 * s2);
  316. d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
  317. ((word64)h3 * s4) + ((word64)h4 * s3);
  318. d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
  319. ((word64)h3 * r0) + ((word64)h4 * s4);
  320. d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
  321. ((word64)h3 * r1) + ((word64)h4 * r0);
  322. /* (partial) h %= p */
  323. c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
  324. d1 += c; c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
  325. d2 += c; c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
  326. d3 += c; c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
  327. d4 += c; c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
  328. h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
  329. h1 += c;
  330. m += POLY1305_BLOCK_SIZE;
  331. bytes -= POLY1305_BLOCK_SIZE;
  332. }
  333. ctx->h[0] = h0;
  334. ctx->h[1] = h1;
  335. ctx->h[2] = h2;
  336. ctx->h[3] = h3;
  337. ctx->h[4] = h4;
  338. #endif /* end of 64 bit cpu blocks or 32 bit cpu */
  339. }
  340. /*
  341. This local function is used for the last call when a message with a given
  342. number of bytes is less than the block size.
  343. */
  344. static void poly1305_block(Poly1305* ctx, const unsigned char *m)
  345. {
  346. #ifdef USE_INTEL_SPEEDUP
  347. /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
  348. SAVE_VECTOR_REGISTERS();
  349. poly1305_block_avx(ctx, m);
  350. RESTORE_VECTOR_REGISTERS();
  351. #else
  352. poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
  353. #endif
  354. }
  355. #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
  356. #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
  357. int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
  358. {
  359. #if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
  360. word64 t0,t1;
  361. #endif
  362. if (key == NULL)
  363. return BAD_FUNC_ARG;
  364. #ifdef CHACHA_AEAD_TEST
  365. word32 k;
  366. printf("Poly key used:\n");
  367. for (k = 0; k < keySz; k++) {
  368. printf("%02x", key[k]);
  369. if ((k+1) % 8 == 0)
  370. printf("\n");
  371. }
  372. printf("\n");
  373. #endif
  374. if (keySz != 32 || ctx == NULL)
  375. return BAD_FUNC_ARG;
  376. #ifdef USE_INTEL_SPEEDUP
  377. if (!cpu_flags_set) {
  378. intel_flags = cpuid_get_flags();
  379. cpu_flags_set = 1;
  380. }
  381. SAVE_VECTOR_REGISTERS();
  382. #ifdef HAVE_INTEL_AVX2
  383. if (IS_INTEL_AVX2(intel_flags))
  384. poly1305_setkey_avx2(ctx, key);
  385. else
  386. #endif
  387. poly1305_setkey_avx(ctx, key);
  388. RESTORE_VECTOR_REGISTERS();
  389. #elif defined(POLY130564)
  390. /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
  391. t0 = U8TO64(key + 0);
  392. t1 = U8TO64(key + 8);
  393. ctx->r[0] = ( t0 ) & 0xffc0fffffff;
  394. ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
  395. ctx->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f;
  396. /* h (accumulator) = 0 */
  397. ctx->h[0] = 0;
  398. ctx->h[1] = 0;
  399. ctx->h[2] = 0;
  400. /* save pad for later */
  401. ctx->pad[0] = U8TO64(key + 16);
  402. ctx->pad[1] = U8TO64(key + 24);
  403. ctx->leftover = 0;
  404. ctx->finished = 0;
  405. #else /* if not 64 bit then use 32 bit */
  406. /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
  407. ctx->r[0] = (U8TO32(key + 0) ) & 0x3ffffff;
  408. ctx->r[1] = (U8TO32(key + 3) >> 2) & 0x3ffff03;
  409. ctx->r[2] = (U8TO32(key + 6) >> 4) & 0x3ffc0ff;
  410. ctx->r[3] = (U8TO32(key + 9) >> 6) & 0x3f03fff;
  411. ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;
  412. /* h = 0 */
  413. ctx->h[0] = 0;
  414. ctx->h[1] = 0;
  415. ctx->h[2] = 0;
  416. ctx->h[3] = 0;
  417. ctx->h[4] = 0;
  418. /* save pad for later */
  419. ctx->pad[0] = U8TO32(key + 16);
  420. ctx->pad[1] = U8TO32(key + 20);
  421. ctx->pad[2] = U8TO32(key + 24);
  422. ctx->pad[3] = U8TO32(key + 28);
  423. ctx->leftover = 0;
  424. ctx->finished = 0;
  425. #endif
  426. return 0;
  427. }
  428. int wc_Poly1305Final(Poly1305* ctx, byte* mac)
  429. {
  430. #ifdef USE_INTEL_SPEEDUP
  431. #elif defined(POLY130564)
  432. word64 h0,h1,h2,c;
  433. word64 g0,g1,g2;
  434. word64 t0,t1;
  435. #else
  436. word32 h0,h1,h2,h3,h4,c;
  437. word32 g0,g1,g2,g3,g4;
  438. word64 f;
  439. word32 mask;
  440. #endif
  441. if (ctx == NULL || mac == NULL)
  442. return BAD_FUNC_ARG;
  443. #ifdef USE_INTEL_SPEEDUP
  444. SAVE_VECTOR_REGISTERS();
  445. #ifdef HAVE_INTEL_AVX2
  446. if (IS_INTEL_AVX2(intel_flags))
  447. poly1305_final_avx2(ctx, mac);
  448. else
  449. #endif
  450. poly1305_final_avx(ctx, mac);
  451. RESTORE_VECTOR_REGISTERS();
  452. #elif defined(POLY130564)
  453. /* process the remaining block */
  454. if (ctx->leftover) {
  455. size_t i = ctx->leftover;
  456. ctx->buffer[i] = 1;
  457. for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
  458. ctx->buffer[i] = 0;
  459. ctx->finished = 1;
  460. poly1305_block(ctx, ctx->buffer);
  461. }
  462. /* fully carry h */
  463. h0 = ctx->h[0];
  464. h1 = ctx->h[1];
  465. h2 = ctx->h[2];
  466. c = (h1 >> 44); h1 &= 0xfffffffffff;
  467. h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
  468. h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
  469. h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
  470. h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
  471. h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
  472. h1 += c;
  473. /* compute h + -p */
  474. g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
  475. g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
  476. g2 = h2 + c - ((word64)1 << 42);
  477. /* select h if h < p, or h + -p if h >= p */
  478. c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
  479. g0 &= c;
  480. g1 &= c;
  481. g2 &= c;
  482. c = ~c;
  483. h0 = (h0 & c) | g0;
  484. h1 = (h1 & c) | g1;
  485. h2 = (h2 & c) | g2;
  486. /* h = (h + pad) */
  487. t0 = ctx->pad[0];
  488. t1 = ctx->pad[1];
  489. h0 += (( t0 ) & 0xfffffffffff) ;
  490. c = (h0 >> 44); h0 &= 0xfffffffffff;
  491. h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
  492. c = (h1 >> 44); h1 &= 0xfffffffffff;
  493. h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c;
  494. h2 &= 0x3ffffffffff;
  495. /* mac = h % (2^128) */
  496. h0 = ((h0 ) | (h1 << 44));
  497. h1 = ((h1 >> 20) | (h2 << 24));
  498. U64TO8(mac + 0, h0);
  499. U64TO8(mac + 8, h1);
  500. /* zero out the state */
  501. ctx->h[0] = 0;
  502. ctx->h[1] = 0;
  503. ctx->h[2] = 0;
  504. ctx->r[0] = 0;
  505. ctx->r[1] = 0;
  506. ctx->r[2] = 0;
  507. ctx->pad[0] = 0;
  508. ctx->pad[1] = 0;
  509. #else /* if not 64 bit then use 32 bit */
  510. /* process the remaining block */
  511. if (ctx->leftover) {
  512. size_t i = ctx->leftover;
  513. ctx->buffer[i++] = 1;
  514. for (; i < POLY1305_BLOCK_SIZE; i++)
  515. ctx->buffer[i] = 0;
  516. ctx->finished = 1;
  517. poly1305_block(ctx, ctx->buffer);
  518. }
  519. /* fully carry h */
  520. h0 = ctx->h[0];
  521. h1 = ctx->h[1];
  522. h2 = ctx->h[2];
  523. h3 = ctx->h[3];
  524. h4 = ctx->h[4];
  525. c = h1 >> 26; h1 = h1 & 0x3ffffff;
  526. h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
  527. h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
  528. h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
  529. h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
  530. h1 += c;
  531. /* compute h + -p */
  532. g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
  533. g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
  534. g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
  535. g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
  536. g4 = h4 + c - ((word32)1 << 26);
  537. /* select h if h < p, or h + -p if h >= p */
  538. mask = ((word32)g4 >> ((sizeof(word32) * 8) - 1)) - 1;
  539. g0 &= mask;
  540. g1 &= mask;
  541. g2 &= mask;
  542. g3 &= mask;
  543. g4 &= mask;
  544. mask = ~mask;
  545. h0 = (h0 & mask) | g0;
  546. h1 = (h1 & mask) | g1;
  547. h2 = (h2 & mask) | g2;
  548. h3 = (h3 & mask) | g3;
  549. h4 = (h4 & mask) | g4;
  550. /* h = h % (2^128) */
  551. h0 = ((h0 ) | (h1 << 26)) & 0xffffffff;
  552. h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
  553. h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
  554. h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
  555. /* mac = (h + pad) % (2^128) */
  556. f = (word64)h0 + ctx->pad[0] ; h0 = (word32)f;
  557. f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f;
  558. f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f;
  559. f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f;
  560. U32TO8(mac + 0, h0);
  561. U32TO8(mac + 4, h1);
  562. U32TO8(mac + 8, h2);
  563. U32TO8(mac + 12, h3);
  564. /* zero out the state */
  565. ctx->h[0] = 0;
  566. ctx->h[1] = 0;
  567. ctx->h[2] = 0;
  568. ctx->h[3] = 0;
  569. ctx->h[4] = 0;
  570. ctx->r[0] = 0;
  571. ctx->r[1] = 0;
  572. ctx->r[2] = 0;
  573. ctx->r[3] = 0;
  574. ctx->r[4] = 0;
  575. ctx->pad[0] = 0;
  576. ctx->pad[1] = 0;
  577. ctx->pad[2] = 0;
  578. ctx->pad[3] = 0;
  579. #endif
  580. return 0;
  581. }
  582. #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
  583. int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
  584. {
  585. size_t i;
  586. if (ctx == NULL || (m == NULL && bytes > 0))
  587. return BAD_FUNC_ARG;
  588. if (bytes == 0) {
  589. /* valid, but do nothing */
  590. return 0;
  591. }
  592. #ifdef CHACHA_AEAD_TEST
  593. word32 k;
  594. printf("Raw input to poly:\n");
  595. for (k = 0; k < bytes; k++) {
  596. printf("%02x", m[k]);
  597. if ((k+1) % 16 == 0)
  598. printf("\n");
  599. }
  600. printf("\n");
  601. #endif
  602. #ifdef USE_INTEL_SPEEDUP
  603. #ifdef HAVE_INTEL_AVX2
  604. if (IS_INTEL_AVX2(intel_flags)) {
  605. /* handle leftover */
  606. if (ctx->leftover) {
  607. size_t want = sizeof(ctx->buffer) - ctx->leftover;
  608. if (want > bytes)
  609. want = bytes;
  610. for (i = 0; i < want; i++)
  611. ctx->buffer[ctx->leftover + i] = m[i];
  612. bytes -= (word32)want;
  613. m += want;
  614. ctx->leftover += want;
  615. if (ctx->leftover < sizeof(ctx->buffer))
  616. return 0;
  617. SAVE_VECTOR_REGISTERS();
  618. if (!ctx->started)
  619. poly1305_calc_powers_avx2(ctx);
  620. poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
  621. ctx->leftover = 0;
  622. } else
  623. SAVE_VECTOR_REGISTERS();
  624. /* process full blocks */
  625. if (bytes >= sizeof(ctx->buffer)) {
  626. size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
  627. if (!ctx->started)
  628. poly1305_calc_powers_avx2(ctx);
  629. poly1305_blocks_avx2(ctx, m, want);
  630. m += want;
  631. bytes -= (word32)want;
  632. }
  633. /* store leftover */
  634. if (bytes) {
  635. for (i = 0; i < bytes; i++)
  636. ctx->buffer[ctx->leftover + i] = m[i];
  637. ctx->leftover += bytes;
  638. }
  639. RESTORE_VECTOR_REGISTERS();
  640. }
  641. else
  642. #endif
  643. #endif
  644. {
  645. /* handle leftover */
  646. if (ctx->leftover) {
  647. size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
  648. if (want > bytes)
  649. want = bytes;
  650. for (i = 0; i < want; i++)
  651. ctx->buffer[ctx->leftover + i] = m[i];
  652. bytes -= (word32)want;
  653. m += want;
  654. ctx->leftover += want;
  655. if (ctx->leftover < POLY1305_BLOCK_SIZE)
  656. return 0;
  657. poly1305_block(ctx, ctx->buffer);
  658. ctx->leftover = 0;
  659. }
  660. /* process full blocks */
  661. if (bytes >= POLY1305_BLOCK_SIZE) {
  662. size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
  663. poly1305_blocks(ctx, m, want);
  664. m += want;
  665. bytes -= (word32)want;
  666. }
  667. /* store leftover */
  668. if (bytes) {
  669. for (i = 0; i < bytes; i++)
  670. ctx->buffer[ctx->leftover + i] = m[i];
  671. ctx->leftover += bytes;
  672. }
  673. }
  674. return 0;
  675. }
  676. /* Takes a Poly1305 struct that has a key loaded and pads the provided length
  677. ctx : Initialized Poly1305 struct to use
  678. lenToPad : Current number of bytes updated that needs padding to 16
  679. */
  680. int wc_Poly1305_Pad(Poly1305* ctx, word32 lenToPad)
  681. {
  682. int ret = 0;
  683. word32 paddingLen;
  684. byte padding[WC_POLY1305_PAD_SZ - 1];
  685. if (ctx == NULL) {
  686. return BAD_FUNC_ARG;
  687. }
  688. if (lenToPad == 0) {
  689. return 0; /* nothing needs to be done */
  690. }
  691. XMEMSET(padding, 0, sizeof(padding));
  692. /* Pad length to 16 bytes */
  693. paddingLen = -(int)lenToPad & (WC_POLY1305_PAD_SZ - 1);
  694. if (paddingLen > 0) {
  695. ret = wc_Poly1305Update(ctx, padding, paddingLen);
  696. }
  697. return ret;
  698. }
  699. /* Takes a Poly1305 struct that has a key loaded and adds the AEAD length
  700. encoding in 64-bit little endian
  701. aadSz : Size of the additional authentication data
  702. dataSz : Size of the plaintext or ciphertext
  703. */
  704. int wc_Poly1305_EncodeSizes(Poly1305* ctx, word32 aadSz, word32 dataSz)
  705. {
  706. int ret;
  707. byte little64[16]; /* sizeof(word64) * 2 */
  708. if (ctx == NULL) {
  709. return BAD_FUNC_ARG;
  710. }
  711. XMEMSET(little64, 0, sizeof(little64));
  712. /* size of additional data and input data as little endian 64 bit types */
  713. u32tole64(aadSz, little64);
  714. u32tole64(dataSz, little64 + 8);
  715. ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
  716. return ret;
  717. }
  718. #ifdef WORD64_AVAILABLE
  719. int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz, word64 dataSz)
  720. {
  721. int ret;
  722. word64 little64[2];
  723. if (ctx == NULL) {
  724. return BAD_FUNC_ARG;
  725. }
  726. #ifdef BIG_ENDIAN_ORDER
  727. little64[0] = ByteReverseWord64(aadSz);
  728. little64[1] = ByteReverseWord64(dataSz);
  729. #else
  730. little64[0] = aadSz;
  731. little64[1] = dataSz;
  732. #endif
  733. ret = wc_Poly1305Update(ctx, (byte *)little64, sizeof(little64));
  734. return ret;
  735. }
  736. #endif
  737. /* Takes in an initialized Poly1305 struct that has a key loaded and creates
  738. a MAC (tag) using recent TLS AEAD padding scheme.
  739. ctx : Initialized Poly1305 struct to use
  740. additional : Additional data to use
  741. addSz : Size of additional buffer
  742. input : Input buffer to create tag from
  743. sz : Size of input buffer
  744. tag : Buffer to hold created tag
  745. tagSz : Size of input tag buffer (must be at least
  746. WC_POLY1305_MAC_SZ(16))
  747. */
  748. int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz,
  749. byte* input, word32 sz, byte* tag, word32 tagSz)
  750. {
  751. int ret;
  752. /* sanity check on arguments */
  753. if (ctx == NULL || input == NULL || tag == NULL ||
  754. tagSz < WC_POLY1305_MAC_SZ) {
  755. return BAD_FUNC_ARG;
  756. }
  757. /* additional allowed to be 0 */
  758. if (addSz > 0) {
  759. if (additional == NULL)
  760. return BAD_FUNC_ARG;
  761. /* additional data plus padding */
  762. if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
  763. return ret;
  764. }
  765. /* pad additional data */
  766. if ((ret = wc_Poly1305_Pad(ctx, addSz)) != 0) {
  767. return ret;
  768. }
  769. }
  770. /* input plus padding */
  771. if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
  772. return ret;
  773. }
  774. /* pad input data */
  775. if ((ret = wc_Poly1305_Pad(ctx, sz)) != 0) {
  776. return ret;
  777. }
  778. /* encode size of AAD and input data as little endian 64 bit types */
  779. if ((ret = wc_Poly1305_EncodeSizes(ctx, addSz, sz)) != 0) {
  780. return ret;
  781. }
  782. /* Finalize the auth tag */
  783. ret = wc_Poly1305Final(ctx, tag);
  784. return ret;
  785. }
  786. #endif /* HAVE_POLY1305 */