f_impl64.c 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. /*
  2. * Copyright 2017-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. * Copyright 2014 Cryptography Research, Inc.
  4. *
  5. * Licensed under the OpenSSL license (the "License"). You may not use
  6. * this file except in compliance with the License. You can obtain a copy
  7. * in the file LICENSE in the source distribution or at
  8. * https://www.openssl.org/source/license.html
  9. *
  10. * Originally written by Mike Hamburg
  11. */
  12. #include "e_os.h"
  13. #include <openssl/macros.h>
  14. #include "internal/numbers.h"
  15. #ifndef UINT128_MAX
  16. /* No support for 128 bit ints, so do nothing here */
  17. NON_EMPTY_TRANSLATION_UNIT
  18. #else
  19. # include "../field.h"
  20. void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
  21. {
  22. const uint64_t *a = as->limb, *b = bs->limb;
  23. uint64_t *c = cs->limb;
  24. uint128_t accum0 = 0, accum1 = 0, accum2;
  25. uint64_t mask = (1ULL << 56) - 1;
  26. uint64_t aa[4], bb[4], bbb[4];
  27. unsigned int i, j;
  28. for (i = 0; i < 4; i++) {
  29. aa[i] = a[i] + a[i + 4];
  30. bb[i] = b[i] + b[i + 4];
  31. bbb[i] = bb[i] + b[i + 4];
  32. }
  33. for (i = 0; i < 4; i++) {
  34. accum2 = 0;
  35. for (j = 0; j <= i; j++) {
  36. accum2 += widemul(a[j], b[i - j]);
  37. accum1 += widemul(aa[j], bb[i - j]);
  38. accum0 += widemul(a[j + 4], b[i - j + 4]);
  39. }
  40. for (; j < 4; j++) {
  41. accum2 += widemul(a[j], b[i - j + 8]);
  42. accum1 += widemul(aa[j], bbb[i - j + 4]);
  43. accum0 += widemul(a[j + 4], bb[i - j + 4]);
  44. }
  45. accum1 -= accum2;
  46. accum0 += accum2;
  47. c[i] = ((uint64_t)(accum0)) & mask;
  48. c[i + 4] = ((uint64_t)(accum1)) & mask;
  49. accum0 >>= 56;
  50. accum1 >>= 56;
  51. }
  52. accum0 += accum1;
  53. accum0 += c[4];
  54. accum1 += c[0];
  55. c[4] = ((uint64_t)(accum0)) & mask;
  56. c[0] = ((uint64_t)(accum1)) & mask;
  57. accum0 >>= 56;
  58. accum1 >>= 56;
  59. c[5] += ((uint64_t)(accum0));
  60. c[1] += ((uint64_t)(accum1));
  61. }
  62. void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
  63. {
  64. const uint64_t *a = as->limb;
  65. uint64_t *c = cs->limb;
  66. uint128_t accum0 = 0, accum4 = 0;
  67. uint64_t mask = (1ULL << 56) - 1;
  68. int i;
  69. for (i = 0; i < 4; i++) {
  70. accum0 += widemul(b, a[i]);
  71. accum4 += widemul(b, a[i + 4]);
  72. c[i] = accum0 & mask;
  73. accum0 >>= 56;
  74. c[i + 4] = accum4 & mask;
  75. accum4 >>= 56;
  76. }
  77. accum0 += accum4 + c[4];
  78. c[4] = accum0 & mask;
  79. c[5] += accum0 >> 56;
  80. accum4 += c[0];
  81. c[0] = accum4 & mask;
  82. c[1] += accum4 >> 56;
  83. }
  84. void gf_sqr(gf_s * RESTRICT cs, const gf as)
  85. {
  86. const uint64_t *a = as->limb;
  87. uint64_t *c = cs->limb;
  88. uint128_t accum0 = 0, accum1 = 0, accum2;
  89. uint64_t mask = (1ULL << 56) - 1;
  90. uint64_t aa[4];
  91. unsigned int i;
  92. /* For some reason clang doesn't vectorize this without prompting? */
  93. for (i = 0; i < 4; i++)
  94. aa[i] = a[i] + a[i + 4];
  95. accum2 = widemul(a[0], a[3]);
  96. accum0 = widemul(aa[0], aa[3]);
  97. accum1 = widemul(a[4], a[7]);
  98. accum2 += widemul(a[1], a[2]);
  99. accum0 += widemul(aa[1], aa[2]);
  100. accum1 += widemul(a[5], a[6]);
  101. accum0 -= accum2;
  102. accum1 += accum2;
  103. c[3] = ((uint64_t)(accum1)) << 1 & mask;
  104. c[7] = ((uint64_t)(accum0)) << 1 & mask;
  105. accum0 >>= 55;
  106. accum1 >>= 55;
  107. accum0 += widemul(2 * aa[1], aa[3]);
  108. accum1 += widemul(2 * a[5], a[7]);
  109. accum0 += widemul(aa[2], aa[2]);
  110. accum1 += accum0;
  111. accum0 -= widemul(2 * a[1], a[3]);
  112. accum1 += widemul(a[6], a[6]);
  113. accum2 = widemul(a[0], a[0]);
  114. accum1 -= accum2;
  115. accum0 += accum2;
  116. accum0 -= widemul(a[2], a[2]);
  117. accum1 += widemul(aa[0], aa[0]);
  118. accum0 += widemul(a[4], a[4]);
  119. c[0] = ((uint64_t)(accum0)) & mask;
  120. c[4] = ((uint64_t)(accum1)) & mask;
  121. accum0 >>= 56;
  122. accum1 >>= 56;
  123. accum2 = widemul(2 * aa[2], aa[3]);
  124. accum0 -= widemul(2 * a[2], a[3]);
  125. accum1 += widemul(2 * a[6], a[7]);
  126. accum1 += accum2;
  127. accum0 += accum2;
  128. accum2 = widemul(2 * a[0], a[1]);
  129. accum1 += widemul(2 * aa[0], aa[1]);
  130. accum0 += widemul(2 * a[4], a[5]);
  131. accum1 -= accum2;
  132. accum0 += accum2;
  133. c[1] = ((uint64_t)(accum0)) & mask;
  134. c[5] = ((uint64_t)(accum1)) & mask;
  135. accum0 >>= 56;
  136. accum1 >>= 56;
  137. accum2 = widemul(aa[3], aa[3]);
  138. accum0 -= widemul(a[3], a[3]);
  139. accum1 += widemul(a[7], a[7]);
  140. accum1 += accum2;
  141. accum0 += accum2;
  142. accum2 = widemul(2 * a[0], a[2]);
  143. accum1 += widemul(2 * aa[0], aa[2]);
  144. accum0 += widemul(2 * a[4], a[6]);
  145. accum2 += widemul(a[1], a[1]);
  146. accum1 += widemul(aa[1], aa[1]);
  147. accum0 += widemul(a[5], a[5]);
  148. accum1 -= accum2;
  149. accum0 += accum2;
  150. c[2] = ((uint64_t)(accum0)) & mask;
  151. c[6] = ((uint64_t)(accum1)) & mask;
  152. accum0 >>= 56;
  153. accum1 >>= 56;
  154. accum0 += c[3];
  155. accum1 += c[7];
  156. c[3] = ((uint64_t)(accum0)) & mask;
  157. c[7] = ((uint64_t)(accum1)) & mask;
  158. /* we could almost stop here, but it wouldn't be stable, so... */
  159. accum0 >>= 56;
  160. accum1 >>= 56;
  161. c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
  162. c[0] += ((uint64_t)(accum1));
  163. }
  164. #endif