auth.s

/**
* @author Billy Brumley <billy.brumley at aalto dot fi>
* @version 1.0
* @since 28 Oct 2011
*
* Bernstein's Poly1305 for chips featuring Intel AVX.
*
* This is free and unencumbered software released into the public domain.
*
* Anyone is free to copy, modify, publish, use, compile, sell, or
* distribute this software, either in source code form or as a compiled
* binary, for any purpose, commercial or non-commercial, and by any
* means.
*
* In jurisdictions that recognize copyright laws, the author or authors
* of this software dedicate any and all copyright interest in the
* software to the public domain. We make this dedication for the benefit
* of the public at large and to the detriment of our heirs and
* successors. We intend this dedication to be an overt act of
* relinquishment in perpetuity of all present and future rights to this
* software under copyright law.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
.data
.p2align 5
SCALE: .quad 0x37f4000000000000, 0x37f4000000000000, 0x37f4000000000000, 0x37f4000000000000
ALPHA22: .quad 0x4498000000000000, 0x4498000000000000, 0x4498000000000000, 0x4498000000000000
ALPHA44: .quad 0x45f8000000000000, 0x45f8000000000000, 0x45f8000000000000, 0x45f8000000000000
ALPHA65: .quad 0x4748000000000000, 0x4748000000000000, 0x4748000000000000, 0x4748000000000000
ALPHA87: .quad 0x48a8000000000000, 0x48a8000000000000, 0x48a8000000000000, 0x48a8000000000000
ALPHA109: .quad 0x4a08000000000000, 0x4a08000000000000, 0x4a08000000000000, 0x4a08000000000000
ALPHA130: .quad 0x4b58000000000000, 0x4b58000000000000, 0x4b58000000000000, 0x4b58000000000000
POW232: .quad 0x41f0000000000000, 0x41f0000000000000, 0x41f0000000000000, 0x41f0000000000000
POW264: .quad 0x43f0000000000000, 0x43f0000000000000, 0x43f0000000000000, 0x43f0000000000000
POW296: .quad 0x45f0000000000000, 0x45f0000000000000, 0x45f0000000000000, 0x45f0000000000000
POW2128: .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
POWMIX: .quad 0x3ff0000000000000, 0x41f0000000000000, 0x43f0000000000000, 0x45f0000000000000
P0: .quad 0x414ffffb00000000, 0x414ffffb00000000, 0x414ffffb00000000, 0x414ffffb00000000
P22: .quad 0x42afffff80000000, 0x42afffff80000000, 0x42afffff80000000, 0x42afffff80000000
P44: .quad 0x440fffff80000000, 0x440fffff80000000, 0x440fffff80000000, 0x440fffff80000000
P65: .quad 0x456fffff80000000, 0x456fffff80000000, 0x456fffff80000000, 0x456fffff80000000
P87: .quad 0x46cfffff80000000, 0x46cfffff80000000, 0x46cfffff80000000, 0x46cfffff80000000
P109: .quad 0x481fffff00000000, 0x481fffff00000000, 0x481fffff00000000, 0x481fffff00000000
POW222I: .quad 0x3e90000000000000, 0x3e90000000000000, 0x3e90000000000000, 0x3e90000000000000
POW244I: .quad 0x3d30000000000000, 0x3d30000000000000, 0x3d30000000000000, 0x3d30000000000000
POW265I: .quad 0x3be0000000000000, 0x3be0000000000000, 0x3be0000000000000, 0x3be0000000000000
POW287I: .quad 0x3a80000000000000, 0x3a80000000000000, 0x3a80000000000000, 0x3a80000000000000
POW2109I: .quad 0x3920000000000000, 0x3920000000000000, 0x3920000000000000, 0x3920000000000000
PMASK: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC, 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
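/*
* NB (annotation, not from the original source): decoded as IEEE-754 doubles,
* these broadcast constants appear to be:
* SCALE = 5*2**-130 (the wraparound factor, since 2**130 == 5 mod 2**130-5),
* ALPHAk = 1.5*2**(52+k), the round-to-nearest splitting constants for the
* radix cutoffs 22, 44, 65, 87, 109, 130,
* POW2k = 2**k and POWMIX = (1, 2**32, 2**64, 2**96),
* Pk = the limbs of 2*(2**130-5) in this radix,
* POW2kI = 2**-k, and PMASK = the usual Poly1305 clamp on r.
*/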
.macro MSTEP z0, z1, z2, z3, z4, z5, x0, x1, x2, x3, x4, x5, y0, t0, t1, t2
vmulpd SCALE, \z5, \z5
vmulpd \y0, \x5, \t2
vmulpd \y0, \x4, \t1
vmulpd \y0, \x3, \t0
vaddpd \t2, \z4, \z4
vaddpd \t1, \z3, \z3
vaddpd \t0, \z2, \z2
vmulpd \y0, \x2, \t2
vmulpd \y0, \x1, \t1
vmulpd \y0, \x0, \t0
vaddpd \t2, \z1, \z1
vaddpd \t1, \z0, \z0
vaddpd \t0, \z5, \z5
.endm
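/*
* NB (annotation): MSTEP appears to add one diagonal of the 6-limb schoolbook
* product into the accumulator: every limb x0..x5 of one operand is multiplied
* by a single limb y0 of the other and added into the z limbs, which the
* caller rotates between steps; the accumulator limb currently in the z5 slot
* is first folded back down by SCALE (5*2**-130), again using 2**130 == 5.
*/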
.macro CSTEP z0, z1, alpha, t0, t1
vmovapd \alpha, \t0
vaddpd \t0, \z0, \t1
vsubpd \t0, \t1, \t1
vsubpd \t1, \z0, \z0
vaddpd \t1, \z1, \z1
.endm
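/*
* NB (annotation): CSTEP looks like the classic round-to-nearest carry trick:
* with alpha = 1.5*2**(52+k), t1 = (z0 + alpha) - alpha rounds z0 to a
* multiple of 2**k (the low bits fall off the 53-bit mantissa), so afterwards
* z0 := z0 - t1 keeps the low part and z1 := z1 + t1 receives the carry.
*/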
.macro CARRY z0, z1, z2, z3, z4, z5, t0, t1
CSTEP \z0, \z1, ALPHA22, \t0, \t1
CSTEP \z1, \z2, ALPHA44, \t0, \t1
CSTEP \z2, \z3, ALPHA65, \t0, \t1
CSTEP \z3, \z4, ALPHA87, \t0, \t1
CSTEP \z4, \z5, ALPHA109, \t0, \t1
.endm
.macro CARRYR z0, z1, z2, z3, z4, z5, t0, t1
CSTEP \z1, \z2, ALPHA44, \t0, \t1
CSTEP \z2, \z3, ALPHA65, \t0, \t1
CSTEP \z3, \z4, ALPHA87, \t0, \t1
CSTEP \z4, \z5, ALPHA109, \t0, \t1
vmovapd ALPHA130, \t0
vaddpd \t0, \z5, \t1
vsubpd \t0, \t1, \t1
vsubpd \t1, \z5, \z5
vmulpd SCALE, \t1, \t1
vaddpd \t1, \z0, \z0
CSTEP \z0, \z1, ALPHA22, \t0, \t1
CSTEP \z1, \z2, ALPHA44, \t0, \t1
.endm
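/*
* NB (annotation): CARRY/CARRYR chain CSTEP across the limbs; CARRYR also
* closes the ring: the part of z5 at or above 2**130 is split off with
* ALPHA130 and folded into z0 scaled by SCALE (5*2**-130), after which two
* more CSTEPs re-balance the low limbs.
*/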
.macro MULI c0, c1, c2, c3, c4, a0, a1, a2, b0, b1
movq \a0, %rax
mulq \b1
movq %rax, \c1
movq %rdx, \c2
movq \a2, %rax
mulq \b1
movq %rax, \c3
movq %rdx, \c4
movq \a1, %rax
mulq \b1
addq %rax, \c2
adcq %rdx, \c3
adcq $0, \c4
movq \a0, %rax
mulq \b0
movq %rax, \c0
addq %rdx, \c1
adcq $0, \c2
adcq $0, \c3
adcq $0, \c4
movq \a1, %rax
mulq \b0
addq %rax, \c1
adcq %rdx, \c2
adcq $0, \c3
adcq $0, \c4
movq \a2, %rax
mulq \b0
addq %rax, \c2
adcq %rdx, \c3
adcq $0, \c4
movq \c2, %rdx
andq $0x3, \c2
andq $0xFFFFFFFFFFFFFFFC, %rdx
addq %rdx, \c0
adcq \c3, \c1
adcq \c4, \c2
shrq $1, \c4
rcrq $1, \c3
rcrq $1, %rdx
shrq $1, \c4
rcrq $1, \c3
rcrq $1, %rdx
addq %rdx, \c0
adcq \c3, \c1
adcq \c4, \c2
.endm
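/*
* NB (annotation): MULI seems to compute the scalar 192x128-bit product
* (a0, a1, a2) * (b0, b1) into the 320-bit (c0..c4) and then reduce it
* mod 2**130-5: the bits at 2**130 and above, call them H*2**130, are
* congruent to 5*H, and the code adds 4*H (the unshifted high triple) plus H
* (the same triple shifted right by 2) back into (c0, c1, c2).
*/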
/* extern void poly1305_tag_asm(unsigned char *tag, const unsigned char *key, const unsigned char *data, int len); */
/* rdi rsi rdx rcx */
/* int crypto_onetimeauth(unsigned char *out,const unsigned char *in,unsigned long long inlen,const unsigned char *k); */
/* rdi rsi rdx rcx */
.globl _crypto_onetimeauth_poly1305_avx
.globl crypto_onetimeauth_poly1305_avx
_crypto_onetimeauth_poly1305_avx:
crypto_onetimeauth_poly1305_avx:
/* retrofit API: (rsi, rdx, rcx) := (rcx, rsi, rdx) */
xchgq %rsi, %rdx
xchgq %rsi, %rcx
cmp $1, %rcx
jge Lstart
/* empty message: the tag is just the pad (second half of the key); handle this corner case immediately. */
movq 16(%rsi), %r8
movq 24(%rsi), %r9
movq %r8, 0(%rdi)
movq %r9, 8(%rdi)
xorq %rax,%rax
xorq %rdx,%rdx
ret
Lstart:
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rsp
pushq %rbx
pushq %rdi
/* skip all the cool stuff for short messages */
xorq %r10, %r10
xorq %r11, %r11
xorq %r12, %r12
cmp $64, %rcx
jl Lfinalize
/* save, align stack */
movq %rsp, %rax
andq $0x1f, %rax
subq %rax, %rsp
addq %rsp, %rax
/* load point */
movdqu (%rsi), %xmm4
pand PMASK, %xmm4
/* *signed* convert, then fix sign */
vcvtdq2pd %xmm4, %ymm5
vxorpd %ymm6, %ymm6, %ymm6
vblendvpd %ymm5, POW232, %ymm6, %ymm0
vaddpd %ymm0, %ymm5, %ymm5
vmulpd POWMIX, %ymm5, %ymm5
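/*
* NB (annotation): vcvtdq2pd treats the four clamped 32-bit words of r as
* signed, so 2**32 is added back wherever the top bit was set. The POWMIX
* multiply then places word i at 2**(32*i), leaving ymm5 holding r split
* across four lanes by byte position.
*/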
/* ymm0..5 := (a, a, a, a) */
vunpcklpd %ymm5, %ymm5, %ymm6
vperm2f128 $0x00, %ymm6, %ymm6, %ymm0
vperm2f128 $0x11, %ymm6, %ymm6, %ymm2
vunpckhpd %ymm5, %ymm5, %ymm9
vperm2f128 $0x00, %ymm9, %ymm9, %ymm1
vperm2f128 $0x11, %ymm9, %ymm9, %ymm4
/* balance bits across polynomial */
vxorpd %ymm3, %ymm3, %ymm3
vxorpd %ymm5, %ymm5, %ymm5
CARRY %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm14, %ymm15
/* ymm6..11 := (a, a, a, a) * (a, a, a, a) -> (a**2, a**2, a**2, a**2) */
vmulpd %ymm0, %ymm5, %ymm11
vmulpd %ymm1, %ymm5, %ymm6
vmulpd %ymm2, %ymm5, %ymm7
vmulpd %ymm3, %ymm5, %ymm8
vmulpd %ymm4, %ymm5, %ymm9
vmulpd %ymm5, %ymm5, %ymm10
MSTEP %ymm11, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm4, %ymm13, %ymm14, %ymm15
MSTEP %ymm10, %ymm11, %ymm6, %ymm7, %ymm8, %ymm9, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm3, %ymm13, %ymm14, %ymm15
MSTEP %ymm9, %ymm10, %ymm11, %ymm6, %ymm7, %ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm2, %ymm13, %ymm14, %ymm15
MSTEP %ymm8, %ymm9, %ymm10, %ymm11, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm1, %ymm13, %ymm14, %ymm15
MSTEP %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm6, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm0, %ymm13, %ymm14, %ymm15
CARRYR %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm14, %ymm15
/* ymm0..5 := (a, a, a, a) (a**2, a**2, a**2, a**2) -> (a**2, a, a**2, a) */
vblendpd $0x5, %ymm6, %ymm0, %ymm0
vblendpd $0x5, %ymm7, %ymm1, %ymm1
vblendpd $0x5, %ymm8, %ymm2, %ymm2
vblendpd $0x5, %ymm9, %ymm3, %ymm3
vblendpd $0x5, %ymm10, %ymm4, %ymm4
vblendpd $0x5, %ymm11, %ymm5, %ymm5
/* ymm6..11 := (a**2, a, a**2, a) * (a**2, a**2, a**2, a**2) -> (a**4, a**3, a**4, a**3) */
vmulpd %ymm1, %ymm11, %ymm6
vmulpd %ymm2, %ymm11, %ymm7
vmulpd %ymm3, %ymm11, %ymm8
vmulpd %ymm4, %ymm11, %ymm9
vmulpd %ymm5, %ymm11, %ymm10
vmulpd %ymm0, %ymm11, %ymm11
/* ymm12 := (a**2, a, a**2, a) -> (a**2, a**2, a**2, a**2) */
/* vmovddup avoids stack */
vmovddup %ymm4, %ymm12
MSTEP %ymm11, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm12, %ymm13, %ymm14, %ymm15
vmovddup %ymm3, %ymm12
MSTEP %ymm10, %ymm11, %ymm6, %ymm7, %ymm8, %ymm9, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm12, %ymm13, %ymm14, %ymm15
vmovddup %ymm2, %ymm12
MSTEP %ymm9, %ymm10, %ymm11, %ymm6, %ymm7, %ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm12, %ymm13, %ymm14, %ymm15
vmovddup %ymm1, %ymm12
MSTEP %ymm8, %ymm9, %ymm10, %ymm11, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm12, %ymm13, %ymm14, %ymm15
vmovddup %ymm0, %ymm12
MSTEP %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm6, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm12, %ymm13, %ymm14, %ymm15
CARRYR %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm14, %ymm15
/* ymm0..5 := (a**2, a, a**2, a) (a**4, a**3, a**4, a**3) -> (a**4, a**3, a**2, a) */
vblendpd $0x3, %ymm6, %ymm0, %ymm0
vblendpd $0x3, %ymm7, %ymm1, %ymm1
vblendpd $0x3, %ymm8, %ymm2, %ymm2
vblendpd $0x3, %ymm9, %ymm3, %ymm3
vblendpd $0x3, %ymm10, %ymm4, %ymm4
vblendpd $0x3, %ymm11, %ymm5, %ymm5
/* (a**4, a**3, a**2, a) to stack for after main loop */
leaq -192(%rsp), %rsp
vmovapd %ymm0, 0(%rsp)
vmovapd %ymm1, 32(%rsp)
vmovapd %ymm2, 64(%rsp)
vmovapd %ymm3, 96(%rsp)
vmovapd %ymm4, 128(%rsp)
vmovapd %ymm5, 160(%rsp)
/* ymm6..11 := (a**4, a**3, a**4, a**3) -> (a**4, a**4, a**4, a**4) */
vmovddup %ymm6, %ymm6
vmovddup %ymm7, %ymm7
vmovddup %ymm8, %ymm8
vmovddup %ymm9, %ymm9
vmovddup %ymm10, %ymm10
vmovddup %ymm11, %ymm11
/* (a**4, a**4, a**4, a**4) to stack for main loop */
leaq -192(%rsp), %rsp
vmovupd %ymm6, 0(%rsp)
vmovupd %ymm7, 32(%rsp)
vmovupd %ymm8, 64(%rsp)
vmovupd %ymm9, 96(%rsp)
vmovupd %ymm10, 128(%rsp)
vmovupd %ymm11, 160(%rsp)
/* initialize accumulator ymm0..5 */
vxorpd %ymm0, %ymm0, %ymm0
vxorpd %ymm1, %ymm1, %ymm1
vxorpd %ymm2, %ymm2, %ymm2
vxorpd %ymm3, %ymm3, %ymm3
vxorpd %ymm4, %ymm4, %ymm4
vxorpd %ymm5, %ymm5, %ymm5
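/*
* NB (annotation): the main loop handles 4 blocks per pass, one block per
* lane spread over the six limb registers ymm0..5. Each pass adds the blocks
* (plus their 2**128 padding bit) into the accumulator and multiplies by
* (a**4, a**4, a**4, a**4); the final pass multiplies by (a**4, a**3, a**2, a)
* instead, so that summing the four lanes afterwards yields the usual Horner
* evaluation m1*a**n + m2*a**(n-1) + ... + mn*a.
*/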
Laccumulate:
/* NB: careful not to clobber accumulator ymm0..5 */
/* load, slice message data */
movdqu 0(%rdx), %xmm9 # 0123
movdqu 16(%rdx), %xmm10 # 4567
movdqu 32(%rdx), %xmm11 # 89ab
movdqu 48(%rdx), %xmm12 # cdef
/* slice columns 0-1 */
movdqa %xmm9, %xmm13
movdqa %xmm11, %xmm14
punpckldq %xmm10, %xmm13 # 0415
punpckldq %xmm12, %xmm14 # 8c9d
movdqa %xmm13, %xmm15
punpcklqdq %xmm14, %xmm13 # 048c
punpckhqdq %xmm14, %xmm15 # 159d
/* slice columns 2-3 */
punpckhdq %xmm10, %xmm9 # 2637
punpckhdq %xmm12, %xmm11 # aebf
movdqa %xmm9, %xmm14
punpcklqdq %xmm11, %xmm9 # 26ae
punpckhqdq %xmm11, %xmm14 # 37bf
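/* NB (annotation): the hex digits label the sixteen 32-bit words of the 64
* loaded bytes; after the unpacks, xmm13/xmm15/xmm9/xmm14 hold words
* {0,4,8,c}, {1,5,9,d}, {2,6,a,e}, {3,7,b,f}, i.e. word column j of each of
* the four 16-byte blocks, ready for a lane-per-block conversion. */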
/* *signed* convert */
vcvtdq2pd %xmm13, %ymm6 # 048c
vcvtdq2pd %xmm15, %ymm7 # 159d
vcvtdq2pd %xmm9, %ymm8 # 26ae
vcvtdq2pd %xmm14, %ymm9 # 37bf
/* fix sign */
vmovapd POW232, %ymm14
vxorpd %ymm15, %ymm15, %ymm15
vblendvpd %ymm6, %ymm14, %ymm15, %ymm10
vblendvpd %ymm7, %ymm14, %ymm15, %ymm11
vblendvpd %ymm8, %ymm14, %ymm15, %ymm12
vblendvpd %ymm9, %ymm14, %ymm15, %ymm13
vaddpd %ymm10, %ymm6, %ymm6
vaddpd %ymm11, %ymm7, %ymm7
vaddpd %ymm12, %ymm8, %ymm8
vaddpd %ymm13, %ymm9, %ymm10
/* adjust exponent */
vmulpd %ymm14, %ymm7, %ymm7
vmulpd POW264, %ymm8, %ymm8
vmulpd POW296, %ymm10, %ymm10
/* accumulate, add in message data, padding */
vaddpd %ymm6, %ymm0, %ymm0
vaddpd %ymm7, %ymm1, %ymm1
vaddpd %ymm8, %ymm2, %ymm2
vaddpd %ymm10, %ymm4, %ymm4
vaddpd POW2128, %ymm5, %ymm5
/* balance bits across polynomial */
/* this 2-way parallel chain wins over generic */
vmovapd ALPHA44, %ymm12
vmovapd ALPHA109, %ymm14
vaddpd %ymm12, %ymm1, %ymm13
vaddpd %ymm14, %ymm4, %ymm15
vsubpd %ymm12, %ymm13, %ymm13
vsubpd %ymm14, %ymm15, %ymm15
vsubpd %ymm13, %ymm1, %ymm1
vsubpd %ymm15, %ymm4, %ymm4
vaddpd %ymm13, %ymm2, %ymm2
vaddpd %ymm15, %ymm5, %ymm5
vmovapd ALPHA65, %ymm12
vmovapd ALPHA130, %ymm14
vaddpd %ymm12, %ymm2, %ymm13
vaddpd %ymm14, %ymm5, %ymm15
vsubpd %ymm12, %ymm13, %ymm13
vsubpd %ymm14, %ymm15, %ymm15
vsubpd %ymm13, %ymm2, %ymm2
vsubpd %ymm15, %ymm5, %ymm5
vmulpd SCALE, %ymm15, %ymm15
vaddpd %ymm13, %ymm3, %ymm3
vaddpd %ymm15, %ymm0, %ymm0
vmovapd ALPHA22, %ymm12
vmovapd ALPHA87, %ymm14
vaddpd %ymm12, %ymm0, %ymm13
vaddpd %ymm14, %ymm3, %ymm15
vsubpd %ymm12, %ymm13, %ymm13
vsubpd %ymm14, %ymm15, %ymm15
vsubpd %ymm13, %ymm0, %ymm6 # ymm6 := ymm0
vsubpd %ymm15, %ymm3, %ymm9 # ymm9 := ymm3
vaddpd %ymm13, %ymm1, %ymm1
vaddpd %ymm15, %ymm4, %ymm4
vmovapd ALPHA44, %ymm12
vmovapd ALPHA109, %ymm14
vaddpd %ymm12, %ymm1, %ymm13
vaddpd %ymm14, %ymm4, %ymm15
vsubpd %ymm12, %ymm13, %ymm13
vsubpd %ymm14, %ymm15, %ymm15
vsubpd %ymm13, %ymm1, %ymm7 # ymm7 := ymm1
vsubpd %ymm15, %ymm4, %ymm10 # ymm10 := ymm4
vaddpd %ymm13, %ymm2, %ymm8 # ymm8 := ymm2
vaddpd %ymm15, %ymm5, %ymm11 # ymm11 := ymm5
/* this is the generic carry chain */
#CARRYR %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm14, %ymm15
/* ymm6..11 now holds the operand */
/* adjust message data pointer, len */
leaq 64(%rdx), %rdx
leaq -64(%rcx), %rcx
/* jump down a bit if there's still data */
testq $0xFFFFFFFFFFFFFFC0, %rcx
jnz Lmultiply
/* no remaining data, pop stack */
leaq 192(%rsp), %rsp
Lmultiply:
/**
* multiply by some form of the point; two cases.
* ymm0..5 := (z0, z1, z2, z3) * (a**4, a**4, a**4, a**4)
* ymm0..5 := (z0, z1, z2, z3) * (a**4, a**3, a**2, a**1)
*/
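/*
* NB (annotation): which case applies is decided by the rsp adjustment above:
* while message data remains, 0..160(%rsp) address the broadcast
* (a**4, a**4, a**4, a**4) table; once the 192-byte pop has happened, the
* same offsets address the (a**4, a**3, a**2, a) table stored earlier.
*/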
vmovapd 160(%rsp), %ymm12
vmulpd %ymm6, %ymm12, %ymm5
vmulpd %ymm7, %ymm12, %ymm0
vmulpd %ymm8, %ymm12, %ymm1
vmulpd %ymm9, %ymm12, %ymm2
vmulpd %ymm10, %ymm12, %ymm3
vmulpd %ymm11, %ymm12, %ymm4
vmovapd 128(%rsp), %ymm12
MSTEP %ymm5, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15
vmovapd 96(%rsp), %ymm12
MSTEP %ymm4, %ymm5, %ymm0, %ymm1, %ymm2, %ymm3, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15
vmovapd 64(%rsp), %ymm12
MSTEP %ymm3, %ymm4, %ymm5, %ymm0, %ymm1, %ymm2, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15
vmovapd 32(%rsp), %ymm12
MSTEP %ymm2, %ymm3, %ymm4, %ymm5, %ymm0, %ymm1, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15
vmovapd 0(%rsp), %ymm12
MSTEP %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm0, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15
/* balance the polynomial elsewhere */
#CARRYR %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm14, %ymm15
/* jump back if there's still data */
/* NB: careful not to clobber zero flag */
jnz Laccumulate
/* pop final mult operand from the stack */
leaq 192(%rsp), %rsp
/* balance bits across polynomial */
CARRYR %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm14, %ymm15
/* add in (p+p) to make coefficients positive. */
vaddpd P0, %ymm0, %ymm0
vaddpd P22, %ymm1, %ymm1
vaddpd P44, %ymm2, %ymm2
vaddpd P65, %ymm3, %ymm3
vaddpd P87, %ymm4, %ymm4
vaddpd P109, %ymm5, %ymm5
/* scale coefficients down */
vmulpd POW222I, %ymm1, %ymm1
vmulpd POW244I, %ymm2, %ymm2
vmulpd POW265I, %ymm3, %ymm3
vmulpd POW287I, %ymm4, %ymm4
vmulpd POW2109I, %ymm5, %ymm5
/* convert to integers */
vcvttpd2dq %ymm0, %xmm0
vcvttpd2dq %ymm1, %xmm1
vcvttpd2dq %ymm2, %xmm2
vcvttpd2dq %ymm3, %xmm3
vcvttpd2dq %ymm4, %xmm4
vcvttpd2dq %ymm5, %xmm5
/* sum the polynomials */
phaddd %xmm0, %xmm0
phaddd %xmm1, %xmm1
phaddd %xmm2, %xmm2
phaddd %xmm3, %xmm3
phaddd %xmm4, %xmm4
phaddd %xmm5, %xmm5
phaddd %xmm0, %xmm0
phaddd %xmm1, %xmm1
phaddd %xmm2, %xmm2
phaddd %xmm3, %xmm3
phaddd %xmm4, %xmm4
phaddd %xmm5, %xmm5
/* rcx < 64 holds at this point. */
/* restore stack */
movq %rax, %rsp
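/*
* NB (annotation): xmm0..5 now hold one lane-summed integer limb each, at
* radix offsets 0, 22, 44, 65, 87, 109. Below they are shifted to their bit
* positions and added together to rebuild the accumulator as the 130-bit
* integer (r10, r11, r12).
*/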
/* construct top two words first */
pextrq $0, %xmm2, %r8
pextrq $0, %xmm3, %r9
pextrq $0, %xmm4, %r10
pextrq $0, %xmm5, %r12
andq $0x7FFFFFFF, %r8
andq $0x7FFFFFFF, %r9
andq $0x7FFFFFFF, %r10
andq $0x7FFFFFFF, %r12
movq %r8, %rax
shrq $20, %rax
shlq $1, %r9
shlq $23, %r10
movq %r12, %r11
shlq $45, %r11
shrq $19, %r12
addq %rax, %r9
addq %r9, %r10
addq %r10, %r11
adcq $0, %r12
/* construct bottom word */
shlq $44, %r8
pextrq $0, %xmm0, %r9
pextrq $0, %xmm1, %r10
andq $0x7FFFFFFF, %r9
andq $0x7FFFFFFF, %r10
shlq $22, %r10
addq %r9, %r10
addq %r8, %r10
adcq $0, %r11
adcq $0, %r12
/* (r10, r11, r12) holds result r */
Lfinalize:
/* handle end of msg. */
cmp $0, %rcx
jle Loutput
/* save stack */
movq %rsp, %r15
/* rdi := remaining message byte count */
/* rcx := remaining message block count */
/* rbp := padding mask, 2**rcx - 1 */
movq %rcx, %rdi
addq $0xF, %rcx
shrq $4, %rcx
xorq %rbp, %rbp
/* push empty blocks on the stack, build padding mask */
Lploopa:
pushq $0
pushq $0
leaq 1(%rbp, %rbp), %rbp
loop Lploopa
/* rcx := remaining message byte count */
movq %rdi, %rcx
testq $0xF, %rcx
jz Lploopb
/* pad last block manually */
movb $1, (%rsp, %rcx)
shrq $1, %rbp
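/*
* NB (annotation): each set bit of rbp stands for the implicit 2**128 padding
* bit of one full 16-byte block (applied in Lploopc below). A partial final
* block instead gets its 0x01 padding byte written explicitly just past the
* message bytes, so the mask is shortened by one bit here.
*/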
/* move remaining message bytes to said blocks on stack */
Lploopb:
movzbq -1(%rdx, %rcx), %rax
movb %al, -1(%rsp, %rcx)
loop Lploopb
/* fetch the point again */
movq 0(%rsi), %rbx
movq 8(%rsi), %rdi
movq $0x0FFFFFFC0FFFFFFF, %r8
movq $0x0FFFFFFC0FFFFFFC, %r9
andq %r8, %rbx
andq %r9, %rdi
Lploopc:
/* pop next message block */
popq %r8
popq %r9
/* accumulate */
addq %r8, %r10
adcq %r9, %r11
adcq $0, %r12
/* throw in the padding */
shrq $1, %rbp
adcq $0, %r12
/* multiply */
MULI %r8, %r9, %r13, %r14, %rcx, %r10, %r11, %r12, %rbx, %rdi
movq %r8, %r10
movq %r9, %r11
movq %r13, %r12
cmp %rsp, %r15
jg Lploopc
Loutput:
/* r mod p, first pass */
movq %r12, %r13
andq $0x3, %r12
shrq $2, %r13
leaq (%r13, %r13, 4), %r13
addq %r13, %r10
adcq $0, %r11
adcq $0, %r12
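/*
* NB (annotation): the first pass folds the bits at 2**130 and above back in
* as (r mod 2**130) + 5*(r >> 130); the lea computes r13 + 4*r13 = 5*r13,
* again using 2**130 == 5 mod p. The result can still exceed p, hence the
* conditional subtraction below.
*/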
/* 0 <= r <= 2**130 + c for some very small positive c. */
/* construct r - p */
movq %r10, %r13
movq %r11, %r14
movq %r12, %r15
subq $0xFFFFFFFFFFFFFFFB, %r13
sbbq $0xFFFFFFFFFFFFFFFF, %r14
sbbq $0x3, %r15
sbbq %rcx, %rcx
/* (r13, r14, r15) holds r - p */
/* 2-to-1 multiplex, select r or r - p using borrow (rcx) as control wire */
andq %rcx, %r10
andq %rcx, %r11
andq %rcx, %r12
notq %rcx
andq %rcx, %r13
andq %rcx, %r14
andq %rcx, %r15
orq %r13, %r10
orq %r14, %r11
orq %r15, %r12
/* fetch one time pad, add it in mod 2**128 */
movq 16(%rsi), %r8
movq 24(%rsi), %r9
addq %r8, %r10
adcq %r9, %r11
adcq $0, %r12
/* fetch output address, store tag */
popq %rdi
movq %r10, 0(%rdi)
movq %r11, 8(%rdi)
#movq %r12, 16(%rdi)
/* restore state */
popq %rbx
popq %rsp
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
/* done */
xorq %rax,%rax
xorq %rdx,%rdx
ret
/*
radix
0 22 44 65 87 109
gp regs
rax rbx rcx rdx rbp rsp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
scratch regs
rax rcx rdx rsi rdi r8 r9 r10 r11
calling convention
rdi rsi rdx rcx r8 r9
mulq %foo # (rax, rdx) := foo*rax (l,h)
K = GF(2**130-5)
R.<x> = K[]
*/