asm.s 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758
  1. # 1 "curve25519-donna-x86-64.s"
  2. # 1 "<built-in>"
  3. # 1 "<command-line>"
  4. # 1 "curve25519-donna-x86-64.s"
  5. # 2008, Google Inc.
  6. # All rights reserved.
  7. # Code released into the public domain
  8. ################################################################################
  9. # curve25519-donna.s - an x86-64 bit implementation of curve25519. See the
  10. # comments at the top of curve25519-donna.c
  11. # Adam Langley <agl@imperialviolet.org>
  12. # Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  13. # More information about curve25519 can be found here
  14. # http://cr.yp.to/ecdh.html
  15. ################################################################################
  16. .text
  17. .extern crypto_scalarmult_curve25519_donna_fmonty
  18. .globl crypto_scalarmult_curve25519_donna_fmul
  19. .globl crypto_scalarmult_curve25519_donna_fsquare
  20. .globl crypto_scalarmult_curve25519_donna_fexpand
  21. .globl crypto_scalarmult_curve25519_donna_fcontract
  22. .globl crypto_scalarmult_curve25519_donna_freduce_coefficients
  23. .globl crypto_scalarmult_curve25519_donna_fscalar
  24. .globl crypto_scalarmult_curve25519_donna_fdifference_backwards
  25. .globl crypto_scalarmult_curve25519_donna_cmult
  ################################################################################
  # fmul - multiply two field elements modulo 2^255 - 19
  #
  # An element is five 64-bit limbs; the limb size used by the reduction below
  # is 51 bits, so limb k carries weight 2^(51*k).
  #
  # Registers: RDI (output): uint64_t[5] product
  #            RSI (input):  uint64_t[5] input 1
  #            RDX (input):  uint64_t[5] input 2
  # Clobbers:  rax, rcx, rdx, rsi, r8-r11, xmm0-xmm15, flags.
  #            rbx and r12-r15 are saved on entry and restored before returning.
  # All ten input limbs are loaded before anything is stored to the output.
  #
  # The label donna_reduce is also the tail of fsquare: it expects p[0]..p[7]
  # parked in xmm0..xmm15 (low half in the even register, high half in the odd
  # one), p[8] in RDX:RAX, and the six-register frame pushed below on the stack.
  ################################################################################

  # rcx:rbx = lhs * rhs (opens a fresh 128-bit accumulator)
  .macro fmul_term_set lhs, rhs
          mov     \lhs,%rax
          mul     \rhs
          mov     %rax,%rbx
          mov     %rdx,%rcx
  .endm

  # rcx:rbx += lhs * rhs
  .macro fmul_term_add lhs, rhs
          mov     \lhs,%rax
          mul     \rhs
          add     %rax,%rbx
          adc     %rdx,%rcx
  .endm

  # Park the accumulator rcx:rbx in an XMM pair (low, high)
  .macro fmul_park lo, hi
          movq    %rbx,\lo
          movq    %rcx,\hi
  .endm

  # dhi:dlo = 19 * (hhi:hlo) + (lhi:llo); expects r15 = 19.
  # The h/l operands are XMM registers, the d operands general registers.
  .macro fmul_fold19 hlo, hhi, llo, lhi, dlo, dhi
          movq    \hlo,%rax
          mul     %r15
          movq    \hhi,\dhi
          imul    %r15,\dhi
          add     %rdx,\dhi
          mov     %rax,\dlo
          movq    \lhi,%rcx
          movq    \llo,%rbx
          add     %rbx,\dlo
          adc     %rcx,\dhi
  .endm

  # Move everything above bit 50 of hi:lo into nhi:nlo, leaving lo masked to
  # 51 bits and hi zero; expects rbx = 2^51 - 1.
  .macro fmul_carry lo, hi, nlo, nhi
          mov     \lo,%rax
          shr     $51,\lo
          shl     $13,\hi
          or      \lo,\hi                 # hi = (hi << 13) | (lo >> 51)
          add     \hi,\nlo
          adc     $0,\nhi
          xor     \hi,\hi
          mov     %rax,\lo
          and     %rbx,\lo
  .endm

  crypto_scalarmult_curve25519_donna_fmul:
          push    %rbx
          push    %r12
          push    %r13
          push    %r14
          push    %r15
          push    %rdi                    # output pointer, popped just before the stores

          # i[0..4] -> rsi, r8, r9, r10, r11
          mov     %rsi,%rcx
          mov     (%rcx),%rsi
          mov     8(%rcx),%r8
          mov     16(%rcx),%r9
          mov     24(%rcx),%r10
          mov     32(%rcx),%r11

          # j[0..4] -> rdi, r12, r13, r14, r15
          mov     (%rdx),%rdi
          mov     8(%rdx),%r12
          mov     16(%rdx),%r13
          mov     24(%rdx),%r14
          mov     32(%rdx),%r15

          # Schoolbook product of two five-term polynomials: p[k] is the sum of
          # i[a]*j[b] over a + b = k, each accumulated as a 128-bit value.

          # p[0] = i0*j0
          mov     %rsi,%rax
          mul     %rdi
          movq    %rax,%xmm0
          movq    %rdx,%xmm1

          # p[1] = i0*j1 + i1*j0
          fmul_term_set %rsi,%r12
          fmul_term_add %r8,%rdi
          fmul_park     %xmm2,%xmm3

          # p[2] = i1*j1 + i0*j2 + i2*j0
          fmul_term_set %r8,%r12
          fmul_term_add %rsi,%r13
          fmul_term_add %r9,%rdi
          fmul_park     %xmm4,%xmm5

          # p[3] = i0*j3 + i3*j0 + i1*j2 + i2*j1
          fmul_term_set %rsi,%r14
          fmul_term_add %r10,%rdi
          fmul_term_add %r8,%r13
          fmul_term_add %r9,%r12
          fmul_park     %xmm6,%xmm7

          # p[4] = i0*j4 + i4*j0 + i3*j1 + i1*j3 + i2*j2
          fmul_term_set %rsi,%r15
          fmul_term_add %r11,%rdi
          fmul_term_add %r10,%r12
          fmul_term_add %r8,%r14
          fmul_term_add %r9,%r13
          fmul_park     %xmm8,%xmm9

          # p[5] = i4*j1 + i1*j4 + i2*j3 + i3*j2
          fmul_term_set %r11,%r12
          fmul_term_add %r8,%r15
          fmul_term_add %r9,%r14
          fmul_term_add %r10,%r13
          fmul_park     %xmm10,%xmm11

          # p[6] = i4*j2 + i2*j4 + i3*j3
          fmul_term_set %r11,%r13
          fmul_term_add %r9,%r15
          fmul_term_add %r10,%r14
          fmul_park     %xmm12,%xmm13

          # p[7] = i3*j4 + i4*j3
          fmul_term_set %r10,%r15
          fmul_term_add %r11,%r14
          fmul_park     %xmm14,%xmm15

          # p[8] = i4*j4, left in RDX:RAX for the reduction
          mov     %r11,%rax
          mul     %r15

  donna_reduce:
          # The inputs are dead from here on and their registers are reused.
          # The degree-8 polynomial is reduced mod 2^255 - 19: p[5+k] has weight
          # 2^255 * 2^(51*k), so it is multiplied by 19 and added to p[k].
          mov     $19,%r15

          # r13:r12 = p[3] + 19 * p[8]
          mov     %rdx,%r13
          mul     %r15
          imul    %r15,%r13
          add     %rdx,%r13
          mov     %rax,%r12
          movq    %xmm7,%rcx
          movq    %xmm6,%rbx
          add     %rbx,%r12
          adc     %rcx,%r13

          # r11:r10 = p[2] + 19 * p[7]
          fmul_fold19 %xmm14,%xmm15,%xmm4,%xmm5,%r10,%r11
          # r9:r8   = p[1] + 19 * p[6]
          fmul_fold19 %xmm12,%xmm13,%xmm2,%xmm3,%r8,%r9
          # rdi:rsi = p[0] + 19 * p[5]
          fmul_fold19 %xmm10,%xmm11,%xmm0,%xmm1,%rsi,%rdi

          # r15:r14 = p[4]
          movq    %xmm9,%r15
          movq    %xmm8,%r14

          mov     $0x7ffffffffffff,%rbx   # 2^51 - 1

  coeffreduction:
          # Carry chain: everything above the low 51 bits of each 128-bit
          # coefficient is added to the next one. What spills out of the top
          # coefficient is multiplied by 19 and added to the bottom one, followed
          # by one more carry from limb 0 into limb 1.
          fmul_carry %rsi,%rdi,%r8,%r9
          fmul_carry %r8,%r9,%r10,%r11
          fmul_carry %r10,%r11,%r12,%r13
          fmul_carry %r12,%r13,%r14,%r15

          # top limb: its carry wraps around to limb 0 with a factor of 19
          mov     %r14,%rax
          shr     $51,%r14
          shl     $13,%r15
          or      %r14,%r15
          imul    $19,%r15
          add     %r15,%rsi
          adc     $0,%rdi
          xor     %r15,%r15
          mov     %rax,%r14
          and     %rbx,%r14

          fmul_carry %rsi,%rdi,%r8,%r9

          # Results are in rsi, r8, r10, r12, r14; the output pointer is on top
          # of the stack.
          pop     %rdi
          mov     %rsi,(%rdi)
          mov     %r8,8(%rdi)
          mov     %r10,16(%rdi)
          mov     %r12,24(%rdi)
          mov     %r14,32(%rdi)

          pop     %r15
          pop     %r14
          pop     %r13
          pop     %r12
          pop     %rbx
          ret
  ################################################################################
  # fsquare - square a field element
  #
  # Registers: RDI (output): uint64_t[5] product
  #            RSI (input):  uint64_t[5] input
  #
  # Same scheme as fmul, except that every cross term a[x]*a[y] with x != y
  # occurs twice and is therefore computed once and doubled. The nine 128-bit
  # coefficients are left in xmm0..xmm15 and RDX:RAX, and control is handed to
  # donna_reduce (inside fmul), which reduces them, stores the result and pops
  # the six registers pushed here.
  ################################################################################

  # rcx:rbx = lhs * rhs
  .macro fsq_set lhs, rhs
          mov     \lhs,%rax
          mul     \rhs
          mov     %rax,%rbx
          mov     %rdx,%rcx
  .endm

  # rcx:rbx = 2 * lhs * rhs
  .macro fsq_set2x lhs, rhs
          mov     \lhs,%rax
          mul     \rhs
          mov     %rax,%rbx
          mov     %rdx,%rcx
          sal     $1,%rbx
          rcl     $1,%rcx
  .endm

  # rcx:rbx += lhs * rhs
  .macro fsq_add lhs, rhs
          mov     \lhs,%rax
          mul     \rhs
          add     %rax,%rbx
          adc     %rdx,%rcx
  .endm

  # rcx:rbx += 2 * lhs * rhs
  .macro fsq_add2x lhs, rhs
          mov     \lhs,%rax
          mul     \rhs
          sal     $1,%rax
          rcl     $1,%rdx
          add     %rax,%rbx
          adc     %rdx,%rcx
  .endm

  crypto_scalarmult_curve25519_donna_fsquare:
          push    %rbx
          push    %r12
          push    %r13
          push    %r14
          push    %r15
          push    %rdi                    # output pointer, popped by donna_reduce

          # a[0..4] -> rsi, r8, r9, r10, r11
          mov     %rsi,%rcx
          mov     (%rcx),%rsi
          mov     8(%rcx),%r8
          mov     16(%rcx),%r9
          mov     24(%rcx),%r10
          mov     32(%rcx),%r11

          # p[0] = a0^2
          mov     %rsi,%rax
          mul     %rsi
          movq    %rax,%xmm0
          movq    %rdx,%xmm1

          # p[1] = 2*a0*a1
          mov     %rsi,%rax
          mul     %r8
          sal     $1,%rax
          rcl     $1,%rdx
          movq    %rax,%xmm2
          movq    %rdx,%xmm3

          # p[2] = a1^2 + 2*a0*a2
          fsq_set   %r8,%r8
          fsq_add2x %rsi,%r9
          movq    %rbx,%xmm4
          movq    %rcx,%xmm5

          # p[3] = 2*a0*a3 + 2*a1*a2
          fsq_set2x %rsi,%r10
          fsq_add2x %r8,%r9
          movq    %rbx,%xmm6
          movq    %rcx,%xmm7

          # p[4] = 2*a0*a4 + 2*a3*a1 + a2^2
          fsq_set2x %rsi,%r11
          fsq_add2x %r10,%r8
          fsq_add   %r9,%r9
          movq    %rbx,%xmm8
          movq    %rcx,%xmm9

          # p[5] = 2*a4*a1 + 2*a2*a3
          fsq_set2x %r11,%r8
          fsq_add2x %r9,%r10
          movq    %rbx,%xmm10
          movq    %rcx,%xmm11

          # p[6] = 2*a4*a2 + a3^2
          fsq_set2x %r11,%r9
          fsq_add   %r10,%r10
          movq    %rbx,%xmm12
          movq    %rcx,%xmm13

          # p[7] = 2*a3*a4
          mov     %r10,%rax
          mul     %r11
          sal     $1,%rax
          rcl     $1,%rdx
          movq    %rax,%xmm14
          movq    %rdx,%xmm15

          # p[8] = a4^2, left in RDX:RAX
          mov     %r11,%rax
          mul     %r11
          jmp     donna_reduce
  ################################################################################
  # fdifference_backwards - set output to in - output (note order)
  #
  # Registers: RDI (in/out): uint64_t[5], replaced by (in - output)
  #            RSI (input):  uint64_t[5]
  # Clobbers:  rax, rcx, rdx, rsi, r8-r11, flags
  #
  # The limbs are subtracted pairwise and then made non-negative without
  # branching: an arithmetic shift right by 63 gives 0 for a non-negative limb
  # and all ones for a negative one, and that mask selects whether 2^51 is
  # added to the limb and 1 is taken from the next limb up.
  ################################################################################

  # If limb is negative, add 2^51 to it and subtract 1 from next.
  # Expects rdx = 2^51; uses rcx as scratch.
  .macro fdb_borrow limb, next
          mov     \limb,%rcx
          sar     $63,%rcx                # 0 or all ones
          and     %rdx,%rcx               # 0 or 2^51
          add     %rcx,\limb
          shr     $51,%rcx                # 0 or 1
          sub     %rcx,\next
  .endm

  crypto_scalarmult_curve25519_donna_fdifference_backwards:
          # rax, r8, r9, r10, r11 = in[k] - out[k]
          mov     (%rsi),%rax
          mov     8(%rsi),%r8
          mov     16(%rsi),%r9
          mov     24(%rsi),%r10
          mov     32(%rsi),%r11
          sub     (%rdi),%rax
          sub     8(%rdi),%r8
          sub     16(%rdi),%r9
          sub     24(%rdi),%r10
          sub     32(%rdi),%r11

          mov     $0x8000000000000,%rdx   # 2^51

  fdifference_backwards_loop:
          fdb_borrow %rax,%r8
          fdb_borrow %r8,%r9
          fdb_borrow %r9,%r10
          fdb_borrow %r10,%r11

          # Top limb: a borrow here wraps around to limb 0, where 19 is
          # subtracted instead of 1.
          mov     %r11,%rcx
          sar     $63,%rcx
          mov     %rcx,%rsi
          and     %rdx,%rcx
          add     %rcx,%r11
          and     $19,%rsi
          sub     %rsi,%rax

          # Second pass over limbs 0..3 to absorb the wrapped borrow.
          fdb_borrow %rax,%r8
          fdb_borrow %r8,%r9
          fdb_borrow %r9,%r10
          fdb_borrow %r10,%r11

          mov     %rax,(%rdi)
          mov     %r8,8(%rdi)
          mov     %r9,16(%rdi)
          mov     %r10,24(%rdi)
          mov     %r11,32(%rdi)
          ret
  ################################################################################
  # fscalar - multiply by 121665
  #
  # Registers: RDI: (out) pointer to uint64_t[5]
  #            RSI: (in)  pointer to uint64_t[5]
  # Clobbers:  rax, rcx, rdx, r8-r11, flags
  #
  # Full cascading multiplication. Each limb is multiplied into a 128-bit
  # product, the carry from the limb below is added with carry propagation into
  # the high half, the low 51 bits become the output limb and everything above
  # bit 50 is carried upwards. The carry out of the top limb is multiplied by 19
  # and added to limb 0.
  #
  # Earlier revisions added the incoming carry with a plain 64-bit add and kept
  # the unmasked low 64 bits as the limb, which silently dropped a carry
  # whenever that add (or the final add into limb 0) wrapped. Here limbs 1..4
  # are below 2^51 on return, and limb 0 is below 2^51 plus 19 times a carry of
  # less than 2^30, so nothing can wrap.
  #
  # out[k] is stored only after in[k] has been read, and out[0] is stored last.
  ################################################################################
  crypto_scalarmult_curve25519_donna_fscalar:
          mov     $121665,%rcx
          mov     $0x7ffffffffffff,%r10   # 2^51 - 1

          # limb 0: kept in r9 until the top carry is known
          mov     (%rsi),%rax
          mul     %rcx                    # rdx:rax = in[0] * 121665
          mov     %rax,%r9
          and     %r10,%r9
          shrd    $51,%rdx,%rax           # rax = (rdx:rax) >> 51
          mov     %rax,%r8                # carry into limb 1

          # limb 1
          mov     8(%rsi),%rax
          mul     %rcx
          add     %r8,%rax
          adc     $0,%rdx                 # keep the carry out of the low half
          mov     %rax,%r11
          and     %r10,%r11
          mov     %r11,8(%rdi)
          shrd    $51,%rdx,%rax
          mov     %rax,%r8

          # limb 2
          mov     16(%rsi),%rax
          mul     %rcx
          add     %r8,%rax
          adc     $0,%rdx
          mov     %rax,%r11
          and     %r10,%r11
          mov     %r11,16(%rdi)
          shrd    $51,%rdx,%rax
          mov     %rax,%r8

          # limb 3
          mov     24(%rsi),%rax
          mul     %rcx
          add     %r8,%rax
          adc     $0,%rdx
          mov     %rax,%r11
          and     %r10,%r11
          mov     %r11,24(%rdi)
          shrd    $51,%rdx,%rax
          mov     %rax,%r8

          # limb 4
          mov     32(%rsi),%rax
          mul     %rcx
          add     %r8,%rax
          adc     $0,%rdx
          mov     %rax,%r11
          and     %r10,%r11
          mov     %r11,32(%rdi)
          shrd    $51,%rdx,%rax           # rax = carry out of the top limb

          # 2^255 = 19 (mod 2^255 - 19): fold the top carry into limb 0
          imul    $19,%rax
          add     %rax,%r9
          mov     %r9,(%rdi)
          ret
  ################################################################################
  # freduce_coefficients - carry the excess above 51 bits from limb to limb
  #
  # Registers: RDI: (in/out) pointer to uint64_t[5]
  # Clobbers:  rax, rcx, r8-r11, flags (r12 is saved and restored)
  #
  # One pass from limb 0 to limb 4: the bits above bit 50 of each limb are added
  # to the next limb and the limb is masked to 51 bits. The excess of limb 4 is
  # multiplied by 19 and added to limb 0. Limb 0 is not masked again after that
  # addition.
  ################################################################################
  crypto_scalarmult_curve25519_donna_freduce_coefficients:
          push    %r12
          mov     $0x7ffffffffffff,%rcx   # 2^51 - 1

          mov     (%rdi),%r8
          mov     8(%rdi),%r9
          mov     16(%rdi),%r10
          mov     24(%rdi),%r11
          mov     32(%rdi),%r12

          # limb 0 -> limb 1
          mov     %r8,%rax
          shr     $51,%rax
          add     %rax,%r9
          and     %rcx,%r8

          # limb 1 -> limb 2
          mov     %r9,%rax
          shr     $51,%rax
          add     %rax,%r10
          and     %rcx,%r9

          # limb 2 -> limb 3
          mov     %r10,%rax
          shr     $51,%rax
          add     %rax,%r11
          and     %rcx,%r10

          # limb 3 -> limb 4
          mov     %r11,%rax
          shr     $51,%rax
          add     %rax,%r12
          and     %rcx,%r11

          # limb 4 wraps around to limb 0 with a factor of 19
          mov     %r12,%rax
          shr     $51,%rax
          imul    $19,%rax
          add     %rax,%r8
          and     %rcx,%r12

          mov     %r8,(%rdi)
          mov     %r9,8(%rdi)
          mov     %r10,16(%rdi)
          mov     %r11,24(%rdi)
          mov     %r12,32(%rdi)

          pop     %r12
          ret
  ################################################################################
  # fexpand - convert a packed (32 byte) representation to 5 uint64_t's
  #
  # Registers: RDI: (output) pointer to uint64_t[5]
  #            RSI: (input)  pointer to uint8_t[32]
  # Clobbers:  rax, rdx, flags
  #
  # Limb k is bits 51*k .. 51*k+50 of the input, read as a little-endian
  # integer. Each limb is fetched with one unaligned 64-bit load that covers
  # its first bit, shifted down and masked to 51 bits. Bit 255 of the input is
  # discarded.
  #
  # The last limb is loaded from offset 24, the final 8 bytes of the input, and
  # shifted by 12. Loading from offset 25 and shifting by 4 selects the same
  # bits but reads one byte past the end of the 32-byte buffer.
  ################################################################################
  crypto_scalarmult_curve25519_donna_fexpand:
          mov     $0x7ffffffffffff,%rdx   # 2^51 - 1

          mov     (%rsi),%rax             # limb 0: bit 0
          and     %rdx,%rax
          mov     %rax,(%rdi)

          mov     6(%rsi),%rax            # limb 1: bit 51 = byte 6, bit 3
          shr     $3,%rax
          and     %rdx,%rax
          mov     %rax,8(%rdi)

          mov     12(%rsi),%rax           # limb 2: bit 102 = byte 12, bit 6
          shr     $6,%rax
          and     %rdx,%rax
          mov     %rax,16(%rdi)

          mov     19(%rsi),%rax           # limb 3: bit 153 = byte 19, bit 1
          shr     $1,%rax
          and     %rdx,%rax
          mov     %rax,24(%rdi)

          mov     24(%rsi),%rax           # limb 4: bit 204 = byte 24, bit 12
          shr     $12,%rax
          and     %rdx,%rax
          mov     %rax,32(%rdi)
          ret
  ################################################################################
  # fcontract - convert 5 uint64_t's to a packed (32 byte) representation
  #
  # Registers: RDI: (output) pointer to uint8_t[32]
  #            RSI: (input)  pointer to uint64_t[5]
  # Clobbers:  rax, rcx, rdx, r8-r10, flags
  #
  # Limb k is placed at bit 51*k of the output, which is written as four 64-bit
  # words. The limbs are OR-ed together without masking, so the packing is only
  # correct when every limb is already below 2^51. All five limbs are loaded
  # before the first store.
  ################################################################################
  crypto_scalarmult_curve25519_donna_fcontract:
          mov     (%rsi),%rax             # limb 0
          mov     8(%rsi),%rdx            # limb 1
          mov     16(%rsi),%r8            # limb 2
          mov     24(%rsi),%r9            # limb 3
          mov     32(%rsi),%r10           # limb 4

          # word 0 = limb 0 | limb 1 << 51
          mov     %rdx,%rcx
          shl     $51,%rcx
          or      %rcx,%rax
          mov     %rax,(%rdi)

          # word 1 = limb 1 >> 13 | limb 2 << 38
          shr     $13,%rdx
          mov     %r8,%rcx
          shl     $38,%rcx
          or      %rcx,%rdx
          mov     %rdx,8(%rdi)

          # word 2 = limb 2 >> 26 | limb 3 << 25
          shr     $26,%r8
          mov     %r9,%rcx
          shl     $25,%rcx
          or      %rcx,%r8
          mov     %r8,16(%rdi)

          # word 3 = limb 3 >> 39 | limb 4 << 12
          shr     $39,%r9
          shl     $12,%r10
          or      %r10,%r9
          mov     %r9,24(%rdi)
          ret
  ################################################################################
  # cmult - calculates nQ where Q is the x-coordinate of a point on the curve
  #
  # Registers: RDI: (output) final x, uint64_t[5]
  #            RSI: (output) final z, uint64_t[5]
  #            RDX: (input)  n, read as four 64-bit words
  #            RCX: (input)  q, read as five 64-bit limbs
  # Calls:     crypto_scalarmult_curve25519_donna_fmonty (external) once per
  #            bit of n, 256 times in total.
  #
  # n is scanned from the word at offset 24 down to the word at offset 0, each
  # word from its most significant bit downwards.
  # NOTE(review): the original header labelled n and q "big-endian"; confirm
  # against the caller which byte order is actually intended.
  #
  # Stack frame, after rsp has been rounded down to a 64-byte boundary:
  #
  #     (nQ)'           4 x 80 bytes of working state, two banks of 160 bytes
  #     (nQ+Q)'
  #     nQ
  #     nQ+Q            <-- 48(%rsp)
  #     saved rbx, r12, r15, rdi, rsi plus 8 bytes of padding   <-- %rsp
  #
  # The padding makes the area below the banks 48 bytes, so rsp is a multiple
  # of 16 at the call to fmonty as the System V ABI requires, while the banks
  # stay 64-byte aligned for the movdqa accesses. With only the five pushes
  # (40 bytes) rsp would be 8 modulo 16 at the call.
  ################################################################################

  # xmm0 = 128-bit mask, all ones if the top bit of r15 is set, else zero.
  # Uses r8 and xmm1.
  .macro cmult_mask
          mov     %r15,%r8
          sar     $63,%r8
          movq    %r8,%xmm1
          movq    %xmm1,%xmm0
          pslldq  $8,%xmm0
          por     %xmm1,%xmm0
  .endm

  # Exchange two XMM registers where the mask in xmm0 is set. Uses xmm11.
  .macro cmult_lane_swap ra, rb
          movdqa  \ra,%xmm11
          pxor    \rb,%xmm11
          pand    %xmm0,%xmm11
          pxor    %xmm11,\ra
          pxor    %xmm11,\rb
  .endm

  # Swap the 80-byte arrays at (%rdi) and 80(%rdi) under the mask in xmm0,
  # without branching on the mask. Uses xmm1-xmm11.
  .macro cmult_cswap
          movdqa  (%rdi),%xmm1
          movdqa  80(%rdi),%xmm2
          movdqa  16(%rdi),%xmm3
          movdqa  96(%rdi),%xmm4
          movdqa  32(%rdi),%xmm5
          movdqa  112(%rdi),%xmm6
          movdqa  48(%rdi),%xmm7
          movdqa  128(%rdi),%xmm8
          movdqa  64(%rdi),%xmm9
          movdqa  144(%rdi),%xmm10
          cmult_lane_swap %xmm1,%xmm2
          cmult_lane_swap %xmm3,%xmm4
          cmult_lane_swap %xmm5,%xmm6
          cmult_lane_swap %xmm7,%xmm8
          cmult_lane_swap %xmm9,%xmm10
          movdqa  %xmm1,(%rdi)
          movdqa  %xmm2,80(%rdi)
          movdqa  %xmm3,16(%rdi)
          movdqa  %xmm4,96(%rdi)
          movdqa  %xmm5,32(%rdi)
          movdqa  %xmm6,112(%rdi)
          movdqa  %xmm7,48(%rdi)
          movdqa  %xmm8,128(%rdi)
          movdqa  %xmm9,64(%rdi)
          movdqa  %xmm10,144(%rdi)
  .endm

  crypto_scalarmult_curve25519_donna_cmult:
          push    %rbp
          push    %r13
          push    %r14
          mov     %rsp,%rbp               # rbp remembers the unaligned rsp
          mov     $63,%r8
          not     %r8
          and     %r8,%rsp                # round rsp down to a multiple of 64
          mov     %rdx,%r13               # r13 = n
          mov     %rcx,%r14               # r14 = q
          sub     $320,%rsp               # two banks of 160 bytes

          # nQ+Q (x) = q
          movq    (%rcx),%rax
          movq    %rax,(%rsp)
          movq    8(%rcx),%r8
          movq    %r8,8(%rsp)
          movq    16(%rcx),%r9
          movq    %r9,16(%rsp)
          movq    24(%rcx),%r10
          movq    %r10,24(%rsp)
          movq    32(%rcx),%r11
          movq    %r11,32(%rsp)

          # nQ+Q (z) = 1
          movq    $1,40(%rsp)
          movq    $0,48(%rsp)
          movq    $0,56(%rsp)
          movq    $0,64(%rsp)
          movq    $0,72(%rsp)

          # nQ (x) = 1
          movq    $1,80(%rsp)
          movq    $0,88(%rsp)
          movq    $0,96(%rsp)
          movq    $0,104(%rsp)
          movq    $0,112(%rsp)

          # nQ (z) = 0
          movq    $0,120(%rsp)
          movq    $0,128(%rsp)
          movq    $0,136(%rsp)
          movq    $0,144(%rsp)
          movq    $0,152(%rsp)

          push    %rbx
          push    %r12
          push    %r15
          push    %rdi
          push    %rsi
          sub     $8,%rsp                 # padding: rsp is now 16-byte aligned

          # r12 is the offset of the bank being written (160 = primed bank);
          # its complement, r12 ^ 160, is the bank being read. The two swap
          # roles after every bit.
          mov     $160,%r12
          mov     $32,%rbx

  cmult_loop_outer:
          # On entry rbx is the byte offset just past the next word of n. After
          # stepping back 8 bytes and loading the word, the offset moves to the
          # top 32 bits of rbx and ebx becomes the bit counter.
          sub     $8,%rbx
          movq    (%r13,%rbx),%r15
          shl     $32,%rbx
          or      $64,%rbx

  cmult_loop_inner:
          # Live across the call to fmonty (all callee-saved):
          #   rbx: loop counters       r12: bank offset
          #   r13: n                   r14: q
          #   r15: current word of n, shifted left one bit per iteration

          # Swap nQ+Q and nQ in the bank being read if the current bit is set.
          cmult_mask
          mov     %r12,%r11
          xor     $160,%r11
          lea     48(%rsp,%r11),%rdi
          cmult_cswap

          # fmonty(rdi = out + 80, rsi = out, rdx = in + 80, rcx = in, r8 = q)
          mov     %rdi,%rdx
          lea     48(%rsp,%r12),%rdi
          mov     %rdi,%rsi
          add     $80,%rdi
          mov     %rdx,%rcx
          add     $80,%rdx
          mov     %r14,%r8
          call    crypto_scalarmult_curve25519_donna_fmonty

          # The XMM registers do not survive the call, so the mask is rebuilt
          # from r15 before the same conditional swap is applied to the bank
          # that was just written.
          cmult_mask
          lea     48(%rsp,%r12),%rdi
          cmult_cswap

          shl     $1,%r15                 # next bit of n into the top position
          xor     $160,%r12               # exchange the roles of the two banks
          dec     %rbx
          test    %ebx,%ebx               # only the low half is the bit counter
          jnz     cmult_loop_inner

          shr     $32,%rbx                # back to the byte offset into n
          test    %rbx,%rbx
          jnz     cmult_loop_outer

          add     $8,%rsp                 # drop the alignment padding
          pop     %rsi
          pop     %rdi
          pop     %r15
          pop     %r12
          pop     %rbx

          # Copy nQ out of the unprimed bank: x from 80(%rsp), z from 120(%rsp).
          lea     80(%rsp),%r8
          movq    (%r8),%rax
          movq    %rax,(%rdi)
          movq    8(%r8),%rax
          movq    %rax,8(%rdi)
          movq    16(%r8),%rax
          movq    %rax,16(%rdi)
          movq    24(%r8),%rax
          movq    %rax,24(%rdi)
          movq    32(%r8),%rax
          movq    %rax,32(%rdi)
          movq    40(%r8),%rax
          movq    %rax,(%rsi)
          movq    48(%r8),%rax
          movq    %rax,8(%rsi)
          movq    56(%r8),%rax
          movq    %rax,16(%rsi)
          movq    64(%r8),%rax
          movq    %rax,24(%rsi)
          movq    72(%r8),%rax
          movq    %rax,32(%rsi)

          mov     %rbp,%rsp
          pop     %r14
          pop     %r13
          pop     %rbp
          ret