12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758 |
- # 1 "curve25519-donna-x86-64.s"
- # 1 "<built-in>"
- # 1 "<command-line>"
- # 1 "curve25519-donna-x86-64.s"
- # 2008, Google Inc.
- # All rights reserved.
- # Code released into the public domain
- ################################################################################
- # curve25519-donna.s - an x86-64 bit implementation of curve25519. See the
- # comments at the top of curve25519-donna.c
- # Adam Langley <agl@imperialviolet.org>
- # Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
- # More information about curve25519 can be found here
- # http://cr.yp.to/ecdh.html
- ################################################################################
- .text
- .extern crypto_scalarmult_curve25519_donna_fmonty
- .globl crypto_scalarmult_curve25519_donna_fmul
- .globl crypto_scalarmult_curve25519_donna_fsquare
- .globl crypto_scalarmult_curve25519_donna_fexpand
- .globl crypto_scalarmult_curve25519_donna_fcontract
- .globl crypto_scalarmult_curve25519_donna_freduce_coefficients
- .globl crypto_scalarmult_curve25519_donna_fscalar
- .globl crypto_scalarmult_curve25519_donna_fdifference_backwards
- .globl crypto_scalarmult_curve25519_donna_cmult
- ################################################################################
- # fmul - multiply two 256-bit numbers, each held as uint64_t[5] (51-bit limbs, see the reduction notes below), and reduce the product mod 2**255-19
- # Registers: RDI (output): uint64_t[5] product
- # RSI (input): uint64_t[5] input 1
- # RDX (input): uint64_t[5] input 2
- ################################################################################
- crypto_scalarmult_curve25519_donna_fmul:
- # Input pointers: rdi (output), rsi (in1), rdx (in2)
- # Spill: rbx, r12..15 (restored before ret) and rdi (the output pointer, popped just before the results are written out)
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
- push %rdi
- # Load 5 64-bit values from *rsi to rsi, r8..11 (rcx keeps the pointer while rsi is overwritten)
- mov %rsi,%rcx
- mov (%rcx),%rsi
- mov 8(%rcx),%r8
- mov 16(%rcx),%r9
- mov 24(%rcx),%r10
- mov 32(%rcx),%r11
- # Load 5 64-bit values from *rdx to rdi, r12..15; rdx is then free to receive the high half of every mul
- mov (%rdx),%rdi
- mov 8(%rdx),%r12
- mov 16(%rdx),%r13
- mov 24(%rdx),%r14
- mov 32(%rdx),%r15
- # We are going to perform a polynomial multiplication of two five-element
- # polynomials. I and J are the polynomials, and I2 would be the coefficient of
- # x^2, etc.
- # 85 "curve25519-donna-x86-64.s"
- # We accumulate each 128-bit partial sum in RCX:RBX (high:low)
- # 97 "curve25519-donna-x86-64.s"
- # p[0] = i[0] * j[0]
- # p[0] stored in xmm0 (low 64 bits), xmm1 (high 64 bits); every later p[k] is parked in xmm(2k), xmm(2k+1) the same way
- mov %rsi,%rax ; mul %rdi
- movq %rax,%xmm0
- movq %rdx,%xmm1
- # p[1] = i[0] * j[1] + i[1] * j[0]
- mov %rsi,%rax ; mul %r12 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %r8,%rax ; mul %rdi ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm2
- movq %rcx,%xmm3
- # p[2] = i[1] * j[1] + i[0] * j[2] + i[2] * j[0]
- mov %r8,%rax ; mul %r12 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %rsi,%rax ; mul %r13 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r9,%rax ; mul %rdi ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm4
- movq %rcx,%xmm5
- # p[3] = i[0] * j[3] + i[3] * j[0] + i[1] * j[2] + i[2] * j[1]
- mov %rsi,%rax ; mul %r14 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %r10,%rax ; mul %rdi ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r8,%rax ; mul %r13 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r9,%rax ; mul %r12 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm6
- movq %rcx,%xmm7
- # p[4] = i[0] * j[4] + i[4] * j[0] + i[3] * j[1] + i[1] * j[3] + i[2] * j[2]
- mov %rsi,%rax ; mul %r15 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %r11,%rax ; mul %rdi ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r10,%rax ; mul %r12 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r8,%rax ; mul %r14 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r9,%rax ; mul %r13 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm8
- movq %rcx,%xmm9
- # p[5] = i[4] * j[1] + i[1] * j[4] + i[2] * j[3] + i[3] * j[2]
- mov %r11,%rax ; mul %r12 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %r8,%rax ; mul %r15 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r9,%rax ; mul %r14 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r10,%rax ; mul %r13 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm10
- movq %rcx,%xmm11
- # p[6] = i[4] * j[2] + i[2] * j[4] + i[3] * j[3]
- mov %r11,%rax ; mul %r13 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %r9,%rax ; mul %r15 ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r10,%rax ; mul %r14 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm12
- movq %rcx,%xmm13
- # p[7] = i[3] * j[4] + i[4] * j[3]
- mov %r10,%rax ; mul %r15 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %r11,%rax ; mul %r14 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm14
- movq %rcx,%xmm15
- # p[8] = i[4] * j[4], keeping it in RDX:RAX
- mov %r11,%rax ; mul %r15
- donna_reduce:
- # We are done with the original inputs now, so we start reusing their registers. fsquare also jumps here, with p[0..7] in xmm0..15 and p[8] in RDX:RAX and the same six registers pushed.
- # At this point we have a degree 8 resulting polynomial and we need to reduce
- # mod 2**255-19. Since 2**255 is in our polynomial, we can multiply the
- # coefficients of the higher powers by 19 and add them to the lower powers. The limb
- # size (51-bits) is chosen to avoid overflows.
- mov $19,%r15
- # p[8] *= 19, store in R13:R12 (high:low)
- mov %rdx,%r13
- mul %r15
- imul %r15,%r13
- add %rdx,%r13
- mov %rax,%r12
- # p[3] += p[8] * 19 (sum stays in R13:R12)
- movq %xmm7,%rcx
- movq %xmm6,%rbx
- add %rbx,%r12
- adc %rcx,%r13
- # 209 "curve25519-donna-x86-64.s"
- # p[2] += p[7] * 19, store in R11:R10
- movq %xmm14,%rax ; mul %r15 ; movq %xmm15,%r11 ; imul %r15,%r11 ; add %rdx,%r11 ; mov %rax,%r10 ; movq %xmm5,%rcx ; movq %xmm4,%rbx ; add %rbx,%r10 ; adc %rcx,%r11
- # p[1] += p[6] * 19, store in R9:R8
- movq %xmm12,%rax ; mul %r15 ; movq %xmm13,%r9 ; imul %r15,%r9 ; add %rdx,%r9 ; mov %rax,%r8 ; movq %xmm3,%rcx ; movq %xmm2,%rbx ; add %rbx,%r8 ; adc %rcx,%r9
- # p[0] += p[5] * 19, store in RDI:RSI
- movq %xmm10,%rax ; mul %r15 ; movq %xmm11,%rdi ; imul %r15,%rdi ; add %rdx,%rdi ; mov %rax,%rsi ; movq %xmm1,%rcx ; movq %xmm0,%rbx ; add %rbx,%rsi ; adc %rcx,%rdi
- # p[4], store in R15:R14
- movq %xmm9,%r15
- movq %xmm8,%r14
- # Coefficient reduction
- # rbx = mask with the bottom 51 bits set. NOTE(review): the 19 loaded into rcx is not read again below (the wrap-around uses imul $19 directly)
- mov $0x7ffffffffffff,%rbx
- mov $19,%rcx
- coeffreduction:
- # The carry chain takes the excess bits from a 128-bit result (excess is
- # anything above the low 51 bits) and adds them to the next value. If the top
- # value spills over, we reduce mod 2**255-19 again by multiplying by 19 and
- # adding onto the bottom.
- # 262 "curve25519-donna-x86-64.s"
- mov %rsi,%rax ; shr $51,%rsi ; shl $13,%rdi ; or %rsi,%rdi ; add %rdi,%r8 ; adc $0,%r9 ; xor %rdi,%rdi ; mov %rax,%rsi ; and %rbx,%rsi
- mov %r8,%rax ; shr $51,%r8 ; shl $13,%r9 ; or %r8,%r9 ; add %r9,%r10 ; adc $0,%r11 ; xor %r9,%r9 ; mov %rax,%r8 ; and %rbx,%r8
- mov %r10,%rax ; shr $51,%r10 ; shl $13,%r11 ; or %r10,%r11 ; add %r11,%r12 ; adc $0,%r13 ; xor %r11,%r11 ; mov %rax,%r10 ; and %rbx,%r10
- mov %r12,%rax ; shr $51,%r12 ; shl $13,%r13 ; or %r12,%r13 ; add %r13,%r14 ; adc $0,%r15 ; xor %r13,%r13 ; mov %rax,%r12 ; and %rbx,%r12
- mov %r14,%rax ; shr $51,%r14 ; shl $13,%r15 ; or %r14,%r15 ; imul $19,%r15 ; add %r15,%rsi ; adc $0,%rdi ; xor %r15,%r15 ; mov %rax,%r14 ; and %rbx,%r14
- mov %rsi,%rax ; shr $51,%rsi ; shl $13,%rdi ; or %rsi,%rdi ; add %rdi,%r8 ; adc $0,%r9 ; xor %rdi,%rdi ; mov %rax,%rsi ; and %rbx,%rsi
- # write out results, which are in rsi, r8, r10, r12, r14 (r8 is not masked again after the final carry from rsi)
- # output pointer is on top of the stack
- pop %rdi
- mov %rsi,(%rdi)
- mov %r8,8(%rdi)
- mov %r10,16(%rdi)
- mov %r12,24(%rdi)
- mov %r14,32(%rdi)
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- ret
- ################################################################################
- # fsquare - square a 256-bit number held as uint64_t[5]
- # Registers: RDI (output): uint64_t[5] product
- # RSI (input): uint64_t[5] input
- # This is very similar to fmul, above, however when squaring a number we can
- # save some multiplications and replace them with doublings (sal/rcl across a 128-bit pair). The same six registers are pushed as in fmul because the routine finishes inside fmul's donna_reduce code.
- ################################################################################
- crypto_scalarmult_curve25519_donna_fsquare:
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
- push %rdi
- # Load 5 64-bit values from *rsi to rsi, r8..11 (rcx keeps the pointer while rsi is overwritten)
- mov %rsi,%rcx
- mov (%rcx),%rsi
- mov 8(%rcx),%r8
- mov 16(%rcx),%r9
- mov 24(%rcx),%r10
- mov 32(%rcx),%r11
- # p[0] = i[0] * i[0]
- # p[0] stored in xmm0 (low 64 bits), xmm1 (high 64 bits); every later p[k] is parked in xmm(2k), xmm(2k+1) the same way
- mov %rsi,%rax ; mul %rsi
- movq %rax,%xmm0
- movq %rdx,%xmm1
- # p[1] = 2 * i[0] * i[1]
- mov %rsi,%rax ; mul %r8
- sal $1,%rax
- rcl $1,%rdx
- movq %rax,%xmm2
- movq %rdx,%xmm3
- # p[2] = i[1] * i[1] + 2 * i[0] * i[2]
- # 345 "curve25519-donna-x86-64.s"
- mov %r8,%rax ; mul %r8 ; mov %rax,%rbx ; mov %rdx,%rcx
- mov %rsi,%rax ; mul %r9 ; sal $1,%rax ; rcl $1,%rdx ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm4
- movq %rcx,%xmm5
- # p[3] = 2 * i[0] * i[3] + 2 * i[1] * i[2]
- mov %rsi,%rax ; mul %r10 ; mov %rax,%rbx ; mov %rdx,%rcx ; sal $1,%rbx ; rcl $1,%rcx
- mov %r8,%rax ; mul %r9 ; sal $1,%rax ; rcl $1,%rdx ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm6
- movq %rcx,%xmm7
- # p[4] = 2 * i[0] * i[4] + 2 * i[3] * i[1] + i[2] * i[2]
- mov %rsi,%rax ; mul %r11 ; mov %rax,%rbx ; mov %rdx,%rcx ; sal $1,%rbx ; rcl $1,%rcx
- mov %r10,%rax ; mul %r8 ; sal $1,%rax ; rcl $1,%rdx ; add %rax,%rbx ; adc %rdx,%rcx
- mov %r9,%rax ; mul %r9 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm8
- movq %rcx,%xmm9
- # p[5] = 2 * i[4] * i[1] + 2 * i[2] * i[3]
- mov %r11,%rax ; mul %r8 ; mov %rax,%rbx ; mov %rdx,%rcx ; sal $1,%rbx ; rcl $1,%rcx
- mov %r9,%rax ; mul %r10 ; sal $1,%rax ; rcl $1,%rdx ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm10
- movq %rcx,%xmm11
- # p[6] = 2 * i[4] * i[2] + i[3] * i[3]
- mov %r11,%rax ; mul %r9 ; mov %rax,%rbx ; mov %rdx,%rcx ; sal $1,%rbx ; rcl $1,%rcx
- mov %r10,%rax ; mul %r10 ; add %rax,%rbx ; adc %rdx,%rcx
- movq %rbx,%xmm12
- movq %rcx,%xmm13
- # p[7] = 2 * i[3] * i[4]
- mov %r10,%rax ; mul %r11
- sal $1,%rax
- rcl $1,%rdx
- movq %rax,%xmm14
- movq %rdx,%xmm15
- # p[8] = i[4] * i[4], keeping it in RDX:RAX; the jump below enters fmul's donna_reduce, which reduces p[0..8], pops the six registers pushed above, stores the result and returns
- mov %r11,%rax ; mul %r11
- jmp donna_reduce
- ################################################################################
- # fdifference_backwards - out[i] = in[i] - out[i] for five limbs (note order)
- # Registers: RDI: (in/out) pointer to five 64-bit limbs, subtracted from the input
- # RSI: (input) pointer to five 64-bit limbs
- # Clobbers: rax, rcx, rdx, rsi, r8..r11, flags
- # 439 "curve25519-donna-x86-64.s"
- ################################################################################
- crypto_scalarmult_curve25519_donna_fdifference_backwards:
- # rax, r8, r9, r10, r11 = in[0..4] - out[0..4]; any of them may now be negative
- movq (%rsi),%rax
- subq (%rdi),%rax
- movq 8(%rsi),%r8
- subq 8(%rdi),%r8
- movq 16(%rsi),%r9
- subq 16(%rdi),%r9
- movq 24(%rsi),%r10
- subq 24(%rdi),%r10
- movq 32(%rsi),%r11
- subq 32(%rdi),%r11
- # rdx = 2**51, the amount borrowed from the next limb
- movabsq $0x8000000000000,%rdx
- fdifference_backwards_loop:
- # Branch-free borrow: sar $63 turns a negative limb into an all-ones mask and
- # a non-negative limb into zero. ANDing the mask with 2**51 gives the amount to
- # add back to this limb; shifting that right by 51 gives the 1 (or 0) to take
- # from the next limb.
- # 482 "curve25519-donna-x86-64.s"
- # limb 0 borrows from limb 1
- movq %rax,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%rax
- shrq $51,%rcx
- subq %rcx,%r8
- # limb 1 borrows from limb 2
- movq %r8,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%r8
- shrq $51,%rcx
- subq %rcx,%r9
- # limb 2 borrows from limb 3
- movq %r9,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%r9
- shrq $51,%rcx
- subq %rcx,%r10
- # limb 3 borrows from limb 4
- movq %r10,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%r10
- shrq $51,%rcx
- subq %rcx,%r11
- # limb 4 wraps around: add 2**51 to it and take 19 from limb 0 (rsi is reused
- # as a scratch copy of the mask; the input pointer is no longer needed)
- movq %r11,%rcx
- sarq $63,%rcx
- movq %rcx,%rsi
- andq %rdx,%rcx
- addq %rcx,%r11
- andq $19,%rsi
- subq %rsi,%rax
- # second pass over limbs 0..3 in case the wrap-around made limb 0 negative
- movq %rax,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%rax
- shrq $51,%rcx
- subq %rcx,%r8
- movq %r8,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%r8
- shrq $51,%rcx
- subq %rcx,%r9
- movq %r9,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%r9
- shrq $51,%rcx
- subq %rcx,%r10
- movq %r10,%rcx
- sarq $63,%rcx
- andq %rdx,%rcx
- addq %rcx,%r10
- shrq $51,%rcx
- subq %rcx,%r11
- # store the five limbs back over the output
- movq %rax,(%rdi)
- movq %r8,8(%rdi)
- movq %r9,16(%rdi)
- movq %r10,24(%rdi)
- movq %r11,32(%rdi)
- ret
- ################################################################################
- # fscalar - multiply by 121665
- # Registers: RDI: (out) pointer to uint64_t[5]
- # RSI: (in) pointer to uint64_t[5]
- # Clobbers: rax, rcx, rdx, r8, r9, flags
- # Since we only have 13-bits of space at the top of our limbs, this is a full,
- # cascading multiplication: each limb keeps the low 64 bits of its product, and
- # the high word, shifted left by 13 (2**64 = 2**13 * 2**51), is carried into
- # the next limb. The carry out of the top limb is multiplied by 19 and added to
- # limb 0.
- # NOTE(review): the carry flag produced by each add below is not propagated;
- # presumably callers keep the limbs small enough for that never to matter -
- # confirm against the callers.
- ################################################################################
- crypto_scalarmult_curve25519_donna_fscalar:
- movq $121665,%rcx
- # limb 0: the low word waits in r9 until the wrap-around term is known
- movq (%rsi),%rax
- mulq %rcx
- movq %rax,%r9
- shlq $13,%rdx
- movq %rdx,%r8
- # limb 1
- movq 8(%rsi),%rax
- mulq %rcx
- addq %r8,%rax
- movq %rax,8(%rdi)
- shlq $13,%rdx
- movq %rdx,%r8
- # limb 2
- movq 16(%rsi),%rax
- mulq %rcx
- addq %r8,%rax
- movq %rax,16(%rdi)
- shlq $13,%rdx
- movq %rdx,%r8
- # limb 3
- movq 24(%rsi),%rax
- mulq %rcx
- addq %r8,%rax
- movq %rax,24(%rdi)
- shlq $13,%rdx
- movq %rdx,%r8
- # limb 4
- movq 32(%rsi),%rax
- mulq %rcx
- addq %r8,%rax
- movq %rax,32(%rdi)
- # the carry out of limb 4 sits at 2**255, so it comes back into limb 0 times 19
- shlq $13,%rdx
- imulq $19,%rdx,%rax
- addq %rax,%r9
- movq %r9,(%rdi)
- ret
- ################################################################################
- # freduce_coefficients - one carry pass over five limbs, in place
- # Registers: RDI: (in/out) pointer to uint64_t[5]
- # Clobbers: rax, rcx, r8..r11, flags (r12 is saved and restored)
- # Each limb is cut to its low 51 bits and the bits above are added to the next
- # limb. The excess of limb 4 is multiplied by 19 and added to limb 0; limb 0
- # is not masked again after that addition, so it can end up slightly above
- # 51 bits.
- ################################################################################
- crypto_scalarmult_curve25519_donna_freduce_coefficients:
- push %r12
- # rcx = mask with the bottom 51 bits set
- mov $0x7ffffffffffff,%rcx
- mov (%rdi),%r8
- mov 8(%rdi),%r9
- mov 16(%rdi),%r10
- mov 24(%rdi),%r11
- mov 32(%rdi),%r12
- # limb 0 -> limb 1
- mov %r8,%rax
- shr $51,%rax
- add %rax,%r9
- and %rcx,%r8
- # limb 1 -> limb 2
- mov %r9,%rax
- shr $51,%rax
- add %rax,%r10
- and %rcx,%r9
- # limb 2 -> limb 3
- mov %r10,%rax
- shr $51,%rax
- add %rax,%r11
- and %rcx,%r10
- # limb 3 -> limb 4
- mov %r11,%rax
- shr $51,%rax
- add %rax,%r12
- and %rcx,%r11
- # limb 4 wraps around to limb 0, multiplied by 19
- mov %r12,%rax
- shr $51,%rax
- imul $19,%rax
- add %rax,%r8
- and %rcx,%r12
- mov %r8,(%rdi)
- mov %r9,8(%rdi)
- mov %r10,16(%rdi)
- mov %r11,24(%rdi)
- mov %r12,32(%rdi)
- pop %r12
- ret
- ################################################################################
- # fexpand - convert a packed (32 byte) representation to 5 uint64_t's
- # Registers: RDI: (output) pointer to uint64_t[5]
- # RSI: (input) pointer to uint8_t[32]
- # Clobbers: rax, rdx, flags
- # Limb k receives bits 51*k .. 51*k+50 of the input; bit 255 is dropped. Each
- # limb is fetched with one unaligned 8-byte load that starts at or before the
- # byte holding its first bit, then shifted down and masked to 51 bits.
- ################################################################################
- crypto_scalarmult_curve25519_donna_fexpand:
- # rdx = mask with the bottom 51 bits set
- mov $0x7ffffffffffff,%rdx
- # limb 0: bits 0..50, bytes 0..7
- mov (%rsi),%rax
- and %rdx,%rax
- mov %rax,(%rdi)
- # limb 1: bits 51..101 start at bit 3 of byte 6
- mov 6(%rsi),%rax
- shr $3,%rax
- and %rdx,%rax
- mov %rax,8(%rdi)
- # limb 2: bits 102..152 start at bit 6 of byte 12
- mov 12(%rsi),%rax
- shr $6,%rax
- and %rdx,%rax
- mov %rax,16(%rdi)
- # limb 3: bits 153..203 start at bit 1 of byte 19
- mov 19(%rsi),%rax
- shr $1,%rax
- and %rdx,%rax
- mov %rax,24(%rdi)
- # limb 4: bits 204..254. Load from byte 24 and shift by 12 so that the load
- # covers bytes 24..31 only; loading from byte 25 (shift 4) yields the same bits
- # but touches byte 32, one past the end of the 32-byte input.
- mov 24(%rsi),%rax
- shr $12,%rax
- and %rdx,%rax
- mov %rax,32(%rdi)
- ret
- ################################################################################
- # fcontract - convert 5 uint64_t's to a packed (32 byte) representation
- # Registers: RDI: (output) pointer to uint8_t[32]
- # RSI: (input) pointer to uint64_t[5]
- # Clobbers: rax, rcx, rdx, r8, r9, r10, flags
- # The limbs are shifted and ORed together without masking, so each output word
- # is only a clean concatenation when every limb already fits in 51 bits.
- ################################################################################
- crypto_scalarmult_curve25519_donna_fcontract:
- # rax, rdx, r8, r9, r10 = limbs 0..4 (all loads happen before any store)
- movq (%rsi),%rax
- movq 8(%rsi),%rdx
- movq 16(%rsi),%r8
- movq 24(%rsi),%r9
- movq 32(%rsi),%r10
- # word 0 = limb0 | limb1 << 51
- movq %rdx,%rcx
- shlq $51,%rcx
- orq %rcx,%rax
- # word 1 = limb1 >> 13 | limb2 << 38
- shrq $13,%rdx
- movq %r8,%rcx
- shlq $38,%rcx
- orq %rcx,%rdx
- # word 2 = limb2 >> 26 | limb3 << 25
- shrq $26,%r8
- movq %r9,%rcx
- shlq $25,%rcx
- orq %rcx,%r8
- # word 3 = limb3 >> 39 | limb4 << 12
- shrq $39,%r9
- shlq $12,%r10
- orq %r10,%r9
- # write the four packed words
- movq %rax,(%rdi)
- movq %rdx,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- ret
- ################################################################################
- # cmult - calculates nQ where Q is the x-coordinate of a point on the curve
- # Registers: RDI: (output) final x, five qwords
- # RSI: (output) final z, five qwords
- # RDX: (input) n, read as four qwords from offset 24 down to offset 0, each
- # consumed from its most significant bit
- # RCX: (input) q, five qwords copied in as the x of nQ+Q and handed to every
- # fmonty call in r8
- # NOTE(review): earlier revisions of this comment called n and q "big-endian";
- # that does not obviously match the loads described above - confirm against
- # the callers.
- # Calls crypto_scalarmult_curve25519_donna_fmonty once per bit of n (256 times).
- # 747 "curve25519-donna-x86-64.s"
- ################################################################################
- crypto_scalarmult_curve25519_donna_cmult:
- push %rbp
- push %r13
- push %r14
- mov %rsp,%rbp
- # Round rsp down to a 64-byte boundary; the working area below is accessed
- # with movdqa, which needs 16-byte alignment. rbp keeps the old rsp.
- and $-64,%rsp
- mov %rdx,%r13
- mov %rcx,%r14
- # 320 bytes = two banks of (nQ+Q x, nQ+Q z, nQ x, nQ z), 40 bytes per element
- sub $320,%rsp
- # value nQ+Q (x)
- movq (%rcx),%rax
- movq %rax,(%rsp)
- movq 8(%rcx),%r8
- movq %r8,8(%rsp)
- movq 16(%rcx),%r9
- movq %r9,16(%rsp)
- movq 24(%rcx),%r10
- movq %r10,24(%rsp)
- movq 32(%rcx),%r11
- movq %r11,32(%rsp)
- # value nQ+Q (z)
- movq $1,40(%rsp)
- movq $0,48(%rsp)
- movq $0,56(%rsp)
- movq $0,64(%rsp)
- movq $0,72(%rsp)
- # value nQ (x)
- movq $1,80(%rsp)
- movq $0,88(%rsp)
- movq $0,96(%rsp)
- movq $0,104(%rsp)
- movq $0,112(%rsp)
- # value nQ (z)
- movq $0,120(%rsp)
- movq $0,128(%rsp)
- movq $0,136(%rsp)
- movq $0,144(%rsp)
- movq $0,152(%rsp)
- push %rbx
- push %r12
- push %r15
- push %rdi
- push %rsi
- # The five pushes leave rsp at (64-byte boundary - 40), i.e. 8 mod 16. The
- # System V ABI requires rsp to be a multiple of 16 at every call, so pad by 8
- # more bytes. The working area therefore starts at 48(%rsp) inside the loop.
- sub $8,%rsp
- # The stack looks like
- # (nQ)'
- # (nQ+Q)'
- # nQ
- # nQ+Q
- # saved registers (40-bytes)
- # alignment pad (8-bytes) <-- %rsp
- # We switch between the two banks with an offset in %r12, starting by writing
- # into the prime bank and reading from the non-prime bank.
- # Based on the current MSB of the operand, the two values of a bank are swapped
- # (or left alone) with a mask held in %xmm0, once before and once after fmonty.
- mov $160,%r12
- mov $32,%rbx
- cmult_loop_outer:
- # On entry to the loop, the word offset is kept in %rbx. We dec 8 bytes and
- # then store the outer loop counter in the top 32-bits of %rbx. The inner loop
- # counter is kept in %ebx
- sub $8,%rbx
- movq (%r13,%rbx),%r15
- shl $32,%rbx
- or $64,%rbx
- cmult_loop_inner:
- # Register allocation:
- # r11: complement r12
- # Expected to be preserved by fmonty (all are callee-saved registers):
- # rbx: loop counters
- # r12: bank switch offset
- # r13: (input) n
- # r14: (input) q
- # r15: the current qword, getting left shifted
- # We wish to test the MSB of the qword in r15. An arithmetic shift right of 63
- # places turns this either into all 1's (if MSB is set) or all zeros otherwise.
- mov %r15,%r8
- sar $63,%r8
- # Now replicate the mask to 128-bits in xmm0
- movq %r8,%xmm1
- movq %xmm1,%xmm0
- pslldq $8,%xmm0
- por %xmm1,%xmm0
- # Based on that mask, we swap the contents of several arrays in a side-channel
- # free manner: the 80 bytes at (%rdi) are exchanged with the 80 bytes at
- # 80(%rdi) when the mask in %xmm0 is all ones and left alone when it is zero.
- # xmm1..xmm10 hold the data and xmm11 is the temporary.
- # 893 "curve25519-donna-x86-64.s"
- # rdi = bank being read this round (the one not selected by r12)
- mov %r12,%r11
- xor $160,%r11
- lea 48(%rsp,%r11),%rdi
- movdqa (%rdi),%xmm1 ; movdqa 80(%rdi),%xmm2 ; movdqa 16(%rdi),%xmm3 ; movdqa 96(%rdi),%xmm4 ; movdqa 32(%rdi),%xmm5 ; movdqa 112(%rdi),%xmm6 ; movdqa 48(%rdi),%xmm7 ; movdqa 128(%rdi),%xmm8 ; movdqa 64(%rdi),%xmm9 ; movdqa 144(%rdi),%xmm10 ; movdqa %xmm1,%xmm11 ; pxor %xmm2,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm1 ; pxor %xmm11,%xmm2 ; movdqa %xmm3,%xmm11 ; pxor %xmm4,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm3 ; pxor %xmm11,%xmm4 ; movdqa %xmm5,%xmm11 ; pxor %xmm6,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm5 ; pxor %xmm11,%xmm6 ; movdqa %xmm7,%xmm11 ; pxor %xmm8,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm7 ; pxor %xmm11,%xmm8 ; movdqa %xmm9,%xmm11 ; pxor %xmm10,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm9 ; pxor %xmm11,%xmm10 ; movdqa %xmm1,(%rdi) ; movdqa %xmm2,80(%rdi) ; movdqa %xmm3,16(%rdi) ; movdqa %xmm4,96(%rdi) ; movdqa %xmm5,32(%rdi) ; movdqa %xmm6,112(%rdi) ; movdqa %xmm7,48(%rdi) ; movdqa %xmm8,128(%rdi) ; movdqa %xmm9,64(%rdi) ; movdqa %xmm10,144(%rdi)
- # fmonty arguments: rdi = written bank + 80, rsi = written bank,
- # rdx = read bank + 80, rcx = read bank, r8 = q
- mov %rdi,%rdx
- lea 48(%rsp,%r12),%rdi
- mov %rdi,%rsi
- add $80,%rdi
- mov %rdx,%rcx
- add $80,%rdx
- mov %r14,%r8
- call crypto_scalarmult_curve25519_donna_fmonty
- # xmm registers do not survive the call, so rebuild the mask from r15 and apply
- # the same conditional swap to the bank that was just written.
- mov %r15,%r8
- sar $63,%r8
- movq %r8,%xmm1
- movq %xmm1,%xmm0
- pslldq $8,%xmm0
- por %xmm1,%xmm0
- lea 48(%rsp,%r12),%rdi
- movdqa (%rdi),%xmm1 ; movdqa 80(%rdi),%xmm2 ; movdqa 16(%rdi),%xmm3 ; movdqa 96(%rdi),%xmm4 ; movdqa 32(%rdi),%xmm5 ; movdqa 112(%rdi),%xmm6 ; movdqa 48(%rdi),%xmm7 ; movdqa 128(%rdi),%xmm8 ; movdqa 64(%rdi),%xmm9 ; movdqa 144(%rdi),%xmm10 ; movdqa %xmm1,%xmm11 ; pxor %xmm2,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm1 ; pxor %xmm11,%xmm2 ; movdqa %xmm3,%xmm11 ; pxor %xmm4,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm3 ; pxor %xmm11,%xmm4 ; movdqa %xmm5,%xmm11 ; pxor %xmm6,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm5 ; pxor %xmm11,%xmm6 ; movdqa %xmm7,%xmm11 ; pxor %xmm8,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm7 ; pxor %xmm11,%xmm8 ; movdqa %xmm9,%xmm11 ; pxor %xmm10,%xmm11 ; pand %xmm0,%xmm11 ; pxor %xmm11,%xmm9 ; pxor %xmm11,%xmm10 ; movdqa %xmm1,(%rdi) ; movdqa %xmm2,80(%rdi) ; movdqa %xmm3,16(%rdi) ; movdqa %xmm4,96(%rdi) ; movdqa %xmm5,32(%rdi) ; movdqa %xmm6,112(%rdi) ; movdqa %xmm7,48(%rdi) ; movdqa %xmm8,128(%rdi) ; movdqa %xmm9,64(%rdi) ; movdqa %xmm10,144(%rdi)
- # next bit of the current qword, other bank
- shl $1,%r15
- xor $160,%r12
- # inner counter lives in ebx; the upper half of rbx still holds the word offset
- dec %rbx
- test %ebx,%ebx
- jnz cmult_loop_inner
- # recover the word offset; zero means the qword at offset 0 has been consumed
- shr $32,%rbx
- test %rbx,%rbx
- jnz cmult_loop_outer
- # drop the alignment pad, then restore the registers pushed before the loop
- add $8,%rsp
- pop %rsi
- pop %rdi
- pop %r15
- pop %r12
- pop %rbx
- # rsp is back at the start of the working area; the result is nQ (x) at offset
- # 80 and nQ (z) at offset 120 of the non-prime bank
- lea 80(%rsp),%r8
- movq (%r8),%rax
- movq %rax,(%rdi)
- movq 8(%r8),%rax
- movq %rax,8(%rdi)
- movq 16(%r8),%rax
- movq %rax,16(%rdi)
- movq 24(%r8),%rax
- movq %rax,24(%rdi)
- movq 32(%r8),%rax
- movq %rax,32(%rdi)
- movq 40(%r8),%rax
- movq %rax,(%rsi)
- movq 48(%r8),%rax
- movq %rax,8(%rsi)
- movq 56(%r8),%rax
- movq %rax,16(%rsi)
- movq 64(%r8),%rax
- movq %rax,24(%rsi)
- movq 72(%r8),%rax
- movq %rax,32(%rsi)
- # undo the alignment and the working area in one step
- mov %rbp,%rsp
- pop %r14
- pop %r13
- pop %rbp
- ret
|