#!/usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for x86_64.
#
# February 2018.
#
# This module implements radix 2^51 multiplication and squaring, and
# radix 2^64 multiplication, squaring, addition, subtraction and final
# reduction. The latter radix is used on ADCX/ADOX-capable processors
# such as Broadwell. On a related note, there are vector implementations
# that provide significantly better performance on some processors(*),
# but they are large and overly complex, which, in combination with
# being effectively processor-specific, makes the undertaking hard to
# justify. The goal for this implementation is rather versatility and
# simplicity [and ultimately formal verification].
#
# (*) For example, sandy2x should provide ~30% improvement on Sandy
#     Bridge, but only a nominal ~5% on Haswell [and a big loss on
#     Broadwell and successors].
#
######################################################################
# Improvement coefficients:
#
#			amd64-51(*)	gcc-5.x(**)
#
# P4			+22%		+40%
# Sandy Bridge		-3%		+11%
# Haswell		-1%		+13%
# Broadwell(***)	+30%		+35%
# Skylake(***)		+33%		+47%
# Silvermont		+20%		+26%
# Goldmont		+40%		+50%
# Bulldozer		+20%		+9%
# Ryzen(***)		+43%		+40%
# VIA			+170%		+120%
#
# (*)	amd64-51 is a popular assembly implementation with 2^51 radix;
#	only the multiplication and squaring subroutines were linked in
#	for comparison, not the complete ladder step. The gain on most
#	processors comes from this module refraining from shld; the
#	minor regression on others is because that results in a higher
#	instruction count.
# (**)	The compiler is free to inline functions; in assembly one
#	would need to implement the ladder step to do that, which would
#	improve performance by several percent.
# (***)	ADCX/ADOX result for the 2^64 radix; there is no corresponding
#	C implementation, so that comparison is always against the
#	2^51 radix.
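#
# For reference: in the radix 2^51 code a field element f is held as
# five 64-bit limbs f[0..4], f = f[0] + f[1]*2^51 + ... + f[4]*2^204,
# so one 64x64->128-bit mulq per limb pair suffices. Since 2^255 = 19
# modulo p = 2^255-19, a carry out of the top limb wraps around to the
# bottom multiplied by 19, which is what the *19 terms and .Lreduce51
# below implement.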

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
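
# $addx gates the base 2^64 code path below: it is emitted only when the
# target assembler is known to understand the ADCX/ADOX/MULX mnemonics
# (GNU as >= 2.23, nasm >= 2.10, MASM/ml64 >= 12, clang/LLVM >= 3.3).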

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:
	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
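	# (the two lea above compute g[4]*19 as g[4] + 2*(9*g[4]),
	# i.e. without spending another multiplication)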
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15
	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15
	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul

.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:
	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

.align	32
.Lreduce51:
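	# the five 128-bit partial results h0..h4 arrive in register pairs
	# (%rbx:%rcx, %r8:%r9, %r10:%r11, %r12:%r13, %r14:%r15); each is
	# split at bit 51, carries are propagated one limb up, and the
	# carry out of h4 wraps around into h0 multiplied by 19
	# (2^255 = 19 mod 2^255-19)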
	mov	\$0x7ffffffffffff,%rbp
	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14
	lea	(%r15,%r14,2),%r15
	add	%r15,%rax		# g0 += (h4>>51)*19
	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr

.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
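	# 121666 = (486662+2)/4 is the curve25519 ladder constant; it
	# fits in 32 bits, so one mulq per limb with %rax = 121666 does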
	mov	\$121666,%eax
	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
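# Working modulo 2*(2^255-19) = 2^256-38 lets intermediate values occupy
# the full 256 bits: a carry out of bit 255 is simply folded back in as
# 38 (since 2^256 = 38 mod 2^255-19), and only x25519_fe64_tobytes
# performs the final reduction to [0, 2^255-19).
#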
if ($addx) {
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
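	# the third dword of OPENSSL_ia32cap_P holds CPUID(7).EBX;
	# 0x80100 selects bit 19 (ADX) and bit 8 (BMI2, i.e. mulx)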
	xor	%eax,%eax
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:
	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
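	# adcx and adox use CF and OF as two independent carry chains,
	# so additions into adjacent columns interleave without flag
	# clashes; %rdi stays zero throughout to flush trailing carries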
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64

.align	32
.Lreduce64:
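	# %rdx = 38 on entry; fold the upper four words (acc4..acc7,
	# i.e. %r12..%r15) back into the lower ones, using
	# 2^256 = 38 mod 2^255-19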
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax
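	# sbb of a register from itself yields 0 or -1 from CF, so %rax
	# is now 0 or 38: the final carry folded back in one more time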
	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
.cfi_adjust_cfa_offset	-72
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax
	add	%rax,$acc0

	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
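	# add as 256-bit integers, then fold the carry-out back in as 38
	# (2^256 = 38 mod 2^255-19); the second sbb/and pair below
	# catches the rare carry out of that first fold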
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)

	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax
	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)

	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax
	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
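	# first add 19 (and fold bit 255 in as another 19): bit 255 of
	# the sum is then set iff the input was >= 2^255-19; the second
	# half either keeps the +19 and clears bit 255 (subtracting the
	# modulus) or subtracts the 19 back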
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2
	ret
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
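
# short_handler serves the fe64 leaf functions, which build no stack
# frame: it only rewinds context->Rsp when the fault lies past the
# prologue. full_handler additionally restores the six non-volatile
# registers, locating them via the frame size in HandlerData[2].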

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___ if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";