- #! /usr/bin/env perl
- # Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- # March 2015
- #
- # "Teaser" Montgomery multiplication module for ARMv8. Needs more
- # work. While it does improve RSA sign performance by 20-30% (less for
- # longer keys) on most processors, for some reason RSA2048 is not
- # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
- # instruction issue rate is limited on the processor in question,
- # meaning that a dedicated squaring procedure is a must. In fact, all
- # contemporary AArch64 processors seem to have a limited multiplication
- # issue rate, i.e. they can't issue a multiplication every cycle, which
- # explains the moderate improvement coefficients in comparison to
- # compiler-generated code. Recall that the compiler is instructed to use
- # umulh and therefore uses the same number of multiplication
- # instructions to do the job. Assembly's edge lies in minimizing the
- # number of "collateral" instructions and, of course, in instruction
- # scheduling.
- #
- # April 2015
- #
- # A squaring procedure that handles lengths divisible by 8 improves
- # RSA/DSA performance by 25-40-60% depending on processor and key
- # length. Overall improvement coefficients are always positive in
- # comparison to compiler-generated code. On Cortex-A57 the improvement
- # is still modest for the longest key lengths, while other processors
- # exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
- # ~25% faster on Cortex-A57 and ~60-100% faster on others.
- # $output is the last argument if it looks like a file (it has an extension)
- # $flavour is the first argument if it doesn't look like a file
- my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
- my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
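- # Typical invocation (the flavour name is illustrative; any flavour
- # known to arm-xlate.pl works the same way):
- #     perl armv8-mont.pl linux64 armv8-mont.S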
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
- open OUT,"| \"$^X\" $xlate $flavour \"$output\""
- or die "can't call $xlate: $!";
- *STDOUT=*OUT;
- ($lo0,$hi0,$aj,$m0,$alo,$ahi,
- $lo1,$hi1,$nj,$m1,$nlo,$nhi,
- $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
- # int bn_mul_mont(
- $rp="x0"; # BN_ULONG *rp,
- $ap="x1"; # const BN_ULONG *ap,
- $bp="x2"; # const BN_ULONG *bp,
- $np="x3"; # const BN_ULONG *np,
- $n0="x4"; # const BN_ULONG *n0,
- $num="x5"; # int num);
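- #
- # Informational only (not used by the code path): with R = 2^(64*num)
- # and *n0 == -np[0]^-1 mod 2^64, as precomputed in BN_MONT_CTX, the
- # routine computes the Montgomery product
- #
- #     rp[] = ap[] * bp[] * R^-1 mod np[]
- #
- # and returns 1.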
- $code.=<<___;
- #include "arm_arch.h"
- #ifndef __KERNEL__
- .extern OPENSSL_armv8_rsa_neonized
- .hidden OPENSSL_armv8_rsa_neonized
- #endif
- .text
- .globl bn_mul_mont
- .type bn_mul_mont,%function
- .align 5
- bn_mul_mont:
- AARCH64_SIGN_LINK_REGISTER
- .Lbn_mul_mont:
- tst $num,#3
- b.ne .Lmul_mont
- cmp $num,#32
- b.le .Lscalar_impl
- #ifndef __KERNEL__
- adrp x17,OPENSSL_armv8_rsa_neonized
- ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
- cbnz w17, bn_mul8x_mont_neon
- #endif
- .Lscalar_impl:
- tst $num,#7
- b.eq __bn_sqr8x_mont
- tst $num,#3
- b.eq __bn_mul4x_mont
- .Lmul_mont:
- stp x29,x30,[sp,#-64]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- ldr $m0,[$bp],#8 // bp[0]
- sub $tp,sp,$num,lsl#3
- ldp $hi0,$aj,[$ap],#16 // ap[0..1]
- lsl $num,$num,#3
- ldr $n0,[$n0] // *n0
- and $tp,$tp,#-16 // ABI says so
- ldp $hi1,$nj,[$np],#16 // np[0..1]
- mul $lo0,$hi0,$m0 // ap[0]*bp[0]
- sub $j,$num,#16 // j=num-2
- umulh $hi0,$hi0,$m0
- mul $alo,$aj,$m0 // ap[1]*bp[0]
- umulh $ahi,$aj,$m0
- mul $m1,$lo0,$n0 // "tp[0]"*n0
- mov sp,$tp // alloca
- // (*) mul $lo1,$hi1,$m1 // np[0]*m1
- umulh $hi1,$hi1,$m1
- mul $nlo,$nj,$m1 // np[1]*m1
- // (*) adds $lo1,$lo1,$lo0 // discarded
- // (*) On the removal of the first multiplication and addition
- // instructions: the outcome of the first addition is
- // guaranteed to be zero, which leaves two computationally
- // significant outcomes: it either carries or it doesn't. The
- // question then is: when does it carry? Is there an alternative
- // way to deduce it? If you follow the operations, you can
- // observe that the condition for carry is quite simple:
- // $lo0 being non-zero. So the carry can be calculated
- // by adding -1 to $lo0. That's what the next instruction does.
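- // Why the low 64 bits are guaranteed to be zero: $m1 was computed
- // as lo($lo0*n0) with n0 == -np[0]^-1 mod 2^64, hence
- // lo(np[0]*$m1) == -$lo0 mod 2^64, and the discarded sum is either
- // 0 (when $lo0 is zero, no carry) or exactly 2^64 (carry).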
- subs xzr,$lo0,#1 // (*)
- umulh $nhi,$nj,$m1
- adc $hi1,$hi1,xzr
- cbz $j,.L1st_skip
- .L1st:
- ldr $aj,[$ap],#8
- adds $lo0,$alo,$hi0
- sub $j,$j,#8 // j--
- adc $hi0,$ahi,xzr
- ldr $nj,[$np],#8
- adds $lo1,$nlo,$hi1
- mul $alo,$aj,$m0 // ap[j]*bp[0]
- adc $hi1,$nhi,xzr
- umulh $ahi,$aj,$m0
- adds $lo1,$lo1,$lo0
- mul $nlo,$nj,$m1 // np[j]*m1
- adc $hi1,$hi1,xzr
- umulh $nhi,$nj,$m1
- str $lo1,[$tp],#8 // tp[j-1]
- cbnz $j,.L1st
- .L1st_skip:
- adds $lo0,$alo,$hi0
- sub $ap,$ap,$num // rewind $ap
- adc $hi0,$ahi,xzr
- adds $lo1,$nlo,$hi1
- sub $np,$np,$num // rewind $np
- adc $hi1,$nhi,xzr
- adds $lo1,$lo1,$lo0
- sub $i,$num,#8 // i=num-1
- adcs $hi1,$hi1,$hi0
- adc $ovf,xzr,xzr // upmost overflow bit
- stp $lo1,$hi1,[$tp]
- .Louter:
- ldr $m0,[$bp],#8 // bp[i]
- ldp $hi0,$aj,[$ap],#16
- ldr $tj,[sp] // tp[0]
- add $tp,sp,#8
- mul $lo0,$hi0,$m0 // ap[0]*bp[i]
- sub $j,$num,#16 // j=num-2
- umulh $hi0,$hi0,$m0
- ldp $hi1,$nj,[$np],#16
- mul $alo,$aj,$m0 // ap[1]*bp[i]
- adds $lo0,$lo0,$tj
- umulh $ahi,$aj,$m0
- adc $hi0,$hi0,xzr
- mul $m1,$lo0,$n0
- sub $i,$i,#8 // i--
- // (*) mul $lo1,$hi1,$m1 // np[0]*m1
- umulh $hi1,$hi1,$m1
- mul $nlo,$nj,$m1 // np[1]*m1
- // (*) adds $lo1,$lo1,$lo0
- subs xzr,$lo0,#1 // (*)
- umulh $nhi,$nj,$m1
- cbz $j,.Linner_skip
- .Linner:
- ldr $aj,[$ap],#8
- adc $hi1,$hi1,xzr
- ldr $tj,[$tp],#8 // tp[j]
- adds $lo0,$alo,$hi0
- sub $j,$j,#8 // j--
- adc $hi0,$ahi,xzr
- adds $lo1,$nlo,$hi1
- ldr $nj,[$np],#8
- adc $hi1,$nhi,xzr
- mul $alo,$aj,$m0 // ap[j]*bp[i]
- adds $lo0,$lo0,$tj
- umulh $ahi,$aj,$m0
- adc $hi0,$hi0,xzr
- mul $nlo,$nj,$m1 // np[j]*m1
- adds $lo1,$lo1,$lo0
- umulh $nhi,$nj,$m1
- stur $lo1,[$tp,#-16] // tp[j-1]
- cbnz $j,.Linner
- .Linner_skip:
- ldr $tj,[$tp],#8 // tp[j]
- adc $hi1,$hi1,xzr
- adds $lo0,$alo,$hi0
- sub $ap,$ap,$num // rewind $ap
- adc $hi0,$ahi,xzr
- adds $lo1,$nlo,$hi1
- sub $np,$np,$num // rewind $np
- adcs $hi1,$nhi,$ovf
- adc $ovf,xzr,xzr
- adds $lo0,$lo0,$tj
- adc $hi0,$hi0,xzr
- adds $lo1,$lo1,$lo0
- adcs $hi1,$hi1,$hi0
- adc $ovf,$ovf,xzr // upmost overflow bit
- stp $lo1,$hi1,[$tp,#-16]
- cbnz $i,.Louter
- // Final step. We check whether the result is larger than the
- // modulus, and if it is, subtract the modulus. But comparison
- // implies subtraction, so we subtract the modulus, see if the
- // subtraction borrowed, and conditionally copy the original value.
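- // The same step in C-like pseudocode (illustrative only, names do
- // not correspond to the actual registers):
- //
- //     borrow = sub_words(r, t, n, num);  // r[] = t[] - n[], sbcs chain
- //     borrow = (ovf - borrow) < 0;       // fold in upmost overflow bit
- //     for (j = 0; j < num; j++)
- //         r[j] = borrow ? t[j] : r[j];   // csel, no data-dependent branch
- //     memset(t, 0, num*8);               // wipe the on-stack copy of t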
- ldr $tj,[sp] // tp[0]
- add $tp,sp,#8
- ldr $nj,[$np],#8 // np[0]
- subs $j,$num,#8 // j=num-1 and clear borrow
- mov $ap,$rp
- .Lsub:
- sbcs $aj,$tj,$nj // tp[j]-np[j]
- ldr $tj,[$tp],#8
- sub $j,$j,#8 // j--
- ldr $nj,[$np],#8
- str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
- cbnz $j,.Lsub
- sbcs $aj,$tj,$nj
- sbcs $ovf,$ovf,xzr // did it borrow?
- str $aj,[$ap],#8 // rp[num-1]
- ldr $tj,[sp] // tp[0]
- add $tp,sp,#8
- ldr $aj,[$rp],#8 // rp[0]
- sub $num,$num,#8 // num--
- nop
- .Lcond_copy:
- sub $num,$num,#8 // num--
- csel $nj,$tj,$aj,lo // did it borrow?
- ldr $tj,[$tp],#8
- ldr $aj,[$rp],#8
- stur xzr,[$tp,#-16] // wipe tp
- stur $nj,[$rp,#-16]
- cbnz $num,.Lcond_copy
- csel $nj,$tj,$aj,lo
- stur xzr,[$tp,#-8] // wipe tp
- stur $nj,[$rp,#-8]
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldr x29,[sp],#64
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size bn_mul_mont,.-bn_mul_mont
- ___
- {
- my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
- my ($Z,$Temp)=("v4.16b","v5");
- my @ACC=map("v$_",(6..13));
- my ($Bi,$Ni,$M0)=map("v$_",(28..30));
- my $sBi="s28";
- my $sM0="s30";
- my $zero="v14";
- my $temp="v15";
- my $ACCTemp="v16";
- my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
- my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
- $code.=<<___;
- .type bn_mul8x_mont_neon,%function
- .align 5
- bn_mul8x_mont_neon:
- // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
- // only from bn_mul_mont which has already signed the return address.
- stp x29,x30,[sp,#-80]!
- mov x16,sp
- stp d8,d9,[sp,#16]
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
- lsl $num,$num,#1
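- // The NEON path works on 32-bit half-words ("smashed" limbs), hence
- // $num is doubled here and b[], n0 are consumed 32 bits at a time.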
- eor $zero.16b,$zero.16b,$zero.16b
- .align 4
- .LNEON_8n:
- eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
- sub $toutptr,sp,#128
- eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
- sub $toutptr,$toutptr,$num,lsl#4
- eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
- and $toutptr,$toutptr,#-64
- eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
- mov sp,$toutptr // alloca
- eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
- add $toutptr,$toutptr,#256
- eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
- sub $inner,$num,#8
- eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
- eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
- .LNEON_8n_init:
- st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
- subs $inner,$inner,#8
- st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
- st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
- st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
- bne .LNEON_8n_init
- add $tinptr,sp,#256
- ld1 {$A0.4s,$A1.4s},[$aptr],#32
- add $bnptr,sp,#8
- ldr $sM0,[$n0],#4
- mov $outer,$num
- b .LNEON_8n_outer
- .align 4
- .LNEON_8n_outer:
- ldr $sBi,[$bptr],#4 // *b++
- uxtl $Bi.4s,$Bi.4h
- add $toutptr,sp,#128
- ld1 {$N0.4s,$N1.4s},[$nptr],#32
- umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
- umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
- umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
- shl $Ni.2d,@ACC[0].2d,#16
- ext $Ni.16b,$Ni.16b,$Ni.16b,#8
- umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
- add $Ni.2d,$Ni.2d,@ACC[0].2d
- umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
- mul $Ni.2s,$Ni.2s,$M0.2s
- umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
- st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
- umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
- uxtl $Ni.4s,$Ni.4h
- umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
- ___
- for ($i=0; $i<7;) {
- $code.=<<___;
- ldr $sBi,[$bptr],#4 // *b++
- umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
- umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
- uxtl $Bi.4s,$Bi.4h
- umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
- ushr $temp.2d,@ACC[0].2d,#16
- umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
- umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
- ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
- add @ACC[0].2d,@ACC[0].2d,$temp.2d
- umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
- ushr @ACC[0].2d,@ACC[0].2d,#16
- umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
- umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
- add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
- ins @ACC[1].d[0],$ACCTemp.d[0]
- st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
- ___
- push(@ACC,shift(@ACC)); $i++;
- $code.=<<___;
- umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
- ld1 {@ACC[7].2d},[$tinptr],#16
- umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
- umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
- shl $Ni.2d,@ACC[0].2d,#16
- ext $Ni.16b,$Ni.16b,$Ni.16b,#8
- umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
- add $Ni.2d,$Ni.2d,@ACC[0].2d
- umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
- mul $Ni.2s,$Ni.2s,$M0.2s
- umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
- st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
- umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
- uxtl $Ni.4s,$Ni.4h
- umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
- ___
- }
- $code.=<<___;
- ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
- umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
- ld1 {$A0.4s,$A1.4s},[$aptr],#32
- umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
- umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
- mov $Temp.16b,@ACC[0].16b
- ushr $Temp.2d,$Temp.2d,#16
- ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
- umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
- umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
- add @ACC[0].2d,@ACC[0].2d,$Temp.2d
- umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
- ushr @ACC[0].2d,@ACC[0].2d,#16
- eor $temp.16b,$temp.16b,$temp.16b
- ins @ACC[0].d[1],$temp.d[0]
- umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
- umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
- add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
- st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
- add $bnptr,sp,#8 // rewind
- ___
- push(@ACC,shift(@ACC));
- $code.=<<___;
- sub $inner,$num,#8
- b .LNEON_8n_inner
- .align 4
- .LNEON_8n_inner:
- subs $inner,$inner,#8
- umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
- ld1 {@ACC[7].2d},[$tinptr]
- umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
- ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
- umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
- ld1 {$N0.4s,$N1.4s},[$nptr],#32
- umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
- b.eq .LInner_jump
- add $tinptr,$tinptr,#16 // don't advance in last iteration
- .LInner_jump:
- umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
- umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
- umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
- umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
- ___
- for ($i=1; $i<8; $i++) {
- $code.=<<___;
- ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
- umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
- umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
- umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
- umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
- umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
- umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
- umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
- umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
- st1 {@ACC[0].2d},[$toutptr],#16
- ___
- push(@ACC,shift(@ACC));
- $code.=<<___;
- umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
- ld1 {@ACC[7].2d},[$tinptr]
- umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
- ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
- umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
- b.eq .LInner_jump$i
- add $tinptr,$tinptr,#16 // don't advance in last iteration
- .LInner_jump$i:
- umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
- umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
- umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
- umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
- umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
- ___
- }
- $code.=<<___;
- b.ne .LInner_after_rewind$i
- sub $aptr,$aptr,$num,lsl#2 // rewind
- .LInner_after_rewind$i:
- umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
- ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
- umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
- ld1 {$A0.4s,$A1.4s},[$aptr],#32
- umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
- add $bnptr,sp,#8 // rewind
- umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
- umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
- umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
- umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
- st1 {@ACC[0].2d},[$toutptr],#16
- umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
- bne .LNEON_8n_inner
- ___
- push(@ACC,shift(@ACC));
- $code.=<<___;
- add $tinptr,sp,#128
- st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
- eor $N0.16b,$N0.16b,$N0.16b // $N0
- st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
- eor $N1.16b,$N1.16b,$N1.16b // $N1
- st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
- st1 {@ACC[6].2d},[$toutptr]
- subs $outer,$outer,#8
- ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
- ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
- ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
- ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
- b.eq .LInner_8n_jump_2steps
- sub $nptr,$nptr,$num,lsl#2 // rewind
- b .LNEON_8n_outer
- .LInner_8n_jump_2steps:
- add $toutptr,sp,#128
- st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
- mov $Temp.16b,@ACC[0].16b
- ushr $temp.2d,@ACC[0].2d,#16
- ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
- st1 {$N0.2d,$N1.2d}, [sp],#32
- add @ACC[0].2d,@ACC[0].2d,$temp.2d
- st1 {$N0.2d,$N1.2d}, [sp],#32
- ushr $temp.2d,@ACC[0].2d,#16
- st1 {$N0.2d,$N1.2d}, [sp],#32
- zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
- ins $temp.d[1],$zero.d[0]
- mov $inner,$num
- b .LNEON_tail_entry
- .align 4
- .LNEON_tail:
- add @ACC[0].2d,@ACC[0].2d,$temp.2d
- mov $Temp.16b,@ACC[0].16b
- ushr $temp.2d,@ACC[0].2d,#16
- ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
- ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
- add @ACC[0].2d,@ACC[0].2d,$temp.2d
- ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
- ushr $temp.2d,@ACC[0].2d,#16
- ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
- zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
- ins $temp.d[1],$zero.d[0]
- .LNEON_tail_entry:
- ___
- for ($i=1; $i<8; $i++) {
- $code.=<<___;
- add @ACC[1].2d,@ACC[1].2d,$temp.2d
- st1 {@ACC[0].s}[0], [$toutptr],#4
- ushr $temp.2d,@ACC[1].2d,#16
- mov $Temp.16b,@ACC[1].16b
- ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
- add @ACC[1].2d,@ACC[1].2d,$temp.2d
- ushr $temp.2d,@ACC[1].2d,#16
- zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
- ins $temp.d[1],$zero.d[0]
- ___
- push(@ACC,shift(@ACC));
- }
- push(@ACC,shift(@ACC));
- $code.=<<___;
- ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
- subs $inner,$inner,#8
- st1 {@ACC[7].s}[0], [$toutptr],#4
- bne .LNEON_tail
- st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
- sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
- subs $aptr,sp,#0 // clear carry flag
- add $bptr,sp,$num,lsl#2
- .LNEON_sub:
- ldp w4,w5,[$aptr],#8
- ldp w6,w7,[$aptr],#8
- ldp w8,w9,[$nptr],#8
- ldp w10,w11,[$nptr],#8
- sbcs w8,w4,w8
- sbcs w9,w5,w9
- sbcs w10,w6,w10
- sbcs w11,w7,w11
- sub x17,$bptr,$aptr
- stp w8,w9,[$rptr],#8
- stp w10,w11,[$rptr],#8
- cbnz x17,.LNEON_sub
- ldr w10, [$aptr] // load top-most bit
- mov x11,sp
- eor v0.16b,v0.16b,v0.16b
- sub x11,$bptr,x11 // this is num*4
- eor v1.16b,v1.16b,v1.16b
- mov $aptr,sp
- sub $rptr,$rptr,x11 // rewind $rptr
- mov $nptr,$bptr // second 3/4th of frame
- sbcs w10,w10,wzr // result is carry flag
- .LNEON_copy_n_zap:
- ldp w4,w5,[$aptr],#8
- ldp w6,w7,[$aptr],#8
- ldp w8,w9,[$rptr],#8
- ldp w10,w11,[$rptr]
- sub $rptr,$rptr,#8
- b.cs .LCopy_1
- mov w8,w4
- mov w9,w5
- mov w10,w6
- mov w11,w7
- .LCopy_1:
- st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
- st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
- ldp w4,w5,[$aptr],#8
- ldp w6,w7,[$aptr],#8
- stp w8,w9,[$rptr],#8
- stp w10,w11,[$rptr],#8
- sub $aptr,$aptr,#32
- ldp w8,w9,[$rptr],#8
- ldp w10,w11,[$rptr]
- sub $rptr,$rptr,#8
- b.cs .LCopy_2
- mov w8, w4
- mov w9, w5
- mov w10, w6
- mov w11, w7
- .LCopy_2:
- st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
- st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
- sub x17,$bptr,$aptr // preserves carry
- stp w8,w9,[$rptr],#8
- stp w10,w11,[$rptr],#8
- cbnz x17,.LNEON_copy_n_zap
- mov sp,x16
- ldp d14,d15,[sp,#64]
- ldp d12,d13,[sp,#48]
- ldp d10,d11,[sp,#32]
- ldp d8,d9,[sp,#16]
- ldr x29,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret // bx lr
- .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
- ___
- }
- {
- ########################################################################
- # The following is an ARMv8 adaptation of sqrx8x_mont from the
- # x86_64-mont5 module.
- my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
- my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
- my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
- my ($cnt,$carry,$topmost)=("x27","x28","x30");
- my ($tp,$ap_end,$na0)=($bp,$np,$carry);
- $code.=<<___;
- .type __bn_sqr8x_mont,%function
- .align 5
- __bn_sqr8x_mont:
- cmp $ap,$bp
- b.ne __bn_mul4x_mont
- .Lsqr8x_mont:
- // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
- // only from bn_mul_mont which has already signed the return address.
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- stp $rp,$np,[sp,#96] // offload rp and np
- ldp $a0,$a1,[$ap,#8*0]
- ldp $a2,$a3,[$ap,#8*2]
- ldp $a4,$a5,[$ap,#8*4]
- ldp $a6,$a7,[$ap,#8*6]
- sub $tp,sp,$num,lsl#4
- lsl $num,$num,#3
- ldr $n0,[$n0] // *n0
- mov sp,$tp // alloca
- sub $cnt,$num,#8*8
- b .Lsqr8x_zero_start
- .Lsqr8x_zero:
- sub $cnt,$cnt,#8*8
- stp xzr,xzr,[$tp,#8*0]
- stp xzr,xzr,[$tp,#8*2]
- stp xzr,xzr,[$tp,#8*4]
- stp xzr,xzr,[$tp,#8*6]
- .Lsqr8x_zero_start:
- stp xzr,xzr,[$tp,#8*8]
- stp xzr,xzr,[$tp,#8*10]
- stp xzr,xzr,[$tp,#8*12]
- stp xzr,xzr,[$tp,#8*14]
- add $tp,$tp,#8*16
- cbnz $cnt,.Lsqr8x_zero
- add $ap_end,$ap,$num
- add $ap,$ap,#8*8
- mov $acc0,xzr
- mov $acc1,xzr
- mov $acc2,xzr
- mov $acc3,xzr
- mov $acc4,xzr
- mov $acc5,xzr
- mov $acc6,xzr
- mov $acc7,xzr
- mov $tp,sp
- str $n0,[x29,#112] // offload n0
- // Multiply everything but a[i]*a[i]
- .align 4
- .Lsqr8x_outer_loop:
- // a[1]a[0] (i)
- // a[2]a[0]
- // a[3]a[0]
- // a[4]a[0]
- // a[5]a[0]
- // a[6]a[0]
- // a[7]a[0]
- // a[2]a[1] (ii)
- // a[3]a[1]
- // a[4]a[1]
- // a[5]a[1]
- // a[6]a[1]
- // a[7]a[1]
- // a[3]a[2] (iii)
- // a[4]a[2]
- // a[5]a[2]
- // a[6]a[2]
- // a[7]a[2]
- // a[4]a[3] (iv)
- // a[5]a[3]
- // a[6]a[3]
- // a[7]a[3]
- // a[5]a[4] (v)
- // a[6]a[4]
- // a[7]a[4]
- // a[6]a[5] (vi)
- // a[7]a[5]
- // a[7]a[6] (vii)
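- // (that is 7+6+5+4+3+2+1 = 28 cross products a[i]*a[j], i>j, for this
- //  eight-word block; they are doubled and combined with the a[i]*a[i]
- //  squares at .Lsqr8x_outer_break below)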
- mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
- mul $t1,$a2,$a0
- mul $t2,$a3,$a0
- mul $t3,$a4,$a0
- adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
- mul $t0,$a5,$a0
- adcs $acc2,$acc2,$t1
- mul $t1,$a6,$a0
- adcs $acc3,$acc3,$t2
- mul $t2,$a7,$a0
- adcs $acc4,$acc4,$t3
- umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
- adcs $acc5,$acc5,$t0
- umulh $t0,$a2,$a0
- adcs $acc6,$acc6,$t1
- umulh $t1,$a3,$a0
- adcs $acc7,$acc7,$t2
- umulh $t2,$a4,$a0
- stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
- adc $acc0,xzr,xzr // t[8]
- adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
- umulh $t3,$a5,$a0
- adcs $acc3,$acc3,$t0
- umulh $t0,$a6,$a0
- adcs $acc4,$acc4,$t1
- umulh $t1,$a7,$a0
- adcs $acc5,$acc5,$t2
- mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
- adcs $acc6,$acc6,$t3
- mul $t3,$a3,$a1
- adcs $acc7,$acc7,$t0
- mul $t0,$a4,$a1
- adc $acc0,$acc0,$t1
- mul $t1,$a5,$a1
- adds $acc3,$acc3,$t2
- mul $t2,$a6,$a1
- adcs $acc4,$acc4,$t3
- mul $t3,$a7,$a1
- adcs $acc5,$acc5,$t0
- umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
- adcs $acc6,$acc6,$t1
- umulh $t1,$a3,$a1
- adcs $acc7,$acc7,$t2
- umulh $t2,$a4,$a1
- adcs $acc0,$acc0,$t3
- umulh $t3,$a5,$a1
- stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
- adc $acc1,xzr,xzr // t[9]
- adds $acc4,$acc4,$t0
- umulh $t0,$a6,$a1
- adcs $acc5,$acc5,$t1
- umulh $t1,$a7,$a1
- adcs $acc6,$acc6,$t2
- mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
- adcs $acc7,$acc7,$t3
- mul $t3,$a4,$a2
- adcs $acc0,$acc0,$t0
- mul $t0,$a5,$a2
- adc $acc1,$acc1,$t1
- mul $t1,$a6,$a2
- adds $acc5,$acc5,$t2
- mul $t2,$a7,$a2
- adcs $acc6,$acc6,$t3
- umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
- adcs $acc7,$acc7,$t0
- umulh $t0,$a4,$a2
- adcs $acc0,$acc0,$t1
- umulh $t1,$a5,$a2
- adcs $acc1,$acc1,$t2
- umulh $t2,$a6,$a2
- stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
- adc $acc2,xzr,xzr // t[10]
- adds $acc6,$acc6,$t3
- umulh $t3,$a7,$a2
- adcs $acc7,$acc7,$t0
- mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
- adcs $acc0,$acc0,$t1
- mul $t1,$a5,$a3
- adcs $acc1,$acc1,$t2
- mul $t2,$a6,$a3
- adc $acc2,$acc2,$t3
- mul $t3,$a7,$a3
- adds $acc7,$acc7,$t0
- umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
- adcs $acc0,$acc0,$t1
- umulh $t1,$a5,$a3
- adcs $acc1,$acc1,$t2
- umulh $t2,$a6,$a3
- adcs $acc2,$acc2,$t3
- umulh $t3,$a7,$a3
- stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
- adc $acc3,xzr,xzr // t[11]
- adds $acc0,$acc0,$t0
- mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
- adcs $acc1,$acc1,$t1
- mul $t1,$a6,$a4
- adcs $acc2,$acc2,$t2
- mul $t2,$a7,$a4
- adc $acc3,$acc3,$t3
- umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
- adds $acc1,$acc1,$t0
- umulh $t0,$a6,$a4
- adcs $acc2,$acc2,$t1
- umulh $t1,$a7,$a4
- adcs $acc3,$acc3,$t2
- mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
- adc $acc4,xzr,xzr // t[12]
- adds $acc2,$acc2,$t3
- mul $t3,$a7,$a5
- adcs $acc3,$acc3,$t0
- umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
- adc $acc4,$acc4,$t1
- umulh $t1,$a7,$a5
- adds $acc3,$acc3,$t2
- mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
- adcs $acc4,$acc4,$t3
- umulh $t3,$a7,$a6 // hi(a[7]*a[6])
- adc $acc5,xzr,xzr // t[13]
- adds $acc4,$acc4,$t0
- sub $cnt,$ap_end,$ap // done yet?
- adc $acc5,$acc5,$t1
- adds $acc5,$acc5,$t2
- sub $t0,$ap_end,$num // rewound ap
- adc $acc6,xzr,xzr // t[14]
- add $acc6,$acc6,$t3
- cbz $cnt,.Lsqr8x_outer_break
- mov $n0,$a0
- ldp $a0,$a1,[$tp,#8*0]
- ldp $a2,$a3,[$tp,#8*2]
- ldp $a4,$a5,[$tp,#8*4]
- ldp $a6,$a7,[$tp,#8*6]
- adds $acc0,$acc0,$a0
- adcs $acc1,$acc1,$a1
- ldp $a0,$a1,[$ap,#8*0]
- adcs $acc2,$acc2,$a2
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$ap,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$ap,#8*4]
- adcs $acc6,$acc6,$a6
- mov $rp,$ap
- adcs $acc7,xzr,$a7
- ldp $a6,$a7,[$ap,#8*6]
- add $ap,$ap,#8*8
- //adc $carry,xzr,xzr // moved below
- mov $cnt,#-8*8
- // a[8]a[0]
- // a[9]a[0]
- // a[a]a[0]
- // a[b]a[0]
- // a[c]a[0]
- // a[d]a[0]
- // a[e]a[0]
- // a[f]a[0]
- // a[8]a[1]
- // a[f]a[1]........................
- // a[8]a[2]
- // a[f]a[2]........................
- // a[8]a[3]
- // a[f]a[3]........................
- // a[8]a[4]
- // a[f]a[4]........................
- // a[8]a[5]
- // a[f]a[5]........................
- // a[8]a[6]
- // a[f]a[6]........................
- // a[8]a[7]
- // a[f]a[7]........................
- .Lsqr8x_mul:
- mul $t0,$a0,$n0
- adc $carry,xzr,xzr // carry bit, modulo-scheduled
- mul $t1,$a1,$n0
- add $cnt,$cnt,#8
- mul $t2,$a2,$n0
- mul $t3,$a3,$n0
- adds $acc0,$acc0,$t0
- mul $t0,$a4,$n0
- adcs $acc1,$acc1,$t1
- mul $t1,$a5,$n0
- adcs $acc2,$acc2,$t2
- mul $t2,$a6,$n0
- adcs $acc3,$acc3,$t3
- mul $t3,$a7,$n0
- adcs $acc4,$acc4,$t0
- umulh $t0,$a0,$n0
- adcs $acc5,$acc5,$t1
- umulh $t1,$a1,$n0
- adcs $acc6,$acc6,$t2
- umulh $t2,$a2,$n0
- adcs $acc7,$acc7,$t3
- umulh $t3,$a3,$n0
- adc $carry,$carry,xzr
- str $acc0,[$tp],#8
- adds $acc0,$acc1,$t0
- umulh $t0,$a4,$n0
- adcs $acc1,$acc2,$t1
- umulh $t1,$a5,$n0
- adcs $acc2,$acc3,$t2
- umulh $t2,$a6,$n0
- adcs $acc3,$acc4,$t3
- umulh $t3,$a7,$n0
- ldr $n0,[$rp,$cnt]
- adcs $acc4,$acc5,$t0
- adcs $acc5,$acc6,$t1
- adcs $acc6,$acc7,$t2
- adcs $acc7,$carry,$t3
- //adc $carry,xzr,xzr // moved above
- cbnz $cnt,.Lsqr8x_mul
- // note that carry flag is guaranteed
- // to be zero at this point
- cmp $ap,$ap_end // done yet?
- b.eq .Lsqr8x_break
- ldp $a0,$a1,[$tp,#8*0]
- ldp $a2,$a3,[$tp,#8*2]
- ldp $a4,$a5,[$tp,#8*4]
- ldp $a6,$a7,[$tp,#8*6]
- adds $acc0,$acc0,$a0
- ldur $n0,[$rp,#-8*8]
- adcs $acc1,$acc1,$a1
- ldp $a0,$a1,[$ap,#8*0]
- adcs $acc2,$acc2,$a2
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$ap,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$ap,#8*4]
- adcs $acc6,$acc6,$a6
- mov $cnt,#-8*8
- adcs $acc7,$acc7,$a7
- ldp $a6,$a7,[$ap,#8*6]
- add $ap,$ap,#8*8
- //adc $carry,xzr,xzr // moved above
- b .Lsqr8x_mul
- .align 4
- .Lsqr8x_break:
- ldp $a0,$a1,[$rp,#8*0]
- add $ap,$rp,#8*8
- ldp $a2,$a3,[$rp,#8*2]
- sub $t0,$ap_end,$ap // is it last iteration?
- ldp $a4,$a5,[$rp,#8*4]
- sub $t1,$tp,$t0
- ldp $a6,$a7,[$rp,#8*6]
- cbz $t0,.Lsqr8x_outer_loop
- stp $acc0,$acc1,[$tp,#8*0]
- ldp $acc0,$acc1,[$t1,#8*0]
- stp $acc2,$acc3,[$tp,#8*2]
- ldp $acc2,$acc3,[$t1,#8*2]
- stp $acc4,$acc5,[$tp,#8*4]
- ldp $acc4,$acc5,[$t1,#8*4]
- stp $acc6,$acc7,[$tp,#8*6]
- mov $tp,$t1
- ldp $acc6,$acc7,[$t1,#8*6]
- b .Lsqr8x_outer_loop
- .align 4
- .Lsqr8x_outer_break:
- // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
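- // (this exploits a^2 = 2*sum_{i>j} a[i]*a[j]*2^(64*(i+j))
- //  + sum_i a[i]^2*2^(128*i); the doubling is the extr-by-63/lsl#1
- //  shifting below, and the carry chain absorbs the freshly computed
- //  a[i]*a[i] terms)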
- ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
- ldp $t1,$t2,[sp,#8*1]
- ldp $a5,$a7,[$t0,#8*2]
- add $ap,$t0,#8*4
- ldp $t3,$t0,[sp,#8*3]
- stp $acc0,$acc1,[$tp,#8*0]
- mul $acc0,$a1,$a1
- stp $acc2,$acc3,[$tp,#8*2]
- umulh $a1,$a1,$a1
- stp $acc4,$acc5,[$tp,#8*4]
- mul $a2,$a3,$a3
- stp $acc6,$acc7,[$tp,#8*6]
- mov $tp,sp
- umulh $a3,$a3,$a3
- adds $acc1,$a1,$t1,lsl#1
- extr $t1,$t2,$t1,#63
- sub $cnt,$num,#8*4
- .Lsqr4x_shift_n_add:
- adcs $acc2,$a2,$t1
- extr $t2,$t3,$t2,#63
- sub $cnt,$cnt,#8*4
- adcs $acc3,$a3,$t2
- ldp $t1,$t2,[$tp,#8*5]
- mul $a4,$a5,$a5
- ldp $a1,$a3,[$ap],#8*2
- umulh $a5,$a5,$a5
- mul $a6,$a7,$a7
- umulh $a7,$a7,$a7
- extr $t3,$t0,$t3,#63
- stp $acc0,$acc1,[$tp,#8*0]
- adcs $acc4,$a4,$t3
- extr $t0,$t1,$t0,#63
- stp $acc2,$acc3,[$tp,#8*2]
- adcs $acc5,$a5,$t0
- ldp $t3,$t0,[$tp,#8*7]
- extr $t1,$t2,$t1,#63
- adcs $acc6,$a6,$t1
- extr $t2,$t3,$t2,#63
- adcs $acc7,$a7,$t2
- ldp $t1,$t2,[$tp,#8*9]
- mul $a0,$a1,$a1
- ldp $a5,$a7,[$ap],#8*2
- umulh $a1,$a1,$a1
- mul $a2,$a3,$a3
- umulh $a3,$a3,$a3
- stp $acc4,$acc5,[$tp,#8*4]
- extr $t3,$t0,$t3,#63
- stp $acc6,$acc7,[$tp,#8*6]
- add $tp,$tp,#8*8
- adcs $acc0,$a0,$t3
- extr $t0,$t1,$t0,#63
- adcs $acc1,$a1,$t0
- ldp $t3,$t0,[$tp,#8*3]
- extr $t1,$t2,$t1,#63
- cbnz $cnt,.Lsqr4x_shift_n_add
- ___
- my ($np,$np_end)=($ap,$ap_end);
- $code.=<<___;
- ldp $np,$n0,[x29,#104] // pull np and n0
- adcs $acc2,$a2,$t1
- extr $t2,$t3,$t2,#63
- adcs $acc3,$a3,$t2
- ldp $t1,$t2,[$tp,#8*5]
- mul $a4,$a5,$a5
- umulh $a5,$a5,$a5
- stp $acc0,$acc1,[$tp,#8*0]
- mul $a6,$a7,$a7
- umulh $a7,$a7,$a7
- stp $acc2,$acc3,[$tp,#8*2]
- extr $t3,$t0,$t3,#63
- adcs $acc4,$a4,$t3
- extr $t0,$t1,$t0,#63
- ldp $acc0,$acc1,[sp,#8*0]
- adcs $acc5,$a5,$t0
- extr $t1,$t2,$t1,#63
- ldp $a0,$a1,[$np,#8*0]
- adcs $acc6,$a6,$t1
- extr $t2,xzr,$t2,#63
- ldp $a2,$a3,[$np,#8*2]
- adc $acc7,$a7,$t2
- ldp $a4,$a5,[$np,#8*4]
- // Reduce by 512 bits per iteration
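- // (each pass of .Lsqr8x_reduction computes na0 = t[0]*n0 mod 2^64 and
- //  adds na0*n[0..7], cancelling the bottom word; eight passes retire
- //  512 bits, and the na0 values are put aside for .Lsqr8x_tail to
- //  apply to the remaining words of n)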
- mul $na0,$n0,$acc0 // t[0]*n0
- ldp $a6,$a7,[$np,#8*6]
- add $np_end,$np,$num
- ldp $acc2,$acc3,[sp,#8*2]
- stp $acc4,$acc5,[$tp,#8*4]
- ldp $acc4,$acc5,[sp,#8*4]
- stp $acc6,$acc7,[$tp,#8*6]
- ldp $acc6,$acc7,[sp,#8*6]
- add $np,$np,#8*8
- mov $topmost,xzr // initial top-most carry
- mov $tp,sp
- mov $cnt,#8
- .Lsqr8x_reduction:
- // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
- mul $t1,$a1,$na0
- sub $cnt,$cnt,#1
- mul $t2,$a2,$na0
- str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
- mul $t3,$a3,$na0
- // (*) adds xzr,$acc0,$t0
- subs xzr,$acc0,#1 // (*)
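- // (*) same elimination of the first mul/adds as in bn_mul_mont above:
- //     the discarded low word is zero by construction, and the carry
- //     out is simply "$acc0 != 0"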
- mul $t0,$a4,$na0
- adcs $acc0,$acc1,$t1
- mul $t1,$a5,$na0
- adcs $acc1,$acc2,$t2
- mul $t2,$a6,$na0
- adcs $acc2,$acc3,$t3
- mul $t3,$a7,$na0
- adcs $acc3,$acc4,$t0
- umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
- adcs $acc4,$acc5,$t1
- umulh $t1,$a1,$na0
- adcs $acc5,$acc6,$t2
- umulh $t2,$a2,$na0
- adcs $acc6,$acc7,$t3
- umulh $t3,$a3,$na0
- adc $acc7,xzr,xzr
- adds $acc0,$acc0,$t0
- umulh $t0,$a4,$na0
- adcs $acc1,$acc1,$t1
- umulh $t1,$a5,$na0
- adcs $acc2,$acc2,$t2
- umulh $t2,$a6,$na0
- adcs $acc3,$acc3,$t3
- umulh $t3,$a7,$na0
- mul $na0,$n0,$acc0 // next t[0]*n0
- adcs $acc4,$acc4,$t0
- adcs $acc5,$acc5,$t1
- adcs $acc6,$acc6,$t2
- adc $acc7,$acc7,$t3
- cbnz $cnt,.Lsqr8x_reduction
- ldp $t0,$t1,[$tp,#8*0]
- ldp $t2,$t3,[$tp,#8*2]
- mov $rp,$tp
- sub $cnt,$np_end,$np // done yet?
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- ldp $t0,$t1,[$tp,#8*4]
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- ldp $t2,$t3,[$tp,#8*6]
- adcs $acc4,$acc4,$t0
- adcs $acc5,$acc5,$t1
- adcs $acc6,$acc6,$t2
- adcs $acc7,$acc7,$t3
- //adc $carry,xzr,xzr // moved below
- cbz $cnt,.Lsqr8x8_post_condition
- ldur $n0,[$tp,#-8*8]
- ldp $a0,$a1,[$np,#8*0]
- ldp $a2,$a3,[$np,#8*2]
- ldp $a4,$a5,[$np,#8*4]
- mov $cnt,#-8*8
- ldp $a6,$a7,[$np,#8*6]
- add $np,$np,#8*8
- .Lsqr8x_tail:
- mul $t0,$a0,$n0
- adc $carry,xzr,xzr // carry bit, modulo-scheduled
- mul $t1,$a1,$n0
- add $cnt,$cnt,#8
- mul $t2,$a2,$n0
- mul $t3,$a3,$n0
- adds $acc0,$acc0,$t0
- mul $t0,$a4,$n0
- adcs $acc1,$acc1,$t1
- mul $t1,$a5,$n0
- adcs $acc2,$acc2,$t2
- mul $t2,$a6,$n0
- adcs $acc3,$acc3,$t3
- mul $t3,$a7,$n0
- adcs $acc4,$acc4,$t0
- umulh $t0,$a0,$n0
- adcs $acc5,$acc5,$t1
- umulh $t1,$a1,$n0
- adcs $acc6,$acc6,$t2
- umulh $t2,$a2,$n0
- adcs $acc7,$acc7,$t3
- umulh $t3,$a3,$n0
- adc $carry,$carry,xzr
- str $acc0,[$tp],#8
- adds $acc0,$acc1,$t0
- umulh $t0,$a4,$n0
- adcs $acc1,$acc2,$t1
- umulh $t1,$a5,$n0
- adcs $acc2,$acc3,$t2
- umulh $t2,$a6,$n0
- adcs $acc3,$acc4,$t3
- umulh $t3,$a7,$n0
- ldr $n0,[$rp,$cnt]
- adcs $acc4,$acc5,$t0
- adcs $acc5,$acc6,$t1
- adcs $acc6,$acc7,$t2
- adcs $acc7,$carry,$t3
- //adc $carry,xzr,xzr // moved above
- cbnz $cnt,.Lsqr8x_tail
- // note that carry flag is guaranteed
- // to be zero at this point
- ldp $a0,$a1,[$tp,#8*0]
- sub $cnt,$np_end,$np // done yet?
- sub $t2,$np_end,$num // rewound np
- ldp $a2,$a3,[$tp,#8*2]
- ldp $a4,$a5,[$tp,#8*4]
- ldp $a6,$a7,[$tp,#8*6]
- cbz $cnt,.Lsqr8x_tail_break
- ldur $n0,[$rp,#-8*8]
- adds $acc0,$acc0,$a0
- adcs $acc1,$acc1,$a1
- ldp $a0,$a1,[$np,#8*0]
- adcs $acc2,$acc2,$a2
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$np,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$np,#8*4]
- adcs $acc6,$acc6,$a6
- mov $cnt,#-8*8
- adcs $acc7,$acc7,$a7
- ldp $a6,$a7,[$np,#8*6]
- add $np,$np,#8*8
- //adc $carry,xzr,xzr // moved above
- b .Lsqr8x_tail
- .align 4
- .Lsqr8x_tail_break:
- ldr $n0,[x29,#112] // pull n0
- add $cnt,$tp,#8*8 // end of current t[num] window
- subs xzr,$topmost,#1 // "move" top-most carry to carry bit
- adcs $t0,$acc0,$a0
- adcs $t1,$acc1,$a1
- ldp $acc0,$acc1,[$rp,#8*0]
- adcs $acc2,$acc2,$a2
- ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$t2,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$t2,#8*4]
- adcs $acc6,$acc6,$a6
- adcs $acc7,$acc7,$a7
- ldp $a6,$a7,[$t2,#8*6]
- add $np,$t2,#8*8
- adc $topmost,xzr,xzr // top-most carry
- mul $na0,$n0,$acc0
- stp $t0,$t1,[$tp,#8*0]
- stp $acc2,$acc3,[$tp,#8*2]
- ldp $acc2,$acc3,[$rp,#8*2]
- stp $acc4,$acc5,[$tp,#8*4]
- ldp $acc4,$acc5,[$rp,#8*4]
- cmp $cnt,x29 // did we hit the bottom?
- stp $acc6,$acc7,[$tp,#8*6]
- mov $tp,$rp // slide the window
- ldp $acc6,$acc7,[$rp,#8*6]
- mov $cnt,#8
- b.ne .Lsqr8x_reduction
- // Final step. We check whether the result is larger than the
- // modulus, and if it is, subtract the modulus. But comparison
- // implies subtraction, so we subtract the modulus, see if the
- // subtraction borrowed, and conditionally copy the original value.
- ldr $rp,[x29,#96] // pull rp
- add $tp,$tp,#8*8
- subs $t0,$acc0,$a0
- sbcs $t1,$acc1,$a1
- sub $cnt,$num,#8*8
- mov $ap_end,$rp // $rp copy
- .Lsqr8x_sub:
- sbcs $t2,$acc2,$a2
- ldp $a0,$a1,[$np,#8*0]
- sbcs $t3,$acc3,$a3
- stp $t0,$t1,[$rp,#8*0]
- sbcs $t0,$acc4,$a4
- ldp $a2,$a3,[$np,#8*2]
- sbcs $t1,$acc5,$a5
- stp $t2,$t3,[$rp,#8*2]
- sbcs $t2,$acc6,$a6
- ldp $a4,$a5,[$np,#8*4]
- sbcs $t3,$acc7,$a7
- ldp $a6,$a7,[$np,#8*6]
- add $np,$np,#8*8
- ldp $acc0,$acc1,[$tp,#8*0]
- sub $cnt,$cnt,#8*8
- ldp $acc2,$acc3,[$tp,#8*2]
- ldp $acc4,$acc5,[$tp,#8*4]
- ldp $acc6,$acc7,[$tp,#8*6]
- add $tp,$tp,#8*8
- stp $t0,$t1,[$rp,#8*4]
- sbcs $t0,$acc0,$a0
- stp $t2,$t3,[$rp,#8*6]
- add $rp,$rp,#8*8
- sbcs $t1,$acc1,$a1
- cbnz $cnt,.Lsqr8x_sub
- sbcs $t2,$acc2,$a2
- mov $tp,sp
- add $ap,sp,$num
- ldp $a0,$a1,[$ap_end,#8*0]
- sbcs $t3,$acc3,$a3
- stp $t0,$t1,[$rp,#8*0]
- sbcs $t0,$acc4,$a4
- ldp $a2,$a3,[$ap_end,#8*2]
- sbcs $t1,$acc5,$a5
- stp $t2,$t3,[$rp,#8*2]
- sbcs $t2,$acc6,$a6
- ldp $acc0,$acc1,[$ap,#8*0]
- sbcs $t3,$acc7,$a7
- ldp $acc2,$acc3,[$ap,#8*2]
- sbcs xzr,$topmost,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
- stp $t0,$t1,[$rp,#8*4]
- stp $t2,$t3,[$rp,#8*6]
- sub $cnt,$num,#8*4
- .Lsqr4x_cond_copy:
- sub $cnt,$cnt,#8*4
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- ldp $a0,$a1,[$ap_end,#8*4]
- ldp $acc0,$acc1,[$ap,#8*4]
- csel $t2,$acc2,$a2,lo
- stp xzr,xzr,[$tp,#8*2]
- add $tp,$tp,#8*4
- csel $t3,$acc3,$a3,lo
- ldp $a2,$a3,[$ap_end,#8*6]
- ldp $acc2,$acc3,[$ap,#8*6]
- add $ap,$ap,#8*4
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
- add $ap_end,$ap_end,#8*4
- stp xzr,xzr,[$ap,#8*0]
- stp xzr,xzr,[$ap,#8*2]
- cbnz $cnt,.Lsqr4x_cond_copy
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- stp xzr,xzr,[$tp,#8*2]
- csel $t2,$acc2,$a2,lo
- csel $t3,$acc3,$a3,lo
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
- b .Lsqr8x_done
- .align 4
- .Lsqr8x8_post_condition:
- adc $carry,xzr,xzr
- ldr x30,[x29,#8] // pull return address
- // $acc0-7,$carry hold result, $a0-7 hold modulus
- subs $a0,$acc0,$a0
- ldr $ap,[x29,#96] // pull rp
- sbcs $a1,$acc1,$a1
- stp xzr,xzr,[sp,#8*0]
- sbcs $a2,$acc2,$a2
- stp xzr,xzr,[sp,#8*2]
- sbcs $a3,$acc3,$a3
- stp xzr,xzr,[sp,#8*4]
- sbcs $a4,$acc4,$a4
- stp xzr,xzr,[sp,#8*6]
- sbcs $a5,$acc5,$a5
- stp xzr,xzr,[sp,#8*8]
- sbcs $a6,$acc6,$a6
- stp xzr,xzr,[sp,#8*10]
- sbcs $a7,$acc7,$a7
- stp xzr,xzr,[sp,#8*12]
- sbcs $carry,$carry,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*14]
- // $a0-7 hold result-modulus
- csel $a0,$acc0,$a0,lo
- csel $a1,$acc1,$a1,lo
- csel $a2,$acc2,$a2,lo
- csel $a3,$acc3,$a3,lo
- stp $a0,$a1,[$ap,#8*0]
- csel $a4,$acc4,$a4,lo
- csel $a5,$acc5,$a5,lo
- stp $a2,$a3,[$ap,#8*2]
- csel $a6,$acc6,$a6,lo
- csel $a7,$acc7,$a7,lo
- stp $a4,$a5,[$ap,#8*4]
- stp $a6,$a7,[$ap,#8*6]
- .Lsqr8x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- // x30 is loaded earlier
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
- ___
- }
- {
- ########################################################################
- # Even though this might look like an ARMv8 adaptation of mulx4x_mont
- # from the x86_64-mont5 module, it differs in that it performs
- # reduction 256 bits at a time.
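- # In other words, each pass of the reduction loop computes
- # mi = t[0]*n0 mod 2^64 and folds in four 64-bit words of the modulus
- # at a time, versus the eight-word window used by __bn_sqr8x_mont.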
- my ($a0,$a1,$a2,$a3,
- $t0,$t1,$t2,$t3,
- $m0,$m1,$m2,$m3,
- $acc0,$acc1,$acc2,$acc3,$acc4,
- $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
- my $bp_end=$rp;
- my ($carry,$topmost) = ($rp,"x30");
- $code.=<<___;
- .type __bn_mul4x_mont,%function
- .align 5
- __bn_mul4x_mont:
- // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
- // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub $tp,sp,$num,lsl#3
- lsl $num,$num,#3
- ldr $n0,[$n0] // *n0
- sub sp,$tp,#8*4 // alloca
- add $t0,$bp,$num
- add $ap_end,$ap,$num
- stp $rp,$t0,[x29,#96] // offload rp and &b[num]
- ldr $bi,[$bp,#8*0] // b[0]
- ldp $a0,$a1,[$ap,#8*0] // a[0..3]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- mov $acc0,xzr
- mov $acc1,xzr
- mov $acc2,xzr
- mov $acc3,xzr
- ldp $m0,$m1,[$np,#8*0] // n[0..3]
- ldp $m2,$m3,[$np,#8*2]
- adds $np,$np,#8*4 // clear carry bit
- mov $carry,xzr
- mov $cnt,#0
- mov $tp,sp
- .Loop_mul4x_1st_reduction:
- mul $t0,$a0,$bi // lo(a[0..3]*b[0])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
- adcs $acc1,$acc1,$t1
- mul $mi,$acc0,$n0 // t[0]*n0
- adcs $acc2,$acc2,$t2
- umulh $t1,$a1,$bi
- adcs $acc3,$acc3,$t3
- umulh $t2,$a2,$bi
- adc $acc4,xzr,xzr
- umulh $t3,$a3,$bi
- ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
- adds $acc1,$acc1,$t0
- // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
- str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- // (*) adds xzr,$acc0,$t0
- subs xzr,$acc0,#1 // (*)
- umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
- adcs $acc0,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc1,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc2,$acc3,$t3
- umulh $t3,$m3,$mi
- adcs $acc3,$acc4,$carry
- adc $carry,xzr,xzr
- adds $acc0,$acc0,$t0
- sub $t0,$ap_end,$ap
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_1st_reduction
- cbz $t0,.Lmul4x4_post_condition
- ldp $a0,$a1,[$ap,#8*0] // a[4..7]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- ldr $mi,[sp] // a[0]*n0
- ldp $m0,$m1,[$np,#8*0] // n[4..7]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- .Loop_mul4x_1st_tail:
- mul $t0,$a0,$bi // lo(a[4..7]*b[i])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
- adcs $acc1,$acc1,$t1
- umulh $t1,$a1,$bi
- adcs $acc2,$acc2,$t2
- umulh $t2,$a2,$bi
- adcs $acc3,$acc3,$t3
- umulh $t3,$a3,$bi
- adc $acc4,xzr,xzr
- ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
- adds $acc1,$acc1,$t0
- mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- adds $acc0,$acc0,$t0
- umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
- adcs $acc1,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc2,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc3,$acc3,$t3
- adcs $acc4,$acc4,$carry
- umulh $t3,$m3,$mi
- adc $carry,xzr,xzr
- ldr $mi,[sp,$cnt] // next t[0]*n0
- str $acc0,[$tp],#8 // result!!!
- adds $acc0,$acc1,$t0
- sub $t0,$ap_end,$ap // done yet?
- adcs $acc1,$acc2,$t1
- adcs $acc2,$acc3,$t2
- adcs $acc3,$acc4,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_1st_tail
- sub $t1,$ap_end,$num // rewound $ap
- cbz $t0,.Lmul4x_proceed
- ldp $a0,$a1,[$ap,#8*0]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- ldp $m0,$m1,[$np,#8*0]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- b .Loop_mul4x_1st_tail
- .align 5
- .Lmul4x_proceed:
- ldr $bi,[$bp,#8*4]! // *++b
- adc $topmost,$carry,xzr
- ldp $a0,$a1,[$t1,#8*0] // a[0..3]
- sub $np,$np,$num // rewind np
- ldp $a2,$a3,[$t1,#8*2]
- add $ap,$t1,#8*4
- stp $acc0,$acc1,[$tp,#8*0] // result!!!
- ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
- stp $acc2,$acc3,[$tp,#8*2] // result!!!
- ldp $acc2,$acc3,[sp,#8*6]
- ldp $m0,$m1,[$np,#8*0] // n[0..3]
- mov $tp,sp
- ldp $m2,$m3,[$np,#8*2]
- adds $np,$np,#8*4 // clear carry bit
- mov $carry,xzr
- .align 4
- .Loop_mul4x_reduction:
- mul $t0,$a0,$bi // lo(a[0..3]*b[4])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
- adcs $acc1,$acc1,$t1
- mul $mi,$acc0,$n0 // t[0]*n0
- adcs $acc2,$acc2,$t2
- umulh $t1,$a1,$bi
- adcs $acc3,$acc3,$t3
- umulh $t2,$a2,$bi
- adc $acc4,xzr,xzr
- umulh $t3,$a3,$bi
- ldr $bi,[$bp,$cnt] // next b[i]
- adds $acc1,$acc1,$t0
- // (*) mul $t0,$m0,$mi
- str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- // (*) adds xzr,$acc0,$t0
- subs xzr,$acc0,#1 // (*)
- umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
- adcs $acc0,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc1,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc2,$acc3,$t3
- umulh $t3,$m3,$mi
- adcs $acc3,$acc4,$carry
- adc $carry,xzr,xzr
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_reduction
- adc $carry,$carry,xzr
- ldp $t0,$t1,[$tp,#8*4] // t[4..7]
- ldp $t2,$t3,[$tp,#8*6]
- ldp $a0,$a1,[$ap,#8*0] // a[4..7]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- ldr $mi,[sp] // t[0]*n0
- ldp $m0,$m1,[$np,#8*0] // n[4..7]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- .align 4
- .Loop_mul4x_tail:
- mul $t0,$a0,$bi // lo(a[4..7]*b[4])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
- adcs $acc1,$acc1,$t1
- umulh $t1,$a1,$bi
- adcs $acc2,$acc2,$t2
- umulh $t2,$a2,$bi
- adcs $acc3,$acc3,$t3
- umulh $t3,$a3,$bi
- adc $acc4,xzr,xzr
- ldr $bi,[$bp,$cnt] // next b[i]
- adds $acc1,$acc1,$t0
- mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- adds $acc0,$acc0,$t0
- umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
- adcs $acc1,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc2,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc3,$acc3,$t3
- umulh $t3,$m3,$mi
- adcs $acc4,$acc4,$carry
- ldr $mi,[sp,$cnt] // next a[0]*n0
- adc $carry,xzr,xzr
- str $acc0,[$tp],#8 // result!!!
- adds $acc0,$acc1,$t0
- sub $t0,$ap_end,$ap // done yet?
- adcs $acc1,$acc2,$t1
- adcs $acc2,$acc3,$t2
- adcs $acc3,$acc4,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_tail
- sub $t1,$np,$num // rewound np?
- adc $carry,$carry,xzr
- cbz $t0,.Loop_mul4x_break
- ldp $t0,$t1,[$tp,#8*4]
- ldp $t2,$t3,[$tp,#8*6]
- ldp $a0,$a1,[$ap,#8*0]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- ldp $m0,$m1,[$np,#8*0]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- b .Loop_mul4x_tail
- .align 4
- .Loop_mul4x_break:
- ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
- adds $acc0,$acc0,$topmost
- add $bp,$bp,#8*4 // bp++
- adcs $acc1,$acc1,xzr
- sub $ap,$ap,$num // rewind ap
- adcs $acc2,$acc2,xzr
- stp $acc0,$acc1,[$tp,#8*0] // result!!!
- adcs $acc3,$acc3,xzr
- ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
- adc $topmost,$carry,xzr
- stp $acc2,$acc3,[$tp,#8*2] // result!!!
- cmp $bp,$t3 // done yet?
- ldp $acc2,$acc3,[sp,#8*6]
- ldp $m0,$m1,[$t1,#8*0] // n[0..3]
- ldp $m2,$m3,[$t1,#8*2]
- add $np,$t1,#8*4
- b.eq .Lmul4x_post
- ldr $bi,[$bp]
- ldp $a0,$a1,[$ap,#8*0] // a[0..3]
- ldp $a2,$a3,[$ap,#8*2]
- adds $ap,$ap,#8*4 // clear carry bit
- mov $carry,xzr
- mov $tp,sp
- b .Loop_mul4x_reduction
- .align 4
- .Lmul4x_post:
- // Final step. We check whether the result is larger than the
- // modulus, and if it is, subtract the modulus. But comparison
- // implies subtraction, so we subtract the modulus, see if the
- // subtraction borrowed, and conditionally copy the original value.
- mov $rp,$t2
- mov $ap_end,$t2 // $rp copy
- subs $t0,$acc0,$m0
- add $tp,sp,#8*8
- sbcs $t1,$acc1,$m1
- sub $cnt,$num,#8*4
- .Lmul4x_sub:
- sbcs $t2,$acc2,$m2
- ldp $m0,$m1,[$np,#8*0]
- sub $cnt,$cnt,#8*4
- ldp $acc0,$acc1,[$tp,#8*0]
- sbcs $t3,$acc3,$m3
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- ldp $acc2,$acc3,[$tp,#8*2]
- add $tp,$tp,#8*4
- stp $t0,$t1,[$rp,#8*0]
- sbcs $t0,$acc0,$m0
- stp $t2,$t3,[$rp,#8*2]
- add $rp,$rp,#8*4
- sbcs $t1,$acc1,$m1
- cbnz $cnt,.Lmul4x_sub
- sbcs $t2,$acc2,$m2
- mov $tp,sp
- add $ap,sp,#8*4
- ldp $a0,$a1,[$ap_end,#8*0]
- sbcs $t3,$acc3,$m3
- stp $t0,$t1,[$rp,#8*0]
- ldp $a2,$a3,[$ap_end,#8*2]
- stp $t2,$t3,[$rp,#8*2]
- ldp $acc0,$acc1,[$ap,#8*0]
- ldp $acc2,$acc3,[$ap,#8*2]
- sbcs xzr,$topmost,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
- sub $cnt,$num,#8*4
- .Lmul4x_cond_copy:
- sub $cnt,$cnt,#8*4
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- ldp $a0,$a1,[$ap_end,#8*4]
- ldp $acc0,$acc1,[$ap,#8*4]
- csel $t2,$acc2,$a2,lo
- stp xzr,xzr,[$tp,#8*2]
- add $tp,$tp,#8*4
- csel $t3,$acc3,$a3,lo
- ldp $a2,$a3,[$ap_end,#8*6]
- ldp $acc2,$acc3,[$ap,#8*6]
- add $ap,$ap,#8*4
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
- add $ap_end,$ap_end,#8*4
- cbnz $cnt,.Lmul4x_cond_copy
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- stp xzr,xzr,[$tp,#8*2]
- csel $t2,$acc2,$a2,lo
- stp xzr,xzr,[$tp,#8*3]
- csel $t3,$acc3,$a3,lo
- stp xzr,xzr,[$tp,#8*4]
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
- b .Lmul4x_done
- .align 4
- .Lmul4x4_post_condition:
- adc $carry,$carry,xzr
- ldr $ap,[x29,#96] // pull rp
- // $acc0-3,$carry hold result, $m0-7 hold modulus
- subs $a0,$acc0,$m0
- ldr x30,[x29,#8] // pull return address
- sbcs $a1,$acc1,$m1
- stp xzr,xzr,[sp,#8*0]
- sbcs $a2,$acc2,$m2
- stp xzr,xzr,[sp,#8*2]
- sbcs $a3,$acc3,$m3
- stp xzr,xzr,[sp,#8*4]
- sbcs xzr,$carry,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*6]
- // $a0-3 hold result-modulus
- csel $a0,$acc0,$a0,lo
- csel $a1,$acc1,$a1,lo
- csel $a2,$acc2,$a2,lo
- csel $a3,$acc3,$a3,lo
- stp $a0,$a1,[$ap,#8*0]
- stp $a2,$a3,[$ap,#8*2]
- .Lmul4x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- // x30 loaded earlier
- AARCH64_VALIDATE_LINK_REGISTER
- ret
- .size __bn_mul4x_mont,.-__bn_mul4x_mont
- ___
- }
- $code.=<<___;
- .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
- .align 4
- ___
- print $code;
- close STDOUT or die "error closing STDOUT: $!";