  1. #! /usr/bin/env perl
  2. # Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # March 2015
  15. #
  16. # "Teaser" Montgomery multiplication module for ARMv8. Needs more
  17. # work. While it does improve RSA sign performance by 20-30% (less for
  18. # longer keys) on most processors, for some reason RSA2048 is not
  19. # faster and RSA4096 goes 15-20% slower on Cortex-A57. The multiplication
  20. # instruction issue rate is limited on the processor in question, meaning
  21. # that a dedicated squaring procedure is a must. Actually, all
  22. # contemporary AArch64 processors seem to have a limited multiplication
  23. # issue rate, i.e. they can't issue a multiplication every cycle, which
  24. # explains the moderate improvement coefficients in comparison to
  25. # compiler-generated code. Recall that the compiler is instructed to use
  26. # umulh and therefore uses the same number of multiplication instructions
  27. # to do the job. Assembly's edge lies in minimizing the number of
  28. # "collateral" instructions and, of course, in instruction scheduling.
  29. #
  30. # April 2015
  31. #
  32. # A squaring procedure that handles lengths divisible by 8 improves
  33. # RSA/DSA performance by 25-60% depending on processor and key
  34. # length. Overall improvement coefficients are always positive in
  35. # comparison to compiler-generated code. On Cortex-A57 the improvement
  36. # is still modest for the longest key lengths, while others exhibit e.g.
  37. # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
  38. # on Cortex-A57 and ~60-100% faster on others.
  39. # $output is the last argument if it looks like a file (it has an extension)
  40. # $flavour is the first argument if it doesn't look like a file
  41. my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  42. my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
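# A typical invocation (e.g. as driven by the OpenSSL build system) would
# therefore look like
#	perl armv8-mont.pl linux64 armv8-mont.S
# where "linux64" is the perlasm flavour handed down to arm-xlate.pl and
# the trailing argument, having an extension, is taken as $output.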
  43. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  44. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  45. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  46. die "can't locate arm-xlate.pl";
  47. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  48. or die "can't call $xlate: $!";
  49. *STDOUT=*OUT;
  50. ($lo0,$hi0,$aj,$m0,$alo,$ahi,
  51. $lo1,$hi1,$nj,$m1,$nlo,$nhi,
  52. $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
  53. # int bn_mul_mont(
  54. $rp="x0"; # BN_ULONG *rp,
  55. $ap="x1"; # const BN_ULONG *ap,
  56. $bp="x2"; # const BN_ULONG *bp,
  57. $np="x3"; # const BN_ULONG *np,
  58. $n0="x4"; # const BN_ULONG *n0,
  59. $num="x5"; # int num);
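# For reference, what the routines below compute is the Montgomery product
#	rp = ap*bp*2^(-64*num) mod np
# with n0 pointing at the usual precomputed -np[0]^(-1) mod 2^64 constant.
# A minimal Math::BigInt model of that contract (a hypothetical helper,
# not used by this generator) would be:
sub bn_mul_mont_reference {
	my ($ap,$bp,$np,$num) = @_;	# Math::BigInt operands, $np odd
	require Math::BigInt;
	my $R = Math::BigInt->new(2)->bpow(64*$num);	# R = 2^(64*num)
	my $Rinv = $R->copy()->bmodinv($np);		# R^(-1) mod np
	return $ap->copy()->bmul($bp)->bmul($Rinv)->bmod($np);
}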
  60. $code.=<<___;
  61. #include "arm_arch.h"
  62. #ifndef __KERNEL__
  63. .extern OPENSSL_armv8_rsa_neonized
  64. .hidden OPENSSL_armv8_rsa_neonized
  65. #endif
  66. .text
  67. .globl bn_mul_mont
  68. .type bn_mul_mont,%function
  69. .align 5
  70. bn_mul_mont:
  71. AARCH64_SIGN_LINK_REGISTER
  72. .Lbn_mul_mont:
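// Dispatch on the limb count $num:
//   not a multiple of 4           -> generic .Lmul_mont loop
//   $num <= 32 (or no NEON flag)  -> scalar .Lscalar_impl paths:
//     multiple of 8               -> __bn_sqr8x_mont (squares when ap==bp,
//                                    else falls through to __bn_mul4x_mont)
//     multiple of 4               -> __bn_mul4x_mont
//   otherwise                     -> bn_mul8x_mont_neon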
  73. tst $num,#3
  74. b.ne .Lmul_mont
  75. cmp $num,#32
  76. b.le .Lscalar_impl
  77. #ifndef __KERNEL__
  78. adrp x17,OPENSSL_armv8_rsa_neonized
  79. ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
  80. cbnz w17, bn_mul8x_mont_neon
  81. #endif
  82. .Lscalar_impl:
  83. tst $num,#7
  84. b.eq __bn_sqr8x_mont
  85. tst $num,#3
  86. b.eq __bn_mul4x_mont
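// What follows is the generic one-word-at-a-time loop: the first pass
// (.L1st) computes tp[] = ap[]*bp[0] + m1*np[] with m1 = tp[0]*n0 mod 2^64,
// each outer iteration (.Louter) folds ap[]*bp[i] + m1*np[] into tp[], and
// the code after .Louter performs the final conditional subtraction.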
  87. .Lmul_mont:
  88. stp x29,x30,[sp,#-64]!
  89. add x29,sp,#0
  90. stp x19,x20,[sp,#16]
  91. stp x21,x22,[sp,#32]
  92. stp x23,x24,[sp,#48]
  93. ldr $m0,[$bp],#8 // bp[0]
  94. sub $tp,sp,$num,lsl#3
  95. ldp $hi0,$aj,[$ap],#16 // ap[0..1]
  96. lsl $num,$num,#3
  97. ldr $n0,[$n0] // *n0
  98. and $tp,$tp,#-16 // ABI says so
  99. ldp $hi1,$nj,[$np],#16 // np[0..1]
  100. mul $lo0,$hi0,$m0 // ap[0]*bp[0]
  101. sub $j,$num,#16 // j=num-2
  102. umulh $hi0,$hi0,$m0
  103. mul $alo,$aj,$m0 // ap[1]*bp[0]
  104. umulh $ahi,$aj,$m0
  105. mul $m1,$lo0,$n0 // "tp[0]"*n0
  106. mov sp,$tp // alloca
  107. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  108. umulh $hi1,$hi1,$m1
  109. mul $nlo,$nj,$m1 // np[1]*m1
  110. // (*) adds $lo1,$lo1,$lo0 // discarded
  111. // (*) Concerning the removal of the first multiplication and addition
  112. // instructions: the outcome of the first addition is
  113. // guaranteed to be zero, which leaves two computationally
  114. // significant outcomes: it either carries or it doesn't. The
  115. // question then is, when does it carry? Is there an alternative
  116. // way to deduce it? If you follow the operations, you can
  117. // observe that the condition for carry is quite simple:
  118. // $lo0 being non-zero. So the carry can be calculated
  119. // by adding -1 to $lo0. That's what the next instruction does.
  120. subs xzr,$lo0,#1 // (*)
  121. umulh $nhi,$nj,$m1
  122. adc $hi1,$hi1,xzr
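// Worked out: m1 = lo0*n0 mod 2^64 and n0 = -np[0]^(-1) mod 2^64,
// so lo(np[0]*m1) = (2^64 - lo0) mod 2^64. The sum lo0 + lo(np[0]*m1)
// therefore carries exactly when lo0 is non-zero, and
// "subs xzr,$lo0,#1" recreates that carry (no borrow iff lo0 >= 1)
// without performing the discarded multiplication and addition.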
  123. cbz $j,.L1st_skip
  124. .L1st:
  125. ldr $aj,[$ap],#8
  126. adds $lo0,$alo,$hi0
  127. sub $j,$j,#8 // j--
  128. adc $hi0,$ahi,xzr
  129. ldr $nj,[$np],#8
  130. adds $lo1,$nlo,$hi1
  131. mul $alo,$aj,$m0 // ap[j]*bp[0]
  132. adc $hi1,$nhi,xzr
  133. umulh $ahi,$aj,$m0
  134. adds $lo1,$lo1,$lo0
  135. mul $nlo,$nj,$m1 // np[j]*m1
  136. adc $hi1,$hi1,xzr
  137. umulh $nhi,$nj,$m1
  138. str $lo1,[$tp],#8 // tp[j-1]
  139. cbnz $j,.L1st
  140. .L1st_skip:
  141. adds $lo0,$alo,$hi0
  142. sub $ap,$ap,$num // rewind $ap
  143. adc $hi0,$ahi,xzr
  144. adds $lo1,$nlo,$hi1
  145. sub $np,$np,$num // rewind $np
  146. adc $hi1,$nhi,xzr
  147. adds $lo1,$lo1,$lo0
  148. sub $i,$num,#8 // i=num-1
  149. adcs $hi1,$hi1,$hi0
  150. adc $ovf,xzr,xzr // upmost overflow bit
  151. stp $lo1,$hi1,[$tp]
  152. .Louter:
  153. ldr $m0,[$bp],#8 // bp[i]
  154. ldp $hi0,$aj,[$ap],#16
  155. ldr $tj,[sp] // tp[0]
  156. add $tp,sp,#8
  157. mul $lo0,$hi0,$m0 // ap[0]*bp[i]
  158. sub $j,$num,#16 // j=num-2
  159. umulh $hi0,$hi0,$m0
  160. ldp $hi1,$nj,[$np],#16
  161. mul $alo,$aj,$m0 // ap[1]*bp[i]
  162. adds $lo0,$lo0,$tj
  163. umulh $ahi,$aj,$m0
  164. adc $hi0,$hi0,xzr
  165. mul $m1,$lo0,$n0
  166. sub $i,$i,#8 // i--
  167. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  168. umulh $hi1,$hi1,$m1
  169. mul $nlo,$nj,$m1 // np[1]*m1
  170. // (*) adds $lo1,$lo1,$lo0
  171. subs xzr,$lo0,#1 // (*)
  172. umulh $nhi,$nj,$m1
  173. cbz $j,.Linner_skip
  174. .Linner:
  175. ldr $aj,[$ap],#8
  176. adc $hi1,$hi1,xzr
  177. ldr $tj,[$tp],#8 // tp[j]
  178. adds $lo0,$alo,$hi0
  179. sub $j,$j,#8 // j--
  180. adc $hi0,$ahi,xzr
  181. adds $lo1,$nlo,$hi1
  182. ldr $nj,[$np],#8
  183. adc $hi1,$nhi,xzr
  184. mul $alo,$aj,$m0 // ap[j]*bp[i]
  185. adds $lo0,$lo0,$tj
  186. umulh $ahi,$aj,$m0
  187. adc $hi0,$hi0,xzr
  188. mul $nlo,$nj,$m1 // np[j]*m1
  189. adds $lo1,$lo1,$lo0
  190. umulh $nhi,$nj,$m1
  191. stur $lo1,[$tp,#-16] // tp[j-1]
  192. cbnz $j,.Linner
  193. .Linner_skip:
  194. ldr $tj,[$tp],#8 // tp[j]
  195. adc $hi1,$hi1,xzr
  196. adds $lo0,$alo,$hi0
  197. sub $ap,$ap,$num // rewind $ap
  198. adc $hi0,$ahi,xzr
  199. adds $lo1,$nlo,$hi1
  200. sub $np,$np,$num // rewind $np
  201. adcs $hi1,$nhi,$ovf
  202. adc $ovf,xzr,xzr
  203. adds $lo0,$lo0,$tj
  204. adc $hi0,$hi0,xzr
  205. adds $lo1,$lo1,$lo0
  206. adcs $hi1,$hi1,$hi0
  207. adc $ovf,$ovf,xzr // upmost overflow bit
  208. stp $lo1,$hi1,[$tp,#-16]
  209. cbnz $i,.Louter
  210. // Final step. We check whether the result is larger than the modulus
  211. // and, if it is, subtract the modulus. But comparison implies
  212. // subtraction. So we subtract the modulus, see if it borrowed,
  213. // and conditionally copy the original value.
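// In effect, with borrow set when tp < np:
//	rp[j] = borrow ? tp[j] : tp[j] - np[j]
// the selection being done with csel in .Lcond_copy below rather than
// with a branch on the comparison result.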
  214. ldr $tj,[sp] // tp[0]
  215. add $tp,sp,#8
  216. ldr $nj,[$np],#8 // np[0]
  217. subs $j,$num,#8 // j=num-1 and clear borrow
  218. mov $ap,$rp
  219. .Lsub:
  220. sbcs $aj,$tj,$nj // tp[j]-np[j]
  221. ldr $tj,[$tp],#8
  222. sub $j,$j,#8 // j--
  223. ldr $nj,[$np],#8
  224. str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
  225. cbnz $j,.Lsub
  226. sbcs $aj,$tj,$nj
  227. sbcs $ovf,$ovf,xzr // did it borrow?
  228. str $aj,[$ap],#8 // rp[num-1]
  229. ldr $tj,[sp] // tp[0]
  230. add $tp,sp,#8
  231. ldr $aj,[$rp],#8 // rp[0]
  232. sub $num,$num,#8 // num--
  233. nop
  234. .Lcond_copy:
  235. sub $num,$num,#8 // num--
  236. csel $nj,$tj,$aj,lo // did it borrow?
  237. ldr $tj,[$tp],#8
  238. ldr $aj,[$rp],#8
  239. stur xzr,[$tp,#-16] // wipe tp
  240. stur $nj,[$rp,#-16]
  241. cbnz $num,.Lcond_copy
  242. csel $nj,$tj,$aj,lo
  243. stur xzr,[$tp,#-8] // wipe tp
  244. stur $nj,[$rp,#-8]
  245. ldp x19,x20,[x29,#16]
  246. mov sp,x29
  247. ldp x21,x22,[x29,#32]
  248. mov x0,#1
  249. ldp x23,x24,[x29,#48]
  250. ldr x29,[sp],#64
  251. AARCH64_VALIDATE_LINK_REGISTER
  252. ret
  253. .size bn_mul_mont,.-bn_mul_mont
  254. ___
  255. {
  256. my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
  257. my ($Z,$Temp)=("v4.16b","v5");
  258. my @ACC=map("v$_",(6..13));
  259. my ($Bi,$Ni,$M0)=map("v$_",(28..30));
  260. my $sBi="s28";
  261. my $sM0="s30";
  262. my $zero="v14";
  263. my $temp="v15";
  264. my $ACCTemp="v16";
  265. my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
  266. my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
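# NEON path overview: $num is doubled on entry, so the code below works on
# 32-bit limbs. Each outer iteration consumes eight b[] words; partial
# products are accumulated in the 64-bit lanes of @ACC[0..7], and the
# per-word Montgomery factors are stashed at $bnptr and replayed by the
# inner loop. .LNEON_tail then folds the redundant accumulators back into
# 32-bit words, and .LNEON_sub/.LNEON_copy_n_zap perform the conditional
# subtraction of the modulus while wiping the temporary stack frame.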
  267. $code.=<<___;
  268. .type bn_mul8x_mont_neon,%function
  269. .align 5
  270. bn_mul8x_mont_neon:
  271. // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
  272. // only from bn_mul_mont which has already signed the return address.
  273. stp x29,x30,[sp,#-80]!
  274. mov x16,sp
  275. stp d8,d9,[sp,#16]
  276. stp d10,d11,[sp,#32]
  277. stp d12,d13,[sp,#48]
  278. stp d14,d15,[sp,#64]
  279. lsl $num,$num,#1
  280. eor $zero.16b,$zero.16b,$zero.16b
  281. .align 4
  282. .LNEON_8n:
  283. eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
  284. sub $toutptr,sp,#128
  285. eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
  286. sub $toutptr,$toutptr,$num,lsl#4
  287. eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
  288. and $toutptr,$toutptr,#-64
  289. eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
  290. mov sp,$toutptr // alloca
  291. eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
  292. add $toutptr,$toutptr,#256
  293. eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
  294. sub $inner,$num,#8
  295. eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
  296. eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
  297. .LNEON_8n_init:
  298. st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
  299. subs $inner,$inner,#8
  300. st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
  301. st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
  302. st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
  303. bne .LNEON_8n_init
  304. add $tinptr,sp,#256
  305. ld1 {$A0.4s,$A1.4s},[$aptr],#32
  306. add $bnptr,sp,#8
  307. ldr $sM0,[$n0],#4
  308. mov $outer,$num
  309. b .LNEON_8n_outer
  310. .align 4
  311. .LNEON_8n_outer:
  312. ldr $sBi,[$bptr],#4 // *b++
  313. uxtl $Bi.4s,$Bi.4h
  314. add $toutptr,sp,#128
  315. ld1 {$N0.4s,$N1.4s},[$nptr],#32
  316. umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
  317. umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
  318. umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
  319. shl $Ni.2d,@ACC[0].2d,#16
  320. ext $Ni.16b,$Ni.16b,$Ni.16b,#8
  321. umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
  322. add $Ni.2d,$Ni.2d,@ACC[0].2d
  323. umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
  324. mul $Ni.2s,$Ni.2s,$M0.2s
  325. umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
  326. st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
  327. umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
  328. uxtl $Ni.4s,$Ni.4h
  329. umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
  330. ___
  331. for ($i=0; $i<7;) {
  332. $code.=<<___;
  333. ldr $sBi,[$bptr],#4 // *b++
  334. umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
  335. umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
  336. uxtl $Bi.4s,$Bi.4h
  337. umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
  338. ushr $temp.2d,@ACC[0].2d,#16
  339. umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
  340. umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
  341. ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
  342. add @ACC[0].2d,@ACC[0].2d,$temp.2d
  343. umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
  344. ushr @ACC[0].2d,@ACC[0].2d,#16
  345. umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
  346. umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
  347. add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
  348. ins @ACC[1].d[0],$ACCTemp.d[0]
  349. st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
  350. ___
  351. push(@ACC,shift(@ACC)); $i++;
  352. $code.=<<___;
  353. umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
  354. ld1 {@ACC[7].2d},[$tinptr],#16
  355. umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
  356. umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
  357. shl $Ni.2d,@ACC[0].2d,#16
  358. ext $Ni.16b,$Ni.16b,$Ni.16b,#8
  359. umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
  360. add $Ni.2d,$Ni.2d,@ACC[0].2d
  361. umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
  362. mul $Ni.2s,$Ni.2s,$M0.2s
  363. umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
  364. st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
  365. umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
  366. uxtl $Ni.4s,$Ni.4h
  367. umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
  368. ___
  369. }
  370. $code.=<<___;
  371. ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
  372. umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
  373. ld1 {$A0.4s,$A1.4s},[$aptr],#32
  374. umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
  375. umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
  376. mov $Temp.16b,@ACC[0].16b
  377. ushr $Temp.2d,$Temp.2d,#16
  378. ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
  379. umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
  380. umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
  381. add @ACC[0].2d,@ACC[0].2d,$Temp.2d
  382. umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
  383. ushr @ACC[0].2d,@ACC[0].2d,#16
  384. eor $temp.16b,$temp.16b,$temp.16b
  385. ins @ACC[0].d[1],$temp.d[0]
  386. umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
  387. umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
  388. add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
  389. st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
  390. add $bnptr,sp,#8 // rewind
  391. ___
  392. push(@ACC,shift(@ACC));
  393. $code.=<<___;
  394. sub $inner,$num,#8
  395. b .LNEON_8n_inner
  396. .align 4
  397. .LNEON_8n_inner:
  398. subs $inner,$inner,#8
  399. umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
  400. ld1 {@ACC[7].2d},[$tinptr]
  401. umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
  402. ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
  403. umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
  404. ld1 {$N0.4s,$N1.4s},[$nptr],#32
  405. umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
  406. b.eq .LInner_jump
  407. add $tinptr,$tinptr,#16 // don't advance in last iteration
  408. .LInner_jump:
  409. umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
  410. umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
  411. umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
  412. umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
  413. ___
  414. for ($i=1; $i<8; $i++) {
  415. $code.=<<___;
  416. ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
  417. umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
  418. umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
  419. umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
  420. umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
  421. umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
  422. umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
  423. umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
  424. umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
  425. st1 {@ACC[0].2d},[$toutptr],#16
  426. ___
  427. push(@ACC,shift(@ACC));
  428. $code.=<<___;
  429. umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
  430. ld1 {@ACC[7].2d},[$tinptr]
  431. umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
  432. ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
  433. umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
  434. b.eq .LInner_jump$i
  435. add $tinptr,$tinptr,#16 // don't advance in last iteration
  436. .LInner_jump$i:
  437. umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
  438. umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
  439. umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
  440. umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
  441. umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
  442. ___
  443. }
  444. $code.=<<___;
  445. b.ne .LInner_after_rewind$i
  446. sub $aptr,$aptr,$num,lsl#2 // rewind
  447. .LInner_after_rewind$i:
  448. umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
  449. ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
  450. umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
  451. ld1 {$A0.4s,$A1.4s},[$aptr],#32
  452. umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
  453. add $bnptr,sp,#8 // rewind
  454. umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
  455. umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
  456. umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
  457. umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
  458. st1 {@ACC[0].2d},[$toutptr],#16
  459. umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
  460. bne .LNEON_8n_inner
  461. ___
  462. push(@ACC,shift(@ACC));
  463. $code.=<<___;
  464. add $tinptr,sp,#128
  465. st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
  466. eor $N0.16b,$N0.16b,$N0.16b // $N0
  467. st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
  468. eor $N1.16b,$N1.16b,$N1.16b // $N1
  469. st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
  470. st1 {@ACC[6].2d},[$toutptr]
  471. subs $outer,$outer,#8
  472. ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
  473. ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
  474. ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
  475. ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
  476. b.eq .LInner_8n_jump_2steps
  477. sub $nptr,$nptr,$num,lsl#2 // rewind
  478. b .LNEON_8n_outer
  479. .LInner_8n_jump_2steps:
  480. add $toutptr,sp,#128
  481. st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
  482. mov $Temp.16b,@ACC[0].16b
  483. ushr $temp.2d,@ACC[0].2d,#16
  484. ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
  485. st1 {$N0.2d,$N1.2d}, [sp],#32
  486. add @ACC[0].2d,@ACC[0].2d,$temp.2d
  487. st1 {$N0.2d,$N1.2d}, [sp],#32
  488. ushr $temp.2d,@ACC[0].2d,#16
  489. st1 {$N0.2d,$N1.2d}, [sp],#32
  490. zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
  491. ins $temp.d[1],$zero.d[0]
  492. mov $inner,$num
  493. b .LNEON_tail_entry
  494. .align 4
  495. .LNEON_tail:
  496. add @ACC[0].2d,@ACC[0].2d,$temp.2d
  497. mov $Temp.16b,@ACC[0].16b
  498. ushr $temp.2d,@ACC[0].2d,#16
  499. ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
  500. ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
  501. add @ACC[0].2d,@ACC[0].2d,$temp.2d
  502. ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
  503. ushr $temp.2d,@ACC[0].2d,#16
  504. ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
  505. zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
  506. ins $temp.d[1],$zero.d[0]
  507. .LNEON_tail_entry:
  508. ___
  509. for ($i=1; $i<8; $i++) {
  510. $code.=<<___;
  511. add @ACC[1].2d,@ACC[1].2d,$temp.2d
  512. st1 {@ACC[0].s}[0], [$toutptr],#4
  513. ushr $temp.2d,@ACC[1].2d,#16
  514. mov $Temp.16b,@ACC[1].16b
  515. ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
  516. add @ACC[1].2d,@ACC[1].2d,$temp.2d
  517. ushr $temp.2d,@ACC[1].2d,#16
  518. zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
  519. ins $temp.d[1],$zero.d[0]
  520. ___
  521. push(@ACC,shift(@ACC));
  522. }
  523. push(@ACC,shift(@ACC));
  524. $code.=<<___;
  525. ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
  526. subs $inner,$inner,#8
  527. st1 {@ACC[7].s}[0], [$toutptr],#4
  528. bne .LNEON_tail
  529. st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
  530. sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
  531. subs $aptr,sp,#0 // clear borrow (sets C) for the sbcs chain below
  532. add $bptr,sp,$num,lsl#2
  533. .LNEON_sub:
  534. ldp w4,w5,[$aptr],#8
  535. ldp w6,w7,[$aptr],#8
  536. ldp w8,w9,[$nptr],#8
  537. ldp w10,w11,[$nptr],#8
  538. sbcs w8,w4,w8
  539. sbcs w9,w5,w9
  540. sbcs w10,w6,w10
  541. sbcs w11,w7,w11
  542. sub x17,$bptr,$aptr
  543. stp w8,w9,[$rptr],#8
  544. stp w10,w11,[$rptr],#8
  545. cbnz x17,.LNEON_sub
  546. ldr w10, [$aptr] // load top-most bit
  547. mov x11,sp
  548. eor v0.16b,v0.16b,v0.16b
  549. sub x11,$bptr,x11 // this is num*4
  550. eor v1.16b,v1.16b,v1.16b
  551. mov $aptr,sp
  552. sub $rptr,$rptr,x11 // rewind $rptr
  553. mov $nptr,$bptr // second 3/4th of frame
  554. sbcs w10,w10,wzr // result is carry flag
  555. .LNEON_copy_n_zap:
  556. ldp w4,w5,[$aptr],#8
  557. ldp w6,w7,[$aptr],#8
  558. ldp w8,w9,[$rptr],#8
  559. ldp w10,w11,[$rptr]
  560. sub $rptr,$rptr,#8
  561. b.cs .LCopy_1
  562. mov w8,w4
  563. mov w9,w5
  564. mov w10,w6
  565. mov w11,w7
  566. .LCopy_1:
  567. st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
  568. st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
  569. ldp w4,w5,[$aptr],#8
  570. ldp w6,w7,[$aptr],#8
  571. stp w8,w9,[$rptr],#8
  572. stp w10,w11,[$rptr],#8
  573. sub $aptr,$aptr,#32
  574. ldp w8,w9,[$rptr],#8
  575. ldp w10,w11,[$rptr]
  576. sub $rptr,$rptr,#8
  577. b.cs .LCopy_2
  578. mov w8, w4
  579. mov w9, w5
  580. mov w10, w6
  581. mov w11, w7
  582. .LCopy_2:
  583. st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
  584. st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
  585. sub x17,$bptr,$aptr // preserves carry
  586. stp w8,w9,[$rptr],#8
  587. stp w10,w11,[$rptr],#8
  588. cbnz x17,.LNEON_copy_n_zap
  589. mov sp,x16
  590. ldp d14,d15,[sp,#64]
  591. ldp d12,d13,[sp,#48]
  592. ldp d10,d11,[sp,#32]
  593. ldp d8,d9,[sp,#16]
  594. ldr x29,[sp],#80
  595. AARCH64_VALIDATE_LINK_REGISTER
  596. ret // bx lr
  597. .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
  598. ___
  599. }
  600. {
  601. ########################################################################
  602. # The following is an ARMv8 adaptation of sqrx8x_mont from the x86_64-mont5 module.
  603. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
  604. my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
  605. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
  606. my ($cnt,$carry,$topmost)=("x27","x28","x30");
  607. my ($tp,$ap_end,$na0)=($bp,$np,$carry);
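# Squaring overview: the first phase accumulates only the off-diagonal
# products a[i]*a[j], i<j, eight limbs of a[] at a time; .Lsqr8x_outer_break
# then doubles that triangle with a one-bit shift across limbs and adds the
# diagonal squares a[i]^2; .Lsqr8x_reduction/.Lsqr8x_tail perform Montgomery
# reduction 512 bits (eight limbs) per pass; and the code from .Lsqr8x_sub
# onwards does the usual conditional subtraction of the modulus.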
  608. $code.=<<___;
  609. .type __bn_sqr8x_mont,%function
  610. .align 5
  611. __bn_sqr8x_mont:
  612. cmp $ap,$bp
  613. b.ne __bn_mul4x_mont
  614. .Lsqr8x_mont:
  615. // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
  616. // only from bn_mul_mont which has already signed the return address.
  617. stp x29,x30,[sp,#-128]!
  618. add x29,sp,#0
  619. stp x19,x20,[sp,#16]
  620. stp x21,x22,[sp,#32]
  621. stp x23,x24,[sp,#48]
  622. stp x25,x26,[sp,#64]
  623. stp x27,x28,[sp,#80]
  624. stp $rp,$np,[sp,#96] // offload rp and np
  625. ldp $a0,$a1,[$ap,#8*0]
  626. ldp $a2,$a3,[$ap,#8*2]
  627. ldp $a4,$a5,[$ap,#8*4]
  628. ldp $a6,$a7,[$ap,#8*6]
  629. sub $tp,sp,$num,lsl#4
  630. lsl $num,$num,#3
  631. ldr $n0,[$n0] // *n0
  632. mov sp,$tp // alloca
  633. sub $cnt,$num,#8*8
  634. b .Lsqr8x_zero_start
  635. .Lsqr8x_zero:
  636. sub $cnt,$cnt,#8*8
  637. stp xzr,xzr,[$tp,#8*0]
  638. stp xzr,xzr,[$tp,#8*2]
  639. stp xzr,xzr,[$tp,#8*4]
  640. stp xzr,xzr,[$tp,#8*6]
  641. .Lsqr8x_zero_start:
  642. stp xzr,xzr,[$tp,#8*8]
  643. stp xzr,xzr,[$tp,#8*10]
  644. stp xzr,xzr,[$tp,#8*12]
  645. stp xzr,xzr,[$tp,#8*14]
  646. add $tp,$tp,#8*16
  647. cbnz $cnt,.Lsqr8x_zero
  648. add $ap_end,$ap,$num
  649. add $ap,$ap,#8*8
  650. mov $acc0,xzr
  651. mov $acc1,xzr
  652. mov $acc2,xzr
  653. mov $acc3,xzr
  654. mov $acc4,xzr
  655. mov $acc5,xzr
  656. mov $acc6,xzr
  657. mov $acc7,xzr
  658. mov $tp,sp
  659. str $n0,[x29,#112] // offload n0
  660. // Multiply everything but a[i]*a[i]
  661. .align 4
  662. .Lsqr8x_outer_loop:
  663. // a[1]a[0] (i)
  664. // a[2]a[0]
  665. // a[3]a[0]
  666. // a[4]a[0]
  667. // a[5]a[0]
  668. // a[6]a[0]
  669. // a[7]a[0]
  670. // a[2]a[1] (ii)
  671. // a[3]a[1]
  672. // a[4]a[1]
  673. // a[5]a[1]
  674. // a[6]a[1]
  675. // a[7]a[1]
  676. // a[3]a[2] (iii)
  677. // a[4]a[2]
  678. // a[5]a[2]
  679. // a[6]a[2]
  680. // a[7]a[2]
  681. // a[4]a[3] (iv)
  682. // a[5]a[3]
  683. // a[6]a[3]
  684. // a[7]a[3]
  685. // a[5]a[4] (v)
  686. // a[6]a[4]
  687. // a[7]a[4]
  688. // a[6]a[5] (vi)
  689. // a[7]a[5]
  690. // a[7]a[6] (vii)
  691. mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
  692. mul $t1,$a2,$a0
  693. mul $t2,$a3,$a0
  694. mul $t3,$a4,$a0
  695. adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
  696. mul $t0,$a5,$a0
  697. adcs $acc2,$acc2,$t1
  698. mul $t1,$a6,$a0
  699. adcs $acc3,$acc3,$t2
  700. mul $t2,$a7,$a0
  701. adcs $acc4,$acc4,$t3
  702. umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
  703. adcs $acc5,$acc5,$t0
  704. umulh $t0,$a2,$a0
  705. adcs $acc6,$acc6,$t1
  706. umulh $t1,$a3,$a0
  707. adcs $acc7,$acc7,$t2
  708. umulh $t2,$a4,$a0
  709. stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
  710. adc $acc0,xzr,xzr // t[8]
  711. adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
  712. umulh $t3,$a5,$a0
  713. adcs $acc3,$acc3,$t0
  714. umulh $t0,$a6,$a0
  715. adcs $acc4,$acc4,$t1
  716. umulh $t1,$a7,$a0
  717. adcs $acc5,$acc5,$t2
  718. mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
  719. adcs $acc6,$acc6,$t3
  720. mul $t3,$a3,$a1
  721. adcs $acc7,$acc7,$t0
  722. mul $t0,$a4,$a1
  723. adc $acc0,$acc0,$t1
  724. mul $t1,$a5,$a1
  725. adds $acc3,$acc3,$t2
  726. mul $t2,$a6,$a1
  727. adcs $acc4,$acc4,$t3
  728. mul $t3,$a7,$a1
  729. adcs $acc5,$acc5,$t0
  730. umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
  731. adcs $acc6,$acc6,$t1
  732. umulh $t1,$a3,$a1
  733. adcs $acc7,$acc7,$t2
  734. umulh $t2,$a4,$a1
  735. adcs $acc0,$acc0,$t3
  736. umulh $t3,$a5,$a1
  737. stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
  738. adc $acc1,xzr,xzr // t[9]
  739. adds $acc4,$acc4,$t0
  740. umulh $t0,$a6,$a1
  741. adcs $acc5,$acc5,$t1
  742. umulh $t1,$a7,$a1
  743. adcs $acc6,$acc6,$t2
  744. mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
  745. adcs $acc7,$acc7,$t3
  746. mul $t3,$a4,$a2
  747. adcs $acc0,$acc0,$t0
  748. mul $t0,$a5,$a2
  749. adc $acc1,$acc1,$t1
  750. mul $t1,$a6,$a2
  751. adds $acc5,$acc5,$t2
  752. mul $t2,$a7,$a2
  753. adcs $acc6,$acc6,$t3
  754. umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
  755. adcs $acc7,$acc7,$t0
  756. umulh $t0,$a4,$a2
  757. adcs $acc0,$acc0,$t1
  758. umulh $t1,$a5,$a2
  759. adcs $acc1,$acc1,$t2
  760. umulh $t2,$a6,$a2
  761. stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
  762. adc $acc2,xzr,xzr // t[10]
  763. adds $acc6,$acc6,$t3
  764. umulh $t3,$a7,$a2
  765. adcs $acc7,$acc7,$t0
  766. mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
  767. adcs $acc0,$acc0,$t1
  768. mul $t1,$a5,$a3
  769. adcs $acc1,$acc1,$t2
  770. mul $t2,$a6,$a3
  771. adc $acc2,$acc2,$t3
  772. mul $t3,$a7,$a3
  773. adds $acc7,$acc7,$t0
  774. umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
  775. adcs $acc0,$acc0,$t1
  776. umulh $t1,$a5,$a3
  777. adcs $acc1,$acc1,$t2
  778. umulh $t2,$a6,$a3
  779. adcs $acc2,$acc2,$t3
  780. umulh $t3,$a7,$a3
  781. stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
  782. adc $acc3,xzr,xzr // t[11]
  783. adds $acc0,$acc0,$t0
  784. mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
  785. adcs $acc1,$acc1,$t1
  786. mul $t1,$a6,$a4
  787. adcs $acc2,$acc2,$t2
  788. mul $t2,$a7,$a4
  789. adc $acc3,$acc3,$t3
  790. umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
  791. adds $acc1,$acc1,$t0
  792. umulh $t0,$a6,$a4
  793. adcs $acc2,$acc2,$t1
  794. umulh $t1,$a7,$a4
  795. adcs $acc3,$acc3,$t2
  796. mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
  797. adc $acc4,xzr,xzr // t[12]
  798. adds $acc2,$acc2,$t3
  799. mul $t3,$a7,$a5
  800. adcs $acc3,$acc3,$t0
  801. umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
  802. adc $acc4,$acc4,$t1
  803. umulh $t1,$a7,$a5
  804. adds $acc3,$acc3,$t2
  805. mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
  806. adcs $acc4,$acc4,$t3
  807. umulh $t3,$a7,$a6 // hi(a[7]*a[6])
  808. adc $acc5,xzr,xzr // t[13]
  809. adds $acc4,$acc4,$t0
  810. sub $cnt,$ap_end,$ap // done yet?
  811. adc $acc5,$acc5,$t1
  812. adds $acc5,$acc5,$t2
  813. sub $t0,$ap_end,$num // rewound ap
  814. adc $acc6,xzr,xzr // t[14]
  815. add $acc6,$acc6,$t3
  816. cbz $cnt,.Lsqr8x_outer_break
  817. mov $n0,$a0
  818. ldp $a0,$a1,[$tp,#8*0]
  819. ldp $a2,$a3,[$tp,#8*2]
  820. ldp $a4,$a5,[$tp,#8*4]
  821. ldp $a6,$a7,[$tp,#8*6]
  822. adds $acc0,$acc0,$a0
  823. adcs $acc1,$acc1,$a1
  824. ldp $a0,$a1,[$ap,#8*0]
  825. adcs $acc2,$acc2,$a2
  826. adcs $acc3,$acc3,$a3
  827. ldp $a2,$a3,[$ap,#8*2]
  828. adcs $acc4,$acc4,$a4
  829. adcs $acc5,$acc5,$a5
  830. ldp $a4,$a5,[$ap,#8*4]
  831. adcs $acc6,$acc6,$a6
  832. mov $rp,$ap
  833. adcs $acc7,xzr,$a7
  834. ldp $a6,$a7,[$ap,#8*6]
  835. add $ap,$ap,#8*8
  836. //adc $carry,xzr,xzr // moved below
  837. mov $cnt,#-8*8
  838. // a[8]a[0]
  839. // a[9]a[0]
  840. // a[a]a[0]
  841. // a[b]a[0]
  842. // a[c]a[0]
  843. // a[d]a[0]
  844. // a[e]a[0]
  845. // a[f]a[0]
  846. // a[8]a[1]
  847. // a[f]a[1]........................
  848. // a[8]a[2]
  849. // a[f]a[2]........................
  850. // a[8]a[3]
  851. // a[f]a[3]........................
  852. // a[8]a[4]
  853. // a[f]a[4]........................
  854. // a[8]a[5]
  855. // a[f]a[5]........................
  856. // a[8]a[6]
  857. // a[f]a[6]........................
  858. // a[8]a[7]
  859. // a[f]a[7]........................
  860. .Lsqr8x_mul:
  861. mul $t0,$a0,$n0
  862. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  863. mul $t1,$a1,$n0
  864. add $cnt,$cnt,#8
  865. mul $t2,$a2,$n0
  866. mul $t3,$a3,$n0
  867. adds $acc0,$acc0,$t0
  868. mul $t0,$a4,$n0
  869. adcs $acc1,$acc1,$t1
  870. mul $t1,$a5,$n0
  871. adcs $acc2,$acc2,$t2
  872. mul $t2,$a6,$n0
  873. adcs $acc3,$acc3,$t3
  874. mul $t3,$a7,$n0
  875. adcs $acc4,$acc4,$t0
  876. umulh $t0,$a0,$n0
  877. adcs $acc5,$acc5,$t1
  878. umulh $t1,$a1,$n0
  879. adcs $acc6,$acc6,$t2
  880. umulh $t2,$a2,$n0
  881. adcs $acc7,$acc7,$t3
  882. umulh $t3,$a3,$n0
  883. adc $carry,$carry,xzr
  884. str $acc0,[$tp],#8
  885. adds $acc0,$acc1,$t0
  886. umulh $t0,$a4,$n0
  887. adcs $acc1,$acc2,$t1
  888. umulh $t1,$a5,$n0
  889. adcs $acc2,$acc3,$t2
  890. umulh $t2,$a6,$n0
  891. adcs $acc3,$acc4,$t3
  892. umulh $t3,$a7,$n0
  893. ldr $n0,[$rp,$cnt]
  894. adcs $acc4,$acc5,$t0
  895. adcs $acc5,$acc6,$t1
  896. adcs $acc6,$acc7,$t2
  897. adcs $acc7,$carry,$t3
  898. //adc $carry,xzr,xzr // moved above
  899. cbnz $cnt,.Lsqr8x_mul
  900. // note that carry flag is guaranteed
  901. // to be zero at this point
  902. cmp $ap,$ap_end // done yet?
  903. b.eq .Lsqr8x_break
  904. ldp $a0,$a1,[$tp,#8*0]
  905. ldp $a2,$a3,[$tp,#8*2]
  906. ldp $a4,$a5,[$tp,#8*4]
  907. ldp $a6,$a7,[$tp,#8*6]
  908. adds $acc0,$acc0,$a0
  909. ldur $n0,[$rp,#-8*8]
  910. adcs $acc1,$acc1,$a1
  911. ldp $a0,$a1,[$ap,#8*0]
  912. adcs $acc2,$acc2,$a2
  913. adcs $acc3,$acc3,$a3
  914. ldp $a2,$a3,[$ap,#8*2]
  915. adcs $acc4,$acc4,$a4
  916. adcs $acc5,$acc5,$a5
  917. ldp $a4,$a5,[$ap,#8*4]
  918. adcs $acc6,$acc6,$a6
  919. mov $cnt,#-8*8
  920. adcs $acc7,$acc7,$a7
  921. ldp $a6,$a7,[$ap,#8*6]
  922. add $ap,$ap,#8*8
  923. //adc $carry,xzr,xzr // moved above
  924. b .Lsqr8x_mul
  925. .align 4
  926. .Lsqr8x_break:
  927. ldp $a0,$a1,[$rp,#8*0]
  928. add $ap,$rp,#8*8
  929. ldp $a2,$a3,[$rp,#8*2]
  930. sub $t0,$ap_end,$ap // is it last iteration?
  931. ldp $a4,$a5,[$rp,#8*4]
  932. sub $t1,$tp,$t0
  933. ldp $a6,$a7,[$rp,#8*6]
  934. cbz $t0,.Lsqr8x_outer_loop
  935. stp $acc0,$acc1,[$tp,#8*0]
  936. ldp $acc0,$acc1,[$t1,#8*0]
  937. stp $acc2,$acc3,[$tp,#8*2]
  938. ldp $acc2,$acc3,[$t1,#8*2]
  939. stp $acc4,$acc5,[$tp,#8*4]
  940. ldp $acc4,$acc5,[$t1,#8*4]
  941. stp $acc6,$acc7,[$tp,#8*6]
  942. mov $tp,$t1
  943. ldp $acc6,$acc7,[$t1,#8*6]
  944. b .Lsqr8x_outer_loop
  945. .align 4
  946. .Lsqr8x_outer_break:
  947. // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
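// i.e. for a = sum(a[i]*2^(64*i)):
//	a^2 = sum(a[i]^2*2^(128*i)) + 2*sum_{i<j}(a[i]*a[j]*2^(64*(i+j)))
// The doubling is done on the fly: "extr x,y,x,#63" yields
// (y<<1)|(x>>63), i.e. the accumulated cross products shifted left by
// one bit across limb boundaries, to which the squares are then added.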
  948. ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
  949. ldp $t1,$t2,[sp,#8*1]
  950. ldp $a5,$a7,[$t0,#8*2]
  951. add $ap,$t0,#8*4
  952. ldp $t3,$t0,[sp,#8*3]
  953. stp $acc0,$acc1,[$tp,#8*0]
  954. mul $acc0,$a1,$a1
  955. stp $acc2,$acc3,[$tp,#8*2]
  956. umulh $a1,$a1,$a1
  957. stp $acc4,$acc5,[$tp,#8*4]
  958. mul $a2,$a3,$a3
  959. stp $acc6,$acc7,[$tp,#8*6]
  960. mov $tp,sp
  961. umulh $a3,$a3,$a3
  962. adds $acc1,$a1,$t1,lsl#1
  963. extr $t1,$t2,$t1,#63
  964. sub $cnt,$num,#8*4
  965. .Lsqr4x_shift_n_add:
  966. adcs $acc2,$a2,$t1
  967. extr $t2,$t3,$t2,#63
  968. sub $cnt,$cnt,#8*4
  969. adcs $acc3,$a3,$t2
  970. ldp $t1,$t2,[$tp,#8*5]
  971. mul $a4,$a5,$a5
  972. ldp $a1,$a3,[$ap],#8*2
  973. umulh $a5,$a5,$a5
  974. mul $a6,$a7,$a7
  975. umulh $a7,$a7,$a7
  976. extr $t3,$t0,$t3,#63
  977. stp $acc0,$acc1,[$tp,#8*0]
  978. adcs $acc4,$a4,$t3
  979. extr $t0,$t1,$t0,#63
  980. stp $acc2,$acc3,[$tp,#8*2]
  981. adcs $acc5,$a5,$t0
  982. ldp $t3,$t0,[$tp,#8*7]
  983. extr $t1,$t2,$t1,#63
  984. adcs $acc6,$a6,$t1
  985. extr $t2,$t3,$t2,#63
  986. adcs $acc7,$a7,$t2
  987. ldp $t1,$t2,[$tp,#8*9]
  988. mul $a0,$a1,$a1
  989. ldp $a5,$a7,[$ap],#8*2
  990. umulh $a1,$a1,$a1
  991. mul $a2,$a3,$a3
  992. umulh $a3,$a3,$a3
  993. stp $acc4,$acc5,[$tp,#8*4]
  994. extr $t3,$t0,$t3,#63
  995. stp $acc6,$acc7,[$tp,#8*6]
  996. add $tp,$tp,#8*8
  997. adcs $acc0,$a0,$t3
  998. extr $t0,$t1,$t0,#63
  999. adcs $acc1,$a1,$t0
  1000. ldp $t3,$t0,[$tp,#8*3]
  1001. extr $t1,$t2,$t1,#63
  1002. cbnz $cnt,.Lsqr4x_shift_n_add
  1003. ___
  1004. my ($np,$np_end)=($ap,$ap_end);
  1005. $code.=<<___;
  1006. ldp $np,$n0,[x29,#104] // pull np and n0
  1007. adcs $acc2,$a2,$t1
  1008. extr $t2,$t3,$t2,#63
  1009. adcs $acc3,$a3,$t2
  1010. ldp $t1,$t2,[$tp,#8*5]
  1011. mul $a4,$a5,$a5
  1012. umulh $a5,$a5,$a5
  1013. stp $acc0,$acc1,[$tp,#8*0]
  1014. mul $a6,$a7,$a7
  1015. umulh $a7,$a7,$a7
  1016. stp $acc2,$acc3,[$tp,#8*2]
  1017. extr $t3,$t0,$t3,#63
  1018. adcs $acc4,$a4,$t3
  1019. extr $t0,$t1,$t0,#63
  1020. ldp $acc0,$acc1,[sp,#8*0]
  1021. adcs $acc5,$a5,$t0
  1022. extr $t1,$t2,$t1,#63
  1023. ldp $a0,$a1,[$np,#8*0]
  1024. adcs $acc6,$a6,$t1
  1025. extr $t2,xzr,$t2,#63
  1026. ldp $a2,$a3,[$np,#8*2]
  1027. adc $acc7,$a7,$t2
  1028. ldp $a4,$a5,[$np,#8*4]
  1029. // Reduce by 512 bits per iteration
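// Each pass computes na0 = t[0]*n0 mod 2^64 and adds na0*n[0..7] to
// the current window, which zeroes the bottom limb; after eight rounds
// the window has effectively been divided by 2^512. The na0 values are
// put aside at $tp and replayed by .Lsqr8x_tail over the remaining
// limbs of n[].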
  1030. mul $na0,$n0,$acc0 // t[0]*n0
  1031. ldp $a6,$a7,[$np,#8*6]
  1032. add $np_end,$np,$num
  1033. ldp $acc2,$acc3,[sp,#8*2]
  1034. stp $acc4,$acc5,[$tp,#8*4]
  1035. ldp $acc4,$acc5,[sp,#8*4]
  1036. stp $acc6,$acc7,[$tp,#8*6]
  1037. ldp $acc6,$acc7,[sp,#8*6]
  1038. add $np,$np,#8*8
  1039. mov $topmost,xzr // initial top-most carry
  1040. mov $tp,sp
  1041. mov $cnt,#8
  1042. .Lsqr8x_reduction:
  1043. // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
  1044. mul $t1,$a1,$na0
  1045. sub $cnt,$cnt,#1
  1046. mul $t2,$a2,$na0
  1047. str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
  1048. mul $t3,$a3,$na0
  1049. // (*) adds xzr,$acc0,$t0
  1050. subs xzr,$acc0,#1 // (*)
  1051. mul $t0,$a4,$na0
  1052. adcs $acc0,$acc1,$t1
  1053. mul $t1,$a5,$na0
  1054. adcs $acc1,$acc2,$t2
  1055. mul $t2,$a6,$na0
  1056. adcs $acc2,$acc3,$t3
  1057. mul $t3,$a7,$na0
  1058. adcs $acc3,$acc4,$t0
  1059. umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
  1060. adcs $acc4,$acc5,$t1
  1061. umulh $t1,$a1,$na0
  1062. adcs $acc5,$acc6,$t2
  1063. umulh $t2,$a2,$na0
  1064. adcs $acc6,$acc7,$t3
  1065. umulh $t3,$a3,$na0
  1066. adc $acc7,xzr,xzr
  1067. adds $acc0,$acc0,$t0
  1068. umulh $t0,$a4,$na0
  1069. adcs $acc1,$acc1,$t1
  1070. umulh $t1,$a5,$na0
  1071. adcs $acc2,$acc2,$t2
  1072. umulh $t2,$a6,$na0
  1073. adcs $acc3,$acc3,$t3
  1074. umulh $t3,$a7,$na0
  1075. mul $na0,$n0,$acc0 // next t[0]*n0
  1076. adcs $acc4,$acc4,$t0
  1077. adcs $acc5,$acc5,$t1
  1078. adcs $acc6,$acc6,$t2
  1079. adc $acc7,$acc7,$t3
  1080. cbnz $cnt,.Lsqr8x_reduction
  1081. ldp $t0,$t1,[$tp,#8*0]
  1082. ldp $t2,$t3,[$tp,#8*2]
  1083. mov $rp,$tp
  1084. sub $cnt,$np_end,$np // done yet?
  1085. adds $acc0,$acc0,$t0
  1086. adcs $acc1,$acc1,$t1
  1087. ldp $t0,$t1,[$tp,#8*4]
  1088. adcs $acc2,$acc2,$t2
  1089. adcs $acc3,$acc3,$t3
  1090. ldp $t2,$t3,[$tp,#8*6]
  1091. adcs $acc4,$acc4,$t0
  1092. adcs $acc5,$acc5,$t1
  1093. adcs $acc6,$acc6,$t2
  1094. adcs $acc7,$acc7,$t3
  1095. //adc $carry,xzr,xzr // moved below
  1096. cbz $cnt,.Lsqr8x8_post_condition
  1097. ldur $n0,[$tp,#-8*8]
  1098. ldp $a0,$a1,[$np,#8*0]
  1099. ldp $a2,$a3,[$np,#8*2]
  1100. ldp $a4,$a5,[$np,#8*4]
  1101. mov $cnt,#-8*8
  1102. ldp $a6,$a7,[$np,#8*6]
  1103. add $np,$np,#8*8
  1104. .Lsqr8x_tail:
  1105. mul $t0,$a0,$n0
  1106. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  1107. mul $t1,$a1,$n0
  1108. add $cnt,$cnt,#8
  1109. mul $t2,$a2,$n0
  1110. mul $t3,$a3,$n0
  1111. adds $acc0,$acc0,$t0
  1112. mul $t0,$a4,$n0
  1113. adcs $acc1,$acc1,$t1
  1114. mul $t1,$a5,$n0
  1115. adcs $acc2,$acc2,$t2
  1116. mul $t2,$a6,$n0
  1117. adcs $acc3,$acc3,$t3
  1118. mul $t3,$a7,$n0
  1119. adcs $acc4,$acc4,$t0
  1120. umulh $t0,$a0,$n0
  1121. adcs $acc5,$acc5,$t1
  1122. umulh $t1,$a1,$n0
  1123. adcs $acc6,$acc6,$t2
  1124. umulh $t2,$a2,$n0
  1125. adcs $acc7,$acc7,$t3
  1126. umulh $t3,$a3,$n0
  1127. adc $carry,$carry,xzr
  1128. str $acc0,[$tp],#8
  1129. adds $acc0,$acc1,$t0
  1130. umulh $t0,$a4,$n0
  1131. adcs $acc1,$acc2,$t1
  1132. umulh $t1,$a5,$n0
  1133. adcs $acc2,$acc3,$t2
  1134. umulh $t2,$a6,$n0
  1135. adcs $acc3,$acc4,$t3
  1136. umulh $t3,$a7,$n0
  1137. ldr $n0,[$rp,$cnt]
  1138. adcs $acc4,$acc5,$t0
  1139. adcs $acc5,$acc6,$t1
  1140. adcs $acc6,$acc7,$t2
  1141. adcs $acc7,$carry,$t3
  1142. //adc $carry,xzr,xzr // moved above
  1143. cbnz $cnt,.Lsqr8x_tail
  1144. // note that carry flag is guaranteed
  1145. // to be zero at this point
  1146. ldp $a0,$a1,[$tp,#8*0]
  1147. sub $cnt,$np_end,$np // done yet?
  1148. sub $t2,$np_end,$num // rewound np
  1149. ldp $a2,$a3,[$tp,#8*2]
  1150. ldp $a4,$a5,[$tp,#8*4]
  1151. ldp $a6,$a7,[$tp,#8*6]
  1152. cbz $cnt,.Lsqr8x_tail_break
  1153. ldur $n0,[$rp,#-8*8]
  1154. adds $acc0,$acc0,$a0
  1155. adcs $acc1,$acc1,$a1
  1156. ldp $a0,$a1,[$np,#8*0]
  1157. adcs $acc2,$acc2,$a2
  1158. adcs $acc3,$acc3,$a3
  1159. ldp $a2,$a3,[$np,#8*2]
  1160. adcs $acc4,$acc4,$a4
  1161. adcs $acc5,$acc5,$a5
  1162. ldp $a4,$a5,[$np,#8*4]
  1163. adcs $acc6,$acc6,$a6
  1164. mov $cnt,#-8*8
  1165. adcs $acc7,$acc7,$a7
  1166. ldp $a6,$a7,[$np,#8*6]
  1167. add $np,$np,#8*8
  1168. //adc $carry,xzr,xzr // moved above
  1169. b .Lsqr8x_tail
  1170. .align 4
  1171. .Lsqr8x_tail_break:
  1172. ldr $n0,[x29,#112] // pull n0
  1173. add $cnt,$tp,#8*8 // end of current t[num] window
  1174. subs xzr,$topmost,#1 // "move" top-most carry to carry bit
  1175. adcs $t0,$acc0,$a0
  1176. adcs $t1,$acc1,$a1
  1177. ldp $acc0,$acc1,[$rp,#8*0]
  1178. adcs $acc2,$acc2,$a2
  1179. ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
  1180. adcs $acc3,$acc3,$a3
  1181. ldp $a2,$a3,[$t2,#8*2]
  1182. adcs $acc4,$acc4,$a4
  1183. adcs $acc5,$acc5,$a5
  1184. ldp $a4,$a5,[$t2,#8*4]
  1185. adcs $acc6,$acc6,$a6
  1186. adcs $acc7,$acc7,$a7
  1187. ldp $a6,$a7,[$t2,#8*6]
  1188. add $np,$t2,#8*8
  1189. adc $topmost,xzr,xzr // top-most carry
  1190. mul $na0,$n0,$acc0
  1191. stp $t0,$t1,[$tp,#8*0]
  1192. stp $acc2,$acc3,[$tp,#8*2]
  1193. ldp $acc2,$acc3,[$rp,#8*2]
  1194. stp $acc4,$acc5,[$tp,#8*4]
  1195. ldp $acc4,$acc5,[$rp,#8*4]
  1196. cmp $cnt,x29 // did we hit the bottom?
  1197. stp $acc6,$acc7,[$tp,#8*6]
  1198. mov $tp,$rp // slide the window
  1199. ldp $acc6,$acc7,[$rp,#8*6]
  1200. mov $cnt,#8
  1201. b.ne .Lsqr8x_reduction
  1202. // Final step. We check whether the result is larger than the modulus
  1203. // and, if it is, subtract the modulus. But comparison implies
  1204. // subtraction. So we subtract the modulus, see if it borrowed,
  1205. // and conditionally copy the original value.
  1206. ldr $rp,[x29,#96] // pull rp
  1207. add $tp,$tp,#8*8
  1208. subs $t0,$acc0,$a0
  1209. sbcs $t1,$acc1,$a1
  1210. sub $cnt,$num,#8*8
  1211. mov $ap_end,$rp // $rp copy
  1212. .Lsqr8x_sub:
  1213. sbcs $t2,$acc2,$a2
  1214. ldp $a0,$a1,[$np,#8*0]
  1215. sbcs $t3,$acc3,$a3
  1216. stp $t0,$t1,[$rp,#8*0]
  1217. sbcs $t0,$acc4,$a4
  1218. ldp $a2,$a3,[$np,#8*2]
  1219. sbcs $t1,$acc5,$a5
  1220. stp $t2,$t3,[$rp,#8*2]
  1221. sbcs $t2,$acc6,$a6
  1222. ldp $a4,$a5,[$np,#8*4]
  1223. sbcs $t3,$acc7,$a7
  1224. ldp $a6,$a7,[$np,#8*6]
  1225. add $np,$np,#8*8
  1226. ldp $acc0,$acc1,[$tp,#8*0]
  1227. sub $cnt,$cnt,#8*8
  1228. ldp $acc2,$acc3,[$tp,#8*2]
  1229. ldp $acc4,$acc5,[$tp,#8*4]
  1230. ldp $acc6,$acc7,[$tp,#8*6]
  1231. add $tp,$tp,#8*8
  1232. stp $t0,$t1,[$rp,#8*4]
  1233. sbcs $t0,$acc0,$a0
  1234. stp $t2,$t3,[$rp,#8*6]
  1235. add $rp,$rp,#8*8
  1236. sbcs $t1,$acc1,$a1
  1237. cbnz $cnt,.Lsqr8x_sub
  1238. sbcs $t2,$acc2,$a2
  1239. mov $tp,sp
  1240. add $ap,sp,$num
  1241. ldp $a0,$a1,[$ap_end,#8*0]
  1242. sbcs $t3,$acc3,$a3
  1243. stp $t0,$t1,[$rp,#8*0]
  1244. sbcs $t0,$acc4,$a4
  1245. ldp $a2,$a3,[$ap_end,#8*2]
  1246. sbcs $t1,$acc5,$a5
  1247. stp $t2,$t3,[$rp,#8*2]
  1248. sbcs $t2,$acc6,$a6
  1249. ldp $acc0,$acc1,[$ap,#8*0]
  1250. sbcs $t3,$acc7,$a7
  1251. ldp $acc2,$acc3,[$ap,#8*2]
  1252. sbcs xzr,$topmost,xzr // did it borrow?
  1253. ldr x30,[x29,#8] // pull return address
  1254. stp $t0,$t1,[$rp,#8*4]
  1255. stp $t2,$t3,[$rp,#8*6]
  1256. sub $cnt,$num,#8*4
  1257. .Lsqr4x_cond_copy:
  1258. sub $cnt,$cnt,#8*4
  1259. csel $t0,$acc0,$a0,lo
  1260. stp xzr,xzr,[$tp,#8*0]
  1261. csel $t1,$acc1,$a1,lo
  1262. ldp $a0,$a1,[$ap_end,#8*4]
  1263. ldp $acc0,$acc1,[$ap,#8*4]
  1264. csel $t2,$acc2,$a2,lo
  1265. stp xzr,xzr,[$tp,#8*2]
  1266. add $tp,$tp,#8*4
  1267. csel $t3,$acc3,$a3,lo
  1268. ldp $a2,$a3,[$ap_end,#8*6]
  1269. ldp $acc2,$acc3,[$ap,#8*6]
  1270. add $ap,$ap,#8*4
  1271. stp $t0,$t1,[$ap_end,#8*0]
  1272. stp $t2,$t3,[$ap_end,#8*2]
  1273. add $ap_end,$ap_end,#8*4
  1274. stp xzr,xzr,[$ap,#8*0]
  1275. stp xzr,xzr,[$ap,#8*2]
  1276. cbnz $cnt,.Lsqr4x_cond_copy
  1277. csel $t0,$acc0,$a0,lo
  1278. stp xzr,xzr,[$tp,#8*0]
  1279. csel $t1,$acc1,$a1,lo
  1280. stp xzr,xzr,[$tp,#8*2]
  1281. csel $t2,$acc2,$a2,lo
  1282. csel $t3,$acc3,$a3,lo
  1283. stp $t0,$t1,[$ap_end,#8*0]
  1284. stp $t2,$t3,[$ap_end,#8*2]
  1285. b .Lsqr8x_done
  1286. .align 4
  1287. .Lsqr8x8_post_condition:
  1288. adc $carry,xzr,xzr
  1289. ldr x30,[x29,#8] // pull return address
  1290. // $acc0-7,$carry hold result, $a0-7 hold modulus
  1291. subs $a0,$acc0,$a0
  1292. ldr $ap,[x29,#96] // pull rp
  1293. sbcs $a1,$acc1,$a1
  1294. stp xzr,xzr,[sp,#8*0]
  1295. sbcs $a2,$acc2,$a2
  1296. stp xzr,xzr,[sp,#8*2]
  1297. sbcs $a3,$acc3,$a3
  1298. stp xzr,xzr,[sp,#8*4]
  1299. sbcs $a4,$acc4,$a4
  1300. stp xzr,xzr,[sp,#8*6]
  1301. sbcs $a5,$acc5,$a5
  1302. stp xzr,xzr,[sp,#8*8]
  1303. sbcs $a6,$acc6,$a6
  1304. stp xzr,xzr,[sp,#8*10]
  1305. sbcs $a7,$acc7,$a7
  1306. stp xzr,xzr,[sp,#8*12]
  1307. sbcs $carry,$carry,xzr // did it borrow?
  1308. stp xzr,xzr,[sp,#8*14]
  1309. // $a0-7 hold result-modulus
  1310. csel $a0,$acc0,$a0,lo
  1311. csel $a1,$acc1,$a1,lo
  1312. csel $a2,$acc2,$a2,lo
  1313. csel $a3,$acc3,$a3,lo
  1314. stp $a0,$a1,[$ap,#8*0]
  1315. csel $a4,$acc4,$a4,lo
  1316. csel $a5,$acc5,$a5,lo
  1317. stp $a2,$a3,[$ap,#8*2]
  1318. csel $a6,$acc6,$a6,lo
  1319. csel $a7,$acc7,$a7,lo
  1320. stp $a4,$a5,[$ap,#8*4]
  1321. stp $a6,$a7,[$ap,#8*6]
  1322. .Lsqr8x_done:
  1323. ldp x19,x20,[x29,#16]
  1324. mov sp,x29
  1325. ldp x21,x22,[x29,#32]
  1326. mov x0,#1
  1327. ldp x23,x24,[x29,#48]
  1328. ldp x25,x26,[x29,#64]
  1329. ldp x27,x28,[x29,#80]
  1330. ldr x29,[sp],#128
  1331. // x30 is loaded earlier
  1332. AARCH64_VALIDATE_LINK_REGISTER
  1333. ret
  1334. .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
  1335. ___
  1336. }
  1337. {
  1338. ########################################################################
  1339. # Even though this might look like an ARMv8 adaptation of mulx4x_mont from
  1340. # the x86_64-mont5 module, it's different in the sense that it performs
  1341. # reduction 256 bits at a time.
  1342. my ($a0,$a1,$a2,$a3,
  1343. $t0,$t1,$t2,$t3,
  1344. $m0,$m1,$m2,$m3,
  1345. $acc0,$acc1,$acc2,$acc3,$acc4,
  1346. $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
  1347. my $bp_end=$rp;
  1348. my ($carry,$topmost) = ($rp,"x30");
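# The b[] words are consumed four at a time: .Loop_mul4x_1st_reduction and
# .Loop_mul4x_reduction interleave a[0..3]*b[i] with Montgomery reduction
# by four limbs (256 bits), stashing each t[0]*n0 on the stack, while the
# *_tail loops replay those factors across the remaining limbs of a[] and
# n[]. .Lmul4x_post and .Lmul4x_cond_copy finish with the conditional
# subtraction of the modulus.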
  1349. $code.=<<___;
  1350. .type __bn_mul4x_mont,%function
  1351. .align 5
  1352. __bn_mul4x_mont:
  1353. // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
  1354. // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
  1355. stp x29,x30,[sp,#-128]!
  1356. add x29,sp,#0
  1357. stp x19,x20,[sp,#16]
  1358. stp x21,x22,[sp,#32]
  1359. stp x23,x24,[sp,#48]
  1360. stp x25,x26,[sp,#64]
  1361. stp x27,x28,[sp,#80]
  1362. sub $tp,sp,$num,lsl#3
  1363. lsl $num,$num,#3
  1364. ldr $n0,[$n0] // *n0
  1365. sub sp,$tp,#8*4 // alloca
  1366. add $t0,$bp,$num
  1367. add $ap_end,$ap,$num
  1368. stp $rp,$t0,[x29,#96] // offload rp and &b[num]
  1369. ldr $bi,[$bp,#8*0] // b[0]
  1370. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1371. ldp $a2,$a3,[$ap,#8*2]
  1372. add $ap,$ap,#8*4
  1373. mov $acc0,xzr
  1374. mov $acc1,xzr
  1375. mov $acc2,xzr
  1376. mov $acc3,xzr
  1377. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1378. ldp $m2,$m3,[$np,#8*2]
  1379. adds $np,$np,#8*4 // clear carry bit
  1380. mov $carry,xzr
  1381. mov $cnt,#0
  1382. mov $tp,sp
  1383. .Loop_mul4x_1st_reduction:
  1384. mul $t0,$a0,$bi // lo(a[0..3]*b[0])
  1385. adc $carry,$carry,xzr // modulo-scheduled
  1386. mul $t1,$a1,$bi
  1387. add $cnt,$cnt,#8
  1388. mul $t2,$a2,$bi
  1389. and $cnt,$cnt,#31
  1390. mul $t3,$a3,$bi
  1391. adds $acc0,$acc0,$t0
  1392. umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
  1393. adcs $acc1,$acc1,$t1
  1394. mul $mi,$acc0,$n0 // t[0]*n0
  1395. adcs $acc2,$acc2,$t2
  1396. umulh $t1,$a1,$bi
  1397. adcs $acc3,$acc3,$t3
  1398. umulh $t2,$a2,$bi
  1399. adc $acc4,xzr,xzr
  1400. umulh $t3,$a3,$bi
  1401. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1402. adds $acc1,$acc1,$t0
  1403. // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
  1404. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1405. adcs $acc2,$acc2,$t1
  1406. mul $t1,$m1,$mi
  1407. adcs $acc3,$acc3,$t2
  1408. mul $t2,$m2,$mi
  1409. adc $acc4,$acc4,$t3 // can't overflow
  1410. mul $t3,$m3,$mi
  1411. // (*) adds xzr,$acc0,$t0
  1412. subs xzr,$acc0,#1 // (*)
  1413. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
  1414. adcs $acc0,$acc1,$t1
  1415. umulh $t1,$m1,$mi
  1416. adcs $acc1,$acc2,$t2
  1417. umulh $t2,$m2,$mi
  1418. adcs $acc2,$acc3,$t3
  1419. umulh $t3,$m3,$mi
  1420. adcs $acc3,$acc4,$carry
  1421. adc $carry,xzr,xzr
  1422. adds $acc0,$acc0,$t0
  1423. sub $t0,$ap_end,$ap
  1424. adcs $acc1,$acc1,$t1
  1425. adcs $acc2,$acc2,$t2
  1426. adcs $acc3,$acc3,$t3
  1427. //adc $carry,$carry,xzr
  1428. cbnz $cnt,.Loop_mul4x_1st_reduction
  1429. cbz $t0,.Lmul4x4_post_condition
  1430. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1431. ldp $a2,$a3,[$ap,#8*2]
  1432. add $ap,$ap,#8*4
  1433. ldr $mi,[sp] // a[0]*n0
  1434. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1435. ldp $m2,$m3,[$np,#8*2]
  1436. add $np,$np,#8*4
  1437. .Loop_mul4x_1st_tail:
  1438. mul $t0,$a0,$bi // lo(a[4..7]*b[i])
  1439. adc $carry,$carry,xzr // modulo-scheduled
  1440. mul $t1,$a1,$bi
  1441. add $cnt,$cnt,#8
  1442. mul $t2,$a2,$bi
  1443. and $cnt,$cnt,#31
  1444. mul $t3,$a3,$bi
  1445. adds $acc0,$acc0,$t0
  1446. umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
  1447. adcs $acc1,$acc1,$t1
  1448. umulh $t1,$a1,$bi
  1449. adcs $acc2,$acc2,$t2
  1450. umulh $t2,$a2,$bi
  1451. adcs $acc3,$acc3,$t3
  1452. umulh $t3,$a3,$bi
  1453. adc $acc4,xzr,xzr
  1454. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1455. adds $acc1,$acc1,$t0
  1456. mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
  1457. adcs $acc2,$acc2,$t1
  1458. mul $t1,$m1,$mi
  1459. adcs $acc3,$acc3,$t2
  1460. mul $t2,$m2,$mi
  1461. adc $acc4,$acc4,$t3 // can't overflow
  1462. mul $t3,$m3,$mi
  1463. adds $acc0,$acc0,$t0
  1464. umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
  1465. adcs $acc1,$acc1,$t1
  1466. umulh $t1,$m1,$mi
  1467. adcs $acc2,$acc2,$t2
  1468. umulh $t2,$m2,$mi
  1469. adcs $acc3,$acc3,$t3
  1470. adcs $acc4,$acc4,$carry
  1471. umulh $t3,$m3,$mi
  1472. adc $carry,xzr,xzr
  1473. ldr $mi,[sp,$cnt] // next t[0]*n0
  1474. str $acc0,[$tp],#8 // result!!!
  1475. adds $acc0,$acc1,$t0
  1476. sub $t0,$ap_end,$ap // done yet?
  1477. adcs $acc1,$acc2,$t1
  1478. adcs $acc2,$acc3,$t2
  1479. adcs $acc3,$acc4,$t3
  1480. //adc $carry,$carry,xzr
  1481. cbnz $cnt,.Loop_mul4x_1st_tail
  1482. sub $t1,$ap_end,$num // rewound $ap
  1483. cbz $t0,.Lmul4x_proceed
  1484. ldp $a0,$a1,[$ap,#8*0]
  1485. ldp $a2,$a3,[$ap,#8*2]
  1486. add $ap,$ap,#8*4
  1487. ldp $m0,$m1,[$np,#8*0]
  1488. ldp $m2,$m3,[$np,#8*2]
  1489. add $np,$np,#8*4
  1490. b .Loop_mul4x_1st_tail
  1491. .align 5
  1492. .Lmul4x_proceed:
  1493. ldr $bi,[$bp,#8*4]! // *++b
  1494. adc $topmost,$carry,xzr
  1495. ldp $a0,$a1,[$t1,#8*0] // a[0..3]
  1496. sub $np,$np,$num // rewind np
  1497. ldp $a2,$a3,[$t1,#8*2]
  1498. add $ap,$t1,#8*4
  1499. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1500. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1501. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1502. ldp $acc2,$acc3,[sp,#8*6]
  1503. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1504. mov $tp,sp
  1505. ldp $m2,$m3,[$np,#8*2]
  1506. adds $np,$np,#8*4 // clear carry bit
  1507. mov $carry,xzr
  1508. .align 4
  1509. .Loop_mul4x_reduction:
  1510. mul $t0,$a0,$bi // lo(a[0..3]*b[4])
  1511. adc $carry,$carry,xzr // modulo-scheduled
  1512. mul $t1,$a1,$bi
  1513. add $cnt,$cnt,#8
  1514. mul $t2,$a2,$bi
  1515. and $cnt,$cnt,#31
  1516. mul $t3,$a3,$bi
  1517. adds $acc0,$acc0,$t0
  1518. umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
  1519. adcs $acc1,$acc1,$t1
  1520. mul $mi,$acc0,$n0 // t[0]*n0
  1521. adcs $acc2,$acc2,$t2
  1522. umulh $t1,$a1,$bi
  1523. adcs $acc3,$acc3,$t3
  1524. umulh $t2,$a2,$bi
  1525. adc $acc4,xzr,xzr
  1526. umulh $t3,$a3,$bi
  1527. ldr $bi,[$bp,$cnt] // next b[i]
  1528. adds $acc1,$acc1,$t0
  1529. // (*) mul $t0,$m0,$mi
  1530. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1531. adcs $acc2,$acc2,$t1
  1532. mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
  1533. adcs $acc3,$acc3,$t2
  1534. mul $t2,$m2,$mi
  1535. adc $acc4,$acc4,$t3 // can't overflow
  1536. mul $t3,$m3,$mi
  1537. // (*) adds xzr,$acc0,$t0
  1538. subs xzr,$acc0,#1 // (*)
  1539. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
  1540. adcs $acc0,$acc1,$t1
  1541. umulh $t1,$m1,$mi
  1542. adcs $acc1,$acc2,$t2
  1543. umulh $t2,$m2,$mi
  1544. adcs $acc2,$acc3,$t3
  1545. umulh $t3,$m3,$mi
  1546. adcs $acc3,$acc4,$carry
  1547. adc $carry,xzr,xzr
  1548. adds $acc0,$acc0,$t0
  1549. adcs $acc1,$acc1,$t1
  1550. adcs $acc2,$acc2,$t2
  1551. adcs $acc3,$acc3,$t3
  1552. //adc $carry,$carry,xzr
  1553. cbnz $cnt,.Loop_mul4x_reduction
  1554. adc $carry,$carry,xzr
  1555. ldp $t0,$t1,[$tp,#8*4] // t[4..7]
  1556. ldp $t2,$t3,[$tp,#8*6]
  1557. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1558. ldp $a2,$a3,[$ap,#8*2]
  1559. add $ap,$ap,#8*4
  1560. adds $acc0,$acc0,$t0
  1561. adcs $acc1,$acc1,$t1
  1562. adcs $acc2,$acc2,$t2
  1563. adcs $acc3,$acc3,$t3
  1564. //adc $carry,$carry,xzr
  1565. ldr $mi,[sp] // t[0]*n0
  1566. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1567. ldp $m2,$m3,[$np,#8*2]
  1568. add $np,$np,#8*4
  1569. .align 4
  1570. .Loop_mul4x_tail:
  1571. mul $t0,$a0,$bi // lo(a[4..7]*b[4])
  1572. adc $carry,$carry,xzr // modulo-scheduled
  1573. mul $t1,$a1,$bi
  1574. add $cnt,$cnt,#8
  1575. mul $t2,$a2,$bi
  1576. and $cnt,$cnt,#31
  1577. mul $t3,$a3,$bi
  1578. adds $acc0,$acc0,$t0
  1579. umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
  1580. adcs $acc1,$acc1,$t1
  1581. umulh $t1,$a1,$bi
  1582. adcs $acc2,$acc2,$t2
  1583. umulh $t2,$a2,$bi
  1584. adcs $acc3,$acc3,$t3
  1585. umulh $t3,$a3,$bi
  1586. adc $acc4,xzr,xzr
  1587. ldr $bi,[$bp,$cnt] // next b[i]
  1588. adds $acc1,$acc1,$t0
  1589. mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
  1590. adcs $acc2,$acc2,$t1
  1591. mul $t1,$m1,$mi
  1592. adcs $acc3,$acc3,$t2
  1593. mul $t2,$m2,$mi
  1594. adc $acc4,$acc4,$t3 // can't overflow
  1595. mul $t3,$m3,$mi
  1596. adds $acc0,$acc0,$t0
  1597. umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
  1598. adcs $acc1,$acc1,$t1
  1599. umulh $t1,$m1,$mi
  1600. adcs $acc2,$acc2,$t2
  1601. umulh $t2,$m2,$mi
  1602. adcs $acc3,$acc3,$t3
  1603. umulh $t3,$m3,$mi
  1604. adcs $acc4,$acc4,$carry
  1605. ldr $mi,[sp,$cnt] // next a[0]*n0
  1606. adc $carry,xzr,xzr
  1607. str $acc0,[$tp],#8 // result!!!
  1608. adds $acc0,$acc1,$t0
  1609. sub $t0,$ap_end,$ap // done yet?
  1610. adcs $acc1,$acc2,$t1
  1611. adcs $acc2,$acc3,$t2
  1612. adcs $acc3,$acc4,$t3
  1613. //adc $carry,$carry,xzr
  1614. cbnz $cnt,.Loop_mul4x_tail
  1615. sub $t1,$np,$num // rewound np?
  1616. adc $carry,$carry,xzr
  1617. cbz $t0,.Loop_mul4x_break
  1618. ldp $t0,$t1,[$tp,#8*4]
  1619. ldp $t2,$t3,[$tp,#8*6]
  1620. ldp $a0,$a1,[$ap,#8*0]
  1621. ldp $a2,$a3,[$ap,#8*2]
  1622. add $ap,$ap,#8*4
  1623. adds $acc0,$acc0,$t0
  1624. adcs $acc1,$acc1,$t1
  1625. adcs $acc2,$acc2,$t2
  1626. adcs $acc3,$acc3,$t3
  1627. //adc $carry,$carry,xzr
  1628. ldp $m0,$m1,[$np,#8*0]
  1629. ldp $m2,$m3,[$np,#8*2]
  1630. add $np,$np,#8*4
  1631. b .Loop_mul4x_tail
  1632. .align 4
  1633. .Loop_mul4x_break:
  1634. ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
  1635. adds $acc0,$acc0,$topmost
  1636. add $bp,$bp,#8*4 // bp++
  1637. adcs $acc1,$acc1,xzr
  1638. sub $ap,$ap,$num // rewind ap
  1639. adcs $acc2,$acc2,xzr
  1640. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1641. adcs $acc3,$acc3,xzr
  1642. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1643. adc $topmost,$carry,xzr
  1644. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1645. cmp $bp,$t3 // done yet?
  1646. ldp $acc2,$acc3,[sp,#8*6]
  1647. ldp $m0,$m1,[$t1,#8*0] // n[0..3]
  1648. ldp $m2,$m3,[$t1,#8*2]
  1649. add $np,$t1,#8*4
  1650. b.eq .Lmul4x_post
  1651. ldr $bi,[$bp]
  1652. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1653. ldp $a2,$a3,[$ap,#8*2]
  1654. adds $ap,$ap,#8*4 // clear carry bit
  1655. mov $carry,xzr
  1656. mov $tp,sp
  1657. b .Loop_mul4x_reduction
  1658. .align 4
  1659. .Lmul4x_post:
  1660. // Final step. We check whether the result is larger than the modulus
  1661. // and, if it is, subtract the modulus. But comparison implies
  1662. // subtraction. So we subtract the modulus, see if it borrowed,
  1663. // and conditionally copy the original value.
  1664. mov $rp,$t2
  1665. mov $ap_end,$t2 // $rp copy
  1666. subs $t0,$acc0,$m0
  1667. add $tp,sp,#8*8
  1668. sbcs $t1,$acc1,$m1
  1669. sub $cnt,$num,#8*4
  1670. .Lmul4x_sub:
  1671. sbcs $t2,$acc2,$m2
  1672. ldp $m0,$m1,[$np,#8*0]
  1673. sub $cnt,$cnt,#8*4
  1674. ldp $acc0,$acc1,[$tp,#8*0]
  1675. sbcs $t3,$acc3,$m3
  1676. ldp $m2,$m3,[$np,#8*2]
  1677. add $np,$np,#8*4
  1678. ldp $acc2,$acc3,[$tp,#8*2]
  1679. add $tp,$tp,#8*4
  1680. stp $t0,$t1,[$rp,#8*0]
  1681. sbcs $t0,$acc0,$m0
  1682. stp $t2,$t3,[$rp,#8*2]
  1683. add $rp,$rp,#8*4
  1684. sbcs $t1,$acc1,$m1
  1685. cbnz $cnt,.Lmul4x_sub
  1686. sbcs $t2,$acc2,$m2
  1687. mov $tp,sp
  1688. add $ap,sp,#8*4
  1689. ldp $a0,$a1,[$ap_end,#8*0]
  1690. sbcs $t3,$acc3,$m3
  1691. stp $t0,$t1,[$rp,#8*0]
  1692. ldp $a2,$a3,[$ap_end,#8*2]
  1693. stp $t2,$t3,[$rp,#8*2]
  1694. ldp $acc0,$acc1,[$ap,#8*0]
  1695. ldp $acc2,$acc3,[$ap,#8*2]
  1696. sbcs xzr,$topmost,xzr // did it borrow?
  1697. ldr x30,[x29,#8] // pull return address
  1698. sub $cnt,$num,#8*4
  1699. .Lmul4x_cond_copy:
  1700. sub $cnt,$cnt,#8*4
  1701. csel $t0,$acc0,$a0,lo
  1702. stp xzr,xzr,[$tp,#8*0]
  1703. csel $t1,$acc1,$a1,lo
  1704. ldp $a0,$a1,[$ap_end,#8*4]
  1705. ldp $acc0,$acc1,[$ap,#8*4]
  1706. csel $t2,$acc2,$a2,lo
  1707. stp xzr,xzr,[$tp,#8*2]
  1708. add $tp,$tp,#8*4
  1709. csel $t3,$acc3,$a3,lo
  1710. ldp $a2,$a3,[$ap_end,#8*6]
  1711. ldp $acc2,$acc3,[$ap,#8*6]
  1712. add $ap,$ap,#8*4
  1713. stp $t0,$t1,[$ap_end,#8*0]
  1714. stp $t2,$t3,[$ap_end,#8*2]
  1715. add $ap_end,$ap_end,#8*4
  1716. cbnz $cnt,.Lmul4x_cond_copy
  1717. csel $t0,$acc0,$a0,lo
  1718. stp xzr,xzr,[$tp,#8*0]
  1719. csel $t1,$acc1,$a1,lo
  1720. stp xzr,xzr,[$tp,#8*2]
  1721. csel $t2,$acc2,$a2,lo
  1722. stp xzr,xzr,[$tp,#8*3]
  1723. csel $t3,$acc3,$a3,lo
  1724. stp xzr,xzr,[$tp,#8*4]
  1725. stp $t0,$t1,[$ap_end,#8*0]
  1726. stp $t2,$t3,[$ap_end,#8*2]
  1727. b .Lmul4x_done
  1728. .align 4
  1729. .Lmul4x4_post_condition:
  1730. adc $carry,$carry,xzr
  1731. ldr $ap,[x29,#96] // pull rp
  1732. // $acc0-3,$carry hold result, $m0-7 hold modulus
  1733. subs $a0,$acc0,$m0
  1734. ldr x30,[x29,#8] // pull return address
  1735. sbcs $a1,$acc1,$m1
  1736. stp xzr,xzr,[sp,#8*0]
  1737. sbcs $a2,$acc2,$m2
  1738. stp xzr,xzr,[sp,#8*2]
  1739. sbcs $a3,$acc3,$m3
  1740. stp xzr,xzr,[sp,#8*4]
  1741. sbcs xzr,$carry,xzr // did it borrow?
  1742. stp xzr,xzr,[sp,#8*6]
  1743. // $a0-3 hold result-modulus
  1744. csel $a0,$acc0,$a0,lo
  1745. csel $a1,$acc1,$a1,lo
  1746. csel $a2,$acc2,$a2,lo
  1747. csel $a3,$acc3,$a3,lo
  1748. stp $a0,$a1,[$ap,#8*0]
  1749. stp $a2,$a3,[$ap,#8*2]
  1750. .Lmul4x_done:
  1751. ldp x19,x20,[x29,#16]
  1752. mov sp,x29
  1753. ldp x21,x22,[x29,#32]
  1754. mov x0,#1
  1755. ldp x23,x24,[x29,#48]
  1756. ldp x25,x26,[x29,#64]
  1757. ldp x27,x28,[x29,#80]
  1758. ldr x29,[sp],#128
  1759. // x30 loaded earlier
  1760. AARCH64_VALIDATE_LINK_REGISTER
  1761. ret
  1762. .size __bn_mul4x_mont,.-__bn_mul4x_mont
  1763. ___
  1764. }
  1765. $code.=<<___;
  1766. .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  1767. .align 4
  1768. ___
  1769. print $code;
  1770. close STDOUT or die "error closing STDOUT: $!";