#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# Well, actually all contemporary AArch64 processors seem to have a
# limited multiplication issue rate, i.e. they can't issue a
# multiplication every cycle, which explains the moderate improvement
# coefficients in comparison to compiler-generated code. Recall that
# the compiler is instructed to use umulh and therefore uses the same
# number of multiplication instructions to do the job. Assembly's edge
# is minimizing the number of "collateral" instructions and, of
# course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while others exhibit
# e.g. a 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25%
# faster on Cortex-A57 and ~60-100% faster on others.
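#
# For orientation, here is a minimal C-style sketch of the word-serial
# Montgomery multiplication that bn_mul_mont below implements. It is
# illustrative only (not part of the generated code): the variable
# names are ad hoc, the top-word carry handling is elided, and the two
# j-loops are fused in the actual assembly.
#
#	// n0 = -1/np[0] mod 2^64, tp[] is scratch of num+1 words
#	for (i = 0; i < num; i++) {
#		for (j = 0, c = 0; j < num; j++)	// tp += ap*bp[i]
#			(c,tp[j]) = ap[j]*bp[i] + tp[j] + c;
#		tp[num] += c;
#		m1 = tp[0]*n0;	// so tp[0]+np[0]*m1 == 0 (mod 2^64)
#		for (j = 0, c = 0; j < num; j++)	// tp += np*m1
#			(c,tp[j]) = np[j]*m1 + tp[j] + c;
#		tp[num] += c;
#		tp++;		// drop the zeroed limb: divide by 2^64
#	}
#	// subtract np once if the result is still >= np, store to rp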
$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca
	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't. The
	//	question then is: when does it carry? Is there an
	//	alternative way to deduce it? If you follow the operations,
	//	you can observe that the condition for carry is quite
	//	simple: $lo0 being non-zero. So the carry can be calculated
	//	by adding -1 to $lo0. That's what the next instruction does.
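	//	To spell the arithmetic out: $lo1 = lo(np[0]*m1) is
	//	congruent to -$lo0 modulo 2^64, so the discarded sum
	//	$lo1+$lo0 is either 0 (when $lo0 == 0, no carry out) or
	//	exactly 2^64 (when $lo0 != 0, carry out). "subs xzr,$lo0,#1"
	//	sets the AArch64 carry flag precisely when $lo0 != 0,
	//	reproducing that carry without the multiplication.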
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
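	// In C-like terms, the tail below computes (illustrative only):
	//	borrow = ((ovf:tp) - np) < 0;	// subtraction done anyway
	//	rp[j]  = borrow ? tp[j] : tp[j]-np[j];	// csel on carry
	// and wipes tp[] with zeros as it is consumed, so no partial
	// result is left on the stack.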
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)
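	// That is 8*7/2 = 28 distinct cross products for the 8-word
	// window. The doubling (each a[i]a[j], i!=j, appears twice in
	// the square) and the a[i]*a[i] diagonal are both deferred to
	// .Lsqr4x_shift_n_add below.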
	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])	(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])	(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1
	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])	(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1
	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])	(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3
	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])	(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3
	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])	(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])	(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1
	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply the above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
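	// "extr Xd,Xn,Xm,#63" extracts bits 126:63 of the concatenation
	// Xn:Xm, i.e. (Xn<<1)|(Xm>>63), so the extr chain below shifts
	// the whole vector of cross products left by one bit, one limb
	// at a time, while the mul/umulh pairs supply the a[i]*a[i]
	// diagonal words that are then added in.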
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}
{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it differs in the sense that it
# performs reduction 256 bits at a time.
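#
# Roughly speaking: a 4-limb window of a[] and n[] is kept in
# registers, four t[0]*n0 reduction factors are retired per pass of
# the reduction loop, and the *_tail loops then carry those factors
# through the remaining limbs, so each pass disposes of a full 256-bit
# chunk of the intermediate result.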
my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT;