2
0

armv8-mont.pl 36 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # March 2015
  15. #
  16. # "Teaser" Montgomery multiplication module for ARMv8. Needs more
  17. # work. While it does improve RSA sign performance by 20-30% (less for
  18. # longer keys) on most processors, for some reason RSA2048 is not
  19. # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
  20. # instruction issue rate is limited on processor in question, meaning
  21. # that dedicated squaring procedure is a must. Well, actually all
  22. # contemporary AArch64 processors seem to have limited multiplication
  23. # issue rate, i.e. they can't issue multiplication every cycle, which
  24. # explains moderate improvement coefficients in comparison to
  25. # compiler-generated code. Recall that compiler is instructed to use
  26. # umulh and therefore uses same amount of multiplication instructions
  27. # to do the job. Assembly's edge is to minimize number of "collateral"
  28. # instructions and of course instruction scheduling.
  29. #
  30. # April 2015
  31. #
  32. # Squaring procedure that handles lengths divisible by 8 improves
  33. # RSA/DSA performance by 25-40-60% depending on processor and key
  34. # length. Overall improvement coefficients are always positive in
  35. # comparison to compiler-generated code. On Cortex-A57 improvement
  36. # is still modest on longest key lengths, while others exhibit e.g.
  37. # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
  38. # on Cortex-A57 and ~60-100% faster on others.
  39. # $output is the last argument if it looks like a file (it has an extension)
  40. # $flavour is the first argument if it doesn't look like a file
  # Command-line handling: the output file (if any) is popped from the end
  # of @ARGV and the flavour (if any) is shifted from the front; either may
  # end up undef.
  41. my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  42. my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # $dir becomes the directory part of this script's path (with trailing
  # separator), taken from $0; it is used to find arm-xlate.pl either next
  # to this script or under ../../perlasm/.
  43. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  44. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  45. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  46. die "can't locate arm-xlate.pl";
  # Pipe everything printed to STDOUT through arm-xlate.pl, run with the
  # same perl interpreter ($^X). On failure report the OS error ($!); the
  # previous message interpolated $1, which at this point still holds the
  # directory captured by the match above rather than the reason for failure.
  47. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  48. or die "can't call $xlate: $!";
  49. *STDOUT=*OUT;
  # Register names used by the generic (.Lmul_mont) code path. The map
  # yields x6..x17 followed by x19..x24 (x18 is skipped), i.e. 18 names for
  # the 17 variables below, so the last generated name (x24) is not bound to
  # any variable here.
  # NOTE(review): x18 is presumably avoided because some ABIs reserve it as
  # a platform register - confirm.
  50. ($lo0,$hi0,$aj,$m0,$alo,$ahi,
  51. $lo1,$hi1,$nj,$m1,$nlo,$nhi,
  52. $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
  # Argument registers of bn_mul_mont, x0..x5 in declaration order. The
  # trailing comments on the lines below spell out the C prototype. The
  # emitted code replaces $n0 with the value it points to (ldr $n0,[$n0])
  # and converts $num from words to bytes (lsl $num,$num,#3).
  53. # int bn_mul_mont(
  54. $rp="x0"; # BN_ULONG *rp,
  55. $ap="x1"; # const BN_ULONG *ap,
  56. $bp="x2"; # const BN_ULONG *bp,
  57. $np="x3"; # const BN_ULONG *np,
  58. $n0="x4"; # const BN_ULONG *n0,
  59. $num="x5"; # int num);
  # Emit bn_mul_mont, the public entry point.
  #
  # Dispatch: if num is a multiple of 8 control goes to __bn_sqr8x_mont, else
  # if num is a multiple of 4 it goes to __bn_mul4x_mont; any other length
  # falls through to the generic word-by-word code at .Lmul_mont.
  #
  # Generic path layout:
  #   - a 64-byte frame saves x29/x30 and x19..x24;
  #   - a temporary vector tp[] of num*8 bytes is carved out below sp and the
  #     new sp is rounded down to a 16-byte boundary;
  #   - .L1st handles bp[0], .Louter/.Linner handle the remaining bp[i];
  #   - .Lsub subtracts the modulus into rp[], and .Lcond_copy then selects
  #     between tp[] and rp[] with csel depending on the borrow, zeroing tp[]
  #     as it goes;
  #   - x0 is set to 1 before returning.
  60. $code.=<<___;
  61. .text
  62. .globl bn_mul_mont
  63. .type bn_mul_mont,%function
  64. .align 5
  65. bn_mul_mont:
  66. tst $num,#7
  67. b.eq __bn_sqr8x_mont
  68. tst $num,#3
  69. b.eq __bn_mul4x_mont
  70. .Lmul_mont:
  71. stp x29,x30,[sp,#-64]!
  72. add x29,sp,#0
  73. stp x19,x20,[sp,#16]
  74. stp x21,x22,[sp,#32]
  75. stp x23,x24,[sp,#48]
  76. ldr $m0,[$bp],#8 // bp[0]
  77. sub $tp,sp,$num,lsl#3
  78. ldp $hi0,$aj,[$ap],#16 // ap[0..1]
  79. lsl $num,$num,#3
  80. ldr $n0,[$n0] // *n0
  81. and $tp,$tp,#-16 // ABI says so
  82. ldp $hi1,$nj,[$np],#16 // np[0..1]
  83. mul $lo0,$hi0,$m0 // ap[0]*bp[0]
  84. sub $j,$num,#16 // j=num-2
  85. umulh $hi0,$hi0,$m0
  86. mul $alo,$aj,$m0 // ap[1]*bp[0]
  87. umulh $ahi,$aj,$m0
  88. mul $m1,$lo0,$n0 // "tp[0]"*n0
  89. mov sp,$tp // alloca
  90. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  91. umulh $hi1,$hi1,$m1
  92. mul $nlo,$nj,$m1 // np[1]*m1
  93. // (*) adds $lo1,$lo1,$lo0 // discarded
  94. // (*) As for removal of first multiplication and addition
  95. // instructions. The outcome of first addition is
  96. // guaranteed to be zero, which leaves two computationally
  97. // significant outcomes: it either carries or not. Then
  98. // question is when does it carry? Is there alternative
  99. // way to deduce it? If you follow operations, you can
  100. // observe that condition for carry is quite simple:
  101. // $lo0 being non-zero. So that carry can be calculated
  102. // by adding -1 to $lo0. That's what next instruction does.
  103. subs xzr,$lo0,#1 // (*)
  104. umulh $nhi,$nj,$m1
  105. adc $hi1,$hi1,xzr
  106. cbz $j,.L1st_skip
  107. .L1st:
  108. ldr $aj,[$ap],#8
  109. adds $lo0,$alo,$hi0
  110. sub $j,$j,#8 // j--
  111. adc $hi0,$ahi,xzr
  112. ldr $nj,[$np],#8
  113. adds $lo1,$nlo,$hi1
  114. mul $alo,$aj,$m0 // ap[j]*bp[0]
  115. adc $hi1,$nhi,xzr
  116. umulh $ahi,$aj,$m0
  117. adds $lo1,$lo1,$lo0
  118. mul $nlo,$nj,$m1 // np[j]*m1
  119. adc $hi1,$hi1,xzr
  120. umulh $nhi,$nj,$m1
  121. str $lo1,[$tp],#8 // tp[j-1]
  122. cbnz $j,.L1st
  123. .L1st_skip:
  124. adds $lo0,$alo,$hi0
  125. sub $ap,$ap,$num // rewind $ap
  126. adc $hi0,$ahi,xzr
  127. adds $lo1,$nlo,$hi1
  128. sub $np,$np,$num // rewind $np
  129. adc $hi1,$nhi,xzr
  130. adds $lo1,$lo1,$lo0
  131. sub $i,$num,#8 // i=num-1
  132. adcs $hi1,$hi1,$hi0
  133. adc $ovf,xzr,xzr // upmost overflow bit
  134. stp $lo1,$hi1,[$tp]
  135. .Louter:
  136. ldr $m0,[$bp],#8 // bp[i]
  137. ldp $hi0,$aj,[$ap],#16
  138. ldr $tj,[sp] // tp[0]
  139. add $tp,sp,#8
  140. mul $lo0,$hi0,$m0 // ap[0]*bp[i]
  141. sub $j,$num,#16 // j=num-2
  142. umulh $hi0,$hi0,$m0
  143. ldp $hi1,$nj,[$np],#16
  144. mul $alo,$aj,$m0 // ap[1]*bp[i]
  145. adds $lo0,$lo0,$tj
  146. umulh $ahi,$aj,$m0
  147. adc $hi0,$hi0,xzr
  148. mul $m1,$lo0,$n0
  149. sub $i,$i,#8 // i--
  150. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  151. umulh $hi1,$hi1,$m1
  152. mul $nlo,$nj,$m1 // np[1]*m1
  153. // (*) adds $lo1,$lo1,$lo0
  154. subs xzr,$lo0,#1 // (*)
  155. umulh $nhi,$nj,$m1
  156. cbz $j,.Linner_skip
  157. .Linner:
  158. ldr $aj,[$ap],#8
  159. adc $hi1,$hi1,xzr
  160. ldr $tj,[$tp],#8 // tp[j]
  161. adds $lo0,$alo,$hi0
  162. sub $j,$j,#8 // j--
  163. adc $hi0,$ahi,xzr
  164. adds $lo1,$nlo,$hi1
  165. ldr $nj,[$np],#8
  166. adc $hi1,$nhi,xzr
  167. mul $alo,$aj,$m0 // ap[j]*bp[i]
  168. adds $lo0,$lo0,$tj
  169. umulh $ahi,$aj,$m0
  170. adc $hi0,$hi0,xzr
  171. mul $nlo,$nj,$m1 // np[j]*m1
  172. adds $lo1,$lo1,$lo0
  173. umulh $nhi,$nj,$m1
  174. stur $lo1,[$tp,#-16] // tp[j-1]
  175. cbnz $j,.Linner
  176. .Linner_skip:
  177. ldr $tj,[$tp],#8 // tp[j]
  178. adc $hi1,$hi1,xzr
  179. adds $lo0,$alo,$hi0
  180. sub $ap,$ap,$num // rewind $ap
  181. adc $hi0,$ahi,xzr
  182. adds $lo1,$nlo,$hi1
  183. sub $np,$np,$num // rewind $np
  184. adcs $hi1,$nhi,$ovf
  185. adc $ovf,xzr,xzr
  186. adds $lo0,$lo0,$tj
  187. adc $hi0,$hi0,xzr
  188. adds $lo1,$lo1,$lo0
  189. adcs $hi1,$hi1,$hi0
  190. adc $ovf,$ovf,xzr // upmost overflow bit
  191. stp $lo1,$hi1,[$tp,#-16]
  192. cbnz $i,.Louter
  193. // Final step. We see if result is larger than modulus, and
  194. // if it is, subtract the modulus. But comparison implies
  195. // subtraction. So we subtract modulus, see if it borrowed,
  196. // and conditionally copy original value.
  197. ldr $tj,[sp] // tp[0]
  198. add $tp,sp,#8
  199. ldr $nj,[$np],#8 // np[0]
  200. subs $j,$num,#8 // j=num-1 and clear borrow
  201. mov $ap,$rp
  202. .Lsub:
  203. sbcs $aj,$tj,$nj // tp[j]-np[j]
  204. ldr $tj,[$tp],#8
  205. sub $j,$j,#8 // j--
  206. ldr $nj,[$np],#8
  207. str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
  208. cbnz $j,.Lsub
  209. sbcs $aj,$tj,$nj
  210. sbcs $ovf,$ovf,xzr // did it borrow?
  211. str $aj,[$ap],#8 // rp[num-1]
  212. ldr $tj,[sp] // tp[0]
  213. add $tp,sp,#8
  214. ldr $aj,[$rp],#8 // rp[0]
  215. sub $num,$num,#8 // num--
  216. nop
  217. .Lcond_copy:
  218. sub $num,$num,#8 // num--
  219. csel $nj,$tj,$aj,lo // did it borrow?
  220. ldr $tj,[$tp],#8
  221. ldr $aj,[$rp],#8
  222. stur xzr,[$tp,#-16] // wipe tp
  223. stur $nj,[$rp,#-16]
  224. cbnz $num,.Lcond_copy
  225. csel $nj,$tj,$aj,lo
  226. stur xzr,[$tp,#-8] // wipe tp
  227. stur $nj,[$rp,#-8]
  228. ldp x19,x20,[x29,#16]
  229. mov sp,x29
  230. ldp x21,x22,[x29,#32]
  231. mov x0,#1
  232. ldp x23,x24,[x29,#48]
  233. ldr x29,[sp],#64
  234. ret
  235. .size bn_mul_mont,.-bn_mul_mont
  236. ___
  237. {
  238. ########################################################################
  239. # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
  #
  # This bare block emits __bn_sqr8x_mont. The routine first compares ap
  # with bp and branches to __bn_mul4x_mont when they differ, so the code
  # from .Lsqr8x_mont onwards only runs for ap == bp (a squaring).
  #
  # Register allocation (lexical to this block):
  #   $a0..$a7     x6..x13    eight input / modulus words
  #   $t0..$t3     x14..x17   temporaries
  #   $acc0..$acc7 x19..x26   accumulator window
  #   $cnt x27, $carry x28, $topmost x30
  # Aliases onto registers named outside this block:
  #   $tp = $bp (x2), $ap_end = outer $np (x3), $na0 = $carry (x28).
  #
  # Frame (128 bytes below the caller's sp, addressed via x29):
  #   [x29,#8]   return address; reloaded into x30 before returning because
  #              x30 doubles as $topmost
  #   [x29,#96]  rp      [x29,#104] np      [x29,#112] n0
  # The scratch vector is allocated with "sub $tp,sp,$num,lsl#4", i.e. 16
  # bytes per input word, and is zeroed by the .Lsqr8x_zero loop.
  #
  # After rp and n0 have been offloaded to the frame, $rp (x0) and $n0 (x4)
  # are reused as scratch: "mov $n0,$a0" makes $n0 the current multiplier
  # word and "mov $rp,$ap" makes $rp a pointer into a[].
  #
  # NOTE(review): in the first heredoc the asm comment on the first
  # "adds $acc2,$acc2,$t3" reads "t[2]+lo(a[1]*a[0])", but $t3 was produced
  # by "umulh $t3,$a1,$a0", i.e. it holds the high half of that product.
  240. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
  241. my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
  242. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
  243. my ($cnt,$carry,$topmost)=("x27","x28","x30");
  244. my ($tp,$ap_end,$na0)=($bp,$np,$carry);
  # First heredoc: prologue, zeroing of the scratch vector, the off-diagonal
  # products ("Multiply everything but a[i]*a[i]"), and the loop part of the
  # doubling pass that shifts the result left by one bit and adds the
  # squares a[i]*a[i] (.Lsqr4x_shift_n_add).
  245. $code.=<<___;
  246. .type __bn_sqr8x_mont,%function
  247. .align 5
  248. __bn_sqr8x_mont:
  249. cmp $ap,$bp
  250. b.ne __bn_mul4x_mont
  251. .Lsqr8x_mont:
  252. .inst 0xd503233f // paciasp
  253. stp x29,x30,[sp,#-128]!
  254. add x29,sp,#0
  255. stp x19,x20,[sp,#16]
  256. stp x21,x22,[sp,#32]
  257. stp x23,x24,[sp,#48]
  258. stp x25,x26,[sp,#64]
  259. stp x27,x28,[sp,#80]
  260. stp $rp,$np,[sp,#96] // offload rp and np
  261. ldp $a0,$a1,[$ap,#8*0]
  262. ldp $a2,$a3,[$ap,#8*2]
  263. ldp $a4,$a5,[$ap,#8*4]
  264. ldp $a6,$a7,[$ap,#8*6]
  265. sub $tp,sp,$num,lsl#4
  266. lsl $num,$num,#3
  267. ldr $n0,[$n0] // *n0
  268. mov sp,$tp // alloca
  269. sub $cnt,$num,#8*8
  270. b .Lsqr8x_zero_start
  271. .Lsqr8x_zero:
  272. sub $cnt,$cnt,#8*8
  273. stp xzr,xzr,[$tp,#8*0]
  274. stp xzr,xzr,[$tp,#8*2]
  275. stp xzr,xzr,[$tp,#8*4]
  276. stp xzr,xzr,[$tp,#8*6]
  277. .Lsqr8x_zero_start:
  278. stp xzr,xzr,[$tp,#8*8]
  279. stp xzr,xzr,[$tp,#8*10]
  280. stp xzr,xzr,[$tp,#8*12]
  281. stp xzr,xzr,[$tp,#8*14]
  282. add $tp,$tp,#8*16
  283. cbnz $cnt,.Lsqr8x_zero
  284. add $ap_end,$ap,$num
  285. add $ap,$ap,#8*8
  286. mov $acc0,xzr
  287. mov $acc1,xzr
  288. mov $acc2,xzr
  289. mov $acc3,xzr
  290. mov $acc4,xzr
  291. mov $acc5,xzr
  292. mov $acc6,xzr
  293. mov $acc7,xzr
  294. mov $tp,sp
  295. str $n0,[x29,#112] // offload n0
  296. // Multiply everything but a[i]*a[i]
  297. .align 4
  298. .Lsqr8x_outer_loop:
  299. // a[1]a[0] (i)
  300. // a[2]a[0]
  301. // a[3]a[0]
  302. // a[4]a[0]
  303. // a[5]a[0]
  304. // a[6]a[0]
  305. // a[7]a[0]
  306. // a[2]a[1] (ii)
  307. // a[3]a[1]
  308. // a[4]a[1]
  309. // a[5]a[1]
  310. // a[6]a[1]
  311. // a[7]a[1]
  312. // a[3]a[2] (iii)
  313. // a[4]a[2]
  314. // a[5]a[2]
  315. // a[6]a[2]
  316. // a[7]a[2]
  317. // a[4]a[3] (iv)
  318. // a[5]a[3]
  319. // a[6]a[3]
  320. // a[7]a[3]
  321. // a[5]a[4] (v)
  322. // a[6]a[4]
  323. // a[7]a[4]
  324. // a[6]a[5] (vi)
  325. // a[7]a[5]
  326. // a[7]a[6] (vii)
  327. mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
  328. mul $t1,$a2,$a0
  329. mul $t2,$a3,$a0
  330. mul $t3,$a4,$a0
  331. adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
  332. mul $t0,$a5,$a0
  333. adcs $acc2,$acc2,$t1
  334. mul $t1,$a6,$a0
  335. adcs $acc3,$acc3,$t2
  336. mul $t2,$a7,$a0
  337. adcs $acc4,$acc4,$t3
  338. umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
  339. adcs $acc5,$acc5,$t0
  340. umulh $t0,$a2,$a0
  341. adcs $acc6,$acc6,$t1
  342. umulh $t1,$a3,$a0
  343. adcs $acc7,$acc7,$t2
  344. umulh $t2,$a4,$a0
  345. stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
  346. adc $acc0,xzr,xzr // t[8]
  347. adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
  348. umulh $t3,$a5,$a0
  349. adcs $acc3,$acc3,$t0
  350. umulh $t0,$a6,$a0
  351. adcs $acc4,$acc4,$t1
  352. umulh $t1,$a7,$a0
  353. adcs $acc5,$acc5,$t2
  354. mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
  355. adcs $acc6,$acc6,$t3
  356. mul $t3,$a3,$a1
  357. adcs $acc7,$acc7,$t0
  358. mul $t0,$a4,$a1
  359. adc $acc0,$acc0,$t1
  360. mul $t1,$a5,$a1
  361. adds $acc3,$acc3,$t2
  362. mul $t2,$a6,$a1
  363. adcs $acc4,$acc4,$t3
  364. mul $t3,$a7,$a1
  365. adcs $acc5,$acc5,$t0
  366. umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
  367. adcs $acc6,$acc6,$t1
  368. umulh $t1,$a3,$a1
  369. adcs $acc7,$acc7,$t2
  370. umulh $t2,$a4,$a1
  371. adcs $acc0,$acc0,$t3
  372. umulh $t3,$a5,$a1
  373. stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
  374. adc $acc1,xzr,xzr // t[9]
  375. adds $acc4,$acc4,$t0
  376. umulh $t0,$a6,$a1
  377. adcs $acc5,$acc5,$t1
  378. umulh $t1,$a7,$a1
  379. adcs $acc6,$acc6,$t2
  380. mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
  381. adcs $acc7,$acc7,$t3
  382. mul $t3,$a4,$a2
  383. adcs $acc0,$acc0,$t0
  384. mul $t0,$a5,$a2
  385. adc $acc1,$acc1,$t1
  386. mul $t1,$a6,$a2
  387. adds $acc5,$acc5,$t2
  388. mul $t2,$a7,$a2
  389. adcs $acc6,$acc6,$t3
  390. umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
  391. adcs $acc7,$acc7,$t0
  392. umulh $t0,$a4,$a2
  393. adcs $acc0,$acc0,$t1
  394. umulh $t1,$a5,$a2
  395. adcs $acc1,$acc1,$t2
  396. umulh $t2,$a6,$a2
  397. stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
  398. adc $acc2,xzr,xzr // t[10]
  399. adds $acc6,$acc6,$t3
  400. umulh $t3,$a7,$a2
  401. adcs $acc7,$acc7,$t0
  402. mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
  403. adcs $acc0,$acc0,$t1
  404. mul $t1,$a5,$a3
  405. adcs $acc1,$acc1,$t2
  406. mul $t2,$a6,$a3
  407. adc $acc2,$acc2,$t3
  408. mul $t3,$a7,$a3
  409. adds $acc7,$acc7,$t0
  410. umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
  411. adcs $acc0,$acc0,$t1
  412. umulh $t1,$a5,$a3
  413. adcs $acc1,$acc1,$t2
  414. umulh $t2,$a6,$a3
  415. adcs $acc2,$acc2,$t3
  416. umulh $t3,$a7,$a3
  417. stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
  418. adc $acc3,xzr,xzr // t[11]
  419. adds $acc0,$acc0,$t0
  420. mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
  421. adcs $acc1,$acc1,$t1
  422. mul $t1,$a6,$a4
  423. adcs $acc2,$acc2,$t2
  424. mul $t2,$a7,$a4
  425. adc $acc3,$acc3,$t3
  426. umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
  427. adds $acc1,$acc1,$t0
  428. umulh $t0,$a6,$a4
  429. adcs $acc2,$acc2,$t1
  430. umulh $t1,$a7,$a4
  431. adcs $acc3,$acc3,$t2
  432. mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
  433. adc $acc4,xzr,xzr // t[12]
  434. adds $acc2,$acc2,$t3
  435. mul $t3,$a7,$a5
  436. adcs $acc3,$acc3,$t0
  437. umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
  438. adc $acc4,$acc4,$t1
  439. umulh $t1,$a7,$a5
  440. adds $acc3,$acc3,$t2
  441. mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
  442. adcs $acc4,$acc4,$t3
  443. umulh $t3,$a7,$a6 // hi(a[7]*a[6])
  444. adc $acc5,xzr,xzr // t[13]
  445. adds $acc4,$acc4,$t0
  446. sub $cnt,$ap_end,$ap // done yet?
  447. adc $acc5,$acc5,$t1
  448. adds $acc5,$acc5,$t2
  449. sub $t0,$ap_end,$num // rewinded ap
  450. adc $acc6,xzr,xzr // t[14]
  451. add $acc6,$acc6,$t3
  452. cbz $cnt,.Lsqr8x_outer_break
  453. mov $n0,$a0
  454. ldp $a0,$a1,[$tp,#8*0]
  455. ldp $a2,$a3,[$tp,#8*2]
  456. ldp $a4,$a5,[$tp,#8*4]
  457. ldp $a6,$a7,[$tp,#8*6]
  458. adds $acc0,$acc0,$a0
  459. adcs $acc1,$acc1,$a1
  460. ldp $a0,$a1,[$ap,#8*0]
  461. adcs $acc2,$acc2,$a2
  462. adcs $acc3,$acc3,$a3
  463. ldp $a2,$a3,[$ap,#8*2]
  464. adcs $acc4,$acc4,$a4
  465. adcs $acc5,$acc5,$a5
  466. ldp $a4,$a5,[$ap,#8*4]
  467. adcs $acc6,$acc6,$a6
  468. mov $rp,$ap
  469. adcs $acc7,xzr,$a7
  470. ldp $a6,$a7,[$ap,#8*6]
  471. add $ap,$ap,#8*8
  472. //adc $carry,xzr,xzr // moved below
  473. mov $cnt,#-8*8
  474. // a[8]a[0]
  475. // a[9]a[0]
  476. // a[a]a[0]
  477. // a[b]a[0]
  478. // a[c]a[0]
  479. // a[d]a[0]
  480. // a[e]a[0]
  481. // a[f]a[0]
  482. // a[8]a[1]
  483. // a[f]a[1]........................
  484. // a[8]a[2]
  485. // a[f]a[2]........................
  486. // a[8]a[3]
  487. // a[f]a[3]........................
  488. // a[8]a[4]
  489. // a[f]a[4]........................
  490. // a[8]a[5]
  491. // a[f]a[5]........................
  492. // a[8]a[6]
  493. // a[f]a[6]........................
  494. // a[8]a[7]
  495. // a[f]a[7]........................
  496. .Lsqr8x_mul:
  497. mul $t0,$a0,$n0
  498. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  499. mul $t1,$a1,$n0
  500. add $cnt,$cnt,#8
  501. mul $t2,$a2,$n0
  502. mul $t3,$a3,$n0
  503. adds $acc0,$acc0,$t0
  504. mul $t0,$a4,$n0
  505. adcs $acc1,$acc1,$t1
  506. mul $t1,$a5,$n0
  507. adcs $acc2,$acc2,$t2
  508. mul $t2,$a6,$n0
  509. adcs $acc3,$acc3,$t3
  510. mul $t3,$a7,$n0
  511. adcs $acc4,$acc4,$t0
  512. umulh $t0,$a0,$n0
  513. adcs $acc5,$acc5,$t1
  514. umulh $t1,$a1,$n0
  515. adcs $acc6,$acc6,$t2
  516. umulh $t2,$a2,$n0
  517. adcs $acc7,$acc7,$t3
  518. umulh $t3,$a3,$n0
  519. adc $carry,$carry,xzr
  520. str $acc0,[$tp],#8
  521. adds $acc0,$acc1,$t0
  522. umulh $t0,$a4,$n0
  523. adcs $acc1,$acc2,$t1
  524. umulh $t1,$a5,$n0
  525. adcs $acc2,$acc3,$t2
  526. umulh $t2,$a6,$n0
  527. adcs $acc3,$acc4,$t3
  528. umulh $t3,$a7,$n0
  529. ldr $n0,[$rp,$cnt]
  530. adcs $acc4,$acc5,$t0
  531. adcs $acc5,$acc6,$t1
  532. adcs $acc6,$acc7,$t2
  533. adcs $acc7,$carry,$t3
  534. //adc $carry,xzr,xzr // moved above
  535. cbnz $cnt,.Lsqr8x_mul
  536. // note that carry flag is guaranteed
  537. // to be zero at this point
  538. cmp $ap,$ap_end // done yet?
  539. b.eq .Lsqr8x_break
  540. ldp $a0,$a1,[$tp,#8*0]
  541. ldp $a2,$a3,[$tp,#8*2]
  542. ldp $a4,$a5,[$tp,#8*4]
  543. ldp $a6,$a7,[$tp,#8*6]
  544. adds $acc0,$acc0,$a0
  545. ldur $n0,[$rp,#-8*8]
  546. adcs $acc1,$acc1,$a1
  547. ldp $a0,$a1,[$ap,#8*0]
  548. adcs $acc2,$acc2,$a2
  549. adcs $acc3,$acc3,$a3
  550. ldp $a2,$a3,[$ap,#8*2]
  551. adcs $acc4,$acc4,$a4
  552. adcs $acc5,$acc5,$a5
  553. ldp $a4,$a5,[$ap,#8*4]
  554. adcs $acc6,$acc6,$a6
  555. mov $cnt,#-8*8
  556. adcs $acc7,$acc7,$a7
  557. ldp $a6,$a7,[$ap,#8*6]
  558. add $ap,$ap,#8*8
  559. //adc $carry,xzr,xzr // moved above
  560. b .Lsqr8x_mul
  561. .align 4
  562. .Lsqr8x_break:
  563. ldp $a0,$a1,[$rp,#8*0]
  564. add $ap,$rp,#8*8
  565. ldp $a2,$a3,[$rp,#8*2]
  566. sub $t0,$ap_end,$ap // is it last iteration?
  567. ldp $a4,$a5,[$rp,#8*4]
  568. sub $t1,$tp,$t0
  569. ldp $a6,$a7,[$rp,#8*6]
  570. cbz $t0,.Lsqr8x_outer_loop
  571. stp $acc0,$acc1,[$tp,#8*0]
  572. ldp $acc0,$acc1,[$t1,#8*0]
  573. stp $acc2,$acc3,[$tp,#8*2]
  574. ldp $acc2,$acc3,[$t1,#8*2]
  575. stp $acc4,$acc5,[$tp,#8*4]
  576. ldp $acc4,$acc5,[$t1,#8*4]
  577. stp $acc6,$acc7,[$tp,#8*6]
  578. mov $tp,$t1
  579. ldp $acc6,$acc7,[$t1,#8*6]
  580. b .Lsqr8x_outer_loop
  581. .align 4
  582. .Lsqr8x_outer_break:
  583. // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
  584. ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
  585. ldp $t1,$t2,[sp,#8*1]
  586. ldp $a5,$a7,[$t0,#8*2]
  587. add $ap,$t0,#8*4
  588. ldp $t3,$t0,[sp,#8*3]
  589. stp $acc0,$acc1,[$tp,#8*0]
  590. mul $acc0,$a1,$a1
  591. stp $acc2,$acc3,[$tp,#8*2]
  592. umulh $a1,$a1,$a1
  593. stp $acc4,$acc5,[$tp,#8*4]
  594. mul $a2,$a3,$a3
  595. stp $acc6,$acc7,[$tp,#8*6]
  596. mov $tp,sp
  597. umulh $a3,$a3,$a3
  598. adds $acc1,$a1,$t1,lsl#1
  599. extr $t1,$t2,$t1,#63
  600. sub $cnt,$num,#8*4
  601. .Lsqr4x_shift_n_add:
  602. adcs $acc2,$a2,$t1
  603. extr $t2,$t3,$t2,#63
  604. sub $cnt,$cnt,#8*4
  605. adcs $acc3,$a3,$t2
  606. ldp $t1,$t2,[$tp,#8*5]
  607. mul $a4,$a5,$a5
  608. ldp $a1,$a3,[$ap],#8*2
  609. umulh $a5,$a5,$a5
  610. mul $a6,$a7,$a7
  611. umulh $a7,$a7,$a7
  612. extr $t3,$t0,$t3,#63
  613. stp $acc0,$acc1,[$tp,#8*0]
  614. adcs $acc4,$a4,$t3
  615. extr $t0,$t1,$t0,#63
  616. stp $acc2,$acc3,[$tp,#8*2]
  617. adcs $acc5,$a5,$t0
  618. ldp $t3,$t0,[$tp,#8*7]
  619. extr $t1,$t2,$t1,#63
  620. adcs $acc6,$a6,$t1
  621. extr $t2,$t3,$t2,#63
  622. adcs $acc7,$a7,$t2
  623. ldp $t1,$t2,[$tp,#8*9]
  624. mul $a0,$a1,$a1
  625. ldp $a5,$a7,[$ap],#8*2
  626. umulh $a1,$a1,$a1
  627. mul $a2,$a3,$a3
  628. umulh $a3,$a3,$a3
  629. stp $acc4,$acc5,[$tp,#8*4]
  630. extr $t3,$t0,$t3,#63
  631. stp $acc6,$acc7,[$tp,#8*6]
  632. add $tp,$tp,#8*8
  633. adcs $acc0,$a0,$t3
  634. extr $t0,$t1,$t0,#63
  635. adcs $acc1,$a1,$t0
  636. ldp $t3,$t0,[$tp,#8*3]
  637. extr $t1,$t2,$t1,#63
  638. cbnz $cnt,.Lsqr4x_shift_n_add
  639. ___
  # From here on the input pointers are no longer needed, so their registers
  # are renamed for the reduction: $np now names the former $ap register (x1)
  # and $np_end the former $ap_end register (x3). The original np and n0
  # values are reloaded from the frame at the top of the next heredoc.
  640. my ($np,$np_end)=($ap,$ap_end);
  # Second heredoc: tail of the doubling pass, the reduction
  # ("Reduce by 512 bits per iteration"), the final conditional subtraction
  # of the modulus, wiping of the scratch vector, and the epilogue that sets
  # x0 to 1 and returns.
  641. $code.=<<___;
  642. ldp $np,$n0,[x29,#104] // pull np and n0
  643. adcs $acc2,$a2,$t1
  644. extr $t2,$t3,$t2,#63
  645. adcs $acc3,$a3,$t2
  646. ldp $t1,$t2,[$tp,#8*5]
  647. mul $a4,$a5,$a5
  648. umulh $a5,$a5,$a5
  649. stp $acc0,$acc1,[$tp,#8*0]
  650. mul $a6,$a7,$a7
  651. umulh $a7,$a7,$a7
  652. stp $acc2,$acc3,[$tp,#8*2]
  653. extr $t3,$t0,$t3,#63
  654. adcs $acc4,$a4,$t3
  655. extr $t0,$t1,$t0,#63
  656. ldp $acc0,$acc1,[sp,#8*0]
  657. adcs $acc5,$a5,$t0
  658. extr $t1,$t2,$t1,#63
  659. ldp $a0,$a1,[$np,#8*0]
  660. adcs $acc6,$a6,$t1
  661. extr $t2,xzr,$t2,#63
  662. ldp $a2,$a3,[$np,#8*2]
  663. adc $acc7,$a7,$t2
  664. ldp $a4,$a5,[$np,#8*4]
  665. // Reduce by 512 bits per iteration
  666. mul $na0,$n0,$acc0 // t[0]*n0
  667. ldp $a6,$a7,[$np,#8*6]
  668. add $np_end,$np,$num
  669. ldp $acc2,$acc3,[sp,#8*2]
  670. stp $acc4,$acc5,[$tp,#8*4]
  671. ldp $acc4,$acc5,[sp,#8*4]
  672. stp $acc6,$acc7,[$tp,#8*6]
  673. ldp $acc6,$acc7,[sp,#8*6]
  674. add $np,$np,#8*8
  675. mov $topmost,xzr // initial top-most carry
  676. mov $tp,sp
  677. mov $cnt,#8
  678. .Lsqr8x_reduction:
  679. // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
  680. mul $t1,$a1,$na0
  681. sub $cnt,$cnt,#1
  682. mul $t2,$a2,$na0
  683. str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
  684. mul $t3,$a3,$na0
  685. // (*) adds xzr,$acc0,$t0
  686. subs xzr,$acc0,#1 // (*)
  687. mul $t0,$a4,$na0
  688. adcs $acc0,$acc1,$t1
  689. mul $t1,$a5,$na0
  690. adcs $acc1,$acc2,$t2
  691. mul $t2,$a6,$na0
  692. adcs $acc2,$acc3,$t3
  693. mul $t3,$a7,$na0
  694. adcs $acc3,$acc4,$t0
  695. umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
  696. adcs $acc4,$acc5,$t1
  697. umulh $t1,$a1,$na0
  698. adcs $acc5,$acc6,$t2
  699. umulh $t2,$a2,$na0
  700. adcs $acc6,$acc7,$t3
  701. umulh $t3,$a3,$na0
  702. adc $acc7,xzr,xzr
  703. adds $acc0,$acc0,$t0
  704. umulh $t0,$a4,$na0
  705. adcs $acc1,$acc1,$t1
  706. umulh $t1,$a5,$na0
  707. adcs $acc2,$acc2,$t2
  708. umulh $t2,$a6,$na0
  709. adcs $acc3,$acc3,$t3
  710. umulh $t3,$a7,$na0
  711. mul $na0,$n0,$acc0 // next t[0]*n0
  712. adcs $acc4,$acc4,$t0
  713. adcs $acc5,$acc5,$t1
  714. adcs $acc6,$acc6,$t2
  715. adc $acc7,$acc7,$t3
  716. cbnz $cnt,.Lsqr8x_reduction
  717. ldp $t0,$t1,[$tp,#8*0]
  718. ldp $t2,$t3,[$tp,#8*2]
  719. mov $rp,$tp
  720. sub $cnt,$np_end,$np // done yet?
  721. adds $acc0,$acc0,$t0
  722. adcs $acc1,$acc1,$t1
  723. ldp $t0,$t1,[$tp,#8*4]
  724. adcs $acc2,$acc2,$t2
  725. adcs $acc3,$acc3,$t3
  726. ldp $t2,$t3,[$tp,#8*6]
  727. adcs $acc4,$acc4,$t0
  728. adcs $acc5,$acc5,$t1
  729. adcs $acc6,$acc6,$t2
  730. adcs $acc7,$acc7,$t3
  731. //adc $carry,xzr,xzr // moved below
  732. cbz $cnt,.Lsqr8x8_post_condition
  733. ldur $n0,[$tp,#-8*8]
  734. ldp $a0,$a1,[$np,#8*0]
  735. ldp $a2,$a3,[$np,#8*2]
  736. ldp $a4,$a5,[$np,#8*4]
  737. mov $cnt,#-8*8
  738. ldp $a6,$a7,[$np,#8*6]
  739. add $np,$np,#8*8
  740. .Lsqr8x_tail:
  741. mul $t0,$a0,$n0
  742. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  743. mul $t1,$a1,$n0
  744. add $cnt,$cnt,#8
  745. mul $t2,$a2,$n0
  746. mul $t3,$a3,$n0
  747. adds $acc0,$acc0,$t0
  748. mul $t0,$a4,$n0
  749. adcs $acc1,$acc1,$t1
  750. mul $t1,$a5,$n0
  751. adcs $acc2,$acc2,$t2
  752. mul $t2,$a6,$n0
  753. adcs $acc3,$acc3,$t3
  754. mul $t3,$a7,$n0
  755. adcs $acc4,$acc4,$t0
  756. umulh $t0,$a0,$n0
  757. adcs $acc5,$acc5,$t1
  758. umulh $t1,$a1,$n0
  759. adcs $acc6,$acc6,$t2
  760. umulh $t2,$a2,$n0
  761. adcs $acc7,$acc7,$t3
  762. umulh $t3,$a3,$n0
  763. adc $carry,$carry,xzr
  764. str $acc0,[$tp],#8
  765. adds $acc0,$acc1,$t0
  766. umulh $t0,$a4,$n0
  767. adcs $acc1,$acc2,$t1
  768. umulh $t1,$a5,$n0
  769. adcs $acc2,$acc3,$t2
  770. umulh $t2,$a6,$n0
  771. adcs $acc3,$acc4,$t3
  772. umulh $t3,$a7,$n0
  773. ldr $n0,[$rp,$cnt]
  774. adcs $acc4,$acc5,$t0
  775. adcs $acc5,$acc6,$t1
  776. adcs $acc6,$acc7,$t2
  777. adcs $acc7,$carry,$t3
  778. //adc $carry,xzr,xzr // moved above
  779. cbnz $cnt,.Lsqr8x_tail
  780. // note that carry flag is guaranteed
  781. // to be zero at this point
  782. ldp $a0,$a1,[$tp,#8*0]
  783. sub $cnt,$np_end,$np // done yet?
  784. sub $t2,$np_end,$num // rewinded np
  785. ldp $a2,$a3,[$tp,#8*2]
  786. ldp $a4,$a5,[$tp,#8*4]
  787. ldp $a6,$a7,[$tp,#8*6]
  788. cbz $cnt,.Lsqr8x_tail_break
  789. ldur $n0,[$rp,#-8*8]
  790. adds $acc0,$acc0,$a0
  791. adcs $acc1,$acc1,$a1
  792. ldp $a0,$a1,[$np,#8*0]
  793. adcs $acc2,$acc2,$a2
  794. adcs $acc3,$acc3,$a3
  795. ldp $a2,$a3,[$np,#8*2]
  796. adcs $acc4,$acc4,$a4
  797. adcs $acc5,$acc5,$a5
  798. ldp $a4,$a5,[$np,#8*4]
  799. adcs $acc6,$acc6,$a6
  800. mov $cnt,#-8*8
  801. adcs $acc7,$acc7,$a7
  802. ldp $a6,$a7,[$np,#8*6]
  803. add $np,$np,#8*8
  804. //adc $carry,xzr,xzr // moved above
  805. b .Lsqr8x_tail
  806. .align 4
  807. .Lsqr8x_tail_break:
  808. ldr $n0,[x29,#112] // pull n0
  809. add $cnt,$tp,#8*8 // end of current t[num] window
  810. subs xzr,$topmost,#1 // "move" top-most carry to carry bit
  811. adcs $t0,$acc0,$a0
  812. adcs $t1,$acc1,$a1
  813. ldp $acc0,$acc1,[$rp,#8*0]
  814. adcs $acc2,$acc2,$a2
  815. ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
  816. adcs $acc3,$acc3,$a3
  817. ldp $a2,$a3,[$t2,#8*2]
  818. adcs $acc4,$acc4,$a4
  819. adcs $acc5,$acc5,$a5
  820. ldp $a4,$a5,[$t2,#8*4]
  821. adcs $acc6,$acc6,$a6
  822. adcs $acc7,$acc7,$a7
  823. ldp $a6,$a7,[$t2,#8*6]
  824. add $np,$t2,#8*8
  825. adc $topmost,xzr,xzr // top-most carry
  826. mul $na0,$n0,$acc0
  827. stp $t0,$t1,[$tp,#8*0]
  828. stp $acc2,$acc3,[$tp,#8*2]
  829. ldp $acc2,$acc3,[$rp,#8*2]
  830. stp $acc4,$acc5,[$tp,#8*4]
  831. ldp $acc4,$acc5,[$rp,#8*4]
  832. cmp $cnt,x29 // did we hit the bottom?
  833. stp $acc6,$acc7,[$tp,#8*6]
  834. mov $tp,$rp // slide the window
  835. ldp $acc6,$acc7,[$rp,#8*6]
  836. mov $cnt,#8
  837. b.ne .Lsqr8x_reduction
  838. // Final step. We see if result is larger than modulus, and
  839. // if it is, subtract the modulus. But comparison implies
  840. // subtraction. So we subtract modulus, see if it borrowed,
  841. // and conditionally copy original value.
  842. ldr $rp,[x29,#96] // pull rp
  843. add $tp,$tp,#8*8
  844. subs $t0,$acc0,$a0
  845. sbcs $t1,$acc1,$a1
  846. sub $cnt,$num,#8*8
  847. mov $ap_end,$rp // $rp copy
  848. .Lsqr8x_sub:
  849. sbcs $t2,$acc2,$a2
  850. ldp $a0,$a1,[$np,#8*0]
  851. sbcs $t3,$acc3,$a3
  852. stp $t0,$t1,[$rp,#8*0]
  853. sbcs $t0,$acc4,$a4
  854. ldp $a2,$a3,[$np,#8*2]
  855. sbcs $t1,$acc5,$a5
  856. stp $t2,$t3,[$rp,#8*2]
  857. sbcs $t2,$acc6,$a6
  858. ldp $a4,$a5,[$np,#8*4]
  859. sbcs $t3,$acc7,$a7
  860. ldp $a6,$a7,[$np,#8*6]
  861. add $np,$np,#8*8
  862. ldp $acc0,$acc1,[$tp,#8*0]
  863. sub $cnt,$cnt,#8*8
  864. ldp $acc2,$acc3,[$tp,#8*2]
  865. ldp $acc4,$acc5,[$tp,#8*4]
  866. ldp $acc6,$acc7,[$tp,#8*6]
  867. add $tp,$tp,#8*8
  868. stp $t0,$t1,[$rp,#8*4]
  869. sbcs $t0,$acc0,$a0
  870. stp $t2,$t3,[$rp,#8*6]
  871. add $rp,$rp,#8*8
  872. sbcs $t1,$acc1,$a1
  873. cbnz $cnt,.Lsqr8x_sub
  874. sbcs $t2,$acc2,$a2
  875. mov $tp,sp
  876. add $ap,sp,$num
  877. ldp $a0,$a1,[$ap_end,#8*0]
  878. sbcs $t3,$acc3,$a3
  879. stp $t0,$t1,[$rp,#8*0]
  880. sbcs $t0,$acc4,$a4
  881. ldp $a2,$a3,[$ap_end,#8*2]
  882. sbcs $t1,$acc5,$a5
  883. stp $t2,$t3,[$rp,#8*2]
  884. sbcs $t2,$acc6,$a6
  885. ldp $acc0,$acc1,[$ap,#8*0]
  886. sbcs $t3,$acc7,$a7
  887. ldp $acc2,$acc3,[$ap,#8*2]
  888. sbcs xzr,$topmost,xzr // did it borrow?
  889. ldr x30,[x29,#8] // pull return address
  890. stp $t0,$t1,[$rp,#8*4]
  891. stp $t2,$t3,[$rp,#8*6]
  892. sub $cnt,$num,#8*4
  893. .Lsqr4x_cond_copy:
  894. sub $cnt,$cnt,#8*4
  895. csel $t0,$acc0,$a0,lo
  896. stp xzr,xzr,[$tp,#8*0]
  897. csel $t1,$acc1,$a1,lo
  898. ldp $a0,$a1,[$ap_end,#8*4]
  899. ldp $acc0,$acc1,[$ap,#8*4]
  900. csel $t2,$acc2,$a2,lo
  901. stp xzr,xzr,[$tp,#8*2]
  902. add $tp,$tp,#8*4
  903. csel $t3,$acc3,$a3,lo
  904. ldp $a2,$a3,[$ap_end,#8*6]
  905. ldp $acc2,$acc3,[$ap,#8*6]
  906. add $ap,$ap,#8*4
  907. stp $t0,$t1,[$ap_end,#8*0]
  908. stp $t2,$t3,[$ap_end,#8*2]
  909. add $ap_end,$ap_end,#8*4
  910. stp xzr,xzr,[$ap,#8*0]
  911. stp xzr,xzr,[$ap,#8*2]
  912. cbnz $cnt,.Lsqr4x_cond_copy
  913. csel $t0,$acc0,$a0,lo
  914. stp xzr,xzr,[$tp,#8*0]
  915. csel $t1,$acc1,$a1,lo
  916. stp xzr,xzr,[$tp,#8*2]
  917. csel $t2,$acc2,$a2,lo
  918. csel $t3,$acc3,$a3,lo
  919. stp $t0,$t1,[$ap_end,#8*0]
  920. stp $t2,$t3,[$ap_end,#8*2]
  921. b .Lsqr8x_done
  922. .align 4
  923. .Lsqr8x8_post_condition:
  924. adc $carry,xzr,xzr
  925. ldr x30,[x29,#8] // pull return address
  926. // $acc0-7,$carry hold result, $a0-7 hold modulus
  927. subs $a0,$acc0,$a0
  928. ldr $ap,[x29,#96] // pull rp
  929. sbcs $a1,$acc1,$a1
  930. stp xzr,xzr,[sp,#8*0]
  931. sbcs $a2,$acc2,$a2
  932. stp xzr,xzr,[sp,#8*2]
  933. sbcs $a3,$acc3,$a3
  934. stp xzr,xzr,[sp,#8*4]
  935. sbcs $a4,$acc4,$a4
  936. stp xzr,xzr,[sp,#8*6]
  937. sbcs $a5,$acc5,$a5
  938. stp xzr,xzr,[sp,#8*8]
  939. sbcs $a6,$acc6,$a6
  940. stp xzr,xzr,[sp,#8*10]
  941. sbcs $a7,$acc7,$a7
  942. stp xzr,xzr,[sp,#8*12]
  943. sbcs $carry,$carry,xzr // did it borrow?
  944. stp xzr,xzr,[sp,#8*14]
  945. // $a0-7 hold result-modulus
  946. csel $a0,$acc0,$a0,lo
  947. csel $a1,$acc1,$a1,lo
  948. csel $a2,$acc2,$a2,lo
  949. csel $a3,$acc3,$a3,lo
  950. stp $a0,$a1,[$ap,#8*0]
  951. csel $a4,$acc4,$a4,lo
  952. csel $a5,$acc5,$a5,lo
  953. stp $a2,$a3,[$ap,#8*2]
  954. csel $a6,$acc6,$a6,lo
  955. csel $a7,$acc7,$a7,lo
  956. stp $a4,$a5,[$ap,#8*4]
  957. stp $a6,$a7,[$ap,#8*6]
  958. .Lsqr8x_done:
  959. ldp x19,x20,[x29,#16]
  960. mov sp,x29
  961. ldp x21,x22,[x29,#32]
  962. mov x0,#1
  963. ldp x23,x24,[x29,#48]
  964. ldp x25,x26,[x29,#64]
  965. ldp x27,x28,[x29,#80]
  966. ldr x29,[sp],#128
  967. .inst 0xd50323bf // autiasp
  968. ret
  969. .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
  970. ___
  971. }
  972. {
  973. ########################################################################
  974. # Even though this might look as ARMv8 adaptation of mulx4x_mont from
  975. # x86_64-mont5 module, it's different in sense that it performs
  976. # reduction 256 bits at a time.
  #
  # This block appends the AArch64 routine __bn_mul4x_mont to $code. The
  # routine walks a[], b[] and n[] four 64-bit words at a time.
  #
  # Register allocation, as produced by the map() below. Note that 18 is
  # absent from the list (presumably because AAPCS64 reserves x18 as the
  # platform register -- confirm against the ABI document):
  #   $a0-$a3       x6-x9     current four words of a[]
  #   $t0-$t3       x10-x13   mul/umulh results and general temporaries
  #   $m0-$m3       x14-x17   current four words of n[]
  #   $acc0-$acc4   x19-x23   running accumulator
  #   $bi           x24       current word of b[]
  #   $mi           x25       reduction multiplier (t[0]*n0)
  #   $tp           x26       pointer into the on-stack scratch vector
  #   $ap_end       x27       &a[num]; reused as a copy of rp in .Lmul4x_post
  #   $cnt          x28       byte counter cycling 8,16,24,0 (add #8, and #31)
  #
  # $rp, $ap, $bp, $np, $n0 and $num are not declared in this block; they
  # are expected to be defined earlier in the file.
  977. my ($a0,$a1,$a2,$a3,
  978. $t0,$t1,$t2,$t3,
  979. $m0,$m1,$m2,$m3,
  980. $acc0,$acc1,$acc2,$acc3,$acc4,
  981. $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
  # NOTE(review): $bp_end is not referenced anywhere else in this block.
  982. my $bp_end=$rp;
  # $carry shares a register with $rp, which is why the generated code
  # spills rp to [x29,#96] before $carry is first written. $topmost lives in
  # x30 (the link register), so the generated code reloads the return
  # address from [x29,#8] on both exit paths.
  983. my ($carry,$topmost) = ($rp,"x30");
  #
  # Layout of the generated routine (labels as they appear below):
  #   prologue
  #       Builds a 128-byte frame, saves x19-x28, then carves num*8+32 bytes
  #       of scratch below it ("alloca"). rp and &b[num] go to [x29,#96].
  #   .Loop_mul4x_1st_reduction / .Loop_mul4x_1st_tail
  #       Pass over the first four words of b[]. The reduction loop puts
  #       each t[0]*n0 aside at the start of the scratch area; the tail loop
  #       reads them back and stores result words after them.
  #   .Lmul4x_proceed
  #       Advances bp by four words and rewinds ap and np.
  #   .Loop_mul4x_reduction / .Loop_mul4x_tail
  #       The same two phases for each following group of four b[] words,
  #       accumulating on top of the values already in the scratch vector.
  #   .Loop_mul4x_break
  #       Folds in $topmost, advances bp and loops until bp == &b[num].
  #   .Lmul4x_post
  #       Subtracts the modulus into rp, then copies the unreduced value
  #       back if the subtraction borrowed, zeroing scratch as it goes.
  #   .Lmul4x4_post_condition
  #       The same final step, taken straight from the first reduction loop
  #       when ap has already reached ap_end (no tail words to process).
  #   .Lmul4x_done
  #       Restores the saved registers and returns 1 in x0.
  #
  # The lines marked (*) in the reduction loops: the commented-out mul/adds
  # pair is replaced by "subs xzr,$acc0,#1", which only produces a carry
  # flag (set when $acc0 is non-zero). Presumably this relies on the low
  # word of n[0]*$mi cancelling $acc0 exactly, so that only the carry out of
  # that addition matters -- confirm against the notes earlier in the file.
  #
  # paciasp/autiasp are emitted as raw .inst words, per the inline comments.
  984. $code.=<<___;
  985. .type __bn_mul4x_mont,%function
  986. .align 5
  987. __bn_mul4x_mont:
  988. .inst 0xd503233f // paciasp
  989. stp x29,x30,[sp,#-128]!
  990. add x29,sp,#0
  991. stp x19,x20,[sp,#16]
  992. stp x21,x22,[sp,#32]
  993. stp x23,x24,[sp,#48]
  994. stp x25,x26,[sp,#64]
  995. stp x27,x28,[sp,#80]
  996. sub $tp,sp,$num,lsl#3
  997. lsl $num,$num,#3
  998. ldr $n0,[$n0] // *n0
  999. sub sp,$tp,#8*4 // alloca
  1000. add $t0,$bp,$num
  1001. add $ap_end,$ap,$num
  1002. stp $rp,$t0,[x29,#96] // offload rp and &b[num]
  1003. ldr $bi,[$bp,#8*0] // b[0]
  1004. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1005. ldp $a2,$a3,[$ap,#8*2]
  1006. add $ap,$ap,#8*4
  1007. mov $acc0,xzr
  1008. mov $acc1,xzr
  1009. mov $acc2,xzr
  1010. mov $acc3,xzr
  1011. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1012. ldp $m2,$m3,[$np,#8*2]
  1013. adds $np,$np,#8*4 // clear carry bit
  1014. mov $carry,xzr
  1015. mov $cnt,#0
  1016. mov $tp,sp
  1017. .Loop_mul4x_1st_reduction:
  1018. mul $t0,$a0,$bi // lo(a[0..3]*b[0])
  1019. adc $carry,$carry,xzr // modulo-scheduled
  1020. mul $t1,$a1,$bi
  1021. add $cnt,$cnt,#8
  1022. mul $t2,$a2,$bi
  1023. and $cnt,$cnt,#31
  1024. mul $t3,$a3,$bi
  1025. adds $acc0,$acc0,$t0
  1026. umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
  1027. adcs $acc1,$acc1,$t1
  1028. mul $mi,$acc0,$n0 // t[0]*n0
  1029. adcs $acc2,$acc2,$t2
  1030. umulh $t1,$a1,$bi
  1031. adcs $acc3,$acc3,$t3
  1032. umulh $t2,$a2,$bi
  1033. adc $acc4,xzr,xzr
  1034. umulh $t3,$a3,$bi
  1035. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1036. adds $acc1,$acc1,$t0
  1037. // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
  1038. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1039. adcs $acc2,$acc2,$t1
  1040. mul $t1,$m1,$mi
  1041. adcs $acc3,$acc3,$t2
  1042. mul $t2,$m2,$mi
  1043. adc $acc4,$acc4,$t3 // can't overflow
  1044. mul $t3,$m3,$mi
  1045. // (*) adds xzr,$acc0,$t0
  1046. subs xzr,$acc0,#1 // (*)
  1047. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
  1048. adcs $acc0,$acc1,$t1
  1049. umulh $t1,$m1,$mi
  1050. adcs $acc1,$acc2,$t2
  1051. umulh $t2,$m2,$mi
  1052. adcs $acc2,$acc3,$t3
  1053. umulh $t3,$m3,$mi
  1054. adcs $acc3,$acc4,$carry
  1055. adc $carry,xzr,xzr
  1056. adds $acc0,$acc0,$t0
  1057. sub $t0,$ap_end,$ap
  1058. adcs $acc1,$acc1,$t1
  1059. adcs $acc2,$acc2,$t2
  1060. adcs $acc3,$acc3,$t3
  1061. //adc $carry,$carry,xzr
  1062. cbnz $cnt,.Loop_mul4x_1st_reduction
  1063. cbz $t0,.Lmul4x4_post_condition
  1064. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1065. ldp $a2,$a3,[$ap,#8*2]
  1066. add $ap,$ap,#8*4
  1067. ldr $mi,[sp] // a[0]*n0
  1068. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1069. ldp $m2,$m3,[$np,#8*2]
  1070. add $np,$np,#8*4
  1071. .Loop_mul4x_1st_tail:
  1072. mul $t0,$a0,$bi // lo(a[4..7]*b[i])
  1073. adc $carry,$carry,xzr // modulo-scheduled
  1074. mul $t1,$a1,$bi
  1075. add $cnt,$cnt,#8
  1076. mul $t2,$a2,$bi
  1077. and $cnt,$cnt,#31
  1078. mul $t3,$a3,$bi
  1079. adds $acc0,$acc0,$t0
  1080. umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
  1081. adcs $acc1,$acc1,$t1
  1082. umulh $t1,$a1,$bi
  1083. adcs $acc2,$acc2,$t2
  1084. umulh $t2,$a2,$bi
  1085. adcs $acc3,$acc3,$t3
  1086. umulh $t3,$a3,$bi
  1087. adc $acc4,xzr,xzr
  1088. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1089. adds $acc1,$acc1,$t0
  1090. mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
  1091. adcs $acc2,$acc2,$t1
  1092. mul $t1,$m1,$mi
  1093. adcs $acc3,$acc3,$t2
  1094. mul $t2,$m2,$mi
  1095. adc $acc4,$acc4,$t3 // can't overflow
  1096. mul $t3,$m3,$mi
  1097. adds $acc0,$acc0,$t0
  1098. umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
  1099. adcs $acc1,$acc1,$t1
  1100. umulh $t1,$m1,$mi
  1101. adcs $acc2,$acc2,$t2
  1102. umulh $t2,$m2,$mi
  1103. adcs $acc3,$acc3,$t3
  1104. adcs $acc4,$acc4,$carry
  1105. umulh $t3,$m3,$mi
  1106. adc $carry,xzr,xzr
  1107. ldr $mi,[sp,$cnt] // next t[0]*n0
  1108. str $acc0,[$tp],#8 // result!!!
  1109. adds $acc0,$acc1,$t0
  1110. sub $t0,$ap_end,$ap // done yet?
  1111. adcs $acc1,$acc2,$t1
  1112. adcs $acc2,$acc3,$t2
  1113. adcs $acc3,$acc4,$t3
  1114. //adc $carry,$carry,xzr
  1115. cbnz $cnt,.Loop_mul4x_1st_tail
  1116. sub $t1,$ap_end,$num // rewinded $ap
  1117. cbz $t0,.Lmul4x_proceed
  1118. ldp $a0,$a1,[$ap,#8*0]
  1119. ldp $a2,$a3,[$ap,#8*2]
  1120. add $ap,$ap,#8*4
  1121. ldp $m0,$m1,[$np,#8*0]
  1122. ldp $m2,$m3,[$np,#8*2]
  1123. add $np,$np,#8*4
  1124. b .Loop_mul4x_1st_tail
  1125. .align 5
  1126. .Lmul4x_proceed:
  1127. ldr $bi,[$bp,#8*4]! // *++b
  1128. adc $topmost,$carry,xzr
  1129. ldp $a0,$a1,[$t1,#8*0] // a[0..3]
  1130. sub $np,$np,$num // rewind np
  1131. ldp $a2,$a3,[$t1,#8*2]
  1132. add $ap,$t1,#8*4
  1133. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1134. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1135. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1136. ldp $acc2,$acc3,[sp,#8*6]
  1137. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1138. mov $tp,sp
  1139. ldp $m2,$m3,[$np,#8*2]
  1140. adds $np,$np,#8*4 // clear carry bit
  1141. mov $carry,xzr
  1142. .align 4
  1143. .Loop_mul4x_reduction:
  1144. mul $t0,$a0,$bi // lo(a[0..3]*b[4])
  1145. adc $carry,$carry,xzr // modulo-scheduled
  1146. mul $t1,$a1,$bi
  1147. add $cnt,$cnt,#8
  1148. mul $t2,$a2,$bi
  1149. and $cnt,$cnt,#31
  1150. mul $t3,$a3,$bi
  1151. adds $acc0,$acc0,$t0
  1152. umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
  1153. adcs $acc1,$acc1,$t1
  1154. mul $mi,$acc0,$n0 // t[0]*n0
  1155. adcs $acc2,$acc2,$t2
  1156. umulh $t1,$a1,$bi
  1157. adcs $acc3,$acc3,$t3
  1158. umulh $t2,$a2,$bi
  1159. adc $acc4,xzr,xzr
  1160. umulh $t3,$a3,$bi
  1161. ldr $bi,[$bp,$cnt] // next b[i]
  1162. adds $acc1,$acc1,$t0
  1163. // (*) mul $t0,$m0,$mi
  1164. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1165. adcs $acc2,$acc2,$t1
  1166. mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
  1167. adcs $acc3,$acc3,$t2
  1168. mul $t2,$m2,$mi
  1169. adc $acc4,$acc4,$t3 // can't overflow
  1170. mul $t3,$m3,$mi
  1171. // (*) adds xzr,$acc0,$t0
  1172. subs xzr,$acc0,#1 // (*)
  1173. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
  1174. adcs $acc0,$acc1,$t1
  1175. umulh $t1,$m1,$mi
  1176. adcs $acc1,$acc2,$t2
  1177. umulh $t2,$m2,$mi
  1178. adcs $acc2,$acc3,$t3
  1179. umulh $t3,$m3,$mi
  1180. adcs $acc3,$acc4,$carry
  1181. adc $carry,xzr,xzr
  1182. adds $acc0,$acc0,$t0
  1183. adcs $acc1,$acc1,$t1
  1184. adcs $acc2,$acc2,$t2
  1185. adcs $acc3,$acc3,$t3
  1186. //adc $carry,$carry,xzr
  1187. cbnz $cnt,.Loop_mul4x_reduction
  1188. adc $carry,$carry,xzr
  1189. ldp $t0,$t1,[$tp,#8*4] // t[4..7]
  1190. ldp $t2,$t3,[$tp,#8*6]
  1191. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1192. ldp $a2,$a3,[$ap,#8*2]
  1193. add $ap,$ap,#8*4
  1194. adds $acc0,$acc0,$t0
  1195. adcs $acc1,$acc1,$t1
  1196. adcs $acc2,$acc2,$t2
  1197. adcs $acc3,$acc3,$t3
  1198. //adc $carry,$carry,xzr
  1199. ldr $mi,[sp] // t[0]*n0
  1200. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1201. ldp $m2,$m3,[$np,#8*2]
  1202. add $np,$np,#8*4
  1203. .align 4
  1204. .Loop_mul4x_tail:
  1205. mul $t0,$a0,$bi // lo(a[4..7]*b[4])
  1206. adc $carry,$carry,xzr // modulo-scheduled
  1207. mul $t1,$a1,$bi
  1208. add $cnt,$cnt,#8
  1209. mul $t2,$a2,$bi
  1210. and $cnt,$cnt,#31
  1211. mul $t3,$a3,$bi
  1212. adds $acc0,$acc0,$t0
  1213. umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
  1214. adcs $acc1,$acc1,$t1
  1215. umulh $t1,$a1,$bi
  1216. adcs $acc2,$acc2,$t2
  1217. umulh $t2,$a2,$bi
  1218. adcs $acc3,$acc3,$t3
  1219. umulh $t3,$a3,$bi
  1220. adc $acc4,xzr,xzr
  1221. ldr $bi,[$bp,$cnt] // next b[i]
  1222. adds $acc1,$acc1,$t0
  1223. mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
  1224. adcs $acc2,$acc2,$t1
  1225. mul $t1,$m1,$mi
  1226. adcs $acc3,$acc3,$t2
  1227. mul $t2,$m2,$mi
  1228. adc $acc4,$acc4,$t3 // can't overflow
  1229. mul $t3,$m3,$mi
  1230. adds $acc0,$acc0,$t0
  1231. umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
  1232. adcs $acc1,$acc1,$t1
  1233. umulh $t1,$m1,$mi
  1234. adcs $acc2,$acc2,$t2
  1235. umulh $t2,$m2,$mi
  1236. adcs $acc3,$acc3,$t3
  1237. umulh $t3,$m3,$mi
  1238. adcs $acc4,$acc4,$carry
  1239. ldr $mi,[sp,$cnt] // next a[0]*n0
  1240. adc $carry,xzr,xzr
  1241. str $acc0,[$tp],#8 // result!!!
  1242. adds $acc0,$acc1,$t0
  1243. sub $t0,$ap_end,$ap // done yet?
  1244. adcs $acc1,$acc2,$t1
  1245. adcs $acc2,$acc3,$t2
  1246. adcs $acc3,$acc4,$t3
  1247. //adc $carry,$carry,xzr
  1248. cbnz $cnt,.Loop_mul4x_tail
  1249. sub $t1,$np,$num // rewinded np?
  1250. adc $carry,$carry,xzr
  1251. cbz $t0,.Loop_mul4x_break
  1252. ldp $t0,$t1,[$tp,#8*4]
  1253. ldp $t2,$t3,[$tp,#8*6]
  1254. ldp $a0,$a1,[$ap,#8*0]
  1255. ldp $a2,$a3,[$ap,#8*2]
  1256. add $ap,$ap,#8*4
  1257. adds $acc0,$acc0,$t0
  1258. adcs $acc1,$acc1,$t1
  1259. adcs $acc2,$acc2,$t2
  1260. adcs $acc3,$acc3,$t3
  1261. //adc $carry,$carry,xzr
  1262. ldp $m0,$m1,[$np,#8*0]
  1263. ldp $m2,$m3,[$np,#8*2]
  1264. add $np,$np,#8*4
  1265. b .Loop_mul4x_tail
  1266. .align 4
  1267. .Loop_mul4x_break:
  1268. ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
  1269. adds $acc0,$acc0,$topmost
  1270. add $bp,$bp,#8*4 // bp++
  1271. adcs $acc1,$acc1,xzr
  1272. sub $ap,$ap,$num // rewind ap
  1273. adcs $acc2,$acc2,xzr
  1274. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1275. adcs $acc3,$acc3,xzr
  1276. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1277. adc $topmost,$carry,xzr
  1278. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1279. cmp $bp,$t3 // done yet?
  1280. ldp $acc2,$acc3,[sp,#8*6]
  1281. ldp $m0,$m1,[$t1,#8*0] // n[0..3]
  1282. ldp $m2,$m3,[$t1,#8*2]
  1283. add $np,$t1,#8*4
  1284. b.eq .Lmul4x_post
  1285. ldr $bi,[$bp]
  1286. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1287. ldp $a2,$a3,[$ap,#8*2]
  1288. adds $ap,$ap,#8*4 // clear carry bit
  1289. mov $carry,xzr
  1290. mov $tp,sp
  1291. b .Loop_mul4x_reduction
  1292. .align 4
  1293. .Lmul4x_post:
  1294. // Final step. We see if result is larger than modulus, and
  1295. // if it is, subtract the modulus. But comparison implies
  1296. // subtraction. So we subtract modulus, see if it borrowed,
  1297. // and conditionally copy original value.
  1298. mov $rp,$t2
  1299. mov $ap_end,$t2 // $rp copy
  1300. subs $t0,$acc0,$m0
  1301. add $tp,sp,#8*8
  1302. sbcs $t1,$acc1,$m1
  1303. sub $cnt,$num,#8*4
  1304. .Lmul4x_sub:
  1305. sbcs $t2,$acc2,$m2
  1306. ldp $m0,$m1,[$np,#8*0]
  1307. sub $cnt,$cnt,#8*4
  1308. ldp $acc0,$acc1,[$tp,#8*0]
  1309. sbcs $t3,$acc3,$m3
  1310. ldp $m2,$m3,[$np,#8*2]
  1311. add $np,$np,#8*4
  1312. ldp $acc2,$acc3,[$tp,#8*2]
  1313. add $tp,$tp,#8*4
  1314. stp $t0,$t1,[$rp,#8*0]
  1315. sbcs $t0,$acc0,$m0
  1316. stp $t2,$t3,[$rp,#8*2]
  1317. add $rp,$rp,#8*4
  1318. sbcs $t1,$acc1,$m1
  1319. cbnz $cnt,.Lmul4x_sub
  1320. sbcs $t2,$acc2,$m2
  1321. mov $tp,sp
  1322. add $ap,sp,#8*4
  1323. ldp $a0,$a1,[$ap_end,#8*0]
  1324. sbcs $t3,$acc3,$m3
  1325. stp $t0,$t1,[$rp,#8*0]
  1326. ldp $a2,$a3,[$ap_end,#8*2]
  1327. stp $t2,$t3,[$rp,#8*2]
  1328. ldp $acc0,$acc1,[$ap,#8*0]
  1329. ldp $acc2,$acc3,[$ap,#8*2]
  1330. sbcs xzr,$topmost,xzr // did it borrow?
  1331. ldr x30,[x29,#8] // pull return address
  1332. sub $cnt,$num,#8*4
  1333. .Lmul4x_cond_copy:
  1334. sub $cnt,$cnt,#8*4
  1335. csel $t0,$acc0,$a0,lo
  1336. stp xzr,xzr,[$tp,#8*0]
  1337. csel $t1,$acc1,$a1,lo
  1338. ldp $a0,$a1,[$ap_end,#8*4]
  1339. ldp $acc0,$acc1,[$ap,#8*4]
  1340. csel $t2,$acc2,$a2,lo
  1341. stp xzr,xzr,[$tp,#8*2]
  1342. add $tp,$tp,#8*4
  1343. csel $t3,$acc3,$a3,lo
  1344. ldp $a2,$a3,[$ap_end,#8*6]
  1345. ldp $acc2,$acc3,[$ap,#8*6]
  1346. add $ap,$ap,#8*4
  1347. stp $t0,$t1,[$ap_end,#8*0]
  1348. stp $t2,$t3,[$ap_end,#8*2]
  1349. add $ap_end,$ap_end,#8*4
  1350. cbnz $cnt,.Lmul4x_cond_copy
  1351. csel $t0,$acc0,$a0,lo
  1352. stp xzr,xzr,[$tp,#8*0]
  1353. csel $t1,$acc1,$a1,lo
  1354. stp xzr,xzr,[$tp,#8*2]
  1355. csel $t2,$acc2,$a2,lo
  1356. stp xzr,xzr,[$tp,#8*3]
  1357. csel $t3,$acc3,$a3,lo
  1358. stp xzr,xzr,[$tp,#8*4]
  1359. stp $t0,$t1,[$ap_end,#8*0]
  1360. stp $t2,$t3,[$ap_end,#8*2]
  1361. b .Lmul4x_done
  1362. .align 4
  1363. .Lmul4x4_post_condition:
  1364. adc $carry,$carry,xzr
  1365. ldr $ap,[x29,#96] // pull rp
  1366. // $acc0-3,$carry hold result, $m0-7 hold modulus
  1367. subs $a0,$acc0,$m0
  1368. ldr x30,[x29,#8] // pull return address
  1369. sbcs $a1,$acc1,$m1
  1370. stp xzr,xzr,[sp,#8*0]
  1371. sbcs $a2,$acc2,$m2
  1372. stp xzr,xzr,[sp,#8*2]
  1373. sbcs $a3,$acc3,$m3
  1374. stp xzr,xzr,[sp,#8*4]
  1375. sbcs xzr,$carry,xzr // did it borrow?
  1376. stp xzr,xzr,[sp,#8*6]
  1377. // $a0-3 hold result-modulus
  1378. csel $a0,$acc0,$a0,lo
  1379. csel $a1,$acc1,$a1,lo
  1380. csel $a2,$acc2,$a2,lo
  1381. csel $a3,$acc3,$a3,lo
  1382. stp $a0,$a1,[$ap,#8*0]
  1383. stp $a2,$a3,[$ap,#8*2]
  1384. .Lmul4x_done:
  1385. ldp x19,x20,[x29,#16]
  1386. mov sp,x29
  1387. ldp x21,x22,[x29,#32]
  1388. mov x0,#1
  1389. ldp x23,x24,[x29,#48]
  1390. ldp x25,x26,[x29,#64]
  1391. ldp x27,x28,[x29,#80]
  1392. ldr x29,[sp],#128
  1393. .inst 0xd50323bf // autiasp
  1394. ret
  1395. .size __bn_mul4x_mont,.-__bn_mul4x_mont
  1396. ___
  1397. }
  # Trailer: append the module's identification string and a final alignment
  # directive to the accumulated assembly text, one directive per line.
  $code .= join("\n",
      qq{.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"},
      qq{.align 4},
      q{});

  # Write out everything generated so far. Buffered write errors only
  # surface at close, so a failed close is treated as fatal.
  print $code;
  close STDOUT or die "error closing STDOUT: $!";