armv8-mont.pl 36 KB


  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # March 2015
  9. #
  10. # "Teaser" Montgomery multiplication module for ARMv8. Needs more
  11. # work. While it does improve RSA sign performance by 20-30% (less for
  12. # longer keys) on most processors, for some reason RSA2048 is not
  13. # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
  14. # instruction issue rate is limited on processor in question, meaning
  15. # that dedicated squaring procedure is a must. Well, actually all
  16. # contemporary AArch64 processors seem to have limited multiplication
  17. # issue rate, i.e. they can't issue multiplication every cycle, which
  18. # explains moderate improvement coefficients in comparison to
  19. # compiler-generated code. Recall that compiler is instructed to use
  20. # umulh and therefore uses same amount of multiplication instructions
  21. # to do the job. Assembly's edge is to minimize number of "collateral"
  22. # instructions and of course instruction scheduling.
  23. #
  24. # April 2015
  25. #
  26. # Squaring procedure that handles lengths divisible by 8 improves
  27. # RSA/DSA performance by 25-40-60% depending on processor and key
  28. # length. Overall improvement coefficients are always positive in
  29. # comparison to compiler-generated code. On Cortex-A57 improvement
  30. # is still modest on longest key lengths, while others exhibit e.g.
  31. # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
  32. # on Cortex-A57 and ~60-100% faster on others.
  # Command line: <flavour> <output>. Both values are passed on verbatim to
  # arm-xlate.pl, which receives everything this script prints.
  $flavour = shift;
  $output = shift;

  # Look for arm-xlate.pl next to this script first, then in the shared
  # perlasm directory. When $0 has no directory component the pattern does
  # not match; $dir is then explicitly empty instead of relying on whatever
  # $1 happened to hold.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
  ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  die "can't locate arm-xlate.pl";

  # Route STDOUT through the translator. Fail loudly if the pipe cannot be
  # set up: without this check a failed spawn would go unnoticed and the
  # build would continue with missing output.
  open OUT,"| \"$^X\" $xlate $flavour $output"
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;

  # Working registers for the generic bn_mul_mont path: the first twelve
  # names map to x6-x17 and the remaining five to x19-x23. x18 is left out
  # (presumably because it is the platform-reserved register -- confirm).
  # map() also yields x24, which has no name on the left-hand side and is
  # therefore dropped.
  ($lo0,$hi0,$aj,$m0,$alo,$ahi,
   $lo1,$hi1,$nj,$m1,$nlo,$nhi,
   $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
  # bn_mul_mont -- public entry point of the module.
  #
  # The six assignments below bind the argument registers x0-x5 to Perl
  # names so the assembly template can refer to them symbolically; the
  # trailing comments spell out the corresponding C prototype.
  #
  # Dispatch done by the first four instructions of the routine:
  #   num divisible by 8 -> __bn_sqr8x_mont
  #   num divisible by 4 -> __bn_mul4x_mont
  #   anything else      -> generic word-serial code at .Lmul_mont
  #
  # Generic path, in order:
  #   * save x19-x24 plus x29/x30 in a 64-byte frame;
  #   * carve num*8 bytes, rounded down to a 16-byte boundary, off the
  #     stack for the temporary vector tp[];
  #   * .L1st handles bp[0]; .Louter/.Linner handle the remaining bp[i];
  #   * .Lsub stores tp-np into rp and records the borrow;
  #   * .Lcond_copy uses csel to keep either tp[] or the difference in rp
  #     while overwriting tp[] with zeros;
  #   * the epilogue restores the saved registers and returns 1 in x0.
  44. # int bn_mul_mont(
  45. $rp="x0"; # BN_ULONG *rp,
  46. $ap="x1"; # const BN_ULONG *ap,
  47. $bp="x2"; # const BN_ULONG *bp,
  48. $np="x3"; # const BN_ULONG *np,
  49. $n0="x4"; # const BN_ULONG *n0,
  50. $num="x5"; # int num);
  # Everything up to the closing ___ is appended to $code; the $-names are
  # interpolated into register names when the heredoc is evaluated.
  51. $code.=<<___;
  52. .text
  53. .globl bn_mul_mont
  54. .type bn_mul_mont,%function
  55. .align 5
  56. bn_mul_mont:
  57. tst $num,#7
  58. b.eq __bn_sqr8x_mont
  59. tst $num,#3
  60. b.eq __bn_mul4x_mont
  61. .Lmul_mont:
  62. stp x29,x30,[sp,#-64]!
  63. add x29,sp,#0
  64. stp x19,x20,[sp,#16]
  65. stp x21,x22,[sp,#32]
  66. stp x23,x24,[sp,#48]
  67. ldr $m0,[$bp],#8 // bp[0]
  68. sub $tp,sp,$num,lsl#3
  69. ldp $hi0,$aj,[$ap],#16 // ap[0..1]
  70. lsl $num,$num,#3
  71. ldr $n0,[$n0] // *n0
  72. and $tp,$tp,#-16 // ABI says so
  73. ldp $hi1,$nj,[$np],#16 // np[0..1]
  74. mul $lo0,$hi0,$m0 // ap[0]*bp[0]
  75. sub $j,$num,#16 // j=num-2
  76. umulh $hi0,$hi0,$m0
  77. mul $alo,$aj,$m0 // ap[1]*bp[0]
  78. umulh $ahi,$aj,$m0
  79. mul $m1,$lo0,$n0 // "tp[0]"*n0
  80. mov sp,$tp // alloca
  81. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  82. umulh $hi1,$hi1,$m1
  83. mul $nlo,$nj,$m1 // np[1]*m1
  84. // (*) adds $lo1,$lo1,$lo0 // discarded
  85. // (*) As for removal of first multiplication and addition
  86. // instructions. The outcome of first addition is
  87. // guaranteed to be zero, which leaves two computationally
  88. // significant outcomes: it either carries or not. Then
  89. // question is when does it carry? Is there alternative
  90. // way to deduce it? If you follow operations, you can
  91. // observe that condition for carry is quite simple:
  92. // $lo0 being non-zero. So that carry can be calculated
  93. // by adding -1 to $lo0. That's what next instruction does.
  94. subs xzr,$lo0,#1 // (*)
  95. umulh $nhi,$nj,$m1
  96. adc $hi1,$hi1,xzr
  97. cbz $j,.L1st_skip
  98. .L1st:
  99. ldr $aj,[$ap],#8
  100. adds $lo0,$alo,$hi0
  101. sub $j,$j,#8 // j--
  102. adc $hi0,$ahi,xzr
  103. ldr $nj,[$np],#8
  104. adds $lo1,$nlo,$hi1
  105. mul $alo,$aj,$m0 // ap[j]*bp[0]
  106. adc $hi1,$nhi,xzr
  107. umulh $ahi,$aj,$m0
  108. adds $lo1,$lo1,$lo0
  109. mul $nlo,$nj,$m1 // np[j]*m1
  110. adc $hi1,$hi1,xzr
  111. umulh $nhi,$nj,$m1
  112. str $lo1,[$tp],#8 // tp[j-1]
  113. cbnz $j,.L1st
  114. .L1st_skip:
  115. adds $lo0,$alo,$hi0
  116. sub $ap,$ap,$num // rewind $ap
  117. adc $hi0,$ahi,xzr
  118. adds $lo1,$nlo,$hi1
  119. sub $np,$np,$num // rewind $np
  120. adc $hi1,$nhi,xzr
  121. adds $lo1,$lo1,$lo0
  122. sub $i,$num,#8 // i=num-1
  123. adcs $hi1,$hi1,$hi0
  124. adc $ovf,xzr,xzr // upmost overflow bit
  125. stp $lo1,$hi1,[$tp]
  126. .Louter:
  127. ldr $m0,[$bp],#8 // bp[i]
  128. ldp $hi0,$aj,[$ap],#16
  129. ldr $tj,[sp] // tp[0]
  130. add $tp,sp,#8
  131. mul $lo0,$hi0,$m0 // ap[0]*bp[i]
  132. sub $j,$num,#16 // j=num-2
  133. umulh $hi0,$hi0,$m0
  134. ldp $hi1,$nj,[$np],#16
  135. mul $alo,$aj,$m0 // ap[1]*bp[i]
  136. adds $lo0,$lo0,$tj
  137. umulh $ahi,$aj,$m0
  138. adc $hi0,$hi0,xzr
  139. mul $m1,$lo0,$n0
  140. sub $i,$i,#8 // i--
  141. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  142. umulh $hi1,$hi1,$m1
  143. mul $nlo,$nj,$m1 // np[1]*m1
  144. // (*) adds $lo1,$lo1,$lo0
  145. subs xzr,$lo0,#1 // (*)
  146. umulh $nhi,$nj,$m1
  147. cbz $j,.Linner_skip
  148. .Linner:
  149. ldr $aj,[$ap],#8
  150. adc $hi1,$hi1,xzr
  151. ldr $tj,[$tp],#8 // tp[j]
  152. adds $lo0,$alo,$hi0
  153. sub $j,$j,#8 // j--
  154. adc $hi0,$ahi,xzr
  155. adds $lo1,$nlo,$hi1
  156. ldr $nj,[$np],#8
  157. adc $hi1,$nhi,xzr
  158. mul $alo,$aj,$m0 // ap[j]*bp[i]
  159. adds $lo0,$lo0,$tj
  160. umulh $ahi,$aj,$m0
  161. adc $hi0,$hi0,xzr
  162. mul $nlo,$nj,$m1 // np[j]*m1
  163. adds $lo1,$lo1,$lo0
  164. umulh $nhi,$nj,$m1
  165. str $lo1,[$tp,#-16] // tp[j-1]
  166. cbnz $j,.Linner
  167. .Linner_skip:
  168. ldr $tj,[$tp],#8 // tp[j]
  169. adc $hi1,$hi1,xzr
  170. adds $lo0,$alo,$hi0
  171. sub $ap,$ap,$num // rewind $ap
  172. adc $hi0,$ahi,xzr
  173. adds $lo1,$nlo,$hi1
  174. sub $np,$np,$num // rewind $np
  175. adcs $hi1,$nhi,$ovf
  176. adc $ovf,xzr,xzr
  177. adds $lo0,$lo0,$tj
  178. adc $hi0,$hi0,xzr
  179. adds $lo1,$lo1,$lo0
  180. adcs $hi1,$hi1,$hi0
  181. adc $ovf,$ovf,xzr // upmost overflow bit
  182. stp $lo1,$hi1,[$tp,#-16]
  183. cbnz $i,.Louter
  184. // Final step. We see if result is larger than modulus, and
  185. // if it is, subtract the modulus. But comparison implies
  186. // subtraction. So we subtract modulus, see if it borrowed,
  187. // and conditionally copy original value.
  188. ldr $tj,[sp] // tp[0]
  189. add $tp,sp,#8
  190. ldr $nj,[$np],#8 // np[0]
  191. subs $j,$num,#8 // j=num-1 and clear borrow
  192. mov $ap,$rp
  193. .Lsub:
  194. sbcs $aj,$tj,$nj // tp[j]-np[j]
  195. ldr $tj,[$tp],#8
  196. sub $j,$j,#8 // j--
  197. ldr $nj,[$np],#8
  198. str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
  199. cbnz $j,.Lsub
  200. sbcs $aj,$tj,$nj
  201. sbcs $ovf,$ovf,xzr // did it borrow?
  202. str $aj,[$ap],#8 // rp[num-1]
  203. ldr $tj,[sp] // tp[0]
  204. add $tp,sp,#8
  205. ldr $aj,[$rp],#8 // rp[0]
  206. sub $num,$num,#8 // num--
  207. nop
  208. .Lcond_copy:
  209. sub $num,$num,#8 // num--
  210. csel $nj,$tj,$aj,lo // did it borrow?
  211. ldr $tj,[$tp],#8
  212. ldr $aj,[$rp],#8
  213. str xzr,[$tp,#-16] // wipe tp
  214. str $nj,[$rp,#-16]
  215. cbnz $num,.Lcond_copy
  216. csel $nj,$tj,$aj,lo
  217. str xzr,[$tp,#-8] // wipe tp
  218. str $nj,[$rp,#-8]
  219. ldp x19,x20,[x29,#16]
  220. mov sp,x29
  221. ldp x21,x22,[x29,#32]
  222. mov x0,#1
  223. ldp x23,x24,[x29,#48]
  224. ldr x29,[sp],#64
  225. ret
  226. .size bn_mul_mont,.-bn_mul_mont
  227. ___
  228. {
  229. ########################################################################
  230. # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
  #
  # Register plan for __bn_sqr8x_mont. All names are lexical to this brace
  # block, so they do not disturb the bindings used by bn_mul_mont.
  #   $a0-$a7      x6-x13   eight operand words loaded with ldp
  #   $t0-$t3      x14-x17  results of mul/umulh awaiting accumulation
  #   $acc0-$acc7  x19-x26  eight-word accumulator window
  231. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
  232. my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
  233. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
  # $topmost lives in x30, the register that holds the return address on
  # entry; the templates reload x30 from the frame ([x29,#8]) before
  # returning.
  234. my ($cnt,$carry,$topmost)=("x27","x28","x30");
  # Aliases: $tp shares the register of $bp, $ap_end that of $np, and $na0
  # that of $carry. This lexical $tp hides the file-scope $tp inside the
  # block.
  235. my ($tp,$ap_end,$na0)=($bp,$np,$carry);
  # First template of __bn_sqr8x_mont, appended to $code.
  #   * Entry check: when ap != bp the routine branches to __bn_mul4x_mont.
  #   * Prologue: 128-byte frame holding x29/x30 and x19-x28; rp and np are
  #     parked at [x29,#96] and the loaded *n0 value at [x29,#112].
  #   * Scratch: num*16 bytes are taken from the stack. The zeroing loop
  #     clears that area except for the lowest eight words, which the first
  #     pass of the outer loop stores itself.
  #   * .Lsqr8x_outer_loop / .Lsqr8x_mul accumulate the a[i]*a[j] products
  #     listed in the embedded comments ("everything but a[i]*a[i]").
  #   * .Lsqr8x_outer_break / .Lsqr4x_shift_n_add double that sum and add
  #     the a[i]*a[i] terms.
  # The template stops right after the back-branch of the shift-and-add
  # loop; the instructions that finish the last pass are emitted by the
  # following template.
  236. $code.=<<___;
  237. .type __bn_sqr8x_mont,%function
  238. .align 5
  239. __bn_sqr8x_mont:
  240. cmp $ap,$bp
  241. b.ne __bn_mul4x_mont
  242. .Lsqr8x_mont:
  243. stp x29,x30,[sp,#-128]!
  244. add x29,sp,#0
  245. stp x19,x20,[sp,#16]
  246. stp x21,x22,[sp,#32]
  247. stp x23,x24,[sp,#48]
  248. stp x25,x26,[sp,#64]
  249. stp x27,x28,[sp,#80]
  250. stp $rp,$np,[sp,#96] // offload rp and np
  251. ldp $a0,$a1,[$ap,#8*0]
  252. ldp $a2,$a3,[$ap,#8*2]
  253. ldp $a4,$a5,[$ap,#8*4]
  254. ldp $a6,$a7,[$ap,#8*6]
  255. sub $tp,sp,$num,lsl#4
  256. lsl $num,$num,#3
  257. ldr $n0,[$n0] // *n0
  258. mov sp,$tp // alloca
  259. sub $cnt,$num,#8*8
  260. b .Lsqr8x_zero_start
  261. .Lsqr8x_zero:
  262. sub $cnt,$cnt,#8*8
  263. stp xzr,xzr,[$tp,#8*0]
  264. stp xzr,xzr,[$tp,#8*2]
  265. stp xzr,xzr,[$tp,#8*4]
  266. stp xzr,xzr,[$tp,#8*6]
  267. .Lsqr8x_zero_start:
  268. stp xzr,xzr,[$tp,#8*8]
  269. stp xzr,xzr,[$tp,#8*10]
  270. stp xzr,xzr,[$tp,#8*12]
  271. stp xzr,xzr,[$tp,#8*14]
  272. add $tp,$tp,#8*16
  273. cbnz $cnt,.Lsqr8x_zero
  274. add $ap_end,$ap,$num
  275. add $ap,$ap,#8*8
  276. mov $acc0,xzr
  277. mov $acc1,xzr
  278. mov $acc2,xzr
  279. mov $acc3,xzr
  280. mov $acc4,xzr
  281. mov $acc5,xzr
  282. mov $acc6,xzr
  283. mov $acc7,xzr
  284. mov $tp,sp
  285. str $n0,[x29,#112] // offload n0
  286. // Multiply everything but a[i]*a[i]
  287. .align 4
  288. .Lsqr8x_outer_loop:
  289. // a[1]a[0] (i)
  290. // a[2]a[0]
  291. // a[3]a[0]
  292. // a[4]a[0]
  293. // a[5]a[0]
  294. // a[6]a[0]
  295. // a[7]a[0]
  296. // a[2]a[1] (ii)
  297. // a[3]a[1]
  298. // a[4]a[1]
  299. // a[5]a[1]
  300. // a[6]a[1]
  301. // a[7]a[1]
  302. // a[3]a[2] (iii)
  303. // a[4]a[2]
  304. // a[5]a[2]
  305. // a[6]a[2]
  306. // a[7]a[2]
  307. // a[4]a[3] (iv)
  308. // a[5]a[3]
  309. // a[6]a[3]
  310. // a[7]a[3]
  311. // a[5]a[4] (v)
  312. // a[6]a[4]
  313. // a[7]a[4]
  314. // a[6]a[5] (vi)
  315. // a[7]a[5]
  316. // a[7]a[6] (vii)
  317. mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
  318. mul $t1,$a2,$a0
  319. mul $t2,$a3,$a0
  320. mul $t3,$a4,$a0
  321. adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
  322. mul $t0,$a5,$a0
  323. adcs $acc2,$acc2,$t1
  324. mul $t1,$a6,$a0
  325. adcs $acc3,$acc3,$t2
  326. mul $t2,$a7,$a0
  327. adcs $acc4,$acc4,$t3
  328. umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
  329. adcs $acc5,$acc5,$t0
  330. umulh $t0,$a2,$a0
  331. adcs $acc6,$acc6,$t1
  332. umulh $t1,$a3,$a0
  333. adcs $acc7,$acc7,$t2
  334. umulh $t2,$a4,$a0
  335. stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
  336. adc $acc0,xzr,xzr // t[8]
  337. adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
  338. umulh $t3,$a5,$a0
  339. adcs $acc3,$acc3,$t0
  340. umulh $t0,$a6,$a0
  341. adcs $acc4,$acc4,$t1
  342. umulh $t1,$a7,$a0
  343. adcs $acc5,$acc5,$t2
  344. mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
  345. adcs $acc6,$acc6,$t3
  346. mul $t3,$a3,$a1
  347. adcs $acc7,$acc7,$t0
  348. mul $t0,$a4,$a1
  349. adc $acc0,$acc0,$t1
  350. mul $t1,$a5,$a1
  351. adds $acc3,$acc3,$t2
  352. mul $t2,$a6,$a1
  353. adcs $acc4,$acc4,$t3
  354. mul $t3,$a7,$a1
  355. adcs $acc5,$acc5,$t0
  356. umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
  357. adcs $acc6,$acc6,$t1
  358. umulh $t1,$a3,$a1
  359. adcs $acc7,$acc7,$t2
  360. umulh $t2,$a4,$a1
  361. adcs $acc0,$acc0,$t3
  362. umulh $t3,$a5,$a1
  363. stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
  364. adc $acc1,xzr,xzr // t[9]
  365. adds $acc4,$acc4,$t0
  366. umulh $t0,$a6,$a1
  367. adcs $acc5,$acc5,$t1
  368. umulh $t1,$a7,$a1
  369. adcs $acc6,$acc6,$t2
  370. mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
  371. adcs $acc7,$acc7,$t3
  372. mul $t3,$a4,$a2
  373. adcs $acc0,$acc0,$t0
  374. mul $t0,$a5,$a2
  375. adc $acc1,$acc1,$t1
  376. mul $t1,$a6,$a2
  377. adds $acc5,$acc5,$t2
  378. mul $t2,$a7,$a2
  379. adcs $acc6,$acc6,$t3
  380. umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
  381. adcs $acc7,$acc7,$t0
  382. umulh $t0,$a4,$a2
  383. adcs $acc0,$acc0,$t1
  384. umulh $t1,$a5,$a2
  385. adcs $acc1,$acc1,$t2
  386. umulh $t2,$a6,$a2
  387. stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
  388. adc $acc2,xzr,xzr // t[10]
  389. adds $acc6,$acc6,$t3
  390. umulh $t3,$a7,$a2
  391. adcs $acc7,$acc7,$t0
  392. mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
  393. adcs $acc0,$acc0,$t1
  394. mul $t1,$a5,$a3
  395. adcs $acc1,$acc1,$t2
  396. mul $t2,$a6,$a3
  397. adc $acc2,$acc2,$t3
  398. mul $t3,$a7,$a3
  399. adds $acc7,$acc7,$t0
  400. umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
  401. adcs $acc0,$acc0,$t1
  402. umulh $t1,$a5,$a3
  403. adcs $acc1,$acc1,$t2
  404. umulh $t2,$a6,$a3
  405. adcs $acc2,$acc2,$t3
  406. umulh $t3,$a7,$a3
  407. stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
  408. adc $acc3,xzr,xzr // t[11]
  409. adds $acc0,$acc0,$t0
  410. mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
  411. adcs $acc1,$acc1,$t1
  412. mul $t1,$a6,$a4
  413. adcs $acc2,$acc2,$t2
  414. mul $t2,$a7,$a4
  415. adc $acc3,$acc3,$t3
  416. umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
  417. adds $acc1,$acc1,$t0
  418. umulh $t0,$a6,$a4
  419. adcs $acc2,$acc2,$t1
  420. umulh $t1,$a7,$a4
  421. adcs $acc3,$acc3,$t2
  422. mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
  423. adc $acc4,xzr,xzr // t[12]
  424. adds $acc2,$acc2,$t3
  425. mul $t3,$a7,$a5
  426. adcs $acc3,$acc3,$t0
  427. umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
  428. adc $acc4,$acc4,$t1
  429. umulh $t1,$a7,$a5
  430. adds $acc3,$acc3,$t2
  431. mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
  432. adcs $acc4,$acc4,$t3
  433. umulh $t3,$a7,$a6 // hi(a[7]*a[6])
  434. adc $acc5,xzr,xzr // t[13]
  435. adds $acc4,$acc4,$t0
  436. sub $cnt,$ap_end,$ap // done yet?
  437. adc $acc5,$acc5,$t1
  438. adds $acc5,$acc5,$t2
  439. sub $t0,$ap_end,$num // rewinded ap
  440. adc $acc6,xzr,xzr // t[14]
  441. add $acc6,$acc6,$t3
  442. cbz $cnt,.Lsqr8x_outer_break
  443. mov $n0,$a0
  444. ldp $a0,$a1,[$tp,#8*0]
  445. ldp $a2,$a3,[$tp,#8*2]
  446. ldp $a4,$a5,[$tp,#8*4]
  447. ldp $a6,$a7,[$tp,#8*6]
  448. adds $acc0,$acc0,$a0
  449. adcs $acc1,$acc1,$a1
  450. ldp $a0,$a1,[$ap,#8*0]
  451. adcs $acc2,$acc2,$a2
  452. adcs $acc3,$acc3,$a3
  453. ldp $a2,$a3,[$ap,#8*2]
  454. adcs $acc4,$acc4,$a4
  455. adcs $acc5,$acc5,$a5
  456. ldp $a4,$a5,[$ap,#8*4]
  457. adcs $acc6,$acc6,$a6
  458. mov $rp,$ap
  459. adcs $acc7,xzr,$a7
  460. ldp $a6,$a7,[$ap,#8*6]
  461. add $ap,$ap,#8*8
  462. //adc $carry,xzr,xzr // moved below
  463. mov $cnt,#-8*8
  464. // a[8]a[0]
  465. // a[9]a[0]
  466. // a[a]a[0]
  467. // a[b]a[0]
  468. // a[c]a[0]
  469. // a[d]a[0]
  470. // a[e]a[0]
  471. // a[f]a[0]
  472. // a[8]a[1]
  473. // a[f]a[1]........................
  474. // a[8]a[2]
  475. // a[f]a[2]........................
  476. // a[8]a[3]
  477. // a[f]a[3]........................
  478. // a[8]a[4]
  479. // a[f]a[4]........................
  480. // a[8]a[5]
  481. // a[f]a[5]........................
  482. // a[8]a[6]
  483. // a[f]a[6]........................
  484. // a[8]a[7]
  485. // a[f]a[7]........................
  486. .Lsqr8x_mul:
  487. mul $t0,$a0,$n0
  488. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  489. mul $t1,$a1,$n0
  490. add $cnt,$cnt,#8
  491. mul $t2,$a2,$n0
  492. mul $t3,$a3,$n0
  493. adds $acc0,$acc0,$t0
  494. mul $t0,$a4,$n0
  495. adcs $acc1,$acc1,$t1
  496. mul $t1,$a5,$n0
  497. adcs $acc2,$acc2,$t2
  498. mul $t2,$a6,$n0
  499. adcs $acc3,$acc3,$t3
  500. mul $t3,$a7,$n0
  501. adcs $acc4,$acc4,$t0
  502. umulh $t0,$a0,$n0
  503. adcs $acc5,$acc5,$t1
  504. umulh $t1,$a1,$n0
  505. adcs $acc6,$acc6,$t2
  506. umulh $t2,$a2,$n0
  507. adcs $acc7,$acc7,$t3
  508. umulh $t3,$a3,$n0
  509. adc $carry,$carry,xzr
  510. str $acc0,[$tp],#8
  511. adds $acc0,$acc1,$t0
  512. umulh $t0,$a4,$n0
  513. adcs $acc1,$acc2,$t1
  514. umulh $t1,$a5,$n0
  515. adcs $acc2,$acc3,$t2
  516. umulh $t2,$a6,$n0
  517. adcs $acc3,$acc4,$t3
  518. umulh $t3,$a7,$n0
  519. ldr $n0,[$rp,$cnt]
  520. adcs $acc4,$acc5,$t0
  521. adcs $acc5,$acc6,$t1
  522. adcs $acc6,$acc7,$t2
  523. adcs $acc7,$carry,$t3
  524. //adc $carry,xzr,xzr // moved above
  525. cbnz $cnt,.Lsqr8x_mul
  526. // note that carry flag is guaranteed
  527. // to be zero at this point
  528. cmp $ap,$ap_end // done yet?
  529. b.eq .Lsqr8x_break
  530. ldp $a0,$a1,[$tp,#8*0]
  531. ldp $a2,$a3,[$tp,#8*2]
  532. ldp $a4,$a5,[$tp,#8*4]
  533. ldp $a6,$a7,[$tp,#8*6]
  534. adds $acc0,$acc0,$a0
  535. ldr $n0,[$rp,#-8*8]
  536. adcs $acc1,$acc1,$a1
  537. ldp $a0,$a1,[$ap,#8*0]
  538. adcs $acc2,$acc2,$a2
  539. adcs $acc3,$acc3,$a3
  540. ldp $a2,$a3,[$ap,#8*2]
  541. adcs $acc4,$acc4,$a4
  542. adcs $acc5,$acc5,$a5
  543. ldp $a4,$a5,[$ap,#8*4]
  544. adcs $acc6,$acc6,$a6
  545. mov $cnt,#-8*8
  546. adcs $acc7,$acc7,$a7
  547. ldp $a6,$a7,[$ap,#8*6]
  548. add $ap,$ap,#8*8
  549. //adc $carry,xzr,xzr // moved above
  550. b .Lsqr8x_mul
  551. .align 4
  552. .Lsqr8x_break:
  553. ldp $a0,$a1,[$rp,#8*0]
  554. add $ap,$rp,#8*8
  555. ldp $a2,$a3,[$rp,#8*2]
  556. sub $t0,$ap_end,$ap // is it last iteration?
  557. ldp $a4,$a5,[$rp,#8*4]
  558. sub $t1,$tp,$t0
  559. ldp $a6,$a7,[$rp,#8*6]
  560. cbz $t0,.Lsqr8x_outer_loop
  561. stp $acc0,$acc1,[$tp,#8*0]
  562. ldp $acc0,$acc1,[$t1,#8*0]
  563. stp $acc2,$acc3,[$tp,#8*2]
  564. ldp $acc2,$acc3,[$t1,#8*2]
  565. stp $acc4,$acc5,[$tp,#8*4]
  566. ldp $acc4,$acc5,[$t1,#8*4]
  567. stp $acc6,$acc7,[$tp,#8*6]
  568. mov $tp,$t1
  569. ldp $acc6,$acc7,[$t1,#8*6]
  570. b .Lsqr8x_outer_loop
  571. .align 4
  572. .Lsqr8x_outer_break:
  573. // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
  574. ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
  575. ldp $t1,$t2,[sp,#8*1]
  576. ldp $a5,$a7,[$t0,#8*2]
  577. add $ap,$t0,#8*4
  578. ldp $t3,$t0,[sp,#8*3]
  579. stp $acc0,$acc1,[$tp,#8*0]
  580. mul $acc0,$a1,$a1
  581. stp $acc2,$acc3,[$tp,#8*2]
  582. umulh $a1,$a1,$a1
  583. stp $acc4,$acc5,[$tp,#8*4]
  584. mul $a2,$a3,$a3
  585. stp $acc6,$acc7,[$tp,#8*6]
  586. mov $tp,sp
  587. umulh $a3,$a3,$a3
  588. adds $acc1,$a1,$t1,lsl#1
  589. extr $t1,$t2,$t1,#63
  590. sub $cnt,$num,#8*4
  591. .Lsqr4x_shift_n_add:
  592. adcs $acc2,$a2,$t1
  593. extr $t2,$t3,$t2,#63
  594. sub $cnt,$cnt,#8*4
  595. adcs $acc3,$a3,$t2
  596. ldp $t1,$t2,[$tp,#8*5]
  597. mul $a4,$a5,$a5
  598. ldp $a1,$a3,[$ap],#8*2
  599. umulh $a5,$a5,$a5
  600. mul $a6,$a7,$a7
  601. umulh $a7,$a7,$a7
  602. extr $t3,$t0,$t3,#63
  603. stp $acc0,$acc1,[$tp,#8*0]
  604. adcs $acc4,$a4,$t3
  605. extr $t0,$t1,$t0,#63
  606. stp $acc2,$acc3,[$tp,#8*2]
  607. adcs $acc5,$a5,$t0
  608. ldp $t3,$t0,[$tp,#8*7]
  609. extr $t1,$t2,$t1,#63
  610. adcs $acc6,$a6,$t1
  611. extr $t2,$t3,$t2,#63
  612. adcs $acc7,$a7,$t2
  613. ldp $t1,$t2,[$tp,#8*9]
  614. mul $a0,$a1,$a1
  615. ldp $a5,$a7,[$ap],#8*2
  616. umulh $a1,$a1,$a1
  617. mul $a2,$a3,$a3
  618. umulh $a3,$a3,$a3
  619. stp $acc4,$acc5,[$tp,#8*4]
  620. extr $t3,$t0,$t3,#63
  621. stp $acc6,$acc7,[$tp,#8*6]
  622. add $tp,$tp,#8*8
  623. adcs $acc0,$a0,$t3
  624. extr $t0,$t1,$t0,#63
  625. adcs $acc1,$a1,$t0
  626. ldp $t3,$t0,[$tp,#8*3]
  627. extr $t1,$t2,$t1,#63
  628. cbnz $cnt,.Lsqr4x_shift_n_add
  629. ___
  # From here on the registers that carried $ap and $ap_end are used to walk
  # the modulus, so they get new lexical names for the second template.
  630. my ($np,$np_end)=($ap,$ap_end);
  # Second template of __bn_sqr8x_mont, appended to $code.
  #   * The opening instructions finish the last shift-and-add pass while
  #     reloading np and n0 from the frame.
  #   * .Lsqr8x_reduction runs eight steps per pass ("Reduce by 512 bits per
  #     iteration"), putting each t[0]*n0 factor aside for the tail.
  #   * .Lsqr8x_tail applies those factors to the remaining words of the
  #     modulus; .Lsqr8x_tail_break folds in the top-most carry and slides
  #     the window until its end meets x29.
  #   * .Lsqr8x_sub subtracts the modulus into rp; .Lsqr4x_cond_copy selects
  #     the final value with csel and stores zeros over the scratch area.
  #   * .Lsqr8x8_post_condition is the exit taken when np_end-np is already
  #     zero after a reduction pass, i.e. the modulus is eight words long.
  #   * .Lsqr8x_done restores x19-x28 and x29 and returns 1 in x0; both exit
  #     paths reload x30 from [x29,#8] beforehand.
  631. $code.=<<___;
  632. ldp $np,$n0,[x29,#104] // pull np and n0
  633. adcs $acc2,$a2,$t1
  634. extr $t2,$t3,$t2,#63
  635. adcs $acc3,$a3,$t2
  636. ldp $t1,$t2,[$tp,#8*5]
  637. mul $a4,$a5,$a5
  638. umulh $a5,$a5,$a5
  639. stp $acc0,$acc1,[$tp,#8*0]
  640. mul $a6,$a7,$a7
  641. umulh $a7,$a7,$a7
  642. stp $acc2,$acc3,[$tp,#8*2]
  643. extr $t3,$t0,$t3,#63
  644. adcs $acc4,$a4,$t3
  645. extr $t0,$t1,$t0,#63
  646. ldp $acc0,$acc1,[sp,#8*0]
  647. adcs $acc5,$a5,$t0
  648. extr $t1,$t2,$t1,#63
  649. ldp $a0,$a1,[$np,#8*0]
  650. adcs $acc6,$a6,$t1
  651. extr $t2,xzr,$t2,#63
  652. ldp $a2,$a3,[$np,#8*2]
  653. adc $acc7,$a7,$t2
  654. ldp $a4,$a5,[$np,#8*4]
  655. // Reduce by 512 bits per iteration
  656. mul $na0,$n0,$acc0 // t[0]*n0
  657. ldp $a6,$a7,[$np,#8*6]
  658. add $np_end,$np,$num
  659. ldp $acc2,$acc3,[sp,#8*2]
  660. stp $acc4,$acc5,[$tp,#8*4]
  661. ldp $acc4,$acc5,[sp,#8*4]
  662. stp $acc6,$acc7,[$tp,#8*6]
  663. ldp $acc6,$acc7,[sp,#8*6]
  664. add $np,$np,#8*8
  665. mov $topmost,xzr // initial top-most carry
  666. mov $tp,sp
  667. mov $cnt,#8
  668. .Lsqr8x_reduction:
  669. // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
  670. mul $t1,$a1,$na0
  671. sub $cnt,$cnt,#1
  672. mul $t2,$a2,$na0
  673. str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
  674. mul $t3,$a3,$na0
  675. // (*) adds xzr,$acc0,$t0
  676. subs xzr,$acc0,#1 // (*)
  677. mul $t0,$a4,$na0
  678. adcs $acc0,$acc1,$t1
  679. mul $t1,$a5,$na0
  680. adcs $acc1,$acc2,$t2
  681. mul $t2,$a6,$na0
  682. adcs $acc2,$acc3,$t3
  683. mul $t3,$a7,$na0
  684. adcs $acc3,$acc4,$t0
  685. umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
  686. adcs $acc4,$acc5,$t1
  687. umulh $t1,$a1,$na0
  688. adcs $acc5,$acc6,$t2
  689. umulh $t2,$a2,$na0
  690. adcs $acc6,$acc7,$t3
  691. umulh $t3,$a3,$na0
  692. adc $acc7,xzr,xzr
  693. adds $acc0,$acc0,$t0
  694. umulh $t0,$a4,$na0
  695. adcs $acc1,$acc1,$t1
  696. umulh $t1,$a5,$na0
  697. adcs $acc2,$acc2,$t2
  698. umulh $t2,$a6,$na0
  699. adcs $acc3,$acc3,$t3
  700. umulh $t3,$a7,$na0
  701. mul $na0,$n0,$acc0 // next t[0]*n0
  702. adcs $acc4,$acc4,$t0
  703. adcs $acc5,$acc5,$t1
  704. adcs $acc6,$acc6,$t2
  705. adc $acc7,$acc7,$t3
  706. cbnz $cnt,.Lsqr8x_reduction
  707. ldp $t0,$t1,[$tp,#8*0]
  708. ldp $t2,$t3,[$tp,#8*2]
  709. mov $rp,$tp
  710. sub $cnt,$np_end,$np // done yet?
  711. adds $acc0,$acc0,$t0
  712. adcs $acc1,$acc1,$t1
  713. ldp $t0,$t1,[$tp,#8*4]
  714. adcs $acc2,$acc2,$t2
  715. adcs $acc3,$acc3,$t3
  716. ldp $t2,$t3,[$tp,#8*6]
  717. adcs $acc4,$acc4,$t0
  718. adcs $acc5,$acc5,$t1
  719. adcs $acc6,$acc6,$t2
  720. adcs $acc7,$acc7,$t3
  721. //adc $carry,xzr,xzr // moved below
  722. cbz $cnt,.Lsqr8x8_post_condition
  723. ldr $n0,[$tp,#-8*8]
  724. ldp $a0,$a1,[$np,#8*0]
  725. ldp $a2,$a3,[$np,#8*2]
  726. ldp $a4,$a5,[$np,#8*4]
  727. mov $cnt,#-8*8
  728. ldp $a6,$a7,[$np,#8*6]
  729. add $np,$np,#8*8
  730. .Lsqr8x_tail:
  731. mul $t0,$a0,$n0
  732. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  733. mul $t1,$a1,$n0
  734. add $cnt,$cnt,#8
  735. mul $t2,$a2,$n0
  736. mul $t3,$a3,$n0
  737. adds $acc0,$acc0,$t0
  738. mul $t0,$a4,$n0
  739. adcs $acc1,$acc1,$t1
  740. mul $t1,$a5,$n0
  741. adcs $acc2,$acc2,$t2
  742. mul $t2,$a6,$n0
  743. adcs $acc3,$acc3,$t3
  744. mul $t3,$a7,$n0
  745. adcs $acc4,$acc4,$t0
  746. umulh $t0,$a0,$n0
  747. adcs $acc5,$acc5,$t1
  748. umulh $t1,$a1,$n0
  749. adcs $acc6,$acc6,$t2
  750. umulh $t2,$a2,$n0
  751. adcs $acc7,$acc7,$t3
  752. umulh $t3,$a3,$n0
  753. adc $carry,$carry,xzr
  754. str $acc0,[$tp],#8
  755. adds $acc0,$acc1,$t0
  756. umulh $t0,$a4,$n0
  757. adcs $acc1,$acc2,$t1
  758. umulh $t1,$a5,$n0
  759. adcs $acc2,$acc3,$t2
  760. umulh $t2,$a6,$n0
  761. adcs $acc3,$acc4,$t3
  762. umulh $t3,$a7,$n0
  763. ldr $n0,[$rp,$cnt]
  764. adcs $acc4,$acc5,$t0
  765. adcs $acc5,$acc6,$t1
  766. adcs $acc6,$acc7,$t2
  767. adcs $acc7,$carry,$t3
  768. //adc $carry,xzr,xzr // moved above
  769. cbnz $cnt,.Lsqr8x_tail
  770. // note that carry flag is guaranteed
  771. // to be zero at this point
  772. ldp $a0,$a1,[$tp,#8*0]
  773. sub $cnt,$np_end,$np // done yet?
  774. sub $t2,$np_end,$num // rewinded np
  775. ldp $a2,$a3,[$tp,#8*2]
  776. ldp $a4,$a5,[$tp,#8*4]
  777. ldp $a6,$a7,[$tp,#8*6]
  778. cbz $cnt,.Lsqr8x_tail_break
  779. ldr $n0,[$rp,#-8*8]
  780. adds $acc0,$acc0,$a0
  781. adcs $acc1,$acc1,$a1
  782. ldp $a0,$a1,[$np,#8*0]
  783. adcs $acc2,$acc2,$a2
  784. adcs $acc3,$acc3,$a3
  785. ldp $a2,$a3,[$np,#8*2]
  786. adcs $acc4,$acc4,$a4
  787. adcs $acc5,$acc5,$a5
  788. ldp $a4,$a5,[$np,#8*4]
  789. adcs $acc6,$acc6,$a6
  790. mov $cnt,#-8*8
  791. adcs $acc7,$acc7,$a7
  792. ldp $a6,$a7,[$np,#8*6]
  793. add $np,$np,#8*8
  794. //adc $carry,xzr,xzr // moved above
  795. b .Lsqr8x_tail
  796. .align 4
  797. .Lsqr8x_tail_break:
  798. ldr $n0,[x29,#112] // pull n0
  799. add $cnt,$tp,#8*8 // end of current t[num] window
  800. subs xzr,$topmost,#1 // "move" top-most carry to carry bit
  801. adcs $t0,$acc0,$a0
  802. adcs $t1,$acc1,$a1
  803. ldp $acc0,$acc1,[$rp,#8*0]
  804. adcs $acc2,$acc2,$a2
  805. ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
  806. adcs $acc3,$acc3,$a3
  807. ldp $a2,$a3,[$t2,#8*2]
  808. adcs $acc4,$acc4,$a4
  809. adcs $acc5,$acc5,$a5
  810. ldp $a4,$a5,[$t2,#8*4]
  811. adcs $acc6,$acc6,$a6
  812. adcs $acc7,$acc7,$a7
  813. ldp $a6,$a7,[$t2,#8*6]
  814. add $np,$t2,#8*8
  815. adc $topmost,xzr,xzr // top-most carry
  816. mul $na0,$n0,$acc0
  817. stp $t0,$t1,[$tp,#8*0]
  818. stp $acc2,$acc3,[$tp,#8*2]
  819. ldp $acc2,$acc3,[$rp,#8*2]
  820. stp $acc4,$acc5,[$tp,#8*4]
  821. ldp $acc4,$acc5,[$rp,#8*4]
  822. cmp $cnt,x29 // did we hit the bottom?
  823. stp $acc6,$acc7,[$tp,#8*6]
  824. mov $tp,$rp // slide the window
  825. ldp $acc6,$acc7,[$rp,#8*6]
  826. mov $cnt,#8
  827. b.ne .Lsqr8x_reduction
  828. // Final step. We see if result is larger than modulus, and
  829. // if it is, subtract the modulus. But comparison implies
  830. // subtraction. So we subtract modulus, see if it borrowed,
  831. // and conditionally copy original value.
  832. ldr $rp,[x29,#96] // pull rp
  833. add $tp,$tp,#8*8
  834. subs $t0,$acc0,$a0
  835. sbcs $t1,$acc1,$a1
  836. sub $cnt,$num,#8*8
  837. mov $ap_end,$rp // $rp copy
  838. .Lsqr8x_sub:
  839. sbcs $t2,$acc2,$a2
  840. ldp $a0,$a1,[$np,#8*0]
  841. sbcs $t3,$acc3,$a3
  842. stp $t0,$t1,[$rp,#8*0]
  843. sbcs $t0,$acc4,$a4
  844. ldp $a2,$a3,[$np,#8*2]
  845. sbcs $t1,$acc5,$a5
  846. stp $t2,$t3,[$rp,#8*2]
  847. sbcs $t2,$acc6,$a6
  848. ldp $a4,$a5,[$np,#8*4]
  849. sbcs $t3,$acc7,$a7
  850. ldp $a6,$a7,[$np,#8*6]
  851. add $np,$np,#8*8
  852. ldp $acc0,$acc1,[$tp,#8*0]
  853. sub $cnt,$cnt,#8*8
  854. ldp $acc2,$acc3,[$tp,#8*2]
  855. ldp $acc4,$acc5,[$tp,#8*4]
  856. ldp $acc6,$acc7,[$tp,#8*6]
  857. add $tp,$tp,#8*8
  858. stp $t0,$t1,[$rp,#8*4]
  859. sbcs $t0,$acc0,$a0
  860. stp $t2,$t3,[$rp,#8*6]
  861. add $rp,$rp,#8*8
  862. sbcs $t1,$acc1,$a1
  863. cbnz $cnt,.Lsqr8x_sub
  864. sbcs $t2,$acc2,$a2
  865. mov $tp,sp
  866. add $ap,sp,$num
  867. ldp $a0,$a1,[$ap_end,#8*0]
  868. sbcs $t3,$acc3,$a3
  869. stp $t0,$t1,[$rp,#8*0]
  870. sbcs $t0,$acc4,$a4
  871. ldp $a2,$a3,[$ap_end,#8*2]
  872. sbcs $t1,$acc5,$a5
  873. stp $t2,$t3,[$rp,#8*2]
  874. sbcs $t2,$acc6,$a6
  875. ldp $acc0,$acc1,[$ap,#8*0]
  876. sbcs $t3,$acc7,$a7
  877. ldp $acc2,$acc3,[$ap,#8*2]
  878. sbcs xzr,$topmost,xzr // did it borrow?
  879. ldr x30,[x29,#8] // pull return address
  880. stp $t0,$t1,[$rp,#8*4]
  881. stp $t2,$t3,[$rp,#8*6]
  882. sub $cnt,$num,#8*4
  883. .Lsqr4x_cond_copy:
  884. sub $cnt,$cnt,#8*4
  885. csel $t0,$acc0,$a0,lo
  886. stp xzr,xzr,[$tp,#8*0]
  887. csel $t1,$acc1,$a1,lo
  888. ldp $a0,$a1,[$ap_end,#8*4]
  889. ldp $acc0,$acc1,[$ap,#8*4]
  890. csel $t2,$acc2,$a2,lo
  891. stp xzr,xzr,[$tp,#8*2]
  892. add $tp,$tp,#8*4
  893. csel $t3,$acc3,$a3,lo
  894. ldp $a2,$a3,[$ap_end,#8*6]
  895. ldp $acc2,$acc3,[$ap,#8*6]
  896. add $ap,$ap,#8*4
  897. stp $t0,$t1,[$ap_end,#8*0]
  898. stp $t2,$t3,[$ap_end,#8*2]
  899. add $ap_end,$ap_end,#8*4
  900. stp xzr,xzr,[$ap,#8*0]
  901. stp xzr,xzr,[$ap,#8*2]
  902. cbnz $cnt,.Lsqr4x_cond_copy
  903. csel $t0,$acc0,$a0,lo
  904. stp xzr,xzr,[$tp,#8*0]
  905. csel $t1,$acc1,$a1,lo
  906. stp xzr,xzr,[$tp,#8*2]
  907. csel $t2,$acc2,$a2,lo
  908. csel $t3,$acc3,$a3,lo
  909. stp $t0,$t1,[$ap_end,#8*0]
  910. stp $t2,$t3,[$ap_end,#8*2]
  911. b .Lsqr8x_done
  912. .align 4
  913. .Lsqr8x8_post_condition:
  914. adc $carry,xzr,xzr
  915. ldr x30,[x29,#8] // pull return address
  916. // $acc0-7,$carry hold result, $a0-7 hold modulus
  917. subs $a0,$acc0,$a0
  918. ldr $ap,[x29,#96] // pull rp
  919. sbcs $a1,$acc1,$a1
  920. stp xzr,xzr,[sp,#8*0]
  921. sbcs $a2,$acc2,$a2
  922. stp xzr,xzr,[sp,#8*2]
  923. sbcs $a3,$acc3,$a3
  924. stp xzr,xzr,[sp,#8*4]
  925. sbcs $a4,$acc4,$a4
  926. stp xzr,xzr,[sp,#8*6]
  927. sbcs $a5,$acc5,$a5
  928. stp xzr,xzr,[sp,#8*8]
  929. sbcs $a6,$acc6,$a6
  930. stp xzr,xzr,[sp,#8*10]
  931. sbcs $a7,$acc7,$a7
  932. stp xzr,xzr,[sp,#8*12]
  933. sbcs $carry,$carry,xzr // did it borrow?
  934. stp xzr,xzr,[sp,#8*14]
  935. // $a0-7 hold result-modulus
  936. csel $a0,$acc0,$a0,lo
  937. csel $a1,$acc1,$a1,lo
  938. csel $a2,$acc2,$a2,lo
  939. csel $a3,$acc3,$a3,lo
  940. stp $a0,$a1,[$ap,#8*0]
  941. csel $a4,$acc4,$a4,lo
  942. csel $a5,$acc5,$a5,lo
  943. stp $a2,$a3,[$ap,#8*2]
  944. csel $a6,$acc6,$a6,lo
  945. csel $a7,$acc7,$a7,lo
  946. stp $a4,$a5,[$ap,#8*4]
  947. stp $a6,$a7,[$ap,#8*6]
  948. .Lsqr8x_done:
  949. ldp x19,x20,[x29,#16]
  950. mov sp,x29
  951. ldp x21,x22,[x29,#32]
  952. mov x0,#1
  953. ldp x23,x24,[x29,#48]
  954. ldp x25,x26,[x29,#64]
  955. ldp x27,x28,[x29,#80]
  956. ldr x29,[sp],#128
  957. ret
  958. .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
  959. ___
  960. }
  961. {
  962. ########################################################################
  963. # Even though this might look as ARMv8 adaptation of mulx4x_mont from
  964. # x86_64-mont5 module, it's different in sense that it performs
  965. # reduction 256 bits at a time.
  966. my ($a0,$a1,$a2,$a3,
  967. $t0,$t1,$t2,$t3,
  968. $m0,$m1,$m2,$m3,
  969. $acc0,$acc1,$acc2,$acc3,$acc4,
  970. $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
  # Register aliases private to __bn_mul4x_mont. $bp_end and $carry both reuse
  # the register that carries rp on entry, which is why rp is offloaded to
  # [x29,#96] before that register is recycled as the carry word; $bp_end is
  # not referenced by the code emitted below. $topmost lives in x30, so the
  # return address is re-read from [x29,#8] before the routine returns.
  971. my $bp_end=$rp;
  972. my ($carry,$topmost) = ($rp,"x30");
  # __bn_mul4x_mont: Montgomery multiplication that walks a[], b[] and n[]
  # four 64-bit words at a time. $num arrives as a word count and is turned
  # into a byte count right after the frame is set up; $n0 arrives as a
  # pointer and is dereferenced once.
  # NOTE(review): every loop exit is tested only after a whole group of four
  # words, so num is presumably a non-zero multiple of 4 -- confirm against
  # the dispatcher that branches here.
  #
  # Stack layout while the routine runs:
  #   [x29,#0]  .. [x29,#95]   saved x29/x30 and x19-x28
  #   [x29,#96], [x29,#104]    offloaded rp and &b[num]
  #   [sp, x29)                num*8+32 bytes of scratch: the four t[0]*n0
  #                            multipliers of the current pass at [sp,#0..31],
  #                            followed by the num-word accumulator t[]
  #
  # Flow, by label:
  #   .Loop_mul4x_1st_reduction  a[0..3]*b[0..3] folded with n[0..3]; each
  #                              t[0]*n0 is put aside at sp for the tail
  #   .Lmul4x4_post_condition    taken when a[] is exhausted after those four
  #                              words: subtract n[0..3], keep the original
  #                              value if that borrowed, wipe 64 scratch bytes
  #   .Loop_mul4x_1st_tail       remaining a[]/n[] words for b[0..3]; writes t[]
  #   .Lmul4x_proceed,
  #   .Loop_mul4x_reduction,
  #   .Loop_mul4x_tail,
  #   .Loop_mul4x_break          the same two phases for every further group
  #                              of four b[] words, accumulating into t[];
  #                              $topmost carries the top word between passes
  #   .Lmul4x_post, .Lmul4x_sub  store t[]-n[] to rp
  #   .Lmul4x_cond_copy          if the subtraction borrowed, overwrite rp
  #                              with t[]; zero the scratch area on the way
  #
  # Lines tagged (*) show a mul/adds pair that has been replaced by
  # "subs xzr,acc0,#1"; presumably only the carry out of that addition is
  # needed there -- TODO confirm against the explanation in the generic path.
  #
  # The closing wipe after .Lmul4x_cond_copy stores zero pairs at word offsets
  # 0, 2, 4 and 6 from $tp, i.e. up to but not including x29. The earlier
  # offsets (0, 2, 3, 4) stopped 16 bytes short and left the two topmost
  # words of t[] behind on the stack.
  #
  # Returns with x0 = 1.
  973. $code.=<<___;
  974. .type __bn_mul4x_mont,%function
  975. .align 5
  976. __bn_mul4x_mont:
  977. stp x29,x30,[sp,#-128]!
  978. add x29,sp,#0
  979. stp x19,x20,[sp,#16]
  980. stp x21,x22,[sp,#32]
  981. stp x23,x24,[sp,#48]
  982. stp x25,x26,[sp,#64]
  983. stp x27,x28,[sp,#80]
  984. sub $tp,sp,$num,lsl#3
  985. lsl $num,$num,#3
  986. ldr $n0,[$n0] // *n0
  987. sub sp,$tp,#8*4 // alloca
  988. add $t0,$bp,$num
  989. add $ap_end,$ap,$num
  990. stp $rp,$t0,[x29,#96] // offload rp and &b[num]
  991. ldr $bi,[$bp,#8*0] // b[0]
  992. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  993. ldp $a2,$a3,[$ap,#8*2]
  994. add $ap,$ap,#8*4
  995. mov $acc0,xzr
  996. mov $acc1,xzr
  997. mov $acc2,xzr
  998. mov $acc3,xzr
  999. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1000. ldp $m2,$m3,[$np,#8*2]
  1001. adds $np,$np,#8*4 // clear carry bit
  1002. mov $carry,xzr
  1003. mov $cnt,#0
  1004. mov $tp,sp
  1005. .Loop_mul4x_1st_reduction:
  1006. mul $t0,$a0,$bi // lo(a[0..3]*b[0])
  1007. adc $carry,$carry,xzr // modulo-scheduled
  1008. mul $t1,$a1,$bi
  1009. add $cnt,$cnt,#8
  1010. mul $t2,$a2,$bi
  1011. and $cnt,$cnt,#31
  1012. mul $t3,$a3,$bi
  1013. adds $acc0,$acc0,$t0
  1014. umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
  1015. adcs $acc1,$acc1,$t1
  1016. mul $mi,$acc0,$n0 // t[0]*n0
  1017. adcs $acc2,$acc2,$t2
  1018. umulh $t1,$a1,$bi
  1019. adcs $acc3,$acc3,$t3
  1020. umulh $t2,$a2,$bi
  1021. adc $acc4,xzr,xzr
  1022. umulh $t3,$a3,$bi
  1023. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1024. adds $acc1,$acc1,$t0
  1025. // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
  1026. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1027. adcs $acc2,$acc2,$t1
  1028. mul $t1,$m1,$mi
  1029. adcs $acc3,$acc3,$t2
  1030. mul $t2,$m2,$mi
  1031. adc $acc4,$acc4,$t3 // can't overflow
  1032. mul $t3,$m3,$mi
  1033. // (*) adds xzr,$acc0,$t0
  1034. subs xzr,$acc0,#1 // (*)
  1035. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
  1036. adcs $acc0,$acc1,$t1
  1037. umulh $t1,$m1,$mi
  1038. adcs $acc1,$acc2,$t2
  1039. umulh $t2,$m2,$mi
  1040. adcs $acc2,$acc3,$t3
  1041. umulh $t3,$m3,$mi
  1042. adcs $acc3,$acc4,$carry
  1043. adc $carry,xzr,xzr
  1044. adds $acc0,$acc0,$t0
  1045. sub $t0,$ap_end,$ap
  1046. adcs $acc1,$acc1,$t1
  1047. adcs $acc2,$acc2,$t2
  1048. adcs $acc3,$acc3,$t3
  1049. //adc $carry,$carry,xzr
  1050. cbnz $cnt,.Loop_mul4x_1st_reduction
  1051. cbz $t0,.Lmul4x4_post_condition
  1052. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1053. ldp $a2,$a3,[$ap,#8*2]
  1054. add $ap,$ap,#8*4
  1055. ldr $mi,[sp] // a[0]*n0
  1056. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1057. ldp $m2,$m3,[$np,#8*2]
  1058. add $np,$np,#8*4
  1059. .Loop_mul4x_1st_tail:
  1060. mul $t0,$a0,$bi // lo(a[4..7]*b[i])
  1061. adc $carry,$carry,xzr // modulo-scheduled
  1062. mul $t1,$a1,$bi
  1063. add $cnt,$cnt,#8
  1064. mul $t2,$a2,$bi
  1065. and $cnt,$cnt,#31
  1066. mul $t3,$a3,$bi
  1067. adds $acc0,$acc0,$t0
  1068. umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
  1069. adcs $acc1,$acc1,$t1
  1070. umulh $t1,$a1,$bi
  1071. adcs $acc2,$acc2,$t2
  1072. umulh $t2,$a2,$bi
  1073. adcs $acc3,$acc3,$t3
  1074. umulh $t3,$a3,$bi
  1075. adc $acc4,xzr,xzr
  1076. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1077. adds $acc1,$acc1,$t0
  1078. mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
  1079. adcs $acc2,$acc2,$t1
  1080. mul $t1,$m1,$mi
  1081. adcs $acc3,$acc3,$t2
  1082. mul $t2,$m2,$mi
  1083. adc $acc4,$acc4,$t3 // can't overflow
  1084. mul $t3,$m3,$mi
  1085. adds $acc0,$acc0,$t0
  1086. umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
  1087. adcs $acc1,$acc1,$t1
  1088. umulh $t1,$m1,$mi
  1089. adcs $acc2,$acc2,$t2
  1090. umulh $t2,$m2,$mi
  1091. adcs $acc3,$acc3,$t3
  1092. adcs $acc4,$acc4,$carry
  1093. umulh $t3,$m3,$mi
  1094. adc $carry,xzr,xzr
  1095. ldr $mi,[sp,$cnt] // next t[0]*n0
  1096. str $acc0,[$tp],#8 // result!!!
  1097. adds $acc0,$acc1,$t0
  1098. sub $t0,$ap_end,$ap // done yet?
  1099. adcs $acc1,$acc2,$t1
  1100. adcs $acc2,$acc3,$t2
  1101. adcs $acc3,$acc4,$t3
  1102. //adc $carry,$carry,xzr
  1103. cbnz $cnt,.Loop_mul4x_1st_tail
  1104. sub $t1,$ap_end,$num // rewinded $ap
  1105. cbz $t0,.Lmul4x_proceed
  1106. ldp $a0,$a1,[$ap,#8*0]
  1107. ldp $a2,$a3,[$ap,#8*2]
  1108. add $ap,$ap,#8*4
  1109. ldp $m0,$m1,[$np,#8*0]
  1110. ldp $m2,$m3,[$np,#8*2]
  1111. add $np,$np,#8*4
  1112. b .Loop_mul4x_1st_tail
  1113. .align 5
  1114. .Lmul4x_proceed:
  1115. ldr $bi,[$bp,#8*4]! // *++b
  1116. adc $topmost,$carry,xzr
  1117. ldp $a0,$a1,[$t1,#8*0] // a[0..3]
  1118. sub $np,$np,$num // rewind np
  1119. ldp $a2,$a3,[$t1,#8*2]
  1120. add $ap,$t1,#8*4
  1121. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1122. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1123. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1124. ldp $acc2,$acc3,[sp,#8*6]
  1125. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1126. mov $tp,sp
  1127. ldp $m2,$m3,[$np,#8*2]
  1128. adds $np,$np,#8*4 // clear carry bit
  1129. mov $carry,xzr
  1130. .align 4
  1131. .Loop_mul4x_reduction:
  1132. mul $t0,$a0,$bi // lo(a[0..3]*b[4])
  1133. adc $carry,$carry,xzr // modulo-scheduled
  1134. mul $t1,$a1,$bi
  1135. add $cnt,$cnt,#8
  1136. mul $t2,$a2,$bi
  1137. and $cnt,$cnt,#31
  1138. mul $t3,$a3,$bi
  1139. adds $acc0,$acc0,$t0
  1140. umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
  1141. adcs $acc1,$acc1,$t1
  1142. mul $mi,$acc0,$n0 // t[0]*n0
  1143. adcs $acc2,$acc2,$t2
  1144. umulh $t1,$a1,$bi
  1145. adcs $acc3,$acc3,$t3
  1146. umulh $t2,$a2,$bi
  1147. adc $acc4,xzr,xzr
  1148. umulh $t3,$a3,$bi
  1149. ldr $bi,[$bp,$cnt] // next b[i]
  1150. adds $acc1,$acc1,$t0
  1151. // (*) mul $t0,$m0,$mi
  1152. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1153. adcs $acc2,$acc2,$t1
  1154. mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
  1155. adcs $acc3,$acc3,$t2
  1156. mul $t2,$m2,$mi
  1157. adc $acc4,$acc4,$t3 // can't overflow
  1158. mul $t3,$m3,$mi
  1159. // (*) adds xzr,$acc0,$t0
  1160. subs xzr,$acc0,#1 // (*)
  1161. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
  1162. adcs $acc0,$acc1,$t1
  1163. umulh $t1,$m1,$mi
  1164. adcs $acc1,$acc2,$t2
  1165. umulh $t2,$m2,$mi
  1166. adcs $acc2,$acc3,$t3
  1167. umulh $t3,$m3,$mi
  1168. adcs $acc3,$acc4,$carry
  1169. adc $carry,xzr,xzr
  1170. adds $acc0,$acc0,$t0
  1171. adcs $acc1,$acc1,$t1
  1172. adcs $acc2,$acc2,$t2
  1173. adcs $acc3,$acc3,$t3
  1174. //adc $carry,$carry,xzr
  1175. cbnz $cnt,.Loop_mul4x_reduction
  1176. adc $carry,$carry,xzr
  1177. ldp $t0,$t1,[$tp,#8*4] // t[4..7]
  1178. ldp $t2,$t3,[$tp,#8*6]
  1179. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1180. ldp $a2,$a3,[$ap,#8*2]
  1181. add $ap,$ap,#8*4
  1182. adds $acc0,$acc0,$t0
  1183. adcs $acc1,$acc1,$t1
  1184. adcs $acc2,$acc2,$t2
  1185. adcs $acc3,$acc3,$t3
  1186. //adc $carry,$carry,xzr
  1187. ldr $mi,[sp] // t[0]*n0
  1188. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1189. ldp $m2,$m3,[$np,#8*2]
  1190. add $np,$np,#8*4
  1191. .align 4
  1192. .Loop_mul4x_tail:
  1193. mul $t0,$a0,$bi // lo(a[4..7]*b[4])
  1194. adc $carry,$carry,xzr // modulo-scheduled
  1195. mul $t1,$a1,$bi
  1196. add $cnt,$cnt,#8
  1197. mul $t2,$a2,$bi
  1198. and $cnt,$cnt,#31
  1199. mul $t3,$a3,$bi
  1200. adds $acc0,$acc0,$t0
  1201. umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
  1202. adcs $acc1,$acc1,$t1
  1203. umulh $t1,$a1,$bi
  1204. adcs $acc2,$acc2,$t2
  1205. umulh $t2,$a2,$bi
  1206. adcs $acc3,$acc3,$t3
  1207. umulh $t3,$a3,$bi
  1208. adc $acc4,xzr,xzr
  1209. ldr $bi,[$bp,$cnt] // next b[i]
  1210. adds $acc1,$acc1,$t0
  1211. mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
  1212. adcs $acc2,$acc2,$t1
  1213. mul $t1,$m1,$mi
  1214. adcs $acc3,$acc3,$t2
  1215. mul $t2,$m2,$mi
  1216. adc $acc4,$acc4,$t3 // can't overflow
  1217. mul $t3,$m3,$mi
  1218. adds $acc0,$acc0,$t0
  1219. umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
  1220. adcs $acc1,$acc1,$t1
  1221. umulh $t1,$m1,$mi
  1222. adcs $acc2,$acc2,$t2
  1223. umulh $t2,$m2,$mi
  1224. adcs $acc3,$acc3,$t3
  1225. umulh $t3,$m3,$mi
  1226. adcs $acc4,$acc4,$carry
  1227. ldr $mi,[sp,$cnt] // next a[0]*n0
  1228. adc $carry,xzr,xzr
  1229. str $acc0,[$tp],#8 // result!!!
  1230. adds $acc0,$acc1,$t0
  1231. sub $t0,$ap_end,$ap // done yet?
  1232. adcs $acc1,$acc2,$t1
  1233. adcs $acc2,$acc3,$t2
  1234. adcs $acc3,$acc4,$t3
  1235. //adc $carry,$carry,xzr
  1236. cbnz $cnt,.Loop_mul4x_tail
  1237. sub $t1,$np,$num // rewinded np?
  1238. adc $carry,$carry,xzr
  1239. cbz $t0,.Loop_mul4x_break
  1240. ldp $t0,$t1,[$tp,#8*4]
  1241. ldp $t2,$t3,[$tp,#8*6]
  1242. ldp $a0,$a1,[$ap,#8*0]
  1243. ldp $a2,$a3,[$ap,#8*2]
  1244. add $ap,$ap,#8*4
  1245. adds $acc0,$acc0,$t0
  1246. adcs $acc1,$acc1,$t1
  1247. adcs $acc2,$acc2,$t2
  1248. adcs $acc3,$acc3,$t3
  1249. //adc $carry,$carry,xzr
  1250. ldp $m0,$m1,[$np,#8*0]
  1251. ldp $m2,$m3,[$np,#8*2]
  1252. add $np,$np,#8*4
  1253. b .Loop_mul4x_tail
  1254. .align 4
  1255. .Loop_mul4x_break:
  1256. ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
  1257. adds $acc0,$acc0,$topmost
  1258. add $bp,$bp,#8*4 // bp++
  1259. adcs $acc1,$acc1,xzr
  1260. sub $ap,$ap,$num // rewind ap
  1261. adcs $acc2,$acc2,xzr
  1262. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1263. adcs $acc3,$acc3,xzr
  1264. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1265. adc $topmost,$carry,xzr
  1266. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1267. cmp $bp,$t3 // done yet?
  1268. ldp $acc2,$acc3,[sp,#8*6]
  1269. ldp $m0,$m1,[$t1,#8*0] // n[0..3]
  1270. ldp $m2,$m3,[$t1,#8*2]
  1271. add $np,$t1,#8*4
  1272. b.eq .Lmul4x_post
  1273. ldr $bi,[$bp]
  1274. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1275. ldp $a2,$a3,[$ap,#8*2]
  1276. adds $ap,$ap,#8*4 // clear carry bit
  1277. mov $carry,xzr
  1278. mov $tp,sp
  1279. b .Loop_mul4x_reduction
  1280. .align 4
  1281. .Lmul4x_post:
  1282. // Final step. We see if result is larger than modulus, and
  1283. // if it is, subtract the modulus. But comparison implies
  1284. // subtraction. So we subtract modulus, see if it borrowed,
  1285. // and conditionally copy original value.
  1286. mov $rp,$t2
  1287. mov $ap_end,$t2 // $rp copy
  1288. subs $t0,$acc0,$m0
  1289. add $tp,sp,#8*8
  1290. sbcs $t1,$acc1,$m1
  1291. sub $cnt,$num,#8*4
  1292. .Lmul4x_sub:
  1293. sbcs $t2,$acc2,$m2
  1294. ldp $m0,$m1,[$np,#8*0]
  1295. sub $cnt,$cnt,#8*4
  1296. ldp $acc0,$acc1,[$tp,#8*0]
  1297. sbcs $t3,$acc3,$m3
  1298. ldp $m2,$m3,[$np,#8*2]
  1299. add $np,$np,#8*4
  1300. ldp $acc2,$acc3,[$tp,#8*2]
  1301. add $tp,$tp,#8*4
  1302. stp $t0,$t1,[$rp,#8*0]
  1303. sbcs $t0,$acc0,$m0
  1304. stp $t2,$t3,[$rp,#8*2]
  1305. add $rp,$rp,#8*4
  1306. sbcs $t1,$acc1,$m1
  1307. cbnz $cnt,.Lmul4x_sub
  1308. sbcs $t2,$acc2,$m2
  1309. mov $tp,sp
  1310. add $ap,sp,#8*4
  1311. ldp $a0,$a1,[$ap_end,#8*0]
  1312. sbcs $t3,$acc3,$m3
  1313. stp $t0,$t1,[$rp,#8*0]
  1314. ldp $a2,$a3,[$ap_end,#8*2]
  1315. stp $t2,$t3,[$rp,#8*2]
  1316. ldp $acc0,$acc1,[$ap,#8*0]
  1317. ldp $acc2,$acc3,[$ap,#8*2]
  1318. sbcs xzr,$topmost,xzr // did it borrow?
  1319. ldr x30,[x29,#8] // pull return address
  1320. sub $cnt,$num,#8*4
  1321. .Lmul4x_cond_copy:
  1322. sub $cnt,$cnt,#8*4
  1323. csel $t0,$acc0,$a0,lo
  1324. stp xzr,xzr,[$tp,#8*0]
  1325. csel $t1,$acc1,$a1,lo
  1326. ldp $a0,$a1,[$ap_end,#8*4]
  1327. ldp $acc0,$acc1,[$ap,#8*4]
  1328. csel $t2,$acc2,$a2,lo
  1329. stp xzr,xzr,[$tp,#8*2]
  1330. add $tp,$tp,#8*4
  1331. csel $t3,$acc3,$a3,lo
  1332. ldp $a2,$a3,[$ap_end,#8*6]
  1333. ldp $acc2,$acc3,[$ap,#8*6]
  1334. add $ap,$ap,#8*4
  1335. stp $t0,$t1,[$ap_end,#8*0]
  1336. stp $t2,$t3,[$ap_end,#8*2]
  1337. add $ap_end,$ap_end,#8*4
  1338. cbnz $cnt,.Lmul4x_cond_copy
  1339. csel $t0,$acc0,$a0,lo
  1340. stp xzr,xzr,[$tp,#8*0]
  1341. csel $t1,$acc1,$a1,lo
  1342. stp xzr,xzr,[$tp,#8*2]
  1343. csel $t2,$acc2,$a2,lo
  1344. stp xzr,xzr,[$tp,#8*4] // wipe the rest of t[] ...
  1345. csel $t3,$acc3,$a3,lo
  1346. stp xzr,xzr,[$tp,#8*6] // ... right up to x29
  1347. stp $t0,$t1,[$ap_end,#8*0]
  1348. stp $t2,$t3,[$ap_end,#8*2]
  1349. b .Lmul4x_done
  1350. .align 4
  1351. .Lmul4x4_post_condition:
  1352. adc $carry,$carry,xzr
  1353. ldr $ap,[x29,#96] // pull rp
  1354. // $acc0-3,$carry hold result, $m0-7 hold modulus
  1355. subs $a0,$acc0,$m0
  1356. ldr x30,[x29,#8] // pull return address
  1357. sbcs $a1,$acc1,$m1
  1358. stp xzr,xzr,[sp,#8*0]
  1359. sbcs $a2,$acc2,$m2
  1360. stp xzr,xzr,[sp,#8*2]
  1361. sbcs $a3,$acc3,$m3
  1362. stp xzr,xzr,[sp,#8*4]
  1363. sbcs xzr,$carry,xzr // did it borrow?
  1364. stp xzr,xzr,[sp,#8*6]
  1365. // $a0-3 hold result-modulus
  1366. csel $a0,$acc0,$a0,lo
  1367. csel $a1,$acc1,$a1,lo
  1368. csel $a2,$acc2,$a2,lo
  1369. csel $a3,$acc3,$a3,lo
  1370. stp $a0,$a1,[$ap,#8*0]
  1371. stp $a2,$a3,[$ap,#8*2]
  1372. .Lmul4x_done:
  1373. ldp x19,x20,[x29,#16]
  1374. mov sp,x29
  1375. ldp x21,x22,[x29,#32]
  1376. mov x0,#1
  1377. ldp x23,x24,[x29,#48]
  1378. ldp x25,x26,[x29,#64]
  1379. ldp x27,x28,[x29,#80]
  1380. ldr x29,[sp],#128
  1381. ret
  1382. .size __bn_mul4x_mont,.-__bn_mul4x_mont
  1383. ___
  1384. }
  # Append the module identification string, then emit everything that has
  # been accumulated in $code.
  1385. $code.=<<___;
  1386. .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  1387. .align 4
  1388. ___
  1389. print $code;
  # Output is buffered, so a write failure (full disk, dead downstream
  # consumer) may only become visible when the handle is closed. Abort with a
  # non-zero status in that case instead of silently leaving truncated
  # assembly behind for the build to pick up.
  1390. close STDOUT or die "error closing STDOUT: $!";