ppc-mont.pl 48 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # April 2006
  15. # "Teaser" Montgomery multiplication module for PowerPC. It's possible
  16. # to gain a bit more by modulo-scheduling outer loop, then dedicated
  17. # squaring procedure should give further 20% and code can be adapted
  18. # for 32-bit application running on 64-bit CPU. As for the latter,
  19. # it won't be able to achieve "native" 64-bit performance, because in
  20. # 32-bit application context every addc instruction will have to be
  21. # expanded as addc, twice right shift by 32 and finally adde, etc.
  22. # So far RSA *sign* performance improvement over pre-bn_mul_mont asm
  23. # for 64-bit application running on PPC970/G5 is:
  24. #
  25. # 512-bit +65%
  26. # 1024-bit +35%
  27. # 2048-bit +18%
  28. # 4096-bit +4%
  29. # September 2016
  30. #
  31. # Add multiplication procedure operating on lengths divisible by 4
  32. # and squaring procedure operating on lengths divisible by 8. Length
  33. # is expressed in number of limbs. RSA private key operations are
  34. # ~35-50% faster (more for longer keys) on contemporary high-end POWER
  35. # processors in 64-bit builds, [mysteriously enough] more in 32-bit
  36. # builds. On low-end 32-bit processors performance improvement turned
  37. # out to be marginal...
  38. $flavour = shift;
  39. if ($flavour =~ /32/) {
  40. $BITS= 32;
  41. $BNSZ= $BITS/8;
  42. $SIZE_T=4;
  43. $RZONE= 224;
  44. $LD= "lwz"; # load
  45. $LDU= "lwzu"; # load and update
  46. $LDX= "lwzx"; # load indexed
  47. $ST= "stw"; # store
  48. $STU= "stwu"; # store and update
  49. $STX= "stwx"; # store indexed
  50. $STUX= "stwux"; # store indexed and update
  51. $UMULL= "mullw"; # unsigned multiply low
  52. $UMULH= "mulhwu"; # unsigned multiply high
  53. $UCMP= "cmplw"; # unsigned compare
  54. $SHRI= "srwi"; # unsigned shift right by immediate
  55. $SHLI= "slwi"; # unsigned shift left by immediate
  56. $PUSH= $ST;
  57. $POP= $LD;
  58. } elsif ($flavour =~ /64/) {
  59. $BITS= 64;
  60. $BNSZ= $BITS/8;
  61. $SIZE_T=8;
  62. $RZONE= 288;
  63. # same as above, but 64-bit mnemonics...
  64. $LD= "ld"; # load
  65. $LDU= "ldu"; # load and update
  66. $LDX= "ldx"; # load indexed
  67. $ST= "std"; # store
  68. $STU= "stdu"; # store and update
  69. $STX= "stdx"; # store indexed
  70. $STUX= "stdux"; # store indexed and update
  71. $UMULL= "mulld"; # unsigned multiply low
  72. $UMULH= "mulhdu"; # unsigned multiply high
  73. $UCMP= "cmpld"; # unsigned compare
  74. $SHRI= "srdi"; # unsigned shift right by immediate
  75. $SHLI= "sldi"; # unsigned shift left by immediate
  76. $PUSH= $ST;
  77. $POP= $LD;
  78. } else { die "nonsense $flavour"; }
  79. $FRAME=8*$SIZE_T+$RZONE;
  80. $LOCALS=8*$SIZE_T;
  81. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  82. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  83. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  84. die "can't locate ppc-xlate.pl";
  85. open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
  86. $sp="r1";
  87. $toc="r2";
  88. $rp="r3";
  89. $ap="r4";
  90. $bp="r5";
  91. $np="r6";
  92. $n0="r7";
  93. $num="r8";
  94. { # emits .bn_mul_mont_int; the lexical register names below are private to this block
  95. my $ovf=$rp; # r3 (the outer $rp) doubles as the "upmost overflow bit" holder
  96. my $rp="r9"; # $rp is reassigned: the generated code copies r3 into r9 on entry
  97. my $aj="r10"; # current ap[j] word
  98. my $nj="r11"; # current np[j] word
  99. my $tj="r12"; # current tp[j] word; also used as scratch (page mask, saved $sp)
  100. # non-volatile registers (r20-r31 are saved to and restored from the stack by the generated code)
  101. my $i="r20"; # byte offset into bp[] for the outer loop
  102. my $j="r21"; # byte offset into ap[]/np[] for the inner loops
  103. my $tp="r22"; # pointer into the temporary vector at $sp+$LOCALS
  104. my $m0="r23"; # bp[i]
  105. my $m1="r24"; # tp[0]*n0
  106. my $lo0="r25"; # low word of the ap[]*bp[i] accumulation
  107. my $hi0="r26"; # high word of the ap[]*bp[i] accumulation
  108. my $lo1="r27"; # low word of the np[]*m1 accumulation
  109. my $hi1="r28"; # high word of the np[]*m1 accumulation
  110. my $alo="r29"; # low word of the ap[j]*m0 product computed one step ahead
  111. my $ahi="r30"; # high word of the ap[j]*m0 product computed one step ahead
  112. my $nlo="r31"; # low word of the np[j]*m1 product computed one step ahead
  113. #
  114. my $nhi="r0"; # high word of the np[j]*m1 product computed one step ahead
  115. $code=<<___; # entry: r3 is copied to $rp and preset to 0
  116. .machine "any"
  117. .text
  118. .globl .bn_mul_mont_int
  119. .align 5
  120. .bn_mul_mont_int:
  121. mr $rp,r3 ; $rp is reassigned
  122. li r3,0
  123. ___
  124. $code.=<<___ if ($BNSZ==4); # 32-bit limbs only: return early, with r3 still 0, when num >= 32
  125. cmpwi $num,32 ; longer key performance is not better
  126. bgelr
  127. ___
  128. $code.=<<___; # frame setup, multiplication loops, final subtraction and conditional copy; r3 is set to 1 before returning
  129. slwi $num,$num,`log($BNSZ)/log(2)`
  130. li $tj,-4096
  131. addi $ovf,$num,$FRAME
  132. subf $ovf,$ovf,$sp ; $sp-$ovf
  133. and $ovf,$ovf,$tj ; minimize TLB usage
  134. subf $ovf,$sp,$ovf ; $ovf-$sp
  135. mr $tj,$sp
  136. srwi $num,$num,`log($BNSZ)/log(2)`
  137. $STUX $sp,$sp,$ovf
  138. $PUSH r20,`-12*$SIZE_T`($tj)
  139. $PUSH r21,`-11*$SIZE_T`($tj)
  140. $PUSH r22,`-10*$SIZE_T`($tj)
  141. $PUSH r23,`-9*$SIZE_T`($tj)
  142. $PUSH r24,`-8*$SIZE_T`($tj)
  143. $PUSH r25,`-7*$SIZE_T`($tj)
  144. $PUSH r26,`-6*$SIZE_T`($tj)
  145. $PUSH r27,`-5*$SIZE_T`($tj)
  146. $PUSH r28,`-4*$SIZE_T`($tj)
  147. $PUSH r29,`-3*$SIZE_T`($tj)
  148. $PUSH r30,`-2*$SIZE_T`($tj)
  149. $PUSH r31,`-1*$SIZE_T`($tj)
  150. $LD $n0,0($n0) ; pull n0[0] value
  151. addi $num,$num,-2 ; adjust $num for counter register
  152. $LD $m0,0($bp) ; m0=bp[0]
  153. $LD $aj,0($ap) ; ap[0]
  154. addi $tp,$sp,$LOCALS
  155. $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
  156. $UMULH $hi0,$aj,$m0
  157. $LD $aj,$BNSZ($ap) ; ap[1]
  158. $LD $nj,0($np) ; np[0]
  159. $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
  160. $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
  161. $UMULH $ahi,$aj,$m0
  162. $UMULL $lo1,$nj,$m1 ; np[0]*m1
  163. $UMULH $hi1,$nj,$m1
  164. $LD $nj,$BNSZ($np) ; np[1]
  165. addc $lo1,$lo1,$lo0
  166. addze $hi1,$hi1
  167. $UMULL $nlo,$nj,$m1 ; np[1]*m1
  168. $UMULH $nhi,$nj,$m1
  169. mtctr $num
  170. li $j,`2*$BNSZ`
  171. .align 4
  172. L1st:
  173. $LDX $aj,$ap,$j ; ap[j]
  174. addc $lo0,$alo,$hi0
  175. $LDX $nj,$np,$j ; np[j]
  176. addze $hi0,$ahi
  177. $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
  178. addc $lo1,$nlo,$hi1
  179. $UMULH $ahi,$aj,$m0
  180. addze $hi1,$nhi
  181. $UMULL $nlo,$nj,$m1 ; np[j]*m1
  182. addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
  183. $UMULH $nhi,$nj,$m1
  184. addze $hi1,$hi1
  185. $ST $lo1,0($tp) ; tp[j-1]
  186. addi $j,$j,$BNSZ ; j++
  187. addi $tp,$tp,$BNSZ ; tp++
  188. bdnz L1st
  189. ;L1st
  190. addc $lo0,$alo,$hi0
  191. addze $hi0,$ahi
  192. addc $lo1,$nlo,$hi1
  193. addze $hi1,$nhi
  194. addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
  195. addze $hi1,$hi1
  196. $ST $lo1,0($tp) ; tp[j-1]
  197. li $ovf,0
  198. addc $hi1,$hi1,$hi0
  199. addze $ovf,$ovf ; upmost overflow bit
  200. $ST $hi1,$BNSZ($tp)
  201. li $i,$BNSZ
  202. .align 4
  203. Louter:
  204. $LDX $m0,$bp,$i ; m0=bp[i]
  205. $LD $aj,0($ap) ; ap[0]
  206. addi $tp,$sp,$LOCALS
  207. $LD $tj,$LOCALS($sp); tp[0]
  208. $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
  209. $UMULH $hi0,$aj,$m0
  210. $LD $aj,$BNSZ($ap) ; ap[1]
  211. $LD $nj,0($np) ; np[0]
  212. addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
  213. $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
  214. addze $hi0,$hi0
  215. $UMULL $m1,$lo0,$n0 ; tp[0]*n0
  216. $UMULH $ahi,$aj,$m0
  217. $UMULL $lo1,$nj,$m1 ; np[0]*m1
  218. $UMULH $hi1,$nj,$m1
  219. $LD $nj,$BNSZ($np) ; np[1]
  220. addc $lo1,$lo1,$lo0
  221. $UMULL $nlo,$nj,$m1 ; np[1]*m1
  222. addze $hi1,$hi1
  223. $UMULH $nhi,$nj,$m1
  224. mtctr $num
  225. li $j,`2*$BNSZ`
  226. .align 4
  227. Linner:
  228. $LDX $aj,$ap,$j ; ap[j]
  229. addc $lo0,$alo,$hi0
  230. $LD $tj,$BNSZ($tp) ; tp[j]
  231. addze $hi0,$ahi
  232. $LDX $nj,$np,$j ; np[j]
  233. addc $lo1,$nlo,$hi1
  234. $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
  235. addze $hi1,$nhi
  236. $UMULH $ahi,$aj,$m0
  237. addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
  238. $UMULL $nlo,$nj,$m1 ; np[j]*m1
  239. addze $hi0,$hi0
  240. $UMULH $nhi,$nj,$m1
  241. addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
  242. addi $j,$j,$BNSZ ; j++
  243. addze $hi1,$hi1
  244. $ST $lo1,0($tp) ; tp[j-1]
  245. addi $tp,$tp,$BNSZ ; tp++
  246. bdnz Linner
  247. ;Linner
  248. $LD $tj,$BNSZ($tp) ; tp[j]
  249. addc $lo0,$alo,$hi0
  250. addze $hi0,$ahi
  251. addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
  252. addze $hi0,$hi0
  253. addc $lo1,$nlo,$hi1
  254. addze $hi1,$nhi
  255. addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
  256. addze $hi1,$hi1
  257. $ST $lo1,0($tp) ; tp[j-1]
  258. addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
  259. li $ovf,0
  260. adde $hi1,$hi1,$hi0
  261. addze $ovf,$ovf
  262. $ST $hi1,$BNSZ($tp)
  263. ;
  264. slwi $tj,$num,`log($BNSZ)/log(2)`
  265. $UCMP $i,$tj
  266. addi $i,$i,$BNSZ
  267. ble Louter
  268. addi $num,$num,2 ; restore $num
  269. subfc $j,$j,$j ; j=0 and "clear" XER[CA]
  270. addi $tp,$sp,$LOCALS
  271. mtctr $num
  272. .align 4
  273. Lsub: $LDX $tj,$tp,$j
  274. $LDX $nj,$np,$j
  275. subfe $aj,$nj,$tj ; tp[j]-np[j]
  276. $STX $aj,$rp,$j
  277. addi $j,$j,$BNSZ
  278. bdnz Lsub
  279. li $j,0
  280. mtctr $num
  281. subfe $ovf,$j,$ovf ; handle upmost overflow bit
  282. .align 4
  283. Lcopy: ; conditional copy
  284. $LDX $tj,$tp,$j
  285. $LDX $aj,$rp,$j
  286. and $tj,$tj,$ovf
  287. andc $aj,$aj,$ovf
  288. $STX $j,$tp,$j ; zap at once
  289. or $aj,$aj,$tj
  290. $STX $aj,$rp,$j
  291. addi $j,$j,$BNSZ
  292. bdnz Lcopy
  293. $POP $tj,0($sp)
  294. li r3,1
  295. $POP r20,`-12*$SIZE_T`($tj)
  296. $POP r21,`-11*$SIZE_T`($tj)
  297. $POP r22,`-10*$SIZE_T`($tj)
  298. $POP r23,`-9*$SIZE_T`($tj)
  299. $POP r24,`-8*$SIZE_T`($tj)
  300. $POP r25,`-7*$SIZE_T`($tj)
  301. $POP r26,`-6*$SIZE_T`($tj)
  302. $POP r27,`-5*$SIZE_T`($tj)
  303. $POP r28,`-4*$SIZE_T`($tj)
  304. $POP r29,`-3*$SIZE_T`($tj)
  305. $POP r30,`-2*$SIZE_T`($tj)
  306. $POP r31,`-1*$SIZE_T`($tj)
  307. mr $sp,$tj
  308. blr
  309. .long 0
  310. .byte 0,12,4,0,0x80,12,6,0
  311. .long 0
  312. .size .bn_mul_mont_int,.-.bn_mul_mont_int
  313. ___
  314. }
  315. if (1) { # always taken; emits .bn_mul4x_mont_int, which handles four limbs per loop iteration
  316. my ($a0,$a1,$a2,$a3,
  317. $t0,$t1,$t2,$t3,
  318. $m0,$m1,$m2,$m3,
  319. $acc0,$acc1,$acc2,$acc3,$acc4,
  320. $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31)); # 22 names mapped, in order, onto r9-r12 and r14-r31
  321. my ($carry,$zero) = ($rp,"r0"); # $carry shares r3 with $rp (the code offloads $rp to the stack before clearing it); $zero is r0
  322. # sp----------->+-------------------------------+
  323. # | saved sp |
  324. # +-------------------------------+
  325. # . .
  326. # +8*size_t +-------------------------------+
  327. # | 4 "n0*t0" |
  328. # . .
  329. # . .
  330. # +12*size_t +-------------------------------+
  331. # | size_t tmp[num] |
  332. # . .
  333. # . .
  334. # . .
  335. # +-------------------------------+
  336. # | topmost carry |
  337. # . .
  338. # -18*size_t +-------------------------------+
  339. # | 18 saved gpr, r14-r31 |
  340. # . .
  341. # . .
  342. # +-------------------------------+
  343. $code.=<<___; # entry: when num is a multiple of 8 and ap equals bp, control goes to .Lsqr8x_do instead of .Lmul4x_do
  344. .globl .bn_mul4x_mont_int
  345. .align 5
  346. .bn_mul4x_mont_int:
  347. andi. r0,$num,7
  348. bne .Lmul4x_do
  349. $UCMP $ap,$bp
  350. bne .Lmul4x_do
  351. b .Lsqr8x_do
  352. .Lmul4x_do:
  353. slwi $num,$num,`log($SIZE_T)/log(2)`
  354. mr $a0,$sp
  355. li $a1,-32*$SIZE_T
  356. sub $a1,$a1,$num
  357. $STUX $sp,$sp,$a1 # alloca
  358. $PUSH r14,-$SIZE_T*18($a0)
  359. $PUSH r15,-$SIZE_T*17($a0)
  360. $PUSH r16,-$SIZE_T*16($a0)
  361. $PUSH r17,-$SIZE_T*15($a0)
  362. $PUSH r18,-$SIZE_T*14($a0)
  363. $PUSH r19,-$SIZE_T*13($a0)
  364. $PUSH r20,-$SIZE_T*12($a0)
  365. $PUSH r21,-$SIZE_T*11($a0)
  366. $PUSH r22,-$SIZE_T*10($a0)
  367. $PUSH r23,-$SIZE_T*9($a0)
  368. $PUSH r24,-$SIZE_T*8($a0)
  369. $PUSH r25,-$SIZE_T*7($a0)
  370. $PUSH r26,-$SIZE_T*6($a0)
  371. $PUSH r27,-$SIZE_T*5($a0)
  372. $PUSH r28,-$SIZE_T*4($a0)
  373. $PUSH r29,-$SIZE_T*3($a0)
  374. $PUSH r30,-$SIZE_T*2($a0)
  375. $PUSH r31,-$SIZE_T*1($a0)
  376. subi $ap,$ap,$SIZE_T # bias by -1
  377. subi $np,$np,$SIZE_T # bias by -1
  378. subi $rp,$rp,$SIZE_T # bias by -1
  379. $LD $n0,0($n0) # *n0
  380. add $t0,$bp,$num
  381. add $ap_end,$ap,$num
  382. subi $t0,$t0,$SIZE_T*4 # &b[num-4]
  383. $LD $bi,$SIZE_T*0($bp) # b[0]
  384. li $acc0,0
  385. $LD $a0,$SIZE_T*1($ap) # a[0..3]
  386. li $acc1,0
  387. $LD $a1,$SIZE_T*2($ap)
  388. li $acc2,0
  389. $LD $a2,$SIZE_T*3($ap)
  390. li $acc3,0
  391. $LDU $a3,$SIZE_T*4($ap)
  392. $LD $m0,$SIZE_T*1($np) # n[0..3]
  393. $LD $m1,$SIZE_T*2($np)
  394. $LD $m2,$SIZE_T*3($np)
  395. $LDU $m3,$SIZE_T*4($np)
  396. $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4]
  397. $PUSH $t0,$SIZE_T*7($sp)
  398. li $carry,0
  399. addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
  400. li $cnt,0
  401. li $zero,0
  402. b .Loop_mul4x_1st_reduction
  403. .align 5
  404. .Loop_mul4x_1st_reduction:
  405. $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0])
  406. addze $carry,$carry # modulo-scheduled
  407. $UMULL $t1,$a1,$bi
  408. addi $cnt,$cnt,$SIZE_T
  409. $UMULL $t2,$a2,$bi
  410. andi. $cnt,$cnt,$SIZE_T*4-1
  411. $UMULL $t3,$a3,$bi
  412. addc $acc0,$acc0,$t0
  413. $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0])
  414. adde $acc1,$acc1,$t1
  415. $UMULH $t1,$a1,$bi
  416. adde $acc2,$acc2,$t2
  417. $UMULL $mi,$acc0,$n0 # t[0]*n0
  418. adde $acc3,$acc3,$t3
  419. $UMULH $t2,$a2,$bi
  420. addze $acc4,$zero
  421. $UMULH $t3,$a3,$bi
  422. $LDX $bi,$bp,$cnt # next b[i] (or b[0])
  423. addc $acc1,$acc1,$t0
  424. # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0)
  425. $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
  426. adde $acc2,$acc2,$t1
  427. $UMULL $t1,$m1,$mi
  428. adde $acc3,$acc3,$t2
  429. $UMULL $t2,$m2,$mi
  430. adde $acc4,$acc4,$t3 # can't overflow
  431. $UMULL $t3,$m3,$mi
  432. # (*) addc $acc0,$acc0,$t0
  433. # (*) As for removal of first multiplication and addition
  434. # instructions. The outcome of first addition is
  435. # guaranteed to be zero, which leaves two computationally
  436. # significant outcomes: it either carries or not. Then
  437. # question is when does it carry? Is there alternative
  438. # way to deduce it? If you follow operations, you can
  439. # observe that condition for carry is quite simple:
  440. # $acc0 being non-zero. So that carry can be calculated
  441. # by adding -1 to $acc0. That's what next instruction does.
  442. addic $acc0,$acc0,-1 # (*), discarded
  443. $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0)
  444. adde $acc0,$acc1,$t1
  445. $UMULH $t1,$m1,$mi
  446. adde $acc1,$acc2,$t2
  447. $UMULH $t2,$m2,$mi
  448. adde $acc2,$acc3,$t3
  449. $UMULH $t3,$m3,$mi
  450. adde $acc3,$acc4,$carry
  451. addze $carry,$zero
  452. addc $acc0,$acc0,$t0
  453. adde $acc1,$acc1,$t1
  454. adde $acc2,$acc2,$t2
  455. adde $acc3,$acc3,$t3
  456. #addze $carry,$carry
  457. bne .Loop_mul4x_1st_reduction
  458. $UCMP $ap_end,$ap
  459. beq .Lmul4x4_post_condition
  460. $LD $a0,$SIZE_T*1($ap) # a[4..7]
  461. $LD $a1,$SIZE_T*2($ap)
  462. $LD $a2,$SIZE_T*3($ap)
  463. $LDU $a3,$SIZE_T*4($ap)
  464. $LD $mi,$SIZE_T*8($sp) # a[0]*n0
  465. $LD $m0,$SIZE_T*1($np) # n[4..7]
  466. $LD $m1,$SIZE_T*2($np)
  467. $LD $m2,$SIZE_T*3($np)
  468. $LDU $m3,$SIZE_T*4($np)
  469. b .Loop_mul4x_1st_tail
  470. .align 5
  471. .Loop_mul4x_1st_tail:
  472. $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i])
  473. addze $carry,$carry # modulo-scheduled
  474. $UMULL $t1,$a1,$bi
  475. addi $cnt,$cnt,$SIZE_T
  476. $UMULL $t2,$a2,$bi
  477. andi. $cnt,$cnt,$SIZE_T*4-1
  478. $UMULL $t3,$a3,$bi
  479. addc $acc0,$acc0,$t0
  480. $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i])
  481. adde $acc1,$acc1,$t1
  482. $UMULH $t1,$a1,$bi
  483. adde $acc2,$acc2,$t2
  484. $UMULH $t2,$a2,$bi
  485. adde $acc3,$acc3,$t3
  486. $UMULH $t3,$a3,$bi
  487. addze $acc4,$zero
  488. $LDX $bi,$bp,$cnt # next b[i] (or b[0])
  489. addc $acc1,$acc1,$t0
  490. $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0)
  491. adde $acc2,$acc2,$t1
  492. $UMULL $t1,$m1,$mi
  493. adde $acc3,$acc3,$t2
  494. $UMULL $t2,$m2,$mi
  495. adde $acc4,$acc4,$t3 # can't overflow
  496. $UMULL $t3,$m3,$mi
  497. addc $acc0,$acc0,$t0
  498. $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0)
  499. adde $acc1,$acc1,$t1
  500. $UMULH $t1,$m1,$mi
  501. adde $acc2,$acc2,$t2
  502. $UMULH $t2,$m2,$mi
  503. adde $acc3,$acc3,$t3
  504. adde $acc4,$acc4,$carry
  505. $UMULH $t3,$m3,$mi
  506. addze $carry,$zero
  507. addi $mi,$sp,$SIZE_T*8
  508. $LDX $mi,$mi,$cnt # next t[0]*n0
  509. $STU $acc0,$SIZE_T($tp) # word of result
  510. addc $acc0,$acc1,$t0
  511. adde $acc1,$acc2,$t1
  512. adde $acc2,$acc3,$t2
  513. adde $acc3,$acc4,$t3
  514. #addze $carry,$carry
  515. bne .Loop_mul4x_1st_tail
  516. sub $t1,$ap_end,$num # rewinded $ap
  517. $UCMP $ap_end,$ap # done yet?
  518. beq .Lmul4x_proceed
  519. $LD $a0,$SIZE_T*1($ap)
  520. $LD $a1,$SIZE_T*2($ap)
  521. $LD $a2,$SIZE_T*3($ap)
  522. $LDU $a3,$SIZE_T*4($ap)
  523. $LD $m0,$SIZE_T*1($np)
  524. $LD $m1,$SIZE_T*2($np)
  525. $LD $m2,$SIZE_T*3($np)
  526. $LDU $m3,$SIZE_T*4($np)
  527. b .Loop_mul4x_1st_tail
  528. .align 5
  529. .Lmul4x_proceed:
  530. $LDU $bi,$SIZE_T*4($bp) # *++b
  531. addze $carry,$carry # topmost carry
  532. $LD $a0,$SIZE_T*1($t1)
  533. $LD $a1,$SIZE_T*2($t1)
  534. $LD $a2,$SIZE_T*3($t1)
  535. $LD $a3,$SIZE_T*4($t1)
  536. addi $ap,$t1,$SIZE_T*4
  537. sub $np,$np,$num # rewind np
  538. $ST $acc0,$SIZE_T*1($tp) # result
  539. $ST $acc1,$SIZE_T*2($tp)
  540. $ST $acc2,$SIZE_T*3($tp)
  541. $ST $acc3,$SIZE_T*4($tp)
  542. $ST $carry,$SIZE_T*5($tp) # save topmost carry
  543. $LD $acc0,$SIZE_T*12($sp) # t[0..3]
  544. $LD $acc1,$SIZE_T*13($sp)
  545. $LD $acc2,$SIZE_T*14($sp)
  546. $LD $acc3,$SIZE_T*15($sp)
  547. $LD $m0,$SIZE_T*1($np) # n[0..3]
  548. $LD $m1,$SIZE_T*2($np)
  549. $LD $m2,$SIZE_T*3($np)
  550. $LDU $m3,$SIZE_T*4($np)
  551. addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
  552. li $carry,0
  553. b .Loop_mul4x_reduction
  554. .align 5
  555. .Loop_mul4x_reduction:
  556. $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4])
  557. addze $carry,$carry # modulo-scheduled
  558. $UMULL $t1,$a1,$bi
  559. addi $cnt,$cnt,$SIZE_T
  560. $UMULL $t2,$a2,$bi
  561. andi. $cnt,$cnt,$SIZE_T*4-1
  562. $UMULL $t3,$a3,$bi
  563. addc $acc0,$acc0,$t0
  564. $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4])
  565. adde $acc1,$acc1,$t1
  566. $UMULH $t1,$a1,$bi
  567. adde $acc2,$acc2,$t2
  568. $UMULL $mi,$acc0,$n0 # t[0]*n0
  569. adde $acc3,$acc3,$t3
  570. $UMULH $t2,$a2,$bi
  571. addze $acc4,$zero
  572. $UMULH $t3,$a3,$bi
  573. $LDX $bi,$bp,$cnt # next b[i]
  574. addc $acc1,$acc1,$t0
  575. # (*) mul $t0,$m0,$mi
  576. $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
  577. adde $acc2,$acc2,$t1
  578. $UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0
  579. adde $acc3,$acc3,$t2
  580. $UMULL $t2,$m2,$mi
  581. adde $acc4,$acc4,$t3 # can't overflow
  582. $UMULL $t3,$m3,$mi
  583. # (*) addc $acc0,$acc0,$t0
  584. addic $acc0,$acc0,-1 # (*), discarded
  585. $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0
  586. adde $acc0,$acc1,$t1
  587. $UMULH $t1,$m1,$mi
  588. adde $acc1,$acc2,$t2
  589. $UMULH $t2,$m2,$mi
  590. adde $acc2,$acc3,$t3
  591. $UMULH $t3,$m3,$mi
  592. adde $acc3,$acc4,$carry
  593. addze $carry,$zero
  594. addc $acc0,$acc0,$t0
  595. adde $acc1,$acc1,$t1
  596. adde $acc2,$acc2,$t2
  597. adde $acc3,$acc3,$t3
  598. #addze $carry,$carry
  599. bne .Loop_mul4x_reduction
  600. $LD $t0,$SIZE_T*5($tp) # t[4..7]
  601. addze $carry,$carry
  602. $LD $t1,$SIZE_T*6($tp)
  603. $LD $t2,$SIZE_T*7($tp)
  604. $LD $t3,$SIZE_T*8($tp)
  605. $LD $a0,$SIZE_T*1($ap) # a[4..7]
  606. $LD $a1,$SIZE_T*2($ap)
  607. $LD $a2,$SIZE_T*3($ap)
  608. $LDU $a3,$SIZE_T*4($ap)
  609. addc $acc0,$acc0,$t0
  610. adde $acc1,$acc1,$t1
  611. adde $acc2,$acc2,$t2
  612. adde $acc3,$acc3,$t3
  613. #addze $carry,$carry
  614. $LD $mi,$SIZE_T*8($sp) # t[0]*n0
  615. $LD $m0,$SIZE_T*1($np) # n[4..7]
  616. $LD $m1,$SIZE_T*2($np)
  617. $LD $m2,$SIZE_T*3($np)
  618. $LDU $m3,$SIZE_T*4($np)
  619. b .Loop_mul4x_tail
  620. .align 5
  621. .Loop_mul4x_tail:
  622. $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4])
  623. addze $carry,$carry # modulo-scheduled
  624. $UMULL $t1,$a1,$bi
  625. addi $cnt,$cnt,$SIZE_T
  626. $UMULL $t2,$a2,$bi
  627. andi. $cnt,$cnt,$SIZE_T*4-1
  628. $UMULL $t3,$a3,$bi
  629. addc $acc0,$acc0,$t0
  630. $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4])
  631. adde $acc1,$acc1,$t1
  632. $UMULH $t1,$a1,$bi
  633. adde $acc2,$acc2,$t2
  634. $UMULH $t2,$a2,$bi
  635. adde $acc3,$acc3,$t3
  636. $UMULH $t3,$a3,$bi
  637. addze $acc4,$zero
  638. $LDX $bi,$bp,$cnt # next b[i]
  639. addc $acc1,$acc1,$t0
  640. $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0)
  641. adde $acc2,$acc2,$t1
  642. $UMULL $t1,$m1,$mi
  643. adde $acc3,$acc3,$t2
  644. $UMULL $t2,$m2,$mi
  645. adde $acc4,$acc4,$t3 # can't overflow
  646. $UMULL $t3,$m3,$mi
  647. addc $acc0,$acc0,$t0
  648. $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0)
  649. adde $acc1,$acc1,$t1
  650. $UMULH $t1,$m1,$mi
  651. adde $acc2,$acc2,$t2
  652. $UMULH $t2,$m2,$mi
  653. adde $acc3,$acc3,$t3
  654. $UMULH $t3,$m3,$mi
  655. adde $acc4,$acc4,$carry
  656. addi $mi,$sp,$SIZE_T*8
  657. $LDX $mi,$mi,$cnt # next a[0]*n0
  658. addze $carry,$zero
  659. $STU $acc0,$SIZE_T($tp) # word of result
  660. addc $acc0,$acc1,$t0
  661. adde $acc1,$acc2,$t1
  662. adde $acc2,$acc3,$t2
  663. adde $acc3,$acc4,$t3
  664. #addze $carry,$carry
  665. bne .Loop_mul4x_tail
  666. $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry
  667. sub $t1,$np,$num # rewinded np?
  668. addze $carry,$carry
  669. $UCMP $ap_end,$ap # done yet?
  670. beq .Loop_mul4x_break
  671. $LD $t1,$SIZE_T*6($tp)
  672. $LD $t2,$SIZE_T*7($tp)
  673. $LD $t3,$SIZE_T*8($tp)
  674. $LD $a0,$SIZE_T*1($ap)
  675. $LD $a1,$SIZE_T*2($ap)
  676. $LD $a2,$SIZE_T*3($ap)
  677. $LDU $a3,$SIZE_T*4($ap)
  678. addc $acc0,$acc0,$t0
  679. adde $acc1,$acc1,$t1
  680. adde $acc2,$acc2,$t2
  681. adde $acc3,$acc3,$t3
  682. #addze $carry,$carry
  683. $LD $m0,$SIZE_T*1($np) # n[4..7]
  684. $LD $m1,$SIZE_T*2($np)
  685. $LD $m2,$SIZE_T*3($np)
  686. $LDU $m3,$SIZE_T*4($np)
  687. b .Loop_mul4x_tail
  688. .align 5
  689. .Loop_mul4x_break:
  690. $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4]
  691. $POP $t3,$SIZE_T*7($sp)
  692. addc $a0,$acc0,$t0 # accumulate topmost carry
  693. $LD $acc0,$SIZE_T*12($sp) # t[0..3]
  694. addze $a1,$acc1
  695. $LD $acc1,$SIZE_T*13($sp)
  696. addze $a2,$acc2
  697. $LD $acc2,$SIZE_T*14($sp)
  698. addze $a3,$acc3
  699. $LD $acc3,$SIZE_T*15($sp)
  700. addze $carry,$carry # topmost carry
  701. $ST $a0,$SIZE_T*1($tp) # result
  702. sub $ap,$ap_end,$num # rewind ap
  703. $ST $a1,$SIZE_T*2($tp)
  704. $ST $a2,$SIZE_T*3($tp)
  705. $ST $a3,$SIZE_T*4($tp)
  706. $ST $carry,$SIZE_T*5($tp) # store topmost carry
  707. $LD $m0,$SIZE_T*1($t1) # n[0..3]
  708. $LD $m1,$SIZE_T*2($t1)
  709. $LD $m2,$SIZE_T*3($t1)
  710. $LD $m3,$SIZE_T*4($t1)
  711. addi $np,$t1,$SIZE_T*4
  712. $UCMP $bp,$t3 # done yet?
  713. beq .Lmul4x_post
  714. $LDU $bi,$SIZE_T*4($bp)
  715. $LD $a0,$SIZE_T*1($ap) # a[0..3]
  716. $LD $a1,$SIZE_T*2($ap)
  717. $LD $a2,$SIZE_T*3($ap)
  718. $LDU $a3,$SIZE_T*4($ap)
  719. li $carry,0
  720. addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
  721. b .Loop_mul4x_reduction
  722. .align 5
  723. .Lmul4x_post:
  724. # Final step. We see if result is larger than modulus, and
  725. # if it is, subtract the modulus. But comparison implies
  726. # subtraction. So we subtract modulus, see if it borrowed,
  727. # and conditionally copy original value.
  728. srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
  729. mr $bp,$t2 # &rp[-1]
  730. subi $cnt,$cnt,1
  731. mr $ap_end,$t2 # &rp[-1] copy
  732. subfc $t0,$m0,$acc0
  733. addi $tp,$sp,$SIZE_T*15
  734. subfe $t1,$m1,$acc1
  735. mtctr $cnt
  736. .Lmul4x_sub:
  737. $LD $m0,$SIZE_T*1($np)
  738. $LD $acc0,$SIZE_T*1($tp)
  739. subfe $t2,$m2,$acc2
  740. $LD $m1,$SIZE_T*2($np)
  741. $LD $acc1,$SIZE_T*2($tp)
  742. subfe $t3,$m3,$acc3
  743. $LD $m2,$SIZE_T*3($np)
  744. $LD $acc2,$SIZE_T*3($tp)
  745. $LDU $m3,$SIZE_T*4($np)
  746. $LDU $acc3,$SIZE_T*4($tp)
  747. $ST $t0,$SIZE_T*1($bp)
  748. $ST $t1,$SIZE_T*2($bp)
  749. subfe $t0,$m0,$acc0
  750. $ST $t2,$SIZE_T*3($bp)
  751. $STU $t3,$SIZE_T*4($bp)
  752. subfe $t1,$m1,$acc1
  753. bdnz .Lmul4x_sub
  754. $LD $a0,$SIZE_T*1($ap_end)
  755. $ST $t0,$SIZE_T*1($bp)
  756. $LD $t0,$SIZE_T*12($sp)
  757. subfe $t2,$m2,$acc2
  758. $LD $a1,$SIZE_T*2($ap_end)
  759. $ST $t1,$SIZE_T*2($bp)
  760. $LD $t1,$SIZE_T*13($sp)
  761. subfe $t3,$m3,$acc3
  762. subfe $carry,$zero,$carry # did it borrow?
  763. addi $tp,$sp,$SIZE_T*12
  764. $LD $a2,$SIZE_T*3($ap_end)
  765. $ST $t2,$SIZE_T*3($bp)
  766. $LD $t2,$SIZE_T*14($sp)
  767. $LD $a3,$SIZE_T*4($ap_end)
  768. $ST $t3,$SIZE_T*4($bp)
  769. $LD $t3,$SIZE_T*15($sp)
  770. mtctr $cnt
  771. .Lmul4x_cond_copy:
  772. and $t0,$t0,$carry
  773. andc $a0,$a0,$carry
  774. $ST $zero,$SIZE_T*0($tp) # wipe stack clean
  775. and $t1,$t1,$carry
  776. andc $a1,$a1,$carry
  777. $ST $zero,$SIZE_T*1($tp)
  778. and $t2,$t2,$carry
  779. andc $a2,$a2,$carry
  780. $ST $zero,$SIZE_T*2($tp)
  781. and $t3,$t3,$carry
  782. andc $a3,$a3,$carry
  783. $ST $zero,$SIZE_T*3($tp)
  784. or $acc0,$t0,$a0
  785. $LD $a0,$SIZE_T*5($ap_end)
  786. $LD $t0,$SIZE_T*4($tp)
  787. or $acc1,$t1,$a1
  788. $LD $a1,$SIZE_T*6($ap_end)
  789. $LD $t1,$SIZE_T*5($tp)
  790. or $acc2,$t2,$a2
  791. $LD $a2,$SIZE_T*7($ap_end)
  792. $LD $t2,$SIZE_T*6($tp)
  793. or $acc3,$t3,$a3
  794. $LD $a3,$SIZE_T*8($ap_end)
  795. $LD $t3,$SIZE_T*7($tp)
  796. addi $tp,$tp,$SIZE_T*4
  797. $ST $acc0,$SIZE_T*1($ap_end)
  798. $ST $acc1,$SIZE_T*2($ap_end)
  799. $ST $acc2,$SIZE_T*3($ap_end)
  800. $STU $acc3,$SIZE_T*4($ap_end)
  801. bdnz .Lmul4x_cond_copy
  802. $POP $bp,0($sp) # pull saved sp
  803. and $t0,$t0,$carry
  804. andc $a0,$a0,$carry
  805. $ST $zero,$SIZE_T*0($tp)
  806. and $t1,$t1,$carry
  807. andc $a1,$a1,$carry
  808. $ST $zero,$SIZE_T*1($tp)
  809. and $t2,$t2,$carry
  810. andc $a2,$a2,$carry
  811. $ST $zero,$SIZE_T*2($tp)
  812. and $t3,$t3,$carry
  813. andc $a3,$a3,$carry
  814. $ST $zero,$SIZE_T*3($tp)
  815. or $acc0,$t0,$a0
  816. or $acc1,$t1,$a1
  817. $ST $zero,$SIZE_T*4($tp)
  818. or $acc2,$t2,$a2
  819. or $acc3,$t3,$a3
  820. $ST $acc0,$SIZE_T*1($ap_end)
  821. $ST $acc1,$SIZE_T*2($ap_end)
  822. $ST $acc2,$SIZE_T*3($ap_end)
  823. $ST $acc3,$SIZE_T*4($ap_end)
  824. b .Lmul4x_done
  825. .align 4
  826. .Lmul4x4_post_condition:
  827. $POP $ap,$SIZE_T*6($sp) # pull &rp[-1]
  828. $POP $bp,0($sp) # pull saved sp
  829. addze $carry,$carry # modulo-scheduled
  830. # $acc0-3,$carry hold result, $m0-3 hold modulus
  831. subfc $a0,$m0,$acc0
  832. subfe $a1,$m1,$acc1
  833. subfe $a2,$m2,$acc2
  834. subfe $a3,$m3,$acc3
  835. subfe $carry,$zero,$carry # did it borrow?
  836. and $m0,$m0,$carry
  837. and $m1,$m1,$carry
  838. addc $a0,$a0,$m0
  839. and $m2,$m2,$carry
  840. adde $a1,$a1,$m1
  841. and $m3,$m3,$carry
  842. adde $a2,$a2,$m2
  843. adde $a3,$a3,$m3
  844. $ST $a0,$SIZE_T*1($ap) # write result
  845. $ST $a1,$SIZE_T*2($ap)
  846. $ST $a2,$SIZE_T*3($ap)
  847. $ST $a3,$SIZE_T*4($ap)
  848. .Lmul4x_done:
  849. $ST $zero,$SIZE_T*8($sp) # wipe stack clean
  850. $ST $zero,$SIZE_T*9($sp)
  851. $ST $zero,$SIZE_T*10($sp)
  852. $ST $zero,$SIZE_T*11($sp)
  853. li r3,1 # signal "done"
  854. $POP r14,-$SIZE_T*18($bp)
  855. $POP r15,-$SIZE_T*17($bp)
  856. $POP r16,-$SIZE_T*16($bp)
  857. $POP r17,-$SIZE_T*15($bp)
  858. $POP r18,-$SIZE_T*14($bp)
  859. $POP r19,-$SIZE_T*13($bp)
  860. $POP r20,-$SIZE_T*12($bp)
  861. $POP r21,-$SIZE_T*11($bp)
  862. $POP r22,-$SIZE_T*10($bp)
  863. $POP r23,-$SIZE_T*9($bp)
  864. $POP r24,-$SIZE_T*8($bp)
  865. $POP r25,-$SIZE_T*7($bp)
  866. $POP r26,-$SIZE_T*6($bp)
  867. $POP r27,-$SIZE_T*5($bp)
  868. $POP r28,-$SIZE_T*4($bp)
  869. $POP r29,-$SIZE_T*3($bp)
  870. $POP r30,-$SIZE_T*2($bp)
  871. $POP r31,-$SIZE_T*1($bp)
  872. mr $sp,$bp
  873. blr
  874. .long 0
  875. .byte 0,12,4,0x20,0x80,18,6,0
  876. .long 0
  877. .size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int
  878. ___
  879. }
  880. if (1) {
  881. ########################################################################
  882. # Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module.
  883. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
  884. my ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
  885. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
  886. my ($cnt,$carry,$zero)=("r30","r31","r0");
  887. my ($tp,$ap_end,$na0)=($bp,$np,$carry);
  888. # sp----------->+-------------------------------+
  889. # | saved sp |
  890. # +-------------------------------+
  891. # . .
  892. # +12*size_t +-------------------------------+
  893. # | size_t tmp[2*num] |
  894. # . .
  895. # . .
  896. # . .
  897. # +-------------------------------+
  898. # . .
  899. # -18*size_t +-------------------------------+
  900. # | 18 saved gpr, r14-r31 |
  901. # . .
  902. # . .
  903. # +-------------------------------+
  904. $code.=<<___;
  905. .align 5
  906. __bn_sqr8x_mont:
  907. .Lsqr8x_do:
  908. mr $a0,$sp
  909. slwi $a1,$num,`log($SIZE_T)/log(2)+1`
  910. li $a2,-32*$SIZE_T
  911. sub $a1,$a2,$a1
  912. slwi $num,$num,`log($SIZE_T)/log(2)`
  913. $STUX $sp,$sp,$a1 # alloca
  914. $PUSH r14,-$SIZE_T*18($a0)
  915. $PUSH r15,-$SIZE_T*17($a0)
  916. $PUSH r16,-$SIZE_T*16($a0)
  917. $PUSH r17,-$SIZE_T*15($a0)
  918. $PUSH r18,-$SIZE_T*14($a0)
  919. $PUSH r19,-$SIZE_T*13($a0)
  920. $PUSH r20,-$SIZE_T*12($a0)
  921. $PUSH r21,-$SIZE_T*11($a0)
  922. $PUSH r22,-$SIZE_T*10($a0)
  923. $PUSH r23,-$SIZE_T*9($a0)
  924. $PUSH r24,-$SIZE_T*8($a0)
  925. $PUSH r25,-$SIZE_T*7($a0)
  926. $PUSH r26,-$SIZE_T*6($a0)
  927. $PUSH r27,-$SIZE_T*5($a0)
  928. $PUSH r28,-$SIZE_T*4($a0)
  929. $PUSH r29,-$SIZE_T*3($a0)
  930. $PUSH r30,-$SIZE_T*2($a0)
  931. $PUSH r31,-$SIZE_T*1($a0)
  932. subi $ap,$ap,$SIZE_T # bias by -1
  933. subi $t0,$np,$SIZE_T # bias by -1
  934. subi $rp,$rp,$SIZE_T # bias by -1
  935. $LD $n0,0($n0) # *n0
  936. li $zero,0
  937. add $ap_end,$ap,$num
  938. $LD $a0,$SIZE_T*1($ap)
  939. #li $acc0,0
  940. $LD $a1,$SIZE_T*2($ap)
  941. li $acc1,0
  942. $LD $a2,$SIZE_T*3($ap)
  943. li $acc2,0
  944. $LD $a3,$SIZE_T*4($ap)
  945. li $acc3,0
  946. $LD $a4,$SIZE_T*5($ap)
  947. li $acc4,0
  948. $LD $a5,$SIZE_T*6($ap)
  949. li $acc5,0
  950. $LD $a6,$SIZE_T*7($ap)
  951. li $acc6,0
  952. $LDU $a7,$SIZE_T*8($ap)
  953. li $acc7,0
  954. addi $tp,$sp,$SIZE_T*11 # &tp[-1]
  955. subic. $cnt,$num,$SIZE_T*8
  956. b .Lsqr8x_zero_start
  957. .align 5
  958. .Lsqr8x_zero:
  959. subic. $cnt,$cnt,$SIZE_T*8
  960. $ST $zero,$SIZE_T*1($tp)
  961. $ST $zero,$SIZE_T*2($tp)
  962. $ST $zero,$SIZE_T*3($tp)
  963. $ST $zero,$SIZE_T*4($tp)
  964. $ST $zero,$SIZE_T*5($tp)
  965. $ST $zero,$SIZE_T*6($tp)
  966. $ST $zero,$SIZE_T*7($tp)
  967. $ST $zero,$SIZE_T*8($tp)
  968. .Lsqr8x_zero_start:
  969. $ST $zero,$SIZE_T*9($tp)
  970. $ST $zero,$SIZE_T*10($tp)
  971. $ST $zero,$SIZE_T*11($tp)
  972. $ST $zero,$SIZE_T*12($tp)
  973. $ST $zero,$SIZE_T*13($tp)
  974. $ST $zero,$SIZE_T*14($tp)
  975. $ST $zero,$SIZE_T*15($tp)
  976. $STU $zero,$SIZE_T*16($tp)
  977. bne .Lsqr8x_zero
  978. $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1]
  979. $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1]
  980. $PUSH $n0,$SIZE_T*8($sp) # offload n0
  981. $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1]
  982. $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry
  983. addi $tp,$sp,$SIZE_T*11 # &tp[-1]
  984. # Multiply everything but a[i]*a[i]
  985. .align 5
  986. .Lsqr8x_outer_loop:
  987. # a[1]a[0] (i)
  988. # a[2]a[0]
  989. # a[3]a[0]
  990. # a[4]a[0]
  991. # a[5]a[0]
  992. # a[6]a[0]
  993. # a[7]a[0]
  994. # a[2]a[1] (ii)
  995. # a[3]a[1]
  996. # a[4]a[1]
  997. # a[5]a[1]
  998. # a[6]a[1]
  999. # a[7]a[1]
  1000. # a[3]a[2] (iii)
  1001. # a[4]a[2]
  1002. # a[5]a[2]
  1003. # a[6]a[2]
  1004. # a[7]a[2]
  1005. # a[4]a[3] (iv)
  1006. # a[5]a[3]
  1007. # a[6]a[3]
  1008. # a[7]a[3]
  1009. # a[5]a[4] (v)
  1010. # a[6]a[4]
  1011. # a[7]a[4]
  1012. # a[6]a[5] (vi)
  1013. # a[7]a[5]
  1014. # a[7]a[6] (vii)
  1015. $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i)
  1016. $UMULL $t1,$a2,$a0
  1017. $UMULL $t2,$a3,$a0
  1018. $UMULL $t3,$a4,$a0
  1019. addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0])
  1020. $UMULL $t0,$a5,$a0
  1021. adde $acc2,$acc2,$t1
  1022. $UMULL $t1,$a6,$a0
  1023. adde $acc3,$acc3,$t2
  1024. $UMULL $t2,$a7,$a0
  1025. adde $acc4,$acc4,$t3
  1026. $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0])
  1027. adde $acc5,$acc5,$t0
  1028. $UMULH $t0,$a2,$a0
  1029. adde $acc6,$acc6,$t1
  1030. $UMULH $t1,$a3,$a0
  1031. adde $acc7,$acc7,$t2
  1032. $UMULH $t2,$a4,$a0
  1033. $ST $acc0,$SIZE_T*1($tp) # t[0]
  1034. addze $acc0,$zero # t[8]
  1035. $ST $acc1,$SIZE_T*2($tp) # t[1]
  1036. addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0])
  1037. $UMULH $t3,$a5,$a0
  1038. adde $acc3,$acc3,$t0
  1039. $UMULH $t0,$a6,$a0
  1040. adde $acc4,$acc4,$t1
  1041. $UMULH $t1,$a7,$a0
  1042. adde $acc5,$acc5,$t2
  1043. $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii)
  1044. adde $acc6,$acc6,$t3
  1045. $UMULL $t3,$a3,$a1
  1046. adde $acc7,$acc7,$t0
  1047. $UMULL $t0,$a4,$a1
  1048. adde $acc0,$acc0,$t1
  1049. $UMULL $t1,$a5,$a1
  1050. addc $acc3,$acc3,$t2
  1051. $UMULL $t2,$a6,$a1
  1052. adde $acc4,$acc4,$t3
  1053. $UMULL $t3,$a7,$a1
  1054. adde $acc5,$acc5,$t0
  1055. $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1])
  1056. adde $acc6,$acc6,$t1
  1057. $UMULH $t1,$a3,$a1
  1058. adde $acc7,$acc7,$t2
  1059. $UMULH $t2,$a4,$a1
  1060. adde $acc0,$acc0,$t3
  1061. $UMULH $t3,$a5,$a1
  1062. $ST $acc2,$SIZE_T*3($tp) # t[2]
  1063. addze $acc1,$zero # t[9]
  1064. $ST $acc3,$SIZE_T*4($tp) # t[3]
  1065. addc $acc4,$acc4,$t0
  1066. $UMULH $t0,$a6,$a1
  1067. adde $acc5,$acc5,$t1
  1068. $UMULH $t1,$a7,$a1
  1069. adde $acc6,$acc6,$t2
  1070. $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii)
  1071. adde $acc7,$acc7,$t3
  1072. $UMULL $t3,$a4,$a2
  1073. adde $acc0,$acc0,$t0
  1074. $UMULL $t0,$a5,$a2
  1075. adde $acc1,$acc1,$t1
  1076. $UMULL $t1,$a6,$a2
  1077. addc $acc5,$acc5,$t2
  1078. $UMULL $t2,$a7,$a2
  1079. adde $acc6,$acc6,$t3
  1080. $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2])
  1081. adde $acc7,$acc7,$t0
  1082. $UMULH $t0,$a4,$a2
  1083. adde $acc0,$acc0,$t1
  1084. $UMULH $t1,$a5,$a2
  1085. adde $acc1,$acc1,$t2
  1086. $UMULH $t2,$a6,$a2
  1087. $ST $acc4,$SIZE_T*5($tp) # t[4]
  1088. addze $acc2,$zero # t[10]
  1089. $ST $acc5,$SIZE_T*6($tp) # t[5]
  1090. addc $acc6,$acc6,$t3
  1091. $UMULH $t3,$a7,$a2
  1092. adde $acc7,$acc7,$t0
  1093. $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv)
  1094. adde $acc0,$acc0,$t1
  1095. $UMULL $t1,$a5,$a3
  1096. adde $acc1,$acc1,$t2
  1097. $UMULL $t2,$a6,$a3
  1098. adde $acc2,$acc2,$t3
  1099. $UMULL $t3,$a7,$a3
  1100. addc $acc7,$acc7,$t0
  1101. $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3])
  1102. adde $acc0,$acc0,$t1
  1103. $UMULH $t1,$a5,$a3
  1104. adde $acc1,$acc1,$t2
  1105. $UMULH $t2,$a6,$a3
  1106. adde $acc2,$acc2,$t3
  1107. $UMULH $t3,$a7,$a3
  1108. $ST $acc6,$SIZE_T*7($tp) # t[6]
  1109. addze $acc3,$zero # t[11]
  1110. $STU $acc7,$SIZE_T*8($tp) # t[7]
  1111. addc $acc0,$acc0,$t0
  1112. $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v)
  1113. adde $acc1,$acc1,$t1
  1114. $UMULL $t1,$a6,$a4
  1115. adde $acc2,$acc2,$t2
  1116. $UMULL $t2,$a7,$a4
  1117. adde $acc3,$acc3,$t3
  1118. $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4])
  1119. addc $acc1,$acc1,$t0
  1120. $UMULH $t0,$a6,$a4
  1121. adde $acc2,$acc2,$t1
  1122. $UMULH $t1,$a7,$a4
  1123. adde $acc3,$acc3,$t2
  1124. $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi)
  1125. addze $acc4,$zero # t[12]
  1126. addc $acc2,$acc2,$t3
  1127. $UMULL $t3,$a7,$a5
  1128. adde $acc3,$acc3,$t0
  1129. $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5])
  1130. adde $acc4,$acc4,$t1
  1131. $UMULH $t1,$a7,$a5
  1132. addc $acc3,$acc3,$t2
  1133. $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii)
  1134. adde $acc4,$acc4,$t3
  1135. $UMULH $t3,$a7,$a6 # hi(a[7]*a[6])
  1136. addze $acc5,$zero # t[13]
  1137. addc $acc4,$acc4,$t0
  1138. $UCMP $ap_end,$ap # done yet?
  1139. adde $acc5,$acc5,$t1
  1140. addc $acc5,$acc5,$t2
  1141. sub $t0,$ap_end,$num # rewinded ap
  1142. addze $acc6,$zero # t[14]
  1143. add $acc6,$acc6,$t3
  1144. beq .Lsqr8x_outer_break
  1145. mr $n0,$a0
  1146. $LD $a0,$SIZE_T*1($tp)
  1147. $LD $a1,$SIZE_T*2($tp)
  1148. $LD $a2,$SIZE_T*3($tp)
  1149. $LD $a3,$SIZE_T*4($tp)
  1150. $LD $a4,$SIZE_T*5($tp)
  1151. $LD $a5,$SIZE_T*6($tp)
  1152. $LD $a6,$SIZE_T*7($tp)
  1153. $LD $a7,$SIZE_T*8($tp)
  1154. addc $acc0,$acc0,$a0
  1155. $LD $a0,$SIZE_T*1($ap)
  1156. adde $acc1,$acc1,$a1
  1157. $LD $a1,$SIZE_T*2($ap)
  1158. adde $acc2,$acc2,$a2
  1159. $LD $a2,$SIZE_T*3($ap)
  1160. adde $acc3,$acc3,$a3
  1161. $LD $a3,$SIZE_T*4($ap)
  1162. adde $acc4,$acc4,$a4
  1163. $LD $a4,$SIZE_T*5($ap)
  1164. adde $acc5,$acc5,$a5
  1165. $LD $a5,$SIZE_T*6($ap)
  1166. adde $acc6,$acc6,$a6
  1167. $LD $a6,$SIZE_T*7($ap)
  1168. subi $rp,$ap,$SIZE_T*7
  1169. addze $acc7,$a7
  1170. $LDU $a7,$SIZE_T*8($ap)
  1171. #addze $carry,$zero # moved below
  1172. li $cnt,0
  1173. b .Lsqr8x_mul
  1174. # a[8]a[0]
  1175. # a[9]a[0]
  1176. # a[a]a[0]
  1177. # a[b]a[0]
  1178. # a[c]a[0]
  1179. # a[d]a[0]
  1180. # a[e]a[0]
  1181. # a[f]a[0]
  1182. # a[8]a[1]
  1183. # a[f]a[1]........................
  1184. # a[8]a[2]
  1185. # a[f]a[2]........................
  1186. # a[8]a[3]
  1187. # a[f]a[3]........................
  1188. # a[8]a[4]
  1189. # a[f]a[4]........................
  1190. # a[8]a[5]
  1191. # a[f]a[5]........................
  1192. # a[8]a[6]
  1193. # a[f]a[6]........................
  1194. # a[8]a[7]
  1195. # a[f]a[7]........................
  1196. .align 5
  1197. .Lsqr8x_mul:
  1198. $UMULL $t0,$a0,$n0
  1199. addze $carry,$zero # carry bit, modulo-scheduled
  1200. $UMULL $t1,$a1,$n0
  1201. addi $cnt,$cnt,$SIZE_T
  1202. $UMULL $t2,$a2,$n0
  1203. andi. $cnt,$cnt,$SIZE_T*8-1
  1204. $UMULL $t3,$a3,$n0
  1205. addc $acc0,$acc0,$t0
  1206. $UMULL $t0,$a4,$n0
  1207. adde $acc1,$acc1,$t1
  1208. $UMULL $t1,$a5,$n0
  1209. adde $acc2,$acc2,$t2
  1210. $UMULL $t2,$a6,$n0
  1211. adde $acc3,$acc3,$t3
  1212. $UMULL $t3,$a7,$n0
  1213. adde $acc4,$acc4,$t0
  1214. $UMULH $t0,$a0,$n0
  1215. adde $acc5,$acc5,$t1
  1216. $UMULH $t1,$a1,$n0
  1217. adde $acc6,$acc6,$t2
  1218. $UMULH $t2,$a2,$n0
  1219. adde $acc7,$acc7,$t3
  1220. $UMULH $t3,$a3,$n0
  1221. addze $carry,$carry
  1222. $STU $acc0,$SIZE_T($tp)
  1223. addc $acc0,$acc1,$t0
  1224. $UMULH $t0,$a4,$n0
  1225. adde $acc1,$acc2,$t1
  1226. $UMULH $t1,$a5,$n0
  1227. adde $acc2,$acc3,$t2
  1228. $UMULH $t2,$a6,$n0
  1229. adde $acc3,$acc4,$t3
  1230. $UMULH $t3,$a7,$n0
  1231. $LDX $n0,$rp,$cnt
  1232. adde $acc4,$acc5,$t0
  1233. adde $acc5,$acc6,$t1
  1234. adde $acc6,$acc7,$t2
  1235. adde $acc7,$carry,$t3
  1236. #addze $carry,$zero # moved above
  1237. bne .Lsqr8x_mul
  1238. # note that carry flag is guaranteed
  1239. # to be zero at this point
  1240. $UCMP $ap,$ap_end # done yet?
  1241. beq .Lsqr8x_break
  1242. $LD $a0,$SIZE_T*1($tp)
  1243. $LD $a1,$SIZE_T*2($tp)
  1244. $LD $a2,$SIZE_T*3($tp)
  1245. $LD $a3,$SIZE_T*4($tp)
  1246. $LD $a4,$SIZE_T*5($tp)
  1247. $LD $a5,$SIZE_T*6($tp)
  1248. $LD $a6,$SIZE_T*7($tp)
  1249. $LD $a7,$SIZE_T*8($tp)
  1250. addc $acc0,$acc0,$a0
  1251. $LD $a0,$SIZE_T*1($ap)
  1252. adde $acc1,$acc1,$a1
  1253. $LD $a1,$SIZE_T*2($ap)
  1254. adde $acc2,$acc2,$a2
  1255. $LD $a2,$SIZE_T*3($ap)
  1256. adde $acc3,$acc3,$a3
  1257. $LD $a3,$SIZE_T*4($ap)
  1258. adde $acc4,$acc4,$a4
  1259. $LD $a4,$SIZE_T*5($ap)
  1260. adde $acc5,$acc5,$a5
  1261. $LD $a5,$SIZE_T*6($ap)
  1262. adde $acc6,$acc6,$a6
  1263. $LD $a6,$SIZE_T*7($ap)
  1264. adde $acc7,$acc7,$a7
  1265. $LDU $a7,$SIZE_T*8($ap)
  1266. #addze $carry,$zero # moved above
  1267. b .Lsqr8x_mul
  1268. .align 5
  1269. .Lsqr8x_break:
  1270. $LD $a0,$SIZE_T*8($rp)
  1271. addi $ap,$rp,$SIZE_T*15
  1272. $LD $a1,$SIZE_T*9($rp)
  1273. sub. $t0,$ap_end,$ap # is it last iteration?
  1274. $LD $a2,$SIZE_T*10($rp)
  1275. sub $t1,$tp,$t0
  1276. $LD $a3,$SIZE_T*11($rp)
  1277. $LD $a4,$SIZE_T*12($rp)
  1278. $LD $a5,$SIZE_T*13($rp)
  1279. $LD $a6,$SIZE_T*14($rp)
  1280. $LD $a7,$SIZE_T*15($rp)
  1281. beq .Lsqr8x_outer_loop
  1282. $ST $acc0,$SIZE_T*1($tp)
  1283. $LD $acc0,$SIZE_T*1($t1)
  1284. $ST $acc1,$SIZE_T*2($tp)
  1285. $LD $acc1,$SIZE_T*2($t1)
  1286. $ST $acc2,$SIZE_T*3($tp)
  1287. $LD $acc2,$SIZE_T*3($t1)
  1288. $ST $acc3,$SIZE_T*4($tp)
  1289. $LD $acc3,$SIZE_T*4($t1)
  1290. $ST $acc4,$SIZE_T*5($tp)
  1291. $LD $acc4,$SIZE_T*5($t1)
  1292. $ST $acc5,$SIZE_T*6($tp)
  1293. $LD $acc5,$SIZE_T*6($t1)
  1294. $ST $acc6,$SIZE_T*7($tp)
  1295. $LD $acc6,$SIZE_T*7($t1)
  1296. $ST $acc7,$SIZE_T*8($tp)
  1297. $LD $acc7,$SIZE_T*8($t1)
  1298. mr $tp,$t1
  1299. b .Lsqr8x_outer_loop
  1300. .align 5
  1301. .Lsqr8x_outer_break:
  1302. ####################################################################
  1303. # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
  1304. $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1]
  1305. $LD $a3,$SIZE_T*2($t0)
  1306. $LD $a5,$SIZE_T*3($t0)
  1307. $LD $a7,$SIZE_T*4($t0)
  1308. addi $ap,$t0,$SIZE_T*4
  1309. # "tp[x]" comments are for num==8 case
  1310. $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting
  1311. $LD $t2,$SIZE_T*14($sp)
  1312. $LD $t3,$SIZE_T*15($sp)
  1313. $LD $t0,$SIZE_T*16($sp)
  1314. $ST $acc0,$SIZE_T*1($tp) # tp[8]=
  1315. srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
  1316. $ST $acc1,$SIZE_T*2($tp)
  1317. subi $cnt,$cnt,1
  1318. $ST $acc2,$SIZE_T*3($tp)
  1319. $ST $acc3,$SIZE_T*4($tp)
  1320. $ST $acc4,$SIZE_T*5($tp)
  1321. $ST $acc5,$SIZE_T*6($tp)
  1322. $ST $acc6,$SIZE_T*7($tp)
  1323. #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting
  1324. addi $tp,$sp,$SIZE_T*11 # &tp[-1]
  1325. $UMULL $acc0,$a1,$a1
  1326. $UMULH $a1,$a1,$a1
  1327. add $acc1,$t1,$t1 # <<1
  1328. $SHRI $t1,$t1,$BITS-1
  1329. $UMULL $a2,$a3,$a3
  1330. $UMULH $a3,$a3,$a3
  1331. addc $acc1,$acc1,$a1
  1332. add $acc2,$t2,$t2
  1333. $SHRI $t2,$t2,$BITS-1
  1334. add $acc3,$t3,$t3
  1335. $SHRI $t3,$t3,$BITS-1
  1336. or $acc2,$acc2,$t1
  1337. mtctr $cnt
  1338. .Lsqr4x_shift_n_add:
  1339. $UMULL $a4,$a5,$a5
  1340. $UMULH $a5,$a5,$a5
  1341. $LD $t1,$SIZE_T*6($tp) # =tp[5]
  1342. $LD $a1,$SIZE_T*1($ap)
  1343. adde $acc2,$acc2,$a2
  1344. add $acc4,$t0,$t0
  1345. $SHRI $t0,$t0,$BITS-1
  1346. or $acc3,$acc3,$t2
  1347. $LD $t2,$SIZE_T*7($tp) # =tp[6]
  1348. adde $acc3,$acc3,$a3
  1349. $LD $a3,$SIZE_T*2($ap)
  1350. add $acc5,$t1,$t1
  1351. $SHRI $t1,$t1,$BITS-1
  1352. or $acc4,$acc4,$t3
  1353. $LD $t3,$SIZE_T*8($tp) # =tp[7]
  1354. $UMULL $a6,$a7,$a7
  1355. $UMULH $a7,$a7,$a7
  1356. adde $acc4,$acc4,$a4
  1357. add $acc6,$t2,$t2
  1358. $SHRI $t2,$t2,$BITS-1
  1359. or $acc5,$acc5,$t0
  1360. $LD $t0,$SIZE_T*9($tp) # =tp[8]
  1361. adde $acc5,$acc5,$a5
  1362. $LD $a5,$SIZE_T*3($ap)
  1363. add $acc7,$t3,$t3
  1364. $SHRI $t3,$t3,$BITS-1
  1365. or $acc6,$acc6,$t1
  1366. $LD $t1,$SIZE_T*10($tp) # =tp[9]
  1367. $UMULL $a0,$a1,$a1
  1368. $UMULH $a1,$a1,$a1
  1369. adde $acc6,$acc6,$a6
  1370. $ST $acc0,$SIZE_T*1($tp) # tp[0]=
  1371. add $acc0,$t0,$t0
  1372. $SHRI $t0,$t0,$BITS-1
  1373. or $acc7,$acc7,$t2
  1374. $LD $t2,$SIZE_T*11($tp) # =tp[10]
  1375. adde $acc7,$acc7,$a7
  1376. $LDU $a7,$SIZE_T*4($ap)
  1377. $ST $acc1,$SIZE_T*2($tp) # tp[1]=
  1378. add $acc1,$t1,$t1
  1379. $SHRI $t1,$t1,$BITS-1
  1380. or $acc0,$acc0,$t3
  1381. $LD $t3,$SIZE_T*12($tp) # =tp[11]
  1382. $UMULL $a2,$a3,$a3
  1383. $UMULH $a3,$a3,$a3
  1384. adde $acc0,$acc0,$a0
  1385. $ST $acc2,$SIZE_T*3($tp) # tp[2]=
  1386. add $acc2,$t2,$t2
  1387. $SHRI $t2,$t2,$BITS-1
  1388. or $acc1,$acc1,$t0
  1389. $LD $t0,$SIZE_T*13($tp) # =tp[12]
  1390. adde $acc1,$acc1,$a1
  1391. $ST $acc3,$SIZE_T*4($tp) # tp[3]=
  1392. $ST $acc4,$SIZE_T*5($tp) # tp[4]=
  1393. $ST $acc5,$SIZE_T*6($tp) # tp[5]=
  1394. $ST $acc6,$SIZE_T*7($tp) # tp[6]=
  1395. $STU $acc7,$SIZE_T*8($tp) # tp[7]=
  1396. add $acc3,$t3,$t3
  1397. $SHRI $t3,$t3,$BITS-1
  1398. or $acc2,$acc2,$t1
  1399. bdnz .Lsqr4x_shift_n_add
  1400. ___
  1401. my ($np,$np_end)=($ap,$ap_end);
  1402. $code.=<<___;
  1403. $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0
  1404. $POP $n0,$SIZE_T*8($sp)
  1405. $UMULL $a4,$a5,$a5
  1406. $UMULH $a5,$a5,$a5
  1407. $ST $acc0,$SIZE_T*1($tp) # tp[8]=
  1408. $LD $acc0,$SIZE_T*12($sp) # =tp[0]
  1409. $LD $t1,$SIZE_T*6($tp) # =tp[13]
  1410. adde $acc2,$acc2,$a2
  1411. add $acc4,$t0,$t0
  1412. $SHRI $t0,$t0,$BITS-1
  1413. or $acc3,$acc3,$t2
  1414. $LD $t2,$SIZE_T*7($tp) # =tp[14]
  1415. adde $acc3,$acc3,$a3
  1416. add $acc5,$t1,$t1
  1417. $SHRI $t1,$t1,$BITS-1
  1418. or $acc4,$acc4,$t3
  1419. $UMULL $a6,$a7,$a7
  1420. $UMULH $a7,$a7,$a7
  1421. adde $acc4,$acc4,$a4
  1422. add $acc6,$t2,$t2
  1423. $SHRI $t2,$t2,$BITS-1
  1424. or $acc5,$acc5,$t0
  1425. $ST $acc1,$SIZE_T*2($tp) # tp[9]=
  1426. $LD $acc1,$SIZE_T*13($sp) # =tp[1]
  1427. adde $acc5,$acc5,$a5
  1428. or $acc6,$acc6,$t1
  1429. $LD $a0,$SIZE_T*1($np)
  1430. $LD $a1,$SIZE_T*2($np)
  1431. adde $acc6,$acc6,$a6
  1432. $LD $a2,$SIZE_T*3($np)
  1433. $LD $a3,$SIZE_T*4($np)
  1434. adde $acc7,$a7,$t2
  1435. $LD $a4,$SIZE_T*5($np)
  1436. $LD $a5,$SIZE_T*6($np)
  1437. ################################################################
  1438. # Reduce by 8 limbs per iteration
  1439. $UMULL $na0,$n0,$acc0 # t[0]*n0
  1440. li $cnt,8
  1441. $LD $a6,$SIZE_T*7($np)
  1442. add $np_end,$np,$num
  1443. $LDU $a7,$SIZE_T*8($np)
  1444. $ST $acc2,$SIZE_T*3($tp) # tp[10]=
  1445. $LD $acc2,$SIZE_T*14($sp)
  1446. $ST $acc3,$SIZE_T*4($tp) # tp[11]=
  1447. $LD $acc3,$SIZE_T*15($sp)
  1448. $ST $acc4,$SIZE_T*5($tp) # tp[12]=
  1449. $LD $acc4,$SIZE_T*16($sp)
  1450. $ST $acc5,$SIZE_T*6($tp) # tp[13]=
  1451. $LD $acc5,$SIZE_T*17($sp)
  1452. $ST $acc6,$SIZE_T*7($tp) # tp[14]=
  1453. $LD $acc6,$SIZE_T*18($sp)
  1454. $ST $acc7,$SIZE_T*8($tp) # tp[15]=
  1455. $LD $acc7,$SIZE_T*19($sp)
  1456. addi $tp,$sp,$SIZE_T*11 # &tp[-1]
  1457. mtctr $cnt
  1458. b .Lsqr8x_reduction
  1459. .align 5
  1460. .Lsqr8x_reduction:
  1461. # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0)
  1462. $UMULL $t1,$a1,$na0
  1463. $UMULL $t2,$a2,$na0
  1464. $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
  1465. $UMULL $t3,$a3,$na0
  1466. # (*) addc $acc0,$acc0,$t0
  1467. addic $acc0,$acc0,-1 # (*)
  1468. $UMULL $t0,$a4,$na0
  1469. adde $acc0,$acc1,$t1
  1470. $UMULL $t1,$a5,$na0
  1471. adde $acc1,$acc2,$t2
  1472. $UMULL $t2,$a6,$na0
  1473. adde $acc2,$acc3,$t3
  1474. $UMULL $t3,$a7,$na0
  1475. adde $acc3,$acc4,$t0
  1476. $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0)
  1477. adde $acc4,$acc5,$t1
  1478. $UMULH $t1,$a1,$na0
  1479. adde $acc5,$acc6,$t2
  1480. $UMULH $t2,$a2,$na0
  1481. adde $acc6,$acc7,$t3
  1482. $UMULH $t3,$a3,$na0
  1483. addze $acc7,$zero
  1484. addc $acc0,$acc0,$t0
  1485. $UMULH $t0,$a4,$na0
  1486. adde $acc1,$acc1,$t1
  1487. $UMULH $t1,$a5,$na0
  1488. adde $acc2,$acc2,$t2
  1489. $UMULH $t2,$a6,$na0
  1490. adde $acc3,$acc3,$t3
  1491. $UMULH $t3,$a7,$na0
  1492. $UMULL $na0,$n0,$acc0 # next t[0]*n0
  1493. adde $acc4,$acc4,$t0
  1494. adde $acc5,$acc5,$t1
  1495. adde $acc6,$acc6,$t2
  1496. adde $acc7,$acc7,$t3
  1497. bdnz .Lsqr8x_reduction
  1498. $LD $t0,$SIZE_T*1($tp)
  1499. $LD $t1,$SIZE_T*2($tp)
  1500. $LD $t2,$SIZE_T*3($tp)
  1501. $LD $t3,$SIZE_T*4($tp)
  1502. subi $rp,$tp,$SIZE_T*7
  1503. $UCMP $np_end,$np # done yet?
  1504. addc $acc0,$acc0,$t0
  1505. $LD $t0,$SIZE_T*5($tp)
  1506. adde $acc1,$acc1,$t1
  1507. $LD $t1,$SIZE_T*6($tp)
  1508. adde $acc2,$acc2,$t2
  1509. $LD $t2,$SIZE_T*7($tp)
  1510. adde $acc3,$acc3,$t3
  1511. $LD $t3,$SIZE_T*8($tp)
  1512. adde $acc4,$acc4,$t0
  1513. adde $acc5,$acc5,$t1
  1514. adde $acc6,$acc6,$t2
  1515. adde $acc7,$acc7,$t3
  1516. #addze $carry,$zero # moved below
  1517. beq .Lsqr8x8_post_condition
  1518. $LD $n0,$SIZE_T*0($rp)
  1519. $LD $a0,$SIZE_T*1($np)
  1520. $LD $a1,$SIZE_T*2($np)
  1521. $LD $a2,$SIZE_T*3($np)
  1522. $LD $a3,$SIZE_T*4($np)
  1523. $LD $a4,$SIZE_T*5($np)
  1524. $LD $a5,$SIZE_T*6($np)
  1525. $LD $a6,$SIZE_T*7($np)
  1526. $LDU $a7,$SIZE_T*8($np)
  1527. li $cnt,0
  1528. .align 5
  1529. .Lsqr8x_tail:
  1530. $UMULL $t0,$a0,$n0
  1531. addze $carry,$zero # carry bit, modulo-scheduled
  1532. $UMULL $t1,$a1,$n0
  1533. addi $cnt,$cnt,$SIZE_T
  1534. $UMULL $t2,$a2,$n0
  1535. andi. $cnt,$cnt,$SIZE_T*8-1
  1536. $UMULL $t3,$a3,$n0
  1537. addc $acc0,$acc0,$t0
  1538. $UMULL $t0,$a4,$n0
  1539. adde $acc1,$acc1,$t1
  1540. $UMULL $t1,$a5,$n0
  1541. adde $acc2,$acc2,$t2
  1542. $UMULL $t2,$a6,$n0
  1543. adde $acc3,$acc3,$t3
  1544. $UMULL $t3,$a7,$n0
  1545. adde $acc4,$acc4,$t0
  1546. $UMULH $t0,$a0,$n0
  1547. adde $acc5,$acc5,$t1
  1548. $UMULH $t1,$a1,$n0
  1549. adde $acc6,$acc6,$t2
  1550. $UMULH $t2,$a2,$n0
  1551. adde $acc7,$acc7,$t3
  1552. $UMULH $t3,$a3,$n0
  1553. addze $carry,$carry
  1554. $STU $acc0,$SIZE_T($tp)
  1555. addc $acc0,$acc1,$t0
  1556. $UMULH $t0,$a4,$n0
  1557. adde $acc1,$acc2,$t1
  1558. $UMULH $t1,$a5,$n0
  1559. adde $acc2,$acc3,$t2
  1560. $UMULH $t2,$a6,$n0
  1561. adde $acc3,$acc4,$t3
  1562. $UMULH $t3,$a7,$n0
  1563. $LDX $n0,$rp,$cnt
  1564. adde $acc4,$acc5,$t0
  1565. adde $acc5,$acc6,$t1
  1566. adde $acc6,$acc7,$t2
  1567. adde $acc7,$carry,$t3
  1568. #addze $carry,$zero # moved above
  1569. bne .Lsqr8x_tail
  1570. # note that carry flag is guaranteed
  1571. # to be zero at this point
  1572. $LD $a0,$SIZE_T*1($tp)
  1573. $POP $carry,$SIZE_T*10($sp) # pull top-most carry in case we break
  1574. $UCMP $np_end,$np # done yet?
  1575. $LD $a1,$SIZE_T*2($tp)
  1576. sub $t2,$np_end,$num # rewinded np
  1577. $LD $a2,$SIZE_T*3($tp)
  1578. $LD $a3,$SIZE_T*4($tp)
  1579. $LD $a4,$SIZE_T*5($tp)
  1580. $LD $a5,$SIZE_T*6($tp)
  1581. $LD $a6,$SIZE_T*7($tp)
  1582. $LD $a7,$SIZE_T*8($tp)
  1583. beq .Lsqr8x_tail_break
  1584. addc $acc0,$acc0,$a0
  1585. $LD $a0,$SIZE_T*1($np)
  1586. adde $acc1,$acc1,$a1
  1587. $LD $a1,$SIZE_T*2($np)
  1588. adde $acc2,$acc2,$a2
  1589. $LD $a2,$SIZE_T*3($np)
  1590. adde $acc3,$acc3,$a3
  1591. $LD $a3,$SIZE_T*4($np)
  1592. adde $acc4,$acc4,$a4
  1593. $LD $a4,$SIZE_T*5($np)
  1594. adde $acc5,$acc5,$a5
  1595. $LD $a5,$SIZE_T*6($np)
  1596. adde $acc6,$acc6,$a6
  1597. $LD $a6,$SIZE_T*7($np)
  1598. adde $acc7,$acc7,$a7
  1599. $LDU $a7,$SIZE_T*8($np)
  1600. #addze $carry,$zero # moved above
  1601. b .Lsqr8x_tail
  1602. .align 5
  1603. .Lsqr8x_tail_break:
  1604. $POP $n0,$SIZE_T*8($sp) # pull n0
  1605. $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1]
  1606. addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window
  1607. addic $carry,$carry,-1 # "move" top-most carry to carry bit
  1608. adde $t0,$acc0,$a0
  1609. $LD $acc0,$SIZE_T*8($rp)
  1610. $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1]
  1611. adde $t1,$acc1,$a1
  1612. $LD $acc1,$SIZE_T*9($rp)
  1613. $LD $a1,$SIZE_T*2($t2)
  1614. adde $acc2,$acc2,$a2
  1615. $LD $a2,$SIZE_T*3($t2)
  1616. adde $acc3,$acc3,$a3
  1617. $LD $a3,$SIZE_T*4($t2)
  1618. adde $acc4,$acc4,$a4
  1619. $LD $a4,$SIZE_T*5($t2)
  1620. adde $acc5,$acc5,$a5
  1621. $LD $a5,$SIZE_T*6($t2)
  1622. adde $acc6,$acc6,$a6
  1623. $LD $a6,$SIZE_T*7($t2)
  1624. adde $acc7,$acc7,$a7
  1625. $LD $a7,$SIZE_T*8($t2)
  1626. addi $np,$t2,$SIZE_T*8
  1627. addze $t2,$zero # top-most carry
  1628. $UMULL $na0,$n0,$acc0
  1629. $ST $t0,$SIZE_T*1($tp)
  1630. $UCMP $cnt,$t3 # did we hit the bottom?
  1631. $ST $t1,$SIZE_T*2($tp)
  1632. li $cnt,8
  1633. $ST $acc2,$SIZE_T*3($tp)
  1634. $LD $acc2,$SIZE_T*10($rp)
  1635. $ST $acc3,$SIZE_T*4($tp)
  1636. $LD $acc3,$SIZE_T*11($rp)
  1637. $ST $acc4,$SIZE_T*5($tp)
  1638. $LD $acc4,$SIZE_T*12($rp)
  1639. $ST $acc5,$SIZE_T*6($tp)
  1640. $LD $acc5,$SIZE_T*13($rp)
  1641. $ST $acc6,$SIZE_T*7($tp)
  1642. $LD $acc6,$SIZE_T*14($rp)
  1643. $ST $acc7,$SIZE_T*8($tp)
  1644. $LD $acc7,$SIZE_T*15($rp)
  1645. $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry
  1646. addi $tp,$rp,$SIZE_T*7 # slide the window
  1647. mtctr $cnt
  1648. bne .Lsqr8x_reduction
  1649. ################################################################
  1650. # Final step. We see if result is larger than modulus, and
  1651. # if it is, subtract the modulus. But comparison implies
  1652. # subtraction. So we subtract modulus, see if it borrowed,
  1653. # and conditionally copy original value.
  1654. $POP $rp,$SIZE_T*6($sp) # pull &rp[-1]
  1655. srwi $cnt,$num,`log($SIZE_T)/log(2)+3`
  1656. mr $n0,$tp # put tp aside
  1657. addi $tp,$tp,$SIZE_T*8
  1658. subi $cnt,$cnt,1
  1659. subfc $t0,$a0,$acc0
  1660. subfe $t1,$a1,$acc1
  1661. mr $carry,$t2
  1662. mr $ap_end,$rp # $rp copy
  1663. mtctr $cnt
  1664. b .Lsqr8x_sub
  1665. .align 5
  1666. .Lsqr8x_sub:
  1667. $LD $a0,$SIZE_T*1($np)
  1668. $LD $acc0,$SIZE_T*1($tp)
  1669. $LD $a1,$SIZE_T*2($np)
  1670. $LD $acc1,$SIZE_T*2($tp)
  1671. subfe $t2,$a2,$acc2
  1672. $LD $a2,$SIZE_T*3($np)
  1673. $LD $acc2,$SIZE_T*3($tp)
  1674. subfe $t3,$a3,$acc3
  1675. $LD $a3,$SIZE_T*4($np)
  1676. $LD $acc3,$SIZE_T*4($tp)
  1677. $ST $t0,$SIZE_T*1($rp)
  1678. subfe $t0,$a4,$acc4
  1679. $LD $a4,$SIZE_T*5($np)
  1680. $LD $acc4,$SIZE_T*5($tp)
  1681. $ST $t1,$SIZE_T*2($rp)
  1682. subfe $t1,$a5,$acc5
  1683. $LD $a5,$SIZE_T*6($np)
  1684. $LD $acc5,$SIZE_T*6($tp)
  1685. $ST $t2,$SIZE_T*3($rp)
  1686. subfe $t2,$a6,$acc6
  1687. $LD $a6,$SIZE_T*7($np)
  1688. $LD $acc6,$SIZE_T*7($tp)
  1689. $ST $t3,$SIZE_T*4($rp)
  1690. subfe $t3,$a7,$acc7
  1691. $LDU $a7,$SIZE_T*8($np)
  1692. $LDU $acc7,$SIZE_T*8($tp)
  1693. $ST $t0,$SIZE_T*5($rp)
  1694. subfe $t0,$a0,$acc0
  1695. $ST $t1,$SIZE_T*6($rp)
  1696. subfe $t1,$a1,$acc1
  1697. $ST $t2,$SIZE_T*7($rp)
  1698. $STU $t3,$SIZE_T*8($rp)
  1699. bdnz .Lsqr8x_sub
  1700. srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
  1701. $LD $a0,$SIZE_T*1($ap_end) # original $rp
  1702. $LD $acc0,$SIZE_T*1($n0) # original $tp
  1703. subi $cnt,$cnt,1
  1704. $LD $a1,$SIZE_T*2($ap_end)
  1705. $LD $acc1,$SIZE_T*2($n0)
  1706. subfe $t2,$a2,$acc2
  1707. $LD $a2,$SIZE_T*3($ap_end)
  1708. $LD $acc2,$SIZE_T*3($n0)
  1709. subfe $t3,$a3,$acc3
  1710. $LD $a3,$SIZE_T*4($ap_end)
  1711. $LDU $acc3,$SIZE_T*4($n0)
  1712. $ST $t0,$SIZE_T*1($rp)
  1713. subfe $t0,$a4,$acc4
  1714. $ST $t1,$SIZE_T*2($rp)
  1715. subfe $t1,$a5,$acc5
  1716. $ST $t2,$SIZE_T*3($rp)
  1717. subfe $t2,$a6,$acc6
  1718. $ST $t3,$SIZE_T*4($rp)
  1719. subfe $t3,$a7,$acc7
  1720. $ST $t0,$SIZE_T*5($rp)
  1721. subfe $carry,$zero,$carry # did it borrow?
  1722. $ST $t1,$SIZE_T*6($rp)
  1723. $ST $t2,$SIZE_T*7($rp)
  1724. $ST $t3,$SIZE_T*8($rp)
  1725. addi $tp,$sp,$SIZE_T*11
  1726. mtctr $cnt
  1727. .Lsqr4x_cond_copy:
  1728. andc $a0,$a0,$carry
  1729. $ST $zero,-$SIZE_T*3($n0) # wipe stack clean
  1730. and $acc0,$acc0,$carry
  1731. $ST $zero,-$SIZE_T*2($n0)
  1732. andc $a1,$a1,$carry
  1733. $ST $zero,-$SIZE_T*1($n0)
  1734. and $acc1,$acc1,$carry
  1735. $ST $zero,-$SIZE_T*0($n0)
  1736. andc $a2,$a2,$carry
  1737. $ST $zero,$SIZE_T*1($tp)
  1738. and $acc2,$acc2,$carry
  1739. $ST $zero,$SIZE_T*2($tp)
  1740. andc $a3,$a3,$carry
  1741. $ST $zero,$SIZE_T*3($tp)
  1742. and $acc3,$acc3,$carry
  1743. $STU $zero,$SIZE_T*4($tp)
  1744. or $t0,$a0,$acc0
  1745. $LD $a0,$SIZE_T*5($ap_end)
  1746. $LD $acc0,$SIZE_T*1($n0)
  1747. or $t1,$a1,$acc1
  1748. $LD $a1,$SIZE_T*6($ap_end)
  1749. $LD $acc1,$SIZE_T*2($n0)
  1750. or $t2,$a2,$acc2
  1751. $LD $a2,$SIZE_T*7($ap_end)
  1752. $LD $acc2,$SIZE_T*3($n0)
  1753. or $t3,$a3,$acc3
  1754. $LD $a3,$SIZE_T*8($ap_end)
  1755. $LDU $acc3,$SIZE_T*4($n0)
  1756. $ST $t0,$SIZE_T*1($ap_end)
  1757. $ST $t1,$SIZE_T*2($ap_end)
  1758. $ST $t2,$SIZE_T*3($ap_end)
  1759. $STU $t3,$SIZE_T*4($ap_end)
  1760. bdnz .Lsqr4x_cond_copy
  1761. $POP $ap,0($sp) # pull saved sp
  1762. andc $a0,$a0,$carry
  1763. and $acc0,$acc0,$carry
  1764. andc $a1,$a1,$carry
  1765. and $acc1,$acc1,$carry
  1766. andc $a2,$a2,$carry
  1767. and $acc2,$acc2,$carry
  1768. andc $a3,$a3,$carry
  1769. and $acc3,$acc3,$carry
  1770. or $t0,$a0,$acc0
  1771. or $t1,$a1,$acc1
  1772. or $t2,$a2,$acc2
  1773. or $t3,$a3,$acc3
  1774. $ST $t0,$SIZE_T*1($ap_end)
  1775. $ST $t1,$SIZE_T*2($ap_end)
  1776. $ST $t2,$SIZE_T*3($ap_end)
  1777. $ST $t3,$SIZE_T*4($ap_end)
  1778. b .Lsqr8x_done
  1779. .align 5
  1780. .Lsqr8x8_post_condition:
  1781. $POP $rp,$SIZE_T*6($sp) # pull rp
  1782. $POP $ap,0($sp) # pull saved sp
  1783. addze $carry,$zero
  1784. # $acc0-7,$carry hold result, $a0-7 hold modulus
  1785. subfc $acc0,$a0,$acc0
  1786. subfe $acc1,$a1,$acc1
  1787. $ST $zero,$SIZE_T*12($sp) # wipe stack clean
  1788. $ST $zero,$SIZE_T*13($sp)
  1789. subfe $acc2,$a2,$acc2
  1790. $ST $zero,$SIZE_T*14($sp)
  1791. $ST $zero,$SIZE_T*15($sp)
  1792. subfe $acc3,$a3,$acc3
  1793. $ST $zero,$SIZE_T*16($sp)
  1794. $ST $zero,$SIZE_T*17($sp)
  1795. subfe $acc4,$a4,$acc4
  1796. $ST $zero,$SIZE_T*18($sp)
  1797. $ST $zero,$SIZE_T*19($sp)
  1798. subfe $acc5,$a5,$acc5
  1799. $ST $zero,$SIZE_T*20($sp)
  1800. $ST $zero,$SIZE_T*21($sp)
  1801. subfe $acc6,$a6,$acc6
  1802. $ST $zero,$SIZE_T*22($sp)
  1803. $ST $zero,$SIZE_T*23($sp)
  1804. subfe $acc7,$a7,$acc7
  1805. $ST $zero,$SIZE_T*24($sp)
  1806. $ST $zero,$SIZE_T*25($sp)
  1807. subfe $carry,$zero,$carry # did it borrow?
  1808. $ST $zero,$SIZE_T*26($sp)
  1809. $ST $zero,$SIZE_T*27($sp)
  1810. and $a0,$a0,$carry
  1811. and $a1,$a1,$carry
  1812. addc $acc0,$acc0,$a0 # add modulus back if borrowed
  1813. and $a2,$a2,$carry
  1814. adde $acc1,$acc1,$a1
  1815. and $a3,$a3,$carry
  1816. adde $acc2,$acc2,$a2
  1817. and $a4,$a4,$carry
  1818. adde $acc3,$acc3,$a3
  1819. and $a5,$a5,$carry
  1820. adde $acc4,$acc4,$a4
  1821. and $a6,$a6,$carry
  1822. adde $acc5,$acc5,$a5
  1823. and $a7,$a7,$carry
  1824. adde $acc6,$acc6,$a6
  1825. adde $acc7,$acc7,$a7
  1826. $ST $acc0,$SIZE_T*1($rp)
  1827. $ST $acc1,$SIZE_T*2($rp)
  1828. $ST $acc2,$SIZE_T*3($rp)
  1829. $ST $acc3,$SIZE_T*4($rp)
  1830. $ST $acc4,$SIZE_T*5($rp)
  1831. $ST $acc5,$SIZE_T*6($rp)
  1832. $ST $acc6,$SIZE_T*7($rp)
  1833. $ST $acc7,$SIZE_T*8($rp)
  1834. .Lsqr8x_done:
  1835. $PUSH $zero,$SIZE_T*8($sp)
  1836. $PUSH $zero,$SIZE_T*10($sp)
  1837. $POP r14,-$SIZE_T*18($ap)
  1838. li r3,1 # signal "done"
  1839. $POP r15,-$SIZE_T*17($ap)
  1840. $POP r16,-$SIZE_T*16($ap)
  1841. $POP r17,-$SIZE_T*15($ap)
  1842. $POP r18,-$SIZE_T*14($ap)
  1843. $POP r19,-$SIZE_T*13($ap)
  1844. $POP r20,-$SIZE_T*12($ap)
  1845. $POP r21,-$SIZE_T*11($ap)
  1846. $POP r22,-$SIZE_T*10($ap)
  1847. $POP r23,-$SIZE_T*9($ap)
  1848. $POP r24,-$SIZE_T*8($ap)
  1849. $POP r25,-$SIZE_T*7($ap)
  1850. $POP r26,-$SIZE_T*6($ap)
  1851. $POP r27,-$SIZE_T*5($ap)
  1852. $POP r28,-$SIZE_T*4($ap)
  1853. $POP r29,-$SIZE_T*3($ap)
  1854. $POP r30,-$SIZE_T*2($ap)
  1855. $POP r31,-$SIZE_T*1($ap)
  1856. mr $sp,$ap
  1857. blr
  1858. .long 0
  1859. .byte 0,12,4,0x20,0x80,18,6,0
  1860. .long 0
  1861. .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
  1862. ___
  1863. }
  # Append the module identification string to the generated assembly text.
  1864. $code.=<<___;
  1865. .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
  1866. ___
  # Replace each `...` span in $code with the result of evaluating its contents
  # as a Perl expression (e.g. the shift counts written as `log($SIZE_T)/log(2)+2`).
  1867. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1868. print $code;
  # Output is buffered, so a failed write may only be reported by close;
  # die here so a truncated assembly file is never mistaken for success.
  1869. close STDOUT or die "error closing STDOUT: $!";