2
0

ppc64-mont.pl 40 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # December 2007
  15. # The reason for undertaking this effort is basically the following. Even though
  16. # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
  17. # performance was observed to be less than impressive, essentially as
  18. # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
  19. # Well, it's not surprising that IBM had to make some sacrifices to
  20. # boost the clock frequency that much, but no overall improvement?
  21. # Having observed how much difference switching to FPU made on
  22. # UltraSPARC, playing same stunt on Power 6 appeared appropriate...
  23. # Unfortunately the resulting performance improvement is not as
  24. # impressive, ~30%, and in absolute terms is still very far from what
  25. # one would expect from 4.7GHz CPU. There is a chance that I'm doing
  26. # something wrong, but in the lack of assembler level micro-profiling
  27. # data or at least decent platform guide I can't tell... Or better
  28. # results might be achieved with VMX... Anyway, this module provides
  29. # *worse* performance on other PowerPC implementations, ~40-15% slower
  30. # on PPC970 depending on key length and ~40% slower on Power 5 for all
  31. # key lengths. As it's obviously inappropriate as "best all-round"
  32. # alternative, it has to be complemented with run-time CPU family
  33. # detection. Oh! It should also be noted that unlike other PowerPC
  34. # implementation IALU ppc-mont.pl module performs *suboptimally* on
  35. # >=1024-bit key lengths on Power 6. It should also be noted that
  36. # *everything* said so far applies to 64-bit builds! As far as 32-bit
  37. # application executed on 64-bit CPU goes, this module is likely to
  38. # become preferred choice, because it's easy to adapt it for such
  39. # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
  40. # February 2008
  41. # Micro-profiling assisted optimization results in ~15% improvement
  42. # over original ppc64-mont.pl version, or overall ~50% improvement
  43. # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
  44. # Power 6 CPU, this module is 5-150% faster depending on key length,
  45. # [hereafter] more for longer keys. But if compared to ppc-mont.pl
  46. # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
  47. # in absolute terms, but it's apparently the way Power 6 is...
  48. # December 2009
  49. # Adapted for 32-bit build this module delivers 25-120%, yes, more
  50. # than *twice* for longer keys, performance improvement over 32-bit
  51. # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
  52. # even 64-bit integer operations and the trouble is that most PPC
  53. # operating systems don't preserve upper halves of general purpose
  54. # registers upon 32-bit signal delivery. They do preserve them upon
  55. # context switch, but not signalling:-( This means that asynchronous
  56. # signals have to be blocked upon entry to this subroutine. Signal
  57. # masking (and of course complementary unmasking) has quite an impact
  58. # on performance, naturally larger for shorter keys. It's so severe
  59. # that 512-bit key performance can be as low as 1/3 of expected one.
  60. # This is why this routine can be engaged for longer key operations
  61. # only on these OSes, see crypto/ppccap.c for further details. MacOS X
  62. # is an exception from this and doesn't require signal masking, and
  63. # that's where above improvement coefficients were collected. For
  64. # others alternative would be to break dependence on upper halves of
  65. # GPRs by sticking to 32-bit integer operations...
  66. # December 2012
  67. # Remove above mentioned dependence on GPRs' upper halves in 32-bit
  68. # build. No signal masking overhead, but integer instructions are
  69. # *more* numerous... It's still "universally" faster than 32-bit
  70. # ppc-mont.pl, but improvement coefficient is not as impressive
  71. # for longer keys...
  # ---------------------------------------------------------------------
  # Command line:  <flavour> [<output>]
  #
  # <flavour> picks the word size of the generated code: a flavour that
  # contains "32" yields a 32-bit build, otherwise one that contains "64"
  # yields a 64-bit build; anything else is rejected.  A flavour ending
  # in "le" additionally selects little-endian word offsets.  The values
  # set here are interpolated into the assembly templates further down.
  # ---------------------------------------------------------------------
  $flavour = shift;

  if ($flavour =~ /32/) {
      $SIZE_T = 4;                      # bytes per pointer / BN word slot
      $RZONE  = 224;                    # extra bytes added to the alloca size
      $fname  = "bn_mul_mont_fpu64";

      $STUX   = "stwux";                # store indexed and update
      $PUSH   = "stw";
      $POP    = "lwz";
  } elsif ($flavour =~ /64/) {
      $SIZE_T = 8;
      $RZONE  = 288;
      $fname  = "bn_mul_mont_fpu64";

      # same as above, but 64-bit mnemonics...
      $STUX   = "stdux";                # store indexed and update
      $PUSH   = "std";
      $POP    = "ld";
  } else { die "nonsense $flavour"; }

  # XOR-ed into 32-bit word offsets (e.g. `4^$LITTLE_ENDIAN`) so that the
  # halves of a 64-bit slot are swapped on little-endian targets.
  $LITTLE_ENDIAN = ($flavour =~ /le$/) ? 4 : 0;

  # Directory part of this script's path, including the trailing
  # separator.  When $0 carries no directory the prefix is empty; it is
  # assigned explicitly instead of relying on a leftover $1.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

  # Look for the perlasm translator next to this script first, then in
  # ../../perlasm relative to it.
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  die "can't locate ppc-xlate.pl";

  # Everything printed to STDOUT from here on is piped through the
  # translator.  The optional second argument is handed to it verbatim.
  # NOTE: `or` (not `||`) is required here: `||` would bind to the
  # command string, which is always true, and the die would never fire.
  my $xlate_arg = shift;
  $xlate_arg = "" unless defined $xlate_arg;
  open(STDOUT, "| $^X $xlate $flavour $xlate_arg")
      or die "can't call $xlate: $!";
  # Stack-frame geometry, in bytes.
  ($FRAME, $TRANSFER) = (64, 16*8);     # padded frame header; transfer zone

  # Low general-purpose registers.
  ($carry, $sp, $toc) = map { "r$_" } 0 .. 2;

  # r3 carries the result pointer on entry; the prologue copies it into
  # $rp (r9, assigned below) and r3 itself is then reused as $ovf.
  $ovf = "r3";

  # r4..r12 in ascending order.  Note that $rp lives in r9, not r3.
  ($ap, $bp, $np, $n0, $num, $rp, $tp, $j, $i) = map { "r$_" } 4 .. 12;

  # non-volatile registers, r19..r31 in ascending order:
  #   $c1, $n1, $a1
  #   $nap_d      interleaved ap and np in double format
  #   $a0         ap[0]
  #   $t0..$t7    temporary registers
  ($c1, $n1, $a1, $nap_d, $a0,
   $t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7) = map { "r$_" } 19 .. 31;
  124. # PPC offers enough register bank capacity to unroll inner loops twice
  125. #
  126. #    ..A3A2A1A0
  127. #          dcba
  128. #   -----------
  129. #           A0a
  130. #          A0b
  131. #         A0c
  132. #        A0d
  133. #         A1a
  134. #        A1b
  135. #       A1c
  136. #      A1d
  137. #       A2a
  138. #      A2b
  139. #     A2c
  140. #    A2d
  141. #     A3a
  142. #    A3b
  143. #   A3c
  144. #  A3d
  145. #   ..a
  146. #  ..b
  147. #
  # Floating-point register allocation.
  #
  #   $ba..$bd     the current bp word, moved to the FPU as 4x16-bit values
  #   $na..$nd     (ap[0]*bp[i]+tp[0])*n0, likewise as 4x16-bit values
  #   $dota/$dotb  partial products carried over into the next iteration
  #   $A0..$A3     a[j], a[j+1] as 32-bit word pairs in double format
  #   $N0..$N3     n[j], n[j+1] as 32-bit word pairs in double format
  #   $T0a..$T3b   product accumulators, converted back with fctid
  #
  # f20..f31 are among the registers the prologue saves on the stack.
  ($ba, $bb, $bc, $bd) = map { "f$_" } 0 .. 3;
  ($na, $nb, $nc, $nd) = map { "f$_" } 4 .. 7;
  ($dota, $dotb)       = ("f8", "f9");
  ($A0, $A1, $A2, $A3) = map { "f$_" } 10 .. 13;
  ($N0, $N1, $N2, $N3) = map { "f$_" } 20 .. 23;
  ($T0a, $T0b, $T1a, $T1b,
   $T2a, $T2b, $T3a, $T3b) = map { "f$_" } 24 .. 31;
  157. # sp----------->+-------------------------------+
  158. #               | saved sp                      |
  159. #               +-------------------------------+
  160. #               .                               .
  161. # +64           +-------------------------------+
  162. #               | 16 gpr<->fpr transfer zone    |
  163. #               .                               .
  164. #               .                               .
  165. # +16*8         +-------------------------------+
  166. #               | __int64 tmp[-1]               |
  167. #               +-------------------------------+
  168. #               | __int64 tmp[num]              |
  169. #               .                               .
  170. #               .                               .
  171. #               .                               .
  172. # +(num+1)*8    +-------------------------------+
  173. #               | padding to 64 byte boundary   |
  174. #               .                               .
  175. # +X            +-------------------------------+
  176. #               | double nap_d[4*num]           |
  177. #               .                               .
  178. #               .                               .
  179. #               .                               .
  180. #               +-------------------------------+
  181. #               .                               .
  182. # -13*size_t    +-------------------------------+
  183. #               | 13 saved gpr, r19-r31         |
  184. #               .                               .
  185. #               .                               .
  186. # -12*8         +-------------------------------+
  187. #               | 12 saved fpr, f20-f31         |
  188. #               .                               .
  189. #               .                               .
  190. #               +-------------------------------+
  191. $code=<<___;
  192. .machine "any"
  193. .text
  194. .globl .$fname
  195. .align 5
  196. .$fname:
  197. cmpwi $num,`3*8/$SIZE_T`
  198. mr $rp,r3 ; $rp is reassigned
  199. li r3,0 ; possible "not handled" return code
  200. bltlr-
  201. andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
  202. bnelr-
  203. slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
  204. li $i,-4096
  205. slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
  206. add $tp,$tp,$num ; place for tp[num+1]
  207. addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
  208. subf $tp,$tp,$sp ; $sp-$tp
  209. and $tp,$tp,$i ; minimize TLB usage
  210. subf $tp,$sp,$tp ; $tp-$sp
  211. mr $i,$sp
  212. $STUX $sp,$sp,$tp ; alloca
  213. $PUSH r19,`-12*8-13*$SIZE_T`($i)
  214. $PUSH r20,`-12*8-12*$SIZE_T`($i)
  215. $PUSH r21,`-12*8-11*$SIZE_T`($i)
  216. $PUSH r22,`-12*8-10*$SIZE_T`($i)
  217. $PUSH r23,`-12*8-9*$SIZE_T`($i)
  218. $PUSH r24,`-12*8-8*$SIZE_T`($i)
  219. $PUSH r25,`-12*8-7*$SIZE_T`($i)
  220. $PUSH r26,`-12*8-6*$SIZE_T`($i)
  221. $PUSH r27,`-12*8-5*$SIZE_T`($i)
  222. $PUSH r28,`-12*8-4*$SIZE_T`($i)
  223. $PUSH r29,`-12*8-3*$SIZE_T`($i)
  224. $PUSH r30,`-12*8-2*$SIZE_T`($i)
  225. $PUSH r31,`-12*8-1*$SIZE_T`($i)
  226. stfd f20,`-12*8`($i)
  227. stfd f21,`-11*8`($i)
  228. stfd f22,`-10*8`($i)
  229. stfd f23,`-9*8`($i)
  230. stfd f24,`-8*8`($i)
  231. stfd f25,`-7*8`($i)
  232. stfd f26,`-6*8`($i)
  233. stfd f27,`-5*8`($i)
  234. stfd f28,`-4*8`($i)
  235. stfd f29,`-3*8`($i)
  236. stfd f30,`-2*8`($i)
  237. stfd f31,`-1*8`($i)
  238. addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
  239. li $i,-64
  240. add $nap_d,$tp,$num
  241. and $nap_d,$nap_d,$i ; align to 64 bytes
  242. ; nap_d is off by 1, because it's used with stfdu/lfdu
  243. addi $nap_d,$nap_d,-8
  244. srwi $j,$num,`3+1` ; counter register, num/2
  245. addi $j,$j,-1
  246. addi $tp,$sp,`$FRAME+$TRANSFER-8`
  247. li $carry,0
  248. mtctr $j
  249. ___
  250. $code.=<<___ if ($SIZE_T==8);
  251. ld $a0,0($ap) ; pull ap[0] value
  252. ld $t3,0($bp) ; bp[0]
  253. ld $n0,0($n0) ; pull n0[0] value
  254. mulld $t7,$a0,$t3 ; ap[0]*bp[0]
  255. ; transfer bp[0] to FPU as 4x16-bit values
  256. extrdi $t0,$t3,16,48
  257. extrdi $t1,$t3,16,32
  258. extrdi $t2,$t3,16,16
  259. extrdi $t3,$t3,16,0
  260. std $t0,`$FRAME+0`($sp)
  261. std $t1,`$FRAME+8`($sp)
  262. std $t2,`$FRAME+16`($sp)
  263. std $t3,`$FRAME+24`($sp)
  264. mulld $t7,$t7,$n0 ; tp[0]*n0
  265. ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
  266. extrdi $t4,$t7,16,48
  267. extrdi $t5,$t7,16,32
  268. extrdi $t6,$t7,16,16
  269. extrdi $t7,$t7,16,0
  270. std $t4,`$FRAME+32`($sp)
  271. std $t5,`$FRAME+40`($sp)
  272. std $t6,`$FRAME+48`($sp)
  273. std $t7,`$FRAME+56`($sp)
  274. extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
  275. extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
  276. lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
  277. lwz $t3,`8^$LITTLE_ENDIAN`($ap)
  278. lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
  279. lwz $t5,`0^$LITTLE_ENDIAN`($np)
  280. lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
  281. lwz $t7,`8^$LITTLE_ENDIAN`($np)
  282. ___
  283. $code.=<<___ if ($SIZE_T==4);
  284. lwz $a0,0($ap) ; pull ap[0,1] value
  285. mr $n1,$n0
  286. lwz $a1,4($ap)
  287. li $c1,0
  288. lwz $t1,0($bp) ; bp[0,1]
  289. lwz $t3,4($bp)
  290. lwz $n0,0($n1) ; pull n0[0,1] value
  291. lwz $n1,4($n1)
  292. mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
  293. mulhwu $t5,$a0,$t1
  294. mullw $t6,$a1,$t1
  295. mullw $t7,$a0,$t3
  296. add $t5,$t5,$t6
  297. add $t5,$t5,$t7
  298. ; transfer bp[0] to FPU as 4x16-bit values
  299. extrwi $t0,$t1,16,16
  300. extrwi $t1,$t1,16,0
  301. extrwi $t2,$t3,16,16
  302. extrwi $t3,$t3,16,0
  303. std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
  304. std $t1,`$FRAME+8`($sp)
  305. std $t2,`$FRAME+16`($sp)
  306. std $t3,`$FRAME+24`($sp)
  307. mullw $t0,$t4,$n0 ; mulld tp[0]*n0
  308. mulhwu $t1,$t4,$n0
  309. mullw $t2,$t5,$n0
  310. mullw $t3,$t4,$n1
  311. add $t1,$t1,$t2
  312. add $t1,$t1,$t3
  313. ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
  314. extrwi $t4,$t0,16,16
  315. extrwi $t5,$t0,16,0
  316. extrwi $t6,$t1,16,16
  317. extrwi $t7,$t1,16,0
  318. std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
  319. std $t5,`$FRAME+40`($sp)
  320. std $t6,`$FRAME+48`($sp)
  321. std $t7,`$FRAME+56`($sp)
  322. mr $t0,$a0 ; lwz $t0,0($ap)
  323. mr $t1,$a1 ; lwz $t1,4($ap)
  324. lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
  325. lwz $t3,12($ap)
  326. lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
  327. lwz $t5,4($np)
  328. lwz $t6,8($np)
  329. lwz $t7,12($np)
  330. ___
  331. $code.=<<___;
  332. lfd $ba,`$FRAME+0`($sp)
  333. lfd $bb,`$FRAME+8`($sp)
  334. lfd $bc,`$FRAME+16`($sp)
  335. lfd $bd,`$FRAME+24`($sp)
  336. lfd $na,`$FRAME+32`($sp)
  337. lfd $nb,`$FRAME+40`($sp)
  338. lfd $nc,`$FRAME+48`($sp)
  339. lfd $nd,`$FRAME+56`($sp)
  340. std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
  341. std $t1,`$FRAME+72`($sp)
  342. std $t2,`$FRAME+80`($sp)
  343. std $t3,`$FRAME+88`($sp)
  344. std $t4,`$FRAME+96`($sp)
  345. std $t5,`$FRAME+104`($sp)
  346. std $t6,`$FRAME+112`($sp)
  347. std $t7,`$FRAME+120`($sp)
  348. fcfid $ba,$ba
  349. fcfid $bb,$bb
  350. fcfid $bc,$bc
  351. fcfid $bd,$bd
  352. fcfid $na,$na
  353. fcfid $nb,$nb
  354. fcfid $nc,$nc
  355. fcfid $nd,$nd
  356. lfd $A0,`$FRAME+64`($sp)
  357. lfd $A1,`$FRAME+72`($sp)
  358. lfd $A2,`$FRAME+80`($sp)
  359. lfd $A3,`$FRAME+88`($sp)
  360. lfd $N0,`$FRAME+96`($sp)
  361. lfd $N1,`$FRAME+104`($sp)
  362. lfd $N2,`$FRAME+112`($sp)
  363. lfd $N3,`$FRAME+120`($sp)
  364. fcfid $A0,$A0
  365. fcfid $A1,$A1
  366. fcfid $A2,$A2
  367. fcfid $A3,$A3
  368. fcfid $N0,$N0
  369. fcfid $N1,$N1
  370. fcfid $N2,$N2
  371. fcfid $N3,$N3
  372. addi $ap,$ap,16
  373. addi $np,$np,16
  374. fmul $T1a,$A1,$ba
  375. fmul $T1b,$A1,$bb
  376. stfd $A0,8($nap_d) ; save a[j] in double format
  377. stfd $A1,16($nap_d)
  378. fmul $T2a,$A2,$ba
  379. fmul $T2b,$A2,$bb
  380. stfd $A2,24($nap_d) ; save a[j+1] in double format
  381. stfd $A3,32($nap_d)
  382. fmul $T3a,$A3,$ba
  383. fmul $T3b,$A3,$bb
  384. stfd $N0,40($nap_d) ; save n[j] in double format
  385. stfd $N1,48($nap_d)
  386. fmul $T0a,$A0,$ba
  387. fmul $T0b,$A0,$bb
  388. stfd $N2,56($nap_d) ; save n[j+1] in double format
  389. stfdu $N3,64($nap_d)
  390. fmadd $T1a,$A0,$bc,$T1a
  391. fmadd $T1b,$A0,$bd,$T1b
  392. fmadd $T2a,$A1,$bc,$T2a
  393. fmadd $T2b,$A1,$bd,$T2b
  394. fmadd $T3a,$A2,$bc,$T3a
  395. fmadd $T3b,$A2,$bd,$T3b
  396. fmul $dota,$A3,$bc
  397. fmul $dotb,$A3,$bd
  398. fmadd $T1a,$N1,$na,$T1a
  399. fmadd $T1b,$N1,$nb,$T1b
  400. fmadd $T2a,$N2,$na,$T2a
  401. fmadd $T2b,$N2,$nb,$T2b
  402. fmadd $T3a,$N3,$na,$T3a
  403. fmadd $T3b,$N3,$nb,$T3b
  404. fmadd $T0a,$N0,$na,$T0a
  405. fmadd $T0b,$N0,$nb,$T0b
  406. fmadd $T1a,$N0,$nc,$T1a
  407. fmadd $T1b,$N0,$nd,$T1b
  408. fmadd $T2a,$N1,$nc,$T2a
  409. fmadd $T2b,$N1,$nd,$T2b
  410. fmadd $T3a,$N2,$nc,$T3a
  411. fmadd $T3b,$N2,$nd,$T3b
  412. fmadd $dota,$N3,$nc,$dota
  413. fmadd $dotb,$N3,$nd,$dotb
  414. fctid $T0a,$T0a
  415. fctid $T0b,$T0b
  416. fctid $T1a,$T1a
  417. fctid $T1b,$T1b
  418. fctid $T2a,$T2a
  419. fctid $T2b,$T2b
  420. fctid $T3a,$T3a
  421. fctid $T3b,$T3b
  422. stfd $T0a,`$FRAME+0`($sp)
  423. stfd $T0b,`$FRAME+8`($sp)
  424. stfd $T1a,`$FRAME+16`($sp)
  425. stfd $T1b,`$FRAME+24`($sp)
  426. stfd $T2a,`$FRAME+32`($sp)
  427. stfd $T2b,`$FRAME+40`($sp)
  428. stfd $T3a,`$FRAME+48`($sp)
  429. stfd $T3b,`$FRAME+56`($sp)
  430. .align 5
  431. L1st:
  432. ___
  433. $code.=<<___ if ($SIZE_T==8);
  434. lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
  435. lwz $t1,`0^$LITTLE_ENDIAN`($ap)
  436. lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
  437. lwz $t3,`8^$LITTLE_ENDIAN`($ap)
  438. lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
  439. lwz $t5,`0^$LITTLE_ENDIAN`($np)
  440. lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
  441. lwz $t7,`8^$LITTLE_ENDIAN`($np)
  442. ___
  443. $code.=<<___ if ($SIZE_T==4);
  444. lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
  445. lwz $t1,4($ap)
  446. lwz $t2,8($ap)
  447. lwz $t3,12($ap)
  448. lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
  449. lwz $t5,4($np)
  450. lwz $t6,8($np)
  451. lwz $t7,12($np)
  452. ___
  453. $code.=<<___;
  454. std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
  455. std $t1,`$FRAME+72`($sp)
  456. std $t2,`$FRAME+80`($sp)
  457. std $t3,`$FRAME+88`($sp)
  458. std $t4,`$FRAME+96`($sp)
  459. std $t5,`$FRAME+104`($sp)
  460. std $t6,`$FRAME+112`($sp)
  461. std $t7,`$FRAME+120`($sp)
  462. ___
  463. if ($SIZE_T==8 or $flavour =~ /osx/) {
  464. $code.=<<___;
  465. ld $t0,`$FRAME+0`($sp)
  466. ld $t1,`$FRAME+8`($sp)
  467. ld $t2,`$FRAME+16`($sp)
  468. ld $t3,`$FRAME+24`($sp)
  469. ld $t4,`$FRAME+32`($sp)
  470. ld $t5,`$FRAME+40`($sp)
  471. ld $t6,`$FRAME+48`($sp)
  472. ld $t7,`$FRAME+56`($sp)
  473. ___
  474. } else {
  475. $code.=<<___;
  476. lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
  477. lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
  478. lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
  479. lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
  480. lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
  481. lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
  482. lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
  483. lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
  484. ___
  485. }
  486. $code.=<<___;
  487. lfd $A0,`$FRAME+64`($sp)
  488. lfd $A1,`$FRAME+72`($sp)
  489. lfd $A2,`$FRAME+80`($sp)
  490. lfd $A3,`$FRAME+88`($sp)
  491. lfd $N0,`$FRAME+96`($sp)
  492. lfd $N1,`$FRAME+104`($sp)
  493. lfd $N2,`$FRAME+112`($sp)
  494. lfd $N3,`$FRAME+120`($sp)
  495. fcfid $A0,$A0
  496. fcfid $A1,$A1
  497. fcfid $A2,$A2
  498. fcfid $A3,$A3
  499. fcfid $N0,$N0
  500. fcfid $N1,$N1
  501. fcfid $N2,$N2
  502. fcfid $N3,$N3
  503. addi $ap,$ap,16
  504. addi $np,$np,16
  505. fmul $T1a,$A1,$ba
  506. fmul $T1b,$A1,$bb
  507. fmul $T2a,$A2,$ba
  508. fmul $T2b,$A2,$bb
  509. stfd $A0,8($nap_d) ; save a[j] in double format
  510. stfd $A1,16($nap_d)
  511. fmul $T3a,$A3,$ba
  512. fmul $T3b,$A3,$bb
  513. fmadd $T0a,$A0,$ba,$dota
  514. fmadd $T0b,$A0,$bb,$dotb
  515. stfd $A2,24($nap_d) ; save a[j+1] in double format
  516. stfd $A3,32($nap_d)
  517. ___
  518. if ($SIZE_T==8 or $flavour =~ /osx/) {
  519. $code.=<<___;
  520. fmadd $T1a,$A0,$bc,$T1a
  521. fmadd $T1b,$A0,$bd,$T1b
  522. fmadd $T2a,$A1,$bc,$T2a
  523. fmadd $T2b,$A1,$bd,$T2b
  524. stfd $N0,40($nap_d) ; save n[j] in double format
  525. stfd $N1,48($nap_d)
  526. fmadd $T3a,$A2,$bc,$T3a
  527. fmadd $T3b,$A2,$bd,$T3b
  528. add $t0,$t0,$carry ; can not overflow
  529. fmul $dota,$A3,$bc
  530. fmul $dotb,$A3,$bd
  531. stfd $N2,56($nap_d) ; save n[j+1] in double format
  532. stfdu $N3,64($nap_d)
  533. srdi $carry,$t0,16
  534. add $t1,$t1,$carry
  535. srdi $carry,$t1,16
  536. fmadd $T1a,$N1,$na,$T1a
  537. fmadd $T1b,$N1,$nb,$T1b
  538. insrdi $t0,$t1,16,32
  539. fmadd $T2a,$N2,$na,$T2a
  540. fmadd $T2b,$N2,$nb,$T2b
  541. add $t2,$t2,$carry
  542. fmadd $T3a,$N3,$na,$T3a
  543. fmadd $T3b,$N3,$nb,$T3b
  544. srdi $carry,$t2,16
  545. fmadd $T0a,$N0,$na,$T0a
  546. fmadd $T0b,$N0,$nb,$T0b
  547. insrdi $t0,$t2,16,16
  548. add $t3,$t3,$carry
  549. srdi $carry,$t3,16
  550. fmadd $T1a,$N0,$nc,$T1a
  551. fmadd $T1b,$N0,$nd,$T1b
  552. insrdi $t0,$t3,16,0 ; 0..63 bits
  553. fmadd $T2a,$N1,$nc,$T2a
  554. fmadd $T2b,$N1,$nd,$T2b
  555. add $t4,$t4,$carry
  556. fmadd $T3a,$N2,$nc,$T3a
  557. fmadd $T3b,$N2,$nd,$T3b
  558. srdi $carry,$t4,16
  559. fmadd $dota,$N3,$nc,$dota
  560. fmadd $dotb,$N3,$nd,$dotb
  561. add $t5,$t5,$carry
  562. srdi $carry,$t5,16
  563. insrdi $t4,$t5,16,32
  564. fctid $T0a,$T0a
  565. fctid $T0b,$T0b
  566. add $t6,$t6,$carry
  567. fctid $T1a,$T1a
  568. fctid $T1b,$T1b
  569. srdi $carry,$t6,16
  570. fctid $T2a,$T2a
  571. fctid $T2b,$T2b
  572. insrdi $t4,$t6,16,16
  573. fctid $T3a,$T3a
  574. fctid $T3b,$T3b
  575. add $t7,$t7,$carry
  576. insrdi $t4,$t7,16,0 ; 64..127 bits
  577. srdi $carry,$t7,16 ; upper 33 bits
  578. stfd $T0a,`$FRAME+0`($sp)
  579. stfd $T0b,`$FRAME+8`($sp)
  580. stfd $T1a,`$FRAME+16`($sp)
  581. stfd $T1b,`$FRAME+24`($sp)
  582. stfd $T2a,`$FRAME+32`($sp)
  583. stfd $T2b,`$FRAME+40`($sp)
  584. stfd $T3a,`$FRAME+48`($sp)
  585. stfd $T3b,`$FRAME+56`($sp)
  586. std $t0,8($tp) ; tp[j-1]
  587. stdu $t4,16($tp) ; tp[j]
  588. ___
  589. } else {
  590. $code.=<<___;
  591. fmadd $T1a,$A0,$bc,$T1a
  592. fmadd $T1b,$A0,$bd,$T1b
  593. addc $t0,$t0,$carry
  594. adde $t1,$t1,$c1
  595. srwi $carry,$t0,16
  596. fmadd $T2a,$A1,$bc,$T2a
  597. fmadd $T2b,$A1,$bd,$T2b
  598. stfd $N0,40($nap_d) ; save n[j] in double format
  599. stfd $N1,48($nap_d)
  600. srwi $c1,$t1,16
  601. insrwi $carry,$t1,16,0
  602. fmadd $T3a,$A2,$bc,$T3a
  603. fmadd $T3b,$A2,$bd,$T3b
  604. addc $t2,$t2,$carry
  605. adde $t3,$t3,$c1
  606. srwi $carry,$t2,16
  607. fmul $dota,$A3,$bc
  608. fmul $dotb,$A3,$bd
  609. stfd $N2,56($nap_d) ; save n[j+1] in double format
  610. stfdu $N3,64($nap_d)
  611. insrwi $t0,$t2,16,0 ; 0..31 bits
  612. srwi $c1,$t3,16
  613. insrwi $carry,$t3,16,0
  614. fmadd $T1a,$N1,$na,$T1a
  615. fmadd $T1b,$N1,$nb,$T1b
  616. lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
  617. lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
  618. addc $t4,$t4,$carry
  619. adde $t5,$t5,$c1
  620. srwi $carry,$t4,16
  621. fmadd $T2a,$N2,$na,$T2a
  622. fmadd $T2b,$N2,$nb,$T2b
  623. srwi $c1,$t5,16
  624. insrwi $carry,$t5,16,0
  625. fmadd $T3a,$N3,$na,$T3a
  626. fmadd $T3b,$N3,$nb,$T3b
  627. addc $t6,$t6,$carry
  628. adde $t7,$t7,$c1
  629. srwi $carry,$t6,16
  630. fmadd $T0a,$N0,$na,$T0a
  631. fmadd $T0b,$N0,$nb,$T0b
  632. insrwi $t4,$t6,16,0 ; 32..63 bits
  633. srwi $c1,$t7,16
  634. insrwi $carry,$t7,16,0
  635. fmadd $T1a,$N0,$nc,$T1a
  636. fmadd $T1b,$N0,$nd,$T1b
  637. lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
  638. lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
  639. addc $t2,$t2,$carry
  640. adde $t3,$t3,$c1
  641. srwi $carry,$t2,16
  642. fmadd $T2a,$N1,$nc,$T2a
  643. fmadd $T2b,$N1,$nd,$T2b
  644. stw $t0,12($tp) ; tp[j-1]
  645. stw $t4,8($tp)
  646. srwi $c1,$t3,16
  647. insrwi $carry,$t3,16,0
  648. fmadd $T3a,$N2,$nc,$T3a
  649. fmadd $T3b,$N2,$nd,$T3b
  650. lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
  651. lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
  652. addc $t6,$t6,$carry
  653. adde $t7,$t7,$c1
  654. srwi $carry,$t6,16
  655. fmadd $dota,$N3,$nc,$dota
  656. fmadd $dotb,$N3,$nd,$dotb
  657. insrwi $t2,$t6,16,0 ; 64..95 bits
  658. srwi $c1,$t7,16
  659. insrwi $carry,$t7,16,0
  660. fctid $T0a,$T0a
  661. fctid $T0b,$T0b
  662. lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
  663. lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
  664. addc $t0,$t0,$carry
  665. adde $t1,$t1,$c1
  666. srwi $carry,$t0,16
  667. fctid $T1a,$T1a
  668. fctid $T1b,$T1b
  669. srwi $c1,$t1,16
  670. insrwi $carry,$t1,16,0
  671. fctid $T2a,$T2a
  672. fctid $T2b,$T2b
  673. addc $t4,$t4,$carry
  674. adde $t5,$t5,$c1
  675. srwi $carry,$t4,16
  676. fctid $T3a,$T3a
  677. fctid $T3b,$T3b
  678. insrwi $t0,$t4,16,0 ; 96..127 bits
  679. srwi $c1,$t5,16
  680. insrwi $carry,$t5,16,0
  681. stfd $T0a,`$FRAME+0`($sp)
  682. stfd $T0b,`$FRAME+8`($sp)
  683. stfd $T1a,`$FRAME+16`($sp)
  684. stfd $T1b,`$FRAME+24`($sp)
  685. stfd $T2a,`$FRAME+32`($sp)
  686. stfd $T2b,`$FRAME+40`($sp)
  687. stfd $T3a,`$FRAME+48`($sp)
  688. stfd $T3b,`$FRAME+56`($sp)
  689. stw $t2,20($tp) ; tp[j]
  690. stwu $t0,16($tp)
  691. ___
  692. }
  693. $code.=<<___;
  694. bdnz L1st
  695. fctid $dota,$dota
  696. fctid $dotb,$dotb
  697. ___
  698. if ($SIZE_T==8 or $flavour =~ /osx/) {
  699. $code.=<<___;
  700. ld $t0,`$FRAME+0`($sp)
  701. ld $t1,`$FRAME+8`($sp)
  702. ld $t2,`$FRAME+16`($sp)
  703. ld $t3,`$FRAME+24`($sp)
  704. ld $t4,`$FRAME+32`($sp)
  705. ld $t5,`$FRAME+40`($sp)
  706. ld $t6,`$FRAME+48`($sp)
  707. ld $t7,`$FRAME+56`($sp)
  708. stfd $dota,`$FRAME+64`($sp)
  709. stfd $dotb,`$FRAME+72`($sp)
  710. add $t0,$t0,$carry ; can not overflow
  711. srdi $carry,$t0,16
  712. add $t1,$t1,$carry
  713. srdi $carry,$t1,16
  714. insrdi $t0,$t1,16,32
  715. add $t2,$t2,$carry
  716. srdi $carry,$t2,16
  717. insrdi $t0,$t2,16,16
  718. add $t3,$t3,$carry
  719. srdi $carry,$t3,16
  720. insrdi $t0,$t3,16,0 ; 0..63 bits
  721. add $t4,$t4,$carry
  722. srdi $carry,$t4,16
  723. add $t5,$t5,$carry
  724. srdi $carry,$t5,16
  725. insrdi $t4,$t5,16,32
  726. add $t6,$t6,$carry
  727. srdi $carry,$t6,16
  728. insrdi $t4,$t6,16,16
  729. add $t7,$t7,$carry
  730. insrdi $t4,$t7,16,0 ; 64..127 bits
  731. srdi $carry,$t7,16 ; upper 33 bits
  732. ld $t6,`$FRAME+64`($sp)
  733. ld $t7,`$FRAME+72`($sp)
  734. std $t0,8($tp) ; tp[j-1]
  735. stdu $t4,16($tp) ; tp[j]
  736. add $t6,$t6,$carry ; can not overflow
  737. srdi $carry,$t6,16
  738. add $t7,$t7,$carry
  739. insrdi $t6,$t7,48,0
  740. srdi $ovf,$t7,48
  741. std $t6,8($tp) ; tp[num-1]
  742. ___
  743. } else {
  744. $code.=<<___;
  745. lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
  746. lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
  747. lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
  748. lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
  749. lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
  750. lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
  751. lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
  752. lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
  753. stfd $dota,`$FRAME+64`($sp)
  754. stfd $dotb,`$FRAME+72`($sp)
  755. addc $t0,$t0,$carry
  756. adde $t1,$t1,$c1
  757. srwi $carry,$t0,16
  758. insrwi $carry,$t1,16,0
  759. srwi $c1,$t1,16
  760. addc $t2,$t2,$carry
  761. adde $t3,$t3,$c1
  762. srwi $carry,$t2,16
  763. insrwi $t0,$t2,16,0 ; 0..31 bits
  764. insrwi $carry,$t3,16,0
  765. srwi $c1,$t3,16
  766. addc $t4,$t4,$carry
  767. adde $t5,$t5,$c1
  768. srwi $carry,$t4,16
  769. insrwi $carry,$t5,16,0
  770. srwi $c1,$t5,16
  771. addc $t6,$t6,$carry
  772. adde $t7,$t7,$c1
  773. srwi $carry,$t6,16
  774. insrwi $t4,$t6,16,0 ; 32..63 bits
  775. insrwi $carry,$t7,16,0
  776. srwi $c1,$t7,16
  777. stw $t0,12($tp) ; tp[j-1]
  778. stw $t4,8($tp)
  779. lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
  780. lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
  781. lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
  782. lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
  783. lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
  784. lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
  785. lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
  786. lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
  787. addc $t2,$t2,$carry
  788. adde $t3,$t3,$c1
  789. srwi $carry,$t2,16
  790. insrwi $carry,$t3,16,0
  791. srwi $c1,$t3,16
  792. addc $t6,$t6,$carry
  793. adde $t7,$t7,$c1
  794. srwi $carry,$t6,16
  795. insrwi $t2,$t6,16,0 ; 64..95 bits
  796. insrwi $carry,$t7,16,0
  797. srwi $c1,$t7,16
  798. addc $t0,$t0,$carry
  799. adde $t1,$t1,$c1
  800. srwi $carry,$t0,16
  801. insrwi $carry,$t1,16,0
  802. srwi $c1,$t1,16
  803. addc $t4,$t4,$carry
  804. adde $t5,$t5,$c1
  805. srwi $carry,$t4,16
  806. insrwi $t0,$t4,16,0 ; 96..127 bits
  807. insrwi $carry,$t5,16,0
  808. srwi $c1,$t5,16
  809. stw $t2,20($tp) ; tp[j]
  810. stwu $t0,16($tp)
  811. lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
  812. lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
  813. lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
  814. lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
  815. addc $t6,$t6,$carry
  816. adde $t7,$t7,$c1
  817. srwi $carry,$t6,16
  818. insrwi $carry,$t7,16,0
  819. srwi $c1,$t7,16
  820. addc $t4,$t4,$carry
  821. adde $t5,$t5,$c1
  822. insrwi $t6,$t4,16,0
  823. srwi $t4,$t4,16
  824. insrwi $t4,$t5,16,0
  825. srwi $ovf,$t5,16
  826. stw $t6,12($tp) ; tp[num-1]
  827. stw $t4,8($tp)
  828. ___
  829. }
  830. $code.=<<___;
  831. slwi $t7,$num,2
  832. subf $nap_d,$t7,$nap_d ; rewind pointer
  833. li $i,8 ; i=1
  834. .align 5
  835. Louter:
  836. addi $tp,$sp,`$FRAME+$TRANSFER`
  837. li $carry,0
  838. mtctr $j
  839. ___
  840. $code.=<<___ if ($SIZE_T==8);
  841. ldx $t3,$bp,$i ; bp[i]
  842. ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
  843. mulld $t7,$a0,$t3 ; ap[0]*bp[i]
  844. add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
  845. ; transfer bp[i] to FPU as 4x16-bit values
  846. extrdi $t0,$t3,16,48
  847. extrdi $t1,$t3,16,32
  848. extrdi $t2,$t3,16,16
  849. extrdi $t3,$t3,16,0
  850. std $t0,`$FRAME+0`($sp)
  851. std $t1,`$FRAME+8`($sp)
  852. std $t2,`$FRAME+16`($sp)
  853. std $t3,`$FRAME+24`($sp)
  854. mulld $t7,$t7,$n0 ; tp[0]*n0
  855. ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
  856. extrdi $t4,$t7,16,48
  857. extrdi $t5,$t7,16,32
  858. extrdi $t6,$t7,16,16
  859. extrdi $t7,$t7,16,0
  860. std $t4,`$FRAME+32`($sp)
  861. std $t5,`$FRAME+40`($sp)
  862. std $t6,`$FRAME+48`($sp)
  863. std $t7,`$FRAME+56`($sp)
  864. ___
  865. $code.=<<___ if ($SIZE_T==4);
  866. add $t0,$bp,$i
  867. li $c1,0
  868. lwz $t1,0($t0) ; bp[i,i+1]
  869. lwz $t3,4($t0)
  870. mullw $t4,$a0,$t1 ; ap[0]*bp[i]
  871. lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
  872. mulhwu $t5,$a0,$t1
  873. lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
  874. mullw $t6,$a1,$t1
  875. mullw $t7,$a0,$t3
  876. add $t5,$t5,$t6
  877. add $t5,$t5,$t7
  878. addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
  879. adde $t5,$t5,$t2
  880. ; transfer bp[i] to FPU as 4x16-bit values
  881. extrwi $t0,$t1,16,16
  882. extrwi $t1,$t1,16,0
  883. extrwi $t2,$t3,16,16
  884. extrwi $t3,$t3,16,0
  885. std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
  886. std $t1,`$FRAME+8`($sp)
  887. std $t2,`$FRAME+16`($sp)
  888. std $t3,`$FRAME+24`($sp)
  889. mullw $t0,$t4,$n0 ; mulld tp[0]*n0
  890. mulhwu $t1,$t4,$n0
  891. mullw $t2,$t5,$n0
  892. mullw $t3,$t4,$n1
  893. add $t1,$t1,$t2
  894. add $t1,$t1,$t3
  895. ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
  896. extrwi $t4,$t0,16,16
  897. extrwi $t5,$t0,16,0
  898. extrwi $t6,$t1,16,16
  899. extrwi $t7,$t1,16,0
  900. std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
  901. std $t5,`$FRAME+40`($sp)
  902. std $t6,`$FRAME+48`($sp)
  903. std $t7,`$FRAME+56`($sp)
  904. ___
  # Append the floating-point set-up that precedes the inner loop, plus the
  # first (FPU-only) part of the Linner body.  The emitted code:
  #   - loads $A0..$A3 and $N0..$N3 from $nap_d (the final lfdu also
  #     advances $nap_d by 64),
  #   - loads the eight frame slots $FRAME+0..56 into $ba..$bd/$na..$nd and
  #     converts them with fcfid,
  #   - forms the $T0..$T3 a/b sums and $dota/$dotb with fmul/fmadd while
  #     reloading $A0..$A3 from the advanced $nap_d,
  #   - converts the $T* values with fctid and stores them back to
  #     $FRAME+0..56.
  # The heredoc ends part-way through the Linner body; the if/else that
  # follows appends the rest of it, and the closing "bdnz Linner" is
  # appended after that.
  905. $code.=<<___;
  906. lfd $A0,8($nap_d) ; load a[j] in double format
  907. lfd $A1,16($nap_d)
  908. lfd $A2,24($nap_d) ; load a[j+1] in double format
  909. lfd $A3,32($nap_d)
  910. lfd $N0,40($nap_d) ; load n[j] in double format
  911. lfd $N1,48($nap_d)
  912. lfd $N2,56($nap_d) ; load n[j+1] in double format
  913. lfdu $N3,64($nap_d)
  914. lfd $ba,`$FRAME+0`($sp)
  915. lfd $bb,`$FRAME+8`($sp)
  916. lfd $bc,`$FRAME+16`($sp)
  917. lfd $bd,`$FRAME+24`($sp)
  918. lfd $na,`$FRAME+32`($sp)
  919. lfd $nb,`$FRAME+40`($sp)
  920. lfd $nc,`$FRAME+48`($sp)
  921. lfd $nd,`$FRAME+56`($sp)
  922. fcfid $ba,$ba
  923. fcfid $bb,$bb
  924. fcfid $bc,$bc
  925. fcfid $bd,$bd
  926. fcfid $na,$na
  927. fcfid $nb,$nb
  928. fcfid $nc,$nc
  929. fcfid $nd,$nd
  930. fmul $T1a,$A1,$ba
  931. fmul $T1b,$A1,$bb
  932. fmul $T2a,$A2,$ba
  933. fmul $T2b,$A2,$bb
  934. fmul $T3a,$A3,$ba
  935. fmul $T3b,$A3,$bb
  936. fmul $T0a,$A0,$ba
  937. fmul $T0b,$A0,$bb
  938. fmadd $T1a,$A0,$bc,$T1a
  939. fmadd $T1b,$A0,$bd,$T1b
  940. fmadd $T2a,$A1,$bc,$T2a
  941. fmadd $T2b,$A1,$bd,$T2b
  942. fmadd $T3a,$A2,$bc,$T3a
  943. fmadd $T3b,$A2,$bd,$T3b
  944. fmul $dota,$A3,$bc
  945. fmul $dotb,$A3,$bd
  946. fmadd $T1a,$N1,$na,$T1a
  947. fmadd $T1b,$N1,$nb,$T1b
  948. lfd $A0,8($nap_d) ; load a[j] in double format
  949. lfd $A1,16($nap_d)
  950. fmadd $T2a,$N2,$na,$T2a
  951. fmadd $T2b,$N2,$nb,$T2b
  952. lfd $A2,24($nap_d) ; load a[j+1] in double format
  953. lfd $A3,32($nap_d)
  954. fmadd $T3a,$N3,$na,$T3a
  955. fmadd $T3b,$N3,$nb,$T3b
  956. fmadd $T0a,$N0,$na,$T0a
  957. fmadd $T0b,$N0,$nb,$T0b
  958. fmadd $T1a,$N0,$nc,$T1a
  959. fmadd $T1b,$N0,$nd,$T1b
  960. fmadd $T2a,$N1,$nc,$T2a
  961. fmadd $T2b,$N1,$nd,$T2b
  962. fmadd $T3a,$N2,$nc,$T3a
  963. fmadd $T3b,$N2,$nd,$T3b
  964. fmadd $dota,$N3,$nc,$dota
  965. fmadd $dotb,$N3,$nd,$dotb
  966. fctid $T0a,$T0a
  967. fctid $T0b,$T0b
  968. fctid $T1a,$T1a
  969. fctid $T1b,$T1b
  970. fctid $T2a,$T2a
  971. fctid $T2b,$T2b
  972. fctid $T3a,$T3a
  973. fctid $T3b,$T3b
  974. stfd $T0a,`$FRAME+0`($sp)
  975. stfd $T0b,`$FRAME+8`($sp)
  976. stfd $T1a,`$FRAME+16`($sp)
  977. stfd $T1b,`$FRAME+24`($sp)
  978. stfd $T2a,`$FRAME+32`($sp)
  979. stfd $T2b,`$FRAME+40`($sp)
  980. stfd $T3a,`$FRAME+48`($sp)
  981. stfd $T3b,`$FRAME+56`($sp)
  982. .align 5
  983. Linner:
  984. fmul $T1a,$A1,$ba
  985. fmul $T1b,$A1,$bb
  986. fmul $T2a,$A2,$ba
  987. fmul $T2b,$A2,$bb
  988. lfd $N0,40($nap_d) ; load n[j] in double format
  989. lfd $N1,48($nap_d)
  990. fmul $T3a,$A3,$ba
  991. fmul $T3b,$A3,$bb
  992. fmadd $T0a,$A0,$ba,$dota
  993. fmadd $T0b,$A0,$bb,$dotb
  994. lfd $N2,56($nap_d) ; load n[j+1] in double format
  995. lfdu $N3,64($nap_d)
  996. fmadd $T1a,$A0,$bc,$T1a
  997. fmadd $T1b,$A0,$bd,$T1b
  998. fmadd $T2a,$A1,$bc,$T2a
  999. fmadd $T2b,$A1,$bd,$T2b
  1000. lfd $A0,8($nap_d) ; load a[j] in double format
  1001. lfd $A1,16($nap_d)
  1002. fmadd $T3a,$A2,$bc,$T3a
  1003. fmadd $T3b,$A2,$bd,$T3b
  1004. fmul $dota,$A3,$bc
  1005. fmul $dotb,$A3,$bd
  1006. lfd $A2,24($nap_d) ; load a[j+1] in double format
  1007. lfd $A3,32($nap_d)
  1008. ___
  # Remainder of the Linner body.  Both branches emit the same
  # fmadd/fctid/stfd sequence on $N0..$N3 and $na..$nd; they differ in the
  # integer instructions interleaved with it.  In both, the integer side
  # reads the values stored to $FRAME+0..56, propagates $carry through them
  # in 16-bit steps, adds the tp[] words loaded via $tp and stores the
  # result back through $tp.
  #
  # This first branch uses 64-bit integer instructions (ld/srdi/insrdi/std)
  # and is selected for $SIZE_T==8 or for any flavour matching /osx/.
  1009. if ($SIZE_T==8 or $flavour =~ /osx/) {
  1010. $code.=<<___;
  1011. fmadd $T1a,$N1,$na,$T1a
  1012. fmadd $T1b,$N1,$nb,$T1b
  1013. ld $t0,`$FRAME+0`($sp)
  1014. ld $t1,`$FRAME+8`($sp)
  1015. fmadd $T2a,$N2,$na,$T2a
  1016. fmadd $T2b,$N2,$nb,$T2b
  1017. ld $t2,`$FRAME+16`($sp)
  1018. ld $t3,`$FRAME+24`($sp)
  1019. fmadd $T3a,$N3,$na,$T3a
  1020. fmadd $T3b,$N3,$nb,$T3b
  1021. add $t0,$t0,$carry ; can not overflow
  1022. ld $t4,`$FRAME+32`($sp)
  1023. ld $t5,`$FRAME+40`($sp)
  1024. fmadd $T0a,$N0,$na,$T0a
  1025. fmadd $T0b,$N0,$nb,$T0b
  1026. srdi $carry,$t0,16
  1027. add $t1,$t1,$carry
  1028. srdi $carry,$t1,16
  1029. ld $t6,`$FRAME+48`($sp)
  1030. ld $t7,`$FRAME+56`($sp)
  1031. fmadd $T1a,$N0,$nc,$T1a
  1032. fmadd $T1b,$N0,$nd,$T1b
  1033. insrdi $t0,$t1,16,32
  1034. ld $t1,8($tp) ; tp[j]
  1035. fmadd $T2a,$N1,$nc,$T2a
  1036. fmadd $T2b,$N1,$nd,$T2b
  1037. add $t2,$t2,$carry
  1038. fmadd $T3a,$N2,$nc,$T3a
  1039. fmadd $T3b,$N2,$nd,$T3b
  1040. srdi $carry,$t2,16
  1041. insrdi $t0,$t2,16,16
  1042. fmadd $dota,$N3,$nc,$dota
  1043. fmadd $dotb,$N3,$nd,$dotb
  1044. add $t3,$t3,$carry
  1045. ldu $t2,16($tp) ; tp[j+1]
  1046. srdi $carry,$t3,16
  1047. insrdi $t0,$t3,16,0 ; 0..63 bits
  1048. add $t4,$t4,$carry
  1049. fctid $T0a,$T0a
  1050. fctid $T0b,$T0b
  1051. srdi $carry,$t4,16
  1052. fctid $T1a,$T1a
  1053. fctid $T1b,$T1b
  1054. add $t5,$t5,$carry
  1055. fctid $T2a,$T2a
  1056. fctid $T2b,$T2b
  1057. srdi $carry,$t5,16
  1058. insrdi $t4,$t5,16,32
  1059. fctid $T3a,$T3a
  1060. fctid $T3b,$T3b
  1061. add $t6,$t6,$carry
  1062. srdi $carry,$t6,16
  1063. insrdi $t4,$t6,16,16
  1064. stfd $T0a,`$FRAME+0`($sp)
  1065. stfd $T0b,`$FRAME+8`($sp)
  1066. add $t7,$t7,$carry
  1067. addc $t3,$t0,$t1
  1068. ___
  # Given the enclosing condition, $SIZE_T==4 can only be true here when the
  # flavour matched /osx/.  In that case three extra instructions are emitted
  # after the addc: extrdi on both operands followed by adde.
  # NOTE(review): presumably XER[CA] reflects only the low 32-bit add in that
  # environment, so the upper halves are re-added to fix the carry — confirm.
  1069. $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
  1070. extrdi $t0,$t0,32,0
  1071. extrdi $t1,$t1,32,0
  1072. adde $t0,$t0,$t1
  1073. ___
  1074. $code.=<<___;
  1075. stfd $T1a,`$FRAME+16`($sp)
  1076. stfd $T1b,`$FRAME+24`($sp)
  1077. insrdi $t4,$t7,16,0 ; 64..127 bits
  1078. srdi $carry,$t7,16 ; upper 33 bits
  1079. stfd $T2a,`$FRAME+32`($sp)
  1080. stfd $T2b,`$FRAME+40`($sp)
  1081. adde $t5,$t4,$t2
  1082. ___
  # Same carry adjustment as above, applied to the second addition.
  1083. $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
  1084. extrdi $t4,$t4,32,0
  1085. extrdi $t2,$t2,32,0
  1086. adde $t4,$t4,$t2
  1087. ___
  1088. $code.=<<___;
  1089. stfd $T3a,`$FRAME+48`($sp)
  1090. stfd $T3b,`$FRAME+56`($sp)
  1091. addze $carry,$carry
  1092. std $t3,-16($tp) ; tp[j-1]
  1093. std $t5,-8($tp) ; tp[j]
  1094. ___
  1095. } else {
  # 32-bit variant: every 64-bit frame slot is read as two 32-bit halves
  # with lwz, and the running carry is held in the register pair
  # $carry (added with addc) and $c1 (added with adde).
  # The `^$LITTLE_ENDIAN` in the offset expressions is evaluated together
  # with the rest of the backtick expression at the end of the script;
  # $LITTLE_ENDIAN is set outside this view — presumably it swaps which
  # half of each slot is read on little-endian targets (TODO confirm).
  1096. $code.=<<___;
  1097. fmadd $T1a,$N1,$na,$T1a
  1098. fmadd $T1b,$N1,$nb,$T1b
  1099. lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
  1100. lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
  1101. fmadd $T2a,$N2,$na,$T2a
  1102. fmadd $T2b,$N2,$nb,$T2b
  1103. lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
  1104. lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
  1105. fmadd $T3a,$N3,$na,$T3a
  1106. fmadd $T3b,$N3,$nb,$T3b
  1107. lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
  1108. lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
  1109. addc $t0,$t0,$carry
  1110. adde $t1,$t1,$c1
  1111. srwi $carry,$t0,16
  1112. fmadd $T0a,$N0,$na,$T0a
  1113. fmadd $T0b,$N0,$nb,$T0b
  1114. lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
  1115. lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
  1116. srwi $c1,$t1,16
  1117. insrwi $carry,$t1,16,0
  1118. fmadd $T1a,$N0,$nc,$T1a
  1119. fmadd $T1b,$N0,$nd,$T1b
  1120. addc $t2,$t2,$carry
  1121. adde $t3,$t3,$c1
  1122. srwi $carry,$t2,16
  1123. fmadd $T2a,$N1,$nc,$T2a
  1124. fmadd $T2b,$N1,$nd,$T2b
  1125. insrwi $t0,$t2,16,0 ; 0..31 bits
  1126. srwi $c1,$t3,16
  1127. insrwi $carry,$t3,16,0
  1128. fmadd $T3a,$N2,$nc,$T3a
  1129. fmadd $T3b,$N2,$nd,$T3b
  1130. lwz $t2,12($tp) ; tp[j]
  1131. lwz $t3,8($tp)
  1132. addc $t4,$t4,$carry
  1133. adde $t5,$t5,$c1
  1134. srwi $carry,$t4,16
  1135. fmadd $dota,$N3,$nc,$dota
  1136. fmadd $dotb,$N3,$nd,$dotb
  1137. srwi $c1,$t5,16
  1138. insrwi $carry,$t5,16,0
  1139. fctid $T0a,$T0a
  1140. addc $t6,$t6,$carry
  1141. adde $t7,$t7,$c1
  1142. srwi $carry,$t6,16
  1143. fctid $T0b,$T0b
  1144. insrwi $t4,$t6,16,0 ; 32..63 bits
  1145. srwi $c1,$t7,16
  1146. insrwi $carry,$t7,16,0
  1147. fctid $T1a,$T1a
  1148. addc $t0,$t0,$t2
  1149. adde $t4,$t4,$t3
  1150. lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
  1151. lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
  1152. fctid $T1b,$T1b
  1153. addze $carry,$carry
  1154. addze $c1,$c1
  1155. stw $t0,4($tp) ; tp[j-1]
  1156. stw $t4,0($tp)
  1157. fctid $T2a,$T2a
  1158. addc $t2,$t2,$carry
  1159. adde $t3,$t3,$c1
  1160. srwi $carry,$t2,16
  1161. lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
  1162. lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
  1163. fctid $T2b,$T2b
  1164. srwi $c1,$t3,16
  1165. insrwi $carry,$t3,16,0
  1166. lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
  1167. lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
  1168. fctid $T3a,$T3a
  1169. addc $t6,$t6,$carry
  1170. adde $t7,$t7,$c1
  1171. srwi $carry,$t6,16
  1172. lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
  1173. lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
  1174. fctid $T3b,$T3b
  1175. insrwi $t2,$t6,16,0 ; 64..95 bits
  1176. insrwi $carry,$t7,16,0
  1177. srwi $c1,$t7,16
  1178. lwz $t6,20($tp)
  1179. lwzu $t7,16($tp)
  1180. addc $t0,$t0,$carry
  1181. stfd $T0a,`$FRAME+0`($sp)
  1182. adde $t1,$t1,$c1
  1183. srwi $carry,$t0,16
  1184. stfd $T0b,`$FRAME+8`($sp)
  1185. insrwi $carry,$t1,16,0
  1186. srwi $c1,$t1,16
  1187. addc $t4,$t4,$carry
  1188. stfd $T1a,`$FRAME+16`($sp)
  1189. adde $t5,$t5,$c1
  1190. srwi $carry,$t4,16
  1191. insrwi $t0,$t4,16,0 ; 96..127 bits
  1192. stfd $T1b,`$FRAME+24`($sp)
  1193. insrwi $carry,$t5,16,0
  1194. srwi $c1,$t5,16
  1195. addc $t2,$t2,$t6
  1196. stfd $T2a,`$FRAME+32`($sp)
  1197. adde $t0,$t0,$t7
  1198. stfd $T2b,`$FRAME+40`($sp)
  1199. addze $carry,$carry
  1200. stfd $T3a,`$FRAME+48`($sp)
  1201. addze $c1,$c1
  1202. stfd $T3b,`$FRAME+56`($sp)
  1203. stw $t2,-4($tp) ; tp[j]
  1204. stw $t0,-8($tp)
  1205. ___
  1206. }
  # Close the inner loop: bdnz branches back to Linner until the count
  # register is exhausted.  After the loop, the $dota/$dotb accumulators
  # are converted with fctid, as the $T* values were inside the loop.
  1207. $code.=<<___;
  1208. bdnz Linner
  1209. fctid $dota,$dota
  1210. fctid $dotb,$dotb
  1211. ___
  # Post-loop tail.  The emitted code drains the last eight values left in
  # $FRAME+0..56 (same 16-bit carry propagation as in the loop body), adds
  # the corresponding tp[] words and stores them through $tp.  It then
  # spills the converted $dota/$dotb to $FRAME+64/72, reloads them as
  # integers, adds the running carry and $ovf, stores the topmost tp word
  # and leaves the new overflow in $ovf.
  #
  # As in the loop body, the first branch uses 64-bit integer instructions
  # and is selected for $SIZE_T==8 or for any flavour matching /osx/.
  1212. if ($SIZE_T==8 or $flavour =~ /osx/) {
  1213. $code.=<<___;
  1214. ld $t0,`$FRAME+0`($sp)
  1215. ld $t1,`$FRAME+8`($sp)
  1216. ld $t2,`$FRAME+16`($sp)
  1217. ld $t3,`$FRAME+24`($sp)
  1218. ld $t4,`$FRAME+32`($sp)
  1219. ld $t5,`$FRAME+40`($sp)
  1220. ld $t6,`$FRAME+48`($sp)
  1221. ld $t7,`$FRAME+56`($sp)
  1222. stfd $dota,`$FRAME+64`($sp)
  1223. stfd $dotb,`$FRAME+72`($sp)
  1224. add $t0,$t0,$carry ; can not overflow
  1225. srdi $carry,$t0,16
  1226. add $t1,$t1,$carry
  1227. srdi $carry,$t1,16
  1228. insrdi $t0,$t1,16,32
  1229. add $t2,$t2,$carry
  1230. ld $t1,8($tp) ; tp[j]
  1231. srdi $carry,$t2,16
  1232. insrdi $t0,$t2,16,16
  1233. add $t3,$t3,$carry
  1234. ldu $t2,16($tp) ; tp[j+1]
  1235. srdi $carry,$t3,16
  1236. insrdi $t0,$t3,16,0 ; 0..63 bits
  1237. add $t4,$t4,$carry
  1238. srdi $carry,$t4,16
  1239. add $t5,$t5,$carry
  1240. srdi $carry,$t5,16
  1241. insrdi $t4,$t5,16,32
  1242. add $t6,$t6,$carry
  1243. srdi $carry,$t6,16
  1244. insrdi $t4,$t6,16,16
  1245. add $t7,$t7,$carry
  1246. insrdi $t4,$t7,16,0 ; 64..127 bits
  1247. srdi $carry,$t7,16 ; upper 33 bits
  1248. ld $t6,`$FRAME+64`($sp)
  1249. ld $t7,`$FRAME+72`($sp)
  1250. addc $t3,$t0,$t1
  1251. ___
  # Given the enclosing condition, $SIZE_T==4 can only be true here when the
  # flavour matched /osx/; the extra extrdi/adde sequence is then emitted
  # after the addc (see the existing "adjust XER[CA]" note).
  1252. $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
  1253. extrdi $t0,$t0,32,0
  1254. extrdi $t1,$t1,32,0
  1255. adde $t0,$t0,$t1
  1256. ___
  1257. $code.=<<___;
  1258. adde $t5,$t4,$t2
  1259. ___
  # Same carry adjustment, applied to the second addition.
  1260. $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
  1261. extrdi $t4,$t4,32,0
  1262. extrdi $t2,$t2,32,0
  1263. adde $t4,$t4,$t2
  1264. ___
  1265. $code.=<<___;
  1266. addze $carry,$carry
  1267. std $t3,-16($tp) ; tp[j-1]
  1268. std $t5,-8($tp) ; tp[j]
  1269. add $carry,$carry,$ovf ; consume upmost overflow
  1270. add $t6,$t6,$carry ; can not overflow
  1271. srdi $carry,$t6,16
  1272. add $t7,$t7,$carry
  1273. insrdi $t6,$t7,48,0
  1274. srdi $ovf,$t7,48
  1275. std $t6,0($tp) ; tp[num-1]
  1276. ___
  1277. } else {
  # 32-bit variant of the same tail: frame slots are read as 32-bit halves
  # with lwz and the carry is kept in the $carry/$c1 register pair.
  # $LITTLE_ENDIAN in the offset expressions is defined outside this view.
  1278. $code.=<<___;
  1279. lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
  1280. lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
  1281. lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
  1282. lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
  1283. lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
  1284. lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
  1285. lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
  1286. lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
  1287. stfd $dota,`$FRAME+64`($sp)
  1288. stfd $dotb,`$FRAME+72`($sp)
  1289. addc $t0,$t0,$carry
  1290. adde $t1,$t1,$c1
  1291. srwi $carry,$t0,16
  1292. insrwi $carry,$t1,16,0
  1293. srwi $c1,$t1,16
  1294. addc $t2,$t2,$carry
  1295. adde $t3,$t3,$c1
  1296. srwi $carry,$t2,16
  1297. insrwi $t0,$t2,16,0 ; 0..31 bits
  1298. lwz $t2,12($tp) ; tp[j]
  1299. insrwi $carry,$t3,16,0
  1300. srwi $c1,$t3,16
  1301. lwz $t3,8($tp)
  1302. addc $t4,$t4,$carry
  1303. adde $t5,$t5,$c1
  1304. srwi $carry,$t4,16
  1305. insrwi $carry,$t5,16,0
  1306. srwi $c1,$t5,16
  1307. addc $t6,$t6,$carry
  1308. adde $t7,$t7,$c1
  1309. srwi $carry,$t6,16
  1310. insrwi $t4,$t6,16,0 ; 32..63 bits
  1311. insrwi $carry,$t7,16,0
  1312. srwi $c1,$t7,16
  1313. addc $t0,$t0,$t2
  1314. adde $t4,$t4,$t3
  1315. addze $carry,$carry
  1316. addze $c1,$c1
  1317. stw $t0,4($tp) ; tp[j-1]
  1318. stw $t4,0($tp)
  1319. lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
  1320. lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
  1321. lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
  1322. lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
  1323. lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
  1324. lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
  1325. lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
  1326. lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
  1327. addc $t2,$t2,$carry
  1328. adde $t3,$t3,$c1
  1329. srwi $carry,$t2,16
  1330. insrwi $carry,$t3,16,0
  1331. srwi $c1,$t3,16
  1332. addc $t6,$t6,$carry
  1333. adde $t7,$t7,$c1
  1334. srwi $carry,$t6,16
  1335. insrwi $t2,$t6,16,0 ; 64..95 bits
  1336. lwz $t6,20($tp)
  1337. insrwi $carry,$t7,16,0
  1338. srwi $c1,$t7,16
  1339. lwzu $t7,16($tp)
  1340. addc $t0,$t0,$carry
  1341. adde $t1,$t1,$c1
  1342. srwi $carry,$t0,16
  1343. insrwi $carry,$t1,16,0
  1344. srwi $c1,$t1,16
  1345. addc $t4,$t4,$carry
  1346. adde $t5,$t5,$c1
  1347. srwi $carry,$t4,16
  1348. insrwi $t0,$t4,16,0 ; 96..127 bits
  1349. insrwi $carry,$t5,16,0
  1350. srwi $c1,$t5,16
  1351. addc $t2,$t2,$t6
  1352. adde $t0,$t0,$t7
  1353. lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
  1354. lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
  1355. addze $carry,$carry
  1356. addze $c1,$c1
  1357. lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
  1358. lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
  1359. addc $t6,$t6,$carry
  1360. adde $t7,$t7,$c1
  1361. stw $t2,-4($tp) ; tp[j]
  1362. stw $t0,-8($tp)
  1363. addc $t6,$t6,$ovf
  1364. addze $t7,$t7
  1365. srwi $carry,$t6,16
  1366. insrwi $carry,$t7,16,0
  1367. srwi $c1,$t7,16
  1368. addc $t4,$t4,$carry
  1369. adde $t5,$t5,$c1
  1370. insrwi $t6,$t4,16,0
  1371. srwi $t4,$t4,16
  1372. insrwi $t4,$t5,16,0
  1373. srwi $ovf,$t5,16
  1374. stw $t6,4($tp) ; tp[num-1]
  1375. stw $t4,0($tp)
  1376. ___
  1377. }
  # Outer-loop bookkeeping: advance $i by 8, move $nap_d back by $num*4
  # bytes ($num shifted left by 2) and branch back to Louter (a label
  # defined outside this view) while $i < $num.
  1378. $code.=<<___;
  1379. slwi $t7,$num,2
  1380. addi $i,$i,8
  1381. subf $nap_d,$t7,$nap_d ; rewind pointer
  1382. cmpw $i,$num
  1383. blt- Louter
  1384. ___
  # Final subtraction and copy-out, emitted only when $SIZE_T==8.
  #   Lsub  - computes tp[] - np[] with borrow propagation (subfe) and
  #           stores the difference to rp[], two 64-bit words per pass,
  #           indexed by $i which advances by 16.
  #   after - $ovf is turned into a mask by subtracting the final borrow,
  #           and the mask selects tp or rp as the copy source in $ap.
  #   Lcopy - copies two words per pass from the selected source to rp[]
  #           and overwrites the nap_d area and tp[].
  # Note that the value stored by the "zap" instructions is $i itself: it is
  # 0 on the first pass and then advances by 16 per pass, so the areas are
  # overwritten with the index value rather than with a constant zero.
  1385. $code.=<<___ if ($SIZE_T==8);
  1386. subf $np,$num,$np ; rewind np
  1387. addi $j,$j,1 ; restore counter
  1388. subfc $i,$i,$i ; j=0 and "clear" XER[CA]
  1389. addi $tp,$sp,`$FRAME+$TRANSFER+8`
  1390. addi $t4,$sp,`$FRAME+$TRANSFER+16`
  1391. addi $t5,$np,8
  1392. addi $t6,$rp,8
  1393. mtctr $j
  1394. .align 4
  1395. Lsub: ldx $t0,$tp,$i
  1396. ldx $t1,$np,$i
  1397. ldx $t2,$t4,$i
  1398. ldx $t3,$t5,$i
  1399. subfe $t0,$t1,$t0 ; tp[j]-np[j]
  1400. subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
  1401. stdx $t0,$rp,$i
  1402. stdx $t2,$t6,$i
  1403. addi $i,$i,16
  1404. bdnz Lsub
  1405. li $i,0
  1406. subfe $ovf,$i,$ovf ; handle upmost overflow bit
  1407. and $ap,$tp,$ovf
  1408. andc $np,$rp,$ovf
  1409. or $ap,$ap,$np ; ap=borrow?tp:rp
  1410. addi $t7,$ap,8
  1411. mtctr $j
  1412. .align 4
  1413. Lcopy: ; copy or in-place refresh
  1414. ldx $t0,$ap,$i
  1415. ldx $t1,$t7,$i
  1416. std $i,8($nap_d) ; zap nap_d
  1417. std $i,16($nap_d)
  1418. std $i,24($nap_d)
  1419. std $i,32($nap_d)
  1420. std $i,40($nap_d)
  1421. std $i,48($nap_d)
  1422. std $i,56($nap_d)
  1423. stdu $i,64($nap_d)
  1424. stdx $t0,$rp,$i
  1425. stdx $t1,$t6,$i
  1426. stdx $i,$tp,$i ; zap tp at once
  1427. stdx $i,$t4,$i
  1428. addi $i,$i,16
  1429. bdnz Lcopy
  1430. ___
  # Final subtraction and copy-out, emitted only when $SIZE_T==4.
  #   Lsub  - loads four 32-bit tp words per pass with each pair swapped
  #           (offsets 12,8,20,16), loads four np words in natural order,
  #           subtracts with borrow (subfe), re-saves the tp words in
  #           32-bit word order through $ap and stores the difference
  #           through $rp.  All pointers advance via update-form
  #           instructions, so no index register is used.
  #   after - $ovf is turned into a mask by subtracting the final borrow,
  #           $rp is rewound by $num, and the mask selects either the
  #           re-saved tp copy or rp as the copy source in $ap.
  #   Lcopy - copies four 32-bit words per pass from the selected source to
  #           rp[] and overwrites the nap_d area and tp[] with $i, which is
  #           set to 0 before the loop and not modified inside it.
  1431. $code.=<<___ if ($SIZE_T==4);
  1432. subf $np,$num,$np ; rewind np
  1433. addi $j,$j,1 ; restore counter
  1434. subfc $i,$i,$i ; j=0 and "clear" XER[CA]
  1435. addi $tp,$sp,`$FRAME+$TRANSFER`
  1436. addi $np,$np,-4
  1437. addi $rp,$rp,-4
  1438. addi $ap,$sp,`$FRAME+$TRANSFER+4`
  1439. mtctr $j
  1440. .align 4
  1441. Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
  1442. lwz $t1,8($tp)
  1443. lwz $t2,20($tp)
  1444. lwzu $t3,16($tp)
  1445. lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
  1446. lwz $t5,8($np)
  1447. lwz $t6,12($np)
  1448. lwzu $t7,16($np)
  1449. subfe $t4,$t4,$t0 ; tp[j]-np[j]
  1450. stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
  1451. subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
  1452. stw $t1,8($ap)
  1453. subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
  1454. stw $t2,12($ap)
  1455. subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
  1456. stwu $t3,16($ap)
  1457. stw $t4,4($rp)
  1458. stw $t5,8($rp)
  1459. stw $t6,12($rp)
  1460. stwu $t7,16($rp)
  1461. bdnz Lsub
  1462. li $i,0
  1463. subfe $ovf,$i,$ovf ; handle upmost overflow bit
  1464. addi $tp,$sp,`$FRAME+$TRANSFER+4`
  1465. subf $rp,$num,$rp ; rewind rp
  1466. and $ap,$tp,$ovf
  1467. andc $np,$rp,$ovf
  1468. or $ap,$ap,$np ; ap=borrow?tp:rp
  1469. addi $tp,$sp,`$FRAME+$TRANSFER`
  1470. mtctr $j
  1471. .align 4
  1472. Lcopy: ; copy or in-place refresh
  1473. lwz $t0,4($ap)
  1474. lwz $t1,8($ap)
  1475. lwz $t2,12($ap)
  1476. lwzu $t3,16($ap)
  1477. std $i,8($nap_d) ; zap nap_d
  1478. std $i,16($nap_d)
  1479. std $i,24($nap_d)
  1480. std $i,32($nap_d)
  1481. std $i,40($nap_d)
  1482. std $i,48($nap_d)
  1483. std $i,56($nap_d)
  1484. stdu $i,64($nap_d)
  1485. stw $t0,4($rp)
  1486. stw $t1,8($rp)
  1487. stw $t2,12($rp)
  1488. stwu $t3,16($rp)
  1489. std $i,8($tp) ; zap tp at once
  1490. stdu $i,16($tp)
  1491. bdnz Lcopy
  1492. ___
  # Function epilogue.  The emitted code loads the word at 0($sp) into $i
  # and uses it as the base address for the register save area and, at the
  # end, as the restored stack pointer.  It sets r3 to 1, reloads r19-r31
  # (offsets scaled by $SIZE_T) and f20-f31 (8 bytes each, in the top 12*8
  # bytes below $i), restores $sp and returns with blr.
  # $POP is defined outside this view; presumably it is the pointer-sized
  # load for the target (TODO confirm).
  # The .long/.byte lines after blr are emitted as data — presumably a
  # traceback table (TODO confirm) — followed by the symbol size and an
  # identification string.
  1493. $code.=<<___;
  1494. $POP $i,0($sp)
  1495. li r3,1 ; signal "handled"
  1496. $POP r19,`-12*8-13*$SIZE_T`($i)
  1497. $POP r20,`-12*8-12*$SIZE_T`($i)
  1498. $POP r21,`-12*8-11*$SIZE_T`($i)
  1499. $POP r22,`-12*8-10*$SIZE_T`($i)
  1500. $POP r23,`-12*8-9*$SIZE_T`($i)
  1501. $POP r24,`-12*8-8*$SIZE_T`($i)
  1502. $POP r25,`-12*8-7*$SIZE_T`($i)
  1503. $POP r26,`-12*8-6*$SIZE_T`($i)
  1504. $POP r27,`-12*8-5*$SIZE_T`($i)
  1505. $POP r28,`-12*8-4*$SIZE_T`($i)
  1506. $POP r29,`-12*8-3*$SIZE_T`($i)
  1507. $POP r30,`-12*8-2*$SIZE_T`($i)
  1508. $POP r31,`-12*8-1*$SIZE_T`($i)
  1509. lfd f20,`-12*8`($i)
  1510. lfd f21,`-11*8`($i)
  1511. lfd f22,`-10*8`($i)
  1512. lfd f23,`-9*8`($i)
  1513. lfd f24,`-8*8`($i)
  1514. lfd f25,`-7*8`($i)
  1515. lfd f26,`-6*8`($i)
  1516. lfd f27,`-5*8`($i)
  1517. lfd f28,`-4*8`($i)
  1518. lfd f29,`-3*8`($i)
  1519. lfd f30,`-2*8`($i)
  1520. lfd f31,`-1*8`($i)
  1521. mr $sp,$i
  1522. blr
  1523. .long 0
  1524. .byte 0,12,4,0,0x8c,13,6,0
  1525. .long 0
  1526. .size .$fname,.-.$fname
  1527. .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
  1528. ___
  # Post-process and emit the generated assembly.
  #
  # Every `...` span embedded in the text accumulated in $code (for example
  # the frame offsets written as `$FRAME+32`) is evaluated as a Perl
  # expression and replaced by its value.
  1529. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  # Write the result and fail loudly if the output could not be delivered.
  # Buffered write errors may only surface when the handle is closed, and if
  # STDOUT was reopened as a pipe earlier in the script (not visible here),
  # close also returns false when the reading process exits with a non-zero
  # status.  Without these checks a truncated or missing output file would
  # go unnoticed and the script would still exit successfully.
  1530. print($code) or die "error writing to STDOUT: $!";
  1531. close STDOUT or die "error closing STDOUT: $!";