ppc64-mont.pl
#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2007

# The reason for this undertaking is basically the following. Even though
# the Power 6 CPU operates at an incredible 4.7GHz clock frequency, its
# PKI performance was observed to be less than impressive, essentially as
# fast as a 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
# Well, it's not surprising that IBM had to make some sacrifices to
# boost the clock frequency that much, but no overall improvement?
# Having observed how much difference switching to the FPU made on
# UltraSPARC, playing the same stunt on Power 6 appeared appropriate...
# Unfortunately the resulting performance improvement is not as
# impressive, ~30%, and in absolute terms is still very far from what
# one would expect from a 4.7GHz CPU. There is a chance that I'm doing
# something wrong, but in the absence of assembler-level micro-profiling
# data, or at least a decent platform guide, I can't tell... Better
# results might also be achieved with VMX... In any case, this module
# provides *worse* performance on other PowerPC implementations: ~15-40%
# slower on PPC970 depending on key length, and ~40% slower on Power 5
# for all key lengths. As it's obviously inappropriate as the "best
# all-round" alternative, it has to be complemented with run-time CPU
# family detection. Oh! It should also be noted that, unlike on other
# PowerPC implementations, the IALU ppc-mont.pl module performs
# *suboptimally* on >=1024-bit key lengths on Power 6. It should further
# be noted that *everything* said so far applies to 64-bit builds! As
# far as a 32-bit application executed on a 64-bit CPU goes, this module
# is likely to become the preferred choice, because it's easy to adapt
# for that case and *is* faster than 32-bit ppc-mont.pl on *all*
# processors.

# February 2008

# Micro-profiling-assisted optimization results in a ~15% improvement
# over the original ppc64-mont.pl version, or an overall ~50% improvement
# over the ppc.pl module on Power 6. Compared to ppc-mont.pl on the same
# Power 6 CPU, this module is 5-150% faster depending on key length,
# [hereafter] more for longer keys. Compared to ppc-mont.pl on a 1.8GHz
# PPC970, however, it's only 5-55% faster. Still far from impressive in
# absolute terms, but that's apparently the way Power 6 is...

# December 2009

# Adapted for a 32-bit build, this module delivers a 25-120% (yes, more
# than *twice* for longer keys) performance improvement over 32-bit
# ppc-mont.pl on a 1.8GHz PPC970. However! This implementation utilizes
# 64-bit integer operations, and the trouble is that most PPC operating
# systems don't preserve the upper halves of general-purpose registers
# upon 32-bit signal delivery. They do preserve them upon context switch,
# but not upon signalling:-( This means that asynchronous signals have
# to be blocked upon entry to this subroutine. Signal masking (and of
# course the complementary unmasking) has quite an impact on performance,
# naturally larger for shorter keys. It's so severe that 512-bit key
# performance can be as low as 1/3 of the expected one. This is why this
# routine can be engaged for longer key operations only on those OSes;
# see crypto/ppccap.c for further details. MacOS X is an exception and
# doesn't require signal masking, which is where the above improvement
# coefficients were collected. For the others, the alternative would be
# to break the dependence on the upper halves of the GPRs by sticking to
# 32-bit integer operations...

# December 2012

# Remove the above-mentioned dependence on the GPRs' upper halves in the
# 32-bit build. There is no signal-masking overhead, but the integer
# instructions are *more* numerous... It's still "universally" faster
# than 32-bit ppc-mont.pl, but the improvement coefficient is not as
# impressive for longer keys...
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";
	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
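
# Example invocations (illustrative; the flavour names follow ppc-xlate.pl
# conventions and the output file names are assumptions, not fixed):
#
#	perl ppc64-mont.pl linux64 ppc64-mont.s		# 64-bit big-endian
#	perl ppc64-mont.pl linux64le ppc64-mont.s	# 64-bit little-endian
#	perl ppc64-mont.pl linux32 ppc64-mont.s		# 32-bit build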
$FRAME=64;	# padded frame header
$TRANSFER=16*8;

$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
#     ..A3A2A1A0
#           dcba
#    -----------
#            A0a
#           A0b
#          A0c
#         A0d
#          A1a
#         A1b
#        A1c
#       A1d
#         A2a
#        A2b
#       A2c
#      A2d
#        A3a
#       A3b
#      A3c
#     A3d
#    ..a
#   ..b
#
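# Illustrative aside (not used by the generated code): why the 4x16-bit
# split stays exact in floating point. Every partial product of two
# 16-bit limbs is < 2^32, and a column accumulates only a handful of
# them, so all intermediates stay well below the 53-bit double mantissa.
# This sketch assumes a perl with 64-bit integers; all names are local
# to the sketch.
sub _demo_limb_mul {		# low 64 bits of $a*$b via 16-bit limbs
	my ($a, $b) = @_;
	my @al = map { ($a >> 16*$_) & 0xffff } 0..3;
	my @bl = map { ($b >> 16*$_) & 0xffff } 0..3;
	my @col = (0) x 7;	# column sums of partial products
	for my $i (0..3) {
		for my $j (0..3) { $col[$i+$j] += $al[$i] * $bl[$j]; }
	}
	my ($lo, $carry) = (0, 0);
	for my $k (0..3) {	# 16-bit carry steps, cf. srdi/insrdi below
		$carry += $col[$k];
		$lo |= ($carry & 0xffff) << (16*$k);
		$carry >>= 16;
	}
	return $lo;		# == ($a*$b) mod 2^64
}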
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
$dota="f8";	$dotb="f9";
$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
$T0a="f24";	$T0b="f25";
$T1a="f26";	$T1b="f27";
$T2a="f28";	$T2b="f29";
$T3a="f30";	$T3b="f31";
# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
#   +64		+-------------------------------+
#		| 16 gpr<->fpr transfer zone	|
#		.				.
#		.				.
#   +16*8	+-------------------------------+
#		| __int64 tmp[-1]		|
#		+-------------------------------+
#		| __int64 tmp[num]		|
#		.				.
#		.				.
#		.				.
#   +(num+1)*8	+-------------------------------+
#		| padding to 64 byte boundary	|
#		.				.
#   +X		+-------------------------------+
#		| double nap_d[4*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
#   -13*size_t	+-------------------------------+
#		| 13 saved gpr, r19-r31		|
#		.				.
#		.				.
#   -12*8	+-------------------------------+
#		| 12 saved fpr, f20-f31		|
#		.				.
#		.				.
#		+-------------------------------+
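
# Illustrative aside: the number of stack bytes the prologue below asks
# for, before the new stack pointer is rounded down to a 4KB page to
# minimize TLB usage. The helper name and signature are this sketch's
# own, not part of the module.
sub _demo_alloca_bytes {
	my ($num, $size_t, $rzone) = @_;	# $num counts BN_ULONGs
	my $bytes = $num * $size_t;		# num *= sizeof(BN_ULONG)
	return 4*$bytes				# double nap_d[4*num]
	     + $bytes + 8			# __int64 tmp[num+1]
	     + 64 + 16*8			# $FRAME + $TRANSFER
	     + 64 + $rzone;			# alignment pad + red zone
}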
$code=<<___;
.machine "any"
.text

.globl	.$fname
.align	5
.$fname:
	cmpwi	$num,`3*8/$SIZE_T`
	mr	$rp,r3		; $rp is reassigned
	li	r3,0		; possible "not handled" return code
	bltlr-
	andi.	r0,$num,`16/$SIZE_T-1`	; $num has to be "even"
	bnelr-

	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
	li	$i,-4096
	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
	add	$tp,$tp,$num	; place for tp[num+1]
	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
	subf	$tp,$tp,$sp	; $sp-$tp
	and	$tp,$tp,$i	; minimize TLB usage
	subf	$tp,$sp,$tp	; $tp-$sp
	mr	$i,$sp
	$STUX	$sp,$sp,$tp	; alloca

	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
	stfd	f20,`-12*8`($i)
	stfd	f21,`-11*8`($i)
	stfd	f22,`-10*8`($i)
	stfd	f23,`-9*8`($i)
	stfd	f24,`-8*8`($i)
	stfd	f25,`-7*8`($i)
	stfd	f26,`-6*8`($i)
	stfd	f27,`-5*8`($i)
	stfd	f28,`-4*8`($i)
	stfd	f29,`-3*8`($i)
	stfd	f30,`-2*8`($i)
	stfd	f31,`-1*8`($i)

	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
	li	$i,-64
	add	$nap_d,$tp,$num
	and	$nap_d,$nap_d,$i	; align to 64 bytes
	; nap_d is off by 1, because it's used with stfdu/lfdu
	addi	$nap_d,$nap_d,-8
	srwi	$j,$num,`3+1`	; counter register, num/2
	addi	$j,$j,-1
	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
	li	$carry,0
	mtctr	$j
___

$code.=<<___ if ($SIZE_T==8);
	ld	$a0,0($ap)	; pull ap[0] value
	ld	$t3,0($bp)	; bp[0]
	ld	$n0,0($n0)	; pull n0[0] value

	mulld	$t7,$a0,$t3	; ap[0]*bp[0]
	; transfer bp[0] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0	; tp[0]*n0
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	extrdi	$t0,$a0,32,32	; lwz $t0,4($ap)
	extrdi	$t1,$a0,32,0	; lwz $t1,0($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
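
# Aside: the four extrdi above slice a 64-bit value into 16-bit limbs,
# least significant first; in Perl terms (illustrative):
#
#	my @limb = map { ($word >> 16*$_) & 0xffff } 0..3;
#
# Each limb is stored as a 64-bit integer so that lfd+fcfid below can
# turn it into an exact double.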
$code.=<<___ if ($SIZE_T==4);
	lwz	$a0,0($ap)	; pull ap[0,1] value
	mr	$n1,$n0
	lwz	$a1,4($ap)
	li	$c1,0
	lwz	$t1,0($bp)	; bp[0,1]
	lwz	$t3,4($bp)
	lwz	$n0,0($n1)	; pull n0[0,1] value
	lwz	$n1,4($n1)

	mullw	$t4,$a0,$t1	; mulld ap[0]*bp[0]
	mulhwu	$t5,$a0,$t1
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	; transfer bp[0] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0	; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	mr	$t0,$a0		; lwz $t0,0($ap)
	mr	$t1,$a1		; lwz $t1,4($ap)
	lwz	$t2,8($ap)	; load a[j..j+3] as 32-bit word pairs
	lwz	$t3,12($ap)
	lwz	$t4,0($np)	; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
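
# Aside: the mullw/mulhwu cluster above assembles the low 64 bits of a
# 64x64-bit product from 32-bit halves. Illustrative Perl equivalent
# (inputs below 2^32, 64-bit perl assumed, names local to the sketch):
sub _demo_mul_lo64 {
	my ($a_lo, $a_hi, $b_lo, $b_hi) = @_;
	my $lo = $a_lo * $b_lo;					# mullw + mulhwu
	my $hi = (($lo >> 32) + $a_hi * $b_lo) & 0xffffffff;	# mullw
	$hi = ($hi + $a_lo * $b_hi) & 0xffffffff;		# mullw
	return ($lo & 0xffffffff) | ($hi << 32);
}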
$code.=<<___;
	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd
	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	stfd	$A0,8($nap_d)	; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A2,24($nap_d)	; save a[j+1] in double format
	stfd	$A3,32($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	stfd	$N0,40($nap_d)	; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb
	stfd	$N2,56($nap_d)	; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
L1st:
___
$code.=<<___ if ($SIZE_T==8);
	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$t0,0($ap)	; load a[j..j+3] as 32-bit word pairs
	lwz	$t1,4($ap)
	lwz	$t2,8($ap)
	lwz	$t3,12($ap)
	lwz	$t4,0($np)	; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A0,8($nap_d)	; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	stfd	$A2,24($nap_d)	; save a[j+1] in double format
	stfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)	; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	add	$t0,$t0,$carry	; can not overflow
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)	; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	insrdi	$t0,$t1,16,32
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	srdi	$carry,$t2,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16
	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t3,16,0	; 0..63 bits
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t4,$t4,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	add	$t6,$t6,$carry
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	srdi	$carry,$t6,16
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	insrdi	$t4,$t6,16,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0	; 64..127 bits
	srdi	$carry,$t7,16	; upper 33 bits
	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	std	$t0,8($tp)	; tp[j-1]
	stdu	$t4,16($tp)	; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)	; save n[j] in double format
	stfd	$N1,48($nap_d)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)	; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	insrwi	$t0,$t2,16,0	; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	insrwi	$t4,$t6,16,0	; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0
	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	stw	$t0,12($tp)	; tp[j-1]
	stw	$t4,8($tp)
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	insrwi	$t2,$t6,16,0	; 64..95 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0
	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	insrwi	$t0,$t4,16,0	; 96..127 bits
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0
	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,20($tp)	; tp[j]
	stwu	$t0,16($tp)
___
}
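
# Aside: both L1st variants above compact the column sums returned by
# fctid with the same 16-bit carry walk (srdi/insrdi in the 64-bit code,
# srwi/insrwi in the 32-bit code). Illustrative Perl (names local to the
# sketch, 64-bit perl assumed):
sub _demo_carry_compact {
	my @col = @_;			# column sums, least significant first
	my ($carry, @limb) = (0);
	for my $c (@col) {
		$carry += $c;			# add/addc with incoming carry
		push @limb, $carry & 0xffff;	# 16 result bits per column
		$carry >>= 16;			# srdi/srwi $carry,...,16
	}
	return (\@limb, $carry);	# insrdi/insrwi then packs the limbs
}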
$code.=<<___;
	bdnz	L1st

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)
	add	$t0,$t0,$carry	; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0	; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0	; 64..127 bits
	srdi	$carry,$t7,16	; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)
	std	$t0,8($tp)	; tp[j-1]
	stdu	$t4,16($tp)	; tp[j]
	add	$t6,$t6,$carry	; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,8($tp)	; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0	; 0..31 bits
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0	; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	stw	$t0,12($tp)	; tp[j-1]
	stw	$t4,8($tp)
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0	; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0	; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	stw	$t2,20($tp)	; tp[j]
	stwu	$t0,16($tp)
	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,12($tp)	; tp[num-1]
	stw	$t4,8($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	subf	$nap_d,$t7,$nap_d	; rewind pointer

	li	$i,8		; i=1
.align	5
Louter:
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	li	$carry,0
	mtctr	$j
___
$code.=<<___ if ($SIZE_T==8);
	ldx	$t3,$bp,$i	; bp[i]
	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
	; transfer bp[i] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0	; tp[0]*n0
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==4);
	add	$t0,$bp,$i
	li	$c1,0
	lwz	$t1,0($t0)	; bp[i,i+1]
	lwz	$t3,4($t0)

	mullw	$t4,$a0,$t1	; ap[0]*bp[i]
	lwz	$t0,`$FRAME+$TRANSFER+8+4`($sp)	; tp[0]
	mulhwu	$t5,$a0,$t1
	lwz	$t2,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	addc	$t4,$t4,$t0	; ap[0]*bp[i]+tp[0]
	adde	$t5,$t5,$t2
	; transfer bp[i] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0	; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
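
# Aside: for reference, the word-serial Montgomery loop that the outer
# loop implements, as a minimal Math::BigInt sketch (illustrative only).
# It returns a*b*R^-1 mod n with R = 2^(64*num) and $n0 = -n^-1 mod 2^64.
sub _demo_mont_mul {
	require Math::BigInt;
	my ($a, $b, $n, $n0, $num) = @_;	# Math::BigInt operands
	my $w = Math::BigInt->new(2)->bpow(64);
	my $t = Math::BigInt->bzero();
	for my $i (0 .. $num-1) {
		my $bi = $b->copy->brsft(64*$i)->bmod($w);	# bp[i]
		$t->badd($a->copy->bmul($bi));			# t += ap[]*bp[i]
		my $m = $t->copy->bmod($w)->bmul($n0)->bmod($w);# (t mod w)*n0
		$t->badd($m->bmul($n))->brsft(64);		# exact /2^64
	}
	$t->bsub($n) if $t->bcmp($n) >= 0;	# cf. Lsub/Lcopy below
	return $t;
}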
$code.=<<___;
	lfd	$A0,8($nap_d)	; load a[j] in double format
	lfd	$A1,16($nap_d)
	lfd	$A2,24($nap_d)	; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	lfd	$N0,40($nap_d)	; load n[j] in double format
	lfd	$N1,48($nap_d)
	lfd	$N2,56($nap_d)	; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)

	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lfd	$A0,8($nap_d)	; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lfd	$A2,24($nap_d)	; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
Linner:
	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	lfd	$N0,40($nap_d)	; load n[j] in double format
	lfd	$N1,48($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	lfd	$N2,56($nap_d)	; load n[j+1] in double format
	lfdu	$N3,64($nap_d)
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	lfd	$A0,8($nap_d)	; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	lfd	$A2,24($nap_d)	; load a[j+1] in double format
	lfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	add	$t0,$t0,$carry	; can not overflow
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t1,16,32
	ld	$t1,8($tp)	; tp[j]
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)	; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0	; 0..63 bits
	add	$t4,$t4,$carry
	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	srdi	$carry,$t4,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	add	$t5,$t5,$carry
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	add	$t7,$t7,$carry
	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	insrdi	$t4,$t7,16,0	; 64..127 bits
	srdi	$carry,$t7,16	; upper 33 bits
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	addze	$carry,$carry
	std	$t3,-16($tp)	; tp[j-1]
	std	$t5,-8($tp)	; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	insrwi	$t0,$t2,16,0	; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t2,12($tp)	; tp[j]
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0
	fctid	$T0a,$T0a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fctid	$T0b,$T0b
	insrwi	$t4,$t6,16,0	; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0
	fctid	$T1a,$T1a
	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	fctid	$T1b,$T1b
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)	; tp[j-1]
	stw	$t4,0($tp)
	fctid	$T2a,$T2a
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	fctid	$T2b,$T2b
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	fctid	$T3a,$T3a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	fctid	$T3b,$T3b
	insrwi	$t2,$t6,16,0	; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwz	$t6,20($tp)
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	stfd	$T0a,`$FRAME+0`($sp)
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	stfd	$T0b,`$FRAME+8`($sp)
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	stfd	$T1a,`$FRAME+16`($sp)
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0	; 96..127 bits
	stfd	$T1b,`$FRAME+24`($sp)
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t2,$t2,$t6
	stfd	$T2a,`$FRAME+32`($sp)
	adde	$t0,$t0,$t7
	stfd	$T2b,`$FRAME+40`($sp)
	addze	$carry,$carry
	stfd	$T3a,`$FRAME+48`($sp)
	addze	$c1,$c1
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,-4($tp)	; tp[j]
	stw	$t0,-8($tp)
___
}
$code.=<<___;
	bdnz	Linner

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)
	add	$t0,$t0,$carry	; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	ld	$t1,8($tp)	; tp[j]
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)	; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0	; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0	; 64..127 bits
	srdi	$carry,$t7,16	; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)
	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	addze	$carry,$carry
	std	$t3,-16($tp)	; tp[j-1]
	std	$t5,-8($tp)	; tp[j]
	add	$carry,$carry,$ovf	; consume upmost overflow
	add	$t6,$t6,$carry	; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,0($tp)	; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0	; 0..31 bits
	lwz	$t2,12($tp)	; tp[j]
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0	; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)	; tp[j-1]
	stw	$t4,0($tp)
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0	; 64..95 bits
	lwz	$t6,20($tp)
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0	; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t2,$t2,$t6
	adde	$t0,$t0,$t7
	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	addze	$carry,$carry
	addze	$c1,$c1
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	stw	$t2,-4($tp)	; tp[j]
	stw	$t0,-8($tp)
	addc	$t6,$t6,$ovf
	addze	$t7,$t7
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,4($tp)	; tp[num-1]
	stw	$t4,0($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	addi	$i,$i,8
	subf	$nap_d,$t7,$nap_d	; rewind pointer
	cmpw	$i,$num
	blt-	Louter
___
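
# Aside: Lsub below is a borrow-propagating multi-word subtraction,
# r = tp - np, with the final borrow deciding whether the subtracted
# value or the original tp is kept. Illustrative Perl with 32-bit words
# for simplicity (the code uses full machine words and subfe):
sub _demo_sub_words {
	my ($tp, $np) = @_;		# array refs, least significant first
	my ($borrow, @r) = (0);
	for my $j (0 .. $#$tp) {	# cf. the subfe chain
		my $d = $tp->[$j] - $np->[$j] - $borrow;
		$borrow = $d < 0 ? 1 : 0;
		push @r, $d & 0xffffffff;
	}
	return (\@r, $borrow);		# borrow feeds the select mask
}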
$code.=<<___ if ($SIZE_T==8);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
	addi	$t5,$np,8
	addi	$t6,$rp,8
	mtctr	$j

.align	4
Lsub:	ldx	$t0,$tp,$i
	ldx	$t1,$np,$i
	ldx	$t2,$t4,$i
	ldx	$t3,$t5,$i
	subfe	$t0,$t1,$t0	; tp[j]-np[j]
	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
	stdx	$t0,$rp,$i
	stdx	$t2,$t6,$i
	addi	$i,$i,16
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	ldx	$t0,$tp,$i
	ldx	$t1,$t4,$i
	ldx	$t2,$rp,$i
	ldx	$t3,$t6,$i
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	andc	$t2,$t2,$ovf
	andc	$t3,$t3,$ovf
	or	$t0,$t0,$t2
	or	$t1,$t1,$t3
	stdx	$t0,$rp,$i
	stdx	$t1,$t6,$i
	stdx	$i,$tp,$i	; zap tp at once
	stdx	$i,$t4,$i
	addi	$i,$i,16
	bdnz	Lcopy
___
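
# Aside: Lcopy's and/andc/or triple is a branch-free select. With the
# borrow expanded by subfe into an all-zeros or all-ones mask in $ovf,
# both candidate words are read and combined regardless of the outcome.
# Illustrative Perl with 32-bit words (names local to the sketch):
sub _demo_ct_select {
	my ($mask, $t, $r) = @_;	# $mask is 0 or 0xffffffff
	return ($t & $mask) | ($r & ~$mask & 0xffffffff);
}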
$code.=<<___ if ($SIZE_T==4);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	addi	$np,$np,-4
	addi	$rp,$rp,-4
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	mtctr	$j

.align	4
Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
	lwz	$t1,8($tp)
	lwz	$t2,20($tp)
	lwzu	$t3,16($tp)
	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
	lwz	$t5,8($np)
	lwz	$t6,12($np)
	lwzu	$t7,16($np)
	subfe	$t4,$t4,$t0	; tp[j]-np[j]
	stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
	stw	$t1,8($ap)
	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
	stw	$t2,12($ap)
	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
	stwu	$t3,16($ap)
	stw	$t4,4($rp)
	stw	$t5,8($rp)
	stw	$t6,12($rp)
	stwu	$t7,16($rp)
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	subf	$rp,$num,$rp	; rewind rp
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	lwz	$t0,4($ap)
	lwz	$t1,8($ap)
	lwz	$t2,12($ap)
	lwzu	$t3,16($ap)
	lwz	$t4,4($rp)
	lwz	$t5,8($rp)
	lwz	$t6,12($rp)
	lwz	$t7,16($rp)
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	and	$t2,$t2,$ovf
	and	$t3,$t3,$ovf
	andc	$t4,$t4,$ovf
	andc	$t5,$t5,$ovf
	andc	$t6,$t6,$ovf
	andc	$t7,$t7,$ovf
	or	$t0,$t0,$t4
	or	$t1,$t1,$t5
	or	$t2,$t2,$t6
	or	$t3,$t3,$t7
	stw	$t0,4($rp)
	stw	$t1,8($rp)
	stw	$t2,12($rp)
	stwu	$t3,16($rp)
	std	$i,8($tp)	; zap tp at once
	stdu	$i,16($tp)
	bdnz	Lcopy
___

$code.=<<___;
	$POP	$i,0($sp)
	li	r3,1	; signal "handled"
	$POP	r19,`-12*8-13*$SIZE_T`($i)
	$POP	r20,`-12*8-12*$SIZE_T`($i)
	$POP	r21,`-12*8-11*$SIZE_T`($i)
	$POP	r22,`-12*8-10*$SIZE_T`($i)
	$POP	r23,`-12*8-9*$SIZE_T`($i)
	$POP	r24,`-12*8-8*$SIZE_T`($i)
	$POP	r25,`-12*8-7*$SIZE_T`($i)
	$POP	r26,`-12*8-6*$SIZE_T`($i)
	$POP	r27,`-12*8-5*$SIZE_T`($i)
	$POP	r28,`-12*8-4*$SIZE_T`($i)
	$POP	r29,`-12*8-3*$SIZE_T`($i)
	$POP	r30,`-12*8-2*$SIZE_T`($i)
	$POP	r31,`-12*8-1*$SIZE_T`($i)
	lfd	f20,`-12*8`($i)
	lfd	f21,`-11*8`($i)
	lfd	f22,`-10*8`($i)
	lfd	f23,`-9*8`($i)
	lfd	f24,`-8*8`($i)
	lfd	f25,`-7*8`($i)
	lfd	f26,`-6*8`($i)
	lfd	f27,`-5*8`($i)
	lfd	f28,`-4*8`($i)
	lfd	f29,`-3*8`($i)
	lfd	f30,`-2*8`($i)
	lfd	f31,`-1*8`($i)
	mr	$sp,$i
	blr
	.long	0
	.byte	0,12,4,0,0x8c,13,6,0
	.long	0
.size	.$fname,.-.$fname
.asciz	"Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";