#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
#   stalls after ldf8, xma and getf.sig outside inner loop and
#   improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
#   once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
#   acute interest, because upcoming Tukwila's individual cores are
#   reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
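# Conceptually the padding amounts to the following C-like sketch [the
# code itself does it with predicated loads rather than an explicit
# copy, 8 being the fixed limb count mentioned above]:
#
#	for (i = 0; i < 8; i++) {
#		a_padded[i] = i < num ? ap[i] : 0;
#		n_padded[i] = i < num ? np[i] : 0;
#	}
#
# so a 256-bit operand still walks the full 8-limb datapath, which is
# why the expected 4x advantage over 512-bit inputs degrades to ~2x.
#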
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
#                  sign    verify     sign/s verify/s
# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
#
# As can be seen, RSA sign performance improves by 130%-30% [progressively
# less for longer keys], while verify improves by 74%-13%. DSA performance
# improves by 115%-30%.

$output=pop;

if ($^O eq "hpux") {
	$ADDP="addp4";
	for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
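
# [addp4 performs the 32-bit pointer swizzle required by the HP-UX ILP32
# data model, where pointer arguments arrive as 32-bit values; under the
# LP64 flags matched above, +DD64 or -mlp64, a plain add is sufficient.]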

$code=<<___;
.explicit
.text

// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
//                  const BN_ULONG *bp,const BN_ULONG *np,
//                  const BN_ULONG *n0p,int num);
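//
// The routine computes rp[]=ap[]*bp[]*2^(-64*num) mod np[], with n0p
// pointing at n0=-np[0]^(-1) mod 2^64; in rough C-like pseudocode
// [illustrative only]:
//
//	for (i=0;i<num;i++) {
//		m0 = (tp[0]+ap[0]*bp[i])*n0;	// mod 2^64
//		tp = (tp+ap*bp[i]+np*m0)/2^64;	// low 64 bits cancel
//	}
//	rp = tp>=np ? tp-np : tp;		// final conditional subtraction
//
// Return value of 1 signals "handled", 0 - "not handled".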
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
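// Dispatch, in C-like terms [illustrative only]:
//	if (num <  2) return 0;			// "not handled"
//	if (num <= 8) goto bn_mul_mont_8;	// fixed, zero-padded 8-limb path
//	goto bn_mul_mont_general;		// any num > 8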
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#

prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;

.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
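// conceptually tp[] now holds (ap[]*bp[0]+np[]*m0)/2^64, with topbit
// holding the carry out of its top word; each pass through .Louter
// folds in the next b[i] the same way, except that tp[0] is added
// into the ap[0]*bp[i] term (ahi[3] below).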
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of 7
// in the latter case accounts for a two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal. No attempt was
// made to address this, because the original Itanium is hardly
// represented in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; and aptr=tptr,topbit
andcm bptr=rptr,topbit
mov pr.rot=1<<16 };;
{ .mii; or nptr=aptr,bptr
mov ar.lc=lc
mov ar.ec=3 };;
.Lcopy_ctop:
{ .mmb; (p16) ld8 n[0]=[nptr],8
(p18) st8 [tptr]=r0,8
(p16) nop.b 0 }
{ .mmb; (p16) nop.m 0
(p18) st8 [rptr]=n[2],8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#

a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;

.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
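// [p4/p5 through p14/p15 below either fetch a limb or substitute zero,
// roughly: ai2 = num>=3 ? ap[2] : 0, ai3 = num>=4 ? ap[3] : 0, ... and
// likewise for bp[] and np[]; this is the zero-padding of shorter
// vectors mentioned in the preamble.]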
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
// to measure it with the help of the Interval Time Counter indicated that
// the factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue are problematic, because I don't have access to a
// platform-specific instruction-level profiler. On Itanium it should run
// in 56*n ticks, because of the higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
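// in C-like terms what follows is [illustrative only]:
//	diff = (carrybit|tmp[8]) - np[8];	// borrow propagated limb by limb
//	rp   = borrow_out ? tmp : diff;		// p34 selects tmp, p32 diff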
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#

.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___

open STDOUT,">$output" if $output;
print $code;
close STDOUT;