sparcv9a-mont.pl
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # October 2005
  9. #
  10. # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
  11. # Because unlike integer multiplier, which simply stalls whole CPU,
  12. # FPU is fully pipelined and can effectively emit 48 bit partial
  13. # product every cycle. Why not blended SPARC v9? One can argue that
  14. # making this module dependent on UltraSPARC VIS extension limits its
  15. # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
  16. # implementations from compatibility matrix. But the rest, whole Sun
  17. # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
  18. # VIS extension instructions used in this module. This is considered
  19. # good enough to not care about HAL SPARC64 users [if any] who have
  20. # integer-only pure SPARCv9 module to "fall down" to.
  21. # USI&II cores currently exhibit uniform 2x improvement [over pre-
  22. # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
  23. # performance improves few percents for shorter keys and worsens few
  24. # percents for longer keys. This is because USIII integer multiplier
  25. # is >3x faster than USI&II one, which is harder to match [but see
  26. # TODO list below]. It should also be noted that SPARC64 V features
  27. # out-of-order execution, which *might* mean that integer multiplier
  28. # is pipelined, which in turn *might* be impossible to match... On
  29. # additional note, SPARC64 V implements FP Multiply-Add instruction,
  30. # which is perfectly usable in this context... In other words, as far
  31. # as Fujitsu SPARC64 V goes, talk to the author:-)
  32. # The implementation implies following "non-natural" limitations on
  33. # input arguments:
  34. # - num may not be less than 4;
  35. # - num has to be even;
  36. # Failure to meet either condition has no fatal effects, simply
  37. # doesn't give any performance gain.
  38. # TODO:
  39. # - modulo-schedule inner loop for better performance (on in-order
  40. # execution core such as UltraSPARC this shall result in further
  41. # noticeable(!) improvement);
  42. # - dedicated squaring procedure[?];
  43. ######################################################################
  44. # November 2006
  45. #
  46. # Modulo-scheduled inner loops allow to interleave floating point and
  47. # integer instructions and minimize Read-After-Write penalties. This
  48. # results in *further* 20-50% performance improvement [depending on
  49. # key length, more for longer keys] on USI&II cores and 30-80% - on
  50. # USIII&IV.
# --- Build-time configuration -------------------------------------------
# Exported symbol name, and ABI selection from the compiler flags passed
# on the command line: -m64 or -xarch=v9 selects the 64-bit SPARC ABI
# (stack bias 2047, 192-byte minimal frame); anything else falls back to
# the 32-bit ABI (no bias, 128-byte frame).
# NOTE(review): every source line below carries a "NN. " listing prefix
# from the extraction this chunk came from; it is reproduced verbatim.
  51. $fname="bn_mul_mont_fpu";
  52. $bits=32;
  53. for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
  54. if ($bits==64) {
  55. $bias=2047;
  56. $frame=192;
  57. } else {
  58. $bias=0;
  59. $frame=128; # 96 rounded up to largest known cache-line
  60. }
# $locals bytes of scratch below the register-save area are used for the
# 48-byte staging buffer plus the saved %asi slot (offset +48).
  61. $locals=64;
# --- Integer register naming --------------------------------------------
  62. # In order to provide for 32-/64-bit ABI duality, I keep integers wider
  63. # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
  64. # exclusively for pointers, indexes and other small values...
  65. # int bn_mul_mont(
  66. $rp="%i0"; # BN_ULONG *rp,
  67. $ap="%i1"; # const BN_ULONG *ap,
  68. $bp="%i2"; # const BN_ULONG *bp,
  69. $np="%i3"; # const BN_ULONG *np,
  70. $n0="%i4"; # const BN_ULONG *n0,
  71. $num="%i5"; # int num);
  72. $tp="%l0"; # t[num]
  73. $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
  74. $ap_h="%l2"; # to these four vectors as double-precision FP values.
  75. $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
  76. $np_h="%l4"; # loop and L1-cache aliasing is minimized...
  77. $i="%l5";
  78. $j="%l6";
  79. $mask="%l7"; # 16-bit mask, 0xffff
# $n0 is deliberately re-bound here: %i4 initially carries the n0 pointer
# argument, but once n0 itself is loaded into %g4 the register is recycled
# as the inter-iteration carry.
  80. $n0="%g4"; # reassigned(!) to "64-bit" register
  81. $carry="%i4"; # %i4 reused(!) for a carry bit
# --- FP register naming chart -------------------------------------------
  82. # FP register naming chart
  83. #
  84. # ..HILO
  85. # dcba
  86. # --------
  87. # LOa
  88. # LOb
  89. # LOc
  90. # LOd
  91. # HIa
  92. # HIb
  93. # HIc
  94. # HId
  95. # ..a
  96. # ..b
  97. $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
  98. $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
  99. $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
  100. $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
  101. $dota="%f24"; $dotb="%f26";
  102. $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
  103. $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
  104. $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
  105. $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
  106. $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
# The heredoc below holds the complete assembly body of bn_mul_mont_fpu;
# it is a single string, post-processed after the terminator (backtick
# expressions eval'd, fzeros encoded as raw .word). Nothing may be
# inserted between the opener and the "___" terminator.
  107. $code=<<___;
  108. .section ".text",#alloc,#execinstr
  109. .global $fname
  110. .align 32
  111. $fname:
  112. save %sp,-$frame-$locals,%sp
  113. cmp $num,4
  114. bl,a,pn %icc,.Lret
  115. clr %i0
  116. andcc $num,1,%g0 ! $num has to be even...
  117. bnz,a,pn %icc,.Lret
  118. clr %i0 ! signal "unsupported input value"
  119. srl $num,1,$num
  120. sethi %hi(0xffff),$mask
  121. ld [%i4+0],$n0 ! $n0 reassigned, remember?
  122. or $mask,%lo(0xffff),$mask
  123. ld [%i4+4],%o0
  124. sllx %o0,32,%o0
  125. or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
  126. sll $num,3,$num ! num*=8
  127. add %sp,$bias,%o0 ! real top of stack
  128. sll $num,2,%o1
  129. add %o1,$num,%o1 ! %o1=num*5
  130. sub %o0,%o1,%o0
  131. and %o0,-2048,%o0 ! optimize TLB utilization
  132. sub %o0,$bias,%sp ! alloca(5*num*8)
  133. rd %asi,%o7 ! save %asi
  134. add %sp,$bias+$frame+$locals,$tp
  135. add $tp,$num,$ap_l
  136. add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
  137. add $ap_l,$num,$ap_h
  138. add $ap_h,$num,$np_l
  139. add $np_l,$num,$np_h
  140. wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
  141. add $rp,$num,$rp ! readjust input pointers to point
  142. add $ap,$num,$ap ! at the ends too...
  143. add $bp,$num,$bp
  144. add $np,$num,$np
  145. stx %o7,[%sp+$bias+$frame+48] ! save %asi
  146. sub %g0,$num,$i ! i=-num
  147. sub %g0,$num,$j ! j=-num
  148. add $ap,$j,%o3
  149. add $bp,$i,%o4
  150. ld [%o3+4],%g1 ! bp[0]
  151. ld [%o3+0],%o0
  152. ld [%o4+4],%g5 ! ap[0]
  153. sllx %g1,32,%g1
  154. ld [%o4+0],%o1
  155. sllx %g5,32,%g5
  156. or %g1,%o0,%o0
  157. or %g5,%o1,%o1
  158. add $np,$j,%o5
  159. mulx %o1,%o0,%o0 ! ap[0]*bp[0]
  160. mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
  161. stx %o0,[%sp+$bias+$frame+0]
  162. ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
  163. fzeros $alo
  164. ld [%o3+4],$ahi_
  165. fzeros $ahi
  166. ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
  167. fzeros $nlo
  168. ld [%o5+4],$nhi_
  169. fzeros $nhi
  170. ! transfer b[i] to FPU as 4x16-bit values
  171. ldda [%o4+2]%asi,$ba
  172. fxtod $alo,$alo
  173. ldda [%o4+0]%asi,$bb
  174. fxtod $ahi,$ahi
  175. ldda [%o4+6]%asi,$bc
  176. fxtod $nlo,$nlo
  177. ldda [%o4+4]%asi,$bd
  178. fxtod $nhi,$nhi
  179. ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
  180. ldda [%sp+$bias+$frame+6]%asi,$na
  181. fxtod $ba,$ba
  182. ldda [%sp+$bias+$frame+4]%asi,$nb
  183. fxtod $bb,$bb
  184. ldda [%sp+$bias+$frame+2]%asi,$nc
  185. fxtod $bc,$bc
  186. ldda [%sp+$bias+$frame+0]%asi,$nd
  187. fxtod $bd,$bd
  188. std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
  189. fxtod $na,$na
  190. std $ahi,[$ap_h+$j]
  191. fxtod $nb,$nb
  192. std $nlo,[$np_l+$j] ! save smashed np[j] in double format
  193. fxtod $nc,$nc
  194. std $nhi,[$np_h+$j]
  195. fxtod $nd,$nd
  196. fmuld $alo,$ba,$aloa
  197. fmuld $nlo,$na,$nloa
  198. fmuld $alo,$bb,$alob
  199. fmuld $nlo,$nb,$nlob
  200. fmuld $alo,$bc,$aloc
  201. faddd $aloa,$nloa,$nloa
  202. fmuld $nlo,$nc,$nloc
  203. fmuld $alo,$bd,$alod
  204. faddd $alob,$nlob,$nlob
  205. fmuld $nlo,$nd,$nlod
  206. fmuld $ahi,$ba,$ahia
  207. faddd $aloc,$nloc,$nloc
  208. fmuld $nhi,$na,$nhia
  209. fmuld $ahi,$bb,$ahib
  210. faddd $alod,$nlod,$nlod
  211. fmuld $nhi,$nb,$nhib
  212. fmuld $ahi,$bc,$ahic
  213. faddd $ahia,$nhia,$nhia
  214. fmuld $nhi,$nc,$nhic
  215. fmuld $ahi,$bd,$ahid
  216. faddd $ahib,$nhib,$nhib
  217. fmuld $nhi,$nd,$nhid
  218. faddd $ahic,$nhic,$dota ! $nhic
  219. faddd $ahid,$nhid,$dotb ! $nhid
  220. faddd $nloc,$nhia,$nloc
  221. faddd $nlod,$nhib,$nlod
  222. fdtox $nloa,$nloa
  223. fdtox $nlob,$nlob
  224. fdtox $nloc,$nloc
  225. fdtox $nlod,$nlod
  226. std $nloa,[%sp+$bias+$frame+0]
  227. add $j,8,$j
  228. std $nlob,[%sp+$bias+$frame+8]
  229. add $ap,$j,%o4
  230. std $nloc,[%sp+$bias+$frame+16]
  231. add $np,$j,%o5
  232. std $nlod,[%sp+$bias+$frame+24]
  233. ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
  234. fzeros $alo
  235. ld [%o4+4],$ahi_
  236. fzeros $ahi
  237. ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
  238. fzeros $nlo
  239. ld [%o5+4],$nhi_
  240. fzeros $nhi
  241. fxtod $alo,$alo
  242. fxtod $ahi,$ahi
  243. fxtod $nlo,$nlo
  244. fxtod $nhi,$nhi
  245. ldx [%sp+$bias+$frame+0],%o0
  246. fmuld $alo,$ba,$aloa
  247. ldx [%sp+$bias+$frame+8],%o1
  248. fmuld $nlo,$na,$nloa
  249. ldx [%sp+$bias+$frame+16],%o2
  250. fmuld $alo,$bb,$alob
  251. ldx [%sp+$bias+$frame+24],%o3
  252. fmuld $nlo,$nb,$nlob
  253. srlx %o0,16,%o7
  254. std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
  255. fmuld $alo,$bc,$aloc
  256. add %o7,%o1,%o1
  257. std $ahi,[$ap_h+$j]
  258. faddd $aloa,$nloa,$nloa
  259. fmuld $nlo,$nc,$nloc
  260. srlx %o1,16,%o7
  261. std $nlo,[$np_l+$j] ! save smashed np[j] in double format
  262. fmuld $alo,$bd,$alod
  263. add %o7,%o2,%o2
  264. std $nhi,[$np_h+$j]
  265. faddd $alob,$nlob,$nlob
  266. fmuld $nlo,$nd,$nlod
  267. srlx %o2,16,%o7
  268. fmuld $ahi,$ba,$ahia
  269. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  270. faddd $aloc,$nloc,$nloc
  271. fmuld $nhi,$na,$nhia
  272. !and %o0,$mask,%o0
  273. !and %o1,$mask,%o1
  274. !and %o2,$mask,%o2
  275. !sllx %o1,16,%o1
  276. !sllx %o2,32,%o2
  277. !sllx %o3,48,%o7
  278. !or %o1,%o0,%o0
  279. !or %o2,%o0,%o0
  280. !or %o7,%o0,%o0 ! 64-bit result
  281. srlx %o3,16,%g1 ! 34-bit carry
  282. fmuld $ahi,$bb,$ahib
  283. faddd $alod,$nlod,$nlod
  284. fmuld $nhi,$nb,$nhib
  285. fmuld $ahi,$bc,$ahic
  286. faddd $ahia,$nhia,$nhia
  287. fmuld $nhi,$nc,$nhic
  288. fmuld $ahi,$bd,$ahid
  289. faddd $ahib,$nhib,$nhib
  290. fmuld $nhi,$nd,$nhid
  291. faddd $dota,$nloa,$nloa
  292. faddd $dotb,$nlob,$nlob
  293. faddd $ahic,$nhic,$dota ! $nhic
  294. faddd $ahid,$nhid,$dotb ! $nhid
  295. faddd $nloc,$nhia,$nloc
  296. faddd $nlod,$nhib,$nlod
  297. fdtox $nloa,$nloa
  298. fdtox $nlob,$nlob
  299. fdtox $nloc,$nloc
  300. fdtox $nlod,$nlod
  301. std $nloa,[%sp+$bias+$frame+0]
  302. std $nlob,[%sp+$bias+$frame+8]
  303. addcc $j,8,$j
  304. std $nloc,[%sp+$bias+$frame+16]
  305. bz,pn %icc,.L1stskip
  306. std $nlod,[%sp+$bias+$frame+24]
  307. .align 32 ! incidentally already aligned !
  308. .L1st:
  309. add $ap,$j,%o4
  310. add $np,$j,%o5
  311. ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
  312. fzeros $alo
  313. ld [%o4+4],$ahi_
  314. fzeros $ahi
  315. ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
  316. fzeros $nlo
  317. ld [%o5+4],$nhi_
  318. fzeros $nhi
  319. fxtod $alo,$alo
  320. fxtod $ahi,$ahi
  321. fxtod $nlo,$nlo
  322. fxtod $nhi,$nhi
  323. ldx [%sp+$bias+$frame+0],%o0
  324. fmuld $alo,$ba,$aloa
  325. ldx [%sp+$bias+$frame+8],%o1
  326. fmuld $nlo,$na,$nloa
  327. ldx [%sp+$bias+$frame+16],%o2
  328. fmuld $alo,$bb,$alob
  329. ldx [%sp+$bias+$frame+24],%o3
  330. fmuld $nlo,$nb,$nlob
  331. srlx %o0,16,%o7
  332. std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
  333. fmuld $alo,$bc,$aloc
  334. add %o7,%o1,%o1
  335. std $ahi,[$ap_h+$j]
  336. faddd $aloa,$nloa,$nloa
  337. fmuld $nlo,$nc,$nloc
  338. srlx %o1,16,%o7
  339. std $nlo,[$np_l+$j] ! save smashed np[j] in double format
  340. fmuld $alo,$bd,$alod
  341. add %o7,%o2,%o2
  342. std $nhi,[$np_h+$j]
  343. faddd $alob,$nlob,$nlob
  344. fmuld $nlo,$nd,$nlod
  345. srlx %o2,16,%o7
  346. fmuld $ahi,$ba,$ahia
  347. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  348. and %o0,$mask,%o0
  349. faddd $aloc,$nloc,$nloc
  350. fmuld $nhi,$na,$nhia
  351. and %o1,$mask,%o1
  352. and %o2,$mask,%o2
  353. fmuld $ahi,$bb,$ahib
  354. sllx %o1,16,%o1
  355. faddd $alod,$nlod,$nlod
  356. fmuld $nhi,$nb,$nhib
  357. sllx %o2,32,%o2
  358. fmuld $ahi,$bc,$ahic
  359. sllx %o3,48,%o7
  360. or %o1,%o0,%o0
  361. faddd $ahia,$nhia,$nhia
  362. fmuld $nhi,$nc,$nhic
  363. or %o2,%o0,%o0
  364. fmuld $ahi,$bd,$ahid
  365. or %o7,%o0,%o0 ! 64-bit result
  366. faddd $ahib,$nhib,$nhib
  367. fmuld $nhi,$nd,$nhid
  368. addcc %g1,%o0,%o0
  369. faddd $dota,$nloa,$nloa
  370. srlx %o3,16,%g1 ! 34-bit carry
  371. faddd $dotb,$nlob,$nlob
  372. bcs,a %xcc,.+8
  373. add %g1,1,%g1
  374. stx %o0,[$tp] ! tp[j-1]=
  375. faddd $ahic,$nhic,$dota ! $nhic
  376. faddd $ahid,$nhid,$dotb ! $nhid
  377. faddd $nloc,$nhia,$nloc
  378. faddd $nlod,$nhib,$nlod
  379. fdtox $nloa,$nloa
  380. fdtox $nlob,$nlob
  381. fdtox $nloc,$nloc
  382. fdtox $nlod,$nlod
  383. std $nloa,[%sp+$bias+$frame+0]
  384. std $nlob,[%sp+$bias+$frame+8]
  385. std $nloc,[%sp+$bias+$frame+16]
  386. std $nlod,[%sp+$bias+$frame+24]
  387. addcc $j,8,$j
  388. bnz,pt %icc,.L1st
  389. add $tp,8,$tp
  390. .L1stskip:
  391. fdtox $dota,$dota
  392. fdtox $dotb,$dotb
  393. ldx [%sp+$bias+$frame+0],%o0
  394. ldx [%sp+$bias+$frame+8],%o1
  395. ldx [%sp+$bias+$frame+16],%o2
  396. ldx [%sp+$bias+$frame+24],%o3
  397. srlx %o0,16,%o7
  398. std $dota,[%sp+$bias+$frame+32]
  399. add %o7,%o1,%o1
  400. std $dotb,[%sp+$bias+$frame+40]
  401. srlx %o1,16,%o7
  402. add %o7,%o2,%o2
  403. srlx %o2,16,%o7
  404. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  405. and %o0,$mask,%o0
  406. and %o1,$mask,%o1
  407. and %o2,$mask,%o2
  408. sllx %o1,16,%o1
  409. sllx %o2,32,%o2
  410. sllx %o3,48,%o7
  411. or %o1,%o0,%o0
  412. or %o2,%o0,%o0
  413. or %o7,%o0,%o0 ! 64-bit result
  414. ldx [%sp+$bias+$frame+32],%o4
  415. addcc %g1,%o0,%o0
  416. ldx [%sp+$bias+$frame+40],%o5
  417. srlx %o3,16,%g1 ! 34-bit carry
  418. bcs,a %xcc,.+8
  419. add %g1,1,%g1
  420. stx %o0,[$tp] ! tp[j-1]=
  421. add $tp,8,$tp
  422. srlx %o4,16,%o7
  423. add %o7,%o5,%o5
  424. and %o4,$mask,%o4
  425. sllx %o5,16,%o7
  426. or %o7,%o4,%o4
  427. addcc %g1,%o4,%o4
  428. srlx %o5,48,%g1
  429. bcs,a %xcc,.+8
  430. add %g1,1,%g1
  431. mov %g1,$carry
  432. stx %o4,[$tp] ! tp[num-1]=
  433. ba .Louter
  434. add $i,8,$i
  435. .align 32
  436. .Louter:
  437. sub %g0,$num,$j ! j=-num
  438. add %sp,$bias+$frame+$locals,$tp
  439. add $ap,$j,%o3
  440. add $bp,$i,%o4
  441. ld [%o3+4],%g1 ! bp[i]
  442. ld [%o3+0],%o0
  443. ld [%o4+4],%g5 ! ap[0]
  444. sllx %g1,32,%g1
  445. ld [%o4+0],%o1
  446. sllx %g5,32,%g5
  447. or %g1,%o0,%o0
  448. or %g5,%o1,%o1
  449. ldx [$tp],%o2 ! tp[0]
  450. mulx %o1,%o0,%o0
  451. addcc %o2,%o0,%o0
  452. mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
  453. stx %o0,[%sp+$bias+$frame+0]
  454. ! transfer b[i] to FPU as 4x16-bit values
  455. ldda [%o4+2]%asi,$ba
  456. ldda [%o4+0]%asi,$bb
  457. ldda [%o4+6]%asi,$bc
  458. ldda [%o4+4]%asi,$bd
  459. ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
  460. ldda [%sp+$bias+$frame+6]%asi,$na
  461. fxtod $ba,$ba
  462. ldda [%sp+$bias+$frame+4]%asi,$nb
  463. fxtod $bb,$bb
  464. ldda [%sp+$bias+$frame+2]%asi,$nc
  465. fxtod $bc,$bc
  466. ldda [%sp+$bias+$frame+0]%asi,$nd
  467. fxtod $bd,$bd
  468. ldd [$ap_l+$j],$alo ! load a[j] in double format
  469. fxtod $na,$na
  470. ldd [$ap_h+$j],$ahi
  471. fxtod $nb,$nb
  472. ldd [$np_l+$j],$nlo ! load n[j] in double format
  473. fxtod $nc,$nc
  474. ldd [$np_h+$j],$nhi
  475. fxtod $nd,$nd
  476. fmuld $alo,$ba,$aloa
  477. fmuld $nlo,$na,$nloa
  478. fmuld $alo,$bb,$alob
  479. fmuld $nlo,$nb,$nlob
  480. fmuld $alo,$bc,$aloc
  481. faddd $aloa,$nloa,$nloa
  482. fmuld $nlo,$nc,$nloc
  483. fmuld $alo,$bd,$alod
  484. faddd $alob,$nlob,$nlob
  485. fmuld $nlo,$nd,$nlod
  486. fmuld $ahi,$ba,$ahia
  487. faddd $aloc,$nloc,$nloc
  488. fmuld $nhi,$na,$nhia
  489. fmuld $ahi,$bb,$ahib
  490. faddd $alod,$nlod,$nlod
  491. fmuld $nhi,$nb,$nhib
  492. fmuld $ahi,$bc,$ahic
  493. faddd $ahia,$nhia,$nhia
  494. fmuld $nhi,$nc,$nhic
  495. fmuld $ahi,$bd,$ahid
  496. faddd $ahib,$nhib,$nhib
  497. fmuld $nhi,$nd,$nhid
  498. faddd $ahic,$nhic,$dota ! $nhic
  499. faddd $ahid,$nhid,$dotb ! $nhid
  500. faddd $nloc,$nhia,$nloc
  501. faddd $nlod,$nhib,$nlod
  502. fdtox $nloa,$nloa
  503. fdtox $nlob,$nlob
  504. fdtox $nloc,$nloc
  505. fdtox $nlod,$nlod
  506. std $nloa,[%sp+$bias+$frame+0]
  507. std $nlob,[%sp+$bias+$frame+8]
  508. std $nloc,[%sp+$bias+$frame+16]
  509. add $j,8,$j
  510. std $nlod,[%sp+$bias+$frame+24]
  511. ldd [$ap_l+$j],$alo ! load a[j] in double format
  512. ldd [$ap_h+$j],$ahi
  513. ldd [$np_l+$j],$nlo ! load n[j] in double format
  514. ldd [$np_h+$j],$nhi
  515. fmuld $alo,$ba,$aloa
  516. fmuld $nlo,$na,$nloa
  517. fmuld $alo,$bb,$alob
  518. fmuld $nlo,$nb,$nlob
  519. fmuld $alo,$bc,$aloc
  520. ldx [%sp+$bias+$frame+0],%o0
  521. faddd $aloa,$nloa,$nloa
  522. fmuld $nlo,$nc,$nloc
  523. ldx [%sp+$bias+$frame+8],%o1
  524. fmuld $alo,$bd,$alod
  525. ldx [%sp+$bias+$frame+16],%o2
  526. faddd $alob,$nlob,$nlob
  527. fmuld $nlo,$nd,$nlod
  528. ldx [%sp+$bias+$frame+24],%o3
  529. fmuld $ahi,$ba,$ahia
  530. srlx %o0,16,%o7
  531. faddd $aloc,$nloc,$nloc
  532. fmuld $nhi,$na,$nhia
  533. add %o7,%o1,%o1
  534. fmuld $ahi,$bb,$ahib
  535. srlx %o1,16,%o7
  536. faddd $alod,$nlod,$nlod
  537. fmuld $nhi,$nb,$nhib
  538. add %o7,%o2,%o2
  539. fmuld $ahi,$bc,$ahic
  540. srlx %o2,16,%o7
  541. faddd $ahia,$nhia,$nhia
  542. fmuld $nhi,$nc,$nhic
  543. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  544. ! why?
  545. and %o0,$mask,%o0
  546. fmuld $ahi,$bd,$ahid
  547. and %o1,$mask,%o1
  548. and %o2,$mask,%o2
  549. faddd $ahib,$nhib,$nhib
  550. fmuld $nhi,$nd,$nhid
  551. sllx %o1,16,%o1
  552. faddd $dota,$nloa,$nloa
  553. sllx %o2,32,%o2
  554. faddd $dotb,$nlob,$nlob
  555. sllx %o3,48,%o7
  556. or %o1,%o0,%o0
  557. faddd $ahic,$nhic,$dota ! $nhic
  558. or %o2,%o0,%o0
  559. faddd $ahid,$nhid,$dotb ! $nhid
  560. or %o7,%o0,%o0 ! 64-bit result
  561. ldx [$tp],%o7
  562. faddd $nloc,$nhia,$nloc
  563. addcc %o7,%o0,%o0
  564. ! end-of-why?
  565. faddd $nlod,$nhib,$nlod
  566. srlx %o3,16,%g1 ! 34-bit carry
  567. fdtox $nloa,$nloa
  568. bcs,a %xcc,.+8
  569. add %g1,1,%g1
  570. fdtox $nlob,$nlob
  571. fdtox $nloc,$nloc
  572. fdtox $nlod,$nlod
  573. std $nloa,[%sp+$bias+$frame+0]
  574. std $nlob,[%sp+$bias+$frame+8]
  575. addcc $j,8,$j
  576. std $nloc,[%sp+$bias+$frame+16]
  577. bz,pn %icc,.Linnerskip
  578. std $nlod,[%sp+$bias+$frame+24]
  579. ba .Linner
  580. nop
  581. .align 32
  582. .Linner:
  583. ldd [$ap_l+$j],$alo ! load a[j] in double format
  584. ldd [$ap_h+$j],$ahi
  585. ldd [$np_l+$j],$nlo ! load n[j] in double format
  586. ldd [$np_h+$j],$nhi
  587. fmuld $alo,$ba,$aloa
  588. fmuld $nlo,$na,$nloa
  589. fmuld $alo,$bb,$alob
  590. fmuld $nlo,$nb,$nlob
  591. fmuld $alo,$bc,$aloc
  592. ldx [%sp+$bias+$frame+0],%o0
  593. faddd $aloa,$nloa,$nloa
  594. fmuld $nlo,$nc,$nloc
  595. ldx [%sp+$bias+$frame+8],%o1
  596. fmuld $alo,$bd,$alod
  597. ldx [%sp+$bias+$frame+16],%o2
  598. faddd $alob,$nlob,$nlob
  599. fmuld $nlo,$nd,$nlod
  600. ldx [%sp+$bias+$frame+24],%o3
  601. fmuld $ahi,$ba,$ahia
  602. srlx %o0,16,%o7
  603. faddd $aloc,$nloc,$nloc
  604. fmuld $nhi,$na,$nhia
  605. add %o7,%o1,%o1
  606. fmuld $ahi,$bb,$ahib
  607. srlx %o1,16,%o7
  608. faddd $alod,$nlod,$nlod
  609. fmuld $nhi,$nb,$nhib
  610. add %o7,%o2,%o2
  611. fmuld $ahi,$bc,$ahic
  612. srlx %o2,16,%o7
  613. faddd $ahia,$nhia,$nhia
  614. fmuld $nhi,$nc,$nhic
  615. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  616. and %o0,$mask,%o0
  617. fmuld $ahi,$bd,$ahid
  618. and %o1,$mask,%o1
  619. and %o2,$mask,%o2
  620. faddd $ahib,$nhib,$nhib
  621. fmuld $nhi,$nd,$nhid
  622. sllx %o1,16,%o1
  623. faddd $dota,$nloa,$nloa
  624. sllx %o2,32,%o2
  625. faddd $dotb,$nlob,$nlob
  626. sllx %o3,48,%o7
  627. or %o1,%o0,%o0
  628. faddd $ahic,$nhic,$dota ! $nhic
  629. or %o2,%o0,%o0
  630. faddd $ahid,$nhid,$dotb ! $nhid
  631. or %o7,%o0,%o0 ! 64-bit result
  632. faddd $nloc,$nhia,$nloc
  633. addcc %g1,%o0,%o0
  634. ldx [$tp+8],%o7 ! tp[j]
  635. faddd $nlod,$nhib,$nlod
  636. srlx %o3,16,%g1 ! 34-bit carry
  637. fdtox $nloa,$nloa
  638. bcs,a %xcc,.+8
  639. add %g1,1,%g1
  640. fdtox $nlob,$nlob
  641. addcc %o7,%o0,%o0
  642. fdtox $nloc,$nloc
  643. bcs,a %xcc,.+8
  644. add %g1,1,%g1
  645. stx %o0,[$tp] ! tp[j-1]
  646. fdtox $nlod,$nlod
  647. std $nloa,[%sp+$bias+$frame+0]
  648. std $nlob,[%sp+$bias+$frame+8]
  649. std $nloc,[%sp+$bias+$frame+16]
  650. addcc $j,8,$j
  651. std $nlod,[%sp+$bias+$frame+24]
  652. bnz,pt %icc,.Linner
  653. add $tp,8,$tp
  654. .Linnerskip:
  655. fdtox $dota,$dota
  656. fdtox $dotb,$dotb
  657. ldx [%sp+$bias+$frame+0],%o0
  658. ldx [%sp+$bias+$frame+8],%o1
  659. ldx [%sp+$bias+$frame+16],%o2
  660. ldx [%sp+$bias+$frame+24],%o3
  661. srlx %o0,16,%o7
  662. std $dota,[%sp+$bias+$frame+32]
  663. add %o7,%o1,%o1
  664. std $dotb,[%sp+$bias+$frame+40]
  665. srlx %o1,16,%o7
  666. add %o7,%o2,%o2
  667. srlx %o2,16,%o7
  668. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  669. and %o0,$mask,%o0
  670. and %o1,$mask,%o1
  671. and %o2,$mask,%o2
  672. sllx %o1,16,%o1
  673. sllx %o2,32,%o2
  674. sllx %o3,48,%o7
  675. or %o1,%o0,%o0
  676. or %o2,%o0,%o0
  677. ldx [%sp+$bias+$frame+32],%o4
  678. or %o7,%o0,%o0 ! 64-bit result
  679. ldx [%sp+$bias+$frame+40],%o5
  680. addcc %g1,%o0,%o0
  681. ldx [$tp+8],%o7 ! tp[j]
  682. srlx %o3,16,%g1 ! 34-bit carry
  683. bcs,a %xcc,.+8
  684. add %g1,1,%g1
  685. addcc %o7,%o0,%o0
  686. bcs,a %xcc,.+8
  687. add %g1,1,%g1
  688. stx %o0,[$tp] ! tp[j-1]
  689. add $tp,8,$tp
  690. srlx %o4,16,%o7
  691. add %o7,%o5,%o5
  692. and %o4,$mask,%o4
  693. sllx %o5,16,%o7
  694. or %o7,%o4,%o4
  695. addcc %g1,%o4,%o4
  696. srlx %o5,48,%g1
  697. bcs,a %xcc,.+8
  698. add %g1,1,%g1
  699. addcc $carry,%o4,%o4
  700. stx %o4,[$tp] ! tp[num-1]
  701. mov %g1,$carry
  702. bcs,a %xcc,.+8
  703. add $carry,1,$carry
  704. addcc $i,8,$i
  705. bnz %icc,.Louter
  706. nop
  707. add $tp,8,$tp ! adjust tp to point at the end
  708. orn %g0,%g0,%g4
  709. sub %g0,$num,%o7 ! n=-num
  710. ba .Lsub
  711. subcc %g0,%g0,%g0 ! clear %icc.c
  712. .align 32
  713. .Lsub:
  714. ldx [$tp+%o7],%o0
  715. add $np,%o7,%g1
  716. ld [%g1+0],%o2
  717. ld [%g1+4],%o3
  718. srlx %o0,32,%o1
  719. subccc %o0,%o2,%o2
  720. add $rp,%o7,%g1
  721. subccc %o1,%o3,%o3
  722. st %o2,[%g1+0]
  723. add %o7,8,%o7
  724. brnz,pt %o7,.Lsub
  725. st %o3,[%g1+4]
  726. subc $carry,0,%g4
  727. sub %g0,$num,%o7 ! n=-num
  728. ba .Lcopy
  729. nop
  730. .align 32
  731. .Lcopy:
  732. ldx [$tp+%o7],%o0
  733. add $rp,%o7,%g1
  734. ld [%g1+0],%o2
  735. ld [%g1+4],%o3
  736. stx %g0,[$tp+%o7]
  737. and %o0,%g4,%o0
  738. srlx %o0,32,%o1
  739. andn %o2,%g4,%o2
  740. andn %o3,%g4,%o3
  741. or %o2,%o0,%o0
  742. or %o3,%o1,%o1
  743. st %o0,[%g1+0]
  744. add %o7,8,%o7
  745. brnz,pt %o7,.Lcopy
  746. st %o1,[%g1+4]
  747. sub %g0,$num,%o7 ! n=-num
  748. .Lzap:
  749. stx %g0,[$ap_l+%o7]
  750. stx %g0,[$ap_h+%o7]
  751. stx %g0,[$np_l+%o7]
  752. stx %g0,[$np_h+%o7]
  753. add %o7,8,%o7
  754. brnz,pt %o7,.Lzap
  755. nop
  756. ldx [%sp+$bias+$frame+48],%o7
  757. wr %g0,%o7,%asi ! restore %asi
  758. mov 1,%i0
  759. .Lret:
  760. ret
  761. restore
  762. .type $fname,#function
  763. .size $fname,(.-$fname)
  764. .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
  765. .align 32
  766. ___
# Expand any `expr` occurrences in the assembly text by evaluating the
# Perl expression between backticks (the /e modifier runs eval() per
# match; /g all matches, /m multi-line).
  767. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  768. # Below substitution makes it possible to compile without demanding
  769. # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
  770. # dare to do this, because VIS capability is detected at run-time now
  771. # and this routine is not called on CPU not capable to execute it. Do
  772. # note that fzeros is not the only VIS dependency! Another dependency
  773. # is implicit and is just _a_ numerical value loaded to %asi register,
  774. # which assembler can't recognize as VIS specific...
# NOTE(review): 0x81b00c20 is presumably the fzeros opcode template with
# the destination register number OR'd into bits 29:25 ($1<<25) — confirm
# against the SPARC V9/VIS instruction encoding tables.
  775. $code =~ s/fzeros\s+%f([0-9]+)/
  776. sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
  777. /gem;
# Emit the finished assembly on stdout; close STDOUT explicitly so a
# write failure (e.g. full disk / broken pipe) is not silently lost.
  778. print $code;
  779. # flush
  780. close STDOUT;