#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike the integer multiplier, which simply stalls the whole
# CPU, the FPU is fully pipelined and can effectively emit a 48-bit
# partial product every cycle. Why not blended SPARC v9? One can argue
# that making this module dependent on the UltraSPARC VIS extension
# limits its binary compatibility. Well yes, it does exclude SPARC64
# prior-V(!) implementations from the compatibility matrix. But the
# rest, the whole Sun UltraSPARC family and Fujitsu's brand new
# SPARC64 V, all support the VIS instructions used in this module.
# This is considered good enough not to care about HAL SPARC64 users
# [if any] who have the integer-only pure SPARCv9 module to fall
# back to.
#
# USI&II cores currently exhibit a uniform 2x improvement [over the
# pre-bn_mul_mont codebase] for all key lengths and benchmarks. On
# USIII performance improves by a few percent for shorter keys and
# worsens by a few percent for longer keys. This is because the USIII
# integer multiplier is >3x faster than the USI&II one, which is
# harder to match [but see the TODO list below]. It should also be
# noted that SPARC64 V features out-of-order execution, which *might*
# mean that its integer multiplier is pipelined, which in turn *might*
# be impossible to match... On an additional note, SPARC64 V
# implements an FP multiply-add instruction, which is perfectly usable
# in this context... In other words, as far as Fujitsu SPARC64 V goes,
# talk to the author:-)
#
# The implementation imposes the following "non-natural" limitations
# on input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition is not fatal; it simply gives no
# performance gain.
#
# TODO:
# - modulo-schedule the inner loop for better performance (on an
#   in-order execution core such as UltraSPARC this should result in
#   a further noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops make it possible to interleave floating-
# point and integer instructions and minimize Read-After-Write
# penalties. This results in a *further* 20-50% performance improvement
# [depending on key length, more for longer keys] on USI&II cores and
# 30-80% on USIII&IV.
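#
# For reference, the routine computes the Montgomery product
# a*b*R^(-1) mod n. A minimal model of that contract (this editor's
# sketch, not part of the original module; Math::BigInt stands in for
# the BIGNUM arithmetic and the sub is never called here):
#
#	use Math::BigInt;
#	sub mont_ref {
#		my ($a,$b,$n,$numbits) = @_;	# R = 2^$numbits
#		my $R = Math::BigInt->bone->blsft($numbits);
#		my $t = $a->copy->bmul($b);		# plain product...
#		$t->bmul($R->copy->bmodinv($n));	# ...times R^(-1)...
#		return $t->bmod($n);			# ...reduced mod n
#	}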
# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$output and open STDOUT,">$output";

$fname="bn_mul_mont_fpu";

$frame="STACK_FRAME";
$bias="STACK_BIAS";
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$tp="%l0";	# t[num]
$ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2";	# to these four vectors as double-precision FP values.
$np_l="%l3";	# This way a bunch of fxtods are eliminated in second
$np_h="%l4";	# loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7";	# 16-bit mask, 0xffff

$n0="%g4";	# reassigned(!) to "64-bit" register
$carry="%i4";	# %i4 reused(!) for a carry bit
# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#    ..a
#   ..b
$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";

$dota="%f24"; $dotb="%f26";

$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";

$ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load
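
# A minimal model of the limb arithmetic the assembly below performs
# (an editor's sketch for commentary only: the sub is never called
# during code generation, its name is not part of the original module,
# and it assumes a perl with 64-bit integers). b[i] is split into four
# 16-bit digits and a[j] into two 32-bit halves, so each partial
# product fits in 48 bits and a sum of two of them in 49 bits - safely
# within the 53-bit mantissa of a double, which is what lets fmuld and
# faddd do exact integer work; the n[j]*m products are accumulated
# into the same columns in the real code. The four column sums are
# then folded into one 64-bit word plus a carry, just like the
# srlx/and/sllx/or runs inside the loops.
sub _columns_model {
	my ($aj,$bi) = @_;
	my @ah = map { ($aj >> 32*$_) & 0xffffffff } (0..1);
	my @bd = map { ($bi >> 16*$_) & 0xffff     } (0..3);
	# column sums at bit offsets 0,16,32,48; the two topmost
	# products spill past bit 63 and are deferred to the next
	# word, exactly like $dota/$dotb in the real schedule
	my @o   = ($ah[0]*$bd[0],
		   $ah[0]*$bd[1],
		   $ah[0]*$bd[2] + $ah[1]*$bd[0],
		   $ah[0]*$bd[3] + $ah[1]*$bd[1]);
	my @dot = ($ah[1]*$bd[2], $ah[1]*$bd[3]);
	$o[1] += $o[0] >> 16;		# ripple 16-bit carries up
	$o[2] += $o[1] >> 16;
	$o[3] += $o[2] >> 16;
	my $w = ($o[0] & 0xffff)
	      | ($o[1] & 0xffff) << 16
	      | ($o[2] & 0xffff) << 32
	      |  $o[3]           << 48;	# truncates like sllx
	return ($w, $o[3] >> 16, @dot);	# word, "34-bit carry", deferrals
}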
$code=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.global $fname
.align	32
$fname:
	save	%sp,-$frame-$locals,%sp
	cmp	$num,4
	bl,a,pn	%icc,.Lret
	clr	%i0
	andcc	$num,1,%g0	! $num has to be even...
	bnz,a,pn	%icc,.Lret
	clr	%i0		! signal "unsupported input value"

	srl	$num,1,$num
	sethi	%hi(0xffff),$mask
	ld	[%i4+0],$n0	! $n0 reassigned, remember?
	or	$mask,%lo(0xffff),$mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0	! $n0=n0[1].n0[0]

	sll	$num,3,$num	! num*=8

	add	%sp,$bias,%o0	! real top of stack
	sll	$num,2,%o1
	add	%o1,$num,%o1	! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0	! optimize TLB utilization
	sub	%o0,$bias,%sp	! alloca(5*num*8)

	rd	%asi,%o7	! save %asi
	add	%sp,$bias+$frame+$locals,$tp
	add	$tp,$num,$ap_l
	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
	add	$ap_l,$num,$ap_h
	add	$ap_h,$num,$np_l
	add	$np_l,$num,$np_h

	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads

	add	$rp,$num,$rp	! readjust input pointers to point
	add	$ap,$num,$ap	! at the ends too...
	add	$bp,$num,$bp
	add	$np,$num,$np

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i	! i=-num
	sub	%g0,$num,$j	! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1	! ap[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! bp[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0	! ap[0]*bp[0]
	mulx	$n0,%o0,%o0	! ap[0]*bp[0]*n0
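	! only the low 64 bits of the product matter: this is the
	! Montgomery multiplier m = ap[0]*bp[0]*n0 mod 2^64 for the
	! first pass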
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	std	$alo,[$ap_l+$j]	! save smashed ap[j] in double format
	fxtod	$na,$na
	std	$ahi,[$ap_h+$j]
	fxtod	$nb,$nb
	std	$nlo,[$np_l+$j]	! save smashed np[j] in double format
	fxtod	$nc,$nc
	std	$nhi,[$np_h+$j]
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	add	$j,8,$j
	std	$nlob,[%sp+$bias+$frame+8]
	add	$ap,$j,%o4
	std	$nloc,[%sp+$bias+$frame+16]
	add	$np,$j,%o5
	std	$nlod,[%sp+$bias+$frame+24]
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]	! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]	! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	!and	%o0,$mask,%o0
	!and	%o1,$mask,%o1
	!and	%o2,$mask,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0	! 64-bit result
	srlx	%o3,16,%g1	! 34-bit carry
	fmuld	$ahi,$bb,$ahib

	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.L1stskip
	std	$nlod,[%sp+$bias+$frame+24]
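
	! 1st-pass loop, modulo-scheduled: a[j] and n[j] go in as
	! 2x32-bit doubles, b[0] and m as 4x16-bit doubles; sixteen
	! fmuld results are summed pairwise, the top two ($dota/$dotb)
	! deferred to the next word, while the previous iteration's
	! fdtox output is folded into tp[] with a rippling 16-bit
	! carry (the srlx/and/sllx/or run below)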
.align	32			! incidentally already aligned !
.L1st:
	add	$ap,$j,%o4
	add	$np,$j,%o5

	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]	! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]	! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	fmuld	$ahi,$bb,$ahib
	sllx	%o1,16,%o1
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	sllx	%o2,32,%o2
	fmuld	$ahi,$bc,$ahic
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	or	%o2,%o0,%o0
	fmuld	$ahi,$bd,$ahid
	or	%o7,%o0,%o0	! 64-bit result
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	addcc	%g1,%o0,%o0
	faddd	$dota,$nloa,$nloa
	srlx	%o3,16,%g1	! 34-bit carry
	faddd	$dotb,$nlob,$nlob
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]=
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp
.L1stskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+$bias+$frame+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+$bias+$frame+40],%o5
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]=
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,$carry
	stx	%o4,[$tp]	! tp[num-1]=

	ba	.Louter
	add	$i,8,$i
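
	! outer loop: one pass over tp[] per remaining b[i]; a[j] and
	! n[j] are now reloaded in double format from the stashed
	! vectors, and m = (tp[0]+a[0]*b[i])*n0 mod 2^64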
.align	32
.Louter:
	sub	%g0,$num,$j	! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1	! ap[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! bp[i]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2	! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0	! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+$bias+$frame+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	ldd	[$ap_l+$j],$alo	! load a[j] in double format
	fxtod	$na,$na
	ldd	[$ap_h+$j],$ahi
	fxtod	$nb,$nb
	ldd	[$np_l+$j],$nlo	! load n[j] in double format
	fxtod	$nc,$nc
	ldd	[$np_h+$j],$nhi
	fxtod	$nd,$nd
	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	add	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]

	ldd	[$ap_l+$j],$alo	! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo	! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3

	fmuld	$ahi,$ba,$ahia
	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[$tp],%o7
	faddd	$nloc,$nhia,$nloc
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.Linnerskip
	std	$nlod,[%sp+$bias+$frame+24]

	ba	.Linner
	nop
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo	! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo	! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3

	fmuld	$ahi,$ba,$ahia
	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0	! 64-bit result
	faddd	$nloc,$nhia,$nloc
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7	! tp[j]
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	$nlob,$nlob
	addcc	%o7,%o0,%o0
	fdtox	$nloc,$nloc
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	addcc	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp
.Linnerskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+$bias+$frame+32],%o4
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+$bias+$frame+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7	! tp[j]
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]	! tp[j-1]
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	$carry,%o4,%o4
	stx	%o4,[$tp]	! tp[num-1]
	mov	%g1,$carry
	bcs,a	%xcc,.+8
	add	$carry,1,$carry

	addcc	$i,8,$i
	bnz	%icc,.Louter
	nop

	add	$tp,8,$tp	! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,$num,%o7	! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c
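
	! tp[] now holds the Montgomery product, possibly still >= the
	! modulus: store tp[]-np[] to rp[], recording the final borrow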
.align	32
.Lsub:
	ldx	[$tp+%o7],%o0
	add	$np,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	$rp,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]

	subc	$carry,0,%g4
	sub	%g0,$num,%o7	! n=-num
	ba	.Lcopy
	nop
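
	! %g4 is all ones if tp[] < np[] (keep tp[]), zero otherwise
	! (take the difference already stored in rp[]); merge the two
	! accordingly, zeroing tp[] on the way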
.align	32
.Lcopy:
	ldx	[$tp+%o7],%o0
	add	$rp,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[$tp+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]

	sub	%g0,$num,%o7	! n=-num
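
	! wipe the double-format copies of a[] and n[] off the stack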
.Lzap:
	stx	%g0,[$ap_l+%o7]
	stx	%g0,[$ap_h+%o7]
	stx	%g0,[$np_l+%o7]
	stx	%g0,[$np_h+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+$bias+$frame+48],%o7
	wr	%g0,%o7,%asi	! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# The substitution below makes it possible to compile without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this because VIS capability is detected at run-time now and
# this routine is not called on CPUs incapable of executing it. Do note
# that fzeros is not the only VIS dependency! Another dependency is
# implicit: it is just _a_ numerical value loaded into the %asi register,
# which the assembler cannot recognize as VIS-specific...
$code =~ s/fzeros\s+%f([0-9]+)/
	sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
	/gem;
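# For instance, "fzeros %f2" becomes ".word 0x85b00c20", i.e.
# 0x81b00c20|(2<<25), with the destination register number going into
# the rd field of the opcode.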
print $code;
# flush
close STDOUT or die "error closing STDOUT: $!";