#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike the integer multiplier, which simply stalls the whole
# CPU, the FPU is fully pipelined and can effectively emit a 48-bit
# partial product every cycle. Why not blended SPARC v9? One can argue
# that making this module dependent on the UltraSPARC VIS extension
# limits its binary compatibility. Well yes, it does exclude SPARC64
# prior-V(!) implementations from the compatibility matrix. But the
# rest, the whole Sun UltraSPARC family and Fujitsu's brand new SPARC64
# V, all support the VIS extension instructions used in this module.
# This is considered good enough to not care about HAL SPARC64 users
# [if any], who have the integer-only pure SPARCv9 module to fall
# back to.
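#
# To see why 16-bit limbs and 48-bit partial products go together
# [a back-of-envelope self-check, not part of the build]: a product of
# two 16-bit limbs is below 2^32, and even a dot product of 2^16 such
# products stays below 2^48, comfortably within the 53-bit mantissa of
# an IEEE double, so all the FP arithmetic below is exact:
#
#	my $limb = 2**16 - 1;			# largest 16-bit limb
#	my $acc  = $limb * $limb * 2**16;	# worst-case dot-product sum
#	die "inexact" unless $acc < 2**53;	# fits the 53-bit mantissa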
#
# USI&II cores currently exhibit uniform 2x improvement [over the pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves by a few percent for shorter keys and worsens
# by a few percent for longer keys. This is because the USIII integer
# multiplier is >3x faster than the USI&II one, which is harder to
# match [but see the TODO list below]. It should also be noted that
# SPARC64 V features out-of-order execution, which *might* mean that
# its integer multiplier is pipelined, which in turn *might* be
# impossible to match... On an additional note, SPARC64 V implements an
# FP Multiply-Add instruction, which is perfectly usable in this
# context... In other words, as far as Fujitsu SPARC64 V goes, talk to
# the author:-)
#
# The implementation imposes the following "non-natural" limitations on
# input arguments:
#
# - num may not be less than 4;
# - num has to be even;
#
# Failure to meet either condition is not fatal: the routine simply
# returns 0, signalling "unsupported input value" so that the caller
# can fall back to a generic implementation.
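#
# For reference, bn_mul_mont computes r = a*b*2^(-64*num) mod n. A
# minimal executable model [assuming Math::BigInt and 64-bit limbs,
# for illustration only, not part of the build] would be:
#
#	use Math::BigInt;
#	sub bn_mul_mont_ref {
#		my ($a,$b,$n,$num) = @_;	# Math::BigInt operands
#		my $R = Math::BigInt->bone->blsft(64*$num);
#		# r = a * b * R^-1 mod n
#		return $a->copy->bmul($b)
#			  ->bmul($R->copy->bmodinv($n))->bmod($n);
#	}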
#
# TODO:
# - modulo-schedule inner loop for better performance (on in-order
#   execution core such as UltraSPARC this shall result in further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];
#
######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow interleaving of floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.

$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

$fname="bn_mul_mont_fpu";

$frame="STACK_FRAME";
$bias="STACK_BIAS";
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values... This is
# because in the 32-bit ABI a register-window spill preserves only the
# low 32 bits of %l and %i registers, so they can't be trusted with
# 64-bit quantities.

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$tp="%l0";	# t[num]
$ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2";	# to these four vectors as double-precision FP values.
$np_l="%l3";	# This way a bunch of fxtods are eliminated in second
$np_h="%l4";	# loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7";	# 16-bit mask, 0xffff

$n0="%g4";	# reassigned(!) to "64-bit" register
$carry="%i4";	# %i4 reused(!) for a carry bit
# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#    ..a
#   ..b
$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";

$dota="%f24"; $dotb="%f26";

$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";

$ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load
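
# The ldda instructions below use this ASI to pull one big-endian
# 16-bit limb at a time into an FP register, zero-extended to 64 bits
# and ready for fxtod. For a 64-bit value stored as the word pair
# {lo32 at +0, hi32 at +4} the byte offsets map to the a/b/c/d limbs
# of the naming chart as follows [an illustration, inferred from the
# code below rather than stated anywhere]:
#
#	+2 -> a, bits 15-0	+0 -> b, bits 31-16
#	+6 -> c, bits 47-32	+4 -> d, bits 63-48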

$code=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	save	%sp,-$frame-$locals,%sp

	cmp	$num,4
	bl,a,pn	%icc,.Lret
	clr	%i0
	andcc	$num,1,%g0		! $num has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	$num,1,$num
	sethi	%hi(0xffff),$mask
	ld	[%i4+0],$n0		! $n0 reassigned, remember?
	or	$mask,%lo(0xffff),$mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]

	sll	$num,3,$num		! num*=8

	add	%sp,$bias,%o0		! real top of stack
	sll	$num,2,%o1
	add	%o1,$num,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,$bias,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,$bias+$frame+$locals,$tp
	add	$tp,$num,$ap_l
	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
	add	$ap_l,$num,$ap_h
	add	$ap_h,$num,$np_l
	add	$np_l,$num,$np_h
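
	! Scratch layout carved from the alloca above [a descriptive
	! note, inferred from the pointer arithmetic]: five equally
	! sized vectors back to back. $tp points at the base of t[],
	! while $ap_l/$ap_h/$np_l/$np_h park at their vectors' ends so
	! that the negative index $j can walk them upwards:
	!
	!	t[]    - temporary result
	!	ap_l[] - low  32-bit halves of a[j] as doubles
	!	ap_h[] - high 32-bit halves of a[j] as doubles
	!	np_l[] - low  32-bit halves of n[j] as doubles
	!	np_h[] - high 32-bit halves of n[j] as doubles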

	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads

	add	$rp,$num,$rp		! readjust input pointers to point
	add	$ap,$num,$ap		! at the ends too...
	add	$bp,$num,$bp
	add	$np,$num,$np

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i		! i=-num
	sub	%g0,$num,$j		! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! ap[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! bp[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
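	! This is the Montgomery "magic" multiplier m for iteration 0:
	! m = (ap[0]*bp[0] mod 2^64) * n0 mod 2^64, where n0 is
	! -n^-1 mod 2^64, so that t + m*n comes out divisible by 2^64
	! [an explanatory note; the original code takes this as read].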
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fxtod	$na,$na
	std	$ahi,[$ap_h+$j]
	fxtod	$nb,$nb
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fxtod	$nc,$nc
	std	$nhi,[$np_h+$j]
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	add	$j,8,$j
	std	$nlob,[%sp+$bias+$frame+8]
	add	$ap,$j,%o4
	std	$nloc,[%sp+$bias+$frame+16]
	add	$np,$j,%o5
	std	$nlod,[%sp+$bias+$frame+24]

	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	!and	%o0,$mask,%o0
	!and	%o1,$mask,%o1
	!and	%o2,$mask,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
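	! [Explanatory note:] each fdtox result above is a column sum
	! of 16x16-bit products and fits in well under 50 bits, so
	! after the three shift-by-16-and-add steps %o3 still fits in
	! 64 bits; dropping its low 16 bits [which belong to the
	! 64-bit word just assembled] leaves at most a 34-bit carry
	! into the next word, hence the comment above.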
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.L1stskip
	std	$nlod,[%sp+$bias+$frame+24]

.align	32			! incidentally already aligned !
.L1st:
	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	fmuld	$ahi,$bb,$ahib
	sllx	%o1,16,%o1
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	sllx	%o2,32,%o2
	fmuld	$ahi,$bc,$ahic
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	or	%o2,%o0,%o0
	fmuld	$ahi,$bd,$ahid
	or	%o7,%o0,%o0		! 64-bit result
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	addcc	%g1,%o0,%o0
	faddd	$dota,$nloa,$nloa
	srlx	%o3,16,%g1		! 34-bit carry
	faddd	$dotb,$nlob,$nlob
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp

.L1stskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+$bias+$frame+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,$carry
	stx	%o4,[$tp]		! tp[num-1]=

	ba	.Louter
	add	$i,8,$i
.align	32
.Louter:
	sub	%g0,$num,$j		! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! ap[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! bp[i]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
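	! [Explanatory note:] same Montgomery multiplier as in
	! iteration 0, except that the pending t[0] joins the product
	! first: m = (t[0] + ap[0]*bp[i]) * n0 mod 2^64.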
	stx	%o0,[%sp+$bias+$frame+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	fxtod	$na,$na
	ldd	[$ap_h+$j],$ahi
	fxtod	$nb,$nb
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	fxtod	$nc,$nc
	ldd	[$np_h+$j],$nhi
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	add	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]

	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[$tp],%o7
	faddd	$nloc,$nhia,$nloc
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.Linnerskip
	std	$nlod,[%sp+$bias+$frame+24]
	ba	.Linner
	nop
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	faddd	$nloc,$nhia,$nloc
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	$nlob,$nlob
	addcc	%o7,%o0,%o0
	fdtox	$nloc,$nloc
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	addcc	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp
.Linnerskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+$bias+$frame+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	$carry,%o4,%o4
	stx	%o4,[$tp]		! tp[num-1]
	mov	%g1,$carry
	bcs,a	%xcc,.+8
	add	$carry,1,$carry

	addcc	$i,8,$i
	bnz	%icc,.Louter
	nop

	add	$tp,8,$tp		! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c

.align	32
.Lsub:
	ldx	[$tp+%o7],%o0
	add	$np,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	$rp,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]

	subc	$carry,0,%g4
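	! [Explanatory note:] %g4 now selects the final result:
	! .Lsub computed rp=tp-np, and here %g4 becomes carry-borrow,
	! i.e. 0 when tp>=np [take the difference left in rp] and
	! all-ones when the subtraction borrowed [keep tp]. .Lcopy
	! below merges the two with and/andn using %g4 as the mask.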
	sub	%g0,$num,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[$tp+%o7],%o0
	add	$rp,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[$tp+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]

	sub	%g0,$num,%o7		! n=-num
.Lzap:
	stx	%g0,[$ap_l+%o7]
	stx	%g0,[$ap_h+%o7]
	stx	%g0,[$np_l+%o7]
	stx	%g0,[$np_h+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+$bias+$frame+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# The substitution below makes it possible to compile without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this because VIS capability is detected at run-time now
# and this routine is not called on CPUs not capable of executing it.
# Do note that fzeros is not the only VIS dependency! The other
# dependency is implicit, just _a_ numerical value loaded into the
# %asi register, which the assembler can't recognize as VIS-specific...
$code =~ s/fzeros\s+%f([0-9]+)/
	sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
/gem;
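
# For example [a worked expansion, assuming %f16 as in $alo above]:
# 0x81b00c20 is the fzeros opcode template with a zero rd field, and
# the destination register number goes into bits 29-25, so
# 0x81b00c20|(16<<25) = 0xa1b00c20 and the line becomes
# ".word 0xa1b00c20 ! fzeros %f16".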

print $code;
# flush
close STDOUT;