# sparcv9a-mont.pl
  1. #! /usr/bin/env perl
  2. # Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # October 2005
  15. #
  16. # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
  17. # Because unlike integer multiplier, which simply stalls whole CPU,
  18. # FPU is fully pipelined and can effectively emit 48 bit partial
  19. # product every cycle. Why not blended SPARC v9? One can argue that
  20. # making this module dependent on UltraSPARC VIS extension limits its
  21. # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
  22. # implementations from compatibility matrix. But the rest, whole Sun
  23. # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
  24. # VIS extension instructions used in this module. This is considered
  25. # good enough to not care about HAL SPARC64 users [if any] who have
  26. # integer-only pure SPARCv9 module to "fall down" to.
  27. # USI&II cores currently exhibit uniform 2x improvement [over pre-
  28. # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
  29. # performance improves few percents for shorter keys and worsens few
  30. # percents for longer keys. This is because USIII integer multiplier
  31. # is >3x faster than USI&II one, which is harder to match [but see
  32. # TODO list below]. It should also be noted that SPARC64 V features
  33. # out-of-order execution, which *might* mean that integer multiplier
  34. # is pipelined, which in turn *might* be impossible to match... On
  35. # additional note, SPARC64 V implements FP Multiply-Add instruction,
  36. # which is perfectly usable in this context... In other words, as far
  37. # as Fujitsu SPARC64 V goes, talk to the author:-)
  38. # The implementation implies following "non-natural" limitations on
  39. # input arguments:
  40. # - num may not be less than 4;
  41. # - num has to be even;
  42. # Failure to meet either condition has no fatal effects, simply
  43. # doesn't give any performance gain.
  44. # TODO:
  45. # - modulo-schedule inner loop for better performance (on in-order
  46. # execution core such as UltraSPARC this shall result in further
  47. # noticeable(!) improvement);
  48. # - dedicated squaring procedure[?];
  49. ######################################################################
  50. # November 2006
  51. #
  52. # Modulo-scheduled inner loops allow to interleave floating point and
  53. # integer instructions and minimize Read-After-Write penalties. This
  54. # results in *further* 20-50% performance improvement [depending on
  55. # key length, more for longer keys] on USI&II cores and 30-80% - on
  56. # USIII&IV.
  57. # $output is the last argument if it looks like a file (it has an extension)
  58. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  59. $output and open STDOUT,">$output";
  60. $fname="bn_mul_mont_fpu";
  61. $frame="STACK_FRAME";
  62. $bias="STACK_BIAS";
  63. $locals=64;
  64. # In order to provide for 32-/64-bit ABI duality, I keep integers wider
  65. # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
  66. # exclusively for pointers, indexes and other small values...
  67. # int bn_mul_mont(
  68. $rp="%i0"; # BN_ULONG *rp,
  69. $ap="%i1"; # const BN_ULONG *ap,
  70. $bp="%i2"; # const BN_ULONG *bp,
  71. $np="%i3"; # const BN_ULONG *np,
  72. $n0="%i4"; # const BN_ULONG *n0,
  73. $num="%i5"; # int num);
  74. $tp="%l0"; # t[num]
  75. $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
  76. $ap_h="%l2"; # to these four vectors as double-precision FP values.
  77. $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
  78. $np_h="%l4"; # loop and L1-cache aliasing is minimized...
  79. $i="%l5";
  80. $j="%l6";
  81. $mask="%l7"; # 16-bit mask, 0xffff
  82. $n0="%g4"; # reassigned(!) to "64-bit" register
  83. $carry="%i4"; # %i4 reused(!) for a carry bit
  84. # FP register naming chart
  85. #
  86. # ..HILO
  87. # dcba
  88. # --------
  89. # LOa
  90. # LOb
  91. # LOc
  92. # LOd
  93. # HIa
  94. # HIb
  95. # HIc
  96. # HId
  97. # ..a
  98. # ..b
  99. $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
  100. $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
  101. $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
  102. $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
  103. $dota="%f24"; $dotb="%f26";
  104. $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
  105. $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
  106. $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
  107. $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
  108. $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
  109. $code=<<___;
  110. #ifndef __ASSEMBLER__
  111. # define __ASSEMBLER__ 1
  112. #endif
  113. #include "crypto/sparc_arch.h"
  114. .section ".text",#alloc,#execinstr
  115. .global $fname
  116. .align 32
  117. $fname:
  118. save %sp,-$frame-$locals,%sp
  119. cmp $num,4
  120. bl,a,pn %icc,.Lret
  121. clr %i0
  122. andcc $num,1,%g0 ! $num has to be even...
  123. bnz,a,pn %icc,.Lret
  124. clr %i0 ! signal "unsupported input value"
  125. srl $num,1,$num
  126. sethi %hi(0xffff),$mask
  127. ld [%i4+0],$n0 ! $n0 reassigned, remember?
  128. or $mask,%lo(0xffff),$mask
  129. ld [%i4+4],%o0
  130. sllx %o0,32,%o0
  131. or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
  132. sll $num,3,$num ! num*=8
  133. add %sp,$bias,%o0 ! real top of stack
  134. sll $num,2,%o1
  135. add %o1,$num,%o1 ! %o1=num*5
  136. sub %o0,%o1,%o0
  137. and %o0,-2048,%o0 ! optimize TLB utilization
  138. sub %o0,$bias,%sp ! alloca(5*num*8)
  139. rd %asi,%o7 ! save %asi
  140. add %sp,$bias+$frame+$locals,$tp
  141. add $tp,$num,$ap_l
  142. add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
  143. add $ap_l,$num,$ap_h
  144. add $ap_h,$num,$np_l
  145. add $np_l,$num,$np_h
  146. wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
  147. add $rp,$num,$rp ! readjust input pointers to point
  148. add $ap,$num,$ap ! at the ends too...
  149. add $bp,$num,$bp
  150. add $np,$num,$np
  151. stx %o7,[%sp+$bias+$frame+48] ! save %asi
  152. sub %g0,$num,$i ! i=-num
  153. sub %g0,$num,$j ! j=-num
  154. add $ap,$j,%o3
  155. add $bp,$i,%o4
  156. ld [%o3+4],%g1 ! bp[0]
  157. ld [%o3+0],%o0
  158. ld [%o4+4],%g5 ! ap[0]
  159. sllx %g1,32,%g1
  160. ld [%o4+0],%o1
  161. sllx %g5,32,%g5
  162. or %g1,%o0,%o0
  163. or %g5,%o1,%o1
  164. add $np,$j,%o5
  165. mulx %o1,%o0,%o0 ! ap[0]*bp[0]
  166. mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
  167. stx %o0,[%sp+$bias+$frame+0]
  168. ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
  169. fzeros $alo
  170. ld [%o3+4],$ahi_
  171. fzeros $ahi
  172. ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
  173. fzeros $nlo
  174. ld [%o5+4],$nhi_
  175. fzeros $nhi
  176. ! transfer b[i] to FPU as 4x16-bit values
  177. ldda [%o4+2]%asi,$ba
  178. fxtod $alo,$alo
  179. ldda [%o4+0]%asi,$bb
  180. fxtod $ahi,$ahi
  181. ldda [%o4+6]%asi,$bc
  182. fxtod $nlo,$nlo
  183. ldda [%o4+4]%asi,$bd
  184. fxtod $nhi,$nhi
  185. ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
  186. ldda [%sp+$bias+$frame+6]%asi,$na
  187. fxtod $ba,$ba
  188. ldda [%sp+$bias+$frame+4]%asi,$nb
  189. fxtod $bb,$bb
  190. ldda [%sp+$bias+$frame+2]%asi,$nc
  191. fxtod $bc,$bc
  192. ldda [%sp+$bias+$frame+0]%asi,$nd
  193. fxtod $bd,$bd
  194. std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
  195. fxtod $na,$na
  196. std $ahi,[$ap_h+$j]
  197. fxtod $nb,$nb
  198. std $nlo,[$np_l+$j] ! save smashed np[j] in double format
  199. fxtod $nc,$nc
  200. std $nhi,[$np_h+$j]
  201. fxtod $nd,$nd
  202. fmuld $alo,$ba,$aloa
  203. fmuld $nlo,$na,$nloa
  204. fmuld $alo,$bb,$alob
  205. fmuld $nlo,$nb,$nlob
  206. fmuld $alo,$bc,$aloc
  207. faddd $aloa,$nloa,$nloa
  208. fmuld $nlo,$nc,$nloc
  209. fmuld $alo,$bd,$alod
  210. faddd $alob,$nlob,$nlob
  211. fmuld $nlo,$nd,$nlod
  212. fmuld $ahi,$ba,$ahia
  213. faddd $aloc,$nloc,$nloc
  214. fmuld $nhi,$na,$nhia
  215. fmuld $ahi,$bb,$ahib
  216. faddd $alod,$nlod,$nlod
  217. fmuld $nhi,$nb,$nhib
  218. fmuld $ahi,$bc,$ahic
  219. faddd $ahia,$nhia,$nhia
  220. fmuld $nhi,$nc,$nhic
  221. fmuld $ahi,$bd,$ahid
  222. faddd $ahib,$nhib,$nhib
  223. fmuld $nhi,$nd,$nhid
  224. faddd $ahic,$nhic,$dota ! $nhic
  225. faddd $ahid,$nhid,$dotb ! $nhid
  226. faddd $nloc,$nhia,$nloc
  227. faddd $nlod,$nhib,$nlod
  228. fdtox $nloa,$nloa
  229. fdtox $nlob,$nlob
  230. fdtox $nloc,$nloc
  231. fdtox $nlod,$nlod
  232. std $nloa,[%sp+$bias+$frame+0]
  233. add $j,8,$j
  234. std $nlob,[%sp+$bias+$frame+8]
  235. add $ap,$j,%o4
  236. std $nloc,[%sp+$bias+$frame+16]
  237. add $np,$j,%o5
  238. std $nlod,[%sp+$bias+$frame+24]
  239. ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
  240. fzeros $alo
  241. ld [%o4+4],$ahi_
  242. fzeros $ahi
  243. ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
  244. fzeros $nlo
  245. ld [%o5+4],$nhi_
  246. fzeros $nhi
  247. fxtod $alo,$alo
  248. fxtod $ahi,$ahi
  249. fxtod $nlo,$nlo
  250. fxtod $nhi,$nhi
  251. ldx [%sp+$bias+$frame+0],%o0
  252. fmuld $alo,$ba,$aloa
  253. ldx [%sp+$bias+$frame+8],%o1
  254. fmuld $nlo,$na,$nloa
  255. ldx [%sp+$bias+$frame+16],%o2
  256. fmuld $alo,$bb,$alob
  257. ldx [%sp+$bias+$frame+24],%o3
  258. fmuld $nlo,$nb,$nlob
  259. srlx %o0,16,%o7
  260. std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
  261. fmuld $alo,$bc,$aloc
  262. add %o7,%o1,%o1
  263. std $ahi,[$ap_h+$j]
  264. faddd $aloa,$nloa,$nloa
  265. fmuld $nlo,$nc,$nloc
  266. srlx %o1,16,%o7
  267. std $nlo,[$np_l+$j] ! save smashed np[j] in double format
  268. fmuld $alo,$bd,$alod
  269. add %o7,%o2,%o2
  270. std $nhi,[$np_h+$j]
  271. faddd $alob,$nlob,$nlob
  272. fmuld $nlo,$nd,$nlod
  273. srlx %o2,16,%o7
  274. fmuld $ahi,$ba,$ahia
  275. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  276. faddd $aloc,$nloc,$nloc
  277. fmuld $nhi,$na,$nhia
  278. !and %o0,$mask,%o0
  279. !and %o1,$mask,%o1
  280. !and %o2,$mask,%o2
  281. !sllx %o1,16,%o1
  282. !sllx %o2,32,%o2
  283. !sllx %o3,48,%o7
  284. !or %o1,%o0,%o0
  285. !or %o2,%o0,%o0
  286. !or %o7,%o0,%o0 ! 64-bit result
  287. srlx %o3,16,%g1 ! 34-bit carry
  288. fmuld $ahi,$bb,$ahib
  289. faddd $alod,$nlod,$nlod
  290. fmuld $nhi,$nb,$nhib
  291. fmuld $ahi,$bc,$ahic
  292. faddd $ahia,$nhia,$nhia
  293. fmuld $nhi,$nc,$nhic
  294. fmuld $ahi,$bd,$ahid
  295. faddd $ahib,$nhib,$nhib
  296. fmuld $nhi,$nd,$nhid
  297. faddd $dota,$nloa,$nloa
  298. faddd $dotb,$nlob,$nlob
  299. faddd $ahic,$nhic,$dota ! $nhic
  300. faddd $ahid,$nhid,$dotb ! $nhid
  301. faddd $nloc,$nhia,$nloc
  302. faddd $nlod,$nhib,$nlod
  303. fdtox $nloa,$nloa
  304. fdtox $nlob,$nlob
  305. fdtox $nloc,$nloc
  306. fdtox $nlod,$nlod
  307. std $nloa,[%sp+$bias+$frame+0]
  308. std $nlob,[%sp+$bias+$frame+8]
  309. addcc $j,8,$j
  310. std $nloc,[%sp+$bias+$frame+16]
  311. bz,pn %icc,.L1stskip
  312. std $nlod,[%sp+$bias+$frame+24]
  313. .align 32 ! incidentally already aligned !
  314. .L1st:
  315. add $ap,$j,%o4
  316. add $np,$j,%o5
  317. ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
  318. fzeros $alo
  319. ld [%o4+4],$ahi_
  320. fzeros $ahi
  321. ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
  322. fzeros $nlo
  323. ld [%o5+4],$nhi_
  324. fzeros $nhi
  325. fxtod $alo,$alo
  326. fxtod $ahi,$ahi
  327. fxtod $nlo,$nlo
  328. fxtod $nhi,$nhi
  329. ldx [%sp+$bias+$frame+0],%o0
  330. fmuld $alo,$ba,$aloa
  331. ldx [%sp+$bias+$frame+8],%o1
  332. fmuld $nlo,$na,$nloa
  333. ldx [%sp+$bias+$frame+16],%o2
  334. fmuld $alo,$bb,$alob
  335. ldx [%sp+$bias+$frame+24],%o3
  336. fmuld $nlo,$nb,$nlob
  337. srlx %o0,16,%o7
  338. std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
  339. fmuld $alo,$bc,$aloc
  340. add %o7,%o1,%o1
  341. std $ahi,[$ap_h+$j]
  342. faddd $aloa,$nloa,$nloa
  343. fmuld $nlo,$nc,$nloc
  344. srlx %o1,16,%o7
  345. std $nlo,[$np_l+$j] ! save smashed np[j] in double format
  346. fmuld $alo,$bd,$alod
  347. add %o7,%o2,%o2
  348. std $nhi,[$np_h+$j]
  349. faddd $alob,$nlob,$nlob
  350. fmuld $nlo,$nd,$nlod
  351. srlx %o2,16,%o7
  352. fmuld $ahi,$ba,$ahia
  353. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  354. and %o0,$mask,%o0
  355. faddd $aloc,$nloc,$nloc
  356. fmuld $nhi,$na,$nhia
  357. and %o1,$mask,%o1
  358. and %o2,$mask,%o2
  359. fmuld $ahi,$bb,$ahib
  360. sllx %o1,16,%o1
  361. faddd $alod,$nlod,$nlod
  362. fmuld $nhi,$nb,$nhib
  363. sllx %o2,32,%o2
  364. fmuld $ahi,$bc,$ahic
  365. sllx %o3,48,%o7
  366. or %o1,%o0,%o0
  367. faddd $ahia,$nhia,$nhia
  368. fmuld $nhi,$nc,$nhic
  369. or %o2,%o0,%o0
  370. fmuld $ahi,$bd,$ahid
  371. or %o7,%o0,%o0 ! 64-bit result
  372. faddd $ahib,$nhib,$nhib
  373. fmuld $nhi,$nd,$nhid
  374. addcc %g1,%o0,%o0
  375. faddd $dota,$nloa,$nloa
  376. srlx %o3,16,%g1 ! 34-bit carry
  377. faddd $dotb,$nlob,$nlob
  378. bcs,a %xcc,.+8
  379. add %g1,1,%g1
  380. stx %o0,[$tp] ! tp[j-1]=
  381. faddd $ahic,$nhic,$dota ! $nhic
  382. faddd $ahid,$nhid,$dotb ! $nhid
  383. faddd $nloc,$nhia,$nloc
  384. faddd $nlod,$nhib,$nlod
  385. fdtox $nloa,$nloa
  386. fdtox $nlob,$nlob
  387. fdtox $nloc,$nloc
  388. fdtox $nlod,$nlod
  389. std $nloa,[%sp+$bias+$frame+0]
  390. std $nlob,[%sp+$bias+$frame+8]
  391. std $nloc,[%sp+$bias+$frame+16]
  392. std $nlod,[%sp+$bias+$frame+24]
  393. addcc $j,8,$j
  394. bnz,pt %icc,.L1st
  395. add $tp,8,$tp
  396. .L1stskip:
  397. fdtox $dota,$dota
  398. fdtox $dotb,$dotb
  399. ldx [%sp+$bias+$frame+0],%o0
  400. ldx [%sp+$bias+$frame+8],%o1
  401. ldx [%sp+$bias+$frame+16],%o2
  402. ldx [%sp+$bias+$frame+24],%o3
  403. srlx %o0,16,%o7
  404. std $dota,[%sp+$bias+$frame+32]
  405. add %o7,%o1,%o1
  406. std $dotb,[%sp+$bias+$frame+40]
  407. srlx %o1,16,%o7
  408. add %o7,%o2,%o2
  409. srlx %o2,16,%o7
  410. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  411. and %o0,$mask,%o0
  412. and %o1,$mask,%o1
  413. and %o2,$mask,%o2
  414. sllx %o1,16,%o1
  415. sllx %o2,32,%o2
  416. sllx %o3,48,%o7
  417. or %o1,%o0,%o0
  418. or %o2,%o0,%o0
  419. or %o7,%o0,%o0 ! 64-bit result
  420. ldx [%sp+$bias+$frame+32],%o4
  421. addcc %g1,%o0,%o0
  422. ldx [%sp+$bias+$frame+40],%o5
  423. srlx %o3,16,%g1 ! 34-bit carry
  424. bcs,a %xcc,.+8
  425. add %g1,1,%g1
  426. stx %o0,[$tp] ! tp[j-1]=
  427. add $tp,8,$tp
  428. srlx %o4,16,%o7
  429. add %o7,%o5,%o5
  430. and %o4,$mask,%o4
  431. sllx %o5,16,%o7
  432. or %o7,%o4,%o4
  433. addcc %g1,%o4,%o4
  434. srlx %o5,48,%g1
  435. bcs,a %xcc,.+8
  436. add %g1,1,%g1
  437. mov %g1,$carry
  438. stx %o4,[$tp] ! tp[num-1]=
  439. ba .Louter
  440. add $i,8,$i
  441. .align 32
  442. .Louter:
  443. sub %g0,$num,$j ! j=-num
  444. add %sp,$bias+$frame+$locals,$tp
  445. add $ap,$j,%o3
  446. add $bp,$i,%o4
  447. ld [%o3+4],%g1 ! bp[i]
  448. ld [%o3+0],%o0
  449. ld [%o4+4],%g5 ! ap[0]
  450. sllx %g1,32,%g1
  451. ld [%o4+0],%o1
  452. sllx %g5,32,%g5
  453. or %g1,%o0,%o0
  454. or %g5,%o1,%o1
  455. ldx [$tp],%o2 ! tp[0]
  456. mulx %o1,%o0,%o0
  457. addcc %o2,%o0,%o0
  458. mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
  459. stx %o0,[%sp+$bias+$frame+0]
  460. ! transfer b[i] to FPU as 4x16-bit values
  461. ldda [%o4+2]%asi,$ba
  462. ldda [%o4+0]%asi,$bb
  463. ldda [%o4+6]%asi,$bc
  464. ldda [%o4+4]%asi,$bd
  465. ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
  466. ldda [%sp+$bias+$frame+6]%asi,$na
  467. fxtod $ba,$ba
  468. ldda [%sp+$bias+$frame+4]%asi,$nb
  469. fxtod $bb,$bb
  470. ldda [%sp+$bias+$frame+2]%asi,$nc
  471. fxtod $bc,$bc
  472. ldda [%sp+$bias+$frame+0]%asi,$nd
  473. fxtod $bd,$bd
  474. ldd [$ap_l+$j],$alo ! load a[j] in double format
  475. fxtod $na,$na
  476. ldd [$ap_h+$j],$ahi
  477. fxtod $nb,$nb
  478. ldd [$np_l+$j],$nlo ! load n[j] in double format
  479. fxtod $nc,$nc
  480. ldd [$np_h+$j],$nhi
  481. fxtod $nd,$nd
  482. fmuld $alo,$ba,$aloa
  483. fmuld $nlo,$na,$nloa
  484. fmuld $alo,$bb,$alob
  485. fmuld $nlo,$nb,$nlob
  486. fmuld $alo,$bc,$aloc
  487. faddd $aloa,$nloa,$nloa
  488. fmuld $nlo,$nc,$nloc
  489. fmuld $alo,$bd,$alod
  490. faddd $alob,$nlob,$nlob
  491. fmuld $nlo,$nd,$nlod
  492. fmuld $ahi,$ba,$ahia
  493. faddd $aloc,$nloc,$nloc
  494. fmuld $nhi,$na,$nhia
  495. fmuld $ahi,$bb,$ahib
  496. faddd $alod,$nlod,$nlod
  497. fmuld $nhi,$nb,$nhib
  498. fmuld $ahi,$bc,$ahic
  499. faddd $ahia,$nhia,$nhia
  500. fmuld $nhi,$nc,$nhic
  501. fmuld $ahi,$bd,$ahid
  502. faddd $ahib,$nhib,$nhib
  503. fmuld $nhi,$nd,$nhid
  504. faddd $ahic,$nhic,$dota ! $nhic
  505. faddd $ahid,$nhid,$dotb ! $nhid
  506. faddd $nloc,$nhia,$nloc
  507. faddd $nlod,$nhib,$nlod
  508. fdtox $nloa,$nloa
  509. fdtox $nlob,$nlob
  510. fdtox $nloc,$nloc
  511. fdtox $nlod,$nlod
  512. std $nloa,[%sp+$bias+$frame+0]
  513. std $nlob,[%sp+$bias+$frame+8]
  514. std $nloc,[%sp+$bias+$frame+16]
  515. add $j,8,$j
  516. std $nlod,[%sp+$bias+$frame+24]
  517. ldd [$ap_l+$j],$alo ! load a[j] in double format
  518. ldd [$ap_h+$j],$ahi
  519. ldd [$np_l+$j],$nlo ! load n[j] in double format
  520. ldd [$np_h+$j],$nhi
  521. fmuld $alo,$ba,$aloa
  522. fmuld $nlo,$na,$nloa
  523. fmuld $alo,$bb,$alob
  524. fmuld $nlo,$nb,$nlob
  525. fmuld $alo,$bc,$aloc
  526. ldx [%sp+$bias+$frame+0],%o0
  527. faddd $aloa,$nloa,$nloa
  528. fmuld $nlo,$nc,$nloc
  529. ldx [%sp+$bias+$frame+8],%o1
  530. fmuld $alo,$bd,$alod
  531. ldx [%sp+$bias+$frame+16],%o2
  532. faddd $alob,$nlob,$nlob
  533. fmuld $nlo,$nd,$nlod
  534. ldx [%sp+$bias+$frame+24],%o3
  535. fmuld $ahi,$ba,$ahia
  536. srlx %o0,16,%o7
  537. faddd $aloc,$nloc,$nloc
  538. fmuld $nhi,$na,$nhia
  539. add %o7,%o1,%o1
  540. fmuld $ahi,$bb,$ahib
  541. srlx %o1,16,%o7
  542. faddd $alod,$nlod,$nlod
  543. fmuld $nhi,$nb,$nhib
  544. add %o7,%o2,%o2
  545. fmuld $ahi,$bc,$ahic
  546. srlx %o2,16,%o7
  547. faddd $ahia,$nhia,$nhia
  548. fmuld $nhi,$nc,$nhic
  549. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  550. ! why?
  551. and %o0,$mask,%o0
  552. fmuld $ahi,$bd,$ahid
  553. and %o1,$mask,%o1
  554. and %o2,$mask,%o2
  555. faddd $ahib,$nhib,$nhib
  556. fmuld $nhi,$nd,$nhid
  557. sllx %o1,16,%o1
  558. faddd $dota,$nloa,$nloa
  559. sllx %o2,32,%o2
  560. faddd $dotb,$nlob,$nlob
  561. sllx %o3,48,%o7
  562. or %o1,%o0,%o0
  563. faddd $ahic,$nhic,$dota ! $nhic
  564. or %o2,%o0,%o0
  565. faddd $ahid,$nhid,$dotb ! $nhid
  566. or %o7,%o0,%o0 ! 64-bit result
  567. ldx [$tp],%o7
  568. faddd $nloc,$nhia,$nloc
  569. addcc %o7,%o0,%o0
  570. ! end-of-why?
  571. faddd $nlod,$nhib,$nlod
  572. srlx %o3,16,%g1 ! 34-bit carry
  573. fdtox $nloa,$nloa
  574. bcs,a %xcc,.+8
  575. add %g1,1,%g1
  576. fdtox $nlob,$nlob
  577. fdtox $nloc,$nloc
  578. fdtox $nlod,$nlod
  579. std $nloa,[%sp+$bias+$frame+0]
  580. std $nlob,[%sp+$bias+$frame+8]
  581. addcc $j,8,$j
  582. std $nloc,[%sp+$bias+$frame+16]
  583. bz,pn %icc,.Linnerskip
  584. std $nlod,[%sp+$bias+$frame+24]
  585. ba .Linner
  586. nop
  587. .align 32
  588. .Linner:
  589. ldd [$ap_l+$j],$alo ! load a[j] in double format
  590. ldd [$ap_h+$j],$ahi
  591. ldd [$np_l+$j],$nlo ! load n[j] in double format
  592. ldd [$np_h+$j],$nhi
  593. fmuld $alo,$ba,$aloa
  594. fmuld $nlo,$na,$nloa
  595. fmuld $alo,$bb,$alob
  596. fmuld $nlo,$nb,$nlob
  597. fmuld $alo,$bc,$aloc
  598. ldx [%sp+$bias+$frame+0],%o0
  599. faddd $aloa,$nloa,$nloa
  600. fmuld $nlo,$nc,$nloc
  601. ldx [%sp+$bias+$frame+8],%o1
  602. fmuld $alo,$bd,$alod
  603. ldx [%sp+$bias+$frame+16],%o2
  604. faddd $alob,$nlob,$nlob
  605. fmuld $nlo,$nd,$nlod
  606. ldx [%sp+$bias+$frame+24],%o3
  607. fmuld $ahi,$ba,$ahia
  608. srlx %o0,16,%o7
  609. faddd $aloc,$nloc,$nloc
  610. fmuld $nhi,$na,$nhia
  611. add %o7,%o1,%o1
  612. fmuld $ahi,$bb,$ahib
  613. srlx %o1,16,%o7
  614. faddd $alod,$nlod,$nlod
  615. fmuld $nhi,$nb,$nhib
  616. add %o7,%o2,%o2
  617. fmuld $ahi,$bc,$ahic
  618. srlx %o2,16,%o7
  619. faddd $ahia,$nhia,$nhia
  620. fmuld $nhi,$nc,$nhic
  621. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  622. and %o0,$mask,%o0
  623. fmuld $ahi,$bd,$ahid
  624. and %o1,$mask,%o1
  625. and %o2,$mask,%o2
  626. faddd $ahib,$nhib,$nhib
  627. fmuld $nhi,$nd,$nhid
  628. sllx %o1,16,%o1
  629. faddd $dota,$nloa,$nloa
  630. sllx %o2,32,%o2
  631. faddd $dotb,$nlob,$nlob
  632. sllx %o3,48,%o7
  633. or %o1,%o0,%o0
  634. faddd $ahic,$nhic,$dota ! $nhic
  635. or %o2,%o0,%o0
  636. faddd $ahid,$nhid,$dotb ! $nhid
  637. or %o7,%o0,%o0 ! 64-bit result
  638. faddd $nloc,$nhia,$nloc
  639. addcc %g1,%o0,%o0
  640. ldx [$tp+8],%o7 ! tp[j]
  641. faddd $nlod,$nhib,$nlod
  642. srlx %o3,16,%g1 ! 34-bit carry
  643. fdtox $nloa,$nloa
  644. bcs,a %xcc,.+8
  645. add %g1,1,%g1
  646. fdtox $nlob,$nlob
  647. addcc %o7,%o0,%o0
  648. fdtox $nloc,$nloc
  649. bcs,a %xcc,.+8
  650. add %g1,1,%g1
  651. stx %o0,[$tp] ! tp[j-1]
  652. fdtox $nlod,$nlod
  653. std $nloa,[%sp+$bias+$frame+0]
  654. std $nlob,[%sp+$bias+$frame+8]
  655. std $nloc,[%sp+$bias+$frame+16]
  656. addcc $j,8,$j
  657. std $nlod,[%sp+$bias+$frame+24]
  658. bnz,pt %icc,.Linner
  659. add $tp,8,$tp
  660. .Linnerskip:
  661. fdtox $dota,$dota
  662. fdtox $dotb,$dotb
  663. ldx [%sp+$bias+$frame+0],%o0
  664. ldx [%sp+$bias+$frame+8],%o1
  665. ldx [%sp+$bias+$frame+16],%o2
  666. ldx [%sp+$bias+$frame+24],%o3
  667. srlx %o0,16,%o7
  668. std $dota,[%sp+$bias+$frame+32]
  669. add %o7,%o1,%o1
  670. std $dotb,[%sp+$bias+$frame+40]
  671. srlx %o1,16,%o7
  672. add %o7,%o2,%o2
  673. srlx %o2,16,%o7
  674. add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
  675. and %o0,$mask,%o0
  676. and %o1,$mask,%o1
  677. and %o2,$mask,%o2
  678. sllx %o1,16,%o1
  679. sllx %o2,32,%o2
  680. sllx %o3,48,%o7
  681. or %o1,%o0,%o0
  682. or %o2,%o0,%o0
  683. ldx [%sp+$bias+$frame+32],%o4
  684. or %o7,%o0,%o0 ! 64-bit result
  685. ldx [%sp+$bias+$frame+40],%o5
  686. addcc %g1,%o0,%o0
  687. ldx [$tp+8],%o7 ! tp[j]
  688. srlx %o3,16,%g1 ! 34-bit carry
  689. bcs,a %xcc,.+8
  690. add %g1,1,%g1
  691. addcc %o7,%o0,%o0
  692. bcs,a %xcc,.+8
  693. add %g1,1,%g1
  694. stx %o0,[$tp] ! tp[j-1]
  695. add $tp,8,$tp
  696. srlx %o4,16,%o7
  697. add %o7,%o5,%o5
  698. and %o4,$mask,%o4
  699. sllx %o5,16,%o7
  700. or %o7,%o4,%o4
  701. addcc %g1,%o4,%o4
  702. srlx %o5,48,%g1
  703. bcs,a %xcc,.+8
  704. add %g1,1,%g1
  705. addcc $carry,%o4,%o4
  706. stx %o4,[$tp] ! tp[num-1]
  707. mov %g1,$carry
  708. bcs,a %xcc,.+8
  709. add $carry,1,$carry
  710. addcc $i,8,$i
  711. bnz %icc,.Louter
  712. nop
  713. add $tp,8,$tp ! adjust tp to point at the end
  714. orn %g0,%g0,%g4
  715. sub %g0,$num,%o7 ! n=-num
  716. ba .Lsub
  717. subcc %g0,%g0,%g0 ! clear %icc.c
  718. .align 32
  719. .Lsub:
  720. ldx [$tp+%o7],%o0
  721. add $np,%o7,%g1
  722. ld [%g1+0],%o2
  723. ld [%g1+4],%o3
  724. srlx %o0,32,%o1
  725. subccc %o0,%o2,%o2
  726. add $rp,%o7,%g1
  727. subccc %o1,%o3,%o3
  728. st %o2,[%g1+0]
  729. add %o7,8,%o7
  730. brnz,pt %o7,.Lsub
  731. st %o3,[%g1+4]
  732. subc $carry,0,%g4
  733. sub %g0,$num,%o7 ! n=-num
  734. ba .Lcopy
  735. nop
  736. .align 32
  737. .Lcopy:
  738. ldx [$tp+%o7],%o0
  739. add $rp,%o7,%g1
  740. ld [%g1+0],%o2
  741. ld [%g1+4],%o3
  742. stx %g0,[$tp+%o7]
  743. and %o0,%g4,%o0
  744. srlx %o0,32,%o1
  745. andn %o2,%g4,%o2
  746. andn %o3,%g4,%o3
  747. or %o2,%o0,%o0
  748. or %o3,%o1,%o1
  749. st %o0,[%g1+0]
  750. add %o7,8,%o7
  751. brnz,pt %o7,.Lcopy
  752. st %o1,[%g1+4]
  753. sub %g0,$num,%o7 ! n=-num
  754. .Lzap:
  755. stx %g0,[$ap_l+%o7]
  756. stx %g0,[$ap_h+%o7]
  757. stx %g0,[$np_l+%o7]
  758. stx %g0,[$np_h+%o7]
  759. add %o7,8,%o7
  760. brnz,pt %o7,.Lzap
  761. nop
  762. ldx [%sp+$bias+$frame+48],%o7
  763. wr %g0,%o7,%asi ! restore %asi
  764. mov 1,%i0
  765. .Lret:
  766. ret
  767. restore
  768. .type $fname,#function
  769. .size $fname,(.-$fname)
  770. .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
  771. .align 32
  772. ___
  773. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  774. # Below substitution makes it possible to compile without demanding
  775. # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
  776. # dare to do this, because VIS capability is detected at run-time now
  777. # and this routine is not called on CPU not capable to execute it. Do
  778. # note that fzeros is not the only VIS dependency! Another dependency
  779. # is implicit and is just _a_ numerical value loaded to %asi register,
  780. # which assembler can't recognize as VIS specific...
  781. $code =~ s/fzeros\s+%f([0-9]+)/
  782. sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
  783. /gem;
  784. print $code;
  785. # flush
  786. close STDOUT or die "error closing STDOUT: $!";