sparcv9-mont.pl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620
  1. #! /usr/bin/env perl
  2. # Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # December 2005
  15. #
  16. # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
  17. # for undertaking this effort are multiple. First of all, UltraSPARC is not
  18. # the whole SPARCv9 universe and other VIS-free implementations deserve
  19. # optimized code as much. Secondly, newly introduced UltraSPARC T1,
  20. # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
  21. # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
  22. # several integrated RSA/DSA accelerator circuits accessible through
  23. # kernel driver [only(*)], but having decent user-land software
  24. # implementation is important too. Finally, there was the desire to
  25. # experiment with a dedicated squaring procedure. Yes, this module
  26. # implements one, because it was easiest to draft it in SPARCv9
  27. # instructions...
  28. # (*) Engine accessing the driver in question is on my TODO list.
  29. # For reference, accelerator is estimated to give 6 to 10 times
  30. # improvement on single-threaded RSA sign. It should be noted
  31. # that 6-10x improvement coefficient does not actually mean
  32. # something extraordinary in terms of absolute [single-threaded]
  33. # performance, as SPARCv9 instruction set is by all means least
  34. # suitable for high performance crypto among other 64 bit
  35. # platforms. 6-10x factor simply places T1 in same performance
  36. # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
  37. # appear impressive at all, but it's the sign operation which is
  38. # far more critical/interesting.
  39. # You might notice that inner loops are modulo-scheduled:-) This has
  40. # essentially negligible impact on UltraSPARC performance, it's
  41. # Fujitsu SPARC64 V users who should notice and hopefully appreciate
  42. # the advantage... Currently this module surpasses sparcv9a-mont.pl
  43. # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
  44. # module still has hidden potential [see TODO list there], which is
  45. # estimated to be larger than 20%...
  # The last command-line argument names the file that receives the generated
  # assembly.  STDOUT is redirected to it so the final print lands there.
  # The open is checked: without the check a failed open leaves STDOUT closed
  # and the generated code is silently discarded.  The three-argument form
  # keeps characters in the file name from being read as part of the open
  # mode.  When no argument is given STDOUT is left as it is.
  46. $output = pop;
  47. if (defined $output) { open STDOUT,">",$output or die "can't open $output: $!"; }
  48. # Register map.  The first six are the arguments of int bn_mul_mont(
  49. $rp    = '%i0';           #   BN_ULONG *rp,        result vector
  50. $ap    = '%i1';           #   const BN_ULONG *ap,
  51. $bp    = '%i2';           #   const BN_ULONG *bp,
  52. $np    = '%i3';           #   const BN_ULONG *np,  modulus
  53. $n0    = '%i4';           #   const BN_ULONG *n0,  replaced by the loaded word in .Lenter
  54. $num   = '%i5';           #   int num);            scaled to a byte count in .Lenter
  55. $frame = 'STACK_FRAME';   # symbolic names, emitted as-is into the assembly
  56. $bias  = 'STACK_BIAS';
  57. $car0  = '%o0';           # running carry of the ap[] product chain
  58. $car1  = '%o1';           # running carry of the np[] product chain (stored to tp)
  59. $car2  = '%o2';           # top-most carry, 1 bit
  60. $acc0  = '%o3';           # pending ap[] product
  61. $acc1  = '%o4';           # pending np[] product
  62. $mask  = '%g1';           # 0xffffffff; 32 bits, what a waste...
  63. $tmp0  = '%g4';           # scratch
  64. $tmp1  = '%g5';           # scratch
  65. $i     = '%l0';           # outer index, in bytes
  66. $j     = '%l1';           # inner index, in bytes
  67. $mul0  = '%l2';           # current multiplier word
  68. $mul1  = '%l3';           # per-row reduction multiplier (low word times n0)
  69. $tp    = '%l4';           # cursor into the scratch vector on the stack
  70. $apj   = '%l5';           # prefetched ap[j]
  71. $npj   = '%l6';           # prefetched np[j]
  72. $tpj   = '%l7';           # prefetched tp[j]
  73. $fname = 'bn_mul_mont_int';   # global symbol of the generated function
  # ----------------------------------------------------------------------
  # Assembly template, part 1: function entry, the general (ap != bp)
  # Montgomery multiplication, and the final reduction shared with the
  # squaring path.  This is an interpolating heredoc: the Perl variables
  # assigned above are replaced by register names when the string is built,
  # and the text is printed at the end of the script.  The output is meant
  # for the C preprocessor as well: it #includes sparc_arch.h and uses the
  # bare symbols STACK_FRAME, STACK_BIAS and SIZE_T_CC, none of which are
  # defined in this script (presumably they come from that header).
  #
  # Everything works on 32-bit words: limbs are fetched with ld, products
  # are formed with 64-bit mulx, the low word is isolated with mask
  # (0xffffffff) and the carry is extracted with srlx ...,32.  Note that
  # the instruction after each branch sits in its delay slot.
  #
  # By label:
  #   bn_mul_mont_int
  #              If num < 4 the routine returns 0 right away (retl / clr %o0).
  #              Otherwise it continues at .Lenter; the sethi in the delay
  #              slot starts building mask.
  #   .Lenter    Opens a register window, scales num to a byte count,
  #              replaces the n0 pointer by the word it points to, and
  #              reserves the scratch vector tp on the stack: %sp (plus
  #              bias) is lowered by num bytes and rounded down to a
  #              multiple of 1024.  ap[0], ap[1], np[0], np[1] and bp[0] are
  #              preloaded.  If ap == bp control goes to .Lbn_sqr_mont (in
  #              the second heredoc) with j = 12 set in the delay slot.
  #   .L1st      First row, multiplier bp[0]:
  #                  m  = (low32(ap[0]*bp[0]) * n0) & mask
  #                  tp = (ap*bp[0] + np*m) >> 32
  #              The loop is software pipelined; instructions tagged
  #              !prologue! / !epilogue! fill and drain it.  The carry out
  #              of the top word is left in car2.
  #   .Louter    For each further word bp[i] (i advances by 4 bytes until
  #   .Linner    it reaches num): m is recomputed from
  #              low32(tp[0] + ap[0]*bp[i]), then ap*bp[i] + np*m is added
  #              to tp while tp is shifted down by one word; car2 is added
  #              into the top word and refreshed.
  #   .Ltail     Final reduction, also entered from the squaring code.
  #   .Lsub        rp[k] = tp[k] - np[k] with borrow, k running from -num
  #                up to 0 (np, rp and tp point just past their last word).
  #   .Lcopy       The borrow is combined with car2 (subccc car2,0,car2).
  #                Per word, rp keeps the difference or, when carry is set,
  #                receives tp[k] instead; the choice is made with movcs
  #                rather than a branch.  Each tp word is overwritten with
  #                zero as it is read.  The function then returns 1.
  # ----------------------------------------------------------------------
  74. $code=<<___;
  75. #include "sparc_arch.h"
  76. .section ".text",#alloc,#execinstr
  77. .global $fname
  78. .align 32
  79. $fname:
  80. cmp %o5,4 ! 128 bits minimum
  81. bge,pt %icc,.Lenter
  82. sethi %hi(0xffffffff),$mask
  83. retl
  84. clr %o0
  85. .align 32
  86. .Lenter:
  87. save %sp,-$frame,%sp
  88. sll $num,2,$num ! num*=4
  89. or $mask,%lo(0xffffffff),$mask
  90. ld [$n0],$n0
  91. cmp $ap,$bp
  92. and $num,$mask,$num
  93. ld [$bp],$mul0 ! bp[0]
  94. nop
  95. add %sp,$bias,%o7 ! real top of stack
  96. ld [$ap],$car0 ! ap[0] ! redundant in squaring context
  97. sub %o7,$num,%o7
  98. ld [$ap+4],$apj ! ap[1]
  99. and %o7,-1024,%o7
  100. ld [$np],$car1 ! np[0]
  101. sub %o7,$bias,%sp ! alloca
  102. ld [$np+4],$npj ! np[1]
  103. be,pt SIZE_T_CC,.Lbn_sqr_mont
  104. mov 12,$j
  105. mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
  106. mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
  107. and $car0,$mask,$acc0
  108. add %sp,$bias+$frame,$tp
  109. ld [$ap+8],$apj !prologue!
  110. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  111. and $mul1,$mask,$mul1
  112. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  113. mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
  114. srlx $car0,32,$car0
  115. add $acc0,$car1,$car1
  116. ld [$np+8],$npj !prologue!
  117. srlx $car1,32,$car1
  118. mov $tmp0,$acc0 !prologue!
  119. .L1st:
  120. mulx $apj,$mul0,$tmp0
  121. mulx $npj,$mul1,$tmp1
  122. add $acc0,$car0,$car0
  123. ld [$ap+$j],$apj ! ap[j]
  124. and $car0,$mask,$acc0
  125. add $acc1,$car1,$car1
  126. ld [$np+$j],$npj ! np[j]
  127. srlx $car0,32,$car0
  128. add $acc0,$car1,$car1
  129. add $j,4,$j ! j++
  130. mov $tmp0,$acc0
  131. st $car1,[$tp]
  132. cmp $j,$num
  133. mov $tmp1,$acc1
  134. srlx $car1,32,$car1
  135. bl %icc,.L1st
  136. add $tp,4,$tp ! tp++
  137. !.L1st
  138. mulx $apj,$mul0,$tmp0 !epilogue!
  139. mulx $npj,$mul1,$tmp1
  140. add $acc0,$car0,$car0
  141. and $car0,$mask,$acc0
  142. add $acc1,$car1,$car1
  143. srlx $car0,32,$car0
  144. add $acc0,$car1,$car1
  145. st $car1,[$tp]
  146. srlx $car1,32,$car1
  147. add $tmp0,$car0,$car0
  148. and $car0,$mask,$acc0
  149. add $tmp1,$car1,$car1
  150. srlx $car0,32,$car0
  151. add $acc0,$car1,$car1
  152. st $car1,[$tp+4]
  153. srlx $car1,32,$car1
  154. add $car0,$car1,$car1
  155. st $car1,[$tp+8]
  156. srlx $car1,32,$car2
  157. mov 4,$i ! i++
  158. ld [$bp+4],$mul0 ! bp[1]
  159. .Louter:
  160. add %sp,$bias+$frame,$tp
  161. ld [$ap],$car0 ! ap[0]
  162. ld [$ap+4],$apj ! ap[1]
  163. ld [$np],$car1 ! np[0]
  164. ld [$np+4],$npj ! np[1]
  165. ld [$tp],$tmp1 ! tp[0]
  166. ld [$tp+4],$tpj ! tp[1]
  167. mov 12,$j
  168. mulx $car0,$mul0,$car0
  169. mulx $apj,$mul0,$tmp0 !prologue!
  170. add $tmp1,$car0,$car0
  171. ld [$ap+8],$apj !prologue!
  172. and $car0,$mask,$acc0
  173. mulx $n0,$acc0,$mul1
  174. and $mul1,$mask,$mul1
  175. mulx $car1,$mul1,$car1
  176. mulx $npj,$mul1,$acc1 !prologue!
  177. srlx $car0,32,$car0
  178. add $acc0,$car1,$car1
  179. ld [$np+8],$npj !prologue!
  180. srlx $car1,32,$car1
  181. mov $tmp0,$acc0 !prologue!
  182. .Linner:
  183. mulx $apj,$mul0,$tmp0
  184. mulx $npj,$mul1,$tmp1
  185. add $tpj,$car0,$car0
  186. ld [$ap+$j],$apj ! ap[j]
  187. add $acc0,$car0,$car0
  188. add $acc1,$car1,$car1
  189. ld [$np+$j],$npj ! np[j]
  190. and $car0,$mask,$acc0
  191. ld [$tp+8],$tpj ! tp[j]
  192. srlx $car0,32,$car0
  193. add $acc0,$car1,$car1
  194. add $j,4,$j ! j++
  195. mov $tmp0,$acc0
  196. st $car1,[$tp] ! tp[j-1]
  197. srlx $car1,32,$car1
  198. mov $tmp1,$acc1
  199. cmp $j,$num
  200. bl %icc,.Linner
  201. add $tp,4,$tp ! tp++
  202. !.Linner
  203. mulx $apj,$mul0,$tmp0 !epilogue!
  204. mulx $npj,$mul1,$tmp1
  205. add $tpj,$car0,$car0
  206. add $acc0,$car0,$car0
  207. ld [$tp+8],$tpj ! tp[j]
  208. and $car0,$mask,$acc0
  209. add $acc1,$car1,$car1
  210. srlx $car0,32,$car0
  211. add $acc0,$car1,$car1
  212. st $car1,[$tp] ! tp[j-1]
  213. srlx $car1,32,$car1
  214. add $tpj,$car0,$car0
  215. add $tmp0,$car0,$car0
  216. and $car0,$mask,$acc0
  217. add $tmp1,$car1,$car1
  218. add $acc0,$car1,$car1
  219. st $car1,[$tp+4] ! tp[j-1]
  220. srlx $car0,32,$car0
  221. add $i,4,$i ! i++
  222. srlx $car1,32,$car1
  223. add $car0,$car1,$car1
  224. cmp $i,$num
  225. add $car2,$car1,$car1
  226. st $car1,[$tp+8]
  227. srlx $car1,32,$car2
  228. bl,a %icc,.Louter
  229. ld [$bp+$i],$mul0 ! bp[i]
  230. !.Louter
  231. add $tp,12,$tp
  232. .Ltail:
  233. add $np,$num,$np
  234. add $rp,$num,$rp
  235. sub %g0,$num,%o7 ! k=-num
  236. ba .Lsub
  237. subcc %g0,%g0,%g0 ! clear %icc.c
  238. .align 16
  239. .Lsub:
  240. ld [$tp+%o7],%o0
  241. ld [$np+%o7],%o1
  242. subccc %o0,%o1,%o1 ! tp[j]-np[j]
  243. add $rp,%o7,$i
  244. add %o7,4,%o7
  245. brnz %o7,.Lsub
  246. st %o1,[$i]
  247. subccc $car2,0,$car2 ! handle upmost overflow bit
  248. sub %g0,$num,%o7
  249. .Lcopy:
  250. ld [$tp+%o7],%o1 ! conditional copy
  251. ld [$rp+%o7],%o0
  252. st %g0,[$tp+%o7] ! zap tp
  253. movcs %icc,%o1,%o0
  254. st %o0,[$rp+%o7]
  255. add %o7,4,%o7
  256. brnz %o7,.Lcopy
  257. nop
  258. mov 1,%i0
  259. ret
  260. restore
  261. ___
  262. ########
  263. ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
  264. ######## code without following dedicated squaring procedure.
  265. ########
  # ----------------------------------------------------------------------
  # Assembly template, part 2: the dedicated squaring path, appended to
  # the template built so far, followed by the .type/.size/.asciz trailer
  # that closes the whole function.  Like part 1 it is an interpolating
  # heredoc.
  #
  # sbit (%o5) holds the single bit that is shifted out when a 32-bit
  # partial result is doubled, so it can be fed into the next word.
  #
  # .Lbn_sqr_mont is reached from .Lenter when ap == bp.  At that point
  # mul0 = ap[0] (loaded through bp), apj = ap[1], car1 = np[0],
  # npj = np[1] and j = 12 are already set up; tp is initialised here.
  #
  # Scheme, as read from the code: a cross product ap[j]*ap[i] is
  # multiplied once and then doubled (add acc0,acc0,acc0 followed by
  # adding or or-ing sbit) rather than being multiplied twice.  The
  # diagonal term ap[i]*ap[i] must not be doubled, so the high word of the
  # square is halved up front (and ...,1 into sbit, srlx ...,1) and the
  # common doubling step restores it; its low word is added as is.
  #
  # By label:
  #   .Lsqr_1st        Row 0: ap[0]*ap[0], doubled ap[j]*ap[0], and np*m
  #                    with m = (low32(ap[0]*ap[0]) * n0) & mask; the sum,
  #                    shifted down by one word, is stored to tp and the
  #                    top carry goes to car2.
  #   .Lsqr_2nd        Row 1 (mul0 = ap[1]): m is recomputed from tp[0];
  #                    ap[1]*ap[1], doubled ap[j]*ap[1] for the remaining j,
  #                    and np*m are accumulated into tp.
  #   .Lsqr_outer      Rows from i = 8 bytes upward, repeated while
  #                    i+4 < num after the increment.  Each pass reloads
  #                    tp[0], tp[1], np[0], np[1] and ap[i] and recomputes
  #                    m and the split square of ap[i].
  #   .Lsqr_inner1     Columns with j < i: only np[j]*m is added to tp; this
  #                    loop contains no ap multiplication.
  #   (after inner1)   Diagonal column: adds the low word of ap[i]*ap[i].
  #   .Lsqr_inner2     Columns above the diagonal: doubled ap[j]*ap[i] plus
  #   .Lsqr_no_inner2  np[j]*m; .Lsqr_no_inner2 is the drain stage and the
  #                    direct target when no such column is left for the
  #                    loop.  car2 is added into the top word and refreshed.
  #   .Lsqr_last       Final row: the loop adds only np[j]*m, then the low
  #                    word of the last square and the restored car0 are
  #                    folded in.  Control then jumps to .Ltail (part 1)
  #                    with tp advanced by 8 in the delay slot.
  # ----------------------------------------------------------------------
  266. $sbit="%o5";
  267. $code.=<<___;
  268. .align 32
  269. .Lbn_sqr_mont:
  270. mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
  271. mulx $apj,$mul0,$tmp0 !prologue!
  272. and $car0,$mask,$acc0
  273. add %sp,$bias+$frame,$tp
  274. ld [$ap+8],$apj !prologue!
  275. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  276. srlx $car0,32,$car0
  277. and $mul1,$mask,$mul1
  278. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  279. mulx $npj,$mul1,$acc1 !prologue!
  280. and $car0,1,$sbit
  281. ld [$np+8],$npj !prologue!
  282. srlx $car0,1,$car0
  283. add $acc0,$car1,$car1
  284. srlx $car1,32,$car1
  285. mov $tmp0,$acc0 !prologue!
  286. .Lsqr_1st:
  287. mulx $apj,$mul0,$tmp0
  288. mulx $npj,$mul1,$tmp1
  289. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  290. add $acc1,$car1,$car1
  291. ld [$ap+$j],$apj ! ap[j]
  292. and $car0,$mask,$acc0
  293. ld [$np+$j],$npj ! np[j]
  294. srlx $car0,32,$car0
  295. add $acc0,$acc0,$acc0
  296. or $sbit,$acc0,$acc0
  297. mov $tmp1,$acc1
  298. srlx $acc0,32,$sbit
  299. add $j,4,$j ! j++
  300. and $acc0,$mask,$acc0
  301. cmp $j,$num
  302. add $acc0,$car1,$car1
  303. st $car1,[$tp]
  304. mov $tmp0,$acc0
  305. srlx $car1,32,$car1
  306. bl %icc,.Lsqr_1st
  307. add $tp,4,$tp ! tp++
  308. !.Lsqr_1st
  309. mulx $apj,$mul0,$tmp0 ! epilogue
  310. mulx $npj,$mul1,$tmp1
  311. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  312. add $acc1,$car1,$car1
  313. and $car0,$mask,$acc0
  314. srlx $car0,32,$car0
  315. add $acc0,$acc0,$acc0
  316. or $sbit,$acc0,$acc0
  317. srlx $acc0,32,$sbit
  318. and $acc0,$mask,$acc0
  319. add $acc0,$car1,$car1
  320. st $car1,[$tp]
  321. srlx $car1,32,$car1
  322. add $tmp0,$car0,$car0 ! ap[j]*a0+c0
  323. add $tmp1,$car1,$car1
  324. and $car0,$mask,$acc0
  325. srlx $car0,32,$car0
  326. add $acc0,$acc0,$acc0
  327. or $sbit,$acc0,$acc0
  328. srlx $acc0,32,$sbit
  329. and $acc0,$mask,$acc0
  330. add $acc0,$car1,$car1
  331. st $car1,[$tp+4]
  332. srlx $car1,32,$car1
  333. add $car0,$car0,$car0
  334. or $sbit,$car0,$car0
  335. add $car0,$car1,$car1
  336. st $car1,[$tp+8]
  337. srlx $car1,32,$car2
  338. ld [%sp+$bias+$frame],$tmp0 ! tp[0]
  339. ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
  340. ld [%sp+$bias+$frame+8],$tpj ! tp[2]
  341. ld [$ap+4],$mul0 ! ap[1]
  342. ld [$ap+8],$apj ! ap[2]
  343. ld [$np],$car1 ! np[0]
  344. ld [$np+4],$npj ! np[1]
  345. mulx $n0,$tmp0,$mul1
  346. mulx $mul0,$mul0,$car0
  347. and $mul1,$mask,$mul1
  348. mulx $car1,$mul1,$car1
  349. mulx $npj,$mul1,$acc1
  350. add $tmp0,$car1,$car1
  351. and $car0,$mask,$acc0
  352. ld [$np+8],$npj ! np[2]
  353. srlx $car1,32,$car1
  354. add $tmp1,$car1,$car1
  355. srlx $car0,32,$car0
  356. add $acc0,$car1,$car1
  357. and $car0,1,$sbit
  358. add $acc1,$car1,$car1
  359. srlx $car0,1,$car0
  360. mov 12,$j
  361. st $car1,[%sp+$bias+$frame] ! tp[0]=
  362. srlx $car1,32,$car1
  363. add %sp,$bias+$frame+4,$tp
  364. .Lsqr_2nd:
  365. mulx $apj,$mul0,$acc0
  366. mulx $npj,$mul1,$acc1
  367. add $acc0,$car0,$car0
  368. add $tpj,$sbit,$sbit
  369. ld [$ap+$j],$apj ! ap[j]
  370. and $car0,$mask,$acc0
  371. ld [$np+$j],$npj ! np[j]
  372. srlx $car0,32,$car0
  373. add $acc1,$car1,$car1
  374. ld [$tp+8],$tpj ! tp[j]
  375. add $acc0,$acc0,$acc0
  376. add $j,4,$j ! j++
  377. add $sbit,$acc0,$acc0
  378. srlx $acc0,32,$sbit
  379. and $acc0,$mask,$acc0
  380. cmp $j,$num
  381. add $acc0,$car1,$car1
  382. st $car1,[$tp] ! tp[j-1]
  383. srlx $car1,32,$car1
  384. bl %icc,.Lsqr_2nd
  385. add $tp,4,$tp ! tp++
  386. !.Lsqr_2nd
  387. mulx $apj,$mul0,$acc0
  388. mulx $npj,$mul1,$acc1
  389. add $acc0,$car0,$car0
  390. add $tpj,$sbit,$sbit
  391. and $car0,$mask,$acc0
  392. srlx $car0,32,$car0
  393. add $acc1,$car1,$car1
  394. add $acc0,$acc0,$acc0
  395. add $sbit,$acc0,$acc0
  396. srlx $acc0,32,$sbit
  397. and $acc0,$mask,$acc0
  398. add $acc0,$car1,$car1
  399. st $car1,[$tp] ! tp[j-1]
  400. srlx $car1,32,$car1
  401. add $car0,$car0,$car0
  402. add $sbit,$car0,$car0
  403. add $car0,$car1,$car1
  404. add $car2,$car1,$car1
  405. st $car1,[$tp+4]
  406. srlx $car1,32,$car2
  407. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  408. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  409. ld [$ap+8],$mul0 ! ap[2]
  410. ld [$np],$car1 ! np[0]
  411. ld [$np+4],$npj ! np[1]
  412. mulx $n0,$tmp1,$mul1
  413. and $mul1,$mask,$mul1
  414. mov 8,$i
  415. mulx $mul0,$mul0,$car0
  416. mulx $car1,$mul1,$car1
  417. and $car0,$mask,$acc0
  418. add $tmp1,$car1,$car1
  419. srlx $car0,32,$car0
  420. add %sp,$bias+$frame,$tp
  421. srlx $car1,32,$car1
  422. and $car0,1,$sbit
  423. srlx $car0,1,$car0
  424. mov 4,$j
  425. .Lsqr_outer:
  426. .Lsqr_inner1:
  427. mulx $npj,$mul1,$acc1
  428. add $tpj,$car1,$car1
  429. add $j,4,$j
  430. ld [$tp+8],$tpj
  431. cmp $j,$i
  432. add $acc1,$car1,$car1
  433. ld [$np+$j],$npj
  434. st $car1,[$tp]
  435. srlx $car1,32,$car1
  436. bl %icc,.Lsqr_inner1
  437. add $tp,4,$tp
  438. !.Lsqr_inner1
  439. add $j,4,$j
  440. ld [$ap+$j],$apj ! ap[j]
  441. mulx $npj,$mul1,$acc1
  442. add $tpj,$car1,$car1
  443. ld [$np+$j],$npj ! np[j]
  444. srlx $car1,32,$tmp0
  445. and $car1,$mask,$car1
  446. add $tmp0,$sbit,$sbit
  447. add $acc0,$car1,$car1
  448. ld [$tp+8],$tpj ! tp[j]
  449. add $acc1,$car1,$car1
  450. st $car1,[$tp]
  451. srlx $car1,32,$car1
  452. add $j,4,$j
  453. cmp $j,$num
  454. be,pn %icc,.Lsqr_no_inner2
  455. add $tp,4,$tp
  456. .Lsqr_inner2:
  457. mulx $apj,$mul0,$acc0
  458. mulx $npj,$mul1,$acc1
  459. add $tpj,$sbit,$sbit
  460. add $acc0,$car0,$car0
  461. ld [$ap+$j],$apj ! ap[j]
  462. and $car0,$mask,$acc0
  463. ld [$np+$j],$npj ! np[j]
  464. srlx $car0,32,$car0
  465. add $acc0,$acc0,$acc0
  466. ld [$tp+8],$tpj ! tp[j]
  467. add $sbit,$acc0,$acc0
  468. add $j,4,$j ! j++
  469. srlx $acc0,32,$sbit
  470. and $acc0,$mask,$acc0
  471. cmp $j,$num
  472. add $acc0,$car1,$car1
  473. add $acc1,$car1,$car1
  474. st $car1,[$tp] ! tp[j-1]
  475. srlx $car1,32,$car1
  476. bl %icc,.Lsqr_inner2
  477. add $tp,4,$tp ! tp++
  478. .Lsqr_no_inner2:
  479. mulx $apj,$mul0,$acc0
  480. mulx $npj,$mul1,$acc1
  481. add $tpj,$sbit,$sbit
  482. add $acc0,$car0,$car0
  483. and $car0,$mask,$acc0
  484. srlx $car0,32,$car0
  485. add $acc0,$acc0,$acc0
  486. add $sbit,$acc0,$acc0
  487. srlx $acc0,32,$sbit
  488. and $acc0,$mask,$acc0
  489. add $acc0,$car1,$car1
  490. add $acc1,$car1,$car1
  491. st $car1,[$tp] ! tp[j-1]
  492. srlx $car1,32,$car1
  493. add $car0,$car0,$car0
  494. add $sbit,$car0,$car0
  495. add $car0,$car1,$car1
  496. add $car2,$car1,$car1
  497. st $car1,[$tp+4]
  498. srlx $car1,32,$car2
  499. add $i,4,$i ! i++
  500. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  501. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  502. ld [$ap+$i],$mul0 ! ap[j]
  503. ld [$np],$car1 ! np[0]
  504. ld [$np+4],$npj ! np[1]
  505. mulx $n0,$tmp1,$mul1
  506. and $mul1,$mask,$mul1
  507. add $i,4,$tmp0
  508. mulx $mul0,$mul0,$car0
  509. mulx $car1,$mul1,$car1
  510. and $car0,$mask,$acc0
  511. add $tmp1,$car1,$car1
  512. srlx $car0,32,$car0
  513. add %sp,$bias+$frame,$tp
  514. srlx $car1,32,$car1
  515. and $car0,1,$sbit
  516. srlx $car0,1,$car0
  517. cmp $tmp0,$num ! i<num-1
  518. bl %icc,.Lsqr_outer
  519. mov 4,$j
  520. .Lsqr_last:
  521. mulx $npj,$mul1,$acc1
  522. add $tpj,$car1,$car1
  523. add $j,4,$j
  524. ld [$tp+8],$tpj
  525. cmp $j,$i
  526. add $acc1,$car1,$car1
  527. ld [$np+$j],$npj
  528. st $car1,[$tp]
  529. srlx $car1,32,$car1
  530. bl %icc,.Lsqr_last
  531. add $tp,4,$tp
  532. !.Lsqr_last
  533. mulx $npj,$mul1,$acc1
  534. add $tpj,$acc0,$acc0
  535. srlx $acc0,32,$tmp0
  536. and $acc0,$mask,$acc0
  537. add $tmp0,$sbit,$sbit
  538. add $acc0,$car1,$car1
  539. add $acc1,$car1,$car1
  540. st $car1,[$tp]
  541. srlx $car1,32,$car1
  542. add $car0,$car0,$car0 ! recover $car0
  543. add $sbit,$car0,$car0
  544. add $car0,$car1,$car1
  545. add $car2,$car1,$car1
  546. st $car1,[$tp+4]
  547. srlx $car1,32,$car2
  548. ba .Ltail
  549. add $tp,8,$tp
  550. .type $fname,#function
  551. .size $fname,(.-$fname)
  552. .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  553. .align 32
  554. ___
  # Post-process and emit the accumulated template.
  # Any text between a pair of backticks is replaced by the result of
  # evaluating it as a Perl expression.
  555. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  # Output is buffered, so a write failure (for example a full disk) may only
  # surface when the handle is closed.  Both print and close are checked so
  # that a truncated assembly file makes the script fail instead of exiting
  # successfully.
  556. print $code or die "error writing output: $!";
  557. close STDOUT or die "error closing STDOUT: $!";