sparcv9-mont.pl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. #! /usr/bin/env perl
  2. # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # December 2005
  15. #
  16. # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
  17. # for undertaking this effort are multiple. First of all, UltraSPARC is not
  18. # the whole SPARCv9 universe and other VIS-free implementations deserve
  19. # optimized code as much. Secondly, newly introduced UltraSPARC T1,
  20. # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
  21. # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
  22. # several integrated RSA/DSA accelerator circuits accessible through
  23. # kernel driver [only(*)], but having decent user-land software
  24. # implementation is important too. Finally, there was the desire to
  25. # experiment with a dedicated squaring procedure. Yes, this module
  26. # implements one, because it was easiest to draft it in SPARCv9
  27. # instructions...
  28. # (*) Engine accessing the driver in question is on my TODO list.
  29. # For reference, accelerator is estimated to give 6 to 10 times
  30. # improvement on single-threaded RSA sign. It should be noted
  31. # that 6-10x improvement coefficient does not actually mean
  32. # something extraordinary in terms of absolute [single-threaded]
  33. # performance, as SPARCv9 instruction set is by all means least
  34. # suitable for high performance crypto among other 64 bit
  35. # platforms. 6-10x factor simply places T1 in same performance
  36. # domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
  37. # appear impressive at all, but it's the sign operation which is
  38. # far more critical/interesting.
  39. # You might notice that inner loops are modulo-scheduled:-) This has
  40. # essentially negligible impact on UltraSPARC performance, it's
  41. # Fujitsu SPARC64 V users who should notice and hopefully appreciate
  42. # the advantage... Currently this module surpasses sparcv9a-mont.pl
  43. # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
  44. # module still has hidden potential [see TODO list there], which is
  45. # estimated to be larger than 20%...
  # Generator preamble: pick the output file and give symbolic names to the
  # SPARC registers used by the assembly templates further down. The names
  # are interpolated into the heredocs, so renaming one here renames it in
  # every generated instruction.
  #
  # Output selection: the last command-line argument, if present, is taken
  # as the output path and STDOUT is reopened onto it; with no argument the
  # generated text goes to the existing STDOUT.
  46. $output = pop and open STDOUT,">$output";
  # Incoming arguments, as seen in the %i registers once the routine has
  # executed its `save` (the entry check before `save` reads num as %o5).
  47. # int bn_mul_mont(
  48. $rp="%i0"; # BN_ULONG *rp,
  49. $ap="%i1"; # const BN_ULONG *ap,
  50. $bp="%i2"; # const BN_ULONG *bp,
  51. $np="%i3"; # const BN_ULONG *np,
  52. $n0="%i4"; # const BN_ULONG *n0,
  53. $num="%i5"; # int num);
  # Frame size and stack bias are emitted as the bare symbols STACK_FRAME and
  # STACK_BIAS; presumably these are macros from the sparc_arch.h header that
  # the generated code #includes -- confirm against that header.
  54. $frame="STACK_FRAME";
  55. $bias="STACK_BIAS";
  # Working registers in the output window: two running carries, one
  # top-level overflow bit, and two product accumulators.
  56. $car0="%o0";
  57. $car1="%o1";
  58. $car2="%o2"; # 1 bit
  59. $acc0="%o3";
  60. $acc1="%o4";
  # Global registers: the 0xffffffff limb mask and two scratch values.
  61. $mask="%g1"; # 32 bits, what a waste...
  62. $tmp0="%g4";
  63. $tmp1="%g5";
  # Local registers: byte-offset loop counters ($i outer, $j inner), the two
  # per-row multipliers, the cursor into the temporary vector, and the
  # look-ahead words loaded from ap[], np[] and tp[].
  64. $i="%l0";
  65. $j="%l1";
  66. $mul0="%l2";
  67. $mul1="%l3";
  68. $tp="%l4";
  69. $apj="%l5";
  70. $npj="%l6";
  71. $tpj="%l7";
  # Name of the emitted global symbol.
  72. $fname="bn_mul_mont_int";
  # First assembly template: function entry, the general (ap != bp)
  # multiplication path, and the final-subtraction tail shared with the
  # squaring path. The template text is emitted verbatim, so it is
  # documented here rather than inside the heredoc.
  #
  # Data layout: every operand is read with `ld`/written with `st` and
  # carries are extracted with `srlx ...,32`, i.e. the vectors are processed
  # as 32-bit words while products and sums live in 64-bit registers.
  #
  # Entry ($fname): if num (still in %o5, no window yet) is below 4 the
  #   routine returns 0 straight away via retl. The `sethi` in the branch
  #   delay slot starts building the 0xffffffff mask.
  # .Lenter: opens a register window, converts num to a byte count (num*4,
  #   masked to 32 bits), replaces the n0 pointer with the word it points at,
  #   and carves a temporary vector out of the stack: %sp is lowered by num
  #   bytes and rounded down to a 1024-byte boundary. tp then starts at
  #   %sp+bias+frame. `cmp ap,bp` is issued early and consumed by the
  #   `be ... .Lbn_sqr_mont` further down (nothing in between touches the
  #   condition codes), so equal input pointers divert to the squaring code
  #   with j already set to 12 in the delay slot.
  # .L1st: first row, tp[] = ap[]*bp[0] + np[]*m with
  #   m = (low word of ap[0]*bp[0]) * n0, truncated to 32 bits. The loop is
  #   software-pipelined: the lines tagged !prologue!/!epilogue! start and
  #   drain the products that the loop body consumes one iteration later.
  #   The carry out of the top word is kept in car2.
  # .Louter/.Linner: the same row operation for bp[i], i = 4, 8, ... (byte
  #   offsets), this time also adding the previous tp[] contents. The next
  #   bp[i] is loaded in the annulled delay slot of `bl,a`, so the load only
  #   happens when another row follows.
  # .Ltail: np and rp are advanced by num and indexed with a negative offset
  #   in %o7 that counts up to zero.
  #   .Lsub  stores tp[] - np[] into rp[] with a borrow chain (subccc), then
  #          folds the top overflow bit car2 into that borrow.
  #   .Lcopy if the final carry flag is set, overwrites each rp word with the
  #          corresponding tp word (movcs); in all cases tp[] is zeroed as it
  #          is read. The same loads and stores run either way.
  #   Returns 1 in %i0 (the caller's %o0 after `restore`).
  73. $code=<<___;
  74. #include "sparc_arch.h"
  75. .section ".text",#alloc,#execinstr
  76. .global $fname
  77. .align 32
  78. $fname:
  79. cmp %o5,4 ! 128 bits minimum
  80. bge,pt %icc,.Lenter
  81. sethi %hi(0xffffffff),$mask
  82. retl
  83. clr %o0
  84. .align 32
  85. .Lenter:
  86. save %sp,-$frame,%sp
  87. sll $num,2,$num ! num*=4
  88. or $mask,%lo(0xffffffff),$mask
  89. ld [$n0],$n0
  90. cmp $ap,$bp
  91. and $num,$mask,$num
  92. ld [$bp],$mul0 ! bp[0]
  93. nop
  94. add %sp,$bias,%o7 ! real top of stack
  95. ld [$ap],$car0 ! ap[0] ! redundant in squaring context
  96. sub %o7,$num,%o7
  97. ld [$ap+4],$apj ! ap[1]
  98. and %o7,-1024,%o7
  99. ld [$np],$car1 ! np[0]
  100. sub %o7,$bias,%sp ! alloca
  101. ld [$np+4],$npj ! np[1]
  102. be,pt SIZE_T_CC,.Lbn_sqr_mont
  103. mov 12,$j
  104. mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
  105. mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
  106. and $car0,$mask,$acc0
  107. add %sp,$bias+$frame,$tp
  108. ld [$ap+8],$apj !prologue!
  109. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  110. and $mul1,$mask,$mul1
  111. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  112. mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
  113. srlx $car0,32,$car0
  114. add $acc0,$car1,$car1
  115. ld [$np+8],$npj !prologue!
  116. srlx $car1,32,$car1
  117. mov $tmp0,$acc0 !prologue!
  118. .L1st:
  119. mulx $apj,$mul0,$tmp0
  120. mulx $npj,$mul1,$tmp1
  121. add $acc0,$car0,$car0
  122. ld [$ap+$j],$apj ! ap[j]
  123. and $car0,$mask,$acc0
  124. add $acc1,$car1,$car1
  125. ld [$np+$j],$npj ! np[j]
  126. srlx $car0,32,$car0
  127. add $acc0,$car1,$car1
  128. add $j,4,$j ! j++
  129. mov $tmp0,$acc0
  130. st $car1,[$tp]
  131. cmp $j,$num
  132. mov $tmp1,$acc1
  133. srlx $car1,32,$car1
  134. bl %icc,.L1st
  135. add $tp,4,$tp ! tp++
  136. !.L1st
  137. mulx $apj,$mul0,$tmp0 !epilogue!
  138. mulx $npj,$mul1,$tmp1
  139. add $acc0,$car0,$car0
  140. and $car0,$mask,$acc0
  141. add $acc1,$car1,$car1
  142. srlx $car0,32,$car0
  143. add $acc0,$car1,$car1
  144. st $car1,[$tp]
  145. srlx $car1,32,$car1
  146. add $tmp0,$car0,$car0
  147. and $car0,$mask,$acc0
  148. add $tmp1,$car1,$car1
  149. srlx $car0,32,$car0
  150. add $acc0,$car1,$car1
  151. st $car1,[$tp+4]
  152. srlx $car1,32,$car1
  153. add $car0,$car1,$car1
  154. st $car1,[$tp+8]
  155. srlx $car1,32,$car2
  156. mov 4,$i ! i++
  157. ld [$bp+4],$mul0 ! bp[1]
  158. .Louter:
  159. add %sp,$bias+$frame,$tp
  160. ld [$ap],$car0 ! ap[0]
  161. ld [$ap+4],$apj ! ap[1]
  162. ld [$np],$car1 ! np[0]
  163. ld [$np+4],$npj ! np[1]
  164. ld [$tp],$tmp1 ! tp[0]
  165. ld [$tp+4],$tpj ! tp[1]
  166. mov 12,$j
  167. mulx $car0,$mul0,$car0
  168. mulx $apj,$mul0,$tmp0 !prologue!
  169. add $tmp1,$car0,$car0
  170. ld [$ap+8],$apj !prologue!
  171. and $car0,$mask,$acc0
  172. mulx $n0,$acc0,$mul1
  173. and $mul1,$mask,$mul1
  174. mulx $car1,$mul1,$car1
  175. mulx $npj,$mul1,$acc1 !prologue!
  176. srlx $car0,32,$car0
  177. add $acc0,$car1,$car1
  178. ld [$np+8],$npj !prologue!
  179. srlx $car1,32,$car1
  180. mov $tmp0,$acc0 !prologue!
  181. .Linner:
  182. mulx $apj,$mul0,$tmp0
  183. mulx $npj,$mul1,$tmp1
  184. add $tpj,$car0,$car0
  185. ld [$ap+$j],$apj ! ap[j]
  186. add $acc0,$car0,$car0
  187. add $acc1,$car1,$car1
  188. ld [$np+$j],$npj ! np[j]
  189. and $car0,$mask,$acc0
  190. ld [$tp+8],$tpj ! tp[j]
  191. srlx $car0,32,$car0
  192. add $acc0,$car1,$car1
  193. add $j,4,$j ! j++
  194. mov $tmp0,$acc0
  195. st $car1,[$tp] ! tp[j-1]
  196. srlx $car1,32,$car1
  197. mov $tmp1,$acc1
  198. cmp $j,$num
  199. bl %icc,.Linner
  200. add $tp,4,$tp ! tp++
  201. !.Linner
  202. mulx $apj,$mul0,$tmp0 !epilogue!
  203. mulx $npj,$mul1,$tmp1
  204. add $tpj,$car0,$car0
  205. add $acc0,$car0,$car0
  206. ld [$tp+8],$tpj ! tp[j]
  207. and $car0,$mask,$acc0
  208. add $acc1,$car1,$car1
  209. srlx $car0,32,$car0
  210. add $acc0,$car1,$car1
  211. st $car1,[$tp] ! tp[j-1]
  212. srlx $car1,32,$car1
  213. add $tpj,$car0,$car0
  214. add $tmp0,$car0,$car0
  215. and $car0,$mask,$acc0
  216. add $tmp1,$car1,$car1
  217. add $acc0,$car1,$car1
  218. st $car1,[$tp+4] ! tp[j-1]
  219. srlx $car0,32,$car0
  220. add $i,4,$i ! i++
  221. srlx $car1,32,$car1
  222. add $car0,$car1,$car1
  223. cmp $i,$num
  224. add $car2,$car1,$car1
  225. st $car1,[$tp+8]
  226. srlx $car1,32,$car2
  227. bl,a %icc,.Louter
  228. ld [$bp+$i],$mul0 ! bp[i]
  229. !.Louter
  230. add $tp,12,$tp
  231. .Ltail:
  232. add $np,$num,$np
  233. add $rp,$num,$rp
  234. sub %g0,$num,%o7 ! k=-num
  235. ba .Lsub
  236. subcc %g0,%g0,%g0 ! clear %icc.c
  237. .align 16
  238. .Lsub:
  239. ld [$tp+%o7],%o0
  240. ld [$np+%o7],%o1
  241. subccc %o0,%o1,%o1 ! tp[j]-np[j]
  242. add $rp,%o7,$i
  243. add %o7,4,%o7
  244. brnz %o7,.Lsub
  245. st %o1,[$i]
  246. subccc $car2,0,$car2 ! handle upmost overflow bit
  247. sub %g0,$num,%o7
  248. .Lcopy:
  249. ld [$tp+%o7],%o1 ! conditional copy
  250. ld [$rp+%o7],%o0
  251. st %g0,[$tp+%o7] ! zap tp
  252. movcs %icc,%o1,%o0
  253. st %o0,[$rp+%o7]
  254. add %o7,4,%o7
  255. brnz %o7,.Lcopy
  256. nop
  257. mov 1,%i0
  258. ret
  259. restore
  260. ___
  261. ########
  262. ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
  263. ######## code without following dedicated squaring procedure.
  264. ########
  # Second assembly template, appended to $code: the squaring path reached
  # from the entry code when the ap and bp pointers compare equal, followed
  # by the symbol type/size directives and the identification string. The
  # template text is emitted verbatim, so it is documented here rather than
  # inside the heredoc.
  #
  # State on arrival (set up by the entry code before the branch): the
  # temporary vector is already allocated, mul0 holds the word loaded from
  # bp[0] (the same word as ap[0] here), apj = ap[1], car1 = np[0],
  # npj = np[1], and j = 12.
  #
  # Technique visible in the code: each cross product ap[j]*ap[i] is computed
  # once and then doubled (`add acc0,acc0,acc0`). The diagonal product
  # ap[i]*ap[i] is not doubled: its low word is used as is, while its upper
  # part is split into bit 0 (kept in $sbit) and the remaining bits shifted
  # right by one (kept in car0), so that the doubling of the following words
  # restores it. $sbit then carries the bit shifted out of each doubled word
  # into the next one.
  #
  # .Lsqr_1st   row 0: doubled ap[j]*ap[0] plus np[j]*m, written to tp[].
  # .Lsqr_2nd   row 1 (multiplier ap[1]): same, accumulating into tp[].
  # .Lsqr_outer rows i = 8, 12, ... (byte offsets) while i+4 < num:
  #   .Lsqr_inner1    words below the diagonal: only np[j]*m and the carry are
  #                   added to tp[]; no ap[] product is formed for them.
  #   (fall-through)  the diagonal word, adding the low word of ap[i]^2.
  #   .Lsqr_inner2    words above the diagonal: doubled ap[j]*ap[i] plus
  #                   np[j]*m; skipped via .Lsqr_no_inner2 when only one such
  #                   word remains.
  #   .Lsqr_no_inner2 drains the last product, folds in car2, and sets up the
  #                   multipliers for the next row.
  # .Lsqr_last  final row: reduction words, then the diagonal square only.
  #   Control then jumps to .Ltail (final subtraction and copy-out) with tp
  #   advanced in the delay slot.
  #
  # $sbit reuses %o5; that is the output-window register after `save`, not
  # the caller's num argument, which is addressed as %i5 from then on.
  265. $sbit="%o5";
  266. $code.=<<___;
  267. .align 32
  268. .Lbn_sqr_mont:
  269. mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
  270. mulx $apj,$mul0,$tmp0 !prologue!
  271. and $car0,$mask,$acc0
  272. add %sp,$bias+$frame,$tp
  273. ld [$ap+8],$apj !prologue!
  274. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  275. srlx $car0,32,$car0
  276. and $mul1,$mask,$mul1
  277. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  278. mulx $npj,$mul1,$acc1 !prologue!
  279. and $car0,1,$sbit
  280. ld [$np+8],$npj !prologue!
  281. srlx $car0,1,$car0
  282. add $acc0,$car1,$car1
  283. srlx $car1,32,$car1
  284. mov $tmp0,$acc0 !prologue!
  285. .Lsqr_1st:
  286. mulx $apj,$mul0,$tmp0
  287. mulx $npj,$mul1,$tmp1
  288. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  289. add $acc1,$car1,$car1
  290. ld [$ap+$j],$apj ! ap[j]
  291. and $car0,$mask,$acc0
  292. ld [$np+$j],$npj ! np[j]
  293. srlx $car0,32,$car0
  294. add $acc0,$acc0,$acc0
  295. or $sbit,$acc0,$acc0
  296. mov $tmp1,$acc1
  297. srlx $acc0,32,$sbit
  298. add $j,4,$j ! j++
  299. and $acc0,$mask,$acc0
  300. cmp $j,$num
  301. add $acc0,$car1,$car1
  302. st $car1,[$tp]
  303. mov $tmp0,$acc0
  304. srlx $car1,32,$car1
  305. bl %icc,.Lsqr_1st
  306. add $tp,4,$tp ! tp++
  307. !.Lsqr_1st
  308. mulx $apj,$mul0,$tmp0 ! epilogue
  309. mulx $npj,$mul1,$tmp1
  310. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  311. add $acc1,$car1,$car1
  312. and $car0,$mask,$acc0
  313. srlx $car0,32,$car0
  314. add $acc0,$acc0,$acc0
  315. or $sbit,$acc0,$acc0
  316. srlx $acc0,32,$sbit
  317. and $acc0,$mask,$acc0
  318. add $acc0,$car1,$car1
  319. st $car1,[$tp]
  320. srlx $car1,32,$car1
  321. add $tmp0,$car0,$car0 ! ap[j]*a0+c0
  322. add $tmp1,$car1,$car1
  323. and $car0,$mask,$acc0
  324. srlx $car0,32,$car0
  325. add $acc0,$acc0,$acc0
  326. or $sbit,$acc0,$acc0
  327. srlx $acc0,32,$sbit
  328. and $acc0,$mask,$acc0
  329. add $acc0,$car1,$car1
  330. st $car1,[$tp+4]
  331. srlx $car1,32,$car1
  332. add $car0,$car0,$car0
  333. or $sbit,$car0,$car0
  334. add $car0,$car1,$car1
  335. st $car1,[$tp+8]
  336. srlx $car1,32,$car2
  337. ld [%sp+$bias+$frame],$tmp0 ! tp[0]
  338. ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
  339. ld [%sp+$bias+$frame+8],$tpj ! tp[2]
  340. ld [$ap+4],$mul0 ! ap[1]
  341. ld [$ap+8],$apj ! ap[2]
  342. ld [$np],$car1 ! np[0]
  343. ld [$np+4],$npj ! np[1]
  344. mulx $n0,$tmp0,$mul1
  345. mulx $mul0,$mul0,$car0
  346. and $mul1,$mask,$mul1
  347. mulx $car1,$mul1,$car1
  348. mulx $npj,$mul1,$acc1
  349. add $tmp0,$car1,$car1
  350. and $car0,$mask,$acc0
  351. ld [$np+8],$npj ! np[2]
  352. srlx $car1,32,$car1
  353. add $tmp1,$car1,$car1
  354. srlx $car0,32,$car0
  355. add $acc0,$car1,$car1
  356. and $car0,1,$sbit
  357. add $acc1,$car1,$car1
  358. srlx $car0,1,$car0
  359. mov 12,$j
  360. st $car1,[%sp+$bias+$frame] ! tp[0]=
  361. srlx $car1,32,$car1
  362. add %sp,$bias+$frame+4,$tp
  363. .Lsqr_2nd:
  364. mulx $apj,$mul0,$acc0
  365. mulx $npj,$mul1,$acc1
  366. add $acc0,$car0,$car0
  367. add $tpj,$sbit,$sbit
  368. ld [$ap+$j],$apj ! ap[j]
  369. and $car0,$mask,$acc0
  370. ld [$np+$j],$npj ! np[j]
  371. srlx $car0,32,$car0
  372. add $acc1,$car1,$car1
  373. ld [$tp+8],$tpj ! tp[j]
  374. add $acc0,$acc0,$acc0
  375. add $j,4,$j ! j++
  376. add $sbit,$acc0,$acc0
  377. srlx $acc0,32,$sbit
  378. and $acc0,$mask,$acc0
  379. cmp $j,$num
  380. add $acc0,$car1,$car1
  381. st $car1,[$tp] ! tp[j-1]
  382. srlx $car1,32,$car1
  383. bl %icc,.Lsqr_2nd
  384. add $tp,4,$tp ! tp++
  385. !.Lsqr_2nd
  386. mulx $apj,$mul0,$acc0
  387. mulx $npj,$mul1,$acc1
  388. add $acc0,$car0,$car0
  389. add $tpj,$sbit,$sbit
  390. and $car0,$mask,$acc0
  391. srlx $car0,32,$car0
  392. add $acc1,$car1,$car1
  393. add $acc0,$acc0,$acc0
  394. add $sbit,$acc0,$acc0
  395. srlx $acc0,32,$sbit
  396. and $acc0,$mask,$acc0
  397. add $acc0,$car1,$car1
  398. st $car1,[$tp] ! tp[j-1]
  399. srlx $car1,32,$car1
  400. add $car0,$car0,$car0
  401. add $sbit,$car0,$car0
  402. add $car0,$car1,$car1
  403. add $car2,$car1,$car1
  404. st $car1,[$tp+4]
  405. srlx $car1,32,$car2
  406. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  407. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  408. ld [$ap+8],$mul0 ! ap[2]
  409. ld [$np],$car1 ! np[0]
  410. ld [$np+4],$npj ! np[1]
  411. mulx $n0,$tmp1,$mul1
  412. and $mul1,$mask,$mul1
  413. mov 8,$i
  414. mulx $mul0,$mul0,$car0
  415. mulx $car1,$mul1,$car1
  416. and $car0,$mask,$acc0
  417. add $tmp1,$car1,$car1
  418. srlx $car0,32,$car0
  419. add %sp,$bias+$frame,$tp
  420. srlx $car1,32,$car1
  421. and $car0,1,$sbit
  422. srlx $car0,1,$car0
  423. mov 4,$j
  424. .Lsqr_outer:
  425. .Lsqr_inner1:
  426. mulx $npj,$mul1,$acc1
  427. add $tpj,$car1,$car1
  428. add $j,4,$j
  429. ld [$tp+8],$tpj
  430. cmp $j,$i
  431. add $acc1,$car1,$car1
  432. ld [$np+$j],$npj
  433. st $car1,[$tp]
  434. srlx $car1,32,$car1
  435. bl %icc,.Lsqr_inner1
  436. add $tp,4,$tp
  437. !.Lsqr_inner1
  438. add $j,4,$j
  439. ld [$ap+$j],$apj ! ap[j]
  440. mulx $npj,$mul1,$acc1
  441. add $tpj,$car1,$car1
  442. ld [$np+$j],$npj ! np[j]
  443. srlx $car1,32,$tmp0
  444. and $car1,$mask,$car1
  445. add $tmp0,$sbit,$sbit
  446. add $acc0,$car1,$car1
  447. ld [$tp+8],$tpj ! tp[j]
  448. add $acc1,$car1,$car1
  449. st $car1,[$tp]
  450. srlx $car1,32,$car1
  451. add $j,4,$j
  452. cmp $j,$num
  453. be,pn %icc,.Lsqr_no_inner2
  454. add $tp,4,$tp
  455. .Lsqr_inner2:
  456. mulx $apj,$mul0,$acc0
  457. mulx $npj,$mul1,$acc1
  458. add $tpj,$sbit,$sbit
  459. add $acc0,$car0,$car0
  460. ld [$ap+$j],$apj ! ap[j]
  461. and $car0,$mask,$acc0
  462. ld [$np+$j],$npj ! np[j]
  463. srlx $car0,32,$car0
  464. add $acc0,$acc0,$acc0
  465. ld [$tp+8],$tpj ! tp[j]
  466. add $sbit,$acc0,$acc0
  467. add $j,4,$j ! j++
  468. srlx $acc0,32,$sbit
  469. and $acc0,$mask,$acc0
  470. cmp $j,$num
  471. add $acc0,$car1,$car1
  472. add $acc1,$car1,$car1
  473. st $car1,[$tp] ! tp[j-1]
  474. srlx $car1,32,$car1
  475. bl %icc,.Lsqr_inner2
  476. add $tp,4,$tp ! tp++
  477. .Lsqr_no_inner2:
  478. mulx $apj,$mul0,$acc0
  479. mulx $npj,$mul1,$acc1
  480. add $tpj,$sbit,$sbit
  481. add $acc0,$car0,$car0
  482. and $car0,$mask,$acc0
  483. srlx $car0,32,$car0
  484. add $acc0,$acc0,$acc0
  485. add $sbit,$acc0,$acc0
  486. srlx $acc0,32,$sbit
  487. and $acc0,$mask,$acc0
  488. add $acc0,$car1,$car1
  489. add $acc1,$car1,$car1
  490. st $car1,[$tp] ! tp[j-1]
  491. srlx $car1,32,$car1
  492. add $car0,$car0,$car0
  493. add $sbit,$car0,$car0
  494. add $car0,$car1,$car1
  495. add $car2,$car1,$car1
  496. st $car1,[$tp+4]
  497. srlx $car1,32,$car2
  498. add $i,4,$i ! i++
  499. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  500. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  501. ld [$ap+$i],$mul0 ! ap[j]
  502. ld [$np],$car1 ! np[0]
  503. ld [$np+4],$npj ! np[1]
  504. mulx $n0,$tmp1,$mul1
  505. and $mul1,$mask,$mul1
  506. add $i,4,$tmp0
  507. mulx $mul0,$mul0,$car0
  508. mulx $car1,$mul1,$car1
  509. and $car0,$mask,$acc0
  510. add $tmp1,$car1,$car1
  511. srlx $car0,32,$car0
  512. add %sp,$bias+$frame,$tp
  513. srlx $car1,32,$car1
  514. and $car0,1,$sbit
  515. srlx $car0,1,$car0
  516. cmp $tmp0,$num ! i<num-1
  517. bl %icc,.Lsqr_outer
  518. mov 4,$j
  519. .Lsqr_last:
  520. mulx $npj,$mul1,$acc1
  521. add $tpj,$car1,$car1
  522. add $j,4,$j
  523. ld [$tp+8],$tpj
  524. cmp $j,$i
  525. add $acc1,$car1,$car1
  526. ld [$np+$j],$npj
  527. st $car1,[$tp]
  528. srlx $car1,32,$car1
  529. bl %icc,.Lsqr_last
  530. add $tp,4,$tp
  531. !.Lsqr_last
  532. mulx $npj,$mul1,$acc1
  533. add $tpj,$acc0,$acc0
  534. srlx $acc0,32,$tmp0
  535. and $acc0,$mask,$acc0
  536. add $tmp0,$sbit,$sbit
  537. add $acc0,$car1,$car1
  538. add $acc1,$car1,$car1
  539. st $car1,[$tp]
  540. srlx $car1,32,$car1
  541. add $car0,$car0,$car0 ! recover $car0
  542. add $sbit,$car0,$car0
  543. add $car0,$car1,$car1
  544. add $car2,$car1,$car1
  545. st $car1,[$tp+4]
  546. srlx $car1,32,$car2
  547. ba .Ltail
  548. add $tp,8,$tp
  549. .type $fname,#function
  550. .size $fname,(.-$fname)
  551. .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  552. .align 32
  553. ___
  # Post-process and emit the accumulated assembly text.
  # Every backtick-delimited span in $code is replaced by the result of
  # eval'ing its contents as Perl (the /e flag). No backtick spans are visible
  # in the two templates in this file, so this looks like a no-op kept for
  # consistency with other generators -- NOTE(review): confirm before removing.
  554. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  555. print $code;
  # Closing explicitly surfaces buffered write errors (e.g. a full disk) as a
  # fatal error instead of leaving a silently truncated output file.
  556. close STDOUT or die "error closing STDOUT: $!";