sparcv9-mont.pl 13 KB


  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # December 2005
  9. #
  10. # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
  11. # for undertaking this effort are multiple. First of all, UltraSPARC is
  12. # not the whole SPARCv9 universe and other VIS-free implementations
  13. # deserve optimized code just as much. Secondly, the newly introduced
  14. # UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
  15. # FPU-intensive paths, such as sparcv9a-mont, will simply sink it. Yes,
  16. # T1 is equipped with several integrated RSA/DSA accelerator circuits
  17. # accessible through a kernel driver [only(*)], but having a decent
  18. # user-land software implementation is important too. Finally, there
  19. # was the desire to experiment with a dedicated squaring procedure. Yes,
  20. # this module implements one, because it was easiest to draft it in
  21. # SPARCv9 instructions...
  22. # (*) An engine accessing the driver in question is on my TODO list.
  23. # For reference, the accelerator is estimated to give a 6 to 10 times
  24. # improvement on single-threaded RSA sign. It should be noted
  25. # that a 6-10x improvement coefficient does not actually mean
  26. # something extraordinary in terms of absolute [single-threaded]
  27. # performance, as the SPARCv9 instruction set is by all means the
  28. # least suitable for high performance crypto among 64-bit
  29. # platforms. The 6-10x factor simply places T1 in the same performance
  30. # domain as, say, AMD64 and IA-64. The improvement of RSA verify doesn't
  31. # appear impressive at all, but it's the sign operation which is
  32. # far more critical/interesting.
  33. # You might notice that the inner loops are modulo-scheduled:-) This has
  34. # essentially negligible impact on UltraSPARC performance; it's
  35. # Fujitsu SPARC64 V users who should notice and hopefully appreciate
  36. # the advantage... Currently this module surpasses sparcv9a-mont.pl
  37. # by ~20% on UltraSPARC-III and later cores, but recall that the sparcv9a
  38. # module still has hidden potential [see TODO list there], which is
  39. # estimated to be larger than 20%...
  40. # int bn_mul_mont(        -- each C argument is bound to the register named below
  41. $rp="%i0"; # BN_ULONG *rp,       -- result vector; written by .Lsub/.Lcopy
  42. $ap="%i1"; # const BN_ULONG *ap, -- first multiplicand
  43. $bp="%i2"; # const BN_ULONG *bp, -- second multiplicand; ap==bp selects the squaring path
  44. $np="%i3"; # const BN_ULONG *np, -- vector subtracted from tp in .Lsub
  45. $n0="%i4"; # const BN_ULONG *n0, -- pointer on entry; replaced by the 32-bit word it points to
  46. $num="%i5"; # int num);           -- word count on entry; scaled to a byte count (num*4) after save
  47. $bits=32; # default ABI width; may be overridden by the command-line scan on the next line
  48. for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # an argument containing -m64 or -xarch=v9 selects 64-bit
  49. if ($bits==64) { $bias=2047; $frame=192; } # 64-bit: $bias is added to %sp to get the real top of stack; $frame is the save size
  50. else { $bias=0; $frame=128; } # 32-bit: no bias, smaller frame
  51. $car0="%o0"; # running sum/carry of the ap[j]*mul0 chain
  52. $car1="%o1"; # running sum/carry of the np[j]*mul1 chain
  53. $car2="%o2"; # 1 bit: top carry handed from one outer pass to the next
  54. $acc0="%o3"; # current term of the ap[] chain
  55. $acc1="%o4"; # current term of the np[] chain
  56. $mask="%g1"; # 32 bits, what a waste... (holds 0xffffffff, built with sethi/or)
  57. $tmp0="%g4"; # scratch; in the multiply loops, the ap[] product computed one step ahead
  58. $tmp1="%g5"; # scratch; in the multiply loops, the np[] product computed one step ahead
  59. $i="%l0"; # outer index, kept as a byte offset
  60. $j="%l1"; # inner index, kept as a byte offset
  61. $mul0="%l2"; # current bp[i] (ap[i] in the squaring path)
  62. $mul1="%l3"; # per-pass multiplier: t[0]*n0 masked to 32 bits
  63. $tp="%l4"; # cursor into the scratch vector allocated on the stack
  64. $apj="%l5"; # preloaded ap[j]
  65. $npj="%l6"; # preloaded np[j]
  66. $tpj="%l7"; # preloaded tp[j]
  67. $fname="bn_mul_mont_int"; # global symbol emitted for the routine
  # Assembly template for the generic multiply path of $fname.
  #  - Fewer than 4 words (cmp %o5,4): returns 0 in %o0 without storing
  #    anything.
  #  - .Lenter: scales num to bytes, loads the n0 word, carves the scratch
  #    vector tp out of the stack (aligned down to 1024 bytes) and branches
  #    to .Lbn_sqr_mont when ap and bp are the same pointer.
  #  - .L1st handles bp[0]; .Louter/.Linner handle the remaining bp words.
  #    Each pass adds ap[j]*mul0 and np[j]*mul1 into tp, 32 bits at a time.
  #  - .Ltail/.Lsub/.Lcopy: rp = tp - np, then either tp or rp is chosen as
  #    the copy source from the final borrow and carry bit, the chosen
  #    vector is copied into rp, tp is zeroed, and 1 is returned.
  # The instruction after each branch executes in its delay slot, so the
  # statement order below is significant.
  68. $code=<<___;
  69. .section ".text",#alloc,#execinstr
  70. .global $fname
  71. .align 32
  72. $fname:
  73. cmp %o5,4 ! 128 bits minimum
  74. bge,pt %icc,.Lenter
  75. sethi %hi(0xffffffff),$mask
  76. retl
  77. clr %o0
  78. .align 32
  79. .Lenter:
  80. save %sp,-$frame,%sp
  81. sll $num,2,$num ! num*=4
  82. or $mask,%lo(0xffffffff),$mask
  83. ld [$n0],$n0
  84. cmp $ap,$bp
  85. and $num,$mask,$num
  86. ld [$bp],$mul0 ! bp[0]
  87. nop
  88. add %sp,$bias,%o7 ! real top of stack
  89. ld [$ap],$car0 ! ap[0] ! redundant in squaring context
  90. sub %o7,$num,%o7
  91. ld [$ap+4],$apj ! ap[1]
  92. and %o7,-1024,%o7
  93. ld [$np],$car1 ! np[0]
  94. sub %o7,$bias,%sp ! alloca
  95. ld [$np+4],$npj ! np[1]
  96. be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
  97. mov 12,$j
  98. mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
  99. mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
  100. and $car0,$mask,$acc0
  101. add %sp,$bias+$frame,$tp
  102. ld [$ap+8],$apj !prologue!
  103. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  104. and $mul1,$mask,$mul1
  105. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  106. mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
  107. srlx $car0,32,$car0
  108. add $acc0,$car1,$car1
  109. ld [$np+8],$npj !prologue!
  110. srlx $car1,32,$car1
  111. mov $tmp0,$acc0 !prologue!
  112. .L1st:
  113. mulx $apj,$mul0,$tmp0
  114. mulx $npj,$mul1,$tmp1
  115. add $acc0,$car0,$car0
  116. ld [$ap+$j],$apj ! ap[j]
  117. and $car0,$mask,$acc0
  118. add $acc1,$car1,$car1
  119. ld [$np+$j],$npj ! np[j]
  120. srlx $car0,32,$car0
  121. add $acc0,$car1,$car1
  122. add $j,4,$j ! j++
  123. mov $tmp0,$acc0
  124. st $car1,[$tp]
  125. cmp $j,$num
  126. mov $tmp1,$acc1
  127. srlx $car1,32,$car1
  128. bl %icc,.L1st
  129. add $tp,4,$tp ! tp++
  130. !.L1st
  131. mulx $apj,$mul0,$tmp0 !epilogue!
  132. mulx $npj,$mul1,$tmp1
  133. add $acc0,$car0,$car0
  134. and $car0,$mask,$acc0
  135. add $acc1,$car1,$car1
  136. srlx $car0,32,$car0
  137. add $acc0,$car1,$car1
  138. st $car1,[$tp]
  139. srlx $car1,32,$car1
  140. add $tmp0,$car0,$car0
  141. and $car0,$mask,$acc0
  142. add $tmp1,$car1,$car1
  143. srlx $car0,32,$car0
  144. add $acc0,$car1,$car1
  145. st $car1,[$tp+4]
  146. srlx $car1,32,$car1
  147. add $car0,$car1,$car1
  148. st $car1,[$tp+8]
  149. srlx $car1,32,$car2
  150. mov 4,$i ! i++
  151. ld [$bp+4],$mul0 ! bp[1]
  152. .Louter:
  153. add %sp,$bias+$frame,$tp
  154. ld [$ap],$car0 ! ap[0]
  155. ld [$ap+4],$apj ! ap[1]
  156. ld [$np],$car1 ! np[0]
  157. ld [$np+4],$npj ! np[1]
  158. ld [$tp],$tmp1 ! tp[0]
  159. ld [$tp+4],$tpj ! tp[1]
  160. mov 12,$j
  161. mulx $car0,$mul0,$car0
  162. mulx $apj,$mul0,$tmp0 !prologue!
  163. add $tmp1,$car0,$car0
  164. ld [$ap+8],$apj !prologue!
  165. and $car0,$mask,$acc0
  166. mulx $n0,$acc0,$mul1
  167. and $mul1,$mask,$mul1
  168. mulx $car1,$mul1,$car1
  169. mulx $npj,$mul1,$acc1 !prologue!
  170. srlx $car0,32,$car0
  171. add $acc0,$car1,$car1
  172. ld [$np+8],$npj !prologue!
  173. srlx $car1,32,$car1
  174. mov $tmp0,$acc0 !prologue!
  175. .Linner:
  176. mulx $apj,$mul0,$tmp0
  177. mulx $npj,$mul1,$tmp1
  178. add $tpj,$car0,$car0
  179. ld [$ap+$j],$apj ! ap[j]
  180. add $acc0,$car0,$car0
  181. add $acc1,$car1,$car1
  182. ld [$np+$j],$npj ! np[j]
  183. and $car0,$mask,$acc0
  184. ld [$tp+8],$tpj ! tp[j]
  185. srlx $car0,32,$car0
  186. add $acc0,$car1,$car1
  187. add $j,4,$j ! j++
  188. mov $tmp0,$acc0
  189. st $car1,[$tp] ! tp[j-1]
  190. srlx $car1,32,$car1
  191. mov $tmp1,$acc1
  192. cmp $j,$num
  193. bl %icc,.Linner
  194. add $tp,4,$tp ! tp++
  195. !.Linner
  196. mulx $apj,$mul0,$tmp0 !epilogue!
  197. mulx $npj,$mul1,$tmp1
  198. add $tpj,$car0,$car0
  199. add $acc0,$car0,$car0
  200. ld [$tp+8],$tpj ! tp[j]
  201. and $car0,$mask,$acc0
  202. add $acc1,$car1,$car1
  203. srlx $car0,32,$car0
  204. add $acc0,$car1,$car1
  205. st $car1,[$tp] ! tp[j-1]
  206. srlx $car1,32,$car1
  207. add $tpj,$car0,$car0
  208. add $tmp0,$car0,$car0
  209. and $car0,$mask,$acc0
  210. add $tmp1,$car1,$car1
  211. add $acc0,$car1,$car1
  212. st $car1,[$tp+4] ! tp[j-1]
  213. srlx $car0,32,$car0
  214. add $i,4,$i ! i++
  215. srlx $car1,32,$car1
  216. add $car0,$car1,$car1
  217. cmp $i,$num
  218. add $car2,$car1,$car1
  219. st $car1,[$tp+8]
  220. srlx $car1,32,$car2
  221. bl,a %icc,.Louter
  222. ld [$bp+$i],$mul0 ! bp[i]
  223. !.Louter
  224. add $tp,12,$tp
  225. .Ltail:
  226. add $np,$num,$np
  227. add $rp,$num,$rp
  228. mov $tp,$ap
  229. sub %g0,$num,%o7 ! k=-num
  230. ba .Lsub
  231. subcc %g0,%g0,%g0 ! clear %icc.c
  232. .align 16
  233. .Lsub:
  234. ld [$tp+%o7],%o0
  235. ld [$np+%o7],%o1
  236. subccc %o0,%o1,%o1 ! tp[j]-np[j]
  237. add $rp,%o7,$i
  238. add %o7,4,%o7
  239. brnz %o7,.Lsub
  240. st %o1,[$i]
  241. subc $car2,0,$car2 ! handle upmost overflow bit
  242. and $tp,$car2,$ap
  243. andn $rp,$car2,$np
  244. or $ap,$np,$ap
  245. sub %g0,$num,%o7
  246. .Lcopy:
  247. ld [$ap+%o7],%o0 ! copy or in-place refresh
  248. st %g0,[$tp+%o7] ! zap tp
  249. st %o0,[$rp+%o7]
  250. add %o7,4,%o7
  251. brnz %o7,.Lcopy
  252. nop
  253. mov 1,%i0
  254. ret
  255. restore
  256. ___
  257. ########
  258. ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
  259. ######## code without the following dedicated squaring procedure.
  260. ########
  # Reached from the prologue of the multiply path when ap and bp are the
  # same pointer (cmp $ap,$bp / be .Lbn_sqr_mont). Cross products
  # ap[j]*ap[i] are doubled with add acc0,acc0,acc0; the bit shifted out
  # of each 32-bit word is carried into the next word through $sbit. The
  # path ends by branching to .Ltail, which lives in the multiply template.
  # As there, the instruction after each branch executes in its delay
  # slot, so statement order is significant.
  261. $sbit="%i2"; # re-use $bp!
  262. $code.=<<___;
  263. .align 32
  264. .Lbn_sqr_mont:
  265. mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
  266. mulx $apj,$mul0,$tmp0 !prologue!
  267. and $car0,$mask,$acc0
  268. add %sp,$bias+$frame,$tp
  269. ld [$ap+8],$apj !prologue!
  270. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  271. srlx $car0,32,$car0
  272. and $mul1,$mask,$mul1
  273. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  274. mulx $npj,$mul1,$acc1 !prologue!
  275. and $car0,1,$sbit
  276. ld [$np+8],$npj !prologue!
  277. srlx $car0,1,$car0
  278. add $acc0,$car1,$car1
  279. srlx $car1,32,$car1
  280. mov $tmp0,$acc0 !prologue!
  281. .Lsqr_1st:
  282. mulx $apj,$mul0,$tmp0
  283. mulx $npj,$mul1,$tmp1
  284. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  285. add $acc1,$car1,$car1
  286. ld [$ap+$j],$apj ! ap[j]
  287. and $car0,$mask,$acc0
  288. ld [$np+$j],$npj ! np[j]
  289. srlx $car0,32,$car0
  290. add $acc0,$acc0,$acc0
  291. or $sbit,$acc0,$acc0
  292. mov $tmp1,$acc1
  293. srlx $acc0,32,$sbit
  294. add $j,4,$j ! j++
  295. and $acc0,$mask,$acc0
  296. cmp $j,$num
  297. add $acc0,$car1,$car1
  298. st $car1,[$tp]
  299. mov $tmp0,$acc0
  300. srlx $car1,32,$car1
  301. bl %icc,.Lsqr_1st
  302. add $tp,4,$tp ! tp++
  303. !.Lsqr_1st
  304. mulx $apj,$mul0,$tmp0 ! epilogue
  305. mulx $npj,$mul1,$tmp1
  306. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  307. add $acc1,$car1,$car1
  308. and $car0,$mask,$acc0
  309. srlx $car0,32,$car0
  310. add $acc0,$acc0,$acc0
  311. or $sbit,$acc0,$acc0
  312. srlx $acc0,32,$sbit
  313. and $acc0,$mask,$acc0
  314. add $acc0,$car1,$car1
  315. st $car1,[$tp]
  316. srlx $car1,32,$car1
  317. add $tmp0,$car0,$car0 ! ap[j]*a0+c0
  318. add $tmp1,$car1,$car1
  319. and $car0,$mask,$acc0
  320. srlx $car0,32,$car0
  321. add $acc0,$acc0,$acc0
  322. or $sbit,$acc0,$acc0
  323. srlx $acc0,32,$sbit
  324. and $acc0,$mask,$acc0
  325. add $acc0,$car1,$car1
  326. st $car1,[$tp+4]
  327. srlx $car1,32,$car1
  328. add $car0,$car0,$car0
  329. or $sbit,$car0,$car0
  330. add $car0,$car1,$car1
  331. st $car1,[$tp+8]
  332. srlx $car1,32,$car2
  333. ld [%sp+$bias+$frame],$tmp0 ! tp[0]
  334. ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
  335. ld [%sp+$bias+$frame+8],$tpj ! tp[2]
  336. ld [$ap+4],$mul0 ! ap[1]
  337. ld [$ap+8],$apj ! ap[2]
  338. ld [$np],$car1 ! np[0]
  339. ld [$np+4],$npj ! np[1]
  340. mulx $n0,$tmp0,$mul1
  341. mulx $mul0,$mul0,$car0
  342. and $mul1,$mask,$mul1
  343. mulx $car1,$mul1,$car1
  344. mulx $npj,$mul1,$acc1
  345. add $tmp0,$car1,$car1
  346. and $car0,$mask,$acc0
  347. ld [$np+8],$npj ! np[2]
  348. srlx $car1,32,$car1
  349. add $tmp1,$car1,$car1
  350. srlx $car0,32,$car0
  351. add $acc0,$car1,$car1
  352. and $car0,1,$sbit
  353. add $acc1,$car1,$car1
  354. srlx $car0,1,$car0
  355. mov 12,$j
  356. st $car1,[%sp+$bias+$frame] ! tp[0]=
  357. srlx $car1,32,$car1
  358. add %sp,$bias+$frame+4,$tp
  359. .Lsqr_2nd:
  360. mulx $apj,$mul0,$acc0
  361. mulx $npj,$mul1,$acc1
  362. add $acc0,$car0,$car0
  363. add $tpj,$car1,$car1
  364. ld [$ap+$j],$apj ! ap[j]
  365. and $car0,$mask,$acc0
  366. ld [$np+$j],$npj ! np[j]
  367. srlx $car0,32,$car0
  368. add $acc1,$car1,$car1
  369. ld [$tp+8],$tpj ! tp[j]
  370. add $acc0,$acc0,$acc0
  371. add $j,4,$j ! j++
  372. or $sbit,$acc0,$acc0
  373. srlx $acc0,32,$sbit
  374. and $acc0,$mask,$acc0
  375. cmp $j,$num
  376. add $acc0,$car1,$car1
  377. st $car1,[$tp] ! tp[j-1]
  378. srlx $car1,32,$car1
  379. bl %icc,.Lsqr_2nd
  380. add $tp,4,$tp ! tp++
  381. !.Lsqr_2nd
  382. mulx $apj,$mul0,$acc0
  383. mulx $npj,$mul1,$acc1
  384. add $acc0,$car0,$car0
  385. add $tpj,$car1,$car1
  386. and $car0,$mask,$acc0
  387. srlx $car0,32,$car0
  388. add $acc1,$car1,$car1
  389. add $acc0,$acc0,$acc0
  390. or $sbit,$acc0,$acc0
  391. srlx $acc0,32,$sbit
  392. and $acc0,$mask,$acc0
  393. add $acc0,$car1,$car1
  394. st $car1,[$tp] ! tp[j-1]
  395. srlx $car1,32,$car1
  396. add $car0,$car0,$car0
  397. or $sbit,$car0,$car0
  398. add $car0,$car1,$car1
  399. add $car2,$car1,$car1
  400. st $car1,[$tp+4]
  401. srlx $car1,32,$car2
  402. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  403. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  404. ld [$ap+8],$mul0 ! ap[2]
  405. ld [$np],$car1 ! np[0]
  406. ld [$np+4],$npj ! np[1]
  407. mulx $n0,$tmp1,$mul1
  408. and $mul1,$mask,$mul1
  409. mov 8,$i
  410. mulx $mul0,$mul0,$car0
  411. mulx $car1,$mul1,$car1
  412. and $car0,$mask,$acc0
  413. add $tmp1,$car1,$car1
  414. srlx $car0,32,$car0
  415. add %sp,$bias+$frame,$tp
  416. srlx $car1,32,$car1
  417. and $car0,1,$sbit
  418. srlx $car0,1,$car0
  419. mov 4,$j
  420. .Lsqr_outer:
  421. .Lsqr_inner1:
  422. mulx $npj,$mul1,$acc1
  423. add $tpj,$car1,$car1
  424. add $j,4,$j
  425. ld [$tp+8],$tpj
  426. cmp $j,$i
  427. add $acc1,$car1,$car1
  428. ld [$np+$j],$npj
  429. st $car1,[$tp]
  430. srlx $car1,32,$car1
  431. bl %icc,.Lsqr_inner1
  432. add $tp,4,$tp
  433. !.Lsqr_inner1
  434. add $j,4,$j
  435. ld [$ap+$j],$apj ! ap[j]
  436. mulx $npj,$mul1,$acc1
  437. add $tpj,$car1,$car1
  438. ld [$np+$j],$npj ! np[j]
  439. add $acc0,$car1,$car1
  440. ld [$tp+8],$tpj ! tp[j]
  441. add $acc1,$car1,$car1
  442. st $car1,[$tp]
  443. srlx $car1,32,$car1
  444. add $j,4,$j
  445. cmp $j,$num
  446. be,pn %icc,.Lsqr_no_inner2
  447. add $tp,4,$tp
  448. .Lsqr_inner2:
  449. mulx $apj,$mul0,$acc0
  450. mulx $npj,$mul1,$acc1
  451. add $tpj,$car1,$car1
  452. add $acc0,$car0,$car0
  453. ld [$ap+$j],$apj ! ap[j]
  454. and $car0,$mask,$acc0
  455. ld [$np+$j],$npj ! np[j]
  456. srlx $car0,32,$car0
  457. add $acc0,$acc0,$acc0
  458. ld [$tp+8],$tpj ! tp[j]
  459. or $sbit,$acc0,$acc0
  460. add $j,4,$j ! j++
  461. srlx $acc0,32,$sbit
  462. and $acc0,$mask,$acc0
  463. cmp $j,$num
  464. add $acc0,$car1,$car1
  465. add $acc1,$car1,$car1
  466. st $car1,[$tp] ! tp[j-1]
  467. srlx $car1,32,$car1
  468. bl %icc,.Lsqr_inner2
  469. add $tp,4,$tp ! tp++
  470. .Lsqr_no_inner2:
  471. mulx $apj,$mul0,$acc0
  472. mulx $npj,$mul1,$acc1
  473. add $tpj,$car1,$car1
  474. add $acc0,$car0,$car0
  475. and $car0,$mask,$acc0
  476. srlx $car0,32,$car0
  477. add $acc0,$acc0,$acc0
  478. or $sbit,$acc0,$acc0
  479. srlx $acc0,32,$sbit
  480. and $acc0,$mask,$acc0
  481. add $acc0,$car1,$car1
  482. add $acc1,$car1,$car1
  483. st $car1,[$tp] ! tp[j-1]
  484. srlx $car1,32,$car1
  485. add $car0,$car0,$car0
  486. or $sbit,$car0,$car0
  487. add $car0,$car1,$car1
  488. add $car2,$car1,$car1
  489. st $car1,[$tp+4]
  490. srlx $car1,32,$car2
  491. add $i,4,$i ! i++
  492. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  493. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  494. ld [$ap+$i],$mul0 ! ap[j]
  495. ld [$np],$car1 ! np[0]
  496. ld [$np+4],$npj ! np[1]
  497. mulx $n0,$tmp1,$mul1
  498. and $mul1,$mask,$mul1
  499. add $i,4,$tmp0
  500. mulx $mul0,$mul0,$car0
  501. mulx $car1,$mul1,$car1
  502. and $car0,$mask,$acc0
  503. add $tmp1,$car1,$car1
  504. srlx $car0,32,$car0
  505. add %sp,$bias+$frame,$tp
  506. srlx $car1,32,$car1
  507. and $car0,1,$sbit
  508. srlx $car0,1,$car0
  509. cmp $tmp0,$num ! i<num-1
  510. bl %icc,.Lsqr_outer
  511. mov 4,$j
  512. .Lsqr_last:
  513. mulx $npj,$mul1,$acc1
  514. add $tpj,$car1,$car1
  515. add $j,4,$j
  516. ld [$tp+8],$tpj
  517. cmp $j,$i
  518. add $acc1,$car1,$car1
  519. ld [$np+$j],$npj
  520. st $car1,[$tp]
  521. srlx $car1,32,$car1
  522. bl %icc,.Lsqr_last
  523. add $tp,4,$tp
  524. !.Lsqr_last
  525. mulx $npj,$mul1,$acc1
  526. add $tpj,$car1,$car1
  527. add $acc0,$car1,$car1
  528. add $acc1,$car1,$car1
  529. st $car1,[$tp]
  530. srlx $car1,32,$car1
  531. add $car0,$car0,$car0 ! recover $car0
  532. or $sbit,$car0,$car0
  533. add $car0,$car1,$car1
  534. add $car2,$car1,$car1
  535. st $car1,[$tp+4]
  536. srlx $car1,32,$car2
  537. ba .Ltail
  538. add $tp,8,$tp
  539. .type $fname,#function
  540. .size $fname,(.-$fname)
  541. .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  542. .align 32
  543. ___
  544. $code =~ s/\`([^\`]*)\`/eval($1)/gem; # expand each `...` span in the template by evaluating it as Perl
  545. print $code; # emit the finished assembly on STDOUT
  546. close STDOUT or die "error closing STDOUT: $!"; # a failed flush would otherwise leave a truncated output file unnoticed