2
0

sparcv9-mont.pl 13 KB


  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. # project. Rights for redistribution and usage in source and binary
  5. # forms are granted according to the OpenSSL license.
  6. # ====================================================================
  7. # December 2005
  8. #
  9. # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
  10. # for undertaking this effort are multiple. First of all, UltraSPARC is not
  11. # the whole SPARCv9 universe and other VIS-free implementations deserve
  12. # optimized code as much. Secondly, newly introduced UltraSPARC T1,
  13. # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
  14. # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
  15. # several integrated RSA/DSA accelerator circuits accessible through
  16. # kernel driver [only(*)], but having decent user-land software
  17. # implementation is important too. Finally, reasons like desire to
  18. # experiment with dedicated squaring procedure. Yes, this module
  19. # implements one, because it was easiest to draft it in SPARCv9
  20. # instructions...
  21. # (*) Engine accessing the driver in question is on my TODO list.
  22. # For reference, accelerator is estimated to give 6 to 10 times
  23. # improvement on single-threaded RSA sign. It should be noted
  24. # that 6-10x improvement coefficient does not actually mean
  25. # something extraordinary in terms of absolute [single-threaded]
  26. # performance, as SPARCv9 instruction set is by all means least
  27. # suitable for high performance crypto among other 64 bit
  28. # platforms. 6-10x factor simply places T1 in same performance
  29. # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
  30. # appear impressive at all, but it's the sign operation which is
  31. # far more critical/interesting.
  32. # You might notice that inner loops are modulo-scheduled:-) This has
  33. # essentially negligible impact on UltraSPARC performance, it's
  34. # Fujitsu SPARC64 V users who should notice and hopefully appreciate
  35. # the advantage... Currently this module surpasses sparcv9a-mont.pl
  36. # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
  37. # module still has hidden potential [see TODO list there], which is
  38. # estimated to be larger than 20%...
  39. # int bn_mul_mont(
  40. $rp="%i0"; # BN_ULONG *rp, (result vector; written by the .Lsub/.Lcopy tail loops)
  41. $ap="%i1"; # const BN_ULONG *ap, (ap[j] is multiplied by the current bp[i] word)
  42. $bp="%i2"; # const BN_ULONG *bp, (ap==bp as pointers selects the squaring path)
  43. $np="%i3"; # const BN_ULONG *np, (np[j] is multiplied by $mul1 in the reduction chain)
  44. $n0="%i4"; # const BN_ULONG *n0, (pointer on entry; the register is reloaded with the word it points to)
  45. $num="%i5"; # int num); (count of 32-bit words on entry; the generated code scales it to bytes)
  46. $bits=32; # default build width; switched to 64 by the command-line scan below
  47. for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # any argument matching -m64 or -xarch=v9 selects 64-bit
  48. if ($bits==64) { $bias=2047; $frame=192; } # $bias is added to %sp to reach the real stack top; $frame is the size used by save
  49. else { $bias=0; $frame=128; }
  50. $car0="%o0"; # running sum/carry of the ap[j]*$mul0 chain
  51. $car1="%o1"; # running sum/carry of the np[j]*$mul1 chain; its low word is stored to tp[]
  52. $car2="%o2"; # 1 bit (top carry, added back in on the next outer pass)
  53. $acc0="%o3"; # scratch: pending ap product, then low 32 bits of $car0
  54. $acc1="%o4"; # scratch: pending np[j]*$mul1 product
  55. $mask="%g1"; # 32 bits, what a waste... (holds 0xffffffff)
  56. $tmp0="%g4"; # product scratch
  57. $tmp1="%g5"; # product scratch
  58. $i="%l0"; # outer-loop byte offset
  59. $j="%l1"; # inner-loop byte offset
  60. $mul0="%l2"; # current multiplier word: bp[i], or ap[i] in the squaring path
  61. $mul1="%l3"; # reduction multiplier: low word times n0, masked to 32 bits
  62. $tp="%l4"; # cursor into the temporary vector kept on the stack
  63. $apj="%l5"; # preloaded ap[j]
  64. $npj="%l6"; # preloaded np[j]
  65. $tpj="%l7"; # preloaded tp[j]
  66. $fname="bn_mul_mont_int"; # symbol name emitted for the generated routine
  # First template: routine entry, the generic (ap != bp) multiply path and
  # the tail shared with the squaring path. The heredoc text is emitted
  # verbatim after variable interpolation, so it is kept untouched here.
  #
  # Entry: if num (%o5) is below 4 the routine returns 0 via retl without
  # opening a register window. Otherwise .Lenter opens a frame, scales num
  # to a byte count, loads the n0 word through its pointer, and lowers %sp
  # by num bytes (rounded down to a 1024-byte boundary) to make room for the
  # temporary vector tp, which starts at %sp+$bias+$frame.
  # A pointer compare of ap and bp (the backtick expression picks %icc or
  # %xcc and is expanded by the substitution at the end of this script)
  # diverts ap==bp to .Lbn_sqr_mont.
  #
  # .L1st        first pass, using bp[0]; stores into tp[].
  # .Louter      one pass per remaining bp[i]; $i advances by 4 while $i < num.
  # .Linner      inner loop of that pass; also adds the previous tp[j].
  # .Ltail       picks between .Lsub and .Lcopy from $car2 and from a compare
  #              of the top tp word ($car1) with the top np word ($npj).
  # .Lsub        rp = tp - np with borrow propagation (subccc); if the final
  #              borrow check passes it goes to .Lzap, otherwise it falls
  #              through into .Lcopy.
  # .Lcopy       rp = tp.
  # .Lzap        overwrites tp[] with zeros, then returns 1.
  #
  # NOTE(review): the branches in .Ltail and after .Lsub depend on computed
  # data values, so the tail's control flow is data-dependent.
  # NOTE(review): the 0x1000000 fill in ".align 16,0x1000000" is presumably
  # the SPARC nop encoding -- confirm against the assembler documentation.
  67. $code=<<___;
  68. .section ".text",#alloc,#execinstr
  69. .global $fname
  70. .align 32
  71. $fname:
  72. cmp %o5,4 ! 128 bits minimum
  73. bge,pt %icc,.Lenter
  74. sethi %hi(0xffffffff),$mask
  75. retl
  76. clr %o0
  77. .align 32
  78. .Lenter:
  79. save %sp,-$frame,%sp
  80. sll $num,2,$num ! num*=4
  81. or $mask,%lo(0xffffffff),$mask
  82. ld [$n0],$n0
  83. cmp $ap,$bp
  84. and $num,$mask,$num
  85. ld [$bp],$mul0 ! bp[0]
  86. nop
  87. add %sp,$bias,%o7 ! real top of stack
  88. ld [$ap],$car0 ! ap[0] ! redundant in squaring context
  89. sub %o7,$num,%o7
  90. ld [$ap+4],$apj ! ap[1]
  91. and %o7,-1024,%o7
  92. ld [$np],$car1 ! np[0]
  93. sub %o7,$bias,%sp ! alloca
  94. ld [$np+4],$npj ! np[1]
  95. be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
  96. mov 12,$j
  97. mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
  98. mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
  99. and $car0,$mask,$acc0
  100. add %sp,$bias+$frame,$tp
  101. ld [$ap+8],$apj !prologue!
  102. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  103. and $mul1,$mask,$mul1
  104. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  105. mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
  106. srlx $car0,32,$car0
  107. add $acc0,$car1,$car1
  108. ld [$np+8],$npj !prologue!
  109. srlx $car1,32,$car1
  110. mov $tmp0,$acc0 !prologue!
  111. .L1st:
  112. mulx $apj,$mul0,$tmp0
  113. mulx $npj,$mul1,$tmp1
  114. add $acc0,$car0,$car0
  115. ld [$ap+$j],$apj ! ap[j]
  116. and $car0,$mask,$acc0
  117. add $acc1,$car1,$car1
  118. ld [$np+$j],$npj ! np[j]
  119. srlx $car0,32,$car0
  120. add $acc0,$car1,$car1
  121. add $j,4,$j ! j++
  122. mov $tmp0,$acc0
  123. st $car1,[$tp]
  124. cmp $j,$num
  125. mov $tmp1,$acc1
  126. srlx $car1,32,$car1
  127. bl %icc,.L1st
  128. add $tp,4,$tp ! tp++
  129. !.L1st
  130. mulx $apj,$mul0,$tmp0 !epilogue!
  131. mulx $npj,$mul1,$tmp1
  132. add $acc0,$car0,$car0
  133. and $car0,$mask,$acc0
  134. add $acc1,$car1,$car1
  135. srlx $car0,32,$car0
  136. add $acc0,$car1,$car1
  137. st $car1,[$tp]
  138. srlx $car1,32,$car1
  139. add $tmp0,$car0,$car0
  140. and $car0,$mask,$acc0
  141. add $tmp1,$car1,$car1
  142. srlx $car0,32,$car0
  143. add $acc0,$car1,$car1
  144. st $car1,[$tp+4]
  145. srlx $car1,32,$car1
  146. add $car0,$car1,$car1
  147. st $car1,[$tp+8]
  148. srlx $car1,32,$car2
  149. mov 4,$i ! i++
  150. ld [$bp+4],$mul0 ! bp[1]
  151. .Louter:
  152. add %sp,$bias+$frame,$tp
  153. ld [$ap],$car0 ! ap[0]
  154. ld [$ap+4],$apj ! ap[1]
  155. ld [$np],$car1 ! np[0]
  156. ld [$np+4],$npj ! np[1]
  157. ld [$tp],$tmp1 ! tp[0]
  158. ld [$tp+4],$tpj ! tp[1]
  159. mov 12,$j
  160. mulx $car0,$mul0,$car0
  161. mulx $apj,$mul0,$tmp0 !prologue!
  162. add $tmp1,$car0,$car0
  163. ld [$ap+8],$apj !prologue!
  164. and $car0,$mask,$acc0
  165. mulx $n0,$acc0,$mul1
  166. and $mul1,$mask,$mul1
  167. mulx $car1,$mul1,$car1
  168. mulx $npj,$mul1,$acc1 !prologue!
  169. srlx $car0,32,$car0
  170. add $acc0,$car1,$car1
  171. ld [$np+8],$npj !prologue!
  172. srlx $car1,32,$car1
  173. mov $tmp0,$acc0 !prologue!
  174. .Linner:
  175. mulx $apj,$mul0,$tmp0
  176. mulx $npj,$mul1,$tmp1
  177. add $tpj,$car0,$car0
  178. ld [$ap+$j],$apj ! ap[j]
  179. add $acc0,$car0,$car0
  180. add $acc1,$car1,$car1
  181. ld [$np+$j],$npj ! np[j]
  182. and $car0,$mask,$acc0
  183. ld [$tp+8],$tpj ! tp[j]
  184. srlx $car0,32,$car0
  185. add $acc0,$car1,$car1
  186. add $j,4,$j ! j++
  187. mov $tmp0,$acc0
  188. st $car1,[$tp] ! tp[j-1]
  189. srlx $car1,32,$car1
  190. mov $tmp1,$acc1
  191. cmp $j,$num
  192. bl %icc,.Linner
  193. add $tp,4,$tp ! tp++
  194. !.Linner
  195. mulx $apj,$mul0,$tmp0 !epilogue!
  196. mulx $npj,$mul1,$tmp1
  197. add $tpj,$car0,$car0
  198. add $acc0,$car0,$car0
  199. ld [$tp+8],$tpj ! tp[j]
  200. and $car0,$mask,$acc0
  201. add $acc1,$car1,$car1
  202. srlx $car0,32,$car0
  203. add $acc0,$car1,$car1
  204. st $car1,[$tp] ! tp[j-1]
  205. srlx $car1,32,$car1
  206. add $tpj,$car0,$car0
  207. add $tmp0,$car0,$car0
  208. and $car0,$mask,$acc0
  209. add $tmp1,$car1,$car1
  210. add $acc0,$car1,$car1
  211. st $car1,[$tp+4] ! tp[j-1]
  212. srlx $car0,32,$car0
  213. add $i,4,$i ! i++
  214. srlx $car1,32,$car1
  215. add $car0,$car1,$car1
  216. cmp $i,$num
  217. add $car2,$car1,$car1
  218. st $car1,[$tp+8]
  219. srlx $car1,32,$car2
  220. bl,a %icc,.Louter
  221. ld [$bp+$i],$mul0 ! bp[i]
  222. !.Louter
  223. add $tp,12,$tp
  224. .Ltail:
  225. add $np,$num,$np
  226. add $rp,$num,$rp
  227. cmp $car2,0 ! clears %icc.c
  228. bne,pn %icc,.Lsub
  229. sub %g0,$num,%o7 ! k=-num
  230. cmp $car1,$npj ! compare top-most $tp and $np words
  231. bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
  232. nop
  233. .align 16,0x1000000
  234. .Lsub:
  235. ld [$tp+%o7],%o0
  236. ld [$np+%o7],%o1
  237. subccc %o0,%o1,%o1
  238. st %o1,[$rp+%o7]
  239. add %o7,4,%o7
  240. brnz %o7,.Lsub
  241. nop
  242. subccc $car2,0,$car2
  243. bcc %icc,.Lzap
  244. sub %g0,$num,%o7
  245. .align 16,0x1000000
  246. .Lcopy:
  247. ld [$tp+%o7],%o0
  248. st %o0,[$rp+%o7]
  249. add %o7,4,%o7
  250. brnz %o7,.Lcopy
  251. nop
  252. ba .Lzap
  253. sub %g0,$num,%o7
  254. .align 32
  255. .Lzap:
  256. st %g0,[$tp+%o7]
  257. add %o7,4,%o7
  258. brnz %o7,.Lzap
  259. nop
  260. mov 1,%i0
  261. ret
  262. restore
  263. ___
  264. ########
  265. ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
  266. ######## code without following dedicated squaring procedure.
  267. ########
  # Second template: the squaring path, entered from the first template when
  # ap and bp compare equal as pointers. On entry $mul0 already holds bp[0]
  # (the same word as ap[0]), $apj = ap[1], $car1 = np[0], $npj = np[1] and
  # $j = 12. This path reads only through $ap, which is why the $bp register
  # can be recycled as $sbit below.
  #
  # Cross products are computed once and doubled ("add acc0,acc0,acc0");
  # $sbit carries the bit shifted out of one doubled 32-bit word into the
  # next. $car0 is kept halved (srlx by 1) and restored at the end of each
  # pass ("add car0,car0,car0" followed by "or sbit").
  #
  # .Lsqr_1st       first pass (multiplier ap[0]); stores into tp[].
  # .Lsqr_2nd       second pass (multiplier ap[1]).
  # .Lsqr_outer     one pass per further ap[i], taken while i+4 < num.
  # .Lsqr_inner1    for j < i only tp[j] and np[j]*$mul1 are accumulated.
  # .Lsqr_inner2    remaining j: doubled ap[j]*$mul0 products are added too.
  # .Lsqr_no_inner2 epilogue of a pass; also the target when inner2 is skipped.
  # .Lsqr_last      final pass, then a branch to .Ltail in the first template.
  #
  # NOTE(review): in .Lsqr_2nd, .Lsqr_inner2 and .Lsqr_no_inner2 $car1 sums a
  # 32-bit carry, tp[j], a doubled-and-masked 32-bit word and a full
  # 32x32-bit product. The operand ranges add up to 2^64 + 2^32 - 2, which
  # does not fit in 64 bits. Confirm against upstream whether the squaring
  # path was corrected for this.
  268. $sbit="%i2"; # re-use $bp!
  269. $code.=<<___;
  270. .align 32
  271. .Lbn_sqr_mont:
  272. mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
  273. mulx $apj,$mul0,$tmp0 !prologue!
  274. and $car0,$mask,$acc0
  275. add %sp,$bias+$frame,$tp
  276. ld [$ap+8],$apj !prologue!
  277. mulx $n0,$acc0,$mul1 ! "t[0]"*n0
  278. srlx $car0,32,$car0
  279. and $mul1,$mask,$mul1
  280. mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
  281. mulx $npj,$mul1,$acc1 !prologue!
  282. and $car0,1,$sbit
  283. ld [$np+8],$npj !prologue!
  284. srlx $car0,1,$car0
  285. add $acc0,$car1,$car1
  286. srlx $car1,32,$car1
  287. mov $tmp0,$acc0 !prologue!
  288. .Lsqr_1st:
  289. mulx $apj,$mul0,$tmp0
  290. mulx $npj,$mul1,$tmp1
  291. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  292. add $acc1,$car1,$car1
  293. ld [$ap+$j],$apj ! ap[j]
  294. and $car0,$mask,$acc0
  295. ld [$np+$j],$npj ! np[j]
  296. srlx $car0,32,$car0
  297. add $acc0,$acc0,$acc0
  298. or $sbit,$acc0,$acc0
  299. mov $tmp1,$acc1
  300. srlx $acc0,32,$sbit
  301. add $j,4,$j ! j++
  302. and $acc0,$mask,$acc0
  303. cmp $j,$num
  304. add $acc0,$car1,$car1
  305. st $car1,[$tp]
  306. mov $tmp0,$acc0
  307. srlx $car1,32,$car1
  308. bl %icc,.Lsqr_1st
  309. add $tp,4,$tp ! tp++
  310. !.Lsqr_1st
  311. mulx $apj,$mul0,$tmp0 ! epilogue
  312. mulx $npj,$mul1,$tmp1
  313. add $acc0,$car0,$car0 ! ap[j]*a0+c0
  314. add $acc1,$car1,$car1
  315. and $car0,$mask,$acc0
  316. srlx $car0,32,$car0
  317. add $acc0,$acc0,$acc0
  318. or $sbit,$acc0,$acc0
  319. srlx $acc0,32,$sbit
  320. and $acc0,$mask,$acc0
  321. add $acc0,$car1,$car1
  322. st $car1,[$tp]
  323. srlx $car1,32,$car1
  324. add $tmp0,$car0,$car0 ! ap[j]*a0+c0
  325. add $tmp1,$car1,$car1
  326. and $car0,$mask,$acc0
  327. srlx $car0,32,$car0
  328. add $acc0,$acc0,$acc0
  329. or $sbit,$acc0,$acc0
  330. srlx $acc0,32,$sbit
  331. and $acc0,$mask,$acc0
  332. add $acc0,$car1,$car1
  333. st $car1,[$tp+4]
  334. srlx $car1,32,$car1
  335. add $car0,$car0,$car0
  336. or $sbit,$car0,$car0
  337. add $car0,$car1,$car1
  338. st $car1,[$tp+8]
  339. srlx $car1,32,$car2
  340. ld [%sp+$bias+$frame],$tmp0 ! tp[0]
  341. ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
  342. ld [%sp+$bias+$frame+8],$tpj ! tp[2]
  343. ld [$ap+4],$mul0 ! ap[1]
  344. ld [$ap+8],$apj ! ap[2]
  345. ld [$np],$car1 ! np[0]
  346. ld [$np+4],$npj ! np[1]
  347. mulx $n0,$tmp0,$mul1
  348. mulx $mul0,$mul0,$car0
  349. and $mul1,$mask,$mul1
  350. mulx $car1,$mul1,$car1
  351. mulx $npj,$mul1,$acc1
  352. add $tmp0,$car1,$car1
  353. and $car0,$mask,$acc0
  354. ld [$np+8],$npj ! np[2]
  355. srlx $car1,32,$car1
  356. add $tmp1,$car1,$car1
  357. srlx $car0,32,$car0
  358. add $acc0,$car1,$car1
  359. and $car0,1,$sbit
  360. add $acc1,$car1,$car1
  361. srlx $car0,1,$car0
  362. mov 12,$j
  363. st $car1,[%sp+$bias+$frame] ! tp[0]=
  364. srlx $car1,32,$car1
  365. add %sp,$bias+$frame+4,$tp
  366. .Lsqr_2nd:
  367. mulx $apj,$mul0,$acc0
  368. mulx $npj,$mul1,$acc1
  369. add $acc0,$car0,$car0
  370. add $tpj,$car1,$car1
  371. ld [$ap+$j],$apj ! ap[j]
  372. and $car0,$mask,$acc0
  373. ld [$np+$j],$npj ! np[j]
  374. srlx $car0,32,$car0
  375. add $acc1,$car1,$car1
  376. ld [$tp+8],$tpj ! tp[j]
  377. add $acc0,$acc0,$acc0
  378. add $j,4,$j ! j++
  379. or $sbit,$acc0,$acc0
  380. srlx $acc0,32,$sbit
  381. and $acc0,$mask,$acc0
  382. cmp $j,$num
  383. add $acc0,$car1,$car1
  384. st $car1,[$tp] ! tp[j-1]
  385. srlx $car1,32,$car1
  386. bl %icc,.Lsqr_2nd
  387. add $tp,4,$tp ! tp++
  388. !.Lsqr_2nd
  389. mulx $apj,$mul0,$acc0
  390. mulx $npj,$mul1,$acc1
  391. add $acc0,$car0,$car0
  392. add $tpj,$car1,$car1
  393. and $car0,$mask,$acc0
  394. srlx $car0,32,$car0
  395. add $acc1,$car1,$car1
  396. add $acc0,$acc0,$acc0
  397. or $sbit,$acc0,$acc0
  398. srlx $acc0,32,$sbit
  399. and $acc0,$mask,$acc0
  400. add $acc0,$car1,$car1
  401. st $car1,[$tp] ! tp[j-1]
  402. srlx $car1,32,$car1
  403. add $car0,$car0,$car0
  404. or $sbit,$car0,$car0
  405. add $car0,$car1,$car1
  406. add $car2,$car1,$car1
  407. st $car1,[$tp+4]
  408. srlx $car1,32,$car2
  409. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  410. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  411. ld [$ap+8],$mul0 ! ap[2]
  412. ld [$np],$car1 ! np[0]
  413. ld [$np+4],$npj ! np[1]
  414. mulx $n0,$tmp1,$mul1
  415. and $mul1,$mask,$mul1
  416. mov 8,$i
  417. mulx $mul0,$mul0,$car0
  418. mulx $car1,$mul1,$car1
  419. and $car0,$mask,$acc0
  420. add $tmp1,$car1,$car1
  421. srlx $car0,32,$car0
  422. add %sp,$bias+$frame,$tp
  423. srlx $car1,32,$car1
  424. and $car0,1,$sbit
  425. srlx $car0,1,$car0
  426. mov 4,$j
  427. .Lsqr_outer:
  428. .Lsqr_inner1:
  429. mulx $npj,$mul1,$acc1
  430. add $tpj,$car1,$car1
  431. add $j,4,$j
  432. ld [$tp+8],$tpj
  433. cmp $j,$i
  434. add $acc1,$car1,$car1
  435. ld [$np+$j],$npj
  436. st $car1,[$tp]
  437. srlx $car1,32,$car1
  438. bl %icc,.Lsqr_inner1
  439. add $tp,4,$tp
  440. !.Lsqr_inner1
  441. add $j,4,$j
  442. ld [$ap+$j],$apj ! ap[j]
  443. mulx $npj,$mul1,$acc1
  444. add $tpj,$car1,$car1
  445. ld [$np+$j],$npj ! np[j]
  446. add $acc0,$car1,$car1
  447. ld [$tp+8],$tpj ! tp[j]
  448. add $acc1,$car1,$car1
  449. st $car1,[$tp]
  450. srlx $car1,32,$car1
  451. add $j,4,$j
  452. cmp $j,$num
  453. be,pn %icc,.Lsqr_no_inner2
  454. add $tp,4,$tp
  455. .Lsqr_inner2:
  456. mulx $apj,$mul0,$acc0
  457. mulx $npj,$mul1,$acc1
  458. add $tpj,$car1,$car1
  459. add $acc0,$car0,$car0
  460. ld [$ap+$j],$apj ! ap[j]
  461. and $car0,$mask,$acc0
  462. ld [$np+$j],$npj ! np[j]
  463. srlx $car0,32,$car0
  464. add $acc0,$acc0,$acc0
  465. ld [$tp+8],$tpj ! tp[j]
  466. or $sbit,$acc0,$acc0
  467. add $j,4,$j ! j++
  468. srlx $acc0,32,$sbit
  469. and $acc0,$mask,$acc0
  470. cmp $j,$num
  471. add $acc0,$car1,$car1
  472. add $acc1,$car1,$car1
  473. st $car1,[$tp] ! tp[j-1]
  474. srlx $car1,32,$car1
  475. bl %icc,.Lsqr_inner2
  476. add $tp,4,$tp ! tp++
  477. .Lsqr_no_inner2:
  478. mulx $apj,$mul0,$acc0
  479. mulx $npj,$mul1,$acc1
  480. add $tpj,$car1,$car1
  481. add $acc0,$car0,$car0
  482. and $car0,$mask,$acc0
  483. srlx $car0,32,$car0
  484. add $acc0,$acc0,$acc0
  485. or $sbit,$acc0,$acc0
  486. srlx $acc0,32,$sbit
  487. and $acc0,$mask,$acc0
  488. add $acc0,$car1,$car1
  489. add $acc1,$car1,$car1
  490. st $car1,[$tp] ! tp[j-1]
  491. srlx $car1,32,$car1
  492. add $car0,$car0,$car0
  493. or $sbit,$car0,$car0
  494. add $car0,$car1,$car1
  495. add $car2,$car1,$car1
  496. st $car1,[$tp+4]
  497. srlx $car1,32,$car2
  498. add $i,4,$i ! i++
  499. ld [%sp+$bias+$frame],$tmp1 ! tp[0]
  500. ld [%sp+$bias+$frame+4],$tpj ! tp[1]
  501. ld [$ap+$i],$mul0 ! ap[j]
  502. ld [$np],$car1 ! np[0]
  503. ld [$np+4],$npj ! np[1]
  504. mulx $n0,$tmp1,$mul1
  505. and $mul1,$mask,$mul1
  506. add $i,4,$tmp0
  507. mulx $mul0,$mul0,$car0
  508. mulx $car1,$mul1,$car1
  509. and $car0,$mask,$acc0
  510. add $tmp1,$car1,$car1
  511. srlx $car0,32,$car0
  512. add %sp,$bias+$frame,$tp
  513. srlx $car1,32,$car1
  514. and $car0,1,$sbit
  515. srlx $car0,1,$car0
  516. cmp $tmp0,$num ! i<num-1
  517. bl %icc,.Lsqr_outer
  518. mov 4,$j
  519. .Lsqr_last:
  520. mulx $npj,$mul1,$acc1
  521. add $tpj,$car1,$car1
  522. add $j,4,$j
  523. ld [$tp+8],$tpj
  524. cmp $j,$i
  525. add $acc1,$car1,$car1
  526. ld [$np+$j],$npj
  527. st $car1,[$tp]
  528. srlx $car1,32,$car1
  529. bl %icc,.Lsqr_last
  530. add $tp,4,$tp
  531. !.Lsqr_last
  532. mulx $npj,$mul1,$acc1
  533. add $tpj,$car1,$car1
  534. add $acc0,$car1,$car1
  535. add $acc1,$car1,$car1
  536. st $car1,[$tp]
  537. srlx $car1,32,$car1
  538. add $car0,$car0,$car0 ! recover $car0
  539. or $sbit,$car0,$car0
  540. add $car0,$car1,$car1
  541. add $car2,$car1,$car1
  542. st $car1,[$tp+4]
  543. srlx $car1,32,$car2
  544. ba .Ltail
  545. add $tp,8,$tp
  546. .type $fname,#function
  547. .size $fname,(.-$fname)
  548. ___
  549. $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every `...` span in the template with the result of eval-ing it as Perl
  550. print $code; # write the generated assembly to STDOUT
  551. close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface at close; fail loudly instead of leaving a truncated file