sparcv8plus.S 32 KB


  1. .ident "sparcv8plus.s, Version 1.4"
  2. .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
  3. /*
  4. * ====================================================================
  5. * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  6. * project.
  7. *
  8. * Rights for redistribution and usage in source and binary forms are
  9. * granted according to the OpenSSL license. Warranty of any kind is
  10. * disclaimed.
  11. * ====================================================================
  12. */
  13. /*
  14. * This is my modest contribution to the OpenSSL project (see
  15. * http://www.openssl.org/ for more information about it) and is
  16. * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
  17. * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  18. *
  19. * Questions-n-answers.
  20. *
  21. * Q. How to compile?
  22. * A. With SC4.x/SC5.x:
  23. *
  24. * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  25. *
  26. * and with gcc:
  27. *
  28. * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
  29. *
  30. * or if above fails (it does if you have gas installed):
  31. *
  32. * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
  33. *
  34. * Quick-n-dirty way to fuse the module into the library.
  35. * Provided that the library is already configured and built
  36. * (in 0.9.2 case with no-asm option):
  37. *
  38. * # cd crypto/bn
  39. * # cp /some/place/bn_asm.sparc.v8plus.S .
  40. * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  41. * # make
  42. * # cd ../..
  43. * # make; make test
  44. *
  45. * Quick-n-dirty way to get rid of it:
  46. *
  47. * # cd crypto/bn
  48. * # touch bn_asm.c
  49. * # make
  50. * # cd ../..
  51. * # make; make test
  52. *
  53. * Q. V8plus architecture? What kind of beast is that?
  54. * A. Well, it's rather a programming model than an architecture...
  55. * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
  56. * special conditions, namely when kernel doesn't preserve upper
  57. * 32 bits of otherwise 64-bit registers during a context switch.
  58. *
  59. * Q. Why just UltraSPARC? What about SuperSPARC?
  60. * A. Original release did target UltraSPARC only. Now SuperSPARC
  61. * version is provided along. Both versions share bn_*comba[48]
  62. * implementations (see comment later in code for explanation).
  63. * But what's so special about this UltraSPARC implementation?
  64. * Why didn't I let compiler do the job? Trouble is that most of
  65. * available compilers (well, SC5.0 is the only exception) don't
  66. * attempt to take advantage of UltraSPARC's 64-bitness under
  67. * 32-bit kernels even though it's perfectly possible (see next
  68. * question).
  69. *
  70. * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  71. * doesn't work?
  72. * A. You can't address *all* registers as 64-bit wide:-( The catch is
  73. * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
  74. * preserved if you're in a leaf function, i.e. such never calling
  75. * any other functions. All functions in this module are leaf and
  76. * 10 registers is a handful. And as a matter of fact non-"comba"
  77. * routines don't require even that much and I could even afford to
  78. * not allocate own stack frame for 'em:-)
  79. *
  80. * Q. What about 64-bit kernels?
  81. * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
  82. * under evaluation and development...
  83. *
  84. * Q. What about shared libraries?
  85. * A. What about 'em? Kidding again:-) Code does *not* contain any
  86. * code position dependencies and it's safe to include it into
  87. * shared library as is.
  88. *
  89. * Q. How much faster does it go?
  90. * A. Do you have a good benchmark? In either case below is what I
  91. * experience with crypto/bn/expspeed.c test program:
  92. *
  93. * v8plus module on U10/300MHz against bn_asm.c compiled with:
  94. *
  95. * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
  96. * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
  97. * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
  98. *
  99. * v8 module on SS10/60MHz against bn_asm.c compiled with:
  100. *
  101. * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
  102. * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
  103. * egcs-1.1.2 -mv8 -O3 +35-45%
  104. *
  105. * As you can see it's damn hard to beat the new Sun C compiler
  106. * and it's in first place GNU C users who will appreciate this
  107. * assembler implementation:-)
  108. */
  109. /*
  110. * Revision history.
  111. *
  112. * 1.0 - initial release;
  113. * 1.1 - new loop unrolling model(*);
  114. * - some more fine tuning;
  115. * 1.2 - made gas friendly;
  116. * - updates to documentation concerning v9;
  117. * - new performance comparison matrix;
  118. * 1.3 - fixed problem with /usr/ccs/lib/cpp;
  119. * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
  120. * resulting in slight overall performance kick;
  121. * - some retunes;
  122. * - support for GNU as added;
  123. *
  124. * (*) Originally unrolled loop looked like this:
  125. * for (;;) {
  126. * op(p+0); if (--n==0) break;
  127. * op(p+1); if (--n==0) break;
  128. * op(p+2); if (--n==0) break;
  129. * op(p+3); if (--n==0) break;
  130. * p+=4;
  131. * }
  132. * I unroll according to following:
  133. * while (n&~3) {
  134. * op(p+0); op(p+1); op(p+2); op(p+3);
  135. * p+=4; n-=4;
  136. * }
  137. * if (n) {
  138. * op(p+0); if (--n==0) return;
  139. * op(p+1); if (--n==0) return;
  140. * op(p+2); return;
  141. * }
  142. */
  143. #ifdef OPENSSL_FIPSCANISTER
  144. #include <openssl/fipssyms.h>
  145. #endif
  146. #if defined(__SUNPRO_C) && defined(__sparcv9)
  147. /* They've said -xarch=v9 at command line */
  148. .register %g2,#scratch ! declare %g2 to the assembler as a scratch register
  149. .register %g3,#scratch ! likewise %g3
  150. # define FRAME_SIZE -192 /* stack adjustment handed to save by the comba routines, 64-bit build */
  151. #elif defined(__GNUC__) && defined(__arch64__)
  152. /* They've said -m64 at command line */
  153. .register %g2,#scratch ! declare %g2 to the assembler as a scratch register
  154. .register %g3,#scratch ! likewise %g3
  155. # define FRAME_SIZE -192 /* same value as the Sun compiler 64-bit branch above */
  156. #else
  157. # define FRAME_SIZE -96 /* stack adjustment handed to save in every other build */
  158. #endif
  159. /*
  160. * GNU assembler can't stand stuw:-( so every stuw in this file is
  161. * turned into a plain st by the preprocessor. */
  162. #define stuw st
  163. .section ".text",#alloc,#execinstr
  164. .file "bn_asm.sparc.v8plus.S"
  165. .align 32
  166. .global bn_mul_add_words
  167. /* rp[i] += ap[i]*w for i = 0..num-1, carrying one 32-bit word between steps.
  168. * BN_ULONG bn_mul_add_words(rp,ap,num,w)
  169. * BN_ULONG *rp,*ap;
  170. * int num;
  171. * BN_ULONG w;
  172. * Returns the carry out of the last word, or 0 when num is not positive. */
  173. bn_mul_add_words: ! in: %o0=rp %o1=ap %o2=num %o3=w, out: %o0=carry
  174. sra %o2,%g0,%o2 ! signx %o2
  175. brgz,a %o2,.L_bn_mul_add_words_proceed ! num positive: go do the work
  176. lduw [%o1],%g2 ! annulled delay slot: preload ap[0], only when the branch is taken
  177. retl ! num zero or negative
  178. clr %o0 ! ... return 0
  179. nop ! NOTE(review): the three nops look like alignment padding for the code below - confirm
  180. nop
  181. nop
  182. .L_bn_mul_add_words_proceed:
  183. srl %o3,%g0,%o3 ! clruw %o3
  184. andcc %o2,-4,%g0 ! is there at least one full group of 4 words?
  185. bz,pn %icc,.L_bn_mul_add_words_tail ! no: only 1..3 words to process
  186. clr %o5 ! delay slot, always executed: carry = 0
  187. .L_bn_mul_add_words_loop: ! wow! 32 aligned!
  188. lduw [%o0],%g1 ! %g1 = rp[0]
  189. lduw [%o1+4],%g3 ! %g3 = ap[1], fetched ahead of use
  190. mulx %o3,%g2,%g2 ! %g2 = w*ap[0] as a 64-bit product
  191. add %g1,%o5,%o4 ! rp[0] + carry
  192. nop
  193. add %o4,%g2,%o4 ! ... + product
  194. stuw %o4,[%o0] ! low 32 bits go back to rp[0]
  195. srlx %o4,32,%o5 ! high 32 bits become the next carry
  196. lduw [%o0+4],%g1 ! second word: same pattern with rp[1], ap[1]
  197. lduw [%o1+8],%g2
  198. mulx %o3,%g3,%g3
  199. add %g1,%o5,%o4
  200. dec 4,%o2 ! num -= 4
  201. add %o4,%g3,%o4
  202. stuw %o4,[%o0+4]
  203. srlx %o4,32,%o5
  204. lduw [%o0+8],%g1 ! third word
  205. lduw [%o1+12],%g3
  206. mulx %o3,%g2,%g2
  207. add %g1,%o5,%o4
  208. inc 16,%o1 ! ap += 4 words
  209. add %o4,%g2,%o4
  210. stuw %o4,[%o0+8]
  211. srlx %o4,32,%o5
  212. lduw [%o0+12],%g1 ! fourth word
  213. mulx %o3,%g3,%g3
  214. add %g1,%o5,%o4
  215. inc 16,%o0 ! rp += 4 words
  216. add %o4,%g3,%o4
  217. andcc %o2,-4,%g0 ! another full group of 4 left?
  218. stuw %o4,[%o0-4] ! fourth word, addressed relative to the already advanced rp
  219. srlx %o4,32,%o5
  220. bnz,a,pt %icc,.L_bn_mul_add_words_loop ! yes: next group
  221. lduw [%o1],%g2 ! annulled delay slot: preload the next ap[0]
  222. brnz,a,pn %o2,.L_bn_mul_add_words_tail ! 1..3 words left over
  223. lduw [%o1],%g2 ! annulled delay slot: preload ap[0] for the tail
  224. .L_bn_mul_add_words_return:
  225. retl
  226. mov %o5,%o0 ! delay slot: return the carry
  227. .L_bn_mul_add_words_tail: ! up to three leftover words, not unrolled into a loop
  228. lduw [%o0],%g1
  229. mulx %o3,%g2,%g2
  230. add %g1,%o5,%o4
  231. dec %o2 ! num -= 1
  232. add %o4,%g2,%o4
  233. srlx %o4,32,%o5
  234. brz,pt %o2,.L_bn_mul_add_words_return ! that was the last word
  235. stuw %o4,[%o0] ! delay slot, always executed: store the word
  236. lduw [%o1+4],%g2
  237. lduw [%o0+4],%g1
  238. mulx %o3,%g2,%g2
  239. add %g1,%o5,%o4
  240. dec %o2
  241. add %o4,%g2,%o4
  242. srlx %o4,32,%o5
  243. brz,pt %o2,.L_bn_mul_add_words_return
  244. stuw %o4,[%o0+4]
  245. lduw [%o1+8],%g2
  246. lduw [%o0+8],%g1
  247. mulx %o3,%g2,%g2
  248. add %g1,%o5,%o4
  249. add %o4,%g2,%o4
  250. stuw %o4,[%o0+8]
  251. retl
  252. srlx %o4,32,%o0 ! delay slot: the final carry is returned directly
  253. .type bn_mul_add_words,#function
  254. .size bn_mul_add_words,(.-bn_mul_add_words)
  255. .align 32
  256. .global bn_mul_words
  257. /* rp[i] = low word of (ap[i]*w + carry) for i = 0..num-1.
  258. * BN_ULONG bn_mul_words(rp,ap,num,w)
  259. * BN_ULONG *rp,*ap;
  260. * int num;
  261. * BN_ULONG w;
  262. * Returns the carry out of the last word, or 0 when num is not positive. */
  263. bn_mul_words: ! in: %o0=rp %o1=ap %o2=num %o3=w, out: %o0=carry
  264. sra %o2,%g0,%o2 ! signx %o2
  265. brgz,a %o2,.L_bn_mul_words_proceeed ! num positive: go do the work
  266. lduw [%o1],%g2 ! annulled delay slot: preload ap[0], only when the branch is taken
  267. retl ! num zero or negative
  268. clr %o0 ! ... return 0
  269. nop ! NOTE(review): the three nops look like alignment padding for the code below - confirm
  270. nop
  271. nop
  272. .L_bn_mul_words_proceeed:
  273. srl %o3,%g0,%o3 ! clruw %o3
  274. andcc %o2,-4,%g0 ! is there at least one full group of 4 words?
  275. bz,pn %icc,.L_bn_mul_words_tail ! no: only 1..3 words to process
  276. clr %o5 ! delay slot, always executed: carry = 0
  277. .L_bn_mul_words_loop: ! wow! 32 aligned!
  278. lduw [%o1+4],%g3 ! %g3 = ap[1], fetched ahead of use
  279. mulx %o3,%g2,%g2 ! %g2 = w*ap[0] as a 64-bit product
  280. add %g2,%o5,%o4 ! product + carry
  281. nop
  282. stuw %o4,[%o0] ! low 32 bits are the result word
  283. srlx %o4,32,%o5 ! high 32 bits become the next carry
  284. lduw [%o1+8],%g2 ! second word
  285. mulx %o3,%g3,%g3
  286. add %g3,%o5,%o4
  287. dec 4,%o2 ! num -= 4
  288. stuw %o4,[%o0+4]
  289. srlx %o4,32,%o5
  290. lduw [%o1+12],%g3 ! third word
  291. mulx %o3,%g2,%g2
  292. add %g2,%o5,%o4
  293. inc 16,%o1 ! ap += 4 words
  294. stuw %o4,[%o0+8]
  295. srlx %o4,32,%o5
  296. mulx %o3,%g3,%g3 ! fourth word
  297. add %g3,%o5,%o4
  298. inc 16,%o0 ! rp += 4 words
  299. stuw %o4,[%o0-4] ! addressed relative to the already advanced rp
  300. srlx %o4,32,%o5
  301. andcc %o2,-4,%g0 ! another full group of 4 left?
  302. bnz,a,pt %icc,.L_bn_mul_words_loop ! yes: next group
  303. lduw [%o1],%g2 ! annulled delay slot: preload the next ap[0]
  304. nop
  305. nop
  306. brnz,a,pn %o2,.L_bn_mul_words_tail ! 1..3 words left over
  307. lduw [%o1],%g2 ! annulled delay slot: preload ap[0] for the tail
  308. .L_bn_mul_words_return:
  309. retl
  310. mov %o5,%o0 ! delay slot: return the carry
  311. .L_bn_mul_words_tail: ! up to three leftover words
  312. mulx %o3,%g2,%g2
  313. add %g2,%o5,%o4
  314. dec %o2 ! num -= 1
  315. srlx %o4,32,%o5
  316. brz,pt %o2,.L_bn_mul_words_return ! that was the last word
  317. stuw %o4,[%o0] ! delay slot, always executed: store the word
  318. lduw [%o1+4],%g2
  319. mulx %o3,%g2,%g2
  320. add %g2,%o5,%o4
  321. dec %o2
  322. srlx %o4,32,%o5
  323. brz,pt %o2,.L_bn_mul_words_return
  324. stuw %o4,[%o0+4]
  325. lduw [%o1+8],%g2
  326. mulx %o3,%g2,%g2
  327. add %g2,%o5,%o4
  328. stuw %o4,[%o0+8]
  329. retl
  330. srlx %o4,32,%o0 ! delay slot: the final carry is returned directly
  331. .type bn_mul_words,#function
  332. .size bn_mul_words,(.-bn_mul_words)
  333. .align 32
  334. .global bn_sqr_words
  335. /* Squares each word of a: r[2i] = low word, r[2i+1] = high word of a[i]*a[i].
  336. * void bn_sqr_words(r,a,n)
  337. * BN_ULONG *r,*a;
  338. * int n;
  339. * Does nothing when n is not positive; %o0 is cleared on every exit path. */
  340. bn_sqr_words: ! in: %o0=r %o1=a %o2=n
  341. sra %o2,%g0,%o2 ! signx %o2
  342. brgz,a %o2,.L_bn_sqr_words_proceeed ! n positive: go do the work
  343. lduw [%o1],%g2 ! annulled delay slot: preload a[0], only when the branch is taken
  344. retl ! n zero or negative
  345. clr %o0
  346. nop ! NOTE(review): the nops here and below look like alignment/scheduling padding - confirm
  347. nop
  348. nop
  349. .L_bn_sqr_words_proceeed:
  350. andcc %o2,-4,%g0 ! is there at least one full group of 4 words?
  351. nop
  352. bz,pn %icc,.L_bn_sqr_words_tail ! no: only 1..3 words to process
  353. nop
  354. .L_bn_sqr_words_loop: ! wow! 32 aligned!
  355. lduw [%o1+4],%g3 ! %g3 = a[1], fetched ahead of use
  356. mulx %g2,%g2,%o4 ! %o4 = a[0]*a[0] as a 64-bit square
  357. stuw %o4,[%o0] ! r[0] = low word
  358. srlx %o4,32,%o5
  359. stuw %o5,[%o0+4] ! r[1] = high word
  360. nop
  361. lduw [%o1+8],%g2 ! second word -> r[2], r[3]
  362. mulx %g3,%g3,%o4
  363. dec 4,%o2 ! n -= 4
  364. stuw %o4,[%o0+8]
  365. srlx %o4,32,%o5
  366. stuw %o5,[%o0+12]
  367. lduw [%o1+12],%g3 ! third word -> r[4], r[5]
  368. mulx %g2,%g2,%o4
  369. srlx %o4,32,%o5
  370. stuw %o4,[%o0+16]
  371. inc 16,%o1 ! a += 4 words
  372. stuw %o5,[%o0+20]
  373. mulx %g3,%g3,%o4 ! fourth word -> r[6], r[7]
  374. inc 32,%o0 ! r += 8 words
  375. stuw %o4,[%o0-8] ! addressed relative to the already advanced r
  376. srlx %o4,32,%o5
  377. andcc %o2,-4,%g2 ! another full group left? (result lands in %g2, which both taken branches below reload)
  378. stuw %o5,[%o0-4]
  379. bnz,a,pt %icc,.L_bn_sqr_words_loop ! yes: next group
  380. lduw [%o1],%g2 ! annulled delay slot: preload the next a[0]
  381. nop
  382. brnz,a,pn %o2,.L_bn_sqr_words_tail ! 1..3 words left over
  383. lduw [%o1],%g2 ! annulled delay slot: preload a[0] for the tail
  384. .L_bn_sqr_words_return:
  385. retl
  386. clr %o0
  387. .L_bn_sqr_words_tail: ! up to three leftover words
  388. mulx %g2,%g2,%o4
  389. dec %o2 ! n -= 1
  390. stuw %o4,[%o0]
  391. srlx %o4,32,%o5
  392. brz,pt %o2,.L_bn_sqr_words_return ! that was the last word
  393. stuw %o5,[%o0+4] ! delay slot, always executed: store the high word
  394. lduw [%o1+4],%g2
  395. mulx %g2,%g2,%o4
  396. dec %o2
  397. stuw %o4,[%o0+8]
  398. srlx %o4,32,%o5
  399. brz,pt %o2,.L_bn_sqr_words_return
  400. stuw %o5,[%o0+12]
  401. lduw [%o1+8],%g2
  402. mulx %g2,%g2,%o4
  403. srlx %o4,32,%o5
  404. stuw %o4,[%o0+16]
  405. stuw %o5,[%o0+20]
  406. retl
  407. clr %o0
  408. .type bn_sqr_words,#function
  409. .size bn_sqr_words,(.-bn_sqr_words)
  410. .align 32
  411. .global bn_div_words
  412. /* Divides the 64-bit value (h<<32)|l by d and returns the low 32 bits
  413. * BN_ULONG bn_div_words(h,l,d)
  414. * BN_ULONG h,l,d;
  * of the quotient (a quotient wider than 32 bits is truncated).
  * l and d are zero-extended before use, the same way w is in
  * bn_mul_words: they feed 64-bit or/udivx, so stale upper register
  * halves would corrupt the dividend or the divisor. h needs no such
  * step because the shift discards its upper half.
  * d == 0 is not special-cased; udivx takes the division-by-zero trap.
  415. */
  416. bn_div_words: ! in: %o0=h %o1=l %o2=d, out: %o0=quotient
  srl %o1,%g0,%o1 ! clruw %o1
  srl %o2,%g0,%o2 ! clruw %o2
  417. sllx %o0,32,%o0 ! h moves to the upper word
  418. or %o0,%o1,%o0 ! dividend = (h<<32)|l
  419. udivx %o0,%o2,%o0 ! 64-bit unsigned divide
  420. retl
  421. srl %o0,%g0,%o0 ! clruw %o0
  422. .type bn_div_words,#function
  423. .size bn_div_words,(.-bn_div_words)
  424. .align 32
  425. .global bn_add_words
  426. /* rp[i] = ap[i] + bp[i] + carry for i = 0..n-1.
  427. * BN_ULONG bn_add_words(rp,ap,bp,n)
  428. * BN_ULONG *rp,*ap,*bp;
  429. * int n;
  430. * Returns 1 if the last addition carried out, else 0 (also 0 when n is not positive). */
  431. bn_add_words: ! in: %o0=rp %o1=ap %o2=bp %o3=n, out: %o0=carry
  432. sra %o3,%g0,%o3 ! signx %o3
  433. brgz,a %o3,.L_bn_add_words_proceed ! n positive: go do the work
  434. lduw [%o1],%o4 ! annulled delay slot: preload ap[0], only when the branch is taken
  435. retl ! n zero or negative
  436. clr %o0 ! ... return 0
  437. .L_bn_add_words_proceed:
  438. andcc %o3,-4,%g0 ! is there at least one full group of 4 words?
  439. bz,pn %icc,.L_bn_add_words_tail ! no: only 1..3 words to process
  440. addcc %g0,0,%g0 ! clear carry flag
  441. .L_bn_add_words_loop: ! wow! 32 aligned!
  442. dec 4,%o3 ! n -= 4; dec/inc/and below leave the condition codes alone, so the carry chain survives
  443. lduw [%o2],%o5 ! %o5 = bp[0]
  444. lduw [%o1+4],%g1 ! %g1 = ap[1]
  445. lduw [%o2+4],%g2 ! %g2 = bp[1]
  446. lduw [%o1+8],%g3 ! %g3 = ap[2]
  447. lduw [%o2+8],%g4 ! %g4 = bp[2]
  448. addccc %o5,%o4,%o5 ! word 0: add with carry in, carry out
  449. stuw %o5,[%o0]
  450. lduw [%o1+12],%o4 ! %o4 = ap[3]
  451. lduw [%o2+12],%o5 ! %o5 = bp[3]
  452. inc 16,%o1 ! ap += 4 words
  453. addccc %g1,%g2,%g1 ! word 1
  454. stuw %g1,[%o0+4]
  455. inc 16,%o2 ! bp += 4 words
  456. addccc %g3,%g4,%g3 ! word 2
  457. stuw %g3,[%o0+8]
  458. inc 16,%o0 ! rp += 4 words
  459. addccc %o5,%o4,%o5 ! word 3
  460. stuw %o5,[%o0-4] ! addressed relative to the already advanced rp
  461. and %o3,-4,%g1 ! plain and, not andcc: the loop test must not disturb the carry
  462. brnz,a,pt %g1,.L_bn_add_words_loop ! another full group: loop on the register value
  463. lduw [%o1],%o4 ! annulled delay slot: preload the next ap[0]
  464. brnz,a,pn %o3,.L_bn_add_words_tail ! 1..3 words left over
  465. lduw [%o1],%o4 ! annulled delay slot: preload ap[0] for the tail
  466. .L_bn_add_words_return:
  467. clr %o0
  468. retl
  469. movcs %icc,1,%o0 ! delay slot: return 1 when the 32-bit carry is set
  470. nop
  471. .L_bn_add_words_tail: ! up to three leftover words
  472. lduw [%o2],%o5
  473. dec %o3 ! n -= 1
  474. addccc %o5,%o4,%o5
  475. brz,pt %o3,.L_bn_add_words_return ! that was the last word
  476. stuw %o5,[%o0] ! delay slot, always executed: store the word
  477. lduw [%o1+4],%o4
  478. lduw [%o2+4],%o5
  479. dec %o3
  480. addccc %o5,%o4,%o5
  481. brz,pt %o3,.L_bn_add_words_return
  482. stuw %o5,[%o0+4]
  483. lduw [%o1+8],%o4
  484. lduw [%o2+8],%o5
  485. addccc %o5,%o4,%o5
  486. stuw %o5,[%o0+8]
  487. clr %o0
  488. retl
  489. movcs %icc,1,%o0 ! delay slot: return 1 when the 32-bit carry is set
  490. .type bn_add_words,#function
  491. .size bn_add_words,(.-bn_add_words)
  492. .global bn_sub_words
  493. /* rp[i] = ap[i] - bp[i] - borrow for i = 0..n-1.
  494. * BN_ULONG bn_sub_words(rp,ap,bp,n)
  495. * BN_ULONG *rp,*ap,*bp;
  496. * int n;
  497. * Returns 1 if the last subtraction borrowed, else 0 (also 0 when n is not positive). */
  498. bn_sub_words: ! in: %o0=rp %o1=ap %o2=bp %o3=n, out: %o0=borrow
  499. sra %o3,%g0,%o3 ! signx %o3
  500. brgz,a %o3,.L_bn_sub_words_proceed ! n positive: go do the work
  501. lduw [%o1],%o4 ! annulled delay slot: preload ap[0], only when the branch is taken
  502. retl ! n zero or negative
  503. clr %o0 ! ... return 0
  504. .L_bn_sub_words_proceed:
  505. andcc %o3,-4,%g0 ! is there at least one full group of 4 words?
  506. bz,pn %icc,.L_bn_sub_words_tail ! no: only 1..3 words to process
  507. addcc %g0,0,%g0 ! clear carry flag
  508. .L_bn_sub_words_loop: ! wow! 32 aligned!
  509. dec 4,%o3 ! n -= 4; dec/inc/and below leave the condition codes alone, so the borrow chain survives
  510. lduw [%o2],%o5 ! %o5 = bp[0]
  511. lduw [%o1+4],%g1 ! %g1 = ap[1]
  512. lduw [%o2+4],%g2 ! %g2 = bp[1]
  513. lduw [%o1+8],%g3 ! %g3 = ap[2]
  514. lduw [%o2+8],%g4 ! %g4 = bp[2]
  515. subccc %o4,%o5,%o5 ! word 0: ap - bp - borrow, borrow out in the carry flag
  516. stuw %o5,[%o0]
  517. lduw [%o1+12],%o4 ! %o4 = ap[3]
  518. lduw [%o2+12],%o5 ! %o5 = bp[3]
  519. inc 16,%o1 ! ap += 4 words
  520. subccc %g1,%g2,%g2 ! word 1
  521. stuw %g2,[%o0+4]
  522. inc 16,%o2 ! bp += 4 words
  523. subccc %g3,%g4,%g4 ! word 2
  524. stuw %g4,[%o0+8]
  525. inc 16,%o0 ! rp += 4 words
  526. subccc %o4,%o5,%o5 ! word 3
  527. stuw %o5,[%o0-4] ! addressed relative to the already advanced rp
  528. and %o3,-4,%g1 ! plain and, not andcc: the loop test must not disturb the borrow
  529. brnz,a,pt %g1,.L_bn_sub_words_loop ! another full group: loop on the register value
  530. lduw [%o1],%o4 ! annulled delay slot: preload the next ap[0]
  531. brnz,a,pn %o3,.L_bn_sub_words_tail ! 1..3 words left over
  532. lduw [%o1],%o4 ! annulled delay slot: preload ap[0] for the tail
  533. .L_bn_sub_words_return:
  534. clr %o0
  535. retl
  536. movcs %icc,1,%o0 ! delay slot: return 1 when the 32-bit carry (borrow) is set
  537. nop
  538. .L_bn_sub_words_tail: ! wow! 32 aligned!
  539. lduw [%o2],%o5
  540. dec %o3 ! n -= 1
  541. subccc %o4,%o5,%o5
  542. brz,pt %o3,.L_bn_sub_words_return ! that was the last word
  543. stuw %o5,[%o0] ! delay slot, always executed: store the word
  544. lduw [%o1+4],%o4
  545. lduw [%o2+4],%o5
  546. dec %o3
  547. subccc %o4,%o5,%o5
  548. brz,pt %o3,.L_bn_sub_words_return
  549. stuw %o5,[%o0+4]
  550. lduw [%o1+8],%o4
  551. lduw [%o2+8],%o5
  552. subccc %o4,%o5,%o5
  553. stuw %o5,[%o0+8]
  554. clr %o0
  555. retl
  556. movcs %icc,1,%o0 ! delay slot: return 1 when the 32-bit carry (borrow) is set
  557. .type bn_sub_words,#function
  558. .size bn_sub_words,(.-bn_sub_words)
  559. /*
  560. * Code below depends on the fact that upper parts of the %l0-%l7
  561. * and %i0-%i7 are zeroed by kernel after context switch. In
  562. * previous versions this comment stated that "the trouble is that
  563. * it's not feasible to implement the mumbo-jumbo in less V9
  564. * instructions:-(" which apparently isn't true thanks to
  565. * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
  566. * results not from the shorter code, but from elimination of
  567. * multicycle non-pairable 'rd %y,%rd' instructions.
  568. *
  569. * Andy.
  570. */
  571. /*
  572. * Here is register usage map for *all* routines below.
  573. * These names are preprocessor macros, so they only exist after cpp has run. */
  574. #define t_1 %o0 /* scratch: receives each 64-bit product, and the column sum before it is stored */
  575. #define t_2 %o1 /* constant 2^32, built once per routine with mov 1 / sllx 32 */
  576. #define c_12 %o2 /* 64-bit running sum of the current column */
  577. #define c_3 %o3 /* overflow out of c_12, bumped by t_2 for every 64-bit carry */
  578. #define ap(I) [%i1+4*I] /* I-th 32-bit word of a (second argument after save) */
  579. #define bp(I) [%i2+4*I] /* I-th 32-bit word of b (third argument after save) */
  580. #define rp(I) [%i0+4*I] /* I-th 32-bit word of r (first argument after save) */
  581. #define a_0 %l0 /* a_N holds a[N] once it has been loaded */
  582. #define a_1 %l1
  583. #define a_2 %l2
  584. #define a_3 %l3
  585. #define a_4 %l4
  586. #define a_5 %l5
  587. #define a_6 %l6
  588. #define a_7 %l7
  589. #define b_0 %i3 /* b_N holds b[N] once it has been loaded */
  590. #define b_1 %i4
  591. #define b_2 %i5
  592. #define b_3 %o4
  593. #define b_4 %o5
  594. #define b_5 %o7 /* NOTE(review): %o7 is the call return-address register; presumably free here because the comba routines make no calls - confirm */
  595. #define b_6 %g1
  596. #define b_7 %g4
  597. .align 32
  598. .global bn_mul_comba8
  599. /* r[0..15] = a[0..7] * b[0..7], computed one result column at a time.
  600. * void bn_mul_comba8(r,a,b)
  601. * BN_ULONG *r,*a,*b;
  602. * Each mul_add_c step adds one 64-bit product into c_12; a carry out of
  * 64 bits is recorded by adding t_2 (2^32) to c_3. At the end of a column
  * the low word is stored, c_12 is shifted down one word and c_3 is or-ed
  * in as its new upper word. */
  603. bn_mul_comba8:
  604. save %sp,FRAME_SIZE,%sp ! new register window: r,a,b are now in %i0,%i1,%i2
  605. mov 1,t_2
  606. lduw ap(0),a_0
  607. sllx t_2,32,t_2 ! t_2 = 2^32, the unit added to c_3 per carry
  608. lduw bp(0),b_0 !=
  609. lduw bp(1),b_1
  610. mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
  611. srlx t_1,32,c_12 ! high word of the first product seeds the next column
  612. stuw t_1,rp(0) !=!r[0]=c1;
  613. lduw ap(1),a_1
  614. mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
  615. addcc c_12,t_1,c_12
  616. clr c_3 !=
  617. bcs,a %xcc,.+8 ! 64-bit carry out? target is simply the instruction after the delay slot
  618. add c_3,t_2,c_3 ! annulled delay slot: runs only on carry, c_3 += 2^32
  619. lduw ap(2),a_2
  620. mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
  621. addcc c_12,t_1,t_1 ! last product of the column: the sum lands in t_1
  622. bcs,a %xcc,.+8
  623. add c_3,t_2,c_3
  624. srlx t_1,32,c_12 !=! shift the sum down one word
  625. stuw t_1,rp(1) !r[1]=c2;
  626. or c_12,c_3,c_12 ! carries become the upper word of the next column sum
  627. mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
  628. addcc c_12,t_1,c_12 !=
  629. clr c_3
  630. bcs,a %xcc,.+8
  631. add c_3,t_2,c_3
  632. lduw bp(2),b_2 !=
  633. mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
  634. addcc c_12,t_1,c_12
  635. bcs,a %xcc,.+8
  636. add c_3,t_2,c_3 !=
  637. lduw bp(3),b_3
  638. mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
  639. addcc c_12,t_1,t_1
  640. bcs,a %xcc,.+8 !=
  641. add c_3,t_2,c_3
  642. srlx t_1,32,c_12
  643. stuw t_1,rp(2) !r[2]=c3;
  644. or c_12,c_3,c_12 !=
  645. mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
  646. addcc c_12,t_1,c_12
  647. clr c_3
  648. bcs,a %xcc,.+8 !=
  649. add c_3,t_2,c_3
  650. mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
  651. addcc c_12,t_1,c_12
  652. bcs,a %xcc,.+8 !=
  653. add c_3,t_2,c_3
  654. lduw ap(3),a_3
  655. mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
  656. addcc c_12,t_1,c_12 !=
  657. bcs,a %xcc,.+8
  658. add c_3,t_2,c_3
  659. lduw ap(4),a_4
  660. mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
  661. addcc c_12,t_1,t_1
  662. bcs,a %xcc,.+8
  663. add c_3,t_2,c_3
  664. srlx t_1,32,c_12 !=
  665. stuw t_1,rp(3) !r[3]=c1;
  666. or c_12,c_3,c_12
  667. mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
  668. addcc c_12,t_1,c_12 !=
  669. clr c_3
  670. bcs,a %xcc,.+8
  671. add c_3,t_2,c_3
  672. mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
  673. addcc c_12,t_1,c_12
  674. bcs,a %xcc,.+8
  675. add c_3,t_2,c_3
  676. mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
  677. addcc c_12,t_1,c_12
  678. bcs,a %xcc,.+8
  679. add c_3,t_2,c_3
  680. lduw bp(4),b_4 !=
  681. mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
  682. addcc c_12,t_1,c_12
  683. bcs,a %xcc,.+8
  684. add c_3,t_2,c_3 !=
  685. lduw bp(5),b_5
  686. mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
  687. addcc c_12,t_1,t_1
  688. bcs,a %xcc,.+8 !=
  689. add c_3,t_2,c_3
  690. srlx t_1,32,c_12
  691. stuw t_1,rp(4) !r[4]=c2;
  692. or c_12,c_3,c_12 !=
  693. mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
  694. addcc c_12,t_1,c_12
  695. clr c_3
  696. bcs,a %xcc,.+8 !=
  697. add c_3,t_2,c_3
  698. mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
  699. addcc c_12,t_1,c_12
  700. bcs,a %xcc,.+8 !=
  701. add c_3,t_2,c_3
  702. mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
  703. addcc c_12,t_1,c_12
  704. bcs,a %xcc,.+8 !=
  705. add c_3,t_2,c_3
  706. mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
  707. addcc c_12,t_1,c_12
  708. bcs,a %xcc,.+8 !=
  709. add c_3,t_2,c_3
  710. lduw ap(5),a_5
  711. mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
  712. addcc c_12,t_1,c_12 !=
  713. bcs,a %xcc,.+8
  714. add c_3,t_2,c_3
  715. lduw ap(6),a_6
  716. mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
  717. addcc c_12,t_1,t_1
  718. bcs,a %xcc,.+8
  719. add c_3,t_2,c_3
  720. srlx t_1,32,c_12 !=
  721. stuw t_1,rp(5) !r[5]=c3;
  722. or c_12,c_3,c_12
  723. mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
  724. addcc c_12,t_1,c_12 !=
  725. clr c_3
  726. bcs,a %xcc,.+8
  727. add c_3,t_2,c_3
  728. mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
  729. addcc c_12,t_1,c_12
  730. bcs,a %xcc,.+8
  731. add c_3,t_2,c_3
  732. mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
  733. addcc c_12,t_1,c_12
  734. bcs,a %xcc,.+8
  735. add c_3,t_2,c_3
  736. mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
  737. addcc c_12,t_1,c_12
  738. bcs,a %xcc,.+8
  739. add c_3,t_2,c_3
  740. mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
  741. addcc c_12,t_1,c_12
  742. bcs,a %xcc,.+8
  743. add c_3,t_2,c_3
  744. lduw bp(6),b_6 !=
  745. mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
  746. addcc c_12,t_1,c_12
  747. bcs,a %xcc,.+8
  748. add c_3,t_2,c_3 !=
  749. lduw bp(7),b_7
  750. mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
  751. addcc c_12,t_1,t_1
  752. bcs,a %xcc,.+8 !=
  753. add c_3,t_2,c_3
  754. srlx t_1,32,c_12
  755. stuw t_1,rp(6) !r[6]=c1;
  756. or c_12,c_3,c_12 !=
  757. mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
  758. addcc c_12,t_1,c_12
  759. clr c_3
  760. bcs,a %xcc,.+8 !=
  761. add c_3,t_2,c_3
  762. mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
  763. addcc c_12,t_1,c_12
  764. bcs,a %xcc,.+8 !=
  765. add c_3,t_2,c_3
  766. mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
  767. addcc c_12,t_1,c_12
  768. bcs,a %xcc,.+8 !=
  769. add c_3,t_2,c_3
  770. mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
  771. addcc c_12,t_1,c_12
  772. bcs,a %xcc,.+8 !=
  773. add c_3,t_2,c_3
  774. mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
  775. addcc c_12,t_1,c_12
  776. bcs,a %xcc,.+8 !=
  777. add c_3,t_2,c_3
  778. mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
  779. addcc c_12,t_1,c_12
  780. bcs,a %xcc,.+8 !=
  781. add c_3,t_2,c_3
  782. lduw ap(7),a_7
  783. mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
  784. addcc c_12,t_1,c_12
  785. bcs,a %xcc,.+8
  786. add c_3,t_2,c_3
  787. mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
  788. addcc c_12,t_1,t_1
  789. bcs,a %xcc,.+8
  790. add c_3,t_2,c_3
  791. srlx t_1,32,c_12 !=
  792. stuw t_1,rp(7) !r[7]=c2;
  793. or c_12,c_3,c_12
  794. mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
  795. addcc c_12,t_1,c_12
  796. clr c_3
  797. bcs,a %xcc,.+8
  798. add c_3,t_2,c_3 !=
  799. mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
  800. addcc c_12,t_1,c_12
  801. bcs,a %xcc,.+8
  802. add c_3,t_2,c_3 !=
  803. mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
  804. addcc c_12,t_1,c_12
  805. bcs,a %xcc,.+8
  806. add c_3,t_2,c_3 !=
  807. mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
  808. addcc c_12,t_1,c_12
  809. bcs,a %xcc,.+8
  810. add c_3,t_2,c_3 !=
  811. mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
  812. addcc c_12,t_1,c_12
  813. bcs,a %xcc,.+8
  814. add c_3,t_2,c_3 !=
  815. mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
  816. addcc c_12,t_1,c_12
  817. bcs,a %xcc,.+8
  818. add c_3,t_2,c_3 !=
  819. mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
  820. addcc c_12,t_1,t_1
  821. bcs,a %xcc,.+8
  822. add c_3,t_2,c_3 !=
  823. srlx t_1,32,c_12
  824. stuw t_1,rp(8) !r[8]=c3;
  825. or c_12,c_3,c_12
  826. mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
  827. addcc c_12,t_1,c_12
  828. clr c_3
  829. bcs,a %xcc,.+8
  830. add c_3,t_2,c_3 !=
  831. mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
  832. addcc c_12,t_1,c_12
  833. bcs,a %xcc,.+8 !=
  834. add c_3,t_2,c_3
  835. mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
  836. addcc c_12,t_1,c_12
  837. bcs,a %xcc,.+8 !=
  838. add c_3,t_2,c_3
  839. mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
  840. addcc c_12,t_1,c_12
  841. bcs,a %xcc,.+8 !=
  842. add c_3,t_2,c_3
  843. mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
  844. addcc c_12,t_1,c_12
  845. bcs,a %xcc,.+8 !=
  846. add c_3,t_2,c_3
  847. mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
  848. addcc c_12,t_1,t_1
  849. bcs,a %xcc,.+8 !=
  850. add c_3,t_2,c_3
  851. srlx t_1,32,c_12
  852. stuw t_1,rp(9) !r[9]=c1;
  853. or c_12,c_3,c_12 !=
  854. mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
  855. addcc c_12,t_1,c_12
  856. clr c_3
  857. bcs,a %xcc,.+8 !=
  858. add c_3,t_2,c_3
  859. mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
  860. addcc c_12,t_1,c_12
  861. bcs,a %xcc,.+8 !=
  862. add c_3,t_2,c_3
  863. mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
  864. addcc c_12,t_1,c_12
  865. bcs,a %xcc,.+8 !=
  866. add c_3,t_2,c_3
  867. mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
  868. addcc c_12,t_1,c_12
  869. bcs,a %xcc,.+8 !=
  870. add c_3,t_2,c_3
  871. mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
  872. addcc c_12,t_1,t_1
  873. bcs,a %xcc,.+8 !=
  874. add c_3,t_2,c_3
  875. srlx t_1,32,c_12
  876. stuw t_1,rp(10) !r[10]=c2;
  877. or c_12,c_3,c_12 !=
  878. mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
  879. addcc c_12,t_1,c_12
  880. clr c_3
  881. bcs,a %xcc,.+8 !=
  882. add c_3,t_2,c_3
  883. mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
  884. addcc c_12,t_1,c_12
  885. bcs,a %xcc,.+8 !=
  886. add c_3,t_2,c_3
  887. mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
  888. addcc c_12,t_1,c_12
  889. bcs,a %xcc,.+8 !=
  890. add c_3,t_2,c_3
  891. mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
  892. addcc c_12,t_1,t_1
  893. bcs,a %xcc,.+8 !=
  894. add c_3,t_2,c_3
  895. srlx t_1,32,c_12
  896. stuw t_1,rp(11) !r[11]=c3;
  897. or c_12,c_3,c_12 !=
  898. mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
  899. addcc c_12,t_1,c_12
  900. clr c_3
  901. bcs,a %xcc,.+8 !=
  902. add c_3,t_2,c_3
  903. mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
  904. addcc c_12,t_1,c_12
  905. bcs,a %xcc,.+8 !=
  906. add c_3,t_2,c_3
  907. mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
  908. addcc c_12,t_1,t_1
  909. bcs,a %xcc,.+8 !=
  910. add c_3,t_2,c_3
  911. srlx t_1,32,c_12
  912. stuw t_1,rp(12) !r[12]=c1;
  913. or c_12,c_3,c_12 !=
  914. mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
  915. addcc c_12,t_1,c_12
  916. clr c_3
  917. bcs,a %xcc,.+8 !=
  918. add c_3,t_2,c_3
  919. mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
  920. addcc c_12,t_1,t_1
  921. bcs,a %xcc,.+8 !=
  922. add c_3,t_2,c_3
  923. srlx t_1,32,c_12
  924. st t_1,rp(13) !r[13]=c2;
  925. or c_12,c_3,c_12 !=
  926. mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
  927. addcc c_12,t_1,t_1 ! last column: no carry check follows this add
  928. srlx t_1,32,c_12 !=
  929. stuw t_1,rp(14) !r[14]=c3;
  930. stuw c_12,rp(15) !r[15]=c1;
  931. ret
  932. restore %g0,%g0,%o0 !=! delay slot: pop the register window and leave 0 in the caller %o0
  933. .type bn_mul_comba8,#function
  934. .size bn_mul_comba8,(.-bn_mul_comba8)
  935. .align 32
  936. .global bn_mul_comba4
  937. /* r[0..7] = a[0..3] * b[0..3], computed one result column at a time.
  938. * void bn_mul_comba4(r,a,b)
  939. * BN_ULONG *r,*a,*b;
  940. * Same accumulate-and-carry scheme as bn_mul_comba8, on four words. */
  941. bn_mul_comba4:
  942. save %sp,FRAME_SIZE,%sp ! new register window: r,a,b are now in %i0,%i1,%i2
  943. lduw ap(0),a_0
  944. mov 1,t_2
  945. lduw bp(0),b_0
  946. sllx t_2,32,t_2 !=! t_2 = 2^32, the unit added to c_3 per carry
  947. lduw bp(1),b_1
  948. mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
  949. srlx t_1,32,c_12 ! high word of the first product seeds the next column
  950. stuw t_1,rp(0) !=!r[0]=c1;
  951. lduw ap(1),a_1
  952. mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
  953. addcc c_12,t_1,c_12
  954. clr c_3 !=
  955. bcs,a %xcc,.+8 ! 64-bit carry out? target is simply the instruction after the delay slot
  956. add c_3,t_2,c_3 ! annulled delay slot: runs only on carry, c_3 += 2^32
  957. lduw ap(2),a_2
  958. mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
  959. addcc c_12,t_1,t_1 ! last product of the column: the sum lands in t_1
  960. bcs,a %xcc,.+8
  961. add c_3,t_2,c_3
  962. srlx t_1,32,c_12 !=! shift the sum down one word
  963. stuw t_1,rp(1) !r[1]=c2;
  964. or c_12,c_3,c_12 ! carries become the upper word of the next column sum
  965. mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
  966. addcc c_12,t_1,c_12 !=
  967. clr c_3
  968. bcs,a %xcc,.+8
  969. add c_3,t_2,c_3
  970. lduw bp(2),b_2 !=
  971. mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
  972. addcc c_12,t_1,c_12
  973. bcs,a %xcc,.+8
  974. add c_3,t_2,c_3 !=
  975. lduw bp(3),b_3
  976. mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
  977. addcc c_12,t_1,t_1
  978. bcs,a %xcc,.+8 !=
  979. add c_3,t_2,c_3
  980. srlx t_1,32,c_12
  981. stuw t_1,rp(2) !r[2]=c3;
  982. or c_12,c_3,c_12 !=
  983. mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
  984. addcc c_12,t_1,c_12
  985. clr c_3
  986. bcs,a %xcc,.+8 !=
  987. add c_3,t_2,c_3
  988. mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
  989. addcc c_12,t_1,c_12
  990. bcs,a %xcc,.+8 !=
  991. add c_3,t_2,c_3
  992. lduw ap(3),a_3
  993. mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
  994. addcc c_12,t_1,c_12 !=
  995. bcs,a %xcc,.+8
  996. add c_3,t_2,c_3
  997. mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
  998. addcc c_12,t_1,t_1 !=
  999. bcs,a %xcc,.+8
  1000. add c_3,t_2,c_3
  1001. srlx t_1,32,c_12
  1002. stuw t_1,rp(3) !=!r[3]=c1;
  1003. or c_12,c_3,c_12
  1004. mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
  1005. addcc c_12,t_1,c_12
  1006. clr c_3 !=
  1007. bcs,a %xcc,.+8
  1008. add c_3,t_2,c_3
  1009. mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
  1010. addcc c_12,t_1,c_12 !=
  1011. bcs,a %xcc,.+8
  1012. add c_3,t_2,c_3
  1013. mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
  1014. addcc c_12,t_1,t_1 !=
  1015. bcs,a %xcc,.+8
  1016. add c_3,t_2,c_3
  1017. srlx t_1,32,c_12
  1018. stuw t_1,rp(4) !=!r[4]=c2;
  1019. or c_12,c_3,c_12
  1020. mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
  1021. addcc c_12,t_1,c_12
  1022. clr c_3 !=
  1023. bcs,a %xcc,.+8
  1024. add c_3,t_2,c_3
  1025. mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
  1026. addcc c_12,t_1,t_1 !=
  1027. bcs,a %xcc,.+8
  1028. add c_3,t_2,c_3
  1029. srlx t_1,32,c_12
  1030. stuw t_1,rp(5) !=!r[5]=c3;
  1031. or c_12,c_3,c_12
  1032. mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
  1033. addcc c_12,t_1,t_1 ! last column: no carry check follows this add
  1034. srlx t_1,32,c_12 !=
  1035. stuw t_1,rp(6) !r[6]=c1;
  1036. stuw c_12,rp(7) !r[7]=c2;
  1037. ret
  1038. restore %g0,%g0,%o0 ! delay slot: pop the register window and leave 0 in the caller %o0
  1039. .type bn_mul_comba4,#function
  1040. .size bn_mul_comba4,(.-bn_mul_comba4)
  1041. .align 32
  1042. .global bn_sqr_comba8
  /*
   * bn_sqr_comba8 - fully unrolled squaring of an 8-word number.
   * Presumably void bn_sqr_comba8(r,a) BN_ULONG *r,*a; by analogy with
   * bn_sqr_comba4 -- TODO confirm against the C prototype.
   *
   * Loads a[0]..a[7] through ap(0)..ap(7) with lduw and stores the
   * sixteen result words r[0]..r[15] through rp(0)..rp(15), lowest
   * column first. Returns with the caller's %o0 set to 0.
   *
   * Working values (names are macros defined elsewhere in this file):
   *   t_1  - 64-bit product from mulx; also receives the finished
   *          column sum whose low word is stored to r[k]
   *   t_2  - the constant 1<<32
   *   c_12 - 64-bit running accumulator; restarted as t_1>>32 after
   *          every store
   *   c_3  - carries out of the 64-bit accumulator; cleared at the
   *          start of each column, bumped by t_2 on every carry, and
   *          OR-ed into c_12 once c_12 has been shifted down
   *
   * Terms annotated sqr_add_c2 are cross products and are added twice
   * (two addcc with the same t_1); terms annotated sqr_add_c are the
   * squares a[i]*a[i] and are added once.
   */
  1043. bn_sqr_comba8:
  1044. save %sp,FRAME_SIZE,%sp
  1045. mov 1,t_2
  1046. lduw ap(0),a_0
  1047. sllx t_2,32,t_2 ! t_2 = 1<<32, the weight of one 64-bit carry after the later >>32
  1048. lduw ap(1),a_1
  1049. mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
  1050. srlx t_1,32,c_12
  1051. stuw t_1,rp(0) !r[0]=c1;
  1052. lduw ap(2),a_2
  1053. mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
  1054. addcc c_12,t_1,c_12
  1055. clr c_3 ! new column: no carries yet (clr does not touch the condition codes)
  1056. bcs,a %xcc,.+8 ! annulled branch: the add in the delay slot runs only on 64-bit carry
  1057. add c_3,t_2,c_3
  1058. addcc c_12,t_1,t_1
  1059. bcs,a %xcc,.+8
  1060. add c_3,t_2,c_3
  1061. srlx t_1,32,c_12
  1062. stuw t_1,rp(1) !r[1]=c2;
  1063. or c_12,c_3,c_12 ! c_12 < 2^32 here and c_3 is a multiple of 2^32, so or acts as add
  1064. mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
  1065. addcc c_12,t_1,c_12
  1066. clr c_3
  1067. bcs,a %xcc,.+8
  1068. add c_3,t_2,c_3
  1069. addcc c_12,t_1,c_12
  1070. bcs,a %xcc,.+8
  1071. add c_3,t_2,c_3
  1072. lduw ap(3),a_3
  1073. mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
  1074. addcc c_12,t_1,t_1
  1075. bcs,a %xcc,.+8
  1076. add c_3,t_2,c_3
  1077. srlx t_1,32,c_12
  1078. stuw t_1,rp(2) !r[2]=c3;
  1079. or c_12,c_3,c_12
  1080. mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
  1081. addcc c_12,t_1,c_12
  1082. clr c_3
  1083. bcs,a %xcc,.+8
  1084. add c_3,t_2,c_3
  1085. addcc c_12,t_1,c_12
  1086. bcs,a %xcc,.+8
  1087. add c_3,t_2,c_3
  1088. lduw ap(4),a_4
  1089. mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
  1090. addcc c_12,t_1,c_12
  1091. bcs,a %xcc,.+8
  1092. add c_3,t_2,c_3
  1093. addcc c_12,t_1,t_1
  1094. bcs,a %xcc,.+8
  1095. add c_3,t_2,c_3
  1096. srlx t_1,32,c_12
  1097. stuw t_1,rp(3) !r[3]=c1;
  1098. or c_12,c_3,c_12
  1099. mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
  1100. addcc c_12,t_1,c_12
  1101. clr c_3
  1102. bcs,a %xcc,.+8
  1103. add c_3,t_2,c_3
  1104. addcc c_12,t_1,c_12
  1105. bcs,a %xcc,.+8
  1106. add c_3,t_2,c_3
  1107. mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
  1108. addcc c_12,t_1,c_12
  1109. bcs,a %xcc,.+8
  1110. add c_3,t_2,c_3
  1111. addcc c_12,t_1,c_12
  1112. bcs,a %xcc,.+8
  1113. add c_3,t_2,c_3
  1114. lduw ap(5),a_5
  1115. mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
  1116. addcc c_12,t_1,t_1
  1117. bcs,a %xcc,.+8
  1118. add c_3,t_2,c_3
  1119. srlx t_1,32,c_12
  1120. stuw t_1,rp(4) !r[4]=c2;
  1121. or c_12,c_3,c_12
  1122. mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
  1123. addcc c_12,t_1,c_12
  1124. clr c_3
  1125. bcs,a %xcc,.+8
  1126. add c_3,t_2,c_3
  1127. addcc c_12,t_1,c_12
  1128. bcs,a %xcc,.+8
  1129. add c_3,t_2,c_3
  1130. mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
  1131. addcc c_12,t_1,c_12
  1132. bcs,a %xcc,.+8
  1133. add c_3,t_2,c_3
  1134. addcc c_12,t_1,c_12
  1135. bcs,a %xcc,.+8
  1136. add c_3,t_2,c_3
  1137. lduw ap(6),a_6
  1138. mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
  1139. addcc c_12,t_1,c_12
  1140. bcs,a %xcc,.+8
  1141. add c_3,t_2,c_3
  1142. addcc c_12,t_1,t_1
  1143. bcs,a %xcc,.+8
  1144. add c_3,t_2,c_3
  1145. srlx t_1,32,c_12
  1146. stuw t_1,rp(5) !r[5]=c3;
  1147. or c_12,c_3,c_12
  1148. mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
  1149. addcc c_12,t_1,c_12
  1150. clr c_3
  1151. bcs,a %xcc,.+8
  1152. add c_3,t_2,c_3
  1153. addcc c_12,t_1,c_12
  1154. bcs,a %xcc,.+8
  1155. add c_3,t_2,c_3
  1156. mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
  1157. addcc c_12,t_1,c_12
  1158. bcs,a %xcc,.+8
  1159. add c_3,t_2,c_3
  1160. addcc c_12,t_1,c_12
  1161. bcs,a %xcc,.+8
  1162. add c_3,t_2,c_3
  1163. mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
  1164. addcc c_12,t_1,c_12
  1165. bcs,a %xcc,.+8
  1166. add c_3,t_2,c_3
  1167. addcc c_12,t_1,c_12
  1168. bcs,a %xcc,.+8
  1169. add c_3,t_2,c_3
  1170. lduw ap(7),a_7
  1171. mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
  1172. addcc c_12,t_1,t_1
  1173. bcs,a %xcc,.+8
  1174. add c_3,t_2,c_3
  1175. srlx t_1,32,c_12
  1176. stuw t_1,rp(6) !r[6]=c1;
  1177. or c_12,c_3,c_12
  1178. mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
  1179. addcc c_12,t_1,c_12
  1180. clr c_3
  1181. bcs,a %xcc,.+8
  1182. add c_3,t_2,c_3
  1183. addcc c_12,t_1,c_12
  1184. bcs,a %xcc,.+8
  1185. add c_3,t_2,c_3
  1186. mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
  1187. addcc c_12,t_1,c_12
  1188. bcs,a %xcc,.+8
  1189. add c_3,t_2,c_3
  1190. addcc c_12,t_1,c_12
  1191. bcs,a %xcc,.+8
  1192. add c_3,t_2,c_3
  1193. mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
  1194. addcc c_12,t_1,c_12
  1195. bcs,a %xcc,.+8
  1196. add c_3,t_2,c_3
  1197. addcc c_12,t_1,c_12
  1198. bcs,a %xcc,.+8
  1199. add c_3,t_2,c_3
  1200. mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
  1201. addcc c_12,t_1,c_12
  1202. bcs,a %xcc,.+8
  1203. add c_3,t_2,c_3
  1204. addcc c_12,t_1,t_1
  1205. bcs,a %xcc,.+8
  1206. add c_3,t_2,c_3
  1207. srlx t_1,32,c_12
  1208. stuw t_1,rp(7) !r[7]=c2;
  1209. or c_12,c_3,c_12
  1210. mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
  1211. addcc c_12,t_1,c_12
  1212. clr c_3
  1213. bcs,a %xcc,.+8
  1214. add c_3,t_2,c_3
  1215. addcc c_12,t_1,c_12
  1216. bcs,a %xcc,.+8
  1217. add c_3,t_2,c_3
  1218. mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
  1219. addcc c_12,t_1,c_12
  1220. bcs,a %xcc,.+8
  1221. add c_3,t_2,c_3
  1222. addcc c_12,t_1,c_12
  1223. bcs,a %xcc,.+8
  1224. add c_3,t_2,c_3
  1225. mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
  1226. addcc c_12,t_1,c_12
  1227. bcs,a %xcc,.+8
  1228. add c_3,t_2,c_3
  1229. addcc c_12,t_1,c_12
  1230. bcs,a %xcc,.+8
  1231. add c_3,t_2,c_3
  1232. mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
  1233. addcc c_12,t_1,t_1
  1234. bcs,a %xcc,.+8
  1235. add c_3,t_2,c_3
  1236. srlx t_1,32,c_12
  1237. stuw t_1,rp(8) !r[8]=c3;
  1238. or c_12,c_3,c_12
  1239. mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
  1240. addcc c_12,t_1,c_12
  1241. clr c_3
  1242. bcs,a %xcc,.+8
  1243. add c_3,t_2,c_3
  1244. addcc c_12,t_1,c_12
  1245. bcs,a %xcc,.+8
  1246. add c_3,t_2,c_3
  1247. mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
  1248. addcc c_12,t_1,c_12
  1249. bcs,a %xcc,.+8
  1250. add c_3,t_2,c_3
  1251. addcc c_12,t_1,c_12
  1252. bcs,a %xcc,.+8
  1253. add c_3,t_2,c_3
  1254. mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
  1255. addcc c_12,t_1,c_12
  1256. bcs,a %xcc,.+8
  1257. add c_3,t_2,c_3
  1258. addcc c_12,t_1,t_1
  1259. bcs,a %xcc,.+8
  1260. add c_3,t_2,c_3
  1261. srlx t_1,32,c_12
  1262. stuw t_1,rp(9) !r[9]=c1;
  1263. or c_12,c_3,c_12
  1264. mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
  1265. addcc c_12,t_1,c_12
  1266. clr c_3
  1267. bcs,a %xcc,.+8
  1268. add c_3,t_2,c_3
  1269. addcc c_12,t_1,c_12
  1270. bcs,a %xcc,.+8
  1271. add c_3,t_2,c_3
  1272. mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
  1273. addcc c_12,t_1,c_12
  1274. bcs,a %xcc,.+8
  1275. add c_3,t_2,c_3
  1276. addcc c_12,t_1,c_12
  1277. bcs,a %xcc,.+8
  1278. add c_3,t_2,c_3
  1279. mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
  1280. addcc c_12,t_1,t_1
  1281. bcs,a %xcc,.+8
  1282. add c_3,t_2,c_3
  1283. srlx t_1,32,c_12
  1284. stuw t_1,rp(10) !r[10]=c2;
  1285. or c_12,c_3,c_12
  1286. mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
  1287. addcc c_12,t_1,c_12
  1288. clr c_3
  1289. bcs,a %xcc,.+8
  1290. add c_3,t_2,c_3
  1291. addcc c_12,t_1,c_12
  1292. bcs,a %xcc,.+8
  1293. add c_3,t_2,c_3
  1294. mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
  1295. addcc c_12,t_1,c_12
  1296. bcs,a %xcc,.+8
  1297. add c_3,t_2,c_3
  1298. addcc c_12,t_1,t_1
  1299. bcs,a %xcc,.+8
  1300. add c_3,t_2,c_3
  1301. srlx t_1,32,c_12
  1302. stuw t_1,rp(11) !r[11]=c3;
  1303. or c_12,c_3,c_12
  1304. mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
  1305. addcc c_12,t_1,c_12
  1306. clr c_3
  1307. bcs,a %xcc,.+8
  1308. add c_3,t_2,c_3
  1309. addcc c_12,t_1,c_12
  1310. bcs,a %xcc,.+8
  1311. add c_3,t_2,c_3
  1312. mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
  1313. addcc c_12,t_1,t_1
  1314. bcs,a %xcc,.+8
  1315. add c_3,t_2,c_3
  1316. srlx t_1,32,c_12
  1317. stuw t_1,rp(12) !r[12]=c1;
  1318. or c_12,c_3,c_12
  1319. mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
  1320. addcc c_12,t_1,c_12
  1321. clr c_3
  1322. bcs,a %xcc,.+8
  1323. add c_3,t_2,c_3
  1324. addcc c_12,t_1,t_1
  1325. bcs,a %xcc,.+8
  1326. add c_3,t_2,c_3
  1327. srlx t_1,32,c_12
  1328. stuw t_1,rp(13) !r[13]=c2;
  1329. or c_12,c_3,c_12
  1330. mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
  1331. addcc c_12,t_1,t_1
  1332. srlx t_1,32,c_12
  1333. stuw t_1,rp(14) !r[14]=c3;
  1334. stuw c_12,rp(15) !r[15]=c1;
  1335. ret
  1336. restore %g0,%g0,%o0 ! delay slot of ret: pop the register window, caller's %o0 = 0
  1337. .type bn_sqr_comba8,#function
  1338. .size bn_sqr_comba8,(.-bn_sqr_comba8)
  1339. .align 32
  1340. .global bn_sqr_comba4
  1341. /*
  1342. * void bn_sqr_comba4(r,a)
  1343. * BN_ULONG *r,*a;
   *
   * Fully unrolled squaring of a 4-word number. Loads a[0]..a[3]
   * through ap(0)..ap(3) with lduw and stores the eight result words
   * r[0]..r[7] through rp(0)..rp(7), lowest column first. Returns with
   * the caller's %o0 set to 0.
   *
   * Working values (names are macros defined elsewhere in this file):
   *   t_1  - 64-bit product from mulx; also receives the finished
   *          column sum whose low word is stored to r[k]
   *   t_2  - the constant 1<<32
   *   c_12 - 64-bit running accumulator; restarted as t_1>>32 after
   *          every store
   *   c_3  - carries out of the 64-bit accumulator; cleared at the
   *          start of each column, bumped by t_2 on every carry, and
   *          OR-ed into c_12 once c_12 has been shifted down
   *
   * Terms annotated sqr_add_c2 are cross products and are added twice
   * (two addcc with the same t_1); terms annotated sqr_add_c are the
   * squares a[i]*a[i] and are added once.
  1344. */
  1345. bn_sqr_comba4:
  1346. save %sp,FRAME_SIZE,%sp
  1347. mov 1,t_2
  1348. lduw ap(0),a_0
  1349. sllx t_2,32,t_2 ! t_2 = 1<<32, the weight of one 64-bit carry after the later >>32
  1350. lduw ap(1),a_1
  1351. mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
  1352. srlx t_1,32,c_12
  1353. stuw t_1,rp(0) !r[0]=c1;
  1354. lduw ap(2),a_2
  1355. mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
  1356. addcc c_12,t_1,c_12
  1357. clr c_3 ! new column: no carries yet (clr does not touch the condition codes)
  1358. bcs,a %xcc,.+8 ! annulled branch: the add in the delay slot runs only on 64-bit carry
  1359. add c_3,t_2,c_3
  1360. addcc c_12,t_1,t_1
  1361. bcs,a %xcc,.+8
  1362. add c_3,t_2,c_3
  1363. srlx t_1,32,c_12
  1364. stuw t_1,rp(1) !r[1]=c2;
  1365. or c_12,c_3,c_12 ! c_12 < 2^32 here and c_3 is a multiple of 2^32, so or acts as add
  1366. mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
  1367. addcc c_12,t_1,c_12
  1368. clr c_3
  1369. bcs,a %xcc,.+8
  1370. add c_3,t_2,c_3
  1371. addcc c_12,t_1,c_12
  1372. bcs,a %xcc,.+8
  1373. add c_3,t_2,c_3
  1374. lduw ap(3),a_3
  1375. mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
  1376. addcc c_12,t_1,t_1
  1377. bcs,a %xcc,.+8
  1378. add c_3,t_2,c_3
  1379. srlx t_1,32,c_12
  1380. stuw t_1,rp(2) !r[2]=c3;
  1381. or c_12,c_3,c_12
  1382. mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
  1383. addcc c_12,t_1,c_12
  1384. clr c_3
  1385. bcs,a %xcc,.+8
  1386. add c_3,t_2,c_3
  1387. addcc c_12,t_1,c_12
  1388. bcs,a %xcc,.+8
  1389. add c_3,t_2,c_3
  1390. mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
  1391. addcc c_12,t_1,c_12
  1392. bcs,a %xcc,.+8
  1393. add c_3,t_2,c_3
  1394. addcc c_12,t_1,t_1
  1395. bcs,a %xcc,.+8
  1396. add c_3,t_2,c_3
  1397. srlx t_1,32,c_12
  1398. stuw t_1,rp(3) !r[3]=c1;
  1399. or c_12,c_3,c_12
  1400. mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
  1401. addcc c_12,t_1,c_12
  1402. clr c_3
  1403. bcs,a %xcc,.+8
  1404. add c_3,t_2,c_3
  1405. addcc c_12,t_1,c_12
  1406. bcs,a %xcc,.+8
  1407. add c_3,t_2,c_3
  1408. mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
  1409. addcc c_12,t_1,t_1
  1410. bcs,a %xcc,.+8
  1411. add c_3,t_2,c_3
  1412. srlx t_1,32,c_12
  1413. stuw t_1,rp(4) !r[4]=c2;
  1414. or c_12,c_3,c_12
  1415. mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
  1416. addcc c_12,t_1,c_12
  1417. clr c_3
  1418. bcs,a %xcc,.+8
  1419. add c_3,t_2,c_3
  1420. addcc c_12,t_1,t_1
  1421. bcs,a %xcc,.+8
  1422. add c_3,t_2,c_3
  1423. srlx t_1,32,c_12
  1424. stuw t_1,rp(5) !r[5]=c3;
  1425. or c_12,c_3,c_12
  1426. mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
  1427. addcc c_12,t_1,t_1
  1428. srlx t_1,32,c_12
  1429. stuw t_1,rp(6) !r[6]=c1;
  1430. stuw c_12,rp(7) !r[7]=c2;
  1431. ret
  1432. restore %g0,%g0,%o0 ! delay slot of ret: pop the register window, caller's %o0 = 0
  1433. .type bn_sqr_comba4,#function
  1434. .size bn_sqr_comba4,(.-bn_sqr_comba4)
  1435. .align 32