sparcv8plus.S
  1. .ident "sparcv8plus.s, Version 1.4"
  2. .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@openssl.org>"
  3. /*
  4. * ====================================================================
  5. * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
  6. *
  7. * Licensed under the Apache License 2.0 (the "License"). You may not use
  8. * this file except in compliance with the License. You can obtain a copy
  9. * in the file LICENSE in the source distribution or at
  10. * https://www.openssl.org/source/license.html
  11. * ====================================================================
  12. */
  13. /*
  14. * This is my modest contribution to the OpenSSL project (see
  15. * http://www.openssl.org/ for more information about it) and is
  16. * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
  17. * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  18. *
  19. * Questions-n-answers.
  20. *
  21. * Q. How to compile?
  22. * A. With SC4.x/SC5.x:
  23. *
  24. * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  25. *
  26. * and with gcc:
  27. *
  28. * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
  29. *
  30. * or, if the above fails (it does if you have gas installed):
  31. *
  32. * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
  33. *
  34. * Quick-n-dirty way to fuse the module into the library.
  35. * Provided that the library is already configured and built
  36. * (in the 0.9.2 case, with the no-asm option):
  37. *
  38. * # cd crypto/bn
  39. * # cp /some/place/bn_asm.sparc.v8plus.S .
  40. * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  41. * # make
  42. * # cd ../..
  43. * # make; make test
  44. *
  45. * Quick-n-dirty way to get rid of it:
  46. *
  47. * # cd crypto/bn
  48. * # touch bn_asm.c
  49. * # make
  50. * # cd ../..
  51. * # make; make test
  52. *
  53. * Q. V8plus architecture? What kind of beast is that?
  54. * A. Well, it's a programming model rather than an architecture...
  55. * It's actually a v9-compliant CPU, i.e. *any* UltraSPARC, operated
  56. * under special conditions, namely when the kernel doesn't preserve
  57. * the upper 32 bits of the otherwise 64-bit registers during a
  58. * context switch.
  58. *
  59. * Q. Why just UltraSPARC? What about SuperSPARC?
  60. * A. The original release targeted UltraSPARC only. A SuperSPARC
  61. * version is now provided alongside it. Both versions share the
  62. * bn_*_comba[48] implementations (see the comment later in the code
  63. * for an explanation). But what's so special about this UltraSPARC
  64. * implementation? Why didn't I let the compiler do the job? The
  65. * trouble is that most available compilers (well, SC5.0 is the only
  66. * exception) don't attempt to take advantage of UltraSPARC's
  67. * 64-bitness under 32-bit kernels even though it's perfectly
  68. * possible (see next question).
  69. *
  70. * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  71. * doesn't work?
  72. * A. You can't address *all* registers as 64-bit wide:-( The catch is
  73. * that you may actually rely upon %o0-%o5 and %g1-%g4 being fully
  74. * preserved if you're in a leaf function, i.e. one that never calls
  75. * any other function. All functions in this module are leaf, and
  76. * 10 registers is a handful. As a matter of fact the non-"comba"
  77. * routines don't even require that much, and I could afford not to
  78. * allocate a stack frame of their own for 'em:-)
  79. *
  80. * Q. What about 64-bit kernels?
  81. * A. What about 'em? Just kidding:-) A pure 64-bit version is
  82. * currently under evaluation and development...
  83. *
  84. * Q. What about shared libraries?
  85. * A. What about 'em? Kidding again:-) The code does *not* contain any
  86. * position dependencies, so it's safe to include it in a
  87. * shared library as is.
  88. *
  89. * Q. How much faster does it go?
  90. * A. Do you have a good benchmark? In any case, below is what I
  91. * observe with the crypto/bn/expspeed.c test program:
  92. *
  93. * v8plus module on U10/300MHz against bn_asm.c compiled with:
  94. *
  95. * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
  96. * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
  97. * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
  98. *
  99. * v8 module on SS10/60MHz against bn_asm.c compiled with:
  100. *
  101. * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
  102. * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
  103. * egcs-1.1.2 -mv8 -O3 +35-45%
  104. *
  105. * As you can see, it's damn hard to beat the new Sun C compiler,
  106. * so it's first and foremost GNU C users who will appreciate this
  107. * assembler implementation:-)
  108. */
  109. /*
  110. * Revision history.
  111. *
  112. * 1.0 - initial release;
  113. * 1.1 - new loop unrolling model(*);
  114. * - some more fine tuning;
  115. * 1.2 - made gas friendly;
  116. * - updates to documentation concerning v9;
  117. * - new performance comparison matrix;
  118. * 1.3 - fixed problem with /usr/ccs/lib/cpp;
  119. * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
  120. * resulting in slight overall performance kick;
  121. * - some retunes;
  122. * - support for GNU as added;
  123. *
  124. * (*) Originally unrolled loop looked like this:
  125. * for (;;) {
  126. * op(p+0); if (--n==0) break;
  127. * op(p+1); if (--n==0) break;
  128. * op(p+2); if (--n==0) break;
  129. * op(p+3); if (--n==0) break;
  130. * p+=4;
  131. * }
  132. * I unroll according to the following:
  133. * while (n&~3) {
  134. * op(p+0); op(p+1); op(p+2); op(p+3);
  135. * p+=4; n-=4;
  136. * }
  137. * if (n) {
  138. * op(p+0); if (--n==0) return;
  139. * op(p+1); if (--n==0) return;
  140. * op(p+2); return;
  141. * }
  142. */
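/*
 * For reference, a self-contained C rendition of the unrolling model
 * above; walk() and op() are illustrative placeholder names, not part
 * of this module:
 *
 *	static void op(unsigned int *p) { *p += 1; }	// stand-in for the real per-word work
 *
 *	static void walk(unsigned int *p, int n)
 *	{
 *		while (n & ~3) {			// full groups of four
 *			op(p+0); op(p+1); op(p+2); op(p+3);
 *			p += 4; n -= 4;
 *		}
 *		if (n) {				// 1..3 leftover elements
 *			op(p+0); if (--n == 0) return;
 *			op(p+1); if (--n == 0) return;
 *			op(p+2);
 *		}
 *	}
 */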
  143. #if defined(__SUNPRO_C) && defined(__sparcv9)
  144. /* They've said -xarch=v9 at command line */
  145. .register %g2,#scratch
  146. .register %g3,#scratch
  147. # define FRAME_SIZE -192
  148. #elif defined(__GNUC__) && defined(__arch64__)
  149. /* They've said -m64 at command line */
  150. .register %g2,#scratch
  151. .register %g3,#scratch
  152. # define FRAME_SIZE -192
  153. #else
  154. # define FRAME_SIZE -96
  155. #endif
  156. /*
  157. * GNU assembler can't stand stuw:-(
  158. */
  159. #define stuw st
  160. .section ".text",#alloc,#execinstr
  161. .file "bn_asm.sparc.v8plus.S"
  162. .align 32
  163. .global bn_mul_add_words
  164. /*
  165. * BN_ULONG bn_mul_add_words(rp,ap,num,w)
  166. * BN_ULONG *rp,*ap;
  167. * int num;
  168. * BN_ULONG w;
  169. */
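/*
 * For reference, a minimal C sketch of the word-level operation this
 * routine implements: rp[i] accumulates the low word of w*ap[i] plus the
 * running carry, and the final carry is returned. The _ref name and the
 * plain unsigned types are illustrative only (a sketch of the semantics,
 * not a copy of the portable crypto/bn/bn_asm.c code):
 *
 *	unsigned int bn_mul_add_words_ref(unsigned int *rp,
 *	                                  const unsigned int *ap,
 *	                                  int num, unsigned int w)
 *	{
 *		unsigned long long t;
 *		unsigned int carry = 0;
 *		int i;
 *
 *		for (i = 0; i < num; i++) {
 *			t = (unsigned long long)w * ap[i] + rp[i] + carry;
 *			rp[i] = (unsigned int)t;		// low 32 bits
 *			carry = (unsigned int)(t >> 32);	// high 32 bits
 *		}
 *		return carry;
 *	}
 */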
  170. bn_mul_add_words:
  171. sra %o2,%g0,%o2 ! signx %o2
  172. brgz,a %o2,.L_bn_mul_add_words_proceed
  173. lduw [%o1],%g2
  174. retl
  175. clr %o0
  176. nop
  177. nop
  178. nop
  179. .L_bn_mul_add_words_proceed:
  180. srl %o3,%g0,%o3 ! clruw %o3
  181. andcc %o2,-4,%g0
  182. bz,pn %icc,.L_bn_mul_add_words_tail
  183. clr %o5
  184. .L_bn_mul_add_words_loop: ! wow! 32 aligned!
  185. lduw [%o0],%g1
  186. lduw [%o1+4],%g3
  187. mulx %o3,%g2,%g2
  188. add %g1,%o5,%o4
  189. nop
  190. add %o4,%g2,%o4
  191. stuw %o4,[%o0]
  192. srlx %o4,32,%o5
  193. lduw [%o0+4],%g1
  194. lduw [%o1+8],%g2
  195. mulx %o3,%g3,%g3
  196. add %g1,%o5,%o4
  197. dec 4,%o2
  198. add %o4,%g3,%o4
  199. stuw %o4,[%o0+4]
  200. srlx %o4,32,%o5
  201. lduw [%o0+8],%g1
  202. lduw [%o1+12],%g3
  203. mulx %o3,%g2,%g2
  204. add %g1,%o5,%o4
  205. inc 16,%o1
  206. add %o4,%g2,%o4
  207. stuw %o4,[%o0+8]
  208. srlx %o4,32,%o5
  209. lduw [%o0+12],%g1
  210. mulx %o3,%g3,%g3
  211. add %g1,%o5,%o4
  212. inc 16,%o0
  213. add %o4,%g3,%o4
  214. andcc %o2,-4,%g0
  215. stuw %o4,[%o0-4]
  216. srlx %o4,32,%o5
  217. bnz,a,pt %icc,.L_bn_mul_add_words_loop
  218. lduw [%o1],%g2
  219. brnz,a,pn %o2,.L_bn_mul_add_words_tail
  220. lduw [%o1],%g2
  221. .L_bn_mul_add_words_return:
  222. retl
  223. mov %o5,%o0
  224. .L_bn_mul_add_words_tail:
  225. lduw [%o0],%g1
  226. mulx %o3,%g2,%g2
  227. add %g1,%o5,%o4
  228. dec %o2
  229. add %o4,%g2,%o4
  230. srlx %o4,32,%o5
  231. brz,pt %o2,.L_bn_mul_add_words_return
  232. stuw %o4,[%o0]
  233. lduw [%o1+4],%g2
  234. lduw [%o0+4],%g1
  235. mulx %o3,%g2,%g2
  236. add %g1,%o5,%o4
  237. dec %o2
  238. add %o4,%g2,%o4
  239. srlx %o4,32,%o5
  240. brz,pt %o2,.L_bn_mul_add_words_return
  241. stuw %o4,[%o0+4]
  242. lduw [%o1+8],%g2
  243. lduw [%o0+8],%g1
  244. mulx %o3,%g2,%g2
  245. add %g1,%o5,%o4
  246. add %o4,%g2,%o4
  247. stuw %o4,[%o0+8]
  248. retl
  249. srlx %o4,32,%o0
  250. .type bn_mul_add_words,#function
  251. .size bn_mul_add_words,(.-bn_mul_add_words)
  252. .align 32
  253. .global bn_mul_words
  254. /*
  255. * BN_ULONG bn_mul_words(rp,ap,num,w)
  256. * BN_ULONG *rp,*ap;
  257. * int num;
  258. * BN_ULONG w;
  259. */
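/*
 * Semantics sketch in C, analogous to the one above but without the
 * accumulation of rp[i] (illustrative name and types only):
 *
 *	unsigned int bn_mul_words_ref(unsigned int *rp, const unsigned int *ap,
 *	                              int num, unsigned int w)
 *	{
 *		unsigned long long t;
 *		unsigned int carry = 0;
 *		int i;
 *
 *		for (i = 0; i < num; i++) {
 *			t = (unsigned long long)w * ap[i] + carry;
 *			rp[i] = (unsigned int)t;
 *			carry = (unsigned int)(t >> 32);
 *		}
 *		return carry;
 *	}
 */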
  260. bn_mul_words:
  261. sra %o2,%g0,%o2 ! signx %o2
  262. brgz,a %o2,.L_bn_mul_words_proceed
  263. lduw [%o1],%g2
  264. retl
  265. clr %o0
  266. nop
  267. nop
  268. nop
  269. .L_bn_mul_words_proceed:
  270. srl %o3,%g0,%o3 ! clruw %o3
  271. andcc %o2,-4,%g0
  272. bz,pn %icc,.L_bn_mul_words_tail
  273. clr %o5
  274. .L_bn_mul_words_loop: ! wow! 32 aligned!
  275. lduw [%o1+4],%g3
  276. mulx %o3,%g2,%g2
  277. add %g2,%o5,%o4
  278. nop
  279. stuw %o4,[%o0]
  280. srlx %o4,32,%o5
  281. lduw [%o1+8],%g2
  282. mulx %o3,%g3,%g3
  283. add %g3,%o5,%o4
  284. dec 4,%o2
  285. stuw %o4,[%o0+4]
  286. srlx %o4,32,%o5
  287. lduw [%o1+12],%g3
  288. mulx %o3,%g2,%g2
  289. add %g2,%o5,%o4
  290. inc 16,%o1
  291. stuw %o4,[%o0+8]
  292. srlx %o4,32,%o5
  293. mulx %o3,%g3,%g3
  294. add %g3,%o5,%o4
  295. inc 16,%o0
  296. stuw %o4,[%o0-4]
  297. srlx %o4,32,%o5
  298. andcc %o2,-4,%g0
  299. bnz,a,pt %icc,.L_bn_mul_words_loop
  300. lduw [%o1],%g2
  301. nop
  302. nop
  303. brnz,a,pn %o2,.L_bn_mul_words_tail
  304. lduw [%o1],%g2
  305. .L_bn_mul_words_return:
  306. retl
  307. mov %o5,%o0
  308. .L_bn_mul_words_tail:
  309. mulx %o3,%g2,%g2
  310. add %g2,%o5,%o4
  311. dec %o2
  312. srlx %o4,32,%o5
  313. brz,pt %o2,.L_bn_mul_words_return
  314. stuw %o4,[%o0]
  315. lduw [%o1+4],%g2
  316. mulx %o3,%g2,%g2
  317. add %g2,%o5,%o4
  318. dec %o2
  319. srlx %o4,32,%o5
  320. brz,pt %o2,.L_bn_mul_words_return
  321. stuw %o4,[%o0+4]
  322. lduw [%o1+8],%g2
  323. mulx %o3,%g2,%g2
  324. add %g2,%o5,%o4
  325. stuw %o4,[%o0+8]
  326. retl
  327. srlx %o4,32,%o0
  328. .type bn_mul_words,#function
  329. .size bn_mul_words,(.-bn_mul_words)
  330. .align 32
  331. .global bn_sqr_words
  332. /*
  333. * void bn_sqr_words(r,a,n)
  334. * BN_ULONG *r,*a;
  335. * int n;
  336. */
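/*
 * Semantics sketch in C: each 32-bit word is squared into two result
 * words, so r[] must hold 2*n words (illustrative name and types only):
 *
 *	void bn_sqr_words_ref(unsigned int *r, const unsigned int *a, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			unsigned long long t = (unsigned long long)a[i] * a[i];
 *			r[2*i]   = (unsigned int)t;		// low word of a[i]^2
 *			r[2*i+1] = (unsigned int)(t >> 32);	// high word of a[i]^2
 *		}
 *	}
 */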
  337. bn_sqr_words:
  338. sra %o2,%g0,%o2 ! signx %o2
  339. brgz,a %o2,.L_bn_sqr_words_proceed
  340. lduw [%o1],%g2
  341. retl
  342. clr %o0
  343. nop
  344. nop
  345. nop
  346. .L_bn_sqr_words_proceed:
  347. andcc %o2,-4,%g0
  348. nop
  349. bz,pn %icc,.L_bn_sqr_words_tail
  350. nop
  351. .L_bn_sqr_words_loop: ! wow! 32 aligned!
  352. lduw [%o1+4],%g3
  353. mulx %g2,%g2,%o4
  354. stuw %o4,[%o0]
  355. srlx %o4,32,%o5
  356. stuw %o5,[%o0+4]
  357. nop
  358. lduw [%o1+8],%g2
  359. mulx %g3,%g3,%o4
  360. dec 4,%o2
  361. stuw %o4,[%o0+8]
  362. srlx %o4,32,%o5
  363. stuw %o5,[%o0+12]
  364. lduw [%o1+12],%g3
  365. mulx %g2,%g2,%o4
  366. srlx %o4,32,%o5
  367. stuw %o4,[%o0+16]
  368. inc 16,%o1
  369. stuw %o5,[%o0+20]
  370. mulx %g3,%g3,%o4
  371. inc 32,%o0
  372. stuw %o4,[%o0-8]
  373. srlx %o4,32,%o5
  374. andcc %o2,-4,%g2
  375. stuw %o5,[%o0-4]
  376. bnz,a,pt %icc,.L_bn_sqr_words_loop
  377. lduw [%o1],%g2
  378. nop
  379. brnz,a,pn %o2,.L_bn_sqr_words_tail
  380. lduw [%o1],%g2
  381. .L_bn_sqr_words_return:
  382. retl
  383. clr %o0
  384. .L_bn_sqr_words_tail:
  385. mulx %g2,%g2,%o4
  386. dec %o2
  387. stuw %o4,[%o0]
  388. srlx %o4,32,%o5
  389. brz,pt %o2,.L_bn_sqr_words_return
  390. stuw %o5,[%o0+4]
  391. lduw [%o1+4],%g2
  392. mulx %g2,%g2,%o4
  393. dec %o2
  394. stuw %o4,[%o0+8]
  395. srlx %o4,32,%o5
  396. brz,pt %o2,.L_bn_sqr_words_return
  397. stuw %o5,[%o0+12]
  398. lduw [%o1+8],%g2
  399. mulx %g2,%g2,%o4
  400. srlx %o4,32,%o5
  401. stuw %o4,[%o0+16]
  402. stuw %o5,[%o0+20]
  403. retl
  404. clr %o0
  405. .type bn_sqr_words,#function
  406. .size bn_sqr_words,(.-bn_sqr_words)
  407. .align 32
  408. .global bn_div_words
  409. /*
  410. * BN_ULONG bn_div_words(h,l,d)
  411. * BN_ULONG h,l,d;
  412. */
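/*
 * Semantics sketch in C: divide the 64-bit value (h:l) by d and return
 * the quotient truncated to 32 bits, just as the final srl ("clruw")
 * below does (illustrative name and types only; d is assumed non-zero):
 *
 *	unsigned int bn_div_words_ref(unsigned int h, unsigned int l,
 *	                              unsigned int d)
 *	{
 *		return (unsigned int)((((unsigned long long)h << 32) | l) / d);
 *	}
 */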
  413. bn_div_words:
  414. sllx %o0,32,%o0
  415. or %o0,%o1,%o0
  416. udivx %o0,%o2,%o0
  417. retl
  418. srl %o0,%g0,%o0 ! clruw %o0
  419. .type bn_div_words,#function
  420. .size bn_div_words,(.-bn_div_words)
  421. .align 32
  422. .global bn_add_words
  423. /*
  424. * BN_ULONG bn_add_words(rp,ap,bp,n)
  425. * BN_ULONG *rp,*ap,*bp;
  426. * int n;
  427. */
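/*
 * Semantics sketch in C: word-wise addition with carry propagation; the
 * final carry (0 or 1) is returned (illustrative name and types only):
 *
 *	unsigned int bn_add_words_ref(unsigned int *rp, const unsigned int *ap,
 *	                              const unsigned int *bp, int n)
 *	{
 *		unsigned long long t;
 *		unsigned int carry = 0;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			t = (unsigned long long)ap[i] + bp[i] + carry;
 *			rp[i] = (unsigned int)t;
 *			carry = (unsigned int)(t >> 32);	// 0 or 1
 *		}
 *		return carry;
 *	}
 */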
  428. bn_add_words:
  429. sra %o3,%g0,%o3 ! signx %o3
  430. brgz,a %o3,.L_bn_add_words_proceed
  431. lduw [%o1],%o4
  432. retl
  433. clr %o0
  434. .L_bn_add_words_proceed:
  435. andcc %o3,-4,%g0
  436. bz,pn %icc,.L_bn_add_words_tail
  437. addcc %g0,0,%g0 ! clear carry flag
  438. .L_bn_add_words_loop: ! wow! 32 aligned!
  439. dec 4,%o3
  440. lduw [%o2],%o5
  441. lduw [%o1+4],%g1
  442. lduw [%o2+4],%g2
  443. lduw [%o1+8],%g3
  444. lduw [%o2+8],%g4
  445. addccc %o5,%o4,%o5
  446. stuw %o5,[%o0]
  447. lduw [%o1+12],%o4
  448. lduw [%o2+12],%o5
  449. inc 16,%o1
  450. addccc %g1,%g2,%g1
  451. stuw %g1,[%o0+4]
  452. inc 16,%o2
  453. addccc %g3,%g4,%g3
  454. stuw %g3,[%o0+8]
  455. inc 16,%o0
  456. addccc %o5,%o4,%o5
  457. stuw %o5,[%o0-4]
  458. and %o3,-4,%g1
  459. brnz,a,pt %g1,.L_bn_add_words_loop
  460. lduw [%o1],%o4
  461. brnz,a,pn %o3,.L_bn_add_words_tail
  462. lduw [%o1],%o4
  463. .L_bn_add_words_return:
  464. clr %o0
  465. retl
  466. movcs %icc,1,%o0
  467. nop
  468. .L_bn_add_words_tail:
  469. lduw [%o2],%o5
  470. dec %o3
  471. addccc %o5,%o4,%o5
  472. brz,pt %o3,.L_bn_add_words_return
  473. stuw %o5,[%o0]
  474. lduw [%o1+4],%o4
  475. lduw [%o2+4],%o5
  476. dec %o3
  477. addccc %o5,%o4,%o5
  478. brz,pt %o3,.L_bn_add_words_return
  479. stuw %o5,[%o0+4]
  480. lduw [%o1+8],%o4
  481. lduw [%o2+8],%o5
  482. addccc %o5,%o4,%o5
  483. stuw %o5,[%o0+8]
  484. clr %o0
  485. retl
  486. movcs %icc,1,%o0
  487. .type bn_add_words,#function
  488. .size bn_add_words,(.-bn_add_words)
  489. .global bn_sub_words
  490. /*
  491. * BN_ULONG bn_sub_words(rp,ap,bp,n)
  492. * BN_ULONG *rp,*ap,*bp;
  493. * int n;
  494. */
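/*
 * Semantics sketch in C: word-wise subtraction with borrow propagation;
 * the final borrow (0 or 1) is returned (illustrative name and types only):
 *
 *	unsigned int bn_sub_words_ref(unsigned int *rp, const unsigned int *ap,
 *	                              const unsigned int *bp, int n)
 *	{
 *		unsigned long long t;
 *		unsigned int borrow = 0;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			t = (unsigned long long)ap[i] - bp[i] - borrow;
 *			rp[i] = (unsigned int)t;
 *			borrow = (unsigned int)(t >> 32) & 1;	// 1 iff ap[i] < bp[i]+borrow
 *		}
 *		return borrow;
 *	}
 */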
  495. bn_sub_words:
  496. sra %o3,%g0,%o3 ! signx %o3
  497. brgz,a %o3,.L_bn_sub_words_proceed
  498. lduw [%o1],%o4
  499. retl
  500. clr %o0
  501. .L_bn_sub_words_proceed:
  502. andcc %o3,-4,%g0
  503. bz,pn %icc,.L_bn_sub_words_tail
  504. addcc %g0,0,%g0 ! clear carry flag
  505. .L_bn_sub_words_loop: ! wow! 32 aligned!
  506. dec 4,%o3
  507. lduw [%o2],%o5
  508. lduw [%o1+4],%g1
  509. lduw [%o2+4],%g2
  510. lduw [%o1+8],%g3
  511. lduw [%o2+8],%g4
  512. subccc %o4,%o5,%o5
  513. stuw %o5,[%o0]
  514. lduw [%o1+12],%o4
  515. lduw [%o2+12],%o5
  516. inc 16,%o1
  517. subccc %g1,%g2,%g2
  518. stuw %g2,[%o0+4]
  519. inc 16,%o2
  520. subccc %g3,%g4,%g4
  521. stuw %g4,[%o0+8]
  522. inc 16,%o0
  523. subccc %o4,%o5,%o5
  524. stuw %o5,[%o0-4]
  525. and %o3,-4,%g1
  526. brnz,a,pt %g1,.L_bn_sub_words_loop
  527. lduw [%o1],%o4
  528. brnz,a,pn %o3,.L_bn_sub_words_tail
  529. lduw [%o1],%o4
  530. .L_bn_sub_words_return:
  531. clr %o0
  532. retl
  533. movcs %icc,1,%o0
  534. nop
  535. .L_bn_sub_words_tail: ! wow! 32 aligned!
  536. lduw [%o2],%o5
  537. dec %o3
  538. subccc %o4,%o5,%o5
  539. brz,pt %o3,.L_bn_sub_words_return
  540. stuw %o5,[%o0]
  541. lduw [%o1+4],%o4
  542. lduw [%o2+4],%o5
  543. dec %o3
  544. subccc %o4,%o5,%o5
  545. brz,pt %o3,.L_bn_sub_words_return
  546. stuw %o5,[%o0+4]
  547. lduw [%o1+8],%o4
  548. lduw [%o2+8],%o5
  549. subccc %o4,%o5,%o5
  550. stuw %o5,[%o0+8]
  551. clr %o0
  552. retl
  553. movcs %icc,1,%o0
  554. .type bn_sub_words,#function
  555. .size bn_sub_words,(.-bn_sub_words)
  556. /*
  557. * The code below depends on the fact that the upper parts of %l0-%l7
  558. * and %i0-%i7 are zeroed by the kernel after a context switch. In
  559. * previous versions this comment stated that "the trouble is that
  560. * it's not feasible to implement the mumbo-jumbo in less V9
  561. * instructions:-(", which apparently isn't true thanks to the
  562. * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
  563. * results not from the shorter code, but from the elimination of
  564. * multi-cycle, non-pairable 'rd %y,%rd' instructions.
  565. *
  566. * Andy.
  567. */
  568. /*
  569. * Here is register usage map for *all* routines below.
  570. */
  571. #define t_1 %o0
  572. #define t_2 %o1
  573. #define c_12 %o2
  574. #define c_3 %o3
  575. #define ap(I) [%i1+4*I]
  576. #define bp(I) [%i2+4*I]
  577. #define rp(I) [%i0+4*I]
  578. #define a_0 %l0
  579. #define a_1 %l1
  580. #define a_2 %l2
  581. #define a_3 %l3
  582. #define a_4 %l4
  583. #define a_5 %l5
  584. #define a_6 %l6
  585. #define a_7 %l7
  586. #define b_0 %i3
  587. #define b_1 %i4
  588. #define b_2 %i5
  589. #define b_3 %o4
  590. #define b_4 %o5
  591. #define b_5 %o7
  592. #define b_6 %g1
  593. #define b_7 %g4
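/*
 * Each "mul_add_c(a[i],b[j],c1,c2,c3)" mnemonic quoted in the comments
 * below is the usual comba column step: the 64-bit product a[i]*b[j] is
 * added into a 96-bit accumulator kept in three 32-bit words. A C sketch
 * of that step follows (illustrative, not the exact OpenSSL macro); the
 * register-level scheme used here differs in that it packs c1/c2 into
 * the single 64-bit c_12 and counts the 2^64 overflows in c_3, in units
 * of 2^32 (t_2), via the 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pairs:
 *
 *	static void mul_add_c(unsigned int a, unsigned int b,
 *	                      unsigned int *c1, unsigned int *c2,
 *	                      unsigned int *c3)
 *	{
 *		unsigned long long t = (unsigned long long)a * b + *c1;
 *
 *		*c1 = (unsigned int)t;
 *		t = (t >> 32) + *c2;
 *		*c2 = (unsigned int)t;
 *		*c3 += (unsigned int)(t >> 32);		// at most 1
 *	}
 */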
  594. .align 32
  595. .global bn_mul_comba8
  596. /*
  597. * void bn_mul_comba8(r,a,b)
  598. * BN_ULONG *r,*a,*b;
  599. */
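/*
 * The straight-line code below is the fully unrolled form of this comba
 * product. A compact C sketch of the same computation, using the
 * mul_add_c() helper sketched above (names are illustrative only):
 *
 *	void bn_mul_comba8_ref(unsigned int *r, const unsigned int *a,
 *	                       const unsigned int *b)
 *	{
 *		unsigned int c1 = 0, c2 = 0, c3 = 0;
 *		int k, i;
 *
 *		for (k = 0; k < 15; k++) {		// output column index
 *			for (i = 0; i <= k; i++)
 *				if (i < 8 && k - i < 8)
 *					mul_add_c(a[i], b[k-i], &c1, &c2, &c3);
 *			r[k] = c1;			// emit column k
 *			c1 = c2; c2 = c3; c3 = 0;	// shift the accumulator
 *		}
 *		r[15] = c1;				// top word is the leftover carry
 *	}
 */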
  600. bn_mul_comba8:
  601. save %sp,FRAME_SIZE,%sp
  602. mov 1,t_2
  603. lduw ap(0),a_0
  604. sllx t_2,32,t_2
  605. lduw bp(0),b_0 !=
  606. lduw bp(1),b_1
  607. mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
  608. srlx t_1,32,c_12
  609. stuw t_1,rp(0) !=!r[0]=c1;
  610. lduw ap(1),a_1
  611. mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
  612. addcc c_12,t_1,c_12
  613. clr c_3 !=
  614. bcs,a %xcc,.+8
  615. add c_3,t_2,c_3
  616. lduw ap(2),a_2
  617. mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
  618. addcc c_12,t_1,t_1
  619. bcs,a %xcc,.+8
  620. add c_3,t_2,c_3
  621. srlx t_1,32,c_12 !=
  622. stuw t_1,rp(1) !r[1]=c2;
  623. or c_12,c_3,c_12
  624. mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
  625. addcc c_12,t_1,c_12 !=
  626. clr c_3
  627. bcs,a %xcc,.+8
  628. add c_3,t_2,c_3
  629. lduw bp(2),b_2 !=
  630. mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
  631. addcc c_12,t_1,c_12
  632. bcs,a %xcc,.+8
  633. add c_3,t_2,c_3 !=
  634. lduw bp(3),b_3
  635. mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
  636. addcc c_12,t_1,t_1
  637. bcs,a %xcc,.+8 !=
  638. add c_3,t_2,c_3
  639. srlx t_1,32,c_12
  640. stuw t_1,rp(2) !r[2]=c3;
  641. or c_12,c_3,c_12 !=
  642. mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
  643. addcc c_12,t_1,c_12
  644. clr c_3
  645. bcs,a %xcc,.+8 !=
  646. add c_3,t_2,c_3
  647. mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
  648. addcc c_12,t_1,c_12
  649. bcs,a %xcc,.+8 !=
  650. add c_3,t_2,c_3
  651. lduw ap(3),a_3
  652. mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
  653. addcc c_12,t_1,c_12 !=
  654. bcs,a %xcc,.+8
  655. add c_3,t_2,c_3
  656. lduw ap(4),a_4
  657. mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
  658. addcc c_12,t_1,t_1
  659. bcs,a %xcc,.+8
  660. add c_3,t_2,c_3
  661. srlx t_1,32,c_12 !=
  662. stuw t_1,rp(3) !r[3]=c1;
  663. or c_12,c_3,c_12
  664. mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
  665. addcc c_12,t_1,c_12 !=
  666. clr c_3
  667. bcs,a %xcc,.+8
  668. add c_3,t_2,c_3
  669. mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
  670. addcc c_12,t_1,c_12
  671. bcs,a %xcc,.+8
  672. add c_3,t_2,c_3
  673. mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
  674. addcc c_12,t_1,c_12
  675. bcs,a %xcc,.+8
  676. add c_3,t_2,c_3
  677. lduw bp(4),b_4 !=
  678. mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
  679. addcc c_12,t_1,c_12
  680. bcs,a %xcc,.+8
  681. add c_3,t_2,c_3 !=
  682. lduw bp(5),b_5
  683. mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
  684. addcc c_12,t_1,t_1
  685. bcs,a %xcc,.+8 !=
  686. add c_3,t_2,c_3
  687. srlx t_1,32,c_12
  688. stuw t_1,rp(4) !r[4]=c2;
  689. or c_12,c_3,c_12 !=
  690. mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
  691. addcc c_12,t_1,c_12
  692. clr c_3
  693. bcs,a %xcc,.+8 !=
  694. add c_3,t_2,c_3
  695. mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
  696. addcc c_12,t_1,c_12
  697. bcs,a %xcc,.+8 !=
  698. add c_3,t_2,c_3
  699. mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
  700. addcc c_12,t_1,c_12
  701. bcs,a %xcc,.+8 !=
  702. add c_3,t_2,c_3
  703. mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
  704. addcc c_12,t_1,c_12
  705. bcs,a %xcc,.+8 !=
  706. add c_3,t_2,c_3
  707. lduw ap(5),a_5
  708. mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
  709. addcc c_12,t_1,c_12 !=
  710. bcs,a %xcc,.+8
  711. add c_3,t_2,c_3
  712. lduw ap(6),a_6
  713. mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
  714. addcc c_12,t_1,t_1
  715. bcs,a %xcc,.+8
  716. add c_3,t_2,c_3
  717. srlx t_1,32,c_12 !=
  718. stuw t_1,rp(5) !r[5]=c3;
  719. or c_12,c_3,c_12
  720. mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
  721. addcc c_12,t_1,c_12 !=
  722. clr c_3
  723. bcs,a %xcc,.+8
  724. add c_3,t_2,c_3
  725. mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
  726. addcc c_12,t_1,c_12
  727. bcs,a %xcc,.+8
  728. add c_3,t_2,c_3
  729. mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
  730. addcc c_12,t_1,c_12
  731. bcs,a %xcc,.+8
  732. add c_3,t_2,c_3
  733. mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
  734. addcc c_12,t_1,c_12
  735. bcs,a %xcc,.+8
  736. add c_3,t_2,c_3
  737. mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
  738. addcc c_12,t_1,c_12
  739. bcs,a %xcc,.+8
  740. add c_3,t_2,c_3
  741. lduw bp(6),b_6 !=
  742. mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
  743. addcc c_12,t_1,c_12
  744. bcs,a %xcc,.+8
  745. add c_3,t_2,c_3 !=
  746. lduw bp(7),b_7
  747. mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
  748. addcc c_12,t_1,t_1
  749. bcs,a %xcc,.+8 !=
  750. add c_3,t_2,c_3
  751. srlx t_1,32,c_12
  752. stuw t_1,rp(6) !r[6]=c1;
  753. or c_12,c_3,c_12 !=
  754. mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
  755. addcc c_12,t_1,c_12
  756. clr c_3
  757. bcs,a %xcc,.+8 !=
  758. add c_3,t_2,c_3
  759. mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
  760. addcc c_12,t_1,c_12
  761. bcs,a %xcc,.+8 !=
  762. add c_3,t_2,c_3
  763. mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
  764. addcc c_12,t_1,c_12
  765. bcs,a %xcc,.+8 !=
  766. add c_3,t_2,c_3
  767. mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
  768. addcc c_12,t_1,c_12
  769. bcs,a %xcc,.+8 !=
  770. add c_3,t_2,c_3
  771. mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
  772. addcc c_12,t_1,c_12
  773. bcs,a %xcc,.+8 !=
  774. add c_3,t_2,c_3
  775. mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
  776. addcc c_12,t_1,c_12
  777. bcs,a %xcc,.+8 !=
  778. add c_3,t_2,c_3
  779. lduw ap(7),a_7
  780. mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
  781. addcc c_12,t_1,c_12
  782. bcs,a %xcc,.+8
  783. add c_3,t_2,c_3
  784. mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
  785. addcc c_12,t_1,t_1
  786. bcs,a %xcc,.+8
  787. add c_3,t_2,c_3
  788. srlx t_1,32,c_12 !=
  789. stuw t_1,rp(7) !r[7]=c2;
  790. or c_12,c_3,c_12
  791. mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
  792. addcc c_12,t_1,c_12
  793. clr c_3
  794. bcs,a %xcc,.+8
  795. add c_3,t_2,c_3 !=
  796. mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
  797. addcc c_12,t_1,c_12
  798. bcs,a %xcc,.+8
  799. add c_3,t_2,c_3 !=
  800. mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
  801. addcc c_12,t_1,c_12
  802. bcs,a %xcc,.+8
  803. add c_3,t_2,c_3 !=
  804. mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
  805. addcc c_12,t_1,c_12
  806. bcs,a %xcc,.+8
  807. add c_3,t_2,c_3 !=
  808. mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
  809. addcc c_12,t_1,c_12
  810. bcs,a %xcc,.+8
  811. add c_3,t_2,c_3 !=
  812. mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
  813. addcc c_12,t_1,c_12
  814. bcs,a %xcc,.+8
  815. add c_3,t_2,c_3 !=
  816. mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
  817. addcc c_12,t_1,t_1
  818. bcs,a %xcc,.+8
  819. add c_3,t_2,c_3 !=
  820. srlx t_1,32,c_12
  821. stuw t_1,rp(8) !r[8]=c3;
  822. or c_12,c_3,c_12
  823. mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
  824. addcc c_12,t_1,c_12
  825. clr c_3
  826. bcs,a %xcc,.+8
  827. add c_3,t_2,c_3 !=
  828. mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
  829. addcc c_12,t_1,c_12
  830. bcs,a %xcc,.+8 !=
  831. add c_3,t_2,c_3
  832. mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
  833. addcc c_12,t_1,c_12
  834. bcs,a %xcc,.+8 !=
  835. add c_3,t_2,c_3
  836. mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
  837. addcc c_12,t_1,c_12
  838. bcs,a %xcc,.+8 !=
  839. add c_3,t_2,c_3
  840. mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
  841. addcc c_12,t_1,c_12
  842. bcs,a %xcc,.+8 !=
  843. add c_3,t_2,c_3
  844. mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
  845. addcc c_12,t_1,t_1
  846. bcs,a %xcc,.+8 !=
  847. add c_3,t_2,c_3
  848. srlx t_1,32,c_12
  849. stuw t_1,rp(9) !r[9]=c1;
  850. or c_12,c_3,c_12 !=
  851. mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
  852. addcc c_12,t_1,c_12
  853. clr c_3
  854. bcs,a %xcc,.+8 !=
  855. add c_3,t_2,c_3
  856. mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
  857. addcc c_12,t_1,c_12
  858. bcs,a %xcc,.+8 !=
  859. add c_3,t_2,c_3
  860. mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
  861. addcc c_12,t_1,c_12
  862. bcs,a %xcc,.+8 !=
  863. add c_3,t_2,c_3
  864. mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
  865. addcc c_12,t_1,c_12
  866. bcs,a %xcc,.+8 !=
  867. add c_3,t_2,c_3
  868. mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
  869. addcc c_12,t_1,t_1
  870. bcs,a %xcc,.+8 !=
  871. add c_3,t_2,c_3
  872. srlx t_1,32,c_12
  873. stuw t_1,rp(10) !r[10]=c2;
  874. or c_12,c_3,c_12 !=
  875. mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
  876. addcc c_12,t_1,c_12
  877. clr c_3
  878. bcs,a %xcc,.+8 !=
  879. add c_3,t_2,c_3
  880. mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
  881. addcc c_12,t_1,c_12
  882. bcs,a %xcc,.+8 !=
  883. add c_3,t_2,c_3
  884. mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
  885. addcc c_12,t_1,c_12
  886. bcs,a %xcc,.+8 !=
  887. add c_3,t_2,c_3
  888. mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
  889. addcc c_12,t_1,t_1
  890. bcs,a %xcc,.+8 !=
  891. add c_3,t_2,c_3
  892. srlx t_1,32,c_12
  893. stuw t_1,rp(11) !r[11]=c3;
  894. or c_12,c_3,c_12 !=
  895. mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
  896. addcc c_12,t_1,c_12
  897. clr c_3
  898. bcs,a %xcc,.+8 !=
  899. add c_3,t_2,c_3
  900. mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
  901. addcc c_12,t_1,c_12
  902. bcs,a %xcc,.+8 !=
  903. add c_3,t_2,c_3
  904. mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
  905. addcc c_12,t_1,t_1
  906. bcs,a %xcc,.+8 !=
  907. add c_3,t_2,c_3
  908. srlx t_1,32,c_12
  909. stuw t_1,rp(12) !r[12]=c1;
  910. or c_12,c_3,c_12 !=
  911. mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
  912. addcc c_12,t_1,c_12
  913. clr c_3
  914. bcs,a %xcc,.+8 !=
  915. add c_3,t_2,c_3
  916. mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
  917. addcc c_12,t_1,t_1
  918. bcs,a %xcc,.+8 !=
  919. add c_3,t_2,c_3
  920. srlx t_1,32,c_12
  921. st t_1,rp(13) !r[13]=c2;
  922. or c_12,c_3,c_12 !=
  923. mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
  924. addcc c_12,t_1,t_1
  925. srlx t_1,32,c_12 !=
  926. stuw t_1,rp(14) !r[14]=c3;
  927. stuw c_12,rp(15) !r[15]=c1;
  928. ret
  929. restore %g0,%g0,%o0 !=
  930. .type bn_mul_comba8,#function
  931. .size bn_mul_comba8,(.-bn_mul_comba8)
  932. .align 32
  933. .global bn_mul_comba4
  934. /*
  935. * void bn_mul_comba4(r,a,b)
  936. * BN_ULONG *r,*a,*b;
  937. */
  938. bn_mul_comba4:
  939. save %sp,FRAME_SIZE,%sp
  940. lduw ap(0),a_0
  941. mov 1,t_2
  942. lduw bp(0),b_0
  943. sllx t_2,32,t_2 !=
  944. lduw bp(1),b_1
  945. mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
  946. srlx t_1,32,c_12
  947. stuw t_1,rp(0) !=!r[0]=c1;
  948. lduw ap(1),a_1
  949. mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
  950. addcc c_12,t_1,c_12
  951. clr c_3 !=
  952. bcs,a %xcc,.+8
  953. add c_3,t_2,c_3
  954. lduw ap(2),a_2
  955. mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
  956. addcc c_12,t_1,t_1
  957. bcs,a %xcc,.+8
  958. add c_3,t_2,c_3
  959. srlx t_1,32,c_12 !=
  960. stuw t_1,rp(1) !r[1]=c2;
  961. or c_12,c_3,c_12
  962. mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
  963. addcc c_12,t_1,c_12 !=
  964. clr c_3
  965. bcs,a %xcc,.+8
  966. add c_3,t_2,c_3
  967. lduw bp(2),b_2 !=
  968. mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
  969. addcc c_12,t_1,c_12
  970. bcs,a %xcc,.+8
  971. add c_3,t_2,c_3 !=
  972. lduw bp(3),b_3
  973. mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
  974. addcc c_12,t_1,t_1
  975. bcs,a %xcc,.+8 !=
  976. add c_3,t_2,c_3
  977. srlx t_1,32,c_12
  978. stuw t_1,rp(2) !r[2]=c3;
  979. or c_12,c_3,c_12 !=
  980. mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
  981. addcc c_12,t_1,c_12
  982. clr c_3
  983. bcs,a %xcc,.+8 !=
  984. add c_3,t_2,c_3
  985. mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
  986. addcc c_12,t_1,c_12
  987. bcs,a %xcc,.+8 !=
  988. add c_3,t_2,c_3
  989. lduw ap(3),a_3
  990. mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
  991. addcc c_12,t_1,c_12 !=
  992. bcs,a %xcc,.+8
  993. add c_3,t_2,c_3
  994. mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
  995. addcc c_12,t_1,t_1 !=
  996. bcs,a %xcc,.+8
  997. add c_3,t_2,c_3
  998. srlx t_1,32,c_12
  999. stuw t_1,rp(3) !=!r[3]=c1;
  1000. or c_12,c_3,c_12
  1001. mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
  1002. addcc c_12,t_1,c_12
  1003. clr c_3 !=
  1004. bcs,a %xcc,.+8
  1005. add c_3,t_2,c_3
  1006. mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
  1007. addcc c_12,t_1,c_12 !=
  1008. bcs,a %xcc,.+8
  1009. add c_3,t_2,c_3
  1010. mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
  1011. addcc c_12,t_1,t_1 !=
  1012. bcs,a %xcc,.+8
  1013. add c_3,t_2,c_3
  1014. srlx t_1,32,c_12
  1015. stuw t_1,rp(4) !=!r[4]=c2;
  1016. or c_12,c_3,c_12
  1017. mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
  1018. addcc c_12,t_1,c_12
  1019. clr c_3 !=
  1020. bcs,a %xcc,.+8
  1021. add c_3,t_2,c_3
  1022. mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
  1023. addcc c_12,t_1,t_1 !=
  1024. bcs,a %xcc,.+8
  1025. add c_3,t_2,c_3
  1026. srlx t_1,32,c_12
  1027. stuw t_1,rp(5) !=!r[5]=c3;
  1028. or c_12,c_3,c_12
  1029. mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
  1030. addcc c_12,t_1,t_1
  1031. srlx t_1,32,c_12 !=
  1032. stuw t_1,rp(6) !r[6]=c1;
  1033. stuw c_12,rp(7) !r[7]=c2;
  1034. ret
  1035. restore %g0,%g0,%o0
  1036. .type bn_mul_comba4,#function
  1037. .size bn_mul_comba4,(.-bn_mul_comba4)
  1038. .align 32
  1039. .global bn_sqr_comba8
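/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * The "sqr_add_c2" mnemonics in the comments below add the cross product
 * a[i]*a[j] (i!=j) twice, because it occurs in two symmetric positions of
 * the square; doubling the product up front could overflow 64 bits, hence
 * the two carry-checked additions. A C sketch of that step (illustrative
 * names and types only):
 *
 *	static void sqr_add_c2(const unsigned int *a, int i, int j,
 *	                       unsigned int *c1, unsigned int *c2,
 *	                       unsigned int *c3)
 *	{
 *		unsigned long long p = (unsigned long long)a[i] * a[j];
 *		unsigned long long t;
 *		int k;
 *
 *		for (k = 0; k < 2; k++) {	// add the product twice
 *			t = p + *c1;
 *			*c1 = (unsigned int)t;
 *			t = (t >> 32) + *c2;
 *			*c2 = (unsigned int)t;
 *			*c3 += (unsigned int)(t >> 32);
 *		}
 *	}
 */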
  1040. bn_sqr_comba8:
  1041. save %sp,FRAME_SIZE,%sp
  1042. mov 1,t_2
  1043. lduw ap(0),a_0
  1044. sllx t_2,32,t_2
  1045. lduw ap(1),a_1
  1046. mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
  1047. srlx t_1,32,c_12
  1048. stuw t_1,rp(0) !r[0]=c1;
  1049. lduw ap(2),a_2
  1050. mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
  1051. addcc c_12,t_1,c_12
  1052. clr c_3
  1053. bcs,a %xcc,.+8
  1054. add c_3,t_2,c_3
  1055. addcc c_12,t_1,t_1
  1056. bcs,a %xcc,.+8
  1057. add c_3,t_2,c_3
  1058. srlx t_1,32,c_12
  1059. stuw t_1,rp(1) !r[1]=c2;
  1060. or c_12,c_3,c_12
  1061. mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
  1062. addcc c_12,t_1,c_12
  1063. clr c_3
  1064. bcs,a %xcc,.+8
  1065. add c_3,t_2,c_3
  1066. addcc c_12,t_1,c_12
  1067. bcs,a %xcc,.+8
  1068. add c_3,t_2,c_3
  1069. lduw ap(3),a_3
  1070. mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
  1071. addcc c_12,t_1,t_1
  1072. bcs,a %xcc,.+8
  1073. add c_3,t_2,c_3
  1074. srlx t_1,32,c_12
  1075. stuw t_1,rp(2) !r[2]=c3;
  1076. or c_12,c_3,c_12
  1077. mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
  1078. addcc c_12,t_1,c_12
  1079. clr c_3
  1080. bcs,a %xcc,.+8
  1081. add c_3,t_2,c_3
  1082. addcc c_12,t_1,c_12
  1083. bcs,a %xcc,.+8
  1084. add c_3,t_2,c_3
  1085. lduw ap(4),a_4
  1086. mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
  1087. addcc c_12,t_1,c_12
  1088. bcs,a %xcc,.+8
  1089. add c_3,t_2,c_3
  1090. addcc c_12,t_1,t_1
  1091. bcs,a %xcc,.+8
  1092. add c_3,t_2,c_3
  1093. srlx t_1,32,c_12
  1094. st t_1,rp(3) !r[3]=c1;
  1095. or c_12,c_3,c_12
  1096. mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
  1097. addcc c_12,t_1,c_12
  1098. clr c_3
  1099. bcs,a %xcc,.+8
  1100. add c_3,t_2,c_3
  1101. addcc c_12,t_1,c_12
  1102. bcs,a %xcc,.+8
  1103. add c_3,t_2,c_3
  1104. mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
  1105. addcc c_12,t_1,c_12
  1106. bcs,a %xcc,.+8
  1107. add c_3,t_2,c_3
  1108. addcc c_12,t_1,c_12
  1109. bcs,a %xcc,.+8
  1110. add c_3,t_2,c_3
  1111. lduw ap(5),a_5
  1112. mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
  1113. addcc c_12,t_1,t_1
  1114. bcs,a %xcc,.+8
  1115. add c_3,t_2,c_3
  1116. srlx t_1,32,c_12
  1117. stuw t_1,rp(4) !r[4]=c2;
  1118. or c_12,c_3,c_12
  1119. mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
  1120. addcc c_12,t_1,c_12
  1121. clr c_3
  1122. bcs,a %xcc,.+8
  1123. add c_3,t_2,c_3
  1124. addcc c_12,t_1,c_12
  1125. bcs,a %xcc,.+8
  1126. add c_3,t_2,c_3
  1127. mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
  1128. addcc c_12,t_1,c_12
  1129. bcs,a %xcc,.+8
  1130. add c_3,t_2,c_3
  1131. addcc c_12,t_1,c_12
  1132. bcs,a %xcc,.+8
  1133. add c_3,t_2,c_3
  1134. lduw ap(6),a_6
  1135. mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
  1136. addcc c_12,t_1,c_12
  1137. bcs,a %xcc,.+8
  1138. add c_3,t_2,c_3
  1139. addcc c_12,t_1,t_1
  1140. bcs,a %xcc,.+8
  1141. add c_3,t_2,c_3
  1142. srlx t_1,32,c_12
  1143. stuw t_1,rp(5) !r[5]=c3;
  1144. or c_12,c_3,c_12
  1145. mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
  1146. addcc c_12,t_1,c_12
  1147. clr c_3
  1148. bcs,a %xcc,.+8
  1149. add c_3,t_2,c_3
  1150. addcc c_12,t_1,c_12
  1151. bcs,a %xcc,.+8
  1152. add c_3,t_2,c_3
  1153. mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
  1154. addcc c_12,t_1,c_12
  1155. bcs,a %xcc,.+8
  1156. add c_3,t_2,c_3
  1157. addcc c_12,t_1,c_12
  1158. bcs,a %xcc,.+8
  1159. add c_3,t_2,c_3
  1160. mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
  1161. addcc c_12,t_1,c_12
  1162. bcs,a %xcc,.+8
  1163. add c_3,t_2,c_3
  1164. addcc c_12,t_1,c_12
  1165. bcs,a %xcc,.+8
  1166. add c_3,t_2,c_3
  1167. lduw ap(7),a_7
  1168. mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
  1169. addcc c_12,t_1,t_1
  1170. bcs,a %xcc,.+8
  1171. add c_3,t_2,c_3
  1172. srlx t_1,32,c_12
  1173. stuw t_1,rp(6) !r[6]=c1;
  1174. or c_12,c_3,c_12
  1175. mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
  1176. addcc c_12,t_1,c_12
  1177. clr c_3
  1178. bcs,a %xcc,.+8
  1179. add c_3,t_2,c_3
  1180. addcc c_12,t_1,c_12
  1181. bcs,a %xcc,.+8
  1182. add c_3,t_2,c_3
  1183. mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
  1184. addcc c_12,t_1,c_12
  1185. bcs,a %xcc,.+8
  1186. add c_3,t_2,c_3
  1187. addcc c_12,t_1,c_12
  1188. bcs,a %xcc,.+8
  1189. add c_3,t_2,c_3
  1190. mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
  1191. addcc c_12,t_1,c_12
  1192. bcs,a %xcc,.+8
  1193. add c_3,t_2,c_3
  1194. addcc c_12,t_1,c_12
  1195. bcs,a %xcc,.+8
  1196. add c_3,t_2,c_3
  1197. mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
  1198. addcc c_12,t_1,c_12
  1199. bcs,a %xcc,.+8
  1200. add c_3,t_2,c_3
  1201. addcc c_12,t_1,t_1
  1202. bcs,a %xcc,.+8
  1203. add c_3,t_2,c_3
  1204. srlx t_1,32,c_12
  1205. stuw t_1,rp(7) !r[7]=c2;
  1206. or c_12,c_3,c_12
  1207. mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
  1208. addcc c_12,t_1,c_12
  1209. clr c_3
  1210. bcs,a %xcc,.+8
  1211. add c_3,t_2,c_3
  1212. addcc c_12,t_1,c_12
  1213. bcs,a %xcc,.+8
  1214. add c_3,t_2,c_3
  1215. mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
  1216. addcc c_12,t_1,c_12
  1217. bcs,a %xcc,.+8
  1218. add c_3,t_2,c_3
  1219. addcc c_12,t_1,c_12
  1220. bcs,a %xcc,.+8
  1221. add c_3,t_2,c_3
  1222. mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
  1223. addcc c_12,t_1,c_12
  1224. bcs,a %xcc,.+8
  1225. add c_3,t_2,c_3
  1226. addcc c_12,t_1,c_12
  1227. bcs,a %xcc,.+8
  1228. add c_3,t_2,c_3
  1229. mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
  1230. addcc c_12,t_1,t_1
  1231. bcs,a %xcc,.+8
  1232. add c_3,t_2,c_3
  1233. srlx t_1,32,c_12
  1234. stuw t_1,rp(8) !r[8]=c3;
  1235. or c_12,c_3,c_12
  1236. mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
  1237. addcc c_12,t_1,c_12
  1238. clr c_3
  1239. bcs,a %xcc,.+8
  1240. add c_3,t_2,c_3
  1241. addcc c_12,t_1,c_12
  1242. bcs,a %xcc,.+8
  1243. add c_3,t_2,c_3
  1244. mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
  1245. addcc c_12,t_1,c_12
  1246. bcs,a %xcc,.+8
  1247. add c_3,t_2,c_3
  1248. addcc c_12,t_1,c_12
  1249. bcs,a %xcc,.+8
  1250. add c_3,t_2,c_3
  1251. mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
  1252. addcc c_12,t_1,c_12
  1253. bcs,a %xcc,.+8
  1254. add c_3,t_2,c_3
  1255. addcc c_12,t_1,t_1
  1256. bcs,a %xcc,.+8
  1257. add c_3,t_2,c_3
  1258. srlx t_1,32,c_12
  1259. stuw t_1,rp(9) !r[9]=c1;
  1260. or c_12,c_3,c_12
  1261. mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
  1262. addcc c_12,t_1,c_12
  1263. clr c_3
  1264. bcs,a %xcc,.+8
  1265. add c_3,t_2,c_3
  1266. addcc c_12,t_1,c_12
  1267. bcs,a %xcc,.+8
  1268. add c_3,t_2,c_3
  1269. mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
  1270. addcc c_12,t_1,c_12
  1271. bcs,a %xcc,.+8
  1272. add c_3,t_2,c_3
  1273. addcc c_12,t_1,c_12
  1274. bcs,a %xcc,.+8
  1275. add c_3,t_2,c_3
  1276. mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
  1277. addcc c_12,t_1,t_1
  1278. bcs,a %xcc,.+8
  1279. add c_3,t_2,c_3
  1280. srlx t_1,32,c_12
  1281. stuw t_1,rp(10) !r[10]=c2;
  1282. or c_12,c_3,c_12
  1283. mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
  1284. addcc c_12,t_1,c_12
  1285. clr c_3
  1286. bcs,a %xcc,.+8
  1287. add c_3,t_2,c_3
  1288. addcc c_12,t_1,c_12
  1289. bcs,a %xcc,.+8
  1290. add c_3,t_2,c_3
  1291. mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
  1292. addcc c_12,t_1,c_12
  1293. bcs,a %xcc,.+8
  1294. add c_3,t_2,c_3
  1295. addcc c_12,t_1,t_1
  1296. bcs,a %xcc,.+8
  1297. add c_3,t_2,c_3
  1298. srlx t_1,32,c_12
  1299. stuw t_1,rp(11) !r[11]=c3;
  1300. or c_12,c_3,c_12
  1301. mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
  1302. addcc c_12,t_1,c_12
  1303. clr c_3
  1304. bcs,a %xcc,.+8
  1305. add c_3,t_2,c_3
  1306. addcc c_12,t_1,c_12
  1307. bcs,a %xcc,.+8
  1308. add c_3,t_2,c_3
  1309. mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
  1310. addcc c_12,t_1,t_1
  1311. bcs,a %xcc,.+8
  1312. add c_3,t_2,c_3
  1313. srlx t_1,32,c_12
  1314. stuw t_1,rp(12) !r[12]=c1;
  1315. or c_12,c_3,c_12
  1316. mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
  1317. addcc c_12,t_1,c_12
  1318. clr c_3
  1319. bcs,a %xcc,.+8
  1320. add c_3,t_2,c_3
  1321. addcc c_12,t_1,t_1
  1322. bcs,a %xcc,.+8
  1323. add c_3,t_2,c_3
  1324. srlx t_1,32,c_12
  1325. stuw t_1,rp(13) !r[13]=c2;
  1326. or c_12,c_3,c_12
  1327. mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
  1328. addcc c_12,t_1,t_1
  1329. srlx t_1,32,c_12
  1330. stuw t_1,rp(14) !r[14]=c3;
  1331. stuw c_12,rp(15) !r[15]=c1;
  1332. ret
  1333. restore %g0,%g0,%o0
  1334. .type bn_sqr_comba8,#function
  1335. .size bn_sqr_comba8,(.-bn_sqr_comba8)
  1336. .align 32
  1337. .global bn_sqr_comba4
  1338. /*
  1339. * void bn_sqr_comba4(r,a)
  1340. * BN_ULONG *r,*a;
  1341. */
  1342. bn_sqr_comba4:
  1343. save %sp,FRAME_SIZE,%sp
  1344. mov 1,t_2
  1345. lduw ap(0),a_0
  1346. sllx t_2,32,t_2
  1347. lduw ap(1),a_1
  1348. mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
  1349. srlx t_1,32,c_12
  1350. stuw t_1,rp(0) !r[0]=c1;
  1351. lduw ap(2),a_2
  1352. mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
  1353. addcc c_12,t_1,c_12
  1354. clr c_3
  1355. bcs,a %xcc,.+8
  1356. add c_3,t_2,c_3
  1357. addcc c_12,t_1,t_1
  1358. bcs,a %xcc,.+8
  1359. add c_3,t_2,c_3
  1360. srlx t_1,32,c_12
  1361. stuw t_1,rp(1) !r[1]=c2;
  1362. or c_12,c_3,c_12
  1363. mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
  1364. addcc c_12,t_1,c_12
  1365. clr c_3
  1366. bcs,a %xcc,.+8
  1367. add c_3,t_2,c_3
  1368. addcc c_12,t_1,c_12
  1369. bcs,a %xcc,.+8
  1370. add c_3,t_2,c_3
  1371. lduw ap(3),a_3
  1372. mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
  1373. addcc c_12,t_1,t_1
  1374. bcs,a %xcc,.+8
  1375. add c_3,t_2,c_3
  1376. srlx t_1,32,c_12
  1377. stuw t_1,rp(2) !r[2]=c3;
  1378. or c_12,c_3,c_12
  1379. mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
  1380. addcc c_12,t_1,c_12
  1381. clr c_3
  1382. bcs,a %xcc,.+8
  1383. add c_3,t_2,c_3
  1384. addcc c_12,t_1,c_12
  1385. bcs,a %xcc,.+8
  1386. add c_3,t_2,c_3
  1387. mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
  1388. addcc c_12,t_1,c_12
  1389. bcs,a %xcc,.+8
  1390. add c_3,t_2,c_3
  1391. addcc c_12,t_1,t_1
  1392. bcs,a %xcc,.+8
  1393. add c_3,t_2,c_3
  1394. srlx t_1,32,c_12
  1395. stuw t_1,rp(3) !r[3]=c1;
  1396. or c_12,c_3,c_12
  1397. mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
  1398. addcc c_12,t_1,c_12
  1399. clr c_3
  1400. bcs,a %xcc,.+8
  1401. add c_3,t_2,c_3
  1402. addcc c_12,t_1,c_12
  1403. bcs,a %xcc,.+8
  1404. add c_3,t_2,c_3
  1405. mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
  1406. addcc c_12,t_1,t_1
  1407. bcs,a %xcc,.+8
  1408. add c_3,t_2,c_3
  1409. srlx t_1,32,c_12
  1410. stuw t_1,rp(4) !r[4]=c2;
  1411. or c_12,c_3,c_12
  1412. mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
  1413. addcc c_12,t_1,c_12
  1414. clr c_3
  1415. bcs,a %xcc,.+8
  1416. add c_3,t_2,c_3
  1417. addcc c_12,t_1,t_1
  1418. bcs,a %xcc,.+8
  1419. add c_3,t_2,c_3
  1420. srlx t_1,32,c_12
  1421. stuw t_1,rp(5) !r[5]=c3;
  1422. or c_12,c_3,c_12
  1423. mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
  1424. addcc c_12,t_1,t_1
  1425. srlx t_1,32,c_12
  1426. stuw t_1,rp(6) !r[6]=c1;
  1427. stuw c_12,rp(7) !r[7]=c2;
  1428. ret
  1429. restore %g0,%g0,%o0
  1430. .type bn_sqr_comba4,#function
  1431. .size bn_sqr_comba4,(.-bn_sqr_comba4)
  1432. .align 32