#!/usr/bin/env perl
# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. November 2012. All rights reserved.
# ====================================================================
######################################################################
# Montgomery squaring-and-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) a collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) a collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA signing is dominated by the multi-op subroutines, while RSA
# verification and DSA are dominated by the single-op ones. A special
# note about the 4096-bit RSA verify result: the operands are too long
# for the dedicated hardware, so they are handled by the VIS3 code,
# which is why no improvement is seen there. It could surely be
# improved [by deploying the 'mpmul' instruction], maybe in the
# future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                       sign      verify     sign/s   verify/s
# rsa 1024 bits    0.000628s  0.000028s     1592.4    35434.4
# rsa 2048 bits    0.003282s  0.000106s      304.7     9438.3
# rsa 4096 bits    0.025866s  0.000340s       38.7     2940.9
# dsa 1024 bits    0.000301s  0.000332s     3323.7     3013.9
# dsa 2048 bits    0.001056s  0.001233s      946.9      810.8
#
# 64-bit process, this module:
#                       sign      verify     sign/s   verify/s
# rsa 1024 bits    0.000256s  0.000016s     3904.4    61411.9
# rsa 2048 bits    0.000946s  0.000029s     1056.8    34292.7
# rsa 4096 bits    0.005061s  0.000340s      197.6     2940.5
# dsa 1024 bits    0.000176s  0.000195s     5674.7     5130.5
# dsa 2048 bits    0.000296s  0.000354s     3383.2     2827.6
#
######################################################################
# 32-bit process, VIS3:
#                       sign      verify     sign/s   verify/s
# rsa 1024 bits    0.000665s  0.000028s     1504.8    35233.3
# rsa 2048 bits    0.003349s  0.000106s      298.6     9433.4
# rsa 4096 bits    0.025959s  0.000341s       38.5     2934.8
# dsa 1024 bits    0.000320s  0.000341s     3123.3     2929.6
# dsa 2048 bits    0.001101s  0.001260s      908.2      793.4
#
# 32-bit process, this module:
#                       sign      verify     sign/s   verify/s
# rsa 1024 bits    0.000301s  0.000017s     3317.1    60240.0
# rsa 2048 bits    0.001034s  0.000030s      966.9    33812.7
# rsa 4096 bits    0.005244s  0.000341s      190.7     2935.4
# dsa 1024 bits    0.000201s  0.000205s     4976.1     4879.2
# dsa 2048 bits    0.000328s  0.000360s     3051.1     2774.2
#
# The 32-bit code is prone to performance degradation as the rate of
# interrupts dispatched to the CPU executing the code grows. This is
# because standard interrupt handling in a 32-bit process context
# zeroes the upper halves of most integer registers used as input or
# output, which renders the result invalid, so the operation has to
# be re-run. If the CPU is "bothered" only by timer interrupts, the
# penalty is hardly measurable. To mitigate the problem at higher
# interrupt rates, contemporary Linux kernels recognize the biased
# stack even in a 32-bit process context and preserve the full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
___
########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
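# The @A/@B/@N tables above describe how the three montmul/montsqr operands
# are presented to the instruction: operand A keeps limbs 0-13 in integer
# registers and the rest in the floating-point registers from @R, while B
# and N use (partly recycled) integer registers.  The helper below is a
# hypothetical debugging aid, not referenced anywhere in this module: it
# simply dumps the register assignment for a given limb count so the tables
# can be audited by eye.
sub dump_register_layout {
	my $num=shift;				# limb count, e.g. 8 for 512-bit operands
	for (my $j=0; $j<$num; $j++) {
		printf "limb %2d: A=%-4s B=%-4s N=%s\n", $j, $A[$j], $B[$j], $N[$j];
	}
}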
########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#                         const u64 *np,const BN_ULONG *n0);
#
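# For reference: apart from the possible early abort, each generated routine
# computes an ordinary Montgomery product, rp = ap*bp*R^-1 mod np with
# R = 2^(64*NUM), where n0 = -np^-1 mod 2^64 is the usual Montgomery constant
# supplied by the caller.  The sketch below is a hypothetical Math::BigInt
# model of that arithmetic; it is not used by this module and ignores the
# zero-on-abort return code of the real routines.
use Math::BigInt;

sub mont_mul_ref {
	my ($ap,$bp,$np,$num)=@_;			# Math::BigInt values, $num = limb count
	my $R    = Math::BigInt->bone()->blsft(64*$num);# R = 2^(64*NUM)
	my $Rinv = $R->copy()->bmodinv($np);		# R^-1 mod np
	return $ap->copy()->bmul($bp)->bmul($Rinv)->bmod($np);
}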
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
$code.=<<___;
.globl bn_mul_mont_t4_$NUM
.align 32
bn_mul_mont_t4_$NUM:
#ifdef __arch64__
mov 0,$sentinel
mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
mov -2047,%g4
and %g1,SPARCV9_64BIT_STACK,%g1
movrz %g1,0,%g4
mov -1,$sentinel
add %g4,-128,%g4
#else
mov -1,$sentinel
mov -128,%g4
#endif
sllx $sentinel,32,$sentinel
save %sp,%g4,%sp
#ifndef __arch64__
save %sp,-128,%sp ! warm it up
save %sp,-128,%sp
save %sp,-128,%sp
save %sp,-128,%sp
save %sp,-128,%sp
save %sp,-128,%sp
restore
restore
restore
restore
restore
restore
#endif
and %sp,1,%g4
or $sentinel,%fp,%fp
or %g4,$sentinel,$sentinel
! copy arguments to global registers
mov %i0,$rp
mov %i1,$ap
mov %i2,$bp
mov %i3,$np
ld [%i4+0],%f1 ! load *n0
ld [%i4+4],%f0
fsrc2 %f0,%f60
___
# load ap[$NUM] ########################################################
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
ld [$ap+$i*8+0],$lo
ld [$ap+$i*8+4],@A[$i]
sllx @A[$i],32,@A[$i]
or $lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
ld [$ap+$i*8+0],$lo
ld [$ap+$i*8+4],$hi
fsrc2 $hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
ld [$np+$i*8+0],$lo
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
ld [$np+$i*8+0],$lo
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
ld [$np+$i*8+0],$lo
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
cmp $ap,$bp
be SIZE_T_CC,.Lmsquare_$NUM
nop
___
# load bp[$NUM] ########################################################
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
ld [$bp+$i*8+0],$lo
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
or $lo,@B[$i],@B[$i]
___
}
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
ld [$bp+$i*8+0],$lo
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
or $lo,@B[$i],@B[$i]
___
}
# magic ################################################################
$code.=<<___;
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
.Lmresume_$NUM:
fbu,pn %fcc3,.Lmabort_$NUM
#ifndef __arch64__
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort_$NUM
#endif
nop
#ifdef __arch64__
restore
restore
restore
restore
restore
#else
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort1_$NUM
restore
#endif
___
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
restore
#else
and %fp,$sentinel,$sentinel
restore
and $sentinel,1,%o7
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Lmdone_$NUM
mov 0,%i0 ! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
st $lo,[$rp+$i*8+0]
st @R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
fsrc2 @R[$i],$hi
st $lo,[$rp+$i*8+0]
st $hi,[$rp+$i*8+4]
___
}
$code.=<<___;
mov 1,%i0 ! return success
.Lmdone_$NUM:
ret
restore
.Lmabort_$NUM:
restore
restore
restore
restore
restore
.Lmabort1_$NUM:
restore
mov 0,%i0 ! return failure
ret
restore
.align 32
.Lmsquare_$NUM:
save %sp,-128,%sp; or $sentinel,%fp,%fp
save %sp,-128,%sp; or $sentinel,%fp,%fp
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
ba .Lmresume_$NUM
nop
.type bn_mul_mont_t4_$NUM, #function
.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}
for ($i=8;$i<=32;$i+=8) {
&generate_bn_mul_mont_t4($i);
}
########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
srl $pwr, 2, %o4
and $pwr, 3, %o5
and %o4, 7, %o4
sll %o5, 3, %o5 ! offset within first cache line
add %o5, $ptbl, $ptbl ! of the pwrtbl
or %g0, 1, %o5
sll %o5, %o4, $ccr
___
$code.=<<___ if (!$skip_wr);
wr $ccr, %g0, %ccr
___
}
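# load_ccr() splits a window index in 0..31 into two parts: the low two bits
# select one of four 8-byte slots within a 32-byte group (folded into the
# table pointer), and the next three bits select which single %ccr flag bit
# to set, so that at most one of the conditional moves in load_b()/
# load_b_pair() below fires (none for window value 0, which keeps the first
# entry).  A hypothetical pure-Perl model of that decomposition, not used by
# the build:
sub ccr_decompose_ref {
	my $pwr=shift;				# window index, 0..31
	my $byte_off = ($pwr & 3)*8;		# added to the table pointer
	my $ccr = 1 << (($pwr >> 2) & 7);	# single flag bit written to %ccr
	return ($byte_off, $ccr);
}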
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;
$code.=<<___;
ldx [$pwrtbl+0*32], $B0
ldx [$pwrtbl+8*32], $B1
ldx [$pwrtbl+1*32], %o4
ldx [$pwrtbl+9*32], %o5
movvs %icc, %o4, $B0
ldx [$pwrtbl+2*32], %o4
movvs %icc, %o5, $B1
ldx [$pwrtbl+10*32],%o5
move %icc, %o4, $B0
ldx [$pwrtbl+3*32], %o4
move %icc, %o5, $B1
ldx [$pwrtbl+11*32],%o5
movneg %icc, %o4, $B0
ldx [$pwrtbl+4*32], %o4
movneg %icc, %o5, $B1
ldx [$pwrtbl+12*32],%o5
movcs %xcc, %o4, $B0
ldx [$pwrtbl+5*32],%o4
movcs %xcc, %o5, $B1
ldx [$pwrtbl+13*32],%o5
movvs %xcc, %o4, $B0
ldx [$pwrtbl+6*32], %o4
movvs %xcc, %o5, $B1
ldx [$pwrtbl+14*32],%o5
move %xcc, %o4, $B0
ldx [$pwrtbl+7*32], %o4
move %xcc, %o5, $B1
ldx [$pwrtbl+15*32],%o5
movneg %xcc, %o4, $B0
add $pwrtbl,16*32, $pwrtbl
movneg %xcc, %o5, $B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;
$code.=<<___;
ldx [$pwrtbl+0*32], $Bi
ldx [$pwrtbl+1*32], %o4
ldx [$pwrtbl+2*32], %o5
movvs %icc, %o4, $Bi
ldx [$pwrtbl+3*32], %o4
move %icc, %o5, $Bi
ldx [$pwrtbl+4*32], %o5
movneg %icc, %o4, $Bi
ldx [$pwrtbl+5*32], %o4
movcs %xcc, %o5, $Bi
ldx [$pwrtbl+6*32], %o5
movvs %xcc, %o4, $Bi
ldx [$pwrtbl+7*32], %o4
move %xcc, %o5, $Bi
add $pwrtbl,8*32, $pwrtbl
movneg %xcc, %o4, $Bi
___
}
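# Both gather routines above load all eight 32-byte-spaced candidates for a
# limb and keep exactly one of them with flag-predicated moves, so the
# sequence of memory accesses is independent of the secret window value.
# A hypothetical reference model of one limb's gather, not used by the
# build; @slot stands for the eight values read from $pwrtbl+0*32 up to
# $pwrtbl+7*32, and $ccr is the mask produced by ccr_decompose_ref() above:
sub gather_limb_ref {
	my ($ccr,@slot)=@_;
	my $out = $slot[0];		# kept when %ccr selects entry 0
	for (my $k=1; $k<8; $k++) {	# one conditional move per remaining slot
		$out = $slot[$k] if ($ccr >> $k) & 1;
	}
	return $out;
}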
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#                          const u64 *pwrtbl,int pwr,int stride);
#
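# Each iteration of the routine below performs one step of fixed 5-bit-window
# exponentiation entirely in the Montgomery domain: five Montgomery squarings
# of tp followed by one Montgomery multiplication by the power-table entry
# gathered for the current window.  A hypothetical Math::BigInt model of a
# single window step, not used by the build, reusing mont_mul_ref() from the
# sketch further up; @pwrtbl stands for the 32 Montgomery-domain powers:
sub pwr5_step_ref {
	my ($tp,$np,$num,$window,@pwrtbl)=@_;
	for (1..5) { $tp = mont_mul_ref($tp, $tp, $np, $num); }	# five Montgomery squarings
	return mont_mul_ref($tp, $pwrtbl[$window], $np, $num);		# times the gathered power
}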
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
$code.=<<___;
.globl bn_pwr5_mont_t4_$NUM
.align 32
bn_pwr5_mont_t4_$NUM:
#ifdef __arch64__
mov 0,$sentinel
mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
mov -2047,%g4
and %g1,SPARCV9_64BIT_STACK,%g1
movrz %g1,0,%g4
mov -1,$sentinel
add %g4,-128,%g4
#else
mov -1,$sentinel
mov -128,%g4
#endif
sllx $sentinel,32,$sentinel
save %sp,%g4,%sp
#ifndef __arch64__
save %sp,-128,%sp ! warm it up
save %sp,-128,%sp
save %sp,-128,%sp
save %sp,-128,%sp
save %sp,-128,%sp
save %sp,-128,%sp
restore
restore
restore
restore
restore
restore
#endif
and %sp,1,%g4
or $sentinel,%fp,%fp
or %g4,$sentinel,$sentinel
! copy arguments to global registers
mov %i0,$tp
mov %i1,$np
ld [%i2+0],%f1 ! load *n0
ld [%i2+4],%f0
mov %i3,$pwrtbl
srl %i4,%g0,%i4 ! pack last arguments
sllx %i5,32,$pwr
or %i4,$pwr,$pwr
fsrc2 %f0,%f60
___
# load tp[$NUM] ########################################################
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
ldx [$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
ldd [$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
ldx [$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
srlx $pwr, 32, %o4 ! unpack $pwr
srl $pwr, %g0, %o5
sub %o4, 5, %o4
mov $pwrtbl, %o7
sllx %o4, 32, $pwr ! re-pack $pwr
or %o5, $pwr, $pwr
srl %o5, %o4, %o5
___
&load_ccr("%o7","%o5","%o4");
$code.=<<___;
b .Lstride_$NUM
nop
.align 16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
srax $pwr, 32, %o4 ! unpack $pwr
srl $pwr, %g0, %o5
sub %o4, 5, %o4
mov $pwrtbl, %i7
sllx %o4, 32, $pwr ! re-pack $pwr
or %o5, $pwr, $pwr
srl %o5, %o4, %o5
___
&load_ccr("%i7","%o5","%o4",1);
# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
#endif
nop
___
}
$code.=<<___;
wr %o4, %g0, %ccr
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
#endif
srax $pwr, 32, %o4
#ifdef __arch64__
brgez %o4,.Lstride_$NUM
restore
restore
restore
restore
restore
#else
brgez %o4,.Lstride_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort1_$NUM
restore
#endif
___
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
restore
#else
and %fp,$sentinel,$sentinel
restore
and $sentinel,1,%o7
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Ldone_$NUM
mov 0,%i0 ! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
std @R[$i],[$tp+$i*8]
___
}
$code.=<<___;
mov 1,%i0 ! return success
.Ldone_$NUM:
ret
restore
.Labort_$NUM:
restore
restore
restore
restore
restore
.Labort1_$NUM:
restore
mov 0,%i0 ! return failure
ret
restore
.type bn_pwr5_mont_t4_$NUM, #function
.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}
for ($i=8;$i<=32;$i+=8) {
&generate_bn_pwr5_mont_t4($i);
}
{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# int bn_mul_mont(
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *bp,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is >=3
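# The fall-back routine below is the usual word-by-word Montgomery loop: for
# every multiplier limb bp[i] it accumulates ap[]*bp[i] plus m1*np[] into the
# temporary tp[], dropping one (now zero) low limb per outer iteration, and
# finishes with a single conditional subtraction of np.  A hypothetical
# pure-Perl model of the same arithmetic, using 16-bit limbs so that native
# integer arithmetic suffices (the assembly uses 64-bit limbs and umulxhi for
# the high halves) - not used by the build:
sub mont_mul_words_ref {
	my ($ap,$bp,$np,$n0)=@_;	# array refs of 16-bit limbs, $n0 = -np^-1 mod 2^16
	my $num  = scalar(@$ap);
	my $mask = 0xffff;
	my @tp = (0) x ($num+1);	# extra limb plays the role of $ovf
	for (my $i=0; $i<$num; $i++) {
		my $m1 = (($tp[0] + $ap->[0]*$bp->[$i]) * $n0) & $mask;
		my $c  = ($tp[0] + $ap->[0]*$bp->[$i] + $m1*$np->[0]) >> 16;	# low limb is now zero
		for (my $j=1; $j<$num; $j++) {
			my $t = $tp[$j] + $ap->[$j]*$bp->[$i] + $m1*$np->[$j] + $c;
			$tp[$j-1] = $t & $mask;
			$c = $t >> 16;
		}
		my $t = $tp[$num] + $c;
		$tp[$num-1] = $t & $mask;
		$tp[$num]   = $t >> 16;		# "upmost overflow bit"
	}
	# conditional subtraction, as in .Lsub/.Lcopy below
	my @rp; my $borrow = 0;
	for (my $j=0; $j<$num; $j++) {
		my $d = $tp[$j] - $np->[$j] - $borrow;
		$borrow = $d < 0 ? 1 : 0;
		$rp[$j] = $d & $mask;
	}
	return ($tp[$num] || !$borrow) ? \@rp : [@tp[0..$num-1]];	# = ap*bp*2^(-16*num) mod np
}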
$code.=<<___;
.globl bn_mul_mont_t4
.align 32
bn_mul_mont_t4:
add %sp, STACK_BIAS, %g4 ! real top of stack
sll $num, 3, $num ! size in bytes
add $num, 63, %g1
andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, STACK_FRAME, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# .                               .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0]                |
# +-------------------------------+
# .                               .
# .                               .
# +-------------------------------+<----- aligned at 64 bytes
# .                               .
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
ld [$n0p+4], $t1
add %sp, STACK_BIAS+STACK_FRAME, $tp
ldx [$bp+0], $m0 ! m0=bp[0]
sllx $t1, 32, $n0
add $bp, 8, $bp
or $t0, $n0, $n0
ldx [$ap+0], $aj ! ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
add $ap, 16, $ap
ldx [$np+0], $nj ! np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo0, $lo1, $lo1
add $np, 16, $np
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st:
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
add $ap, 8, $ap
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
mulx $aj, $m0, $alo ! ap[j]*bp[0]
add $np, 8, $np
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stxa $hi1, [$tp]0xe2
add $tp, 8, $tp
ba .Louter
sub $num, 16, $i ! i=num-2
.align 16
.Louter:
ldx [$bp+0], $m0 ! m0=bp[i]
add $bp, 8, $bp
sub $ap, $num, $ap ! rewind
sub $np, $num, $np
sub $tp, $num, $tp
ldx [$ap+0], $aj ! ap[0]
ldx [$np+0], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
add $ap, 16, $ap
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
add $np, 16, $np
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$ap+0], $aj ! ap[j]
add $ap, 8, $ap
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
add $np, 8, $np
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter
sub $i, 8, $i
sub $ap, $num, $ap ! rewind
sub $np, $num, $np
sub $tp, $num, $tp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$np+0], $nj
add $np, 8, $np
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt
sub $np, $num, $np ! rewind
sub $tp, $num, $tp
sub $rp, $num, $rp
subc $ovf, %g0, $ovf ! handle upmost overflow bit
and $tp, $ovf, $ap
andn $rp, $ovf, $np
or $np, $ap, $ap ! ap=borrow?tp:rp
ba .Lcopy
sub $num, 8, $cnt
.align 16
.Lcopy: ! copy or in-place refresh
ldx [$ap+0], $t2
add $ap, 8, $ap
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx $t2, [$rp+0]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_t4, #function
.size bn_mul_mont_t4, .-bn_mul_mont_t4
___
# int bn_mul_mont_gather5(
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *pwrtbl,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num, # caller ensures that num is >=3
# int power);
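# bn_mul_mont_gather5_t4() is the same word-by-word loop as bn_mul_mont_t4()
# above; the only difference is that the multiplier limbs bp[i] are not read
# from a plain array but gathered from the scattered power table with the
# flag-predicated selection implemented by load_ccr()/load_b(), so the memory
# access pattern does not depend on the secret index "power".  A hypothetical
# model in terms of the reference helpers sketched earlier, not used by the
# build; @slot stands for the eight 32-byte-spaced candidates of the current
# limb:
sub gather5_bp_limb_ref {
	my ($power,@slot)=@_;
	my (undef,$ccr) = ccr_decompose_ref($power);	# byte offset already applied to @slot
	return gather_limb_ref($ccr,@slot);		# would feed mont_mul_words_ref() as bp[i]
}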
$code.=<<___;
.globl bn_mul_mont_gather5_t4
.align 32
bn_mul_mont_gather5_t4:
add %sp, STACK_BIAS, %g4 ! real top of stack
sll $num, 3, $num ! size in bytes
add $num, 63, %g1
andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, STACK_FRAME, %g1 ! new top of stack
sub %g1, %g4, %g1
LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# .                               .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0]                |
# +-------------------------------+
# .                               .
# .                               .
# +-------------------------------+<----- aligned at 64 bytes
# .                               .
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
&load_ccr($bp,"%g4",$ccr);
&load_b($bp,$m0,"%o7"); # m0=bp[0]
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
ld [$n0p+4], $t1
add %sp, STACK_BIAS+STACK_FRAME, $tp
sllx $t1, 32, $n0
or $t0, $n0, $n0
ldx [$ap+0], $aj ! ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
add $ap, 16, $ap
ldx [$np+0], $nj ! np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo0, $lo1, $lo1
add $np, 16, $np
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st_g5
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st_g5:
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
add $ap, 8, $ap
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
mulx $aj, $m0, $alo ! ap[j]*bp[0]
add $np, 8, $np
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st_g5
sub $cnt, 8, $cnt ! j--
!.L1st_g5
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stxa $hi1, [$tp]0xe2
add $tp, 8, $tp
ba .Louter_g5
sub $num, 16, $i ! i=num-2
.align 16
.Louter_g5:
wr $ccr, %g0, %ccr
___
&load_b($bp,$m0); # m0=bp[i]
$code.=<<___;
sub $ap, $num, $ap ! rewind
sub $np, $num, $np
sub $tp, $num, $tp
ldx [$ap+0], $aj ! ap[0]
ldx [$np+0], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
add $ap, 16, $ap
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
add $np, 16, $np
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner_g5
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner_g5:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$ap+0], $aj ! ap[j]
add $ap, 8, $ap
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
add $np, 8, $np
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner_g5
sub $cnt, 8, $cnt
!.Linner_g5
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter_g5
sub $i, 8, $i
sub $ap, $num, $ap ! rewind
sub $np, $num, $np
sub $tp, $num, $tp
ba .Lsub_g5
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub_g5:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$np+0], $nj
add $np, 8, $np
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub_g5
sub $cnt, 8, $cnt
sub $np, $num, $np ! rewind
sub $tp, $num, $tp
sub $rp, $num, $rp
subc $ovf, %g0, $ovf ! handle upmost overflow bit
and $tp, $ovf, $ap
andn $rp, $ovf, $np
or $np, $ap, $ap ! ap=borrow?tp:rp
ba .Lcopy_g5
sub $num, 8, $cnt
.align 16
.Lcopy_g5: ! copy or in-place refresh
ldx [$ap+0], $t2
add $ap, 8, $ap
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx $t2, [$rp+0]
add $rp, 8, $rp
brnz $cnt, .Lcopy_g5
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_gather5_t4, #function
.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}
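# The helper routines emitted below keep the power table in a "scattered"
# layout: as bn_flip_n_scatter5_t4 lays it out, limb j of power p is stored
# at byte offset p*8 + j*32*8, i.e. the j-th limbs of all 32 powers sit next
# to each other, which is exactly the layout the 32-byte-spaced loads in
# load_ccr()/load_b() index into.  bn_flip_t4 merely swaps the 32-bit halves
# of each 64-bit word when converting between the 32-bit BN_ULONG layout and
# the 64-bit one used here.  A hypothetical pure-Perl model of the scatter
# offset arithmetic, not used by the build:
sub scatter5_offset_ref {
	my ($power,$limb)=@_;		# power 0..31, limb index within the number
	return $power*8 + $limb*32*8;	# byte offset of that limb within pwrtbl
}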
$code.=<<___;
.globl bn_flip_t4
.align 32
bn_flip_t4:
.Loop_flip:
ld [%o1+0], %o4
sub %o2, 1, %o2
ld [%o1+4], %o5
add %o1, 8, %o1
st %o5, [%o0+0]
st %o4, [%o0+4]
brnz %o2, .Loop_flip
add %o0, 8, %o0
retl
nop
.type bn_flip_t4, #function
.size bn_flip_t4, .-bn_flip_t4
.globl bn_flip_n_scatter5_t4
.align 32
bn_flip_n_scatter5_t4:
sll %o3, 3, %o3
srl %o1, 1, %o1
add %o3, %o2, %o2 ! &pwrtbl[pwr]
sub %o1, 1, %o1
.Loop_flip_n_scatter5:
ld [%o0+0], %o4 ! inp[i]
ld [%o0+4], %o5
add %o0, 8, %o0
sllx %o5, 32, %o5
or %o4, %o5, %o5
stx %o5, [%o2]
add %o2, 32*8, %o2
brnz %o1, .Loop_flip_n_scatter5
sub %o1, 1, %o1
retl
nop
.type bn_flip_n_scatter5_t4, #function
.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
.globl bn_gather5_t4
.align 32
bn_gather5_t4:
___
&load_ccr("%o2","%o3","%g1");
$code.=<<___;
sub %o1, 1, %o1
.Loop_gather5:
___
&load_b("%o2","%g1");
$code.=<<___;
stx %g1, [%o0]
add %o0, 8, %o0
brnz %o1, .Loop_gather5
sub %o1, 1, %o1
retl
nop
.type bn_gather5_t4, #function
.size bn_gather5_t4, .-bn_gather5_t4
.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
close STDOUT;