sparct4-mont.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by David S. Miller and Andy Polyakov.
  10. # The module is licensed under a 2-clause BSD license.
  11. # November 2012. All rights reserved.
  12. # ====================================================================
  13. ######################################################################
  14. # Montgomery squaring-n-multiplication module for SPARC T4.
  15. #
  16. # The module consists of three parts:
  17. #
  18. # 1) a collection of "single-op" subroutines that perform a single
  19. # operation, Montgomery squaring or multiplication, on 512-,
  20. # 1024-, 1536- and 2048-bit operands;
  21. # 2) a collection of "multi-op" subroutines that perform 5 squaring
  22. # operations and 1 multiplication on operands of the above lengths;
  23. # 3) fall-back and helper VIS3 subroutines.
  24. #
  25. # RSA sign is dominated by the multi-op subroutines, while RSA verify
  26. # and DSA are dominated by the single-op ones. A special note about the
  27. # 4096-bit RSA verify result: those operands are too long for the
  28. # dedicated hardware and are handled by the VIS3 code instead, which is
  29. # why no improvement shows up there. It could surely be improved [by
  30. # deploying the 'mpmul' instruction], maybe in the future...
  31. #
  32. # Performance improvement.
  33. #
  34. # 64-bit process, VIS3:
  35. #                      sign       verify     sign/s  verify/s
  36. # rsa 1024 bits    0.000628s  0.000028s    1592.4   35434.4
  37. # rsa 2048 bits    0.003282s  0.000106s     304.7    9438.3
  38. # rsa 4096 bits    0.025866s  0.000340s      38.7    2940.9
  39. # dsa 1024 bits    0.000301s  0.000332s    3323.7    3013.9
  40. # dsa 2048 bits    0.001056s  0.001233s     946.9     810.8
  41. #
  42. # 64-bit process, this module:
  43. #                      sign       verify     sign/s  verify/s
  44. # rsa 1024 bits    0.000256s  0.000016s    3904.4   61411.9
  45. # rsa 2048 bits    0.000946s  0.000029s    1056.8   34292.7
  46. # rsa 4096 bits    0.005061s  0.000340s     197.6    2940.5
  47. # dsa 1024 bits    0.000176s  0.000195s    5674.7    5130.5
  48. # dsa 2048 bits    0.000296s  0.000354s    3383.2    2827.6
  49. #
  50. ######################################################################
  51. # 32-bit process, VIS3:
  52. #                      sign       verify     sign/s  verify/s
  53. # rsa 1024 bits    0.000665s  0.000028s    1504.8   35233.3
  54. # rsa 2048 bits    0.003349s  0.000106s     298.6    9433.4
  55. # rsa 4096 bits    0.025959s  0.000341s      38.5    2934.8
  56. # dsa 1024 bits    0.000320s  0.000341s    3123.3    2929.6
  57. # dsa 2048 bits    0.001101s  0.001260s     908.2     793.4
  58. #
  59. # 32-bit process, this module:
  60. #                      sign       verify     sign/s  verify/s
  61. # rsa 1024 bits    0.000301s  0.000017s    3317.1   60240.0
  62. # rsa 2048 bits    0.001034s  0.000030s     966.9   33812.7
  63. # rsa 4096 bits    0.005244s  0.000341s     190.7    2935.4
  64. # dsa 1024 bits    0.000201s  0.000205s    4976.1    4879.2
  65. # dsa 2048 bits    0.000328s  0.000360s    3051.1    2774.2
  66. #
  67. # The 32-bit code is prone to performance degradation as the rate of
  68. # interrupts dispatched to the CPU executing it grows. This is because,
  69. # in the standard interrupt-handling path for a 32-bit process context,
  70. # the upper halves of most integer registers used as input or output
  71. # are zeroed. This renders the result invalid, and the operation has to
  72. # be re-run (see the caller-side sketch below). If the CPU is "bothered"
  73. # with timer interrupts only, the penalty is hardly measurable. But to
  74. # mitigate this problem at higher interrupt rates, contemporary Linux
  75. # kernels recognize the biased stack even in a 32-bit process context
  76. # and preserve full register contents. See
  77. # http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb for details.
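#
# How a caller consumes the 0/1 return value is not spelled out in this
# file; the following is only an illustrative C sketch of the
# retry/fall-back pattern the sentinel mechanism enables on 32-bit builds
# (mont_mul_2048 is a made-up wrapper name; bn_mul_mont_t4_32 and the
# software fall-back bn_mul_mont_t4 are defined below in this module):
#
#	/* hypothetical caller-side glue, not part of this module */
#	int mont_mul_2048(u64 *rp, const u64 *ap, const u64 *bp,
#	                  const u64 *np, const BN_ULONG *n0)
#	{
#	    /* bn_mul_mont_t4_32 returns 0 when, on a 32-bit kernel without
#	     * the biased-stack fix, an interrupt clobbered the upper
#	     * register halves mid-operation; fall back to the software
#	     * Montgomery multiplication in that case. */
#	    if (bn_mul_mont_t4_32(rp, ap, bp, np, n0))
#	        return 1;
#	    return bn_mul_mont_t4(rp, ap, bp, np, n0, 32);
#	}
#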
  78. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  79. push(@INC,"${dir}","${dir}../../perlasm");
  80. require "sparcv9_modes.pl";
  81. $output = pop and open STDOUT,">$output";
  82. $code.=<<___;
  83. #include "sparc_arch.h"
  84. #ifdef __arch64__
  85. .register %g2,#scratch
  86. .register %g3,#scratch
  87. #endif
  88. .section ".text",#alloc,#execinstr
  89. #ifdef __PIC__
  90. SPARC_PIC_THUNK(%g1)
  91. #endif
  92. ___
  93. ########################################################################
  94. # Register layout for mont[mul|sqr] instructions.
  95. # For details see "Oracle SPARC Architecture 2011" manual at
  96. # http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
  97. #
  98. my @R=map("%f".2*$_,(0..11,30,31,12..29));
  99. my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
  100. my @A=(@N[0..13],@R[14..31]);
  101. my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
  102. ########################################################################
  103. # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
  104. # const u64 *np,const BN_ULONG *n0);
  105. #
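# $NUM is the operand length in 64-bit words; the generation loop at the
# end of this sub's section emits bn_mul_mont_t4_8, _16, _24 and _32,
# i.e. the 512-, 1024-, 1536- and 2048-bit sizes listed in the header.
# An illustrative call for 1024-bit operands (a sketch of caller-side
# glue, not code from this module):
#
#	if (!bn_mul_mont_t4_16(rp, ap, bp, np, n0))
#	    bn_mul_mont_t4(rp, ap, bp, np, n0, 16);  /* fall back, see above */
#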
  106. sub generate_bn_mul_mont_t4() {
  107. my $NUM=shift;
  108. my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
  109. $code.=<<___;
  110. .globl bn_mul_mont_t4_$NUM
  111. .align 32
  112. bn_mul_mont_t4_$NUM:
  113. #ifdef __arch64__
  114. mov 0,$sentinel
  115. mov -128,%g4
  116. #elif defined(SPARCV9_64BIT_STACK)
  117. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  118. ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
  119. mov -2047,%g4
  120. and %g1,SPARCV9_64BIT_STACK,%g1
  121. movrz %g1,0,%g4
  122. mov -1,$sentinel
  123. add %g4,-128,%g4
  124. #else
  125. mov -1,$sentinel
  126. mov -128,%g4
  127. #endif
  128. sllx $sentinel,32,$sentinel
  129. save %sp,%g4,%sp
  130. #ifndef __arch64__
  131. save %sp,-128,%sp ! warm it up
  132. save %sp,-128,%sp
  133. save %sp,-128,%sp
  134. save %sp,-128,%sp
  135. save %sp,-128,%sp
  136. save %sp,-128,%sp
  137. restore
  138. restore
  139. restore
  140. restore
  141. restore
  142. restore
  143. #endif
  144. and %sp,1,%g4
  145. or $sentinel,%fp,%fp
  146. or %g4,$sentinel,$sentinel
  147. ! copy arguments to global registers
  148. mov %i0,$rp
  149. mov %i1,$ap
  150. mov %i2,$bp
  151. mov %i3,$np
  152. ld [%i4+0],%f1 ! load *n0
  153. ld [%i4+4],%f0
  154. fsrc2 %f0,%f60
  155. ___
  156. # load ap[$NUM] ########################################################
  157. $code.=<<___;
  158. save %sp,-128,%sp; or $sentinel,%fp,%fp
  159. ___
  160. for($i=0; $i<14 && $i<$NUM; $i++) {
  161. my $lo=$i<13?@A[$i+1]:"%o7";
  162. $code.=<<___;
  163. ld [$ap+$i*8+0],$lo
  164. ld [$ap+$i*8+4],@A[$i]
  165. sllx @A[$i],32,@A[$i]
  166. or $lo,@A[$i],@A[$i]
  167. ___
  168. }
  169. for(; $i<$NUM; $i++) {
  170. my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
  171. $code.=<<___;
  172. ld [$ap+$i*8+0],$lo
  173. ld [$ap+$i*8+4],$hi
  174. fsrc2 $hi,@A[$i]
  175. ___
  176. }
  177. # load np[$NUM] ########################################################
  178. $code.=<<___;
  179. save %sp,-128,%sp; or $sentinel,%fp,%fp
  180. ___
  181. for($i=0; $i<14 && $i<$NUM; $i++) {
  182. my $lo=$i<13?@N[$i+1]:"%o7";
  183. $code.=<<___;
  184. ld [$np+$i*8+0],$lo
  185. ld [$np+$i*8+4],@N[$i]
  186. sllx @N[$i],32,@N[$i]
  187. or $lo,@N[$i],@N[$i]
  188. ___
  189. }
  190. $code.=<<___;
  191. save %sp,-128,%sp; or $sentinel,%fp,%fp
  192. ___
  193. for(; $i<28 && $i<$NUM; $i++) {
  194. my $lo=$i<27?@N[$i+1]:"%o7";
  195. $code.=<<___;
  196. ld [$np+$i*8+0],$lo
  197. ld [$np+$i*8+4],@N[$i]
  198. sllx @N[$i],32,@N[$i]
  199. or $lo,@N[$i],@N[$i]
  200. ___
  201. }
  202. $code.=<<___;
  203. save %sp,-128,%sp; or $sentinel,%fp,%fp
  204. ___
  205. for(; $i<$NUM; $i++) {
  206. my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
  207. $code.=<<___;
  208. ld [$np+$i*8+0],$lo
  209. ld [$np+$i*8+4],@N[$i]
  210. sllx @N[$i],32,@N[$i]
  211. or $lo,@N[$i],@N[$i]
  212. ___
  213. }
  214. $code.=<<___;
  215. cmp $ap,$bp
  216. be SIZE_T_CC,.Lmsquare_$NUM
  217. nop
  218. ___
  219. # load bp[$NUM] ########################################################
  220. $code.=<<___;
  221. save %sp,-128,%sp; or $sentinel,%fp,%fp
  222. ___
  223. for($i=0; $i<14 && $i<$NUM; $i++) {
  224. my $lo=$i<13?@B[$i+1]:"%o7";
  225. $code.=<<___;
  226. ld [$bp+$i*8+0],$lo
  227. ld [$bp+$i*8+4],@B[$i]
  228. sllx @B[$i],32,@B[$i]
  229. or $lo,@B[$i],@B[$i]
  230. ___
  231. }
  232. $code.=<<___;
  233. save %sp,-128,%sp; or $sentinel,%fp,%fp
  234. ___
  235. for(; $i<$NUM; $i++) {
  236. my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
  237. $code.=<<___;
  238. ld [$bp+$i*8+0],$lo
  239. ld [$bp+$i*8+4],@B[$i]
  240. sllx @B[$i],32,@B[$i]
  241. or $lo,@B[$i],@B[$i]
  242. ___
  243. }
  244. # magic ################################################################
  245. $code.=<<___;
  246. .word 0x81b02920+$NUM-1 ! montmul $NUM-1
  247. .Lmresume_$NUM:
  248. fbu,pn %fcc3,.Lmabort_$NUM
  249. #ifndef __arch64__
  250. and %fp,$sentinel,$sentinel
  251. brz,pn $sentinel,.Lmabort_$NUM
  252. #endif
  253. nop
  254. #ifdef __arch64__
  255. restore
  256. restore
  257. restore
  258. restore
  259. restore
  260. #else
  261. restore; and %fp,$sentinel,$sentinel
  262. restore; and %fp,$sentinel,$sentinel
  263. restore; and %fp,$sentinel,$sentinel
  264. restore; and %fp,$sentinel,$sentinel
  265. brz,pn $sentinel,.Lmabort1_$NUM
  266. restore
  267. #endif
  268. ___
  269. # save tp[$NUM] ########################################################
  270. for($i=0; $i<14 && $i<$NUM; $i++) {
  271. $code.=<<___;
  272. movxtod @A[$i],@R[$i]
  273. ___
  274. }
  275. $code.=<<___;
  276. #ifdef __arch64__
  277. restore
  278. #else
  279. and %fp,$sentinel,$sentinel
  280. restore
  281. and $sentinel,1,%o7
  282. and %fp,$sentinel,$sentinel
  283. srl %fp,0,%fp ! just in case?
  284. or %o7,$sentinel,$sentinel
  285. brz,a,pn $sentinel,.Lmdone_$NUM
  286. mov 0,%i0 ! return failure
  287. #endif
  288. ___
  289. for($i=0; $i<12 && $i<$NUM; $i++) {
  290. @R[$i] =~ /%f([0-9]+)/;
  291. my $lo = "%f".($1+1);
  292. $code.=<<___;
  293. st $lo,[$rp+$i*8+0]
  294. st @R[$i],[$rp+$i*8+4]
  295. ___
  296. }
  297. for(; $i<$NUM; $i++) {
  298. my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
  299. $code.=<<___;
  300. fsrc2 @R[$i],$hi
  301. st $lo,[$rp+$i*8+0]
  302. st $hi,[$rp+$i*8+4]
  303. ___
  304. }
  305. $code.=<<___;
  306. mov 1,%i0 ! return success
  307. .Lmdone_$NUM:
  308. ret
  309. restore
  310. .Lmabort_$NUM:
  311. restore
  312. restore
  313. restore
  314. restore
  315. restore
  316. .Lmabort1_$NUM:
  317. restore
  318. mov 0,%i0 ! return failure
  319. ret
  320. restore
  321. .align 32
  322. .Lmsquare_$NUM:
  323. save %sp,-128,%sp; or $sentinel,%fp,%fp
  324. save %sp,-128,%sp; or $sentinel,%fp,%fp
  325. .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
  326. ba .Lmresume_$NUM
  327. nop
  328. .type bn_mul_mont_t4_$NUM, #function
  329. .size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
  330. ___
  331. }
  332. for ($i=8;$i<=32;$i+=8) {
  333. &generate_bn_mul_mont_t4($i);
  334. }
  335. ########################################################################
  336. #
  337. sub load_ccr {
  338. my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
  339. $code.=<<___;
  340. srl $pwr, 2, %o4
  341. and $pwr, 3, %o5
  342. and %o4, 7, %o4
  343. sll %o5, 3, %o5 ! offset within first cache line
  344. add %o5, $ptbl, $ptbl ! of the pwrtbl
  345. or %g0, 1, %o5
  346. sll %o5, %o4, $ccr
  347. ___
  348. $code.=<<___ if (!$skip_wr);
  349. wr $ccr, %g0, %ccr
  350. ___
  351. }
  352. sub load_b_pair {
  353. my ($pwrtbl,$B0,$B1)=@_;
  354. $code.=<<___;
  355. ldx [$pwrtbl+0*32], $B0
  356. ldx [$pwrtbl+8*32], $B1
  357. ldx [$pwrtbl+1*32], %o4
  358. ldx [$pwrtbl+9*32], %o5
  359. movvs %icc, %o4, $B0
  360. ldx [$pwrtbl+2*32], %o4
  361. movvs %icc, %o5, $B1
  362. ldx [$pwrtbl+10*32],%o5
  363. move %icc, %o4, $B0
  364. ldx [$pwrtbl+3*32], %o4
  365. move %icc, %o5, $B1
  366. ldx [$pwrtbl+11*32],%o5
  367. movneg %icc, %o4, $B0
  368. ldx [$pwrtbl+4*32], %o4
  369. movneg %icc, %o5, $B1
  370. ldx [$pwrtbl+12*32],%o5
  371. movcs %xcc, %o4, $B0
  372. ldx [$pwrtbl+5*32],%o4
  373. movcs %xcc, %o5, $B1
  374. ldx [$pwrtbl+13*32],%o5
  375. movvs %xcc, %o4, $B0
  376. ldx [$pwrtbl+6*32], %o4
  377. movvs %xcc, %o5, $B1
  378. ldx [$pwrtbl+14*32],%o5
  379. move %xcc, %o4, $B0
  380. ldx [$pwrtbl+7*32], %o4
  381. move %xcc, %o5, $B1
  382. ldx [$pwrtbl+15*32],%o5
  383. movneg %xcc, %o4, $B0
  384. add $pwrtbl,16*32, $pwrtbl
  385. movneg %xcc, %o5, $B1
  386. ___
  387. }
  388. sub load_b {
  389. my ($pwrtbl,$Bi)=@_;
  390. $code.=<<___;
  391. ldx [$pwrtbl+0*32], $Bi
  392. ldx [$pwrtbl+1*32], %o4
  393. ldx [$pwrtbl+2*32], %o5
  394. movvs %icc, %o4, $Bi
  395. ldx [$pwrtbl+3*32], %o4
  396. move %icc, %o5, $Bi
  397. ldx [$pwrtbl+4*32], %o5
  398. movneg %icc, %o4, $Bi
  399. ldx [$pwrtbl+5*32], %o4
  400. movcs %xcc, %o5, $Bi
  401. ldx [$pwrtbl+6*32], %o5
  402. movvs %xcc, %o4, $Bi
  403. ldx [$pwrtbl+7*32], %o4
  404. move %xcc, %o5, $Bi
  405. add $pwrtbl,8*32, $pwrtbl
  406. movneg %xcc, %o4, $Bi
  407. ___
  408. }
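#
# The three helpers above gather b[] words from pwrtbl without indexing it
# directly with the secret power: load_ccr splits the index so that its low
# bits become an offset within the first cache line and the rest becomes a
# one-hot bit written to %ccr, and load_b/load_b_pair then read all eight
# (or sixteen) candidate slots and pick the right one with conditional
# moves on the individual %icc/%xcc flags.  Every candidate slot is touched
# on every call, which appears to be the usual cache-timing countermeasure.
# A minimal C sketch of the same constant-time selection idea (an
# illustration only, not this module's code; u64 assumed to be uint64_t):
#
#	static u64 ct_select8(const u64 tbl[8], unsigned idx)
#	{
#	    u64 r = 0;
#	    unsigned i;
#	    for (i = 0; i < 8; i++) {
#	        /* mask is all-ones only when i == idx */
#	        u64 mask = 0 - (u64)(i == idx);
#	        r |= tbl[i] & mask;
#	    }
#	    return r;
#	}
#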
  409. ########################################################################
  410. # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
  411. # const u64 *pwrtbl,int pwr,int stride);
  412. #
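# Each pass of the stride loop below performs the "multi-op" sequence from
# the header comment: 5 Montgomery squarings of tp[] followed by 1
# Montgomery multiplication by an entry gathered from pwrtbl[].  That is
# the inner step of a 5-bit fixed-window exponentiation.  The exact
# semantics of pwr/stride are not restated here; the following generic C
# sketch (an illustration only, not this module's calling convention)
# shows the loop shape such a primitive serves, assuming the exponent
# length nbits is a multiple of 5 and r starts out as 1 in Montgomery form,
# with mont_sqr/mont_mul/table as placeholders:
#
#	for (bits = nbits - 5; bits >= 0; bits -= 5) {
#	    for (i = 0; i < 5; i++)
#	        mont_sqr(r, r, np, n0, num);            /* 5 squarings */
#	    window = (exp >> bits) & 0x1f;              /* next 5 bits */
#	    mont_mul(r, r, table[window], np, n0, num); /* 1 multiply  */
#	}
#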
  413. sub generate_bn_pwr5_mont_t4() {
  414. my $NUM=shift;
  415. my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
  416. $code.=<<___;
  417. .globl bn_pwr5_mont_t4_$NUM
  418. .align 32
  419. bn_pwr5_mont_t4_$NUM:
  420. #ifdef __arch64__
  421. mov 0,$sentinel
  422. mov -128,%g4
  423. #elif defined(SPARCV9_64BIT_STACK)
  424. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  425. ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
  426. mov -2047,%g4
  427. and %g1,SPARCV9_64BIT_STACK,%g1
  428. movrz %g1,0,%g4
  429. mov -1,$sentinel
  430. add %g4,-128,%g4
  431. #else
  432. mov -1,$sentinel
  433. mov -128,%g4
  434. #endif
  435. sllx $sentinel,32,$sentinel
  436. save %sp,%g4,%sp
  437. #ifndef __arch64__
  438. save %sp,-128,%sp ! warm it up
  439. save %sp,-128,%sp
  440. save %sp,-128,%sp
  441. save %sp,-128,%sp
  442. save %sp,-128,%sp
  443. save %sp,-128,%sp
  444. restore
  445. restore
  446. restore
  447. restore
  448. restore
  449. restore
  450. #endif
  451. and %sp,1,%g4
  452. or $sentinel,%fp,%fp
  453. or %g4,$sentinel,$sentinel
  454. ! copy arguments to global registers
  455. mov %i0,$tp
  456. mov %i1,$np
  457. ld [%i2+0],%f1 ! load *n0
  458. ld [%i2+4],%f0
  459. mov %i3,$pwrtbl
  460. srl %i4,%g0,%i4 ! pack last arguments
  461. sllx %i5,32,$pwr
  462. or %i4,$pwr,$pwr
  463. fsrc2 %f0,%f60
  464. ___
  465. # load tp[$NUM] ########################################################
  466. $code.=<<___;
  467. save %sp,-128,%sp; or $sentinel,%fp,%fp
  468. ___
  469. for($i=0; $i<14 && $i<$NUM; $i++) {
  470. $code.=<<___;
  471. ldx [$tp+$i*8],@A[$i]
  472. ___
  473. }
  474. for(; $i<$NUM; $i++) {
  475. $code.=<<___;
  476. ldd [$tp+$i*8],@A[$i]
  477. ___
  478. }
  479. # load np[$NUM] ########################################################
  480. $code.=<<___;
  481. save %sp,-128,%sp; or $sentinel,%fp,%fp
  482. ___
  483. for($i=0; $i<14 && $i<$NUM; $i++) {
  484. $code.=<<___;
  485. ldx [$np+$i*8],@N[$i]
  486. ___
  487. }
  488. $code.=<<___;
  489. save %sp,-128,%sp; or $sentinel,%fp,%fp
  490. ___
  491. for(; $i<28 && $i<$NUM; $i++) {
  492. $code.=<<___;
  493. ldx [$np+$i*8],@N[$i]
  494. ___
  495. }
  496. $code.=<<___;
  497. save %sp,-128,%sp; or $sentinel,%fp,%fp
  498. ___
  499. for(; $i<$NUM; $i++) {
  500. $code.=<<___;
  501. ldx [$np+$i*8],@N[$i]
  502. ___
  503. }
  504. # load pwrtbl[pwr] ########################################################
  505. $code.=<<___;
  506. save %sp,-128,%sp; or $sentinel,%fp,%fp
  507. srlx $pwr, 32, %o4 ! unpack $pwr
  508. srl $pwr, %g0, %o5
  509. sub %o4, 5, %o4
  510. mov $pwrtbl, %o7
  511. sllx %o4, 32, $pwr ! re-pack $pwr
  512. or %o5, $pwr, $pwr
  513. srl %o5, %o4, %o5
  514. ___
  515. &load_ccr("%o7","%o5","%o4");
  516. $code.=<<___;
  517. b .Lstride_$NUM
  518. nop
  519. .align 16
  520. .Lstride_$NUM:
  521. ___
  522. for($i=0; $i<14 && $i<$NUM; $i+=2) {
  523. &load_b_pair("%o7",@B[$i],@B[$i+1]);
  524. }
  525. $code.=<<___;
  526. save %sp,-128,%sp; or $sentinel,%fp,%fp
  527. ___
  528. for(; $i<$NUM; $i+=2) {
  529. &load_b_pair("%i7",@B[$i],@B[$i+1]);
  530. }
  531. $code.=<<___;
  532. srax $pwr, 32, %o4 ! unpack $pwr
  533. srl $pwr, %g0, %o5
  534. sub %o4, 5, %o4
  535. mov $pwrtbl, %i7
  536. sllx %o4, 32, $pwr ! re-pack $pwr
  537. or %o5, $pwr, $pwr
  538. srl %o5, %o4, %o5
  539. ___
  540. &load_ccr("%i7","%o5","%o4",1);
  541. # magic ################################################################
  542. for($i=0; $i<5; $i++) {
  543. $code.=<<___;
  544. .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
  545. fbu,pn %fcc3,.Labort_$NUM
  546. #ifndef __arch64__
  547. and %fp,$sentinel,$sentinel
  548. brz,pn $sentinel,.Labort_$NUM
  549. #endif
  550. nop
  551. ___
  552. }
  553. $code.=<<___;
  554. wr %o4, %g0, %ccr
  555. .word 0x81b02920+$NUM-1 ! montmul $NUM-1
  556. fbu,pn %fcc3,.Labort_$NUM
  557. #ifndef __arch64__
  558. and %fp,$sentinel,$sentinel
  559. brz,pn $sentinel,.Labort_$NUM
  560. #endif
  561. srax $pwr, 32, %o4
  562. #ifdef __arch64__
  563. brgez %o4,.Lstride_$NUM
  564. restore
  565. restore
  566. restore
  567. restore
  568. restore
  569. #else
  570. brgez %o4,.Lstride_$NUM
  571. restore; and %fp,$sentinel,$sentinel
  572. restore; and %fp,$sentinel,$sentinel
  573. restore; and %fp,$sentinel,$sentinel
  574. restore; and %fp,$sentinel,$sentinel
  575. brz,pn $sentinel,.Labort1_$NUM
  576. restore
  577. #endif
  578. ___
  579. # save tp[$NUM] ########################################################
  580. for($i=0; $i<14 && $i<$NUM; $i++) {
  581. $code.=<<___;
  582. movxtod @A[$i],@R[$i]
  583. ___
  584. }
  585. $code.=<<___;
  586. #ifdef __arch64__
  587. restore
  588. #else
  589. and %fp,$sentinel,$sentinel
  590. restore
  591. and $sentinel,1,%o7
  592. and %fp,$sentinel,$sentinel
  593. srl %fp,0,%fp ! just in case?
  594. or %o7,$sentinel,$sentinel
  595. brz,a,pn $sentinel,.Ldone_$NUM
  596. mov 0,%i0 ! return failure
  597. #endif
  598. ___
  599. for($i=0; $i<$NUM; $i++) {
  600. $code.=<<___;
  601. std @R[$i],[$tp+$i*8]
  602. ___
  603. }
  604. $code.=<<___;
  605. mov 1,%i0 ! return success
  606. .Ldone_$NUM:
  607. ret
  608. restore
  609. .Labort_$NUM:
  610. restore
  611. restore
  612. restore
  613. restore
  614. restore
  615. .Labort1_$NUM:
  616. restore
  617. mov 0,%i0 ! return failure
  618. ret
  619. restore
  620. .type bn_pwr5_mont_t4_$NUM, #function
  621. .size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
  622. ___
  623. }
  624. for ($i=8;$i<=32;$i+=8) {
  625. &generate_bn_pwr5_mont_t4($i);
  626. }
  627. {
  628. ########################################################################
  629. # Fall-back subroutines
  630. #
  631. # copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
  632. #
  633. ($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
  634. (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
  635. # int bn_mul_mont(
  636. $rp="%o0"; # u64 *rp,
  637. $ap="%o1"; # const u64 *ap,
  638. $bp="%o2"; # const u64 *bp,
  639. $np="%o3"; # const u64 *np,
  640. $n0p="%o4"; # const BN_ULONG *n0,
  641. $num="%o5"; # int num); # caller ensures that num is >=3
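#
# What follows is plain word-serial Montgomery multiplication.  A compact
# C sketch of the same computation (an illustration using the usual CIOS
# recurrence, not a transcription of the assembly; u64/u128 are assumed to
# be uint64_t/unsigned __int128, and n0 is the usual -np[0]^-1 mod 2^64):
#
#	static int mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
#	                        const u64 *np, u64 n0, int num)
#	{
#	    u64 t[num + 2], d[num], m;
#	    u128 acc, x, borrow = 0;
#	    int i, j;
#
#	    for (j = 0; j < num + 2; j++) t[j] = 0;
#	    for (i = 0; i < num; i++) {
#	        acc = 0;                            /* t += ap[] * bp[i] */
#	        for (j = 0; j < num; j++) {
#	            acc += (u128)ap[j] * bp[i] + t[j];
#	            t[j] = (u64)acc;  acc >>= 64;
#	        }
#	        acc += t[num];
#	        t[num] = (u64)acc;  t[num + 1] = (u64)(acc >> 64);
#
#	        m = t[0] * n0;                      /* zeroes t's low word */
#	        acc = ((u128)m * np[0] + t[0]) >> 64;
#	        for (j = 1; j < num; j++) {         /* t = (t + m*np[])/2^64 */
#	            acc += (u128)m * np[j] + t[j];
#	            t[j - 1] = (u64)acc;  acc >>= 64;
#	        }
#	        acc += t[num];
#	        t[num - 1] = (u64)acc;
#	        t[num] = t[num + 1] + (u64)(acc >> 64);
#	    }
#	    /* conditional final subtraction: rp[] = t - np if t >= np */
#	    for (j = 0; j < num; j++) {
#	        x = (u128)t[j] - np[j] - borrow;
#	        d[j] = (u64)x;  borrow = (x >> 64) & 1;
#	    }
#	    for (j = 0; j < num; j++)
#	        rp[j] = (t[num] || !borrow) ? d[j] : t[j];
#	    return 1;
#	}
#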
  642. $code.=<<___;
  643. .globl bn_mul_mont_t4
  644. .align 32
  645. bn_mul_mont_t4:
  646. add %sp, STACK_BIAS, %g4 ! real top of stack
  647. sll $num, 3, $num ! size in bytes
  648. add $num, 63, %g1
  649. andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
  650. sub %g4, %g1, %g1
  651. andn %g1, 63, %g1 ! align at 64 byte
  652. sub %g1, STACK_FRAME, %g1 ! new top of stack
  653. sub %g1, %g4, %g1
  654. save %sp, %g1, %sp
  655. ___
  656. # +-------------------------------+<----- %sp
  657. # . .
  658. # +-------------------------------+<----- aligned at 64 bytes
  659. # | __int64 tmp[0] |
  660. # +-------------------------------+
  661. # . .
  662. # . .
  663. # +-------------------------------+<----- aligned at 64 bytes
  664. # . .
  665. ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
  666. ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
  667. ($ovf,$i)=($t0,$t1);
  668. $code.=<<___;
  669. ld [$n0p+0], $t0 ! pull n0[0..1] value
  670. ld [$n0p+4], $t1
  671. add %sp, STACK_BIAS+STACK_FRAME, $tp
  672. ldx [$bp+0], $m0 ! m0=bp[0]
  673. sllx $t1, 32, $n0
  674. add $bp, 8, $bp
  675. or $t0, $n0, $n0
  676. ldx [$ap+0], $aj ! ap[0]
  677. mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
  678. umulxhi $aj, $m0, $hi0
  679. ldx [$ap+8], $aj ! ap[1]
  680. add $ap, 16, $ap
  681. ldx [$np+0], $nj ! np[0]
  682. mulx $lo0, $n0, $m1 ! "tp[0]"*n0
  683. mulx $aj, $m0, $alo ! ap[1]*bp[0]
  684. umulxhi $aj, $m0, $aj ! ahi=aj
  685. mulx $nj, $m1, $lo1 ! np[0]*m1
  686. umulxhi $nj, $m1, $hi1
  687. ldx [$np+8], $nj ! np[1]
  688. addcc $lo0, $lo1, $lo1
  689. add $np, 16, $np
  690. addxc %g0, $hi1, $hi1
  691. mulx $nj, $m1, $nlo ! np[1]*m1
  692. umulxhi $nj, $m1, $nj ! nhi=nj
  693. ba .L1st
  694. sub $num, 24, $cnt ! cnt=num-3
  695. .align 16
  696. .L1st:
  697. addcc $alo, $hi0, $lo0
  698. addxc $aj, %g0, $hi0
  699. ldx [$ap+0], $aj ! ap[j]
  700. addcc $nlo, $hi1, $lo1
  701. add $ap, 8, $ap
  702. addxc $nj, %g0, $hi1 ! nhi=nj
  703. ldx [$np+0], $nj ! np[j]
  704. mulx $aj, $m0, $alo ! ap[j]*bp[0]
  705. add $np, 8, $np
  706. umulxhi $aj, $m0, $aj ! ahi=aj
  707. mulx $nj, $m1, $nlo ! np[j]*m1
  708. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  709. umulxhi $nj, $m1, $nj ! nhi=nj
  710. addxc %g0, $hi1, $hi1
  711. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  712. add $tp, 8, $tp ! tp++
  713. brnz,pt $cnt, .L1st
  714. sub $cnt, 8, $cnt ! j--
  715. !.L1st
  716. addcc $alo, $hi0, $lo0
  717. addxc $aj, %g0, $hi0 ! ahi=aj
  718. addcc $nlo, $hi1, $lo1
  719. addxc $nj, %g0, $hi1
  720. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  721. addxc %g0, $hi1, $hi1
  722. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  723. add $tp, 8, $tp
  724. addcc $hi0, $hi1, $hi1
  725. addxc %g0, %g0, $ovf ! upmost overflow bit
  726. stxa $hi1, [$tp]0xe2
  727. add $tp, 8, $tp
  728. ba .Louter
  729. sub $num, 16, $i ! i=num-2
  730. .align 16
  731. .Louter:
  732. ldx [$bp+0], $m0 ! m0=bp[i]
  733. add $bp, 8, $bp
  734. sub $ap, $num, $ap ! rewind
  735. sub $np, $num, $np
  736. sub $tp, $num, $tp
  737. ldx [$ap+0], $aj ! ap[0]
  738. ldx [$np+0], $nj ! np[0]
  739. mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
  740. ldx [$tp], $tj ! tp[0]
  741. umulxhi $aj, $m0, $hi0
  742. ldx [$ap+8], $aj ! ap[1]
  743. addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
  744. mulx $aj, $m0, $alo ! ap[1]*bp[i]
  745. addxc %g0, $hi0, $hi0
  746. mulx $lo0, $n0, $m1 ! tp[0]*n0
  747. umulxhi $aj, $m0, $aj ! ahi=aj
  748. mulx $nj, $m1, $lo1 ! np[0]*m1
  749. add $ap, 16, $ap
  750. umulxhi $nj, $m1, $hi1
  751. ldx [$np+8], $nj ! np[1]
  752. add $np, 16, $np
  753. addcc $lo1, $lo0, $lo1
  754. mulx $nj, $m1, $nlo ! np[1]*m1
  755. addxc %g0, $hi1, $hi1
  756. umulxhi $nj, $m1, $nj ! nhi=nj
  757. ba .Linner
  758. sub $num, 24, $cnt ! cnt=num-3
  759. .align 16
  760. .Linner:
  761. addcc $alo, $hi0, $lo0
  762. ldx [$tp+8], $tj ! tp[j]
  763. addxc $aj, %g0, $hi0 ! ahi=aj
  764. ldx [$ap+0], $aj ! ap[j]
  765. add $ap, 8, $ap
  766. addcc $nlo, $hi1, $lo1
  767. mulx $aj, $m0, $alo ! ap[j]*bp[i]
  768. addxc $nj, %g0, $hi1 ! nhi=nj
  769. ldx [$np+0], $nj ! np[j]
  770. add $np, 8, $np
  771. umulxhi $aj, $m0, $aj ! ahi=aj
  772. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  773. mulx $nj, $m1, $nlo ! np[j]*m1
  774. addxc %g0, $hi0, $hi0
  775. umulxhi $nj, $m1, $nj ! nhi=nj
  776. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  777. addxc %g0, $hi1, $hi1
  778. stx $lo1, [$tp] ! tp[j-1]
  779. add $tp, 8, $tp
  780. brnz,pt $cnt, .Linner
  781. sub $cnt, 8, $cnt
  782. !.Linner
  783. ldx [$tp+8], $tj ! tp[j]
  784. addcc $alo, $hi0, $lo0
  785. addxc $aj, %g0, $hi0 ! ahi=aj
  786. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  787. addxc %g0, $hi0, $hi0
  788. addcc $nlo, $hi1, $lo1
  789. addxc $nj, %g0, $hi1 ! nhi=nj
  790. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  791. addxc %g0, $hi1, $hi1
  792. stx $lo1, [$tp] ! tp[j-1]
  793. subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
  794. addxccc $hi1, $hi0, $hi1
  795. addxc %g0, %g0, $ovf
  796. stx $hi1, [$tp+8]
  797. add $tp, 16, $tp
  798. brnz,pt $i, .Louter
  799. sub $i, 8, $i
  800. sub $ap, $num, $ap ! rewind
  801. sub $np, $num, $np
  802. sub $tp, $num, $tp
  803. ba .Lsub
  804. subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
  805. .align 16
  806. .Lsub:
  807. ldx [$tp], $tj
  808. add $tp, 8, $tp
  809. ldx [$np+0], $nj
  810. add $np, 8, $np
  811. subccc $tj, $nj, $t2 ! tp[j]-np[j]
  812. srlx $tj, 32, $tj
  813. srlx $nj, 32, $nj
  814. subccc $tj, $nj, $t3
  815. add $rp, 8, $rp
  816. st $t2, [$rp-4] ! reverse order
  817. st $t3, [$rp-8]
  818. brnz,pt $cnt, .Lsub
  819. sub $cnt, 8, $cnt
  820. sub $np, $num, $np ! rewind
  821. sub $tp, $num, $tp
  822. sub $rp, $num, $rp
  823. subccc $ovf, %g0, $ovf ! handle upmost overflow bit
  824. ba .Lcopy
  825. sub $num, 8, $cnt
  826. .align 16
  827. .Lcopy: ! conditional copy
  828. ldx [$tp], $tj
  829. ldx [$rp+0], $t2
  830. stx %g0, [$tp] ! zap
  831. add $tp, 8, $tp
  832. movcs %icc, $tj, $t2
  833. stx $t2, [$rp+0]
  834. add $rp, 8, $rp
  835. brnz $cnt, .Lcopy
  836. sub $cnt, 8, $cnt
  837. mov 1, %o0
  838. ret
  839. restore
  840. .type bn_mul_mont_t4, #function
  841. .size bn_mul_mont_t4, .-bn_mul_mont_t4
  842. ___
  843. # int bn_mul_mont_gather5(
  844. $rp="%o0"; # u64 *rp,
  845. $ap="%o1"; # const u64 *ap,
  846. $bp="%o2"; # const u64 *pwrtbl,
  847. $np="%o3"; # const u64 *np,
  848. $n0p="%o4"; # const BN_ULONG *n0,
  849. $num="%o5"; # int num, # caller ensures that num is >=3
  850. # int power);
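#
# This is the same algorithm as bn_mul_mont_t4 above, except that bp points
# at the scattered power table: the multiplier for each outer-loop
# iteration is gathered with load_b under the condition-code mask derived
# from 'power' (the 7th argument), touching every candidate slot rather
# than loading bp[power] directly.
#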
  851. $code.=<<___;
  852. .globl bn_mul_mont_gather5_t4
  853. .align 32
  854. bn_mul_mont_gather5_t4:
  855. add %sp, STACK_BIAS, %g4 ! real top of stack
  856. sll $num, 3, $num ! size in bytes
  857. add $num, 63, %g1
  858. andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
  859. sub %g4, %g1, %g1
  860. andn %g1, 63, %g1 ! align at 64 byte
  861. sub %g1, STACK_FRAME, %g1 ! new top of stack
  862. sub %g1, %g4, %g1
  863. LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
  864. save %sp, %g1, %sp
  865. ___
  866. # +-------------------------------+<----- %sp
  867. # . .
  868. # +-------------------------------+<----- aligned at 64 bytes
  869. # | __int64 tmp[0] |
  870. # +-------------------------------+
  871. # . .
  872. # . .
  873. # +-------------------------------+<----- aligned at 64 bytes
  874. # . .
  875. ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
  876. ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
  877. ($ovf,$i)=($t0,$t1);
  878. &load_ccr($bp,"%g4",$ccr);
  879. &load_b($bp,$m0,"%o7"); # m0=bp[0]
  880. $code.=<<___;
  881. ld [$n0p+0], $t0 ! pull n0[0..1] value
  882. ld [$n0p+4], $t1
  883. add %sp, STACK_BIAS+STACK_FRAME, $tp
  884. sllx $t1, 32, $n0
  885. or $t0, $n0, $n0
  886. ldx [$ap+0], $aj ! ap[0]
  887. mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
  888. umulxhi $aj, $m0, $hi0
  889. ldx [$ap+8], $aj ! ap[1]
  890. add $ap, 16, $ap
  891. ldx [$np+0], $nj ! np[0]
  892. mulx $lo0, $n0, $m1 ! "tp[0]"*n0
  893. mulx $aj, $m0, $alo ! ap[1]*bp[0]
  894. umulxhi $aj, $m0, $aj ! ahi=aj
  895. mulx $nj, $m1, $lo1 ! np[0]*m1
  896. umulxhi $nj, $m1, $hi1
  897. ldx [$np+8], $nj ! np[1]
  898. addcc $lo0, $lo1, $lo1
  899. add $np, 16, $np
  900. addxc %g0, $hi1, $hi1
  901. mulx $nj, $m1, $nlo ! np[1]*m1
  902. umulxhi $nj, $m1, $nj ! nhi=nj
  903. ba .L1st_g5
  904. sub $num, 24, $cnt ! cnt=num-3
  905. .align 16
  906. .L1st_g5:
  907. addcc $alo, $hi0, $lo0
  908. addxc $aj, %g0, $hi0
  909. ldx [$ap+0], $aj ! ap[j]
  910. addcc $nlo, $hi1, $lo1
  911. add $ap, 8, $ap
  912. addxc $nj, %g0, $hi1 ! nhi=nj
  913. ldx [$np+0], $nj ! np[j]
  914. mulx $aj, $m0, $alo ! ap[j]*bp[0]
  915. add $np, 8, $np
  916. umulxhi $aj, $m0, $aj ! ahi=aj
  917. mulx $nj, $m1, $nlo ! np[j]*m1
  918. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  919. umulxhi $nj, $m1, $nj ! nhi=nj
  920. addxc %g0, $hi1, $hi1
  921. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  922. add $tp, 8, $tp ! tp++
  923. brnz,pt $cnt, .L1st_g5
  924. sub $cnt, 8, $cnt ! j--
  925. !.L1st_g5
  926. addcc $alo, $hi0, $lo0
  927. addxc $aj, %g0, $hi0 ! ahi=aj
  928. addcc $nlo, $hi1, $lo1
  929. addxc $nj, %g0, $hi1
  930. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  931. addxc %g0, $hi1, $hi1
  932. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  933. add $tp, 8, $tp
  934. addcc $hi0, $hi1, $hi1
  935. addxc %g0, %g0, $ovf ! upmost overflow bit
  936. stxa $hi1, [$tp]0xe2
  937. add $tp, 8, $tp
  938. ba .Louter_g5
  939. sub $num, 16, $i ! i=num-2
  940. .align 16
  941. .Louter_g5:
  942. wr $ccr, %g0, %ccr
  943. ___
  944. &load_b($bp,$m0); # m0=bp[i]
  945. $code.=<<___;
  946. sub $ap, $num, $ap ! rewind
  947. sub $np, $num, $np
  948. sub $tp, $num, $tp
  949. ldx [$ap+0], $aj ! ap[0]
  950. ldx [$np+0], $nj ! np[0]
  951. mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
  952. ldx [$tp], $tj ! tp[0]
  953. umulxhi $aj, $m0, $hi0
  954. ldx [$ap+8], $aj ! ap[1]
  955. addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
  956. mulx $aj, $m0, $alo ! ap[1]*bp[i]
  957. addxc %g0, $hi0, $hi0
  958. mulx $lo0, $n0, $m1 ! tp[0]*n0
  959. umulxhi $aj, $m0, $aj ! ahi=aj
  960. mulx $nj, $m1, $lo1 ! np[0]*m1
  961. add $ap, 16, $ap
  962. umulxhi $nj, $m1, $hi1
  963. ldx [$np+8], $nj ! np[1]
  964. add $np, 16, $np
  965. addcc $lo1, $lo0, $lo1
  966. mulx $nj, $m1, $nlo ! np[1]*m1
  967. addxc %g0, $hi1, $hi1
  968. umulxhi $nj, $m1, $nj ! nhi=nj
  969. ba .Linner_g5
  970. sub $num, 24, $cnt ! cnt=num-3
  971. .align 16
  972. .Linner_g5:
  973. addcc $alo, $hi0, $lo0
  974. ldx [$tp+8], $tj ! tp[j]
  975. addxc $aj, %g0, $hi0 ! ahi=aj
  976. ldx [$ap+0], $aj ! ap[j]
  977. add $ap, 8, $ap
  978. addcc $nlo, $hi1, $lo1
  979. mulx $aj, $m0, $alo ! ap[j]*bp[i]
  980. addxc $nj, %g0, $hi1 ! nhi=nj
  981. ldx [$np+0], $nj ! np[j]
  982. add $np, 8, $np
  983. umulxhi $aj, $m0, $aj ! ahi=aj
  984. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  985. mulx $nj, $m1, $nlo ! np[j]*m1
  986. addxc %g0, $hi0, $hi0
  987. umulxhi $nj, $m1, $nj ! nhi=nj
  988. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  989. addxc %g0, $hi1, $hi1
  990. stx $lo1, [$tp] ! tp[j-1]
  991. add $tp, 8, $tp
  992. brnz,pt $cnt, .Linner_g5
  993. sub $cnt, 8, $cnt
  994. !.Linner_g5
  995. ldx [$tp+8], $tj ! tp[j]
  996. addcc $alo, $hi0, $lo0
  997. addxc $aj, %g0, $hi0 ! ahi=aj
  998. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  999. addxc %g0, $hi0, $hi0
  1000. addcc $nlo, $hi1, $lo1
  1001. addxc $nj, %g0, $hi1 ! nhi=nj
  1002. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  1003. addxc %g0, $hi1, $hi1
  1004. stx $lo1, [$tp] ! tp[j-1]
  1005. subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
  1006. addxccc $hi1, $hi0, $hi1
  1007. addxc %g0, %g0, $ovf
  1008. stx $hi1, [$tp+8]
  1009. add $tp, 16, $tp
  1010. brnz,pt $i, .Louter_g5
  1011. sub $i, 8, $i
  1012. sub $ap, $num, $ap ! rewind
  1013. sub $np, $num, $np
  1014. sub $tp, $num, $tp
  1015. ba .Lsub_g5
  1016. subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
  1017. .align 16
  1018. .Lsub_g5:
  1019. ldx [$tp], $tj
  1020. add $tp, 8, $tp
  1021. ldx [$np+0], $nj
  1022. add $np, 8, $np
  1023. subccc $tj, $nj, $t2 ! tp[j]-np[j]
  1024. srlx $tj, 32, $tj
  1025. srlx $nj, 32, $nj
  1026. subccc $tj, $nj, $t3
  1027. add $rp, 8, $rp
  1028. st $t2, [$rp-4] ! reverse order
  1029. st $t3, [$rp-8]
  1030. brnz,pt $cnt, .Lsub_g5
  1031. sub $cnt, 8, $cnt
  1032. sub $np, $num, $np ! rewind
  1033. sub $tp, $num, $tp
  1034. sub $rp, $num, $rp
  1035. subccc $ovf, %g0, $ovf ! handle upmost overflow bit
  1036. ba .Lcopy_g5
  1037. sub $num, 8, $cnt
  1038. .align 16
  1039. .Lcopy_g5: ! conditional copy
  1040. ldx [$tp], $tj
  1041. ldx [$rp+0], $t2
  1042. stx %g0, [$tp] ! zap
  1043. add $tp, 8, $tp
  1044. movcs %icc, $tj, $t2
  1045. stx $t2, [$rp+0]
  1046. add $rp, 8, $rp
  1047. brnz $cnt, .Lcopy_g5
  1048. sub $cnt, 8, $cnt
  1049. mov 1, %o0
  1050. ret
  1051. restore
  1052. .type bn_mul_mont_gather5_t4, #function
  1053. .size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
  1054. ___
  1055. }
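#
# Remaining helpers: bn_flip_t4 copies a vector of 64-bit words while
# swapping the two 32-bit halves of each word; bn_flip_n_scatter5_t4 does
# the same swap and scatters the words into the power table at a 32*8-byte
# stride (32 powers interleaved word by word, matching the layout that
# load_ccr/load_b expect); bn_gather5_t4 reads a vector back out of that
# table using the same constant-time selection.
#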
  1056. $code.=<<___;
  1057. .globl bn_flip_t4
  1058. .align 32
  1059. bn_flip_t4:
  1060. .Loop_flip:
  1061. ld [%o1+0], %o4
  1062. sub %o2, 1, %o2
  1063. ld [%o1+4], %o5
  1064. add %o1, 8, %o1
  1065. st %o5, [%o0+0]
  1066. st %o4, [%o0+4]
  1067. brnz %o2, .Loop_flip
  1068. add %o0, 8, %o0
  1069. retl
  1070. nop
  1071. .type bn_flip_t4, #function
  1072. .size bn_flip_t4, .-bn_flip_t4
  1073. .globl bn_flip_n_scatter5_t4
  1074. .align 32
  1075. bn_flip_n_scatter5_t4:
  1076. sll %o3, 3, %o3
  1077. srl %o1, 1, %o1
  1078. add %o3, %o2, %o2 ! &pwrtbl[pwr]
  1079. sub %o1, 1, %o1
  1080. .Loop_flip_n_scatter5:
  1081. ld [%o0+0], %o4 ! inp[i]
  1082. ld [%o0+4], %o5
  1083. add %o0, 8, %o0
  1084. sllx %o5, 32, %o5
  1085. or %o4, %o5, %o5
  1086. stx %o5, [%o2]
  1087. add %o2, 32*8, %o2
  1088. brnz %o1, .Loop_flip_n_scatter5
  1089. sub %o1, 1, %o1
  1090. retl
  1091. nop
  1092. .type bn_flip_n_scatter5_t4, #function
  1093. .size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
  1094. .globl bn_gather5_t4
  1095. .align 32
  1096. bn_gather5_t4:
  1097. ___
  1098. &load_ccr("%o2","%o3","%g1");
  1099. $code.=<<___;
  1100. sub %o1, 1, %o1
  1101. .Loop_gather5:
  1102. ___
  1103. &load_b("%o2","%g1");
  1104. $code.=<<___;
  1105. stx %g1, [%o0]
  1106. add %o0, 8, %o0
  1107. brnz %o1, .Loop_gather5
  1108. sub %o1, 1, %o1
  1109. retl
  1110. nop
  1111. .type bn_gather5_t4, #function
  1112. .size bn_gather5_t4, .-bn_gather5_t4
  1113. .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
  1114. .align 4
  1115. ___
  1116. &emit_assembler();
  1117. close STDOUT or die "error closing STDOUT: $!";