sparct4-mont.pl 27 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
  10. # <appro@openssl.org>. The module is licensed under 2-clause BSD
  11. # license. November 2012. All rights reserved.
  12. # ====================================================================
  13. ######################################################################
  14. # Montgomery squaring-n-multiplication module for SPARC T4.
  15. #
  16. # The module consists of three parts:
  17. #
  18. # 1) collection of "single-op" subroutines that perform single
  19. # operation, Montgomery squaring or multiplication, on 512-,
  20. # 1024-, 1536- and 2048-bit operands;
  21. # 2) collection of "multi-op" subroutines that perform 5 squaring and
  22. # 1 multiplication operations on operands of above lengths;
  23. # 3) fall-back and helper VIS3 subroutines.
  24. #
  25. # RSA sign is dominated by multi-op subroutine, while RSA verify and
  26. # DSA - by single-op. Special note about 4096-bit RSA verify result.
  27. # Operands are too long for dedicated hardware and it's handled by
  28. # VIS3 code, which is why you don't see any improvement. It's surely
  29. # possible to improve it [by deploying 'mpmul' instruction], maybe in
  30. # the future...
  31. #
  32. # Performance improvement.
  33. #
  34. # 64-bit process, VIS3:
  35. # sign verify sign/s verify/s
  36. # rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
  37. # rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
  38. # rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
  39. # dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
  40. # dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
  41. #
  42. # 64-bit process, this module:
  43. # sign verify sign/s verify/s
  44. # rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
  45. # rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
  46. # rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
  47. # dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
  48. # dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
  49. #
  50. ######################################################################
  51. # 32-bit process, VIS3:
  52. # sign verify sign/s verify/s
  53. # rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
  54. # rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
  55. # rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
  56. # dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
  57. # dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
  58. #
  59. # 32-bit process, this module:
  60. # sign verify sign/s verify/s
  61. # rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
  62. # rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
  63. # rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
  64. # dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
  65. # dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
  66. #
  67. # 32-bit code is prone to performance degradation as interrupt rate
  68. # dispatched to CPU executing the code grows. This is because in
  69. # standard process of handling interrupt in 32-bit process context
  70. # upper halves of most integer registers used as input or output are
  71. # zeroed. This renders result invalid, and operation has to be re-run.
  72. # If CPU is "bothered" with timer interrupts only, the penalty is
  73. # hardly measurable. But in order to mitigate this problem for higher
  74. # interrupt rates contemporary Linux kernel recognizes biased stack
  75. # even in 32-bit process context and preserves full register contents.
  76. # See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
  77. # for details.
  # Work out the directory this script lives in so that the perlasm helper
  # can be found regardless of the current working directory.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../perlasm");
  require "sparcv9_modes.pl";

  # The output file name is the last command-line argument (pop at file
  # scope operates on @ARGV).  Without an argument keep writing to the
  # inherited STDOUT instead of attempting to open ">".
  $output = pop;
  if (defined($output) && $output ne "") {
      # Three-argument open: the name is never parsed for mode characters.
      # A failed open would otherwise leave STDOUT closed and silently
      # discard everything generated below, so stop right here.
      open STDOUT, ">", $output
          or die "can't open $output: $!";
  }
  # Prologue of the generated file: include sparc_arch.h, declare %g2/%g3 as
  # scratch for 64-bit builds, switch to the .text section and, for PIC
  # builds, instantiate SPARC_PIC_THUNK(%g1).
  83. $code.=<<___;
  84. #include "sparc_arch.h"
  85. #ifdef __arch64__
  86. .register %g2,#scratch
  87. .register %g3,#scratch
  88. #endif
  89. .section ".text",#alloc,#execinstr
  90. #ifdef __PIC__
  91. SPARC_PIC_THUNK(%g1)
  92. #endif
  93. ___
  94. ########################################################################
  95. # Register layout for mont[mul|sqr] instructions.
  96. # For details see "Oracle SPARC Architecture 2011" manual at
  97. # http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
  98. #
  # Each array below holds 32 register names, indexed by 64-bit word number:
  #   @R - even-numbered FP registers in the order %f0..%f22, %f60, %f62,
  #        %f24..%f58; the generators store the result words from these
  #   @N - %l0-%l7,%o0-%o5 twice, then %l0-%l3; the generators load np[] here
  #   @A - the first 14 entries of @N followed by @R[14..31] (%f24..%f58);
  #        the generators load ap[] / tp[] here
  #   @B - %i0-%i5,%l0-%l7 twice, then %o0-%o3; the generators load bp[] /
  #        the gathered power-table entry here
  # The integer register names repeat every 14 entries.  The generators emit
  # a `save` before each group of loads, presumably so that every group ends
  # up in a register window of its own -- confirm against the manual above.
  99. my @R=map("%f".2*$_,(0..11,30,31,12..29));
  100. my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
  101. my @A=(@N[0..13],@R[14..31]);
  102. my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
  103. ########################################################################
  104. # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
  105. # const u64 *np,const BN_ULONG *n0);
  106. #
  # generate_bn_mul_mont_t4($NUM) appends the assembly for
  # bn_mul_mont_t4_$NUM to $code.  $NUM is the operand length in 64-bit
  # words; every operand word is read as two 32-bit loads, the word at
  # offset +4 becoming the upper half.
  #
  # The emitted routine loads ap[], np[] and (unless ap == bp) bp[] into the
  # registers named by @A, @N and @B, issues montmul -- or montsqr when
  # ap == bp -- as a raw .word opcode, and stores the result through @R to
  # rp[].  It returns 1 on success and 0 on failure.  Failure is reported
  # when %fcc3 is unordered after the instruction or, in 32-bit builds, when
  # one of the sentinel checks against %fp yields zero (see the note about
  # 32-bit processes in the file header).
  #
  # NOTE(review): the sub is declared with an empty prototype although it
  # takes one argument; that only works because callers use the &name(...)
  # form.  It also uses the package global $i as its loop index and leaves
  # it equal to $NUM on return.
  107. sub generate_bn_mul_mont_t4() {
  108. my $NUM=shift;
  # Pointer arguments and the sentinel live in %g1..%g5 so that they stay
  # addressable across the save/restore sequences below.
  109. my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
  110. $code.=<<___;
  111. .globl bn_mul_mont_t4_$NUM
  112. .align 32
  113. bn_mul_mont_t4_$NUM:
  114. #ifdef __arch64__
  115. mov 0,$sentinel
  116. mov -128,%g4
  117. #elif defined(SPARCV9_64BIT_STACK)
  118. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  119. ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
  120. mov -2047,%g4
  121. and %g1,SPARCV9_64BIT_STACK,%g1
  122. movrz %g1,0,%g4
  123. mov -1,$sentinel
  124. add %g4,-128,%g4
  125. #else
  126. mov -1,$sentinel
  127. mov -128,%g4
  128. #endif
  129. sllx $sentinel,32,$sentinel
  130. save %sp,%g4,%sp
  131. #ifndef __arch64__
  132. save %sp,-128,%sp ! warm it up
  133. save %sp,-128,%sp
  134. save %sp,-128,%sp
  135. save %sp,-128,%sp
  136. save %sp,-128,%sp
  137. save %sp,-128,%sp
  138. restore
  139. restore
  140. restore
  141. restore
  142. restore
  143. restore
  144. #endif
  145. and %sp,1,%g4
  146. or $sentinel,%fp,%fp
  147. or %g4,$sentinel,$sentinel
  148. ! copy arguments to global registers
  149. mov %i0,$rp
  150. mov %i1,$ap
  151. mov %i2,$bp
  152. mov %i3,$np
  153. ld [%i4+0],%f1 ! load *n0
  154. ld [%i4+4],%f0
  155. fsrc2 %f0,%f60
  156. ___
  157. # load ap[$NUM] ########################################################
  158. $code.=<<___;
  159. save %sp,-128,%sp; or $sentinel,%fp,%fp
  160. ___
  # The first (up to) 14 words of ap[] go to the integer registers
  # @A[0..13].  $lo is a temporary for the low half: the register the next
  # iteration overwrites anyway, or %o7 for the last of the 14.
  161. for($i=0; $i<14 && $i<$NUM; $i++) {
  162. my $lo=$i<13?@A[$i+1]:"%o7";
  163. $code.=<<___;
  164. ld [$ap+$i*8+0],$lo
  165. ld [$ap+$i*8+4],@A[$i]
  166. sllx @A[$i],32,@A[$i]
  167. or $lo,@A[$i],@A[$i]
  168. ___
  169. }
  # Any remaining words of ap[] are staged through one of the pairs
  # %f0/%f1 .. %f6/%f7 (chosen by $i%4) and copied with fsrc2 into the FP
  # register named by @A[$i].
  170. for(; $i<$NUM; $i++) {
  171. my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
  172. $code.=<<___;
  173. ld [$ap+$i*8+0],$lo
  174. ld [$ap+$i*8+4],$hi
  175. fsrc2 $hi,@A[$i]
  176. ___
  177. }
  178. # load np[$NUM] ########################################################
  179. $code.=<<___;
  180. save %sp,-128,%sp; or $sentinel,%fp,%fp
  181. ___
  # np[] is loaded in up to three groups (words 0..13, 14..27, 28..), each
  # preceded by its own `save`; all three `save`s are emitted even when
  # $NUM leaves the later groups empty.
  182. for($i=0; $i<14 && $i<$NUM; $i++) {
  183. my $lo=$i<13?@N[$i+1]:"%o7";
  184. $code.=<<___;
  185. ld [$np+$i*8+0],$lo
  186. ld [$np+$i*8+4],@N[$i]
  187. sllx @N[$i],32,@N[$i]
  188. or $lo,@N[$i],@N[$i]
  189. ___
  190. }
  191. $code.=<<___;
  192. save %sp,-128,%sp; or $sentinel,%fp,%fp
  193. ___
  194. for(; $i<28 && $i<$NUM; $i++) {
  195. my $lo=$i<27?@N[$i+1]:"%o7";
  196. $code.=<<___;
  197. ld [$np+$i*8+0],$lo
  198. ld [$np+$i*8+4],@N[$i]
  199. sllx @N[$i],32,@N[$i]
  200. or $lo,@N[$i],@N[$i]
  201. ___
  202. }
  203. $code.=<<___;
  204. save %sp,-128,%sp; or $sentinel,%fp,%fp
  205. ___
  206. for(; $i<$NUM; $i++) {
  207. my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
  208. $code.=<<___;
  209. ld [$np+$i*8+0],$lo
  210. ld [$np+$i*8+4],@N[$i]
  211. sllx @N[$i],32,@N[$i]
  212. or $lo,@N[$i],@N[$i]
  213. ___
  214. }
  # Squaring shortcut: when ap and bp are the same pointer, branch to
  # .Lmsquare_$NUM, which skips the bp[] loads and uses montsqr.
  215. $code.=<<___;
  216. cmp $ap,$bp
  217. be SIZE_T_CC,.Lmsquare_$NUM
  218. nop
  219. ___
  220. # load bp[$NUM] ########################################################
  221. $code.=<<___;
  222. save %sp,-128,%sp; or $sentinel,%fp,%fp
  223. ___
  # bp[] is loaded in two groups (words 0..13 and 14..), again with one
  # `save` per group.
  224. for($i=0; $i<14 && $i<$NUM; $i++) {
  225. my $lo=$i<13?@B[$i+1]:"%o7";
  226. $code.=<<___;
  227. ld [$bp+$i*8+0],$lo
  228. ld [$bp+$i*8+4],@B[$i]
  229. sllx @B[$i],32,@B[$i]
  230. or $lo,@B[$i],@B[$i]
  231. ___
  232. }
  233. $code.=<<___;
  234. save %sp,-128,%sp; or $sentinel,%fp,%fp
  235. ___
  236. for(; $i<$NUM; $i++) {
  237. my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
  238. $code.=<<___;
  239. ld [$bp+$i*8+0],$lo
  240. ld [$bp+$i*8+4],@B[$i]
  241. sllx @B[$i],32,@B[$i]
  242. or $lo,@B[$i],@B[$i]
  243. ___
  244. }
  245. # magic ################################################################
  # The instruction is emitted as a raw .word with $NUM-1 added to the base
  # opcode.  Afterwards the five register windows opened for np[] and bp[]
  # are unwound; the 32-bit variant re-checks the sentinel against %fp
  # after each `restore`.
  246. $code.=<<___;
  247. .word 0x81b02920+$NUM-1 ! montmul $NUM-1
  248. .Lmresume_$NUM:
  249. fbu,pn %fcc3,.Lmabort_$NUM
  250. #ifndef __arch64__
  251. and %fp,$sentinel,$sentinel
  252. brz,pn $sentinel,.Lmabort_$NUM
  253. #endif
  254. nop
  255. #ifdef __arch64__
  256. restore
  257. restore
  258. restore
  259. restore
  260. restore
  261. #else
  262. restore; and %fp,$sentinel,$sentinel
  263. restore; and %fp,$sentinel,$sentinel
  264. restore; and %fp,$sentinel,$sentinel
  265. restore; and %fp,$sentinel,$sentinel
  266. brz,pn $sentinel,.Lmabort1_$NUM
  267. restore
  268. #endif
  269. ___
  270. # save tp[$NUM] ########################################################
  # Move the first (up to) 14 result words from the integer registers
  # @A[0..13] into the FP registers @R[0..13] before the window holding
  # them is released.
  271. for($i=0; $i<14 && $i<$NUM; $i++) {
  272. $code.=<<___;
  273. movxtod @A[$i],@R[$i]
  274. ___
  275. }
  276. $code.=<<___;
  277. #ifdef __arch64__
  278. restore
  279. #else
  280. and %fp,$sentinel,$sentinel
  281. restore
  282. and $sentinel,1,%o7
  283. and %fp,$sentinel,$sentinel
  284. srl %fp,0,%fp ! just in case?
  285. or %o7,$sentinel,$sentinel
  286. brz,a,pn $sentinel,.Lmdone_$NUM
  287. mov 0,%i0 ! return failure
  288. #endif
  289. ___
  # Result words 0..11 are stored straight from @R as two 32-bit halves;
  # $lo is the odd-numbered register that follows the even-numbered one
  # named in @R.
  290. for($i=0; $i<12 && $i<$NUM; $i++) {
  291. @R[$i] =~ /%f([0-9]+)/;
  292. my $lo = "%f".($1+1);
  293. $code.=<<___;
  294. st $lo,[$rp+$i*8+0]
  295. st @R[$i],[$rp+$i*8+4]
  296. ___
  297. }
  # The remaining result words are first copied with fsrc2 into one of the
  # pairs %f0/%f1 .. %f6/%f7 and stored from there.
  298. for(; $i<$NUM; $i++) {
  299. my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
  300. $code.=<<___;
  301. fsrc2 @R[$i],$hi
  302. st $lo,[$rp+$i*8+0]
  303. st $hi,[$rp+$i*8+4]
  304. ___
  305. }
  # Epilogue: the success return, the abort paths that unwind the register
  # windows and return 0, and the .Lmsquare_$NUM entry that opens the two
  # windows the skipped bp[] load would have opened before issuing montsqr.
  306. $code.=<<___;
  307. mov 1,%i0 ! return success
  308. .Lmdone_$NUM:
  309. ret
  310. restore
  311. .Lmabort_$NUM:
  312. restore
  313. restore
  314. restore
  315. restore
  316. restore
  317. .Lmabort1_$NUM:
  318. restore
  319. mov 0,%i0 ! return failure
  320. ret
  321. restore
  322. .align 32
  323. .Lmsquare_$NUM:
  324. save %sp,-128,%sp; or $sentinel,%fp,%fp
  325. save %sp,-128,%sp; or $sentinel,%fp,%fp
  326. .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
  327. ba .Lmresume_$NUM
  328. nop
  329. .type bn_mul_mont_t4_$NUM, #function
  330. .size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
  331. ___
  332. }
  # Emit the single-op routines for operands of 8, 16, 24 and 32 64-bit
  # words (512 to 2048 bits).
  #
  # A lexical loop variable is used on purpose: the generator uses the
  # package global $i as its own loop index, so driving this loop with $i
  # only works for as long as the generator happens to leave $i == $NUM.
  # The leading '&' is required because the generator is declared with an
  # empty prototype.
  foreach my $num (8, 16, 24, 32) {
      &generate_bn_mul_mont_t4($num);
  }
  336. ########################################################################
  337. #
  # load_ccr($ptbl,$pwr,$ccr[,$skip_wr])
  #
  # Appends code that decomposes the index held in register $pwr:
  #   - ($pwr & 3) * 8 is added to register $ptbl (which is modified);
  #   - 1 << (($pwr >> 2) & 7) is left in register $ccr.
  # Unless $skip_wr is true, the value in $ccr is also written to %ccr, which
  # is what the conditional moves emitted by load_b / load_b_pair test.
  # The emitted code clobbers %o4 and %o5 ($ccr may itself be one of them,
  # since it is written last).
  338. sub load_ccr {
  339. my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
  340. $code.=<<___;
  341. srl $pwr, 2, %o4
  342. and $pwr, 3, %o5
  343. and %o4, 7, %o4
  344. sll %o5, 3, %o5 ! offset within first cache line
  345. add %o5, $ptbl, $ptbl ! of the pwrtbl
  346. or %g0, 1, %o5
  347. sll %o5, %o4, $ccr
  348. ___
  349. $code.=<<___ if (!$skip_wr);
  350. wr $ccr, %g0, %ccr
  351. ___
  352. }
  # load_b_pair($pwrtbl,$B0,$B1)
  #
  # Appends code that reads the sixteen 64-bit words at $pwrtbl+k*32,
  # k=0..15, and keeps one of words 0..7 in $B0 and one of words 8..15 in
  # $B1.  Words 0 and 8 are the defaults; the others replace them through
  # conditional moves that test, in this order, %icc overflow / equal /
  # negative and %xcc carry / overflow / equal / negative.  All sixteen
  # loads are always issued; only the register moves depend on the flags
  # (presumably to keep the memory access pattern independent of the
  # selected entry).  Finally $pwrtbl is advanced by 16*32 bytes.
  # The emitted code clobbers %o4 and %o5 and relies on %ccr having been set
  # up beforehand (see load_ccr).
  353. sub load_b_pair {
  354. my ($pwrtbl,$B0,$B1)=@_;
  355. $code.=<<___;
  356. ldx [$pwrtbl+0*32], $B0
  357. ldx [$pwrtbl+8*32], $B1
  358. ldx [$pwrtbl+1*32], %o4
  359. ldx [$pwrtbl+9*32], %o5
  360. movvs %icc, %o4, $B0
  361. ldx [$pwrtbl+2*32], %o4
  362. movvs %icc, %o5, $B1
  363. ldx [$pwrtbl+10*32],%o5
  364. move %icc, %o4, $B0
  365. ldx [$pwrtbl+3*32], %o4
  366. move %icc, %o5, $B1
  367. ldx [$pwrtbl+11*32],%o5
  368. movneg %icc, %o4, $B0
  369. ldx [$pwrtbl+4*32], %o4
  370. movneg %icc, %o5, $B1
  371. ldx [$pwrtbl+12*32],%o5
  372. movcs %xcc, %o4, $B0
  373. ldx [$pwrtbl+5*32],%o4
  374. movcs %xcc, %o5, $B1
  375. ldx [$pwrtbl+13*32],%o5
  376. movvs %xcc, %o4, $B0
  377. ldx [$pwrtbl+6*32], %o4
  378. movvs %xcc, %o5, $B1
  379. ldx [$pwrtbl+14*32],%o5
  380. move %xcc, %o4, $B0
  381. ldx [$pwrtbl+7*32], %o4
  382. move %xcc, %o5, $B1
  383. ldx [$pwrtbl+15*32],%o5
  384. movneg %xcc, %o4, $B0
  385. add $pwrtbl,16*32, $pwrtbl
  386. movneg %xcc, %o5, $B1
  387. ___
  388. }
  # load_b($pwrtbl,$Bi)
  #
  # Single-word variant of load_b_pair: appends code that reads the eight
  # 64-bit words at $pwrtbl+k*32, k=0..7, and keeps one of them in $Bi.
  # Word 0 is the default; words 1..7 replace it through conditional moves
  # testing %icc overflow / equal / negative and %xcc carry / overflow /
  # equal / negative, in that order.  $pwrtbl is then advanced by 8*32
  # bytes.  The emitted code clobbers %o4 and %o5 and relies on %ccr having
  # been set up beforehand (see load_ccr).  Only the first two arguments are
  # used; any further argument is ignored.
  389. sub load_b {
  390. my ($pwrtbl,$Bi)=@_;
  391. $code.=<<___;
  392. ldx [$pwrtbl+0*32], $Bi
  393. ldx [$pwrtbl+1*32], %o4
  394. ldx [$pwrtbl+2*32], %o5
  395. movvs %icc, %o4, $Bi
  396. ldx [$pwrtbl+3*32], %o4
  397. move %icc, %o5, $Bi
  398. ldx [$pwrtbl+4*32], %o5
  399. movneg %icc, %o4, $Bi
  400. ldx [$pwrtbl+5*32], %o4
  401. movcs %xcc, %o5, $Bi
  402. ldx [$pwrtbl+6*32], %o5
  403. movvs %xcc, %o4, $Bi
  404. ldx [$pwrtbl+7*32], %o4
  405. move %xcc, %o5, $Bi
  406. add $pwrtbl,8*32, $pwrtbl
  407. movneg %xcc, %o4, $Bi
  408. ___
  409. }
  410. ########################################################################
  411. # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
  412. # const u64 *pwrtbl,int pwr,int stride);
  413. #
  # generate_bn_pwr5_mont_t4($NUM) appends the assembly for
  # bn_pwr5_mont_t4_$NUM to $code.  $NUM is the operand length in 64-bit
  # words.
  #
  # The emitted routine loads tp[] and np[] with 64-bit loads, then runs the
  # .Lstride_$NUM loop: gather one power-table entry into the @B registers
  # (load_b_pair), issue five montsqr and one montmul as raw .word opcodes,
  # and repeat while the upper half of the packed $pwr register is still
  # non-negative.  That upper half starts out as the 6th argument and has 5
  # subtracted once before the loop and once per pass.  The table selector
  # passed to load_ccr is the lower half (5th argument) shifted right by the
  # current upper half.  The result is written back to tp[] from @R and the
  # routine returns 1 on success, 0 on failure (same abort conditions as
  # bn_mul_mont_t4_$NUM).
  #
  # NOTE(review): declared with an empty prototype although it takes one
  # argument (callers use the &name(...) form), and it uses the package
  # global $i as its loop index, leaving it equal to $NUM on return.
  414. sub generate_bn_pwr5_mont_t4() {
  415. my $NUM=shift;
  # Arguments are kept in %g1..%g5 so that they stay addressable across the
  # save/restore sequences below.
  416. my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
  417. $code.=<<___;
  418. .globl bn_pwr5_mont_t4_$NUM
  419. .align 32
  420. bn_pwr5_mont_t4_$NUM:
  421. #ifdef __arch64__
  422. mov 0,$sentinel
  423. mov -128,%g4
  424. #elif defined(SPARCV9_64BIT_STACK)
  425. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  426. ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
  427. mov -2047,%g4
  428. and %g1,SPARCV9_64BIT_STACK,%g1
  429. movrz %g1,0,%g4
  430. mov -1,$sentinel
  431. add %g4,-128,%g4
  432. #else
  433. mov -1,$sentinel
  434. mov -128,%g4
  435. #endif
  436. sllx $sentinel,32,$sentinel
  437. save %sp,%g4,%sp
  438. #ifndef __arch64__
  439. save %sp,-128,%sp ! warm it up
  440. save %sp,-128,%sp
  441. save %sp,-128,%sp
  442. save %sp,-128,%sp
  443. save %sp,-128,%sp
  444. save %sp,-128,%sp
  445. restore
  446. restore
  447. restore
  448. restore
  449. restore
  450. restore
  451. #endif
  452. and %sp,1,%g4
  453. or $sentinel,%fp,%fp
  454. or %g4,$sentinel,$sentinel
  455. ! copy arguments to global registers
  456. mov %i0,$tp
  457. mov %i1,$np
  458. ld [%i2+0],%f1 ! load *n0
  459. ld [%i2+4],%f0
  460. mov %i3,$pwrtbl
  461. srl %i4,%g0,%i4 ! pack last arguments
  462. sllx %i5,32,$pwr
  463. or %i4,$pwr,$pwr
  464. fsrc2 %f0,%f60
  465. ___
  466. # load tp[$NUM] ########################################################
  467. $code.=<<___;
  468. save %sp,-128,%sp; or $sentinel,%fp,%fp
  469. ___
  # Words 0..13 of tp[] go to the integer registers @A[0..13] (ldx), any
  # further words to the FP registers @A[14..] (ldd).
  470. for($i=0; $i<14 && $i<$NUM; $i++) {
  471. $code.=<<___;
  472. ldx [$tp+$i*8],@A[$i]
  473. ___
  474. }
  475. for(; $i<$NUM; $i++) {
  476. $code.=<<___;
  477. ldd [$tp+$i*8],@A[$i]
  478. ___
  479. }
  480. # load np[$NUM] ########################################################
  481. $code.=<<___;
  482. save %sp,-128,%sp; or $sentinel,%fp,%fp
  483. ___
  # np[] is loaded in up to three groups (words 0..13, 14..27, 28..), each
  # preceded by its own `save`; all three `save`s are emitted even when
  # $NUM leaves the later groups empty.
  484. for($i=0; $i<14 && $i<$NUM; $i++) {
  485. $code.=<<___;
  486. ldx [$np+$i*8],@N[$i]
  487. ___
  488. }
  489. $code.=<<___;
  490. save %sp,-128,%sp; or $sentinel,%fp,%fp
  491. ___
  492. for(; $i<28 && $i<$NUM; $i++) {
  493. $code.=<<___;
  494. ldx [$np+$i*8],@N[$i]
  495. ___
  496. }
  497. $code.=<<___;
  498. save %sp,-128,%sp; or $sentinel,%fp,%fp
  499. ___
  500. for(; $i<$NUM; $i++) {
  501. $code.=<<___;
  502. ldx [$np+$i*8],@N[$i]
  503. ___
  504. }
  505. # load pwrtbl[pwr] ########################################################
  # First selector: unpack $pwr, subtract 5 from its upper half, re-pack it,
  # and shift the lower half right by the new upper half.  %o7 receives a
  # working copy of the table pointer that load_ccr / load_b_pair advance.
  506. $code.=<<___;
  507. save %sp,-128,%sp; or $sentinel,%fp,%fp
  508. srlx $pwr, 32, %o4 ! unpack $pwr
  509. srl $pwr, %g0, %o5
  510. sub %o4, 5, %o4
  511. mov $pwrtbl, %o7
  512. sllx %o4, 32, $pwr ! re-pack $pwr
  513. or %o5, $pwr, $pwr
  514. srl %o5, %o4, %o5
  515. ___
  516. &load_ccr("%o7","%o5","%o4");
  517. $code.=<<___;
  518. b .Lstride_$NUM
  519. nop
  520. .align 16
  521. .Lstride_$NUM:
  522. ___
  # Gather the selected table entry two words at a time: words 0..13 through
  # %o7, then -- after one more `save`, which makes the same register
  # visible as %i7 -- the remaining words through %i7.
  523. for($i=0; $i<14 && $i<$NUM; $i+=2) {
  524. &load_b_pair("%o7",@B[$i],@B[$i+1]);
  525. }
  526. $code.=<<___;
  527. save %sp,-128,%sp; or $sentinel,%fp,%fp
  528. ___
  529. for(; $i<$NUM; $i+=2) {
  530. &load_b_pair("%i7",@B[$i],@B[$i+1]);
  531. }
  # Prepare the selector for the next pass.  load_ccr is called with
  # $skip_wr set, so the new value is only left in %o4; it is written to
  # %ccr further down, after the five squarings.
  532. $code.=<<___;
  533. srax $pwr, 32, %o4 ! unpack $pwr
  534. srl $pwr, %g0, %o5
  535. sub %o4, 5, %o4
  536. mov $pwrtbl, %i7
  537. sllx %o4, 32, $pwr ! re-pack $pwr
  538. or %o5, $pwr, $pwr
  539. srl %o5, %o4, %o5
  540. ___
  541. &load_ccr("%i7","%o5","%o4",1);
  542. # magic ################################################################
  # Five montsqr instructions, each followed by the abort checks.
  543. for($i=0; $i<5; $i++) {
  544. $code.=<<___;
  545. .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
  546. fbu,pn %fcc3,.Labort_$NUM
  547. #ifndef __arch64__
  548. and %fp,$sentinel,$sentinel
  549. brz,pn $sentinel,.Labort_$NUM
  550. #endif
  551. nop
  552. ___
  553. }
  # Install the pre-computed selector, multiply by the gathered entry, then
  # loop back to .Lstride_$NUM while the upper half of $pwr is >= 0.
  # Otherwise unwind the five register windows opened for np[] and the
  # table entry.
  554. $code.=<<___;
  555. wr %o4, %g0, %ccr
  556. .word 0x81b02920+$NUM-1 ! montmul $NUM-1
  557. fbu,pn %fcc3,.Labort_$NUM
  558. #ifndef __arch64__
  559. and %fp,$sentinel,$sentinel
  560. brz,pn $sentinel,.Labort_$NUM
  561. #endif
  562. srax $pwr, 32, %o4
  563. #ifdef __arch64__
  564. brgez %o4,.Lstride_$NUM
  565. restore
  566. restore
  567. restore
  568. restore
  569. restore
  570. #else
  571. brgez %o4,.Lstride_$NUM
  572. restore; and %fp,$sentinel,$sentinel
  573. restore; and %fp,$sentinel,$sentinel
  574. restore; and %fp,$sentinel,$sentinel
  575. restore; and %fp,$sentinel,$sentinel
  576. brz,pn $sentinel,.Labort1_$NUM
  577. restore
  578. #endif
  579. ___
  580. # save tp[$NUM] ########################################################
  # Move the first (up to) 14 result words from the integer registers
  # @A[0..13] into the FP registers @R[0..13] before the window holding
  # them is released.
  581. for($i=0; $i<14 && $i<$NUM; $i++) {
  582. $code.=<<___;
  583. movxtod @A[$i],@R[$i]
  584. ___
  585. }
  586. $code.=<<___;
  587. #ifdef __arch64__
  588. restore
  589. #else
  590. and %fp,$sentinel,$sentinel
  591. restore
  592. and $sentinel,1,%o7
  593. and %fp,$sentinel,$sentinel
  594. srl %fp,0,%fp ! just in case?
  595. or %o7,$sentinel,$sentinel
  596. brz,a,pn $sentinel,.Ldone_$NUM
  597. mov 0,%i0 ! return failure
  598. #endif
  599. ___
  # All result words are written back to tp[] as 64-bit stores from @R.
  600. for($i=0; $i<$NUM; $i++) {
  601. $code.=<<___;
  602. std @R[$i],[$tp+$i*8]
  603. ___
  604. }
  # Epilogue: the success return and the abort paths that unwind the
  # register windows and return 0.
  605. $code.=<<___;
  606. mov 1,%i0 ! return success
  607. .Ldone_$NUM:
  608. ret
  609. restore
  610. .Labort_$NUM:
  611. restore
  612. restore
  613. restore
  614. restore
  615. restore
  616. .Labort1_$NUM:
  617. restore
  618. mov 0,%i0 ! return failure
  619. ret
  620. restore
  621. .type bn_pwr5_mont_t4_$NUM, #function
  622. .size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
  623. ___
  624. }
  # Emit the multi-op routines for operands of 8, 16, 24 and 32 64-bit
  # words (512 to 2048 bits).
  #
  # A lexical loop variable is used on purpose: the generator uses the
  # package global $i as its own loop index, so driving this loop with $i
  # only works for as long as the generator happens to leave $i == $NUM.
  # The leading '&' is required because the generator is declared with an
  # empty prototype.
  foreach my $num (8, 16, 24, 32) {
      &generate_bn_pwr5_mont_t4($num);
  }
  628. {
  629. ########################################################################
  630. # Fall-back subroutines
  631. #
  632. # copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
  633. #
  # Working registers shared by both routines in this block:
  #   $n0=%g1 $m0=%g2 $m1=%g3 $lo0=%g4 $hi0=%g5
  #   $lo1=%o0 $hi1=%o1 $aj=%o2 $alo=%o3 $nj=%o4 $nlo=%o5 $tj=%o7
  # These are package globals (no `my`), so they stay set after the block.
  634. ($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
  635. (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
  # Argument registers as seen on entry, i.e. before the `save` below.
  636. # int bn_mul_mont(
  637. $rp="%o0"; # u64 *rp,
  638. $ap="%o1"; # const u64 *ap,
  639. $bp="%o2"; # const u64 *bp,
  640. $np="%o3"; # const u64 *np,
  641. $n0p="%o4"; # const BN_ULONG *n0,
  642. $num="%o5"; # int num); # caller ensures that num is >=3
  # Entry: convert num to a byte count, round it up to a multiple of 64,
  # carve a 64-byte-aligned buffer of that size plus STACK_FRAME out of the
  # stack, and open a new register window with the resulting frame size.
  643. $code.=<<___;
  644. .globl bn_mul_mont_t4
  645. .align 32
  646. bn_mul_mont_t4:
  647. add %sp, STACK_BIAS, %g4 ! real top of stack
  648. sll $num, 3, $num ! size in bytes
  649. add $num, 63, %g1
  650. andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
  651. sub %g4, %g1, %g1
  652. andn %g1, 63, %g1 ! align at 64 byte
  653. sub %g1, STACK_FRAME, %g1 ! new top of stack
  654. sub %g1, %g4, %g1
  655. save %sp, %g1, %sp
  656. ___
  657. # +-------------------------------+<----- %sp
  658. # . .
  659. # +-------------------------------+<----- aligned at 64 bytes
  660. # | __int64 tmp[0] |
  661. # +-------------------------------+
  662. # . .
  663. # . .
  664. # +-------------------------------+<----- aligned at 64 bytes
  665. # . .
  # After the `save` the arguments are in %i0..%i5; $num now holds the size
  # in bytes.  Seven names are taken from the eight locals %l0..%l7 (%l7 is
  # left unnamed) and $bufsz is not referenced by the code in this block.
  # $ovf and $i alias $t0 and $t1 -- from here on $i is a register name,
  # not a Perl loop counter.
  666. ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
  667. ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
  668. ($ovf,$i)=($t0,$t1);
  # Body of bn_mul_mont_t4: first pass for bp[0] (.L1st), outer loop over
  # the remaining bp[i] with its inner loop (.Louter/.Linner), then the
  # subtraction of np[] (.Lsub) and the copy to rp[] that also zeroes the
  # temporary buffer (.Lcopy).  The routine returns 1.
  669. $code.=<<___;
  670. ld [$n0p+0], $t0 ! pull n0[0..1] value
  671. ld [$n0p+4], $t1
  672. add %sp, STACK_BIAS+STACK_FRAME, $tp
  673. ldx [$bp+0], $m0 ! m0=bp[0]
  674. sllx $t1, 32, $n0
  675. add $bp, 8, $bp
  676. or $t0, $n0, $n0
  677. ldx [$ap+0], $aj ! ap[0]
  678. mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
  679. umulxhi $aj, $m0, $hi0
  680. ldx [$ap+8], $aj ! ap[1]
  681. add $ap, 16, $ap
  682. ldx [$np+0], $nj ! np[0]
  683. mulx $lo0, $n0, $m1 ! "tp[0]"*n0
  684. mulx $aj, $m0, $alo ! ap[1]*bp[0]
  685. umulxhi $aj, $m0, $aj ! ahi=aj
  686. mulx $nj, $m1, $lo1 ! np[0]*m1
  687. umulxhi $nj, $m1, $hi1
  688. ldx [$np+8], $nj ! np[1]
  689. addcc $lo0, $lo1, $lo1
  690. add $np, 16, $np
  691. addxc %g0, $hi1, $hi1
  692. mulx $nj, $m1, $nlo ! np[1]*m1
  693. umulxhi $nj, $m1, $nj ! nhi=nj
  694. ba .L1st
  695. sub $num, 24, $cnt ! cnt=num-3
  696. .align 16
  697. .L1st:
  698. addcc $alo, $hi0, $lo0
  699. addxc $aj, %g0, $hi0
  700. ldx [$ap+0], $aj ! ap[j]
  701. addcc $nlo, $hi1, $lo1
  702. add $ap, 8, $ap
  703. addxc $nj, %g0, $hi1 ! nhi=nj
  704. ldx [$np+0], $nj ! np[j]
  705. mulx $aj, $m0, $alo ! ap[j]*bp[0]
  706. add $np, 8, $np
  707. umulxhi $aj, $m0, $aj ! ahi=aj
  708. mulx $nj, $m1, $nlo ! np[j]*m1
  709. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  710. umulxhi $nj, $m1, $nj ! nhi=nj
  711. addxc %g0, $hi1, $hi1
  712. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  713. add $tp, 8, $tp ! tp++
  714. brnz,pt $cnt, .L1st
  715. sub $cnt, 8, $cnt ! j--
  716. !.L1st
  717. addcc $alo, $hi0, $lo0
  718. addxc $aj, %g0, $hi0 ! ahi=aj
  719. addcc $nlo, $hi1, $lo1
  720. addxc $nj, %g0, $hi1
  721. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  722. addxc %g0, $hi1, $hi1
  723. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  724. add $tp, 8, $tp
  725. addcc $hi0, $hi1, $hi1
  726. addxc %g0, %g0, $ovf ! upmost overflow bit
  727. stxa $hi1, [$tp]0xe2
  728. add $tp, 8, $tp
  729. ba .Louter
  730. sub $num, 16, $i ! i=num-2
  731. .align 16
  732. .Louter:
  733. ldx [$bp+0], $m0 ! m0=bp[i]
  734. add $bp, 8, $bp
  735. sub $ap, $num, $ap ! rewind
  736. sub $np, $num, $np
  737. sub $tp, $num, $tp
  738. ldx [$ap+0], $aj ! ap[0]
  739. ldx [$np+0], $nj ! np[0]
  740. mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
  741. ldx [$tp], $tj ! tp[0]
  742. umulxhi $aj, $m0, $hi0
  743. ldx [$ap+8], $aj ! ap[1]
  744. addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
  745. mulx $aj, $m0, $alo ! ap[1]*bp[i]
  746. addxc %g0, $hi0, $hi0
  747. mulx $lo0, $n0, $m1 ! tp[0]*n0
  748. umulxhi $aj, $m0, $aj ! ahi=aj
  749. mulx $nj, $m1, $lo1 ! np[0]*m1
  750. add $ap, 16, $ap
  751. umulxhi $nj, $m1, $hi1
  752. ldx [$np+8], $nj ! np[1]
  753. add $np, 16, $np
  754. addcc $lo1, $lo0, $lo1
  755. mulx $nj, $m1, $nlo ! np[1]*m1
  756. addxc %g0, $hi1, $hi1
  757. umulxhi $nj, $m1, $nj ! nhi=nj
  758. ba .Linner
  759. sub $num, 24, $cnt ! cnt=num-3
  760. .align 16
  761. .Linner:
  762. addcc $alo, $hi0, $lo0
  763. ldx [$tp+8], $tj ! tp[j]
  764. addxc $aj, %g0, $hi0 ! ahi=aj
  765. ldx [$ap+0], $aj ! ap[j]
  766. add $ap, 8, $ap
  767. addcc $nlo, $hi1, $lo1
  768. mulx $aj, $m0, $alo ! ap[j]*bp[i]
  769. addxc $nj, %g0, $hi1 ! nhi=nj
  770. ldx [$np+0], $nj ! np[j]
  771. add $np, 8, $np
  772. umulxhi $aj, $m0, $aj ! ahi=aj
  773. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  774. mulx $nj, $m1, $nlo ! np[j]*m1
  775. addxc %g0, $hi0, $hi0
  776. umulxhi $nj, $m1, $nj ! nhi=nj
  777. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  778. addxc %g0, $hi1, $hi1
  779. stx $lo1, [$tp] ! tp[j-1]
  780. add $tp, 8, $tp
  781. brnz,pt $cnt, .Linner
  782. sub $cnt, 8, $cnt
  783. !.Linner
  784. ldx [$tp+8], $tj ! tp[j]
  785. addcc $alo, $hi0, $lo0
  786. addxc $aj, %g0, $hi0 ! ahi=aj
  787. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  788. addxc %g0, $hi0, $hi0
  789. addcc $nlo, $hi1, $lo1
  790. addxc $nj, %g0, $hi1 ! nhi=nj
  791. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  792. addxc %g0, $hi1, $hi1
  793. stx $lo1, [$tp] ! tp[j-1]
  794. subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
  795. addxccc $hi1, $hi0, $hi1
  796. addxc %g0, %g0, $ovf
  797. stx $hi1, [$tp+8]
  798. add $tp, 16, $tp
  799. brnz,pt $i, .Louter
  800. sub $i, 8, $i
  801. sub $ap, $num, $ap ! rewind
  802. sub $np, $num, $np
  803. sub $tp, $num, $tp
  804. ba .Lsub
  805. subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
  806. .align 16
  807. .Lsub:
  808. ldx [$tp], $tj
  809. add $tp, 8, $tp
  810. ldx [$np+0], $nj
  811. add $np, 8, $np
  812. subccc $tj, $nj, $t2 ! tp[j]-np[j]
  813. srlx $tj, 32, $tj
  814. srlx $nj, 32, $nj
  815. subccc $tj, $nj, $t3
  816. add $rp, 8, $rp
  817. st $t2, [$rp-4] ! reverse order
  818. st $t3, [$rp-8]
  819. brnz,pt $cnt, .Lsub
  820. sub $cnt, 8, $cnt
  821. sub $np, $num, $np ! rewind
  822. sub $tp, $num, $tp
  823. sub $rp, $num, $rp
  824. subc $ovf, %g0, $ovf ! handle upmost overflow bit
  825. and $tp, $ovf, $ap
  826. andn $rp, $ovf, $np
  827. or $np, $ap, $ap ! ap=borrow?tp:rp
  828. ba .Lcopy
  829. sub $num, 8, $cnt
  830. .align 16
  831. .Lcopy: ! copy or in-place refresh
  832. ldx [$ap+0], $t2
  833. add $ap, 8, $ap
  834. stx %g0, [$tp] ! zap
  835. add $tp, 8, $tp
  836. stx $t2, [$rp+0]
  837. add $rp, 8, $rp
  838. brnz $cnt, .Lcopy
  839. sub $cnt, 8, $cnt
  840. mov 1, %o0
  841. ret
  842. restore
  843. .type bn_mul_mont_t4, #function
  844. .size bn_mul_mont_t4, .-bn_mul_mont_t4
  845. ___
  # Second routine: same algorithm, but bp[i] is gathered from a power table
  # instead of being read linearly.  The names are reset to the incoming
  # %o registers for the code that runs before the `save`.
  846. # int bn_mul_mont_gather5(
  847. $rp="%o0"; # u64 *rp,
  848. $ap="%o1"; # const u64 *ap,
  849. $bp="%o2"; # const u64 *pwrtbl,
  850. $np="%o3"; # const u64 *np,
  851. $n0p="%o4"; # const BN_ULONG *n0,
  852. $num="%o5"; # int num, # caller ensures that num is >=3
  853. # int power);
  # Entry: same stack set-up as bn_mul_mont_t4, plus a load of the 7th
  # argument (power) from the caller's frame into %g4 before the `save`.
  854. $code.=<<___;
  855. .globl bn_mul_mont_gather5_t4
  856. .align 32
  857. bn_mul_mont_gather5_t4:
  858. add %sp, STACK_BIAS, %g4 ! real top of stack
  859. sll $num, 3, $num ! size in bytes
  860. add $num, 63, %g1
  861. andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
  862. sub %g4, %g1, %g1
  863. andn %g1, 63, %g1 ! align at 64 byte
  864. sub %g1, STACK_FRAME, %g1 ! new top of stack
  865. sub %g1, %g4, %g1
  866. LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
  867. save %sp, %g1, %sp
  868. ___
  869. # +-------------------------------+<----- %sp
  870. # . .
  871. # +-------------------------------+<----- aligned at 64 bytes
  872. # | __int64 tmp[0] |
  873. # +-------------------------------+
  874. # . .
  875. # . .
  876. # +-------------------------------+<----- aligned at 64 bytes
  877. # . .
  # After the `save`: arguments in %i0..%i5, locals %l0..%l7 with %l7 named
  # $ccr this time.  $ccr keeps the selector computed by load_ccr so that
  # .Louter_g5 can re-install it in %ccr on every iteration.
  878. ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
  879. ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
  880. ($ovf,$i)=($t0,$t1);
  # Select the table entry from the power in %g4 and gather bp[0] into $m0.
  # NOTE(review): load_b takes two arguments; the third one passed here
  # ("%o7") is ignored.
  881. &load_ccr($bp,"%g4",$ccr);
  882. &load_b($bp,$m0,"%o7"); # m0=bp[0]
  883. $code.=<<___;
  884. ld [$n0p+0], $t0 ! pull n0[0..1] value
  885. ld [$n0p+4], $t1
  886. add %sp, STACK_BIAS+STACK_FRAME, $tp
  887. sllx $t1, 32, $n0
  888. or $t0, $n0, $n0
  889. ldx [$ap+0], $aj ! ap[0]
  890. mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
  891. umulxhi $aj, $m0, $hi0
  892. ldx [$ap+8], $aj ! ap[1]
  893. add $ap, 16, $ap
  894. ldx [$np+0], $nj ! np[0]
  895. mulx $lo0, $n0, $m1 ! "tp[0]"*n0
  896. mulx $aj, $m0, $alo ! ap[1]*bp[0]
  897. umulxhi $aj, $m0, $aj ! ahi=aj
  898. mulx $nj, $m1, $lo1 ! np[0]*m1
  899. umulxhi $nj, $m1, $hi1
  900. ldx [$np+8], $nj ! np[1]
  901. addcc $lo0, $lo1, $lo1
  902. add $np, 16, $np
  903. addxc %g0, $hi1, $hi1
  904. mulx $nj, $m1, $nlo ! np[1]*m1
  905. umulxhi $nj, $m1, $nj ! nhi=nj
  906. ba .L1st_g5
  907. sub $num, 24, $cnt ! cnt=num-3
  908. .align 16
  909. .L1st_g5:
  910. addcc $alo, $hi0, $lo0
  911. addxc $aj, %g0, $hi0
  912. ldx [$ap+0], $aj ! ap[j]
  913. addcc $nlo, $hi1, $lo1
  914. add $ap, 8, $ap
  915. addxc $nj, %g0, $hi1 ! nhi=nj
  916. ldx [$np+0], $nj ! np[j]
  917. mulx $aj, $m0, $alo ! ap[j]*bp[0]
  918. add $np, 8, $np
  919. umulxhi $aj, $m0, $aj ! ahi=aj
  920. mulx $nj, $m1, $nlo ! np[j]*m1
  921. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  922. umulxhi $nj, $m1, $nj ! nhi=nj
  923. addxc %g0, $hi1, $hi1
  924. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  925. add $tp, 8, $tp ! tp++
  926. brnz,pt $cnt, .L1st_g5
  927. sub $cnt, 8, $cnt ! j--
  928. !.L1st_g5
  929. addcc $alo, $hi0, $lo0
  930. addxc $aj, %g0, $hi0 ! ahi=aj
  931. addcc $nlo, $hi1, $lo1
  932. addxc $nj, %g0, $hi1
  933. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  934. addxc %g0, $hi1, $hi1
  935. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  936. add $tp, 8, $tp
  937. addcc $hi0, $hi1, $hi1
  938. addxc %g0, %g0, $ovf ! upmost overflow bit
  939. stxa $hi1, [$tp]0xe2
  940. add $tp, 8, $tp
  941. ba .Louter_g5
  942. sub $num, 16, $i ! i=num-2
  943. .align 16
  944. .Louter_g5:
  945. wr $ccr, %g0, %ccr
  946. ___
  # Gather the next bp[i]; load_b also advances $bp to the following group
  # of table entries.
  947. &load_b($bp,$m0); # m0=bp[i]
  948. $code.=<<___;
  949. sub $ap, $num, $ap ! rewind
  950. sub $np, $num, $np
  951. sub $tp, $num, $tp
  952. ldx [$ap+0], $aj ! ap[0]
  953. ldx [$np+0], $nj ! np[0]
  954. mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
  955. ldx [$tp], $tj ! tp[0]
  956. umulxhi $aj, $m0, $hi0
  957. ldx [$ap+8], $aj ! ap[1]
  958. addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
  959. mulx $aj, $m0, $alo ! ap[1]*bp[i]
  960. addxc %g0, $hi0, $hi0
  961. mulx $lo0, $n0, $m1 ! tp[0]*n0
  962. umulxhi $aj, $m0, $aj ! ahi=aj
  963. mulx $nj, $m1, $lo1 ! np[0]*m1
  964. add $ap, 16, $ap
  965. umulxhi $nj, $m1, $hi1
  966. ldx [$np+8], $nj ! np[1]
  967. add $np, 16, $np
  968. addcc $lo1, $lo0, $lo1
  969. mulx $nj, $m1, $nlo ! np[1]*m1
  970. addxc %g0, $hi1, $hi1
  971. umulxhi $nj, $m1, $nj ! nhi=nj
  972. ba .Linner_g5
  973. sub $num, 24, $cnt ! cnt=num-3
  974. .align 16
  975. .Linner_g5:
  976. addcc $alo, $hi0, $lo0
  977. ldx [$tp+8], $tj ! tp[j]
  978. addxc $aj, %g0, $hi0 ! ahi=aj
  979. ldx [$ap+0], $aj ! ap[j]
  980. add $ap, 8, $ap
  981. addcc $nlo, $hi1, $lo1
  982. mulx $aj, $m0, $alo ! ap[j]*bp[i]
  983. addxc $nj, %g0, $hi1 ! nhi=nj
  984. ldx [$np+0], $nj ! np[j]
  985. add $np, 8, $np
  986. umulxhi $aj, $m0, $aj ! ahi=aj
  987. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  988. mulx $nj, $m1, $nlo ! np[j]*m1
  989. addxc %g0, $hi0, $hi0
  990. umulxhi $nj, $m1, $nj ! nhi=nj
  991. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  992. addxc %g0, $hi1, $hi1
  993. stx $lo1, [$tp] ! tp[j-1]
  994. add $tp, 8, $tp
  995. brnz,pt $cnt, .Linner_g5
  996. sub $cnt, 8, $cnt
  997. !.Linner_g5
  998. ldx [$tp+8], $tj ! tp[j]
  999. addcc $alo, $hi0, $lo0
  1000. addxc $aj, %g0, $hi0 ! ahi=aj
  1001. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  1002. addxc %g0, $hi0, $hi0
  1003. addcc $nlo, $hi1, $lo1
  1004. addxc $nj, %g0, $hi1 ! nhi=nj
  1005. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  1006. addxc %g0, $hi1, $hi1
  1007. stx $lo1, [$tp] ! tp[j-1]
  1008. subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
  1009. addxccc $hi1, $hi0, $hi1
  1010. addxc %g0, %g0, $ovf
  1011. stx $hi1, [$tp+8]
  1012. add $tp, 16, $tp
  1013. brnz,pt $i, .Louter_g5
  1014. sub $i, 8, $i
  1015. sub $ap, $num, $ap ! rewind
  1016. sub $np, $num, $np
  1017. sub $tp, $num, $tp
  1018. ba .Lsub_g5
  1019. subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
  1020. .align 16
  1021. .Lsub_g5:
  1022. ldx [$tp], $tj
  1023. add $tp, 8, $tp
  1024. ldx [$np+0], $nj
  1025. add $np, 8, $np
  1026. subccc $tj, $nj, $t2 ! tp[j]-np[j]
  1027. srlx $tj, 32, $tj
  1028. srlx $nj, 32, $nj
  1029. subccc $tj, $nj, $t3
  1030. add $rp, 8, $rp
  1031. st $t2, [$rp-4] ! reverse order
  1032. st $t3, [$rp-8]
  1033. brnz,pt $cnt, .Lsub_g5
  1034. sub $cnt, 8, $cnt
  1035. sub $np, $num, $np ! rewind
  1036. sub $tp, $num, $tp
  1037. sub $rp, $num, $rp
  1038. subc $ovf, %g0, $ovf ! handle upmost overflow bit
  1039. and $tp, $ovf, $ap
  1040. andn $rp, $ovf, $np
  1041. or $np, $ap, $ap ! ap=borrow?tp:rp
  1042. ba .Lcopy_g5
  1043. sub $num, 8, $cnt
  1044. .align 16
  1045. .Lcopy_g5: ! copy or in-place refresh
  1046. ldx [$ap+0], $t2
  1047. add $ap, 8, $ap
  1048. stx %g0, [$tp] ! zap
  1049. add $tp, 8, $tp
  1050. stx $t2, [$rp+0]
  1051. add $rp, 8, $rp
  1052. brnz $cnt, .Lcopy_g5
  1053. sub $cnt, 8, $cnt
  1054. mov 1, %o0
  1055. ret
  1056. restore
  1057. .type bn_mul_mont_gather5_t4, #function
  1058. .size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
  1059. ___
  1060. }
  # Three leaf helpers, described by the registers they use:
  #
  # bn_flip_t4: for %o2 iterations, read two 32-bit words from [%o1] and
  #   write them to [%o0] in swapped order, advancing both pointers by 8.
  #
  # bn_flip_n_scatter5_t4: %o3*8 is added to the table pointer %o2 and the
  #   count %o1 is halved; then, per iteration, two 32-bit words are read
  #   from [%o0] and combined into one 64-bit value (the word at offset +4
  #   becomes the upper half), which is stored at [%o2]; %o2 then advances
  #   by 32*8 bytes.
  #
  # bn_gather5_t4: load_ccr sets up the selector from %o3 and adjusts the
  #   table pointer %o2 (using %g1 as scratch); then, for %o1 iterations,
  #   load_b gathers one 64-bit word into %g1, which is stored at [%o0].
  #
  # All three loops test the counter before decrementing it in the delay
  # slot, so a count of zero is not handled as "do nothing".
  1061. $code.=<<___;
  1062. .globl bn_flip_t4
  1063. .align 32
  1064. bn_flip_t4:
  1065. .Loop_flip:
  1066. ld [%o1+0], %o4
  1067. sub %o2, 1, %o2
  1068. ld [%o1+4], %o5
  1069. add %o1, 8, %o1
  1070. st %o5, [%o0+0]
  1071. st %o4, [%o0+4]
  1072. brnz %o2, .Loop_flip
  1073. add %o0, 8, %o0
  1074. retl
  1075. nop
  1076. .type bn_flip_t4, #function
  1077. .size bn_flip_t4, .-bn_flip_t4
  1078. .globl bn_flip_n_scatter5_t4
  1079. .align 32
  1080. bn_flip_n_scatter5_t4:
  1081. sll %o3, 3, %o3
  1082. srl %o1, 1, %o1
  1083. add %o3, %o2, %o2 ! &pwrtbl[pwr]
  1084. sub %o1, 1, %o1
  1085. .Loop_flip_n_scatter5:
  1086. ld [%o0+0], %o4 ! inp[i]
  1087. ld [%o0+4], %o5
  1088. add %o0, 8, %o0
  1089. sllx %o5, 32, %o5
  1090. or %o4, %o5, %o5
  1091. stx %o5, [%o2]
  1092. add %o2, 32*8, %o2
  1093. brnz %o1, .Loop_flip_n_scatter5
  1094. sub %o1, 1, %o1
  1095. retl
  1096. nop
  1097. .type bn_flip_n_scatter5_t4, #function
  1098. .size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
  1099. .globl bn_gather5_t4
  1100. .align 32
  1101. bn_gather5_t4:
  1102. ___
  # Selector from %o3; %g1 only serves as the scratch register for the
  # value written to %ccr.
  1103. &load_ccr("%o2","%o3","%g1");
  1104. $code.=<<___;
  1105. sub %o1, 1, %o1
  1106. .Loop_gather5:
  1107. ___
  # One gathered word per iteration, delivered in %g1.
  1108. &load_b("%o2","%g1");
  1109. $code.=<<___;
  1110. stx %g1, [%o0]
  1111. add %o0, 8, %o0
  1112. brnz %o1, .Loop_gather5
  1113. sub %o1, 1, %o1
  1114. retl
  1115. nop
  1116. .type bn_gather5_t4, #function
  1117. .size bn_gather5_t4, .-bn_gather5_t4
  1118. .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
  1119. .align 4
  1120. ___
  # Hand the accumulated $code over for output.  emit_assembler is not
  # defined in this file; presumably it is provided by the sparcv9_modes.pl
  # helper required at the top -- confirm there.
  &emit_assembler();

  # Output is buffered, so a failed write (for instance a full disk) may
  # only surface when the handle is closed.  Fail loudly instead of leaving
  # a truncated assembly file behind with a zero exit status.
  close STDOUT or die "error closing STDOUT: $!";