2
0

sparct4-mont.pl 27 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by David S. Miller and Andy Polyakov
  10. # The module is licensed under 2-clause BSD license.
  11. # November 2012. All rights reserved.
  12. # ====================================================================
  13. ######################################################################
  14. # Montgomery squaring-n-multiplication module for SPARC T4.
  15. #
  16. # The module consists of three parts:
  17. #
  18. # 1) collection of "single-op" subroutines that perform single
  19. # operation, Montgomery squaring or multiplication, on 512-,
  20. # 1024-, 1536- and 2048-bit operands;
  21. # 2) collection of "multi-op" subroutines that perform 5 squaring and
  22. # 1 multiplication operations on operands of above lengths;
  23. # 3) fall-back and helper VIS3 subroutines.
  24. #
  25. # RSA sign is dominated by multi-op subroutine, while RSA verify and
  26. # DSA - by single-op. Special note about 4096-bit RSA verify result.
  27. # Operands are too long for dedicated hardware and it's handled by
  28. # VIS3 code, which is why you don't see any improvement. It's surely
  29. # possible to improve it [by deploying 'mpmul' instruction], maybe in
  30. # the future...
  31. #
  32. # Performance improvement.
  33. #
# 64-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
#
# 64-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
  49. #
  50. ######################################################################
# 32-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
#
# 32-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
  66. #
  67. # 32-bit code is prone to performance degradation as interrupt rate
  68. # dispatched to CPU executing the code grows. This is because in
  69. # standard process of handling interrupt in 32-bit process context
  70. # upper halves of most integer registers used as input or output are
  71. # zeroed. This renders result invalid, and operation has to be re-run.
  72. # If CPU is "bothered" with timer interrupts only, the penalty is
  73. # hardly measurable. But in order to mitigate this problem for higher
  74. # interrupt rates contemporary Linux kernel recognizes biased stack
  75. # even in 32-bit process context and preserves full register contents.
  76. # See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
  77. # for details.
  78. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  79. push(@INC,"${dir}","${dir}../../perlasm");
  80. require "sparcv9_modes.pl";
  81. $output = pop and open STDOUT,">$output";
  82. $code.=<<___;
  83. #ifndef __ASSEMBLER__
  84. # define __ASSEMBLER__ 1
  85. #endif
  86. #include "crypto/sparc_arch.h"
  87. #ifdef __arch64__
  88. .register %g2,#scratch
  89. .register %g3,#scratch
  90. #endif
  91. .section ".text",#alloc,#execinstr
  92. #ifdef __PIC__
  93. SPARC_PIC_THUNK(%g1)
  94. #endif
  95. ___
  96. ########################################################################
  97. # Register layout for mont[mul|sqr] instructions.
  98. # For details see "Oracle SPARC Architecture 2011" manual at
  99. # http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
  100. #
  101. my @R=map("%f".2*$_,(0..11,30,31,12..29));
  102. my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
  103. my @A=(@N[0..13],@R[14..31]);
  104. my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
  105. ########################################################################
  106. # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
  107. # const u64 *np,const BN_ULONG *n0);
  108. #
  109. sub generate_bn_mul_mont_t4() {
  110. my $NUM=shift;
  111. my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
  112. $code.=<<___;
  113. .globl bn_mul_mont_t4_$NUM
  114. .align 32
  115. bn_mul_mont_t4_$NUM:
  116. #ifdef __arch64__
  117. mov 0,$sentinel
  118. mov -128,%g4
  119. #elif defined(SPARCV9_64BIT_STACK)
  120. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  121. ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
  122. mov -2047,%g4
  123. and %g1,SPARCV9_64BIT_STACK,%g1
  124. movrz %g1,0,%g4
  125. mov -1,$sentinel
  126. add %g4,-128,%g4
  127. #else
  128. mov -1,$sentinel
  129. mov -128,%g4
  130. #endif
  131. sllx $sentinel,32,$sentinel
  132. save %sp,%g4,%sp
  133. #ifndef __arch64__
  134. save %sp,-128,%sp ! warm it up
  135. save %sp,-128,%sp
  136. save %sp,-128,%sp
  137. save %sp,-128,%sp
  138. save %sp,-128,%sp
  139. save %sp,-128,%sp
  140. restore
  141. restore
  142. restore
  143. restore
  144. restore
  145. restore
  146. #endif
  147. and %sp,1,%g4
  148. or $sentinel,%fp,%fp
  149. or %g4,$sentinel,$sentinel
  150. ! copy arguments to global registers
  151. mov %i0,$rp
  152. mov %i1,$ap
  153. mov %i2,$bp
  154. mov %i3,$np
  155. ld [%i4+0],%f1 ! load *n0
  156. ld [%i4+4],%f0
  157. fsrc2 %f0,%f60
  158. ___
  159. # load ap[$NUM] ########################################################
  160. $code.=<<___;
  161. save %sp,-128,%sp; or $sentinel,%fp,%fp
  162. ___
  163. for($i=0; $i<14 && $i<$NUM; $i++) {
  164. my $lo=$i<13?@A[$i+1]:"%o7";
  165. $code.=<<___;
  166. ld [$ap+$i*8+0],$lo
  167. ld [$ap+$i*8+4],@A[$i]
  168. sllx @A[$i],32,@A[$i]
  169. or $lo,@A[$i],@A[$i]
  170. ___
  171. }
  172. for(; $i<$NUM; $i++) {
  173. my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
  174. $code.=<<___;
  175. ld [$ap+$i*8+0],$lo
  176. ld [$ap+$i*8+4],$hi
  177. fsrc2 $hi,@A[$i]
  178. ___
  179. }
  180. # load np[$NUM] ########################################################
  181. $code.=<<___;
  182. save %sp,-128,%sp; or $sentinel,%fp,%fp
  183. ___
  184. for($i=0; $i<14 && $i<$NUM; $i++) {
  185. my $lo=$i<13?@N[$i+1]:"%o7";
  186. $code.=<<___;
  187. ld [$np+$i*8+0],$lo
  188. ld [$np+$i*8+4],@N[$i]
  189. sllx @N[$i],32,@N[$i]
  190. or $lo,@N[$i],@N[$i]
  191. ___
  192. }
  193. $code.=<<___;
  194. save %sp,-128,%sp; or $sentinel,%fp,%fp
  195. ___
  196. for(; $i<28 && $i<$NUM; $i++) {
  197. my $lo=$i<27?@N[$i+1]:"%o7";
  198. $code.=<<___;
  199. ld [$np+$i*8+0],$lo
  200. ld [$np+$i*8+4],@N[$i]
  201. sllx @N[$i],32,@N[$i]
  202. or $lo,@N[$i],@N[$i]
  203. ___
  204. }
  205. $code.=<<___;
  206. save %sp,-128,%sp; or $sentinel,%fp,%fp
  207. ___
  208. for(; $i<$NUM; $i++) {
  209. my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
  210. $code.=<<___;
  211. ld [$np+$i*8+0],$lo
  212. ld [$np+$i*8+4],@N[$i]
  213. sllx @N[$i],32,@N[$i]
  214. or $lo,@N[$i],@N[$i]
  215. ___
  216. }
  217. $code.=<<___;
  218. cmp $ap,$bp
  219. be SIZE_T_CC,.Lmsquare_$NUM
  220. nop
  221. ___
  222. # load bp[$NUM] ########################################################
  223. $code.=<<___;
  224. save %sp,-128,%sp; or $sentinel,%fp,%fp
  225. ___
  226. for($i=0; $i<14 && $i<$NUM; $i++) {
  227. my $lo=$i<13?@B[$i+1]:"%o7";
  228. $code.=<<___;
  229. ld [$bp+$i*8+0],$lo
  230. ld [$bp+$i*8+4],@B[$i]
  231. sllx @B[$i],32,@B[$i]
  232. or $lo,@B[$i],@B[$i]
  233. ___
  234. }
  235. $code.=<<___;
  236. save %sp,-128,%sp; or $sentinel,%fp,%fp
  237. ___
  238. for(; $i<$NUM; $i++) {
  239. my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
  240. $code.=<<___;
  241. ld [$bp+$i*8+0],$lo
  242. ld [$bp+$i*8+4],@B[$i]
  243. sllx @B[$i],32,@B[$i]
  244. or $lo,@B[$i],@B[$i]
  245. ___
  246. }
  247. # magic ################################################################
  248. $code.=<<___;
  249. .word 0x81b02920+$NUM-1 ! montmul $NUM-1
  250. .Lmresume_$NUM:
  251. fbu,pn %fcc3,.Lmabort_$NUM
  252. #ifndef __arch64__
  253. and %fp,$sentinel,$sentinel
  254. brz,pn $sentinel,.Lmabort_$NUM
  255. #endif
  256. nop
  257. #ifdef __arch64__
  258. restore
  259. restore
  260. restore
  261. restore
  262. restore
  263. #else
  264. restore; and %fp,$sentinel,$sentinel
  265. restore; and %fp,$sentinel,$sentinel
  266. restore; and %fp,$sentinel,$sentinel
  267. restore; and %fp,$sentinel,$sentinel
  268. brz,pn $sentinel,.Lmabort1_$NUM
  269. restore
  270. #endif
  271. ___
  272. # save tp[$NUM] ########################################################
  273. for($i=0; $i<14 && $i<$NUM; $i++) {
  274. $code.=<<___;
  275. movxtod @A[$i],@R[$i]
  276. ___
  277. }
  278. $code.=<<___;
  279. #ifdef __arch64__
  280. restore
  281. #else
  282. and %fp,$sentinel,$sentinel
  283. restore
  284. and $sentinel,1,%o7
  285. and %fp,$sentinel,$sentinel
  286. srl %fp,0,%fp ! just in case?
  287. or %o7,$sentinel,$sentinel
  288. brz,a,pn $sentinel,.Lmdone_$NUM
  289. mov 0,%i0 ! return failure
  290. #endif
  291. ___
  292. for($i=0; $i<12 && $i<$NUM; $i++) {
  293. @R[$i] =~ /%f([0-9]+)/;
  294. my $lo = "%f".($1+1);
  295. $code.=<<___;
  296. st $lo,[$rp+$i*8+0]
  297. st @R[$i],[$rp+$i*8+4]
  298. ___
  299. }
  300. for(; $i<$NUM; $i++) {
  301. my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
  302. $code.=<<___;
  303. fsrc2 @R[$i],$hi
  304. st $lo,[$rp+$i*8+0]
  305. st $hi,[$rp+$i*8+4]
  306. ___
  307. }
  308. $code.=<<___;
  309. mov 1,%i0 ! return success
  310. .Lmdone_$NUM:
  311. ret
  312. restore
  313. .Lmabort_$NUM:
  314. restore
  315. restore
  316. restore
  317. restore
  318. restore
  319. .Lmabort1_$NUM:
  320. restore
  321. mov 0,%i0 ! return failure
  322. ret
  323. restore
  324. .align 32
  325. .Lmsquare_$NUM:
  326. save %sp,-128,%sp; or $sentinel,%fp,%fp
  327. save %sp,-128,%sp; or $sentinel,%fp,%fp
  328. .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
  329. ba .Lmresume_$NUM
  330. nop
  331. .type bn_mul_mont_t4_$NUM, #function
  332. .size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
  333. ___
  334. }
  335. for ($i=8;$i<=32;$i+=8) {
  336. &generate_bn_mul_mont_t4($i);
  337. }
  338. ########################################################################
  339. #
  340. sub load_ccr {
  341. my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
  342. $code.=<<___;
  343. srl $pwr, 2, %o4
  344. and $pwr, 3, %o5
  345. and %o4, 7, %o4
  346. sll %o5, 3, %o5 ! offset within first cache line
  347. add %o5, $ptbl, $ptbl ! of the pwrtbl
  348. or %g0, 1, %o5
  349. sll %o5, %o4, $ccr
  350. ___
  351. $code.=<<___ if (!$skip_wr);
  352. wr $ccr, %g0, %ccr
  353. ___
  354. }
  355. sub load_b_pair {
  356. my ($pwrtbl,$B0,$B1)=@_;
  357. $code.=<<___;
  358. ldx [$pwrtbl+0*32], $B0
  359. ldx [$pwrtbl+8*32], $B1
  360. ldx [$pwrtbl+1*32], %o4
  361. ldx [$pwrtbl+9*32], %o5
  362. movvs %icc, %o4, $B0
  363. ldx [$pwrtbl+2*32], %o4
  364. movvs %icc, %o5, $B1
  365. ldx [$pwrtbl+10*32],%o5
  366. move %icc, %o4, $B0
  367. ldx [$pwrtbl+3*32], %o4
  368. move %icc, %o5, $B1
  369. ldx [$pwrtbl+11*32],%o5
  370. movneg %icc, %o4, $B0
  371. ldx [$pwrtbl+4*32], %o4
  372. movneg %icc, %o5, $B1
  373. ldx [$pwrtbl+12*32],%o5
  374. movcs %xcc, %o4, $B0
  375. ldx [$pwrtbl+5*32],%o4
  376. movcs %xcc, %o5, $B1
  377. ldx [$pwrtbl+13*32],%o5
  378. movvs %xcc, %o4, $B0
  379. ldx [$pwrtbl+6*32], %o4
  380. movvs %xcc, %o5, $B1
  381. ldx [$pwrtbl+14*32],%o5
  382. move %xcc, %o4, $B0
  383. ldx [$pwrtbl+7*32], %o4
  384. move %xcc, %o5, $B1
  385. ldx [$pwrtbl+15*32],%o5
  386. movneg %xcc, %o4, $B0
  387. add $pwrtbl,16*32, $pwrtbl
  388. movneg %xcc, %o5, $B1
  389. ___
  390. }
  391. sub load_b {
  392. my ($pwrtbl,$Bi)=@_;
  393. $code.=<<___;
  394. ldx [$pwrtbl+0*32], $Bi
  395. ldx [$pwrtbl+1*32], %o4
  396. ldx [$pwrtbl+2*32], %o5
  397. movvs %icc, %o4, $Bi
  398. ldx [$pwrtbl+3*32], %o4
  399. move %icc, %o5, $Bi
  400. ldx [$pwrtbl+4*32], %o5
  401. movneg %icc, %o4, $Bi
  402. ldx [$pwrtbl+5*32], %o4
  403. movcs %xcc, %o5, $Bi
  404. ldx [$pwrtbl+6*32], %o5
  405. movvs %xcc, %o4, $Bi
  406. ldx [$pwrtbl+7*32], %o4
  407. move %xcc, %o5, $Bi
  408. add $pwrtbl,8*32, $pwrtbl
  409. movneg %xcc, %o4, $Bi
  410. ___
  411. }
  412. ########################################################################
  413. # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
  414. # const u64 *pwrtbl,int pwr,int stride);
  415. #
  416. sub generate_bn_pwr5_mont_t4() {
  417. my $NUM=shift;
  418. my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
  419. $code.=<<___;
  420. .globl bn_pwr5_mont_t4_$NUM
  421. .align 32
  422. bn_pwr5_mont_t4_$NUM:
  423. #ifdef __arch64__
  424. mov 0,$sentinel
  425. mov -128,%g4
  426. #elif defined(SPARCV9_64BIT_STACK)
  427. SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  428. ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
  429. mov -2047,%g4
  430. and %g1,SPARCV9_64BIT_STACK,%g1
  431. movrz %g1,0,%g4
  432. mov -1,$sentinel
  433. add %g4,-128,%g4
  434. #else
  435. mov -1,$sentinel
  436. mov -128,%g4
  437. #endif
  438. sllx $sentinel,32,$sentinel
  439. save %sp,%g4,%sp
  440. #ifndef __arch64__
  441. save %sp,-128,%sp ! warm it up
  442. save %sp,-128,%sp
  443. save %sp,-128,%sp
  444. save %sp,-128,%sp
  445. save %sp,-128,%sp
  446. save %sp,-128,%sp
  447. restore
  448. restore
  449. restore
  450. restore
  451. restore
  452. restore
  453. #endif
  454. and %sp,1,%g4
  455. or $sentinel,%fp,%fp
  456. or %g4,$sentinel,$sentinel
  457. ! copy arguments to global registers
  458. mov %i0,$tp
  459. mov %i1,$np
  460. ld [%i2+0],%f1 ! load *n0
  461. ld [%i2+4],%f0
  462. mov %i3,$pwrtbl
  463. srl %i4,%g0,%i4 ! pack last arguments
  464. sllx %i5,32,$pwr
  465. or %i4,$pwr,$pwr
  466. fsrc2 %f0,%f60
  467. ___
  468. # load tp[$NUM] ########################################################
  469. $code.=<<___;
  470. save %sp,-128,%sp; or $sentinel,%fp,%fp
  471. ___
  472. for($i=0; $i<14 && $i<$NUM; $i++) {
  473. $code.=<<___;
  474. ldx [$tp+$i*8],@A[$i]
  475. ___
  476. }
  477. for(; $i<$NUM; $i++) {
  478. $code.=<<___;
  479. ldd [$tp+$i*8],@A[$i]
  480. ___
  481. }
  482. # load np[$NUM] ########################################################
  483. $code.=<<___;
  484. save %sp,-128,%sp; or $sentinel,%fp,%fp
  485. ___
  486. for($i=0; $i<14 && $i<$NUM; $i++) {
  487. $code.=<<___;
  488. ldx [$np+$i*8],@N[$i]
  489. ___
  490. }
  491. $code.=<<___;
  492. save %sp,-128,%sp; or $sentinel,%fp,%fp
  493. ___
  494. for(; $i<28 && $i<$NUM; $i++) {
  495. $code.=<<___;
  496. ldx [$np+$i*8],@N[$i]
  497. ___
  498. }
  499. $code.=<<___;
  500. save %sp,-128,%sp; or $sentinel,%fp,%fp
  501. ___
  502. for(; $i<$NUM; $i++) {
  503. $code.=<<___;
  504. ldx [$np+$i*8],@N[$i]
  505. ___
  506. }
  507. # load pwrtbl[pwr] ########################################################
  508. $code.=<<___;
  509. save %sp,-128,%sp; or $sentinel,%fp,%fp
  510. srlx $pwr, 32, %o4 ! unpack $pwr
  511. srl $pwr, %g0, %o5
  512. sub %o4, 5, %o4
  513. mov $pwrtbl, %o7
  514. sllx %o4, 32, $pwr ! re-pack $pwr
  515. or %o5, $pwr, $pwr
  516. srl %o5, %o4, %o5
  517. ___
  518. &load_ccr("%o7","%o5","%o4");
  519. $code.=<<___;
  520. b .Lstride_$NUM
  521. nop
  522. .align 16
  523. .Lstride_$NUM:
  524. ___
  525. for($i=0; $i<14 && $i<$NUM; $i+=2) {
  526. &load_b_pair("%o7",@B[$i],@B[$i+1]);
  527. }
  528. $code.=<<___;
  529. save %sp,-128,%sp; or $sentinel,%fp,%fp
  530. ___
  531. for(; $i<$NUM; $i+=2) {
  532. &load_b_pair("%i7",@B[$i],@B[$i+1]);
  533. }
  534. $code.=<<___;
  535. srax $pwr, 32, %o4 ! unpack $pwr
  536. srl $pwr, %g0, %o5
  537. sub %o4, 5, %o4
  538. mov $pwrtbl, %i7
  539. sllx %o4, 32, $pwr ! re-pack $pwr
  540. or %o5, $pwr, $pwr
  541. srl %o5, %o4, %o5
  542. ___
  543. &load_ccr("%i7","%o5","%o4",1);
  544. # magic ################################################################
  545. for($i=0; $i<5; $i++) {
  546. $code.=<<___;
  547. .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
  548. fbu,pn %fcc3,.Labort_$NUM
  549. #ifndef __arch64__
  550. and %fp,$sentinel,$sentinel
  551. brz,pn $sentinel,.Labort_$NUM
  552. #endif
  553. nop
  554. ___
  555. }
  556. $code.=<<___;
  557. wr %o4, %g0, %ccr
  558. .word 0x81b02920+$NUM-1 ! montmul $NUM-1
  559. fbu,pn %fcc3,.Labort_$NUM
  560. #ifndef __arch64__
  561. and %fp,$sentinel,$sentinel
  562. brz,pn $sentinel,.Labort_$NUM
  563. #endif
  564. srax $pwr, 32, %o4
  565. #ifdef __arch64__
  566. brgez %o4,.Lstride_$NUM
  567. restore
  568. restore
  569. restore
  570. restore
  571. restore
  572. #else
  573. brgez %o4,.Lstride_$NUM
  574. restore; and %fp,$sentinel,$sentinel
  575. restore; and %fp,$sentinel,$sentinel
  576. restore; and %fp,$sentinel,$sentinel
  577. restore; and %fp,$sentinel,$sentinel
  578. brz,pn $sentinel,.Labort1_$NUM
  579. restore
  580. #endif
  581. ___
  582. # save tp[$NUM] ########################################################
  583. for($i=0; $i<14 && $i<$NUM; $i++) {
  584. $code.=<<___;
  585. movxtod @A[$i],@R[$i]
  586. ___
  587. }
  588. $code.=<<___;
  589. #ifdef __arch64__
  590. restore
  591. #else
  592. and %fp,$sentinel,$sentinel
  593. restore
  594. and $sentinel,1,%o7
  595. and %fp,$sentinel,$sentinel
  596. srl %fp,0,%fp ! just in case?
  597. or %o7,$sentinel,$sentinel
  598. brz,a,pn $sentinel,.Ldone_$NUM
  599. mov 0,%i0 ! return failure
  600. #endif
  601. ___
  602. for($i=0; $i<$NUM; $i++) {
  603. $code.=<<___;
  604. std @R[$i],[$tp+$i*8]
  605. ___
  606. }
  607. $code.=<<___;
  608. mov 1,%i0 ! return success
  609. .Ldone_$NUM:
  610. ret
  611. restore
  612. .Labort_$NUM:
  613. restore
  614. restore
  615. restore
  616. restore
  617. restore
  618. .Labort1_$NUM:
  619. restore
  620. mov 0,%i0 ! return failure
  621. ret
  622. restore
  623. .type bn_pwr5_mont_t4_$NUM, #function
  624. .size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
  625. ___
  626. }
  627. for ($i=8;$i<=32;$i+=8) {
  628. &generate_bn_pwr5_mont_t4($i);
  629. }
  630. {
  631. ########################################################################
  632. # Fall-back subroutines
  633. #
  634. # copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
  635. #
  636. ($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
  637. (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
  638. # int bn_mul_mont(
  639. $rp="%o0"; # u64 *rp,
  640. $ap="%o1"; # const u64 *ap,
  641. $bp="%o2"; # const u64 *bp,
  642. $np="%o3"; # const u64 *np,
  643. $n0p="%o4"; # const BN_ULONG *n0,
  644. $num="%o5"; # int num); # caller ensures that num is >=3
  645. $code.=<<___;
  646. .globl bn_mul_mont_t4
  647. .align 32
  648. bn_mul_mont_t4:
  649. add %sp, STACK_BIAS, %g4 ! real top of stack
  650. sll $num, 3, $num ! size in bytes
  651. add $num, 63, %g1
  652. andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
  653. sub %g4, %g1, %g1
  654. andn %g1, 63, %g1 ! align at 64 byte
  655. sub %g1, STACK_FRAME, %g1 ! new top of stack
  656. sub %g1, %g4, %g1
  657. save %sp, %g1, %sp
  658. ___
  659. # +-------------------------------+<----- %sp
  660. # . .
  661. # +-------------------------------+<----- aligned at 64 bytes
  662. # | __int64 tmp[0] |
  663. # +-------------------------------+
  664. # . .
  665. # . .
  666. # +-------------------------------+<----- aligned at 64 bytes
  667. # . .
  668. ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
  669. ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
  670. ($ovf,$i)=($t0,$t1);
  671. $code.=<<___;
  672. ld [$n0p+0], $t0 ! pull n0[0..1] value
  673. ld [$n0p+4], $t1
  674. add %sp, STACK_BIAS+STACK_FRAME, $tp
  675. ldx [$bp+0], $m0 ! m0=bp[0]
  676. sllx $t1, 32, $n0
  677. add $bp, 8, $bp
  678. or $t0, $n0, $n0
  679. ldx [$ap+0], $aj ! ap[0]
  680. mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
  681. umulxhi $aj, $m0, $hi0
  682. ldx [$ap+8], $aj ! ap[1]
  683. add $ap, 16, $ap
  684. ldx [$np+0], $nj ! np[0]
  685. mulx $lo0, $n0, $m1 ! "tp[0]"*n0
  686. mulx $aj, $m0, $alo ! ap[1]*bp[0]
  687. umulxhi $aj, $m0, $aj ! ahi=aj
  688. mulx $nj, $m1, $lo1 ! np[0]*m1
  689. umulxhi $nj, $m1, $hi1
  690. ldx [$np+8], $nj ! np[1]
  691. addcc $lo0, $lo1, $lo1
  692. add $np, 16, $np
  693. addxc %g0, $hi1, $hi1
  694. mulx $nj, $m1, $nlo ! np[1]*m1
  695. umulxhi $nj, $m1, $nj ! nhi=nj
  696. ba .L1st
  697. sub $num, 24, $cnt ! cnt=num-3
  698. .align 16
  699. .L1st:
  700. addcc $alo, $hi0, $lo0
  701. addxc $aj, %g0, $hi0
  702. ldx [$ap+0], $aj ! ap[j]
  703. addcc $nlo, $hi1, $lo1
  704. add $ap, 8, $ap
  705. addxc $nj, %g0, $hi1 ! nhi=nj
  706. ldx [$np+0], $nj ! np[j]
  707. mulx $aj, $m0, $alo ! ap[j]*bp[0]
  708. add $np, 8, $np
  709. umulxhi $aj, $m0, $aj ! ahi=aj
  710. mulx $nj, $m1, $nlo ! np[j]*m1
  711. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  712. umulxhi $nj, $m1, $nj ! nhi=nj
  713. addxc %g0, $hi1, $hi1
  714. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  715. add $tp, 8, $tp ! tp++
  716. brnz,pt $cnt, .L1st
  717. sub $cnt, 8, $cnt ! j--
  718. !.L1st
  719. addcc $alo, $hi0, $lo0
  720. addxc $aj, %g0, $hi0 ! ahi=aj
  721. addcc $nlo, $hi1, $lo1
  722. addxc $nj, %g0, $hi1
  723. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  724. addxc %g0, $hi1, $hi1
  725. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  726. add $tp, 8, $tp
  727. addcc $hi0, $hi1, $hi1
  728. addxc %g0, %g0, $ovf ! upmost overflow bit
  729. stxa $hi1, [$tp]0xe2
  730. add $tp, 8, $tp
  731. ba .Louter
  732. sub $num, 16, $i ! i=num-2
  733. .align 16
  734. .Louter:
  735. ldx [$bp+0], $m0 ! m0=bp[i]
  736. add $bp, 8, $bp
  737. sub $ap, $num, $ap ! rewind
  738. sub $np, $num, $np
  739. sub $tp, $num, $tp
  740. ldx [$ap+0], $aj ! ap[0]
  741. ldx [$np+0], $nj ! np[0]
  742. mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
  743. ldx [$tp], $tj ! tp[0]
  744. umulxhi $aj, $m0, $hi0
  745. ldx [$ap+8], $aj ! ap[1]
  746. addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
  747. mulx $aj, $m0, $alo ! ap[1]*bp[i]
  748. addxc %g0, $hi0, $hi0
  749. mulx $lo0, $n0, $m1 ! tp[0]*n0
  750. umulxhi $aj, $m0, $aj ! ahi=aj
  751. mulx $nj, $m1, $lo1 ! np[0]*m1
  752. add $ap, 16, $ap
  753. umulxhi $nj, $m1, $hi1
  754. ldx [$np+8], $nj ! np[1]
  755. add $np, 16, $np
  756. addcc $lo1, $lo0, $lo1
  757. mulx $nj, $m1, $nlo ! np[1]*m1
  758. addxc %g0, $hi1, $hi1
  759. umulxhi $nj, $m1, $nj ! nhi=nj
  760. ba .Linner
  761. sub $num, 24, $cnt ! cnt=num-3
  762. .align 16
  763. .Linner:
  764. addcc $alo, $hi0, $lo0
  765. ldx [$tp+8], $tj ! tp[j]
  766. addxc $aj, %g0, $hi0 ! ahi=aj
  767. ldx [$ap+0], $aj ! ap[j]
  768. add $ap, 8, $ap
  769. addcc $nlo, $hi1, $lo1
  770. mulx $aj, $m0, $alo ! ap[j]*bp[i]
  771. addxc $nj, %g0, $hi1 ! nhi=nj
  772. ldx [$np+0], $nj ! np[j]
  773. add $np, 8, $np
  774. umulxhi $aj, $m0, $aj ! ahi=aj
  775. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  776. mulx $nj, $m1, $nlo ! np[j]*m1
  777. addxc %g0, $hi0, $hi0
  778. umulxhi $nj, $m1, $nj ! nhi=nj
  779. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  780. addxc %g0, $hi1, $hi1
  781. stx $lo1, [$tp] ! tp[j-1]
  782. add $tp, 8, $tp
  783. brnz,pt $cnt, .Linner
  784. sub $cnt, 8, $cnt
  785. !.Linner
  786. ldx [$tp+8], $tj ! tp[j]
  787. addcc $alo, $hi0, $lo0
  788. addxc $aj, %g0, $hi0 ! ahi=aj
  789. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  790. addxc %g0, $hi0, $hi0
  791. addcc $nlo, $hi1, $lo1
  792. addxc $nj, %g0, $hi1 ! nhi=nj
  793. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  794. addxc %g0, $hi1, $hi1
  795. stx $lo1, [$tp] ! tp[j-1]
  796. subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
  797. addxccc $hi1, $hi0, $hi1
  798. addxc %g0, %g0, $ovf
  799. stx $hi1, [$tp+8]
  800. add $tp, 16, $tp
  801. brnz,pt $i, .Louter
  802. sub $i, 8, $i
  803. sub $ap, $num, $ap ! rewind
  804. sub $np, $num, $np
  805. sub $tp, $num, $tp
  806. ba .Lsub
  807. subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
  808. .align 16
  809. .Lsub:
  810. ldx [$tp], $tj
  811. add $tp, 8, $tp
  812. ldx [$np+0], $nj
  813. add $np, 8, $np
  814. subccc $tj, $nj, $t2 ! tp[j]-np[j]
  815. srlx $tj, 32, $tj
  816. srlx $nj, 32, $nj
  817. subccc $tj, $nj, $t3
  818. add $rp, 8, $rp
  819. st $t2, [$rp-4] ! reverse order
  820. st $t3, [$rp-8]
  821. brnz,pt $cnt, .Lsub
  822. sub $cnt, 8, $cnt
  823. sub $np, $num, $np ! rewind
  824. sub $tp, $num, $tp
  825. sub $rp, $num, $rp
  826. subccc $ovf, %g0, $ovf ! handle upmost overflow bit
  827. ba .Lcopy
  828. sub $num, 8, $cnt
  829. .align 16
  830. .Lcopy: ! conditional copy
  831. ldx [$tp], $tj
  832. ldx [$rp+0], $t2
  833. stx %g0, [$tp] ! zap
  834. add $tp, 8, $tp
  835. movcs %icc, $tj, $t2
  836. stx $t2, [$rp+0]
  837. add $rp, 8, $rp
  838. brnz $cnt, .Lcopy
  839. sub $cnt, 8, $cnt
  840. mov 1, %o0
  841. ret
  842. restore
  843. .type bn_mul_mont_t4, #function
  844. .size bn_mul_mont_t4, .-bn_mul_mont_t4
  845. ___
  846. # int bn_mul_mont_gather5(
  847. $rp="%o0"; # u64 *rp,
  848. $ap="%o1"; # const u64 *ap,
  849. $bp="%o2"; # const u64 *pwrtbl,
  850. $np="%o3"; # const u64 *np,
  851. $n0p="%o4"; # const BN_ULONG *n0,
  852. $num="%o5"; # int num, # caller ensures that num is >=3
  853. # int power);
  854. $code.=<<___;
  855. .globl bn_mul_mont_gather5_t4
  856. .align 32
  857. bn_mul_mont_gather5_t4:
  858. add %sp, STACK_BIAS, %g4 ! real top of stack
  859. sll $num, 3, $num ! size in bytes
  860. add $num, 63, %g1
  861. andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
  862. sub %g4, %g1, %g1
  863. andn %g1, 63, %g1 ! align at 64 byte
  864. sub %g1, STACK_FRAME, %g1 ! new top of stack
  865. sub %g1, %g4, %g1
  866. LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
  867. save %sp, %g1, %sp
  868. ___
  869. # +-------------------------------+<----- %sp
  870. # . .
  871. # +-------------------------------+<----- aligned at 64 bytes
  872. # | __int64 tmp[0] |
  873. # +-------------------------------+
  874. # . .
  875. # . .
  876. # +-------------------------------+<----- aligned at 64 bytes
  877. # . .
  878. ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
  879. ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
  880. ($ovf,$i)=($t0,$t1);
  881. &load_ccr($bp,"%g4",$ccr);
  882. &load_b($bp,$m0,"%o7"); # m0=bp[0]
  883. $code.=<<___;
  884. ld [$n0p+0], $t0 ! pull n0[0..1] value
  885. ld [$n0p+4], $t1
  886. add %sp, STACK_BIAS+STACK_FRAME, $tp
  887. sllx $t1, 32, $n0
  888. or $t0, $n0, $n0
  889. ldx [$ap+0], $aj ! ap[0]
  890. mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
  891. umulxhi $aj, $m0, $hi0
  892. ldx [$ap+8], $aj ! ap[1]
  893. add $ap, 16, $ap
  894. ldx [$np+0], $nj ! np[0]
  895. mulx $lo0, $n0, $m1 ! "tp[0]"*n0
  896. mulx $aj, $m0, $alo ! ap[1]*bp[0]
  897. umulxhi $aj, $m0, $aj ! ahi=aj
  898. mulx $nj, $m1, $lo1 ! np[0]*m1
  899. umulxhi $nj, $m1, $hi1
  900. ldx [$np+8], $nj ! np[1]
  901. addcc $lo0, $lo1, $lo1
  902. add $np, 16, $np
  903. addxc %g0, $hi1, $hi1
  904. mulx $nj, $m1, $nlo ! np[1]*m1
  905. umulxhi $nj, $m1, $nj ! nhi=nj
  906. ba .L1st_g5
  907. sub $num, 24, $cnt ! cnt=num-3
  908. .align 16
  909. .L1st_g5:
  910. addcc $alo, $hi0, $lo0
  911. addxc $aj, %g0, $hi0
  912. ldx [$ap+0], $aj ! ap[j]
  913. addcc $nlo, $hi1, $lo1
  914. add $ap, 8, $ap
  915. addxc $nj, %g0, $hi1 ! nhi=nj
  916. ldx [$np+0], $nj ! np[j]
  917. mulx $aj, $m0, $alo ! ap[j]*bp[0]
  918. add $np, 8, $np
  919. umulxhi $aj, $m0, $aj ! ahi=aj
  920. mulx $nj, $m1, $nlo ! np[j]*m1
  921. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  922. umulxhi $nj, $m1, $nj ! nhi=nj
  923. addxc %g0, $hi1, $hi1
  924. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  925. add $tp, 8, $tp ! tp++
  926. brnz,pt $cnt, .L1st_g5
  927. sub $cnt, 8, $cnt ! j--
  928. !.L1st_g5
  929. addcc $alo, $hi0, $lo0
  930. addxc $aj, %g0, $hi0 ! ahi=aj
  931. addcc $nlo, $hi1, $lo1
  932. addxc $nj, %g0, $hi1
  933. addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
  934. addxc %g0, $hi1, $hi1
  935. stxa $lo1, [$tp]0xe2 ! tp[j-1]
  936. add $tp, 8, $tp
  937. addcc $hi0, $hi1, $hi1
  938. addxc %g0, %g0, $ovf ! upmost overflow bit
  939. stxa $hi1, [$tp]0xe2
  940. add $tp, 8, $tp
  941. ba .Louter_g5
  942. sub $num, 16, $i ! i=num-2
  943. .align 16
  944. .Louter_g5:
  945. wr $ccr, %g0, %ccr
  946. ___
  947. &load_b($bp,$m0); # m0=bp[i]
  948. $code.=<<___;
  949. sub $ap, $num, $ap ! rewind
  950. sub $np, $num, $np
  951. sub $tp, $num, $tp
  952. ldx [$ap+0], $aj ! ap[0]
  953. ldx [$np+0], $nj ! np[0]
  954. mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
  955. ldx [$tp], $tj ! tp[0]
  956. umulxhi $aj, $m0, $hi0
  957. ldx [$ap+8], $aj ! ap[1]
  958. addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
  959. mulx $aj, $m0, $alo ! ap[1]*bp[i]
  960. addxc %g0, $hi0, $hi0
  961. mulx $lo0, $n0, $m1 ! tp[0]*n0
  962. umulxhi $aj, $m0, $aj ! ahi=aj
  963. mulx $nj, $m1, $lo1 ! np[0]*m1
  964. add $ap, 16, $ap
  965. umulxhi $nj, $m1, $hi1
  966. ldx [$np+8], $nj ! np[1]
  967. add $np, 16, $np
  968. addcc $lo1, $lo0, $lo1
  969. mulx $nj, $m1, $nlo ! np[1]*m1
  970. addxc %g0, $hi1, $hi1
  971. umulxhi $nj, $m1, $nj ! nhi=nj
  972. ba .Linner_g5
  973. sub $num, 24, $cnt ! cnt=num-3
  974. .align 16
  975. .Linner_g5:
  976. addcc $alo, $hi0, $lo0
  977. ldx [$tp+8], $tj ! tp[j]
  978. addxc $aj, %g0, $hi0 ! ahi=aj
  979. ldx [$ap+0], $aj ! ap[j]
  980. add $ap, 8, $ap
  981. addcc $nlo, $hi1, $lo1
  982. mulx $aj, $m0, $alo ! ap[j]*bp[i]
  983. addxc $nj, %g0, $hi1 ! nhi=nj
  984. ldx [$np+0], $nj ! np[j]
  985. add $np, 8, $np
  986. umulxhi $aj, $m0, $aj ! ahi=aj
  987. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  988. mulx $nj, $m1, $nlo ! np[j]*m1
  989. addxc %g0, $hi0, $hi0
  990. umulxhi $nj, $m1, $nj ! nhi=nj
  991. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  992. addxc %g0, $hi1, $hi1
  993. stx $lo1, [$tp] ! tp[j-1]
  994. add $tp, 8, $tp
  995. brnz,pt $cnt, .Linner_g5
  996. sub $cnt, 8, $cnt
  997. !.Linner_g5
  998. ldx [$tp+8], $tj ! tp[j]
  999. addcc $alo, $hi0, $lo0
  1000. addxc $aj, %g0, $hi0 ! ahi=aj
  1001. addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
  1002. addxc %g0, $hi0, $hi0
  1003. addcc $nlo, $hi1, $lo1
  1004. addxc $nj, %g0, $hi1 ! nhi=nj
  1005. addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
  1006. addxc %g0, $hi1, $hi1
  1007. stx $lo1, [$tp] ! tp[j-1]
  1008. subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
  1009. addxccc $hi1, $hi0, $hi1
  1010. addxc %g0, %g0, $ovf
  1011. stx $hi1, [$tp+8]
  1012. add $tp, 16, $tp
  1013. brnz,pt $i, .Louter_g5
  1014. sub $i, 8, $i
  1015. sub $ap, $num, $ap ! rewind
  1016. sub $np, $num, $np
  1017. sub $tp, $num, $tp
  1018. ba .Lsub_g5
  1019. subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
  1020. .align 16
  1021. .Lsub_g5:
  1022. ldx [$tp], $tj
  1023. add $tp, 8, $tp
  1024. ldx [$np+0], $nj
  1025. add $np, 8, $np
  1026. subccc $tj, $nj, $t2 ! tp[j]-np[j]
  1027. srlx $tj, 32, $tj
  1028. srlx $nj, 32, $nj
  1029. subccc $tj, $nj, $t3
  1030. add $rp, 8, $rp
  1031. st $t2, [$rp-4] ! reverse order
  1032. st $t3, [$rp-8]
  1033. brnz,pt $cnt, .Lsub_g5
  1034. sub $cnt, 8, $cnt
  1035. sub $np, $num, $np ! rewind
  1036. sub $tp, $num, $tp
  1037. sub $rp, $num, $rp
  1038. subccc $ovf, %g0, $ovf ! handle upmost overflow bit
  1039. ba .Lcopy_g5
  1040. sub $num, 8, $cnt
  1041. .align 16
  1042. .Lcopy_g5: ! conditional copy
  1043. ldx [$tp], $tj
  1044. ldx [$rp+0], $t2
  1045. stx %g0, [$tp] ! zap
  1046. add $tp, 8, $tp
  1047. movcs %icc, $tj, $t2
  1048. stx $t2, [$rp+0]
  1049. add $rp, 8, $rp
  1050. brnz $cnt, .Lcopy_g5
  1051. sub $cnt, 8, $cnt
  1052. mov 1, %o0
  1053. ret
  1054. restore
  1055. .type bn_mul_mont_gather5_t4, #function
  1056. .size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
  1057. ___
  1058. }
  1059. $code.=<<___;
  1060. .globl bn_flip_t4
  1061. .align 32
  1062. bn_flip_t4:
  1063. .Loop_flip:
  1064. ld [%o1+0], %o4
  1065. sub %o2, 1, %o2
  1066. ld [%o1+4], %o5
  1067. add %o1, 8, %o1
  1068. st %o5, [%o0+0]
  1069. st %o4, [%o0+4]
  1070. brnz %o2, .Loop_flip
  1071. add %o0, 8, %o0
  1072. retl
  1073. nop
  1074. .type bn_flip_t4, #function
  1075. .size bn_flip_t4, .-bn_flip_t4
  1076. .globl bn_flip_n_scatter5_t4
  1077. .align 32
  1078. bn_flip_n_scatter5_t4:
  1079. sll %o3, 3, %o3
  1080. srl %o1, 1, %o1
  1081. add %o3, %o2, %o2 ! &pwrtbl[pwr]
  1082. sub %o1, 1, %o1
  1083. .Loop_flip_n_scatter5:
  1084. ld [%o0+0], %o4 ! inp[i]
  1085. ld [%o0+4], %o5
  1086. add %o0, 8, %o0
  1087. sllx %o5, 32, %o5
  1088. or %o4, %o5, %o5
  1089. stx %o5, [%o2]
  1090. add %o2, 32*8, %o2
  1091. brnz %o1, .Loop_flip_n_scatter5
  1092. sub %o1, 1, %o1
  1093. retl
  1094. nop
  1095. .type bn_flip_n_scatter5_t4, #function
  1096. .size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
  1097. .globl bn_gather5_t4
  1098. .align 32
  1099. bn_gather5_t4:
  1100. ___
  1101. &load_ccr("%o2","%o3","%g1");
  1102. $code.=<<___;
  1103. sub %o1, 1, %o1
  1104. .Loop_gather5:
  1105. ___
  1106. &load_b("%o2","%g1");
  1107. $code.=<<___;
  1108. stx %g1, [%o0]
  1109. add %o0, 8, %o0
  1110. brnz %o1, .Loop_gather5
  1111. sub %o1, 1, %o1
  1112. retl
  1113. nop
  1114. .type bn_gather5_t4, #function
  1115. .size bn_gather5_t4, .-bn_gather5_t4
  1116. .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
  1117. .align 4
  1118. ___
  1119. &emit_assembler();
  1120. close STDOUT or die "error closing STDOUT: $!";