ppc.pl 44 KB


  1. #!/usr/bin/env perl
  2. #
  3. # Implemented as a Perl wrapper as we want to support several different
  4. # architectures with single file. We pick up the target based on the
  5. # file name we are asked to generate.
  6. #
  7. # It should be noted though that this perl code is nothing like
  8. # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
  9. # as pre-processor to cover for platform differences in name decoration,
  10. # linker tables, 32-/64-bit instruction sets...
  11. #
  12. # As you might know, there are several PowerPC ABIs in use. Most notably
  13. # Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
  14. # are similar enough to implement leaf(!) functions, which would be ABI
  15. # neutral. And that's what you find here: ABI neutral leaf functions.
  16. # In case you wonder what that is...
  17. #
  18. # AIX performance
  19. #
  20. # MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
  21. #
  22. # The following is the performance of 32-bit compiler
  23. # generated code:
  24. #
  25. # OpenSSL 0.9.6c 21 dec 2001
  26. # built on: Tue Jun 11 11:06:51 EDT 2002
  27. # options:bn(64,32) ...
  28. #compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
  29. # sign verify sign/s verify/s
  30. #rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
  31. #rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
  32. #rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
  33. #rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
  34. #dsa 512 bits 0.0087s 0.0106s 114.3 94.5
  35. #dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
  36. #
  37. # Same benchmark with this assembler code:
  38. #
  39. #rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
  40. #rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
  41. #rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
  42. #rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
  43. #dsa 512 bits 0.0052s 0.0062s 191.6 162.0
  44. #dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
  45. #
  46. # Number of operations increases by almost 75%
  47. #
  48. # Here are performance numbers for 64-bit compiler
  49. # generated code:
  50. #
  51. # OpenSSL 0.9.6g [engine] 9 Aug 2002
  52. # built on: Fri Apr 18 16:59:20 EDT 2003
  53. # options:bn(64,64) ...
  54. # compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
  55. # sign verify sign/s verify/s
  56. #rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
  57. #rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
  58. #rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
  59. #rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
  60. #dsa 512 bits 0.0026s 0.0032s 382.5 313.7
  61. #dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
  62. #
  63. # Same benchmark with this assembler code:
  64. #
  65. #rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
  66. #rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
  67. #rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
  68. #rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
  69. #dsa 512 bits 0.0016s 0.0020s 610.7 507.1
  70. #dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
  71. #
  72. # Again, performance increases by about 75%
  73. #
  74. # Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
  75. # OpenSSL 0.9.7c 30 Sep 2003
  76. #
  77. # Original code.
  78. #
  79. #rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
  80. #rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
  81. #rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
  82. #rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
  83. #dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
  84. #dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
  85. #dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
  86. #
  87. # Same benchmark with this assembler code:
  88. #
  89. #rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
  90. #rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
  91. #rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
  92. #rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
  93. #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
  94. #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
  95. #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
  96. #
  97. # Performance increase of ~60%
  98. #
  99. # If you have comments or suggestions to improve code send
  100. # me a note at schari@us.ibm.com
  101. #
  # ---------------------------------------------------------------------
  # Target selection and output plumbing.
  #
  # Usage: perl ppc.pl <flavour> [<arg>]
  #
  # <flavour> names the target.  A flavour containing "32" selects the
  # 32-bit mnemonics; otherwise a flavour containing "64" selects the
  # 64-bit ones; anything else is a fatal error.  The next command-line
  # argument (presumably the output file name -- confirm against
  # ppc-xlate.pl) is passed on verbatim to ppc-xlate.pl.
  #
  # Everything assigned here is deliberately a package global: the
  # mnemonic variables and $BNSZ are interpolated into the assembly
  # template that follows this preamble.
  # ---------------------------------------------------------------------
  $flavour = shift;

  if ($flavour =~ /32/) {
      $BITS  = 32;
      $BNSZ  = $BITS/8;       # size of one BN_ULONG in bytes
      $ISA   = "\"ppc\"";
      $LD    = "lwz";         # load
      $LDU   = "lwzu";        # load and update
      $ST    = "stw";         # store
      $STU   = "stwu";        # store and update
      $UMULL = "mullw";       # unsigned multiply low
      $UMULH = "mulhwu";      # unsigned multiply high
      $UDIV  = "divwu";       # unsigned divide
      $UCMPI = "cmplwi";      # unsigned compare with immediate
      $UCMP  = "cmplw";       # unsigned compare
      $CNTLZ = "cntlzw";      # count leading zeros
      $SHL   = "slw";         # shift left
      $SHR   = "srw";         # unsigned shift right
      $SHRI  = "srwi";        # unsigned shift right by immediate
      $SHLI  = "slwi";        # shift left by immediate
      $CLRU  = "clrlwi";      # clear upper bits
      $INSR  = "insrwi";      # insert right
      $ROTL  = "rotlwi";      # rotate left by immediate
      $TR    = "tw";          # conditional trap
  } elsif ($flavour =~ /64/) {
      $BITS  = 64;
      $BNSZ  = $BITS/8;       # size of one BN_ULONG in bytes
      $ISA   = "\"ppc64\"";
      # same as above, but 64-bit mnemonics...
      $LD    = "ld";          # load
      $LDU   = "ldu";         # load and update
      $ST    = "std";         # store
      $STU   = "stdu";        # store and update
      $UMULL = "mulld";       # unsigned multiply low
      $UMULH = "mulhdu";      # unsigned multiply high
      $UDIV  = "divdu";       # unsigned divide
      $UCMPI = "cmpldi";      # unsigned compare with immediate
      $UCMP  = "cmpld";       # unsigned compare
      $CNTLZ = "cntlzd";      # count leading zeros
      $SHL   = "sld";         # shift left
      $SHR   = "srd";         # unsigned shift right
      $SHRI  = "srdi";        # unsigned shift right by immediate
      $SHLI  = "sldi";        # shift left by immediate
      $CLRU  = "clrldi";      # clear upper bits
      $INSR  = "insrdi";      # insert right
      $ROTL  = "rotldi";      # rotate left by immediate
      $TR    = "td";          # conditional trap
  } else {
      die "nonsense $flavour";
  }

  # Directory part of this script's own path, trailing separator included
  # (both "/" and "\" are accepted).  When $0 carries no directory the
  # match fails; fall back to the empty string explicitly instead of
  # relying on whatever $1 happens to hold.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

  # Look for the translator next to this script first, then in the
  # perlasm directory two levels up.
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  die "can't locate ppc-xlate.pl";

  # Send everything printed from here on through ppc-xlate.pl.  The
  # low-precedence "or" (with explicit parentheses around open's
  # arguments) is required: with "||" the test would apply to the command
  # string, which is always true, and a failed open would go unnoticed.
  open(STDOUT, "| $^X $xlate $flavour " . shift)
      or die "can't call $xlate: $!";
  154. $data=<<EOF;
  155. #--------------------------------------------------------------------
  156. #
  157. #
  158. #
  159. #
  160. # File: ppc32.s
  161. #
  162. # Created by: Suresh Chari
  163. # IBM Thomas J. Watson Research Library
  164. # Hawthorne, NY
  165. #
  166. #
  167. # Description: Optimized assembly routines for OpenSSL crypto
  168. # on the 32 bitPowerPC platform.
  169. #
  170. #
  171. # Version History
  172. #
  173. # 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
  174. # cleaned up code. Also made a single version which can
  175. # be used for both the AIX and Linux compilers. See NOTE
  176. # below.
  177. # 12/05/03 Suresh Chari
  178. # (with lots of help from) Andy Polyakov
  179. ##
  180. # 1. Initial version 10/20/02 Suresh Chari
  181. #
  182. #
  183. # The following file works for the xlc,cc
  184. # and gcc compilers.
  185. #
  186. # NOTE: To get the file to link correctly with the gcc compiler
  187. # you have to change the names of the routines and remove
  188. # the first .(dot) character. This should automatically
  189. # be done in the build process.
  190. #
  191. # Hand optimized assembly code for the following routines
  192. #
  193. # bn_sqr_comba4
  194. # bn_sqr_comba8
  195. # bn_mul_comba4
  196. # bn_mul_comba8
  197. # bn_sub_words
  198. # bn_add_words
  199. # bn_div_words
  200. # bn_sqr_words
  201. # bn_mul_words
  202. # bn_mul_add_words
  203. #
  204. # NOTE: It is possible to optimize this code more for
  205. # specific PowerPC or Power architectures. On the Northstar
  206. # architecture the optimizations in this file do
  207. # NOT provide much improvement.
  208. #
  209. # If you have comments or suggestions to improve code send
  210. # me a note at schari\@us.ibm.com
  211. #
  212. #--------------------------------------------------------------------------
  213. #
  214. # Defines to be used in the assembly code.
  215. #
  216. #.set r0,0 # we use it as storage for value of 0
  217. #.set SP,1 # preserved
  218. #.set RTOC,2 # preserved
  219. #.set r3,3 # 1st argument/return value
  220. #.set r4,4 # 2nd argument/volatile register
  221. #.set r5,5 # 3rd argument/volatile register
  222. #.set r6,6 # ...
  223. #.set r7,7
  224. #.set r8,8
  225. #.set r9,9
  226. #.set r10,10
  227. #.set r11,11
  228. #.set r12,12
  229. #.set r13,13 # not used, nor any other "below" it...
  230. # Declare function names to be global
  231. # NOTE: For gcc these names MUST be changed to remove
  232. # the first . i.e. for example change ".bn_sqr_comba4"
  233. # to "bn_sqr_comba4". This should be automatically done
  234. # in the build.
  235. .globl .bn_sqr_comba4
  236. .globl .bn_sqr_comba8
  237. .globl .bn_mul_comba4
  238. .globl .bn_mul_comba8
  239. .globl .bn_sub_words
  240. .globl .bn_add_words
  241. .globl .bn_div_words
  242. .globl .bn_sqr_words
  243. .globl .bn_mul_words
  244. .globl .bn_mul_add_words
  245. # .text section
  246. .machine "any"
  247. #
  248. # NOTE: The following label name should be changed to
  249. # "bn_sqr_comba4" i.e. remove the first dot
  250. # for the gcc compiler. This should be automatically
  251. # done in the build
  252. #
  253. .align 4
  254. .bn_sqr_comba4:
  255. #
  256. # Optimized version of bn_sqr_comba4.
  257. #
  258. # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
  259. # r3 contains r
  260. # r4 contains a
  261. #
  262. # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
  263. #
  264. # r5,r6 are the two BN_ULONGs being multiplied.
  265. # r7,r8 are the results of the 32x32 giving 64 bit multiply.
  266. # r9,r10, r11 are the equivalents of c1,c2, c3.
  267. # Here's the assembly
  268. #
  269. #
  270. xor r0,r0,r0 # set r0 = 0. Used in the addze
  271. # instructions below
  272. #sqr_add_c(a,0,c1,c2,c3)
  273. $LD r5,`0*$BNSZ`(r4)
  274. $UMULL r9,r5,r5
  275. $UMULH r10,r5,r5 #in first iteration. No need
  276. #to add since c1=c2=c3=0.
  277. # Note c3(r11) is NOT set to 0
  278. # but will be.
  279. $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
  280. # sqr_add_c2(a,1,0,c2,c3,c1);
  281. $LD r6,`1*$BNSZ`(r4)
  282. $UMULL r7,r5,r6
  283. $UMULH r8,r5,r6
  284. addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
  285. adde r8,r8,r8
  286. addze r9,r0 # catch carry if any.
  287. # r9= r0(=0) and carry
  288. addc r10,r7,r10 # now add to temp result.
  289. addze r11,r8 # r8 added to r11 which is 0
  290. addze r9,r9
  291. $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
  292. #sqr_add_c(a,1,c3,c1,c2)
  293. $UMULL r7,r6,r6
  294. $UMULH r8,r6,r6
  295. addc r11,r7,r11
  296. adde r9,r8,r9
  297. addze r10,r0
  298. #sqr_add_c2(a,2,0,c3,c1,c2)
  299. $LD r6,`2*$BNSZ`(r4)
  300. $UMULL r7,r5,r6
  301. $UMULH r8,r5,r6
  302. addc r7,r7,r7
  303. adde r8,r8,r8
  304. addze r10,r10
  305. addc r11,r7,r11
  306. adde r9,r8,r9
  307. addze r10,r10
  308. $ST r11,`2*$BNSZ`(r3) #r[2]=c3
  309. #sqr_add_c2(a,3,0,c1,c2,c3);
  310. $LD r6,`3*$BNSZ`(r4)
  311. $UMULL r7,r5,r6
  312. $UMULH r8,r5,r6
  313. addc r7,r7,r7
  314. adde r8,r8,r8
  315. addze r11,r0
  316. addc r9,r7,r9
  317. adde r10,r8,r10
  318. addze r11,r11
  319. #sqr_add_c2(a,2,1,c1,c2,c3);
  320. $LD r5,`1*$BNSZ`(r4)
  321. $LD r6,`2*$BNSZ`(r4)
  322. $UMULL r7,r5,r6
  323. $UMULH r8,r5,r6
  324. addc r7,r7,r7
  325. adde r8,r8,r8
  326. addze r11,r11
  327. addc r9,r7,r9
  328. adde r10,r8,r10
  329. addze r11,r11
  330. $ST r9,`3*$BNSZ`(r3) #r[3]=c1
  331. #sqr_add_c(a,2,c2,c3,c1);
  332. $UMULL r7,r6,r6
  333. $UMULH r8,r6,r6
  334. addc r10,r7,r10
  335. adde r11,r8,r11
  336. addze r9,r0
  337. #sqr_add_c2(a,3,1,c2,c3,c1);
  338. $LD r6,`3*$BNSZ`(r4)
  339. $UMULL r7,r5,r6
  340. $UMULH r8,r5,r6
  341. addc r7,r7,r7
  342. adde r8,r8,r8
  343. addze r9,r9
  344. addc r10,r7,r10
  345. adde r11,r8,r11
  346. addze r9,r9
  347. $ST r10,`4*$BNSZ`(r3) #r[4]=c2
  348. #sqr_add_c2(a,3,2,c3,c1,c2);
  349. $LD r5,`2*$BNSZ`(r4)
  350. $UMULL r7,r5,r6
  351. $UMULH r8,r5,r6
  352. addc r7,r7,r7
  353. adde r8,r8,r8
  354. addze r10,r0
  355. addc r11,r7,r11
  356. adde r9,r8,r9
  357. addze r10,r10
  358. $ST r11,`5*$BNSZ`(r3) #r[5] = c3
  359. #sqr_add_c(a,3,c1,c2,c3);
  360. $UMULL r7,r6,r6
  361. $UMULH r8,r6,r6
  362. addc r9,r7,r9
  363. adde r10,r8,r10
  364. $ST r9,`6*$BNSZ`(r3) #r[6]=c1
  365. $ST r10,`7*$BNSZ`(r3) #r[7]=c2
  366. blr
  367. .long 0
  368. .byte 0,12,0x14,0,0,0,2,0
  369. .long 0
  370. #
  371. # NOTE: The following label name should be changed to
  372. # "bn_sqr_comba8" i.e. remove the first dot
  373. # for the gcc compiler. This should be automatically
  374. # done in the build
  375. #
  376. .align 4
  377. .bn_sqr_comba8:
  378. #
  379. # This is an optimized version of the bn_sqr_comba8 routine.
  380. # Tightly uses the adde instruction
  381. #
  382. #
  383. # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
  384. # r3 contains r
  385. # r4 contains a
  386. #
  387. # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
  388. #
  389. # r5,r6 are the two BN_ULONGs being multiplied.
  390. # r7,r8 are the results of the 32x32 giving 64 bit multiply.
  391. # r9,r10, r11 are the equivalents of c1,c2, c3.
  392. #
  393. # Possible optimization of loading all 8 longs of a into registers
  394. # doesnt provide any speedup
  395. #
  396. xor r0,r0,r0 #set r0 = 0.Used in addze
  397. #instructions below.
  398. #sqr_add_c(a,0,c1,c2,c3);
  399. $LD r5,`0*$BNSZ`(r4)
  400. $UMULL r9,r5,r5 #1st iteration: no carries.
  401. $UMULH r10,r5,r5
  402. $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
  403. #sqr_add_c2(a,1,0,c2,c3,c1);
  404. $LD r6,`1*$BNSZ`(r4)
  405. $UMULL r7,r5,r6
  406. $UMULH r8,r5,r6
  407. addc r10,r7,r10 #add the two register number
  408. adde r11,r8,r0 # (r8,r7) to the three register
  409. addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
  410. addc r10,r7,r10 #add the two register number
  411. adde r11,r8,r11 # (r8,r7) to the three register
  412. addze r9,r9 # number (r9,r11,r10).
  413. $ST r10,`1*$BNSZ`(r3) # r[1]=c2
  414. #sqr_add_c(a,1,c3,c1,c2);
  415. $UMULL r7,r6,r6
  416. $UMULH r8,r6,r6
  417. addc r11,r7,r11
  418. adde r9,r8,r9
  419. addze r10,r0
  420. #sqr_add_c2(a,2,0,c3,c1,c2);
  421. $LD r6,`2*$BNSZ`(r4)
  422. $UMULL r7,r5,r6
  423. $UMULH r8,r5,r6
  424. addc r11,r7,r11
  425. adde r9,r8,r9
  426. addze r10,r10
  427. addc r11,r7,r11
  428. adde r9,r8,r9
  429. addze r10,r10
  430. $ST r11,`2*$BNSZ`(r3) #r[2]=c3
  431. #sqr_add_c2(a,3,0,c1,c2,c3);
  432. $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
  433. $UMULL r7,r5,r6
  434. $UMULH r8,r5,r6
  435. addc r9,r7,r9
  436. adde r10,r8,r10
  437. addze r11,r0
  438. addc r9,r7,r9
  439. adde r10,r8,r10
  440. addze r11,r11
  441. #sqr_add_c2(a,2,1,c1,c2,c3);
  442. $LD r5,`1*$BNSZ`(r4)
  443. $LD r6,`2*$BNSZ`(r4)
  444. $UMULL r7,r5,r6
  445. $UMULH r8,r5,r6
  446. addc r9,r7,r9
  447. adde r10,r8,r10
  448. addze r11,r11
  449. addc r9,r7,r9
  450. adde r10,r8,r10
  451. addze r11,r11
  452. $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
  453. #sqr_add_c(a,2,c2,c3,c1);
  454. $UMULL r7,r6,r6
  455. $UMULH r8,r6,r6
  456. addc r10,r7,r10
  457. adde r11,r8,r11
  458. addze r9,r0
  459. #sqr_add_c2(a,3,1,c2,c3,c1);
  460. $LD r6,`3*$BNSZ`(r4)
  461. $UMULL r7,r5,r6
  462. $UMULH r8,r5,r6
  463. addc r10,r7,r10
  464. adde r11,r8,r11
  465. addze r9,r9
  466. addc r10,r7,r10
  467. adde r11,r8,r11
  468. addze r9,r9
  469. #sqr_add_c2(a,4,0,c2,c3,c1);
  470. $LD r5,`0*$BNSZ`(r4)
  471. $LD r6,`4*$BNSZ`(r4)
  472. $UMULL r7,r5,r6
  473. $UMULH r8,r5,r6
  474. addc r10,r7,r10
  475. adde r11,r8,r11
  476. addze r9,r9
  477. addc r10,r7,r10
  478. adde r11,r8,r11
  479. addze r9,r9
  480. $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
  481. #sqr_add_c2(a,5,0,c3,c1,c2);
  482. $LD r6,`5*$BNSZ`(r4)
  483. $UMULL r7,r5,r6
  484. $UMULH r8,r5,r6
  485. addc r11,r7,r11
  486. adde r9,r8,r9
  487. addze r10,r0
  488. addc r11,r7,r11
  489. adde r9,r8,r9
  490. addze r10,r10
  491. #sqr_add_c2(a,4,1,c3,c1,c2);
  492. $LD r5,`1*$BNSZ`(r4)
  493. $LD r6,`4*$BNSZ`(r4)
  494. $UMULL r7,r5,r6
  495. $UMULH r8,r5,r6
  496. addc r11,r7,r11
  497. adde r9,r8,r9
  498. addze r10,r10
  499. addc r11,r7,r11
  500. adde r9,r8,r9
  501. addze r10,r10
  502. #sqr_add_c2(a,3,2,c3,c1,c2);
  503. $LD r5,`2*$BNSZ`(r4)
  504. $LD r6,`3*$BNSZ`(r4)
  505. $UMULL r7,r5,r6
  506. $UMULH r8,r5,r6
  507. addc r11,r7,r11
  508. adde r9,r8,r9
  509. addze r10,r10
  510. addc r11,r7,r11
  511. adde r9,r8,r9
  512. addze r10,r10
  513. $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
  514. #sqr_add_c(a,3,c1,c2,c3);
  515. $UMULL r7,r6,r6
  516. $UMULH r8,r6,r6
  517. addc r9,r7,r9
  518. adde r10,r8,r10
  519. addze r11,r0
  520. #sqr_add_c2(a,4,2,c1,c2,c3);
  521. $LD r6,`4*$BNSZ`(r4)
  522. $UMULL r7,r5,r6
  523. $UMULH r8,r5,r6
  524. addc r9,r7,r9
  525. adde r10,r8,r10
  526. addze r11,r11
  527. addc r9,r7,r9
  528. adde r10,r8,r10
  529. addze r11,r11
  530. #sqr_add_c2(a,5,1,c1,c2,c3);
  531. $LD r5,`1*$BNSZ`(r4)
  532. $LD r6,`5*$BNSZ`(r4)
  533. $UMULL r7,r5,r6
  534. $UMULH r8,r5,r6
  535. addc r9,r7,r9
  536. adde r10,r8,r10
  537. addze r11,r11
  538. addc r9,r7,r9
  539. adde r10,r8,r10
  540. addze r11,r11
  541. #sqr_add_c2(a,6,0,c1,c2,c3);
  542. $LD r5,`0*$BNSZ`(r4)
  543. $LD r6,`6*$BNSZ`(r4)
  544. $UMULL r7,r5,r6
  545. $UMULH r8,r5,r6
  546. addc r9,r7,r9
  547. adde r10,r8,r10
  548. addze r11,r11
  549. addc r9,r7,r9
  550. adde r10,r8,r10
  551. addze r11,r11
  552. $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
  553. #sqr_add_c2(a,7,0,c2,c3,c1);
  554. $LD r6,`7*$BNSZ`(r4)
  555. $UMULL r7,r5,r6
  556. $UMULH r8,r5,r6
  557. addc r10,r7,r10
  558. adde r11,r8,r11
  559. addze r9,r0
  560. addc r10,r7,r10
  561. adde r11,r8,r11
  562. addze r9,r9
  563. #sqr_add_c2(a,6,1,c2,c3,c1);
  564. $LD r5,`1*$BNSZ`(r4)
  565. $LD r6,`6*$BNSZ`(r4)
  566. $UMULL r7,r5,r6
  567. $UMULH r8,r5,r6
  568. addc r10,r7,r10
  569. adde r11,r8,r11
  570. addze r9,r9
  571. addc r10,r7,r10
  572. adde r11,r8,r11
  573. addze r9,r9
  574. #sqr_add_c2(a,5,2,c2,c3,c1);
  575. $LD r5,`2*$BNSZ`(r4)
  576. $LD r6,`5*$BNSZ`(r4)
  577. $UMULL r7,r5,r6
  578. $UMULH r8,r5,r6
  579. addc r10,r7,r10
  580. adde r11,r8,r11
  581. addze r9,r9
  582. addc r10,r7,r10
  583. adde r11,r8,r11
  584. addze r9,r9
  585. #sqr_add_c2(a,4,3,c2,c3,c1);
  586. $LD r5,`3*$BNSZ`(r4)
  587. $LD r6,`4*$BNSZ`(r4)
  588. $UMULL r7,r5,r6
  589. $UMULH r8,r5,r6
  590. addc r10,r7,r10
  591. adde r11,r8,r11
  592. addze r9,r9
  593. addc r10,r7,r10
  594. adde r11,r8,r11
  595. addze r9,r9
  596. $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
  597. #sqr_add_c(a,4,c3,c1,c2);
  598. $UMULL r7,r6,r6
  599. $UMULH r8,r6,r6
  600. addc r11,r7,r11
  601. adde r9,r8,r9
  602. addze r10,r0
  603. #sqr_add_c2(a,5,3,c3,c1,c2);
  604. $LD r6,`5*$BNSZ`(r4)
  605. $UMULL r7,r5,r6
  606. $UMULH r8,r5,r6
  607. addc r11,r7,r11
  608. adde r9,r8,r9
  609. addze r10,r10
  610. addc r11,r7,r11
  611. adde r9,r8,r9
  612. addze r10,r10
  613. #sqr_add_c2(a,6,2,c3,c1,c2);
  614. $LD r5,`2*$BNSZ`(r4)
  615. $LD r6,`6*$BNSZ`(r4)
  616. $UMULL r7,r5,r6
  617. $UMULH r8,r5,r6
  618. addc r11,r7,r11
  619. adde r9,r8,r9
  620. addze r10,r10
  621. addc r11,r7,r11
  622. adde r9,r8,r9
  623. addze r10,r10
  624. #sqr_add_c2(a,7,1,c3,c1,c2);
  625. $LD r5,`1*$BNSZ`(r4)
  626. $LD r6,`7*$BNSZ`(r4)
  627. $UMULL r7,r5,r6
  628. $UMULH r8,r5,r6
  629. addc r11,r7,r11
  630. adde r9,r8,r9
  631. addze r10,r10
  632. addc r11,r7,r11
  633. adde r9,r8,r9
  634. addze r10,r10
  635. $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
  636. #sqr_add_c2(a,7,2,c1,c2,c3);
  637. $LD r5,`2*$BNSZ`(r4)
  638. $UMULL r7,r5,r6
  639. $UMULH r8,r5,r6
  640. addc r9,r7,r9
  641. adde r10,r8,r10
  642. addze r11,r0
  643. addc r9,r7,r9
  644. adde r10,r8,r10
  645. addze r11,r11
  646. #sqr_add_c2(a,6,3,c1,c2,c3);
  647. $LD r5,`3*$BNSZ`(r4)
  648. $LD r6,`6*$BNSZ`(r4)
  649. $UMULL r7,r5,r6
  650. $UMULH r8,r5,r6
  651. addc r9,r7,r9
  652. adde r10,r8,r10
  653. addze r11,r11
  654. addc r9,r7,r9
  655. adde r10,r8,r10
  656. addze r11,r11
  657. #sqr_add_c2(a,5,4,c1,c2,c3);
  658. $LD r5,`4*$BNSZ`(r4)
  659. $LD r6,`5*$BNSZ`(r4)
  660. $UMULL r7,r5,r6
  661. $UMULH r8,r5,r6
  662. addc r9,r7,r9
  663. adde r10,r8,r10
  664. addze r11,r11
  665. addc r9,r7,r9
  666. adde r10,r8,r10
  667. addze r11,r11
  668. $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
  669. #sqr_add_c(a,5,c2,c3,c1);
  670. $UMULL r7,r6,r6
  671. $UMULH r8,r6,r6
  672. addc r10,r7,r10
  673. adde r11,r8,r11
  674. addze r9,r0
  675. #sqr_add_c2(a,6,4,c2,c3,c1);
  676. $LD r6,`6*$BNSZ`(r4)
  677. $UMULL r7,r5,r6
  678. $UMULH r8,r5,r6
  679. addc r10,r7,r10
  680. adde r11,r8,r11
  681. addze r9,r9
  682. addc r10,r7,r10
  683. adde r11,r8,r11
  684. addze r9,r9
  685. #sqr_add_c2(a,7,3,c2,c3,c1);
  686. $LD r5,`3*$BNSZ`(r4)
  687. $LD r6,`7*$BNSZ`(r4)
  688. $UMULL r7,r5,r6
  689. $UMULH r8,r5,r6
  690. addc r10,r7,r10
  691. adde r11,r8,r11
  692. addze r9,r9
  693. addc r10,r7,r10
  694. adde r11,r8,r11
  695. addze r9,r9
  696. $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
  697. #sqr_add_c2(a,7,4,c3,c1,c2);
  698. $LD r5,`4*$BNSZ`(r4)
  699. $UMULL r7,r5,r6
  700. $UMULH r8,r5,r6
  701. addc r11,r7,r11
  702. adde r9,r8,r9
  703. addze r10,r0
  704. addc r11,r7,r11
  705. adde r9,r8,r9
  706. addze r10,r10
  707. #sqr_add_c2(a,6,5,c3,c1,c2);
  708. $LD r5,`5*$BNSZ`(r4)
  709. $LD r6,`6*$BNSZ`(r4)
  710. $UMULL r7,r5,r6
  711. $UMULH r8,r5,r6
  712. addc r11,r7,r11
  713. adde r9,r8,r9
  714. addze r10,r10
  715. addc r11,r7,r11
  716. adde r9,r8,r9
  717. addze r10,r10
  718. $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
  719. #sqr_add_c(a,6,c1,c2,c3);
  720. $UMULL r7,r6,r6
  721. $UMULH r8,r6,r6
  722. addc r9,r7,r9
  723. adde r10,r8,r10
  724. addze r11,r0
  725. #sqr_add_c2(a,7,5,c1,c2,c3)
  726. $LD r6,`7*$BNSZ`(r4)
  727. $UMULL r7,r5,r6
  728. $UMULH r8,r5,r6
  729. addc r9,r7,r9
  730. adde r10,r8,r10
  731. addze r11,r11
  732. addc r9,r7,r9
  733. adde r10,r8,r10
  734. addze r11,r11
  735. $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
  736. #sqr_add_c2(a,7,6,c2,c3,c1)
  737. $LD r5,`6*$BNSZ`(r4)
  738. $UMULL r7,r5,r6
  739. $UMULH r8,r5,r6
  740. addc r10,r7,r10
  741. adde r11,r8,r11
  742. addze r9,r0
  743. addc r10,r7,r10
  744. adde r11,r8,r11
  745. addze r9,r9
  746. $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
  747. #sqr_add_c(a,7,c3,c1,c2);
  748. $UMULL r7,r6,r6
  749. $UMULH r8,r6,r6
  750. addc r11,r7,r11
  751. adde r9,r8,r9
  752. $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
  753. $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
  754. blr
  755. .long 0
  756. .byte 0,12,0x14,0,0,0,2,0
  757. .long 0
  758. #
  759. # NOTE: The following label name should be changed to
  760. # "bn_mul_comba4" i.e. remove the first dot
  761. # for the gcc compiler. This should be automatically
  762. # done in the build
  763. #
  764. .align 4
  765. .bn_mul_comba4:
  766. #
  767. # This is an optimized version of the bn_mul_comba4 routine.
  768. #
  769. # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  770. # r3 contains r
  771. # r4 contains a
  772. # r5 contains b
  773. # r6, r7 are the 2 BN_ULONGs being multiplied.
  774. # r8, r9 are the results of the 32x32 giving 64 multiply.
  775. # r10, r11, r12 are the equivalents of c1, c2, and c3.
  776. #
  777. xor r0,r0,r0 #r0=0. Used in addze below.
  778. #mul_add_c(a[0],b[0],c1,c2,c3);
  779. $LD r6,`0*$BNSZ`(r4)
  780. $LD r7,`0*$BNSZ`(r5)
  781. $UMULL r10,r6,r7
  782. $UMULH r11,r6,r7
  783. $ST r10,`0*$BNSZ`(r3) #r[0]=c1
  784. #mul_add_c(a[0],b[1],c2,c3,c1);
  785. $LD r7,`1*$BNSZ`(r5)
  786. $UMULL r8,r6,r7
  787. $UMULH r9,r6,r7
  788. addc r11,r8,r11
  789. adde r12,r9,r0
  790. addze r10,r0
  791. #mul_add_c(a[1],b[0],c2,c3,c1);
  792. $LD r6, `1*$BNSZ`(r4)
  793. $LD r7, `0*$BNSZ`(r5)
  794. $UMULL r8,r6,r7
  795. $UMULH r9,r6,r7
  796. addc r11,r8,r11
  797. adde r12,r9,r12
  798. addze r10,r10
  799. $ST r11,`1*$BNSZ`(r3) #r[1]=c2
  800. #mul_add_c(a[2],b[0],c3,c1,c2);
  801. $LD r6,`2*$BNSZ`(r4)
  802. $UMULL r8,r6,r7
  803. $UMULH r9,r6,r7
  804. addc r12,r8,r12
  805. adde r10,r9,r10
  806. addze r11,r0
  807. #mul_add_c(a[1],b[1],c3,c1,c2);
  808. $LD r6,`1*$BNSZ`(r4)
  809. $LD r7,`1*$BNSZ`(r5)
  810. $UMULL r8,r6,r7
  811. $UMULH r9,r6,r7
  812. addc r12,r8,r12
  813. adde r10,r9,r10
  814. addze r11,r11
  815. #mul_add_c(a[0],b[2],c3,c1,c2);
  816. $LD r6,`0*$BNSZ`(r4)
  817. $LD r7,`2*$BNSZ`(r5)
  818. $UMULL r8,r6,r7
  819. $UMULH r9,r6,r7
  820. addc r12,r8,r12
  821. adde r10,r9,r10
  822. addze r11,r11
  823. $ST r12,`2*$BNSZ`(r3) #r[2]=c3
  824. #mul_add_c(a[0],b[3],c1,c2,c3);
  825. $LD r7,`3*$BNSZ`(r5)
  826. $UMULL r8,r6,r7
  827. $UMULH r9,r6,r7
  828. addc r10,r8,r10
  829. adde r11,r9,r11
  830. addze r12,r0
  831. #mul_add_c(a[1],b[2],c1,c2,c3);
  832. $LD r6,`1*$BNSZ`(r4)
  833. $LD r7,`2*$BNSZ`(r5)
  834. $UMULL r8,r6,r7
  835. $UMULH r9,r6,r7
  836. addc r10,r8,r10
  837. adde r11,r9,r11
  838. addze r12,r12
  839. #mul_add_c(a[2],b[1],c1,c2,c3);
  840. $LD r6,`2*$BNSZ`(r4)
  841. $LD r7,`1*$BNSZ`(r5)
  842. $UMULL r8,r6,r7
  843. $UMULH r9,r6,r7
  844. addc r10,r8,r10
  845. adde r11,r9,r11
  846. addze r12,r12
  847. #mul_add_c(a[3],b[0],c1,c2,c3);
  848. $LD r6,`3*$BNSZ`(r4)
  849. $LD r7,`0*$BNSZ`(r5)
  850. $UMULL r8,r6,r7
  851. $UMULH r9,r6,r7
  852. addc r10,r8,r10
  853. adde r11,r9,r11
  854. addze r12,r12
  855. $ST r10,`3*$BNSZ`(r3) #r[3]=c1
  856. #mul_add_c(a[3],b[1],c2,c3,c1);
  857. $LD r7,`1*$BNSZ`(r5)
  858. $UMULL r8,r6,r7
  859. $UMULH r9,r6,r7
  860. addc r11,r8,r11
  861. adde r12,r9,r12
  862. addze r10,r0
  863. #mul_add_c(a[2],b[2],c2,c3,c1);
  864. $LD r6,`2*$BNSZ`(r4)
  865. $LD r7,`2*$BNSZ`(r5)
  866. $UMULL r8,r6,r7
  867. $UMULH r9,r6,r7
  868. addc r11,r8,r11
  869. adde r12,r9,r12
  870. addze r10,r10
  871. #mul_add_c(a[1],b[3],c2,c3,c1);
  872. $LD r6,`1*$BNSZ`(r4)
  873. $LD r7,`3*$BNSZ`(r5)
  874. $UMULL r8,r6,r7
  875. $UMULH r9,r6,r7
  876. addc r11,r8,r11
  877. adde r12,r9,r12
  878. addze r10,r10
  879. $ST r11,`4*$BNSZ`(r3) #r[4]=c2
  880. #mul_add_c(a[2],b[3],c3,c1,c2);
  881. $LD r6,`2*$BNSZ`(r4)
  882. $UMULL r8,r6,r7
  883. $UMULH r9,r6,r7
  884. addc r12,r8,r12
  885. adde r10,r9,r10
  886. addze r11,r0
  887. #mul_add_c(a[3],b[2],c3,c1,c2);
  888. $LD r6,`3*$BNSZ`(r4)
  889. $LD r7,`2*$BNSZ`(r5)
  890. $UMULL r8,r6,r7
  891. $UMULH r9,r6,r7
  892. addc r12,r8,r12
  893. adde r10,r9,r10
  894. addze r11,r11
  895. $ST r12,`5*$BNSZ`(r3) #r[5]=c3
  896. #mul_add_c(a[3],b[3],c1,c2,c3);
  897. $LD r7,`3*$BNSZ`(r5)
  898. $UMULL r8,r6,r7
  899. $UMULH r9,r6,r7
  900. addc r10,r8,r10
  901. adde r11,r9,r11
  902. $ST r10,`6*$BNSZ`(r3) #r[6]=c1
  903. $ST r11,`7*$BNSZ`(r3) #r[7]=c2
  904. blr
  905. .long 0
  906. .byte 0,12,0x14,0,0,0,3,0
  907. .long 0
  908. #
  909. # NOTE: The following label name should be changed to
  910. # "bn_mul_comba8" i.e. remove the first dot
  911. # for the gcc compiler. This should be automatically
  912. # done in the build
  913. #
  914. .align 4
  915. .bn_mul_comba8:
  916. #
  917. # Optimized version of the bn_mul_comba8 routine.
  918. #
  919. # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  920. # r3 contains r
  921. # r4 contains a
  922. # r5 contains b
  923. # r6, r7 are the 2 BN_ULONGs being multiplied.
  924. # r8, r9 are the results of the 32x32 giving 64 multiply.
  925. # r10, r11, r12 are the equivalents of c1, c2, and c3.
  926. #
  927. xor r0,r0,r0 #r0=0. Used in addze below.
  928. #mul_add_c(a[0],b[0],c1,c2,c3);
  929. $LD r6,`0*$BNSZ`(r4) #a[0]
  930. $LD r7,`0*$BNSZ`(r5) #b[0]
  931. $UMULL r10,r6,r7
  932. $UMULH r11,r6,r7
  933. $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
  934. #mul_add_c(a[0],b[1],c2,c3,c1);
  935. $LD r7,`1*$BNSZ`(r5)
  936. $UMULL r8,r6,r7
  937. $UMULH r9,r6,r7
  938. addc r11,r11,r8
  939. addze r12,r9 # since we didnt set r12 to zero before.
  940. addze r10,r0
  941. #mul_add_c(a[1],b[0],c2,c3,c1);
  942. $LD r6,`1*$BNSZ`(r4)
  943. $LD r7,`0*$BNSZ`(r5)
  944. $UMULL r8,r6,r7
  945. $UMULH r9,r6,r7
  946. addc r11,r11,r8
  947. adde r12,r12,r9
  948. addze r10,r10
  949. $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
  950. #mul_add_c(a[2],b[0],c3,c1,c2);
  951. $LD r6,`2*$BNSZ`(r4)
  952. $UMULL r8,r6,r7
  953. $UMULH r9,r6,r7
  954. addc r12,r12,r8
  955. adde r10,r10,r9
  956. addze r11,r0
  957. #mul_add_c(a[1],b[1],c3,c1,c2);
  958. $LD r6,`1*$BNSZ`(r4)
  959. $LD r7,`1*$BNSZ`(r5)
  960. $UMULL r8,r6,r7
  961. $UMULH r9,r6,r7
  962. addc r12,r12,r8
  963. adde r10,r10,r9
  964. addze r11,r11
  965. #mul_add_c(a[0],b[2],c3,c1,c2);
  966. $LD r6,`0*$BNSZ`(r4)
  967. $LD r7,`2*$BNSZ`(r5)
  968. $UMULL r8,r6,r7
  969. $UMULH r9,r6,r7
  970. addc r12,r12,r8
  971. adde r10,r10,r9
  972. addze r11,r11
  973. $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
  974. #mul_add_c(a[0],b[3],c1,c2,c3);
  975. $LD r7,`3*$BNSZ`(r5)
  976. $UMULL r8,r6,r7
  977. $UMULH r9,r6,r7
  978. addc r10,r10,r8
  979. adde r11,r11,r9
  980. addze r12,r0
  981. #mul_add_c(a[1],b[2],c1,c2,c3);
  982. $LD r6,`1*$BNSZ`(r4)
  983. $LD r7,`2*$BNSZ`(r5)
  984. $UMULL r8,r6,r7
  985. $UMULH r9,r6,r7
  986. addc r10,r10,r8
  987. adde r11,r11,r9
  988. addze r12,r12
  989. #mul_add_c(a[2],b[1],c1,c2,c3);
  990. $LD r6,`2*$BNSZ`(r4)
  991. $LD r7,`1*$BNSZ`(r5)
  992. $UMULL r8,r6,r7
  993. $UMULH r9,r6,r7
  994. addc r10,r10,r8
  995. adde r11,r11,r9
  996. addze r12,r12
  997. #mul_add_c(a[3],b[0],c1,c2,c3);
  998. $LD r6,`3*$BNSZ`(r4)
  999. $LD r7,`0*$BNSZ`(r5)
  1000. $UMULL r8,r6,r7
  1001. $UMULH r9,r6,r7
  1002. addc r10,r10,r8
  1003. adde r11,r11,r9
  1004. addze r12,r12
  1005. $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
  1006. #mul_add_c(a[4],b[0],c2,c3,c1);
  1007. $LD r6,`4*$BNSZ`(r4)
  1008. $UMULL r8,r6,r7
  1009. $UMULH r9,r6,r7
  1010. addc r11,r11,r8
  1011. adde r12,r12,r9
  1012. addze r10,r0
  1013. #mul_add_c(a[3],b[1],c2,c3,c1);
  1014. $LD r6,`3*$BNSZ`(r4)
  1015. $LD r7,`1*$BNSZ`(r5)
  1016. $UMULL r8,r6,r7
  1017. $UMULH r9,r6,r7
  1018. addc r11,r11,r8
  1019. adde r12,r12,r9
  1020. addze r10,r10
  1021. #mul_add_c(a[2],b[2],c2,c3,c1);
  1022. $LD r6,`2*$BNSZ`(r4)
  1023. $LD r7,`2*$BNSZ`(r5)
  1024. $UMULL r8,r6,r7
  1025. $UMULH r9,r6,r7
  1026. addc r11,r11,r8
  1027. adde r12,r12,r9
  1028. addze r10,r10
  1029. #mul_add_c(a[1],b[3],c2,c3,c1);
  1030. $LD r6,`1*$BNSZ`(r4)
  1031. $LD r7,`3*$BNSZ`(r5)
  1032. $UMULL r8,r6,r7
  1033. $UMULH r9,r6,r7
  1034. addc r11,r11,r8
  1035. adde r12,r12,r9
  1036. addze r10,r10
  1037. #mul_add_c(a[0],b[4],c2,c3,c1);
  1038. $LD r6,`0*$BNSZ`(r4)
  1039. $LD r7,`4*$BNSZ`(r5)
  1040. $UMULL r8,r6,r7
  1041. $UMULH r9,r6,r7
  1042. addc r11,r11,r8
  1043. adde r12,r12,r9
  1044. addze r10,r10
  1045. $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
  1046. #mul_add_c(a[0],b[5],c3,c1,c2);
  1047. $LD r7,`5*$BNSZ`(r5)
  1048. $UMULL r8,r6,r7
  1049. $UMULH r9,r6,r7
  1050. addc r12,r12,r8
  1051. adde r10,r10,r9
  1052. addze r11,r0
  1053. #mul_add_c(a[1],b[4],c3,c1,c2);
  1054. $LD r6,`1*$BNSZ`(r4)
  1055. $LD r7,`4*$BNSZ`(r5)
  1056. $UMULL r8,r6,r7
  1057. $UMULH r9,r6,r7
  1058. addc r12,r12,r8
  1059. adde r10,r10,r9
  1060. addze r11,r11
  1061. #mul_add_c(a[2],b[3],c3,c1,c2);
  1062. $LD r6,`2*$BNSZ`(r4)
  1063. $LD r7,`3*$BNSZ`(r5)
  1064. $UMULL r8,r6,r7
  1065. $UMULH r9,r6,r7
  1066. addc r12,r12,r8
  1067. adde r10,r10,r9
  1068. addze r11,r11
  1069. #mul_add_c(a[3],b[2],c3,c1,c2);
  1070. $LD r6,`3*$BNSZ`(r4)
  1071. $LD r7,`2*$BNSZ`(r5)
  1072. $UMULL r8,r6,r7
  1073. $UMULH r9,r6,r7
  1074. addc r12,r12,r8
  1075. adde r10,r10,r9
  1076. addze r11,r11
  1077. #mul_add_c(a[4],b[1],c3,c1,c2);
  1078. $LD r6,`4*$BNSZ`(r4)
  1079. $LD r7,`1*$BNSZ`(r5)
  1080. $UMULL r8,r6,r7
  1081. $UMULH r9,r6,r7
  1082. addc r12,r12,r8
  1083. adde r10,r10,r9
  1084. addze r11,r11
  1085. #mul_add_c(a[5],b[0],c3,c1,c2);
  1086. $LD r6,`5*$BNSZ`(r4)
  1087. $LD r7,`0*$BNSZ`(r5)
  1088. $UMULL r8,r6,r7
  1089. $UMULH r9,r6,r7
  1090. addc r12,r12,r8
  1091. adde r10,r10,r9
  1092. addze r11,r11
  1093. $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
  1094. #mul_add_c(a[6],b[0],c1,c2,c3);
  1095. $LD r6,`6*$BNSZ`(r4)
  1096. $UMULL r8,r6,r7
  1097. $UMULH r9,r6,r7
  1098. addc r10,r10,r8
  1099. adde r11,r11,r9
  1100. addze r12,r0
  1101. #mul_add_c(a[5],b[1],c1,c2,c3);
  1102. $LD r6,`5*$BNSZ`(r4)
  1103. $LD r7,`1*$BNSZ`(r5)
  1104. $UMULL r8,r6,r7
  1105. $UMULH r9,r6,r7
  1106. addc r10,r10,r8
  1107. adde r11,r11,r9
  1108. addze r12,r12
  1109. #mul_add_c(a[4],b[2],c1,c2,c3);
  1110. $LD r6,`4*$BNSZ`(r4)
  1111. $LD r7,`2*$BNSZ`(r5)
  1112. $UMULL r8,r6,r7
  1113. $UMULH r9,r6,r7
  1114. addc r10,r10,r8
  1115. adde r11,r11,r9
  1116. addze r12,r12
  1117. #mul_add_c(a[3],b[3],c1,c2,c3);
  1118. $LD r6,`3*$BNSZ`(r4)
  1119. $LD r7,`3*$BNSZ`(r5)
  1120. $UMULL r8,r6,r7
  1121. $UMULH r9,r6,r7
  1122. addc r10,r10,r8
  1123. adde r11,r11,r9
  1124. addze r12,r12
  1125. #mul_add_c(a[2],b[4],c1,c2,c3);
  1126. $LD r6,`2*$BNSZ`(r4)
  1127. $LD r7,`4*$BNSZ`(r5)
  1128. $UMULL r8,r6,r7
  1129. $UMULH r9,r6,r7
  1130. addc r10,r10,r8
  1131. adde r11,r11,r9
  1132. addze r12,r12
  1133. #mul_add_c(a[1],b[5],c1,c2,c3);
  1134. $LD r6,`1*$BNSZ`(r4)
  1135. $LD r7,`5*$BNSZ`(r5)
  1136. $UMULL r8,r6,r7
  1137. $UMULH r9,r6,r7
  1138. addc r10,r10,r8
  1139. adde r11,r11,r9
  1140. addze r12,r12
  1141. #mul_add_c(a[0],b[6],c1,c2,c3);
  1142. $LD r6,`0*$BNSZ`(r4)
  1143. $LD r7,`6*$BNSZ`(r5)
  1144. $UMULL r8,r6,r7
  1145. $UMULH r9,r6,r7
  1146. addc r10,r10,r8
  1147. adde r11,r11,r9
  1148. addze r12,r12
  1149. $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
  1150. #mul_add_c(a[0],b[7],c2,c3,c1);
  1151. $LD r7,`7*$BNSZ`(r5)
  1152. $UMULL r8,r6,r7
  1153. $UMULH r9,r6,r7
  1154. addc r11,r11,r8
  1155. adde r12,r12,r9
  1156. addze r10,r0
  1157. #mul_add_c(a[1],b[6],c2,c3,c1);
  1158. $LD r6,`1*$BNSZ`(r4)
  1159. $LD r7,`6*$BNSZ`(r5)
  1160. $UMULL r8,r6,r7
  1161. $UMULH r9,r6,r7
  1162. addc r11,r11,r8
  1163. adde r12,r12,r9
  1164. addze r10,r10
  1165. #mul_add_c(a[2],b[5],c2,c3,c1);
  1166. $LD r6,`2*$BNSZ`(r4)
  1167. $LD r7,`5*$BNSZ`(r5)
  1168. $UMULL r8,r6,r7
  1169. $UMULH r9,r6,r7
  1170. addc r11,r11,r8
  1171. adde r12,r12,r9
  1172. addze r10,r10
  1173. #mul_add_c(a[3],b[4],c2,c3,c1);
  1174. $LD r6,`3*$BNSZ`(r4)
  1175. $LD r7,`4*$BNSZ`(r5)
  1176. $UMULL r8,r6,r7
  1177. $UMULH r9,r6,r7
  1178. addc r11,r11,r8
  1179. adde r12,r12,r9
  1180. addze r10,r10
  1181. #mul_add_c(a[4],b[3],c2,c3,c1);
  1182. $LD r6,`4*$BNSZ`(r4)
  1183. $LD r7,`3*$BNSZ`(r5)
  1184. $UMULL r8,r6,r7
  1185. $UMULH r9,r6,r7
  1186. addc r11,r11,r8
  1187. adde r12,r12,r9
  1188. addze r10,r10
  1189. #mul_add_c(a[5],b[2],c2,c3,c1);
  1190. $LD r6,`5*$BNSZ`(r4)
  1191. $LD r7,`2*$BNSZ`(r5)
  1192. $UMULL r8,r6,r7
  1193. $UMULH r9,r6,r7
  1194. addc r11,r11,r8
  1195. adde r12,r12,r9
  1196. addze r10,r10
  1197. #mul_add_c(a[6],b[1],c2,c3,c1);
  1198. $LD r6,`6*$BNSZ`(r4)
  1199. $LD r7,`1*$BNSZ`(r5)
  1200. $UMULL r8,r6,r7
  1201. $UMULH r9,r6,r7
  1202. addc r11,r11,r8
  1203. adde r12,r12,r9
  1204. addze r10,r10
  1205. #mul_add_c(a[7],b[0],c2,c3,c1);
  1206. $LD r6,`7*$BNSZ`(r4)
  1207. $LD r7,`0*$BNSZ`(r5)
  1208. $UMULL r8,r6,r7
  1209. $UMULH r9,r6,r7
  1210. addc r11,r11,r8
  1211. adde r12,r12,r9
  1212. addze r10,r10
  1213. $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
  1214. #mul_add_c(a[7],b[1],c3,c1,c2);
  1215. $LD r7,`1*$BNSZ`(r5)
  1216. $UMULL r8,r6,r7
  1217. $UMULH r9,r6,r7
  1218. addc r12,r12,r8
  1219. adde r10,r10,r9
  1220. addze r11,r0
  1221. #mul_add_c(a[6],b[2],c3,c1,c2);
  1222. $LD r6,`6*$BNSZ`(r4)
  1223. $LD r7,`2*$BNSZ`(r5)
  1224. $UMULL r8,r6,r7
  1225. $UMULH r9,r6,r7
  1226. addc r12,r12,r8
  1227. adde r10,r10,r9
  1228. addze r11,r11
  1229. #mul_add_c(a[5],b[3],c3,c1,c2);
  1230. $LD r6,`5*$BNSZ`(r4)
  1231. $LD r7,`3*$BNSZ`(r5)
  1232. $UMULL r8,r6,r7
  1233. $UMULH r9,r6,r7
  1234. addc r12,r12,r8
  1235. adde r10,r10,r9
  1236. addze r11,r11
  1237. #mul_add_c(a[4],b[4],c3,c1,c2);
  1238. $LD r6,`4*$BNSZ`(r4)
  1239. $LD r7,`4*$BNSZ`(r5)
  1240. $UMULL r8,r6,r7
  1241. $UMULH r9,r6,r7
  1242. addc r12,r12,r8
  1243. adde r10,r10,r9
  1244. addze r11,r11
  1245. #mul_add_c(a[3],b[5],c3,c1,c2);
  1246. $LD r6,`3*$BNSZ`(r4)
  1247. $LD r7,`5*$BNSZ`(r5)
  1248. $UMULL r8,r6,r7
  1249. $UMULH r9,r6,r7
  1250. addc r12,r12,r8
  1251. adde r10,r10,r9
  1252. addze r11,r11
  1253. #mul_add_c(a[2],b[6],c3,c1,c2);
  1254. $LD r6,`2*$BNSZ`(r4)
  1255. $LD r7,`6*$BNSZ`(r5)
  1256. $UMULL r8,r6,r7
  1257. $UMULH r9,r6,r7
  1258. addc r12,r12,r8
  1259. adde r10,r10,r9
  1260. addze r11,r11
  1261. #mul_add_c(a[1],b[7],c3,c1,c2);
  1262. $LD r6,`1*$BNSZ`(r4)
  1263. $LD r7,`7*$BNSZ`(r5)
  1264. $UMULL r8,r6,r7
  1265. $UMULH r9,r6,r7
  1266. addc r12,r12,r8
  1267. adde r10,r10,r9
  1268. addze r11,r11
  1269. $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
  1270. #mul_add_c(a[2],b[7],c1,c2,c3);
  1271. $LD r6,`2*$BNSZ`(r4)
  1272. $UMULL r8,r6,r7
  1273. $UMULH r9,r6,r7
  1274. addc r10,r10,r8
  1275. adde r11,r11,r9
  1276. addze r12,r0
  1277. #mul_add_c(a[3],b[6],c1,c2,c3);
  1278. $LD r6,`3*$BNSZ`(r4)
  1279. $LD r7,`6*$BNSZ`(r5)
  1280. $UMULL r8,r6,r7
  1281. $UMULH r9,r6,r7
  1282. addc r10,r10,r8
  1283. adde r11,r11,r9
  1284. addze r12,r12
  1285. #mul_add_c(a[4],b[5],c1,c2,c3);
  1286. $LD r6,`4*$BNSZ`(r4)
  1287. $LD r7,`5*$BNSZ`(r5)
  1288. $UMULL r8,r6,r7
  1289. $UMULH r9,r6,r7
  1290. addc r10,r10,r8
  1291. adde r11,r11,r9
  1292. addze r12,r12
  1293. #mul_add_c(a[5],b[4],c1,c2,c3);
  1294. $LD r6,`5*$BNSZ`(r4)
  1295. $LD r7,`4*$BNSZ`(r5)
  1296. $UMULL r8,r6,r7
  1297. $UMULH r9,r6,r7
  1298. addc r10,r10,r8
  1299. adde r11,r11,r9
  1300. addze r12,r12
  1301. #mul_add_c(a[6],b[3],c1,c2,c3);
  1302. $LD r6,`6*$BNSZ`(r4)
  1303. $LD r7,`3*$BNSZ`(r5)
  1304. $UMULL r8,r6,r7
  1305. $UMULH r9,r6,r7
  1306. addc r10,r10,r8
  1307. adde r11,r11,r9
  1308. addze r12,r12
  1309. #mul_add_c(a[7],b[2],c1,c2,c3);
  1310. $LD r6,`7*$BNSZ`(r4)
  1311. $LD r7,`2*$BNSZ`(r5)
  1312. $UMULL r8,r6,r7
  1313. $UMULH r9,r6,r7
  1314. addc r10,r10,r8
  1315. adde r11,r11,r9
  1316. addze r12,r12
  1317. $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
  1318. #mul_add_c(a[7],b[3],c2,c3,c1);
  1319. $LD r7,`3*$BNSZ`(r5)
  1320. $UMULL r8,r6,r7
  1321. $UMULH r9,r6,r7
  1322. addc r11,r11,r8
  1323. adde r12,r12,r9
  1324. addze r10,r0
  1325. #mul_add_c(a[6],b[4],c2,c3,c1);
  1326. $LD r6,`6*$BNSZ`(r4)
  1327. $LD r7,`4*$BNSZ`(r5)
  1328. $UMULL r8,r6,r7
  1329. $UMULH r9,r6,r7
  1330. addc r11,r11,r8
  1331. adde r12,r12,r9
  1332. addze r10,r10
  1333. #mul_add_c(a[5],b[5],c2,c3,c1);
  1334. $LD r6,`5*$BNSZ`(r4)
  1335. $LD r7,`5*$BNSZ`(r5)
  1336. $UMULL r8,r6,r7
  1337. $UMULH r9,r6,r7
  1338. addc r11,r11,r8
  1339. adde r12,r12,r9
  1340. addze r10,r10
  1341. #mul_add_c(a[4],b[6],c2,c3,c1);
  1342. $LD r6,`4*$BNSZ`(r4)
  1343. $LD r7,`6*$BNSZ`(r5)
  1344. $UMULL r8,r6,r7
  1345. $UMULH r9,r6,r7
  1346. addc r11,r11,r8
  1347. adde r12,r12,r9
  1348. addze r10,r10
  1349. #mul_add_c(a[3],b[7],c2,c3,c1);
  1350. $LD r6,`3*$BNSZ`(r4)
  1351. $LD r7,`7*$BNSZ`(r5)
  1352. $UMULL r8,r6,r7
  1353. $UMULH r9,r6,r7
  1354. addc r11,r11,r8
  1355. adde r12,r12,r9
  1356. addze r10,r10
  1357. $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
  1358. #mul_add_c(a[4],b[7],c3,c1,c2);
  1359. $LD r6,`4*$BNSZ`(r4)
  1360. $UMULL r8,r6,r7
  1361. $UMULH r9,r6,r7
  1362. addc r12,r12,r8
  1363. adde r10,r10,r9
  1364. addze r11,r0
  1365. #mul_add_c(a[5],b[6],c3,c1,c2);
  1366. $LD r6,`5*$BNSZ`(r4)
  1367. $LD r7,`6*$BNSZ`(r5)
  1368. $UMULL r8,r6,r7
  1369. $UMULH r9,r6,r7
  1370. addc r12,r12,r8
  1371. adde r10,r10,r9
  1372. addze r11,r11
  1373. #mul_add_c(a[6],b[5],c3,c1,c2);
  1374. $LD r6,`6*$BNSZ`(r4)
  1375. $LD r7,`5*$BNSZ`(r5)
  1376. $UMULL r8,r6,r7
  1377. $UMULH r9,r6,r7
  1378. addc r12,r12,r8
  1379. adde r10,r10,r9
  1380. addze r11,r11
  1381. #mul_add_c(a[7],b[4],c3,c1,c2);
  1382. $LD r6,`7*$BNSZ`(r4)
  1383. $LD r7,`4*$BNSZ`(r5)
  1384. $UMULL r8,r6,r7
  1385. $UMULH r9,r6,r7
  1386. addc r12,r12,r8
  1387. adde r10,r10,r9
  1388. addze r11,r11
  1389. $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
  1390. #mul_add_c(a[7],b[5],c1,c2,c3);
  1391. $LD r7,`5*$BNSZ`(r5)
  1392. $UMULL r8,r6,r7
  1393. $UMULH r9,r6,r7
  1394. addc r10,r10,r8
  1395. adde r11,r11,r9
  1396. addze r12,r0
  1397. #mul_add_c(a[6],b[6],c1,c2,c3);
  1398. $LD r6,`6*$BNSZ`(r4)
  1399. $LD r7,`6*$BNSZ`(r5)
  1400. $UMULL r8,r6,r7
  1401. $UMULH r9,r6,r7
  1402. addc r10,r10,r8
  1403. adde r11,r11,r9
  1404. addze r12,r12
  1405. #mul_add_c(a[5],b[7],c1,c2,c3);
  1406. $LD r6,`5*$BNSZ`(r4)
  1407. $LD r7,`7*$BNSZ`(r5)
  1408. $UMULL r8,r6,r7
  1409. $UMULH r9,r6,r7
  1410. addc r10,r10,r8
  1411. adde r11,r11,r9
  1412. addze r12,r12
  1413. $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
  1414. #mul_add_c(a[6],b[7],c2,c3,c1);
  1415. $LD r6,`6*$BNSZ`(r4)
  1416. $UMULL r8,r6,r7
  1417. $UMULH r9,r6,r7
  1418. addc r11,r11,r8
  1419. adde r12,r12,r9
  1420. addze r10,r0
  1421. #mul_add_c(a[7],b[6],c2,c3,c1);
  1422. $LD r6,`7*$BNSZ`(r4)
  1423. $LD r7,`6*$BNSZ`(r5)
  1424. $UMULL r8,r6,r7
  1425. $UMULH r9,r6,r7
  1426. addc r11,r11,r8
  1427. adde r12,r12,r9
  1428. addze r10,r10
  1429. $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
  1430. #mul_add_c(a[7],b[7],c3,c1,c2);
  1431. $LD r7,`7*$BNSZ`(r5)
  1432. $UMULL r8,r6,r7
  1433. $UMULH r9,r6,r7
  1434. addc r12,r12,r8
  1435. adde r10,r10,r9
  1436. $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
  1437. $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
  1438. blr
  1439. .long 0
  1440. .byte 0,12,0x14,0,0,0,3,0
  1441. .long 0
  1442. #
  1443. # NOTE: The following label name should be changed to
  1444. # "bn_sub_words" i.e. remove the first dot
  1445. # for the gcc compiler. This should be automatically
  1446. # done in the build
  1447. #
  1448. #
  1449. .align 4
  1450. .bn_sub_words:
  1451. #
  1452. # Handcoded version of bn_sub_words
  1453. # Stores a[i] - b[i] - borrow into r[i] for i = 0..n-1 and returns the final borrow (0 or 1) in r3.
  1454. #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  1455. #
  1456. # r3 = r (result array; r3 also holds the return value on exit)
  1457. # r4 = a
  1458. # r5 = b
  1459. # r6 = n (word count; reused as a scratch register inside the loop)
  1460. #
  1461. # Note: No loop unrolling done since this is not a performance
  1462. # critical loop.
  1463. xor r0,r0,r0 #set r0 = 0
  1464. #
  1465. # check for r6 = 0 AND set carry bit.
  1466. #
  1467. subfc. r7,r0,r6 # If r6 is 0 then result is 0.
  1468. # if r6 > 0 then result !=0
  1469. # In either case carry bit is set.
  1470. beq Lppcasm_sub_adios # n == 0: nothing to do, falls through to return 0
  1471. addi r4,r4,-$BNSZ # back each pointer up one word; the update-form
  1472. addi r3,r3,-$BNSZ # accesses in the loop (LDU/STU, presumably defined
  1473. addi r5,r5,-$BNSZ # earlier in this file) advance before accessing
  1474. mtctr r6 # CTR = n, consumed by bdnz below
  1475. Lppcasm_sub_mainloop:
  1476. $LDU r7,$BNSZ(r4) # r7 = next word of a
  1477. $LDU r8,$BNSZ(r5) # r8 = next word of b
  1478. subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
  1479. # if carry = 1 this is r7-r8. Else it
  1480. # is r7-r8 -1 as we need.
  1481. $STU r6,$BNSZ(r3) # store the difference word into r
  1482. bdnz Lppcasm_sub_mainloop # nothing in the loop other than subfe alters the carry bit
  1483. Lppcasm_sub_adios:
  1484. subfze r3,r0 # if carry bit is set then r3 = 0 else -1
  1485. andi. r3,r3,1 # keep only last bit.
  1486. blr
  1487. .long 0
  1488. .byte 0,12,0x14,0,0,0,4,0 # presumably an AIX-style traceback table; the 4 appears to be the argument count - confirm
  1489. .long 0
  1490. #
  1491. # NOTE: The following label name should be changed to
  1492. # "bn_add_words" i.e. remove the first dot
  1493. # for the gcc compiler. This should be automatically
  1494. # done in the build
  1495. #
  1496. .align 4
  1497. .bn_add_words:
  1498. #
  1499. # Handcoded version of bn_add_words
  1500. # Stores a[i] + b[i] + carry into r[i] for i = 0..n-1 and returns the final carry (0 or 1) in r3.
  1501. #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  1502. #
  1503. # r3 = r (result array; r3 also holds the return value on exit)
  1504. # r4 = a
  1505. # r5 = b
  1506. # r6 = n (word count)
  1507. #
  1508. # Note: No loop unrolling done since this is not a performance
  1509. # critical loop.
  1510. xor r0,r0,r0 # r0 = 0, used by the final addze
  1511. #
  1512. # check for r6 = 0. It is needed: bdnz decrements CTR before testing it, so a loop entered with CTR = 0 would not stop after zero iterations.
  1513. #
  1514. addic. r6,r6,0 #test r6 and clear carry bit.
  1515. beq Lppcasm_add_adios # n == 0: return 0 (carry bit is clear)
  1516. addi r4,r4,-$BNSZ # back each pointer up one word; the update-form
  1517. addi r3,r3,-$BNSZ # accesses in the loop (LDU/STU, presumably defined
  1518. addi r5,r5,-$BNSZ # earlier in this file) advance before accessing
  1519. mtctr r6 # CTR = n, consumed by bdnz below
  1520. Lppcasm_add_mainloop:
  1521. $LDU r7,$BNSZ(r4) # r7 = next word of a
  1522. $LDU r8,$BNSZ(r5) # r8 = next word of b
  1523. adde r8,r7,r8 # r8 = r7 + r8 + carry bit; carry out feeds the next iteration
  1524. $STU r8,$BNSZ(r3) # store the sum word into r
  1525. bdnz Lppcasm_add_mainloop
  1526. Lppcasm_add_adios:
  1527. addze r3,r0 #return carry bit.
  1528. blr
  1529. .long 0
  1530. .byte 0,12,0x14,0,0,0,4,0 # presumably an AIX-style traceback table; the 4 appears to be the argument count - confirm
  1531. .long 0
  1532. #
  1533. # NOTE: The following label name should be changed to
  1534. # "bn_div_words" i.e. remove the first dot
  1535. # for the gcc compiler. This should be automatically
  1536. # done in the build
  1537. #
  1538. .align 4
  1539. .bn_div_words:
  1540. # Divides the two-word value h:l by d. The quotient is produced half a word per outer-loop pass (two passes) and returned in r3; -1 is returned when d is 0.
  1541. # This is a cleaned up version of code generated by
  1542. # the AIX compiler. The only optimization is to use
  1543. # the PPC instruction to count leading zeros instead
  1544. # of call to num_bits_word. Since this was compiled
  1545. # only at level -O2 we can possibly squeeze it more?
  1546. #
  1547. # r3 = h (high word of the dividend; also the return register)
  1548. # r4 = l (low word of the dividend)
  1549. # r5 = d (divisor)
  1550. $UCMPI 0,r5,0 # compare r5 and 0
  1551. bne Lppcasm_div1 # proceed if d!=0
  1552. li r3,-1 # d=0 return -1
  1553. blr
  1554. Lppcasm_div1:
  1555. xor r0,r0,r0 #r0=0
  1556. li r8,$BITS
  1557. $CNTLZ. r7,r5 #r7 = num leading 0s in d.
  1558. beq Lppcasm_div2 #proceed if no leading zeros
  1559. subf r8,r7,r8 #r8 = BN_num_bits_word(d)
  1560. $SHR. r9,r3,r8 #are there any bits above r8'th?
  1561. $TR 16,r9,r0 #if there're, signal to dump core... NOTE(review): TO=16 selects signed less-than, and r9 is h shifted right by at least 1, so this trap looks unreachable - confirm intent
  1562. Lppcasm_div2:
  1563. $UCMP 0,r3,r5 #h>=d?
  1564. blt Lppcasm_div3 #goto Lppcasm_div3 if not
  1565. subf r3,r5,r3 #h-=d ;
  1566. Lppcasm_div3: #r7 = i, the leading-zero count of d (normalization shift); r8 = BN_BITS2-i
  1567. cmpi 0,0,r7,0 # is (i == 0)?
  1568. beq Lppcasm_div4
  1569. $SHL r3,r3,r7 # h = (h<< i)
  1570. $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
  1571. $SHL r5,r5,r7 # d<<=i
  1572. or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
  1573. $SHL r4,r4,r7 # l <<=i
  1574. Lppcasm_div4:
  1575. $SHRI r9,r5,`$BITS/2` # r9 = dh
  1576. # dl will be computed when needed
  1577. # as it saves registers.
  1578. li r6,2 #r6=2
  1579. mtctr r6 #counter will be in count.
  1580. Lppcasm_divouterloop:
  1581. $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
  1582. $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
  1583. # compute here for innerloop.
  1584. $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
  1585. bne Lppcasm_div5 # goto Lppcasm_div5 if not
  1586. li r8,-1
  1587. $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
  1588. b Lppcasm_div6
  1589. Lppcasm_div5:
  1590. $UDIV r8,r3,r9 #q = h/dh
  1591. Lppcasm_div6:
  1592. $UMULL r12,r9,r8 #th = q*dh
  1593. $CLRU r10,r5,`$BITS/2` #r10=dl
  1594. $UMULL r6,r8,r10 #tl = q*dl
  1595. Lppcasm_divinnerloop: # correction loop: lowers the estimate q while it is too large
  1596. subf r10,r12,r3 #t = h -th
  1597. $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
  1598. addic. r7,r7,0 #test if r7 == 0. used below.
  1599. # now want to compute
  1600. # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
  1601. # the following 2 instructions do that
  1602. $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
  1603. or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
  1604. $UCMP cr1,r6,r7 # compare (tl <= r7)
  1605. bne Lppcasm_divinnerexit # tests CR0 from the addic. above: exit if the upper half of t is non-zero
  1606. ble cr1,Lppcasm_divinnerexit # exit if tl <= r7 (CR1 from the compare above)
  1607. addi r8,r8,-1 #q--
  1608. subf r12,r9,r12 #th -=dh
  1609. $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
  1610. subf r6,r10,r6 #tl -=dl
  1611. b Lppcasm_divinnerloop
  1612. Lppcasm_divinnerexit:
  1613. $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
  1614. $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
  1615. $UCMP cr1,r4,r11 # compare l and tl
  1616. add r12,r12,r10 # th+=t
  1617. bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
  1618. addi r12,r12,1 # th++
  1619. Lppcasm_div7:
  1620. subf r11,r11,r4 #r11=l-tl
  1621. $UCMP cr1,r3,r12 #compare h and th
  1622. bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
  1623. addi r8,r8,-1 # q--
  1624. add r3,r5,r3 # h+=d
  1625. Lppcasm_div8:
  1626. subf r12,r12,r3 #r12 = h-th
  1627. $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
  1628. # want to compute
  1629. # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
  1630. # the following 2 instructions will do this.
  1631. $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
  1632. $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
  1633. bdz Lppcasm_div9 #if (count==0) break ;
  1634. $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
  1635. b Lppcasm_divouterloop
  1636. Lppcasm_div9:
  1637. or r3,r8,r0 # return value = (first-pass q shifted into the upper half) | second-pass q
  1638. blr
  1639. .long 0
  1640. .byte 0,12,0x14,0,0,0,3,0 # presumably an AIX-style traceback table; the 3 appears to be the argument count - confirm
  1641. .long 0
  1642. #
  1643. # NOTE: The following label name should be changed to
  1644. # "bn_sqr_words" i.e. remove the first dot
  1645. # for the gcc compiler. This should be automatically
  1646. # done in the build
  1647. #
  1648. .align 4
  1649. .bn_sqr_words:
  1650. #
  1651. # Optimized version of bn_sqr_words
  1652. # For each i in 0..n-1 writes the two-word square of a[i] to r[2*i] and r[2*i+1]; nothing is returned.
  1653. # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
  1654. #
  1655. # r3 = r (receives 2*n words)
  1656. # r4 = a
  1657. # r5 = n
  1658. #
  1659. # r6 = a[i].
  1660. # r7,r8 = product.
  1661. #
  1662. # No unrolling done here. Not performance critical.
  1663. addic. r5,r5,0 #test r5.
  1664. beq Lppcasm_sqr_adios # n == 0: return without touching r
  1665. addi r4,r4,-$BNSZ # back both pointers up one word for the
  1666. addi r3,r3,-$BNSZ # update-form accesses (LDU/STU) in the loop
  1667. mtctr r5 # CTR = n, consumed by bdnz below
  1668. Lppcasm_sqr_mainloop:
  1669. #sqr(r[0],r[1],a[0]);
  1670. $LDU r6,$BNSZ(r4) # r6 = next word of a
  1671. $UMULL r7,r6,r6 # r7 = UMULL result (low word, going by the macro name)
  1672. $UMULH r8,r6,r6 # r8 = UMULH result (high word, going by the macro name)
  1673. $STU r7,$BNSZ(r3) # r7 is stored first ...
  1674. $STU r8,$BNSZ(r3) # ... then r8 in the following word
  1675. bdnz Lppcasm_sqr_mainloop
  1676. Lppcasm_sqr_adios:
  1677. blr
  1678. .long 0
  1679. .byte 0,12,0x14,0,0,0,3,0 # presumably an AIX-style traceback table; the 3 appears to be the argument count - confirm
  1680. .long 0
  1681. #
  1682. # NOTE: The following label name should be changed to
  1683. # "bn_mul_words" i.e. remove the first dot
  1684. # for the gcc compiler. This should be automatically
  1685. # done in the build
  1686. #
  1687. .align 4
  1688. .bn_mul_words:
  1689. # Stores the low word of ap[i]*w + carry into rp[i] for i = 0..num-1; the final carry word is returned in r3.
  1690. # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  1691. #
  1692. # r3 = rp (result array; r3 also holds the return value on exit)
  1693. # r4 = ap
  1694. # r5 = num
  1695. # r6 = w
  1696. xor r0,r0,r0
  1697. xor r12,r12,r12 # used for carry
  1698. rlwinm. r7,r5,30,2,31 # num >> 2
  1699. beq Lppcasm_mw_REM # fewer than 4 words: only the remainder code runs
  1700. mtctr r7 # CTR = number of 4-word groups
  1701. Lppcasm_mw_LOOP:
  1702. #mul(rp[0],ap[0],w,c1);
  1703. $LD r8,`0*$BNSZ`(r4)
  1704. $UMULL r9,r6,r8
  1705. $UMULH r10,r6,r8
  1706. addc r9,r9,r12
  1707. #addze r10,r10 #carry is NOT ignored.
  1708. #will be taken care of
  1709. #in second spin below
  1710. #using adde.
  1711. $ST r9,`0*$BNSZ`(r3)
  1712. #mul(rp[1],ap[1],w,c1);
  1713. $LD r8,`1*$BNSZ`(r4)
  1714. $UMULL r11,r6,r8
  1715. $UMULH r12,r6,r8
  1716. adde r11,r11,r10
  1717. #addze r12,r12
  1718. $ST r11,`1*$BNSZ`(r3)
  1719. #mul(rp[2],ap[2],w,c1);
  1720. $LD r8,`2*$BNSZ`(r4)
  1721. $UMULL r9,r6,r8
  1722. $UMULH r10,r6,r8
  1723. adde r9,r9,r12
  1724. #addze r10,r10
  1725. $ST r9,`2*$BNSZ`(r3)
  1726. #mul(rp[3],ap[3],w,c1);
  1727. $LD r8,`3*$BNSZ`(r4)
  1728. $UMULL r11,r6,r8
  1729. $UMULH r12,r6,r8
  1730. adde r11,r11,r10
  1731. addze r12,r12 #this spin we collect carry into
  1732. #r12
  1733. $ST r11,`3*$BNSZ`(r3)
  1734. addi r3,r3,`4*$BNSZ` # advance rp by four words
  1735. addi r4,r4,`4*$BNSZ` # advance ap by four words
  1736. bdnz Lppcasm_mw_LOOP
  1737. Lppcasm_mw_REM:
  1738. andi. r5,r5,0x3 # r5 = num mod 4, the words still to do (0..3)
  1739. beq Lppcasm_mw_OVER
  1740. #mul(rp[0],ap[0],w,c1);
  1741. $LD r8,`0*$BNSZ`(r4)
  1742. $UMULL r9,r6,r8
  1743. $UMULH r10,r6,r8
  1744. addc r9,r9,r12
  1745. addze r10,r10
  1746. $ST r9,`0*$BNSZ`(r3)
  1747. addi r12,r10,0 # r12 = r10: carry word for the next step (or the return value)
  1748. addi r5,r5,-1
  1749. cmpli 0,0,r5,0
  1750. beq Lppcasm_mw_OVER
  1751. #mul(rp[1],ap[1],w,c1);
  1752. $LD r8,`1*$BNSZ`(r4)
  1753. $UMULL r9,r6,r8
  1754. $UMULH r10,r6,r8
  1755. addc r9,r9,r12
  1756. addze r10,r10
  1757. $ST r9,`1*$BNSZ`(r3)
  1758. addi r12,r10,0 # r12 = r10: carry word for the next step (or the return value)
  1759. addi r5,r5,-1
  1760. cmpli 0,0,r5,0
  1761. beq Lppcasm_mw_OVER
  1762. #mul(rp[2],ap[2],w,c1);
  1763. $LD r8,`2*$BNSZ`(r4)
  1764. $UMULL r9,r6,r8
  1765. $UMULH r10,r6,r8
  1766. addc r9,r9,r12
  1767. addze r10,r10
  1768. $ST r9,`2*$BNSZ`(r3)
  1769. addi r12,r10,0 # r12 = r10: final carry word
  1770. Lppcasm_mw_OVER:
  1771. addi r3,r12,0 # return the carry word
  1772. blr
  1773. .long 0
  1774. .byte 0,12,0x14,0,0,0,4,0 # presumably an AIX-style traceback table; the 4 appears to be the argument count - confirm
  1775. .long 0
  1776. #
  1777. # NOTE: The following label name should be changed to
  1778. # "bn_mul_add_words" i.e. remove the first dot
  1779. # for the gcc compiler. This should be automatically
  1780. # done in the build
  1781. #
  1782. .align 4
  1783. .bn_mul_add_words:
  1784. # Adds ap[i]*w plus the running carry into rp[i] for i = 0..num-1; the final carry word is returned in r3.
  1785. # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  1786. #
  1787. # r3 = rp (read and updated in place; r3 also holds the return value on exit)
  1788. # r4 = ap
  1789. # r5 = num
  1790. # r6 = w
  1791. #
  1792. # empirical evidence suggests that unrolled version performs best!!
  1793. #
  1794. xor r0,r0,r0 #r0 = 0
  1795. xor r12,r12,r12 #r12 = 0 . used for carry
  1796. rlwinm. r7,r5,30,2,31 # num >> 2
  1797. beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
  1798. mtctr r7 # CTR = number of 4-word groups
  1799. Lppcasm_maw_mainloop:
  1800. #mul_add(rp[0],ap[0],w,c1);
  1801. $LD r8,`0*$BNSZ`(r4)
  1802. $LD r11,`0*$BNSZ`(r3)
  1803. $UMULL r9,r6,r8
  1804. $UMULH r10,r6,r8
  1805. addc r9,r9,r12 #r12 is carry.
  1806. addze r10,r10
  1807. addc r9,r9,r11
  1808. #addze r10,r10
  1809. #the above instruction addze
  1810. #is NOT needed. Carry will NOT
  1811. #be ignored. It's not affected
  1812. #by multiply and will be collected
  1813. #in the next spin
  1814. $ST r9,`0*$BNSZ`(r3)
  1815. #mul_add(rp[1],ap[1],w,c1);
  1816. $LD r8,`1*$BNSZ`(r4)
  1817. $LD r9,`1*$BNSZ`(r3)
  1818. $UMULL r11,r6,r8
  1819. $UMULH r12,r6,r8
  1820. adde r11,r11,r10 #r10 is carry.
  1821. addze r12,r12
  1822. addc r11,r11,r9
  1823. #addze r12,r12
  1824. $ST r11,`1*$BNSZ`(r3)
  1825. #mul_add(rp[2],ap[2],w,c1);
  1826. $LD r8,`2*$BNSZ`(r4)
  1827. $UMULL r9,r6,r8
  1828. $LD r11,`2*$BNSZ`(r3)
  1829. $UMULH r10,r6,r8
  1830. adde r9,r9,r12
  1831. addze r10,r10
  1832. addc r9,r9,r11
  1833. #addze r10,r10
  1834. $ST r9,`2*$BNSZ`(r3)
  1835. #mul_add(rp[3],ap[3],w,c1);
  1836. $LD r8,`3*$BNSZ`(r4)
  1837. $UMULL r11,r6,r8
  1838. $LD r9,`3*$BNSZ`(r3)
  1839. $UMULH r12,r6,r8
  1840. adde r11,r11,r10
  1841. addze r12,r12
  1842. addc r11,r11,r9
  1843. addze r12,r12 # last step of the group: fold the pending carry bit into r12 so the next spin can start with addc
  1844. $ST r11,`3*$BNSZ`(r3)
  1845. addi r3,r3,`4*$BNSZ` # advance rp by four words
  1846. addi r4,r4,`4*$BNSZ` # advance ap by four words
  1847. bdnz- Lppcasm_maw_mainloop
  1848. Lppcasm_maw_leftover:
  1849. andi. r5,r5,0x3 # r5 = num mod 4, the words still to do (0..3)
  1850. beq Lppcasm_maw_adios
  1851. addi r3,r3,-$BNSZ # back both pointers up one word for the
  1852. addi r4,r4,-$BNSZ # update-form loads (LDU) used below
  1853. #mul_add(rp[0],ap[0],w,c1);
  1854. mtctr r5 # CTR = leftover count, tested by bdz after each step
  1855. $LDU r8,$BNSZ(r4)
  1856. $UMULL r9,r6,r8
  1857. $UMULH r10,r6,r8
  1858. $LDU r11,$BNSZ(r3)
  1859. addc r9,r9,r11
  1860. addze r10,r10
  1861. addc r9,r9,r12
  1862. addze r12,r10 # r12 = r10 + carry bit: carry word for the next step (or the return value)
  1863. $ST r9,0(r3) # r3 was already advanced by the LDU above
  1864. bdz Lppcasm_maw_adios
  1865. #mul_add(rp[1],ap[1],w,c1);
  1866. $LDU r8,$BNSZ(r4)
  1867. $UMULL r9,r6,r8
  1868. $UMULH r10,r6,r8
  1869. $LDU r11,$BNSZ(r3)
  1870. addc r9,r9,r11
  1871. addze r10,r10
  1872. addc r9,r9,r12
  1873. addze r12,r10 # r12 = r10 + carry bit: carry word for the next step (or the return value)
  1874. $ST r9,0(r3)
  1875. bdz Lppcasm_maw_adios
  1876. #mul_add(rp[2],ap[2],w,c1);
  1877. $LDU r8,$BNSZ(r4)
  1878. $UMULL r9,r6,r8
  1879. $UMULH r10,r6,r8
  1880. $LDU r11,$BNSZ(r3)
  1881. addc r9,r9,r11
  1882. addze r10,r10
  1883. addc r9,r9,r12
  1884. addze r12,r10 # r12 = r10 + carry bit: final carry word
  1885. $ST r9,0(r3)
  1886. Lppcasm_maw_adios:
  1887. addi r3,r12,0 # return the carry word
  1888. blr
  1889. .long 0
  1890. .byte 0,12,0x14,0,0,0,4,0 # presumably an AIX-style traceback table; the 4 appears to be the argument count - confirm
  1891. .long 0
  1892. .align 4
  1893. EOF
  1894. $data =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backtick-quoted span in the template with the value of that Perl expression
  1895. print $data or die "error writing to STDOUT: $!"; # emit the generated assembly; stop on a failed write
  1896. close STDOUT or die "error closing STDOUT: $!"; # buffered write errors (e.g. disk full) only surface at close; fail instead of leaving a truncated file