  1. #! /usr/bin/env perl
  2. # Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # Implemented as a Perl wrapper as we want to support several different
  9. # architectures with a single file. We pick up the target based on the
  10. # file name we are asked to generate.
  11. #
  12. # It should be noted though that this perl code is nothing like
  13. # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
  14. # as pre-processor to cover for platform differences in name decoration,
  15. # linker tables, 32-/64-bit instruction sets...
  16. #
  17. # As you might know, there are several PowerPC ABIs in use. Most notably,
  18. # Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
  19. # are similar enough to implement leaf(!) functions, which would be ABI
  20. # neutral. And that's what you find here: ABI-neutral leaf functions.
  21. # In case you wonder what that is...
  22. #
  23. # AIX performance
  24. #
  25. # MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
  26. #
  27. # The following is the performance of 32-bit compiler
  28. # generated code:
  29. #
  30. # OpenSSL 0.9.6c 21 dec 2001
  31. # built on: Tue Jun 11 11:06:51 EDT 2002
  32. # options:bn(64,32) ...
  33. #compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
  34. # sign verify sign/s verify/s
  35. #rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
  36. #rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
  37. #rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
  38. #rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
  39. #dsa 512 bits 0.0087s 0.0106s 114.3 94.5
  40. #dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
  41. #
  42. # Same benchmark with this assembler code:
  43. #
  44. #rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
  45. #rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
  46. #rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
  47. #rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
  48. #dsa 512 bits 0.0052s 0.0062s 191.6 162.0
  49. #dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
  50. #
  51. # Number of operations increases by almost 75%
  52. #
  53. # Here are performance numbers for 64-bit compiler
  54. # generated code:
  55. #
  56. # OpenSSL 0.9.6g [engine] 9 Aug 2002
  57. # built on: Fri Apr 18 16:59:20 EDT 2003
  58. # options:bn(64,64) ...
  59. # compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
  60. # sign verify sign/s verify/s
  61. #rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
  62. #rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
  63. #rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
  64. #rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
  65. #dsa 512 bits 0.0026s 0.0032s 382.5 313.7
  66. #dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
  67. #
  68. # Same benchmark with this assembler code:
  69. #
  70. #rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
  71. #rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
  72. #rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
  73. #rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
  74. #dsa 512 bits 0.0016s 0.0020s 610.7 507.1
  75. #dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
  76. #
  77. # Again, performance increases by about 75%
  78. #
  79. # Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
  80. # OpenSSL 0.9.7c 30 Sep 2003
  81. #
  82. # Original code.
  83. #
  84. #rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
  85. #rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
  86. #rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
  87. #rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
  88. #dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
  89. #dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
  90. #dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
  91. #
  92. # Same benchmark with this assembler code:
  93. #
  94. #rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
  95. #rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
  96. #rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
  97. #rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
  98. #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
  99. #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
  100. #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
  101. #
  102. # Performance increase of ~60%
  103. # Based on submission from Suresh N. Chari of IBM
  104. # $output is the last argument if it looks like a file (it has an extension)
  105. # $flavour is the first argument if it doesn't look like a file
  106. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  107. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
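# (Illustrative note, not part of the original: this wrapper only checks
#  whether $flavour contains "32" or "64"; the full flavour string and the
#  output path are interpreted by ppc-xlate.pl and the build system. A
#  hypothetical manual invocation would look like:
#      perl ppc.pl linux32 bn-ppc.s
#  with the flavour name here purely illustrative.)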
  108. if ($flavour =~ /32/) {
  109. $BITS= 32;
  110. $BNSZ= $BITS/8;
  111. $ISA= "\"ppc\"";
  112. $LD= "lwz"; # load
  113. $LDU= "lwzu"; # load and update
  114. $ST= "stw"; # store
  115. $STU= "stwu"; # store and update
  116. $UMULL= "mullw"; # unsigned multiply low
  117. $UMULH= "mulhwu"; # unsigned multiply high
  118. $UDIV= "divwu"; # unsigned divide
  119. $UCMPI= "cmplwi"; # unsigned compare with immediate
  120. $UCMP= "cmplw"; # unsigned compare
  121. $CNTLZ= "cntlzw"; # count leading zeros
  122. $SHL= "slw"; # shift left
  123. $SHR= "srw"; # unsigned shift right
  124. $SHRI= "srwi"; # unsigned shift right by immediate
  125. $SHLI= "slwi"; # shift left by immediate
  126. $CLRU= "clrlwi"; # clear upper bits
  127. $INSR= "insrwi"; # insert right
  128. $ROTL= "rotlwi"; # rotate left by immediate
  129. $TR= "tw"; # conditional trap
  130. } elsif ($flavour =~ /64/) {
  131. $BITS= 64;
  132. $BNSZ= $BITS/8;
  133. $ISA= "\"ppc64\"";
  134. # same as above, but 64-bit mnemonics...
  135. $LD= "ld"; # load
  136. $LDU= "ldu"; # load and update
  137. $ST= "std"; # store
  138. $STU= "stdu"; # store and update
  139. $UMULL= "mulld"; # unsigned multiply low
  140. $UMULH= "mulhdu"; # unsigned multiply high
  141. $UDIV= "divdu"; # unsigned divide
  142. $UCMPI= "cmpldi"; # unsigned compare with immediate
  143. $UCMP= "cmpld"; # unsigned compare
  144. $CNTLZ= "cntlzd"; # count leading zeros
  145. $SHL= "sld"; # shift left
  146. $SHR= "srd"; # unsigned shift right
  147. $SHRI= "srdi"; # unsigned shift right by immediate
  148. $SHLI= "sldi"; # shift left by immediate
  149. $CLRU= "clrldi"; # clear upper bits
  150. $INSR= "insrdi"; # insert right
  151. $ROTL= "rotldi"; # rotate left by immediate
  152. $TR= "td"; # conditional trap
  153. } else { die "nonsense $flavour"; }
  154. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  155. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  156. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  157. die "can't locate ppc-xlate.pl";
  158. open STDOUT,"| $^X $xlate $flavour \"$output\""
  159. or die "can't call $xlate: $!";
  160. $data=<<EOF;
  161. #--------------------------------------------------------------------
  162. #
  163. #
  164. #
  165. #
  166. # File: ppc32.s
  167. #
  168. # Created by: Suresh Chari
  169. # IBM Thomas J. Watson Research Library
  170. # Hawthorne, NY
  171. #
  172. #
  173. # Description: Optimized assembly routines for OpenSSL crypto
  174. # on the 32-bit PowerPC platform.
  175. #
  176. #
  177. # Version History
  178. #
  179. # 2. Fixed bn_add, bn_sub and bn_div_words, added comments,
  180. # cleaned up code. Also made a single version which can
  181. # be used for both the AIX and Linux compilers. See NOTE
  182. # below.
  183. # 12/05/03 Suresh Chari
  184. # (with lots of help from) Andy Polyakov
  185. ##
  186. # 1. Initial version 10/20/02 Suresh Chari
  187. #
  188. #
  189. # The following file works for the xlc, cc
  190. # and gcc compilers.
  191. #
  192. # NOTE: To get the file to link correctly with the gcc compiler
  193. # you have to change the names of the routines and remove
  194. # the first .(dot) character. This should automatically
  195. # be done in the build process.
  196. #
  197. # Hand optimized assembly code for the following routines
  198. #
  199. # bn_sqr_comba4
  200. # bn_sqr_comba8
  201. # bn_mul_comba4
  202. # bn_mul_comba8
  203. # bn_sub_words
  204. # bn_add_words
  205. # bn_div_words
  206. # bn_sqr_words
  207. # bn_mul_words
  208. # bn_mul_add_words
  209. #
  210. # NOTE: It is possible to optimize this code more for
  211. # specific PowerPC or Power architectures. On the Northstar
  212. # architecture the optimizations in this file do
  213. # NOT provide much improvement.
  214. #
  215. # If you have comments or suggestions to improve the code, send
  216. # me a note at schari\@us.ibm.com
  217. #
  218. #--------------------------------------------------------------------------
  219. #
  220. # Defines to be used in the assembly code.
  221. #
  222. #.set r0,0 # we use it as storage for value of 0
  223. #.set SP,1 # preserved
  224. #.set RTOC,2 # preserved
  225. #.set r3,3 # 1st argument/return value
  226. #.set r4,4 # 2nd argument/volatile register
  227. #.set r5,5 # 3rd argument/volatile register
  228. #.set r6,6 # ...
  229. #.set r7,7
  230. #.set r8,8
  231. #.set r9,9
  232. #.set r10,10
  233. #.set r11,11
  234. #.set r12,12
  235. #.set r13,13 # not used, nor any other "below" it...
  236. # Declare function names to be global
  237. # NOTE: For gcc these names MUST be changed to remove
  238. # the first . i.e. for example change ".bn_sqr_comba4"
  239. # to "bn_sqr_comba4". This should be automatically done
  240. # in the build.
  241. .globl .bn_sqr_comba4
  242. .globl .bn_sqr_comba8
  243. .globl .bn_mul_comba4
  244. .globl .bn_mul_comba8
  245. .globl .bn_sub_words
  246. .globl .bn_add_words
  247. .globl .bn_div_words
  248. .globl .bn_sqr_words
  249. .globl .bn_mul_words
  250. .globl .bn_mul_add_words
  251. # .text section
  252. .machine "any"
  253. .text
  254. #
  255. # NOTE: The following label name should be changed to
  256. # "bn_sqr_comba4" i.e. remove the first dot
  257. # for the gcc compiler. This should be automatically
  258. # done in the build
  259. #
  260. .align 4
  261. .bn_sqr_comba4:
  262. #
  263. # Optimized version of bn_sqr_comba4.
  264. #
  265. # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
  266. # r3 contains r
  267. # r4 contains a
  268. #
  269. # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
  270. #
  271. # r5,r6 are the two BN_ULONGs being multiplied.
  272. # r7,r8 hold the result of the 32x32-bit multiply giving a 64-bit product.
  273. # r9,r10, r11 are the equivalents of c1,c2, c3.
  274. # Here's the assembly
  275. #
  276. #
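# For orientation, a rough C-level sketch of the macros named in the comments
# below (approximate, not the exact OpenSSL definitions; the three trailing
# arguments are the low, middle and high accumulator words and rotate from
# one call to the next):
#   sqr_add_c(a,i,lo,mid,hi)    : (hi,mid,lo) += a[i]*a[i]
#   sqr_add_c2(a,i,j,lo,mid,hi) : (hi,mid,lo) += 2*a[i]*a[j]
#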
  277. xor r0,r0,r0 # set r0 = 0. Used in the addze
  278. # instructions below
  279. #sqr_add_c(a,0,c1,c2,c3)
  280. $LD r5,`0*$BNSZ`(r4)
  281. $UMULL r9,r5,r5
  282. $UMULH r10,r5,r5 #in first iteration. No need
  283. #to add since c1=c2=c3=0.
  284. # Note c3(r11) is NOT set to 0
  285. # but will be.
  286. $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
  287. # sqr_add_c2(a,1,0,c2,c3,c1);
  288. $LD r6,`1*$BNSZ`(r4)
  289. $UMULL r7,r5,r6
  290. $UMULH r8,r5,r6
  291. addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
  292. adde r8,r8,r8
  293. addze r9,r0 # catch carry if any.
  294. # r9= r0(=0) and carry
  295. addc r10,r7,r10 # now add to temp result.
  296. addze r11,r8 # r8 added to r11 which is 0
  297. addze r9,r9
  298. $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
  299. #sqr_add_c(a,1,c3,c1,c2)
  300. $UMULL r7,r6,r6
  301. $UMULH r8,r6,r6
  302. addc r11,r7,r11
  303. adde r9,r8,r9
  304. addze r10,r0
  305. #sqr_add_c2(a,2,0,c3,c1,c2)
  306. $LD r6,`2*$BNSZ`(r4)
  307. $UMULL r7,r5,r6
  308. $UMULH r8,r5,r6
  309. addc r7,r7,r7
  310. adde r8,r8,r8
  311. addze r10,r10
  312. addc r11,r7,r11
  313. adde r9,r8,r9
  314. addze r10,r10
  315. $ST r11,`2*$BNSZ`(r3) #r[2]=c3
  316. #sqr_add_c2(a,3,0,c1,c2,c3);
  317. $LD r6,`3*$BNSZ`(r4)
  318. $UMULL r7,r5,r6
  319. $UMULH r8,r5,r6
  320. addc r7,r7,r7
  321. adde r8,r8,r8
  322. addze r11,r0
  323. addc r9,r7,r9
  324. adde r10,r8,r10
  325. addze r11,r11
  326. #sqr_add_c2(a,2,1,c1,c2,c3);
  327. $LD r5,`1*$BNSZ`(r4)
  328. $LD r6,`2*$BNSZ`(r4)
  329. $UMULL r7,r5,r6
  330. $UMULH r8,r5,r6
  331. addc r7,r7,r7
  332. adde r8,r8,r8
  333. addze r11,r11
  334. addc r9,r7,r9
  335. adde r10,r8,r10
  336. addze r11,r11
  337. $ST r9,`3*$BNSZ`(r3) #r[3]=c1
  338. #sqr_add_c(a,2,c2,c3,c1);
  339. $UMULL r7,r6,r6
  340. $UMULH r8,r6,r6
  341. addc r10,r7,r10
  342. adde r11,r8,r11
  343. addze r9,r0
  344. #sqr_add_c2(a,3,1,c2,c3,c1);
  345. $LD r6,`3*$BNSZ`(r4)
  346. $UMULL r7,r5,r6
  347. $UMULH r8,r5,r6
  348. addc r7,r7,r7
  349. adde r8,r8,r8
  350. addze r9,r9
  351. addc r10,r7,r10
  352. adde r11,r8,r11
  353. addze r9,r9
  354. $ST r10,`4*$BNSZ`(r3) #r[4]=c2
  355. #sqr_add_c2(a,3,2,c3,c1,c2);
  356. $LD r5,`2*$BNSZ`(r4)
  357. $UMULL r7,r5,r6
  358. $UMULH r8,r5,r6
  359. addc r7,r7,r7
  360. adde r8,r8,r8
  361. addze r10,r0
  362. addc r11,r7,r11
  363. adde r9,r8,r9
  364. addze r10,r10
  365. $ST r11,`5*$BNSZ`(r3) #r[5] = c3
  366. #sqr_add_c(a,3,c1,c2,c3);
  367. $UMULL r7,r6,r6
  368. $UMULH r8,r6,r6
  369. addc r9,r7,r9
  370. adde r10,r8,r10
  371. $ST r9,`6*$BNSZ`(r3) #r[6]=c1
  372. $ST r10,`7*$BNSZ`(r3) #r[7]=c2
  373. blr
  374. .long 0
  375. .byte 0,12,0x14,0,0,0,2,0
  376. .long 0
  377. .size .bn_sqr_comba4,.-.bn_sqr_comba4
  378. #
  379. # NOTE: The following label name should be changed to
  380. # "bn_sqr_comba8" i.e. remove the first dot
  381. # for the gcc compiler. This should be automatically
  382. # done in the build
  383. #
  384. .align 4
  385. .bn_sqr_comba8:
  386. #
  387. # This is an optimized version of the bn_sqr_comba8 routine.
  388. # Tightly uses the adde instruction
  389. #
  390. #
  391. # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
  392. # r3 contains r
  393. # r4 contains a
  394. #
  395. # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
  396. #
  397. # r5,r6 are the two BN_ULONGs being multiplied.
  398. # r7,r8 hold the result of the 32x32-bit multiply giving a 64-bit product.
  399. # r9,r10, r11 are the equivalents of c1,c2, c3.
  400. #
  401. # Possible optimization of loading all 8 longs of a into registers
  402. # doesn't provide any speedup
  403. #
  404. xor r0,r0,r0 #set r0 = 0.Used in addze
  405. #instructions below.
  406. #sqr_add_c(a,0,c1,c2,c3);
  407. $LD r5,`0*$BNSZ`(r4)
  408. $UMULL r9,r5,r5 #1st iteration: no carries.
  409. $UMULH r10,r5,r5
  410. $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
  411. #sqr_add_c2(a,1,0,c2,c3,c1);
  412. $LD r6,`1*$BNSZ`(r4)
  413. $UMULL r7,r5,r6
  414. $UMULH r8,r5,r6
  415. addc r10,r7,r10 #add the two-register number
  416. adde r11,r8,r0 # (r8,r7) to the three-register
  417. addze r9,r0 # number (r9,r11,r10). NOTE: r0=0
  418. addc r10,r7,r10 #add the two-register number
  419. adde r11,r8,r11 # (r8,r7) to the three-register
  420. addze r9,r9 # number (r9,r11,r10).
  421. $ST r10,`1*$BNSZ`(r3) # r[1]=c2
  422. #sqr_add_c(a,1,c3,c1,c2);
  423. $UMULL r7,r6,r6
  424. $UMULH r8,r6,r6
  425. addc r11,r7,r11
  426. adde r9,r8,r9
  427. addze r10,r0
  428. #sqr_add_c2(a,2,0,c3,c1,c2);
  429. $LD r6,`2*$BNSZ`(r4)
  430. $UMULL r7,r5,r6
  431. $UMULH r8,r5,r6
  432. addc r11,r7,r11
  433. adde r9,r8,r9
  434. addze r10,r10
  435. addc r11,r7,r11
  436. adde r9,r8,r9
  437. addze r10,r10
  438. $ST r11,`2*$BNSZ`(r3) #r[2]=c3
  439. #sqr_add_c2(a,3,0,c1,c2,c3);
  440. $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
  441. $UMULL r7,r5,r6
  442. $UMULH r8,r5,r6
  443. addc r9,r7,r9
  444. adde r10,r8,r10
  445. addze r11,r0
  446. addc r9,r7,r9
  447. adde r10,r8,r10
  448. addze r11,r11
  449. #sqr_add_c2(a,2,1,c1,c2,c3);
  450. $LD r5,`1*$BNSZ`(r4)
  451. $LD r6,`2*$BNSZ`(r4)
  452. $UMULL r7,r5,r6
  453. $UMULH r8,r5,r6
  454. addc r9,r7,r9
  455. adde r10,r8,r10
  456. addze r11,r11
  457. addc r9,r7,r9
  458. adde r10,r8,r10
  459. addze r11,r11
  460. $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
  461. #sqr_add_c(a,2,c2,c3,c1);
  462. $UMULL r7,r6,r6
  463. $UMULH r8,r6,r6
  464. addc r10,r7,r10
  465. adde r11,r8,r11
  466. addze r9,r0
  467. #sqr_add_c2(a,3,1,c2,c3,c1);
  468. $LD r6,`3*$BNSZ`(r4)
  469. $UMULL r7,r5,r6
  470. $UMULH r8,r5,r6
  471. addc r10,r7,r10
  472. adde r11,r8,r11
  473. addze r9,r9
  474. addc r10,r7,r10
  475. adde r11,r8,r11
  476. addze r9,r9
  477. #sqr_add_c2(a,4,0,c2,c3,c1);
  478. $LD r5,`0*$BNSZ`(r4)
  479. $LD r6,`4*$BNSZ`(r4)
  480. $UMULL r7,r5,r6
  481. $UMULH r8,r5,r6
  482. addc r10,r7,r10
  483. adde r11,r8,r11
  484. addze r9,r9
  485. addc r10,r7,r10
  486. adde r11,r8,r11
  487. addze r9,r9
  488. $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
  489. #sqr_add_c2(a,5,0,c3,c1,c2);
  490. $LD r6,`5*$BNSZ`(r4)
  491. $UMULL r7,r5,r6
  492. $UMULH r8,r5,r6
  493. addc r11,r7,r11
  494. adde r9,r8,r9
  495. addze r10,r0
  496. addc r11,r7,r11
  497. adde r9,r8,r9
  498. addze r10,r10
  499. #sqr_add_c2(a,4,1,c3,c1,c2);
  500. $LD r5,`1*$BNSZ`(r4)
  501. $LD r6,`4*$BNSZ`(r4)
  502. $UMULL r7,r5,r6
  503. $UMULH r8,r5,r6
  504. addc r11,r7,r11
  505. adde r9,r8,r9
  506. addze r10,r10
  507. addc r11,r7,r11
  508. adde r9,r8,r9
  509. addze r10,r10
  510. #sqr_add_c2(a,3,2,c3,c1,c2);
  511. $LD r5,`2*$BNSZ`(r4)
  512. $LD r6,`3*$BNSZ`(r4)
  513. $UMULL r7,r5,r6
  514. $UMULH r8,r5,r6
  515. addc r11,r7,r11
  516. adde r9,r8,r9
  517. addze r10,r10
  518. addc r11,r7,r11
  519. adde r9,r8,r9
  520. addze r10,r10
  521. $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
  522. #sqr_add_c(a,3,c1,c2,c3);
  523. $UMULL r7,r6,r6
  524. $UMULH r8,r6,r6
  525. addc r9,r7,r9
  526. adde r10,r8,r10
  527. addze r11,r0
  528. #sqr_add_c2(a,4,2,c1,c2,c3);
  529. $LD r6,`4*$BNSZ`(r4)
  530. $UMULL r7,r5,r6
  531. $UMULH r8,r5,r6
  532. addc r9,r7,r9
  533. adde r10,r8,r10
  534. addze r11,r11
  535. addc r9,r7,r9
  536. adde r10,r8,r10
  537. addze r11,r11
  538. #sqr_add_c2(a,5,1,c1,c2,c3);
  539. $LD r5,`1*$BNSZ`(r4)
  540. $LD r6,`5*$BNSZ`(r4)
  541. $UMULL r7,r5,r6
  542. $UMULH r8,r5,r6
  543. addc r9,r7,r9
  544. adde r10,r8,r10
  545. addze r11,r11
  546. addc r9,r7,r9
  547. adde r10,r8,r10
  548. addze r11,r11
  549. #sqr_add_c2(a,6,0,c1,c2,c3);
  550. $LD r5,`0*$BNSZ`(r4)
  551. $LD r6,`6*$BNSZ`(r4)
  552. $UMULL r7,r5,r6
  553. $UMULH r8,r5,r6
  554. addc r9,r7,r9
  555. adde r10,r8,r10
  556. addze r11,r11
  557. addc r9,r7,r9
  558. adde r10,r8,r10
  559. addze r11,r11
  560. $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
  561. #sqr_add_c2(a,7,0,c2,c3,c1);
  562. $LD r6,`7*$BNSZ`(r4)
  563. $UMULL r7,r5,r6
  564. $UMULH r8,r5,r6
  565. addc r10,r7,r10
  566. adde r11,r8,r11
  567. addze r9,r0
  568. addc r10,r7,r10
  569. adde r11,r8,r11
  570. addze r9,r9
  571. #sqr_add_c2(a,6,1,c2,c3,c1);
  572. $LD r5,`1*$BNSZ`(r4)
  573. $LD r6,`6*$BNSZ`(r4)
  574. $UMULL r7,r5,r6
  575. $UMULH r8,r5,r6
  576. addc r10,r7,r10
  577. adde r11,r8,r11
  578. addze r9,r9
  579. addc r10,r7,r10
  580. adde r11,r8,r11
  581. addze r9,r9
  582. #sqr_add_c2(a,5,2,c2,c3,c1);
  583. $LD r5,`2*$BNSZ`(r4)
  584. $LD r6,`5*$BNSZ`(r4)
  585. $UMULL r7,r5,r6
  586. $UMULH r8,r5,r6
  587. addc r10,r7,r10
  588. adde r11,r8,r11
  589. addze r9,r9
  590. addc r10,r7,r10
  591. adde r11,r8,r11
  592. addze r9,r9
  593. #sqr_add_c2(a,4,3,c2,c3,c1);
  594. $LD r5,`3*$BNSZ`(r4)
  595. $LD r6,`4*$BNSZ`(r4)
  596. $UMULL r7,r5,r6
  597. $UMULH r8,r5,r6
  598. addc r10,r7,r10
  599. adde r11,r8,r11
  600. addze r9,r9
  601. addc r10,r7,r10
  602. adde r11,r8,r11
  603. addze r9,r9
  604. $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
  605. #sqr_add_c(a,4,c3,c1,c2);
  606. $UMULL r7,r6,r6
  607. $UMULH r8,r6,r6
  608. addc r11,r7,r11
  609. adde r9,r8,r9
  610. addze r10,r0
  611. #sqr_add_c2(a,5,3,c3,c1,c2);
  612. $LD r6,`5*$BNSZ`(r4)
  613. $UMULL r7,r5,r6
  614. $UMULH r8,r5,r6
  615. addc r11,r7,r11
  616. adde r9,r8,r9
  617. addze r10,r10
  618. addc r11,r7,r11
  619. adde r9,r8,r9
  620. addze r10,r10
  621. #sqr_add_c2(a,6,2,c3,c1,c2);
  622. $LD r5,`2*$BNSZ`(r4)
  623. $LD r6,`6*$BNSZ`(r4)
  624. $UMULL r7,r5,r6
  625. $UMULH r8,r5,r6
  626. addc r11,r7,r11
  627. adde r9,r8,r9
  628. addze r10,r10
  629. addc r11,r7,r11
  630. adde r9,r8,r9
  631. addze r10,r10
  632. #sqr_add_c2(a,7,1,c3,c1,c2);
  633. $LD r5,`1*$BNSZ`(r4)
  634. $LD r6,`7*$BNSZ`(r4)
  635. $UMULL r7,r5,r6
  636. $UMULH r8,r5,r6
  637. addc r11,r7,r11
  638. adde r9,r8,r9
  639. addze r10,r10
  640. addc r11,r7,r11
  641. adde r9,r8,r9
  642. addze r10,r10
  643. $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
  644. #sqr_add_c2(a,7,2,c1,c2,c3);
  645. $LD r5,`2*$BNSZ`(r4)
  646. $UMULL r7,r5,r6
  647. $UMULH r8,r5,r6
  648. addc r9,r7,r9
  649. adde r10,r8,r10
  650. addze r11,r0
  651. addc r9,r7,r9
  652. adde r10,r8,r10
  653. addze r11,r11
  654. #sqr_add_c2(a,6,3,c1,c2,c3);
  655. $LD r5,`3*$BNSZ`(r4)
  656. $LD r6,`6*$BNSZ`(r4)
  657. $UMULL r7,r5,r6
  658. $UMULH r8,r5,r6
  659. addc r9,r7,r9
  660. adde r10,r8,r10
  661. addze r11,r11
  662. addc r9,r7,r9
  663. adde r10,r8,r10
  664. addze r11,r11
  665. #sqr_add_c2(a,5,4,c1,c2,c3);
  666. $LD r5,`4*$BNSZ`(r4)
  667. $LD r6,`5*$BNSZ`(r4)
  668. $UMULL r7,r5,r6
  669. $UMULH r8,r5,r6
  670. addc r9,r7,r9
  671. adde r10,r8,r10
  672. addze r11,r11
  673. addc r9,r7,r9
  674. adde r10,r8,r10
  675. addze r11,r11
  676. $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
  677. #sqr_add_c(a,5,c2,c3,c1);
  678. $UMULL r7,r6,r6
  679. $UMULH r8,r6,r6
  680. addc r10,r7,r10
  681. adde r11,r8,r11
  682. addze r9,r0
  683. #sqr_add_c2(a,6,4,c2,c3,c1);
  684. $LD r6,`6*$BNSZ`(r4)
  685. $UMULL r7,r5,r6
  686. $UMULH r8,r5,r6
  687. addc r10,r7,r10
  688. adde r11,r8,r11
  689. addze r9,r9
  690. addc r10,r7,r10
  691. adde r11,r8,r11
  692. addze r9,r9
  693. #sqr_add_c2(a,7,3,c2,c3,c1);
  694. $LD r5,`3*$BNSZ`(r4)
  695. $LD r6,`7*$BNSZ`(r4)
  696. $UMULL r7,r5,r6
  697. $UMULH r8,r5,r6
  698. addc r10,r7,r10
  699. adde r11,r8,r11
  700. addze r9,r9
  701. addc r10,r7,r10
  702. adde r11,r8,r11
  703. addze r9,r9
  704. $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
  705. #sqr_add_c2(a,7,4,c3,c1,c2);
  706. $LD r5,`4*$BNSZ`(r4)
  707. $UMULL r7,r5,r6
  708. $UMULH r8,r5,r6
  709. addc r11,r7,r11
  710. adde r9,r8,r9
  711. addze r10,r0
  712. addc r11,r7,r11
  713. adde r9,r8,r9
  714. addze r10,r10
  715. #sqr_add_c2(a,6,5,c3,c1,c2);
  716. $LD r5,`5*$BNSZ`(r4)
  717. $LD r6,`6*$BNSZ`(r4)
  718. $UMULL r7,r5,r6
  719. $UMULH r8,r5,r6
  720. addc r11,r7,r11
  721. adde r9,r8,r9
  722. addze r10,r10
  723. addc r11,r7,r11
  724. adde r9,r8,r9
  725. addze r10,r10
  726. $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
  727. #sqr_add_c(a,6,c1,c2,c3);
  728. $UMULL r7,r6,r6
  729. $UMULH r8,r6,r6
  730. addc r9,r7,r9
  731. adde r10,r8,r10
  732. addze r11,r0
  733. #sqr_add_c2(a,7,5,c1,c2,c3)
  734. $LD r6,`7*$BNSZ`(r4)
  735. $UMULL r7,r5,r6
  736. $UMULH r8,r5,r6
  737. addc r9,r7,r9
  738. adde r10,r8,r10
  739. addze r11,r11
  740. addc r9,r7,r9
  741. adde r10,r8,r10
  742. addze r11,r11
  743. $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
  744. #sqr_add_c2(a,7,6,c2,c3,c1)
  745. $LD r5,`6*$BNSZ`(r4)
  746. $UMULL r7,r5,r6
  747. $UMULH r8,r5,r6
  748. addc r10,r7,r10
  749. adde r11,r8,r11
  750. addze r9,r0
  751. addc r10,r7,r10
  752. adde r11,r8,r11
  753. addze r9,r9
  754. $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
  755. #sqr_add_c(a,7,c3,c1,c2);
  756. $UMULL r7,r6,r6
  757. $UMULH r8,r6,r6
  758. addc r11,r7,r11
  759. adde r9,r8,r9
  760. $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
  761. $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
  762. blr
  763. .long 0
  764. .byte 0,12,0x14,0,0,0,2,0
  765. .long 0
  766. .size .bn_sqr_comba8,.-.bn_sqr_comba8
  767. #
  768. # NOTE: The following label name should be changed to
  769. # "bn_mul_comba4" i.e. remove the first dot
  770. # for the gcc compiler. This should be automatically
  771. # done in the build
  772. #
  773. .align 4
  774. .bn_mul_comba4:
  775. #
  776. # This is an optimized version of the bn_mul_comba4 routine.
  777. #
  778. # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  779. # r3 contains r
  780. # r4 contains a
  781. # r5 contains b
  782. # r6, r7 are the 2 BN_ULONGs being multiplied.
  783. # r8, r9 hold the result of the 32x32-bit multiply giving a 64-bit product.
  784. # r10, r11, r12 are the equivalents of c1, c2, and c3.
  785. #
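# Rough C-level sketch of the macro named in the comments below (approximate,
# not the exact OpenSSL definition; the trailing arguments are the low, middle
# and high accumulator words and rotate from one call to the next):
#   mul_add_c(a,b,lo,mid,hi) : (hi,mid,lo) += a*b
#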
  786. xor r0,r0,r0 #r0=0. Used in addze below.
  787. #mul_add_c(a[0],b[0],c1,c2,c3);
  788. $LD r6,`0*$BNSZ`(r4)
  789. $LD r7,`0*$BNSZ`(r5)
  790. $UMULL r10,r6,r7
  791. $UMULH r11,r6,r7
  792. $ST r10,`0*$BNSZ`(r3) #r[0]=c1
  793. #mul_add_c(a[0],b[1],c2,c3,c1);
  794. $LD r7,`1*$BNSZ`(r5)
  795. $UMULL r8,r6,r7
  796. $UMULH r9,r6,r7
  797. addc r11,r8,r11
  798. adde r12,r9,r0
  799. addze r10,r0
  800. #mul_add_c(a[1],b[0],c2,c3,c1);
  801. $LD r6, `1*$BNSZ`(r4)
  802. $LD r7, `0*$BNSZ`(r5)
  803. $UMULL r8,r6,r7
  804. $UMULH r9,r6,r7
  805. addc r11,r8,r11
  806. adde r12,r9,r12
  807. addze r10,r10
  808. $ST r11,`1*$BNSZ`(r3) #r[1]=c2
  809. #mul_add_c(a[2],b[0],c3,c1,c2);
  810. $LD r6,`2*$BNSZ`(r4)
  811. $UMULL r8,r6,r7
  812. $UMULH r9,r6,r7
  813. addc r12,r8,r12
  814. adde r10,r9,r10
  815. addze r11,r0
  816. #mul_add_c(a[1],b[1],c3,c1,c2);
  817. $LD r6,`1*$BNSZ`(r4)
  818. $LD r7,`1*$BNSZ`(r5)
  819. $UMULL r8,r6,r7
  820. $UMULH r9,r6,r7
  821. addc r12,r8,r12
  822. adde r10,r9,r10
  823. addze r11,r11
  824. #mul_add_c(a[0],b[2],c3,c1,c2);
  825. $LD r6,`0*$BNSZ`(r4)
  826. $LD r7,`2*$BNSZ`(r5)
  827. $UMULL r8,r6,r7
  828. $UMULH r9,r6,r7
  829. addc r12,r8,r12
  830. adde r10,r9,r10
  831. addze r11,r11
  832. $ST r12,`2*$BNSZ`(r3) #r[2]=c3
  833. #mul_add_c(a[0],b[3],c1,c2,c3);
  834. $LD r7,`3*$BNSZ`(r5)
  835. $UMULL r8,r6,r7
  836. $UMULH r9,r6,r7
  837. addc r10,r8,r10
  838. adde r11,r9,r11
  839. addze r12,r0
  840. #mul_add_c(a[1],b[2],c1,c2,c3);
  841. $LD r6,`1*$BNSZ`(r4)
  842. $LD r7,`2*$BNSZ`(r5)
  843. $UMULL r8,r6,r7
  844. $UMULH r9,r6,r7
  845. addc r10,r8,r10
  846. adde r11,r9,r11
  847. addze r12,r12
  848. #mul_add_c(a[2],b[1],c1,c2,c3);
  849. $LD r6,`2*$BNSZ`(r4)
  850. $LD r7,`1*$BNSZ`(r5)
  851. $UMULL r8,r6,r7
  852. $UMULH r9,r6,r7
  853. addc r10,r8,r10
  854. adde r11,r9,r11
  855. addze r12,r12
  856. #mul_add_c(a[3],b[0],c1,c2,c3);
  857. $LD r6,`3*$BNSZ`(r4)
  858. $LD r7,`0*$BNSZ`(r5)
  859. $UMULL r8,r6,r7
  860. $UMULH r9,r6,r7
  861. addc r10,r8,r10
  862. adde r11,r9,r11
  863. addze r12,r12
  864. $ST r10,`3*$BNSZ`(r3) #r[3]=c1
  865. #mul_add_c(a[3],b[1],c2,c3,c1);
  866. $LD r7,`1*$BNSZ`(r5)
  867. $UMULL r8,r6,r7
  868. $UMULH r9,r6,r7
  869. addc r11,r8,r11
  870. adde r12,r9,r12
  871. addze r10,r0
  872. #mul_add_c(a[2],b[2],c2,c3,c1);
  873. $LD r6,`2*$BNSZ`(r4)
  874. $LD r7,`2*$BNSZ`(r5)
  875. $UMULL r8,r6,r7
  876. $UMULH r9,r6,r7
  877. addc r11,r8,r11
  878. adde r12,r9,r12
  879. addze r10,r10
  880. #mul_add_c(a[1],b[3],c2,c3,c1);
  881. $LD r6,`1*$BNSZ`(r4)
  882. $LD r7,`3*$BNSZ`(r5)
  883. $UMULL r8,r6,r7
  884. $UMULH r9,r6,r7
  885. addc r11,r8,r11
  886. adde r12,r9,r12
  887. addze r10,r10
  888. $ST r11,`4*$BNSZ`(r3) #r[4]=c2
  889. #mul_add_c(a[2],b[3],c3,c1,c2);
  890. $LD r6,`2*$BNSZ`(r4)
  891. $UMULL r8,r6,r7
  892. $UMULH r9,r6,r7
  893. addc r12,r8,r12
  894. adde r10,r9,r10
  895. addze r11,r0
  896. #mul_add_c(a[3],b[2],c3,c1,c2);
  897. $LD r6,`3*$BNSZ`(r4)
  898. $LD r7,`2*$BNSZ`(r5)
  899. $UMULL r8,r6,r7
  900. $UMULH r9,r6,r7
  901. addc r12,r8,r12
  902. adde r10,r9,r10
  903. addze r11,r11
  904. $ST r12,`5*$BNSZ`(r3) #r[5]=c3
  905. #mul_add_c(a[3],b[3],c1,c2,c3);
  906. $LD r7,`3*$BNSZ`(r5)
  907. $UMULL r8,r6,r7
  908. $UMULH r9,r6,r7
  909. addc r10,r8,r10
  910. adde r11,r9,r11
  911. $ST r10,`6*$BNSZ`(r3) #r[6]=c1
  912. $ST r11,`7*$BNSZ`(r3) #r[7]=c2
  913. blr
  914. .long 0
  915. .byte 0,12,0x14,0,0,0,3,0
  916. .long 0
  917. .size .bn_mul_comba4,.-.bn_mul_comba4
  918. #
  919. # NOTE: The following label name should be changed to
  920. # "bn_mul_comba8" i.e. remove the first dot
  921. # for the gcc compiler. This should be automatically
  922. # done in the build
  923. #
  924. .align 4
  925. .bn_mul_comba8:
  926. #
  927. # Optimized version of the bn_mul_comba8 routine.
  928. #
  929. # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  930. # r3 contains r
  931. # r4 contains a
  932. # r5 contains b
  933. # r6, r7 are the 2 BN_ULONGs being multiplied.
  934. # r8, r9 hold the result of the 32x32-bit multiply giving a 64-bit product.
  935. # r10, r11, r12 are the equivalents of c1, c2, and c3.
  936. #
  937. xor r0,r0,r0 #r0=0. Used in addze below.
  938. #mul_add_c(a[0],b[0],c1,c2,c3);
  939. $LD r6,`0*$BNSZ`(r4) #a[0]
  940. $LD r7,`0*$BNSZ`(r5) #b[0]
  941. $UMULL r10,r6,r7
  942. $UMULH r11,r6,r7
  943. $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
  944. #mul_add_c(a[0],b[1],c2,c3,c1);
  945. $LD r7,`1*$BNSZ`(r5)
  946. $UMULL r8,r6,r7
  947. $UMULH r9,r6,r7
  948. addc r11,r11,r8
  949. addze r12,r9 # since we didn't set r12 to zero before.
  950. addze r10,r0
  951. #mul_add_c(a[1],b[0],c2,c3,c1);
  952. $LD r6,`1*$BNSZ`(r4)
  953. $LD r7,`0*$BNSZ`(r5)
  954. $UMULL r8,r6,r7
  955. $UMULH r9,r6,r7
  956. addc r11,r11,r8
  957. adde r12,r12,r9
  958. addze r10,r10
  959. $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
  960. #mul_add_c(a[2],b[0],c3,c1,c2);
  961. $LD r6,`2*$BNSZ`(r4)
  962. $UMULL r8,r6,r7
  963. $UMULH r9,r6,r7
  964. addc r12,r12,r8
  965. adde r10,r10,r9
  966. addze r11,r0
  967. #mul_add_c(a[1],b[1],c3,c1,c2);
  968. $LD r6,`1*$BNSZ`(r4)
  969. $LD r7,`1*$BNSZ`(r5)
  970. $UMULL r8,r6,r7
  971. $UMULH r9,r6,r7
  972. addc r12,r12,r8
  973. adde r10,r10,r9
  974. addze r11,r11
  975. #mul_add_c(a[0],b[2],c3,c1,c2);
  976. $LD r6,`0*$BNSZ`(r4)
  977. $LD r7,`2*$BNSZ`(r5)
  978. $UMULL r8,r6,r7
  979. $UMULH r9,r6,r7
  980. addc r12,r12,r8
  981. adde r10,r10,r9
  982. addze r11,r11
  983. $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
  984. #mul_add_c(a[0],b[3],c1,c2,c3);
  985. $LD r7,`3*$BNSZ`(r5)
  986. $UMULL r8,r6,r7
  987. $UMULH r9,r6,r7
  988. addc r10,r10,r8
  989. adde r11,r11,r9
  990. addze r12,r0
  991. #mul_add_c(a[1],b[2],c1,c2,c3);
  992. $LD r6,`1*$BNSZ`(r4)
  993. $LD r7,`2*$BNSZ`(r5)
  994. $UMULL r8,r6,r7
  995. $UMULH r9,r6,r7
  996. addc r10,r10,r8
  997. adde r11,r11,r9
  998. addze r12,r12
  999. #mul_add_c(a[2],b[1],c1,c2,c3);
  1000. $LD r6,`2*$BNSZ`(r4)
  1001. $LD r7,`1*$BNSZ`(r5)
  1002. $UMULL r8,r6,r7
  1003. $UMULH r9,r6,r7
  1004. addc r10,r10,r8
  1005. adde r11,r11,r9
  1006. addze r12,r12
  1007. #mul_add_c(a[3],b[0],c1,c2,c3);
  1008. $LD r6,`3*$BNSZ`(r4)
  1009. $LD r7,`0*$BNSZ`(r5)
  1010. $UMULL r8,r6,r7
  1011. $UMULH r9,r6,r7
  1012. addc r10,r10,r8
  1013. adde r11,r11,r9
  1014. addze r12,r12
  1015. $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
  1016. #mul_add_c(a[4],b[0],c2,c3,c1);
  1017. $LD r6,`4*$BNSZ`(r4)
  1018. $UMULL r8,r6,r7
  1019. $UMULH r9,r6,r7
  1020. addc r11,r11,r8
  1021. adde r12,r12,r9
  1022. addze r10,r0
  1023. #mul_add_c(a[3],b[1],c2,c3,c1);
  1024. $LD r6,`3*$BNSZ`(r4)
  1025. $LD r7,`1*$BNSZ`(r5)
  1026. $UMULL r8,r6,r7
  1027. $UMULH r9,r6,r7
  1028. addc r11,r11,r8
  1029. adde r12,r12,r9
  1030. addze r10,r10
  1031. #mul_add_c(a[2],b[2],c2,c3,c1);
  1032. $LD r6,`2*$BNSZ`(r4)
  1033. $LD r7,`2*$BNSZ`(r5)
  1034. $UMULL r8,r6,r7
  1035. $UMULH r9,r6,r7
  1036. addc r11,r11,r8
  1037. adde r12,r12,r9
  1038. addze r10,r10
  1039. #mul_add_c(a[1],b[3],c2,c3,c1);
  1040. $LD r6,`1*$BNSZ`(r4)
  1041. $LD r7,`3*$BNSZ`(r5)
  1042. $UMULL r8,r6,r7
  1043. $UMULH r9,r6,r7
  1044. addc r11,r11,r8
  1045. adde r12,r12,r9
  1046. addze r10,r10
  1047. #mul_add_c(a[0],b[4],c2,c3,c1);
  1048. $LD r6,`0*$BNSZ`(r4)
  1049. $LD r7,`4*$BNSZ`(r5)
  1050. $UMULL r8,r6,r7
  1051. $UMULH r9,r6,r7
  1052. addc r11,r11,r8
  1053. adde r12,r12,r9
  1054. addze r10,r10
  1055. $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
  1056. #mul_add_c(a[0],b[5],c3,c1,c2);
  1057. $LD r7,`5*$BNSZ`(r5)
  1058. $UMULL r8,r6,r7
  1059. $UMULH r9,r6,r7
  1060. addc r12,r12,r8
  1061. adde r10,r10,r9
  1062. addze r11,r0
  1063. #mul_add_c(a[1],b[4],c3,c1,c2);
  1064. $LD r6,`1*$BNSZ`(r4)
  1065. $LD r7,`4*$BNSZ`(r5)
  1066. $UMULL r8,r6,r7
  1067. $UMULH r9,r6,r7
  1068. addc r12,r12,r8
  1069. adde r10,r10,r9
  1070. addze r11,r11
  1071. #mul_add_c(a[2],b[3],c3,c1,c2);
  1072. $LD r6,`2*$BNSZ`(r4)
  1073. $LD r7,`3*$BNSZ`(r5)
  1074. $UMULL r8,r6,r7
  1075. $UMULH r9,r6,r7
  1076. addc r12,r12,r8
  1077. adde r10,r10,r9
  1078. addze r11,r11
  1079. #mul_add_c(a[3],b[2],c3,c1,c2);
  1080. $LD r6,`3*$BNSZ`(r4)
  1081. $LD r7,`2*$BNSZ`(r5)
  1082. $UMULL r8,r6,r7
  1083. $UMULH r9,r6,r7
  1084. addc r12,r12,r8
  1085. adde r10,r10,r9
  1086. addze r11,r11
  1087. #mul_add_c(a[4],b[1],c3,c1,c2);
  1088. $LD r6,`4*$BNSZ`(r4)
  1089. $LD r7,`1*$BNSZ`(r5)
  1090. $UMULL r8,r6,r7
  1091. $UMULH r9,r6,r7
  1092. addc r12,r12,r8
  1093. adde r10,r10,r9
  1094. addze r11,r11
  1095. #mul_add_c(a[5],b[0],c3,c1,c2);
  1096. $LD r6,`5*$BNSZ`(r4)
  1097. $LD r7,`0*$BNSZ`(r5)
  1098. $UMULL r8,r6,r7
  1099. $UMULH r9,r6,r7
  1100. addc r12,r12,r8
  1101. adde r10,r10,r9
  1102. addze r11,r11
  1103. $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
  1104. #mul_add_c(a[6],b[0],c1,c2,c3);
  1105. $LD r6,`6*$BNSZ`(r4)
  1106. $UMULL r8,r6,r7
  1107. $UMULH r9,r6,r7
  1108. addc r10,r10,r8
  1109. adde r11,r11,r9
  1110. addze r12,r0
  1111. #mul_add_c(a[5],b[1],c1,c2,c3);
  1112. $LD r6,`5*$BNSZ`(r4)
  1113. $LD r7,`1*$BNSZ`(r5)
  1114. $UMULL r8,r6,r7
  1115. $UMULH r9,r6,r7
  1116. addc r10,r10,r8
  1117. adde r11,r11,r9
  1118. addze r12,r12
  1119. #mul_add_c(a[4],b[2],c1,c2,c3);
  1120. $LD r6,`4*$BNSZ`(r4)
  1121. $LD r7,`2*$BNSZ`(r5)
  1122. $UMULL r8,r6,r7
  1123. $UMULH r9,r6,r7
  1124. addc r10,r10,r8
  1125. adde r11,r11,r9
  1126. addze r12,r12
  1127. #mul_add_c(a[3],b[3],c1,c2,c3);
  1128. $LD r6,`3*$BNSZ`(r4)
  1129. $LD r7,`3*$BNSZ`(r5)
  1130. $UMULL r8,r6,r7
  1131. $UMULH r9,r6,r7
  1132. addc r10,r10,r8
  1133. adde r11,r11,r9
  1134. addze r12,r12
  1135. #mul_add_c(a[2],b[4],c1,c2,c3);
  1136. $LD r6,`2*$BNSZ`(r4)
  1137. $LD r7,`4*$BNSZ`(r5)
  1138. $UMULL r8,r6,r7
  1139. $UMULH r9,r6,r7
  1140. addc r10,r10,r8
  1141. adde r11,r11,r9
  1142. addze r12,r12
  1143. #mul_add_c(a[1],b[5],c1,c2,c3);
  1144. $LD r6,`1*$BNSZ`(r4)
  1145. $LD r7,`5*$BNSZ`(r5)
  1146. $UMULL r8,r6,r7
  1147. $UMULH r9,r6,r7
  1148. addc r10,r10,r8
  1149. adde r11,r11,r9
  1150. addze r12,r12
  1151. #mul_add_c(a[0],b[6],c1,c2,c3);
  1152. $LD r6,`0*$BNSZ`(r4)
  1153. $LD r7,`6*$BNSZ`(r5)
  1154. $UMULL r8,r6,r7
  1155. $UMULH r9,r6,r7
  1156. addc r10,r10,r8
  1157. adde r11,r11,r9
  1158. addze r12,r12
  1159. $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
  1160. #mul_add_c(a[0],b[7],c2,c3,c1);
  1161. $LD r7,`7*$BNSZ`(r5)
  1162. $UMULL r8,r6,r7
  1163. $UMULH r9,r6,r7
  1164. addc r11,r11,r8
  1165. adde r12,r12,r9
  1166. addze r10,r0
  1167. #mul_add_c(a[1],b[6],c2,c3,c1);
  1168. $LD r6,`1*$BNSZ`(r4)
  1169. $LD r7,`6*$BNSZ`(r5)
  1170. $UMULL r8,r6,r7
  1171. $UMULH r9,r6,r7
  1172. addc r11,r11,r8
  1173. adde r12,r12,r9
  1174. addze r10,r10
  1175. #mul_add_c(a[2],b[5],c2,c3,c1);
  1176. $LD r6,`2*$BNSZ`(r4)
  1177. $LD r7,`5*$BNSZ`(r5)
  1178. $UMULL r8,r6,r7
  1179. $UMULH r9,r6,r7
  1180. addc r11,r11,r8
  1181. adde r12,r12,r9
  1182. addze r10,r10
  1183. #mul_add_c(a[3],b[4],c2,c3,c1);
  1184. $LD r6,`3*$BNSZ`(r4)
  1185. $LD r7,`4*$BNSZ`(r5)
  1186. $UMULL r8,r6,r7
  1187. $UMULH r9,r6,r7
  1188. addc r11,r11,r8
  1189. adde r12,r12,r9
  1190. addze r10,r10
  1191. #mul_add_c(a[4],b[3],c2,c3,c1);
  1192. $LD r6,`4*$BNSZ`(r4)
  1193. $LD r7,`3*$BNSZ`(r5)
  1194. $UMULL r8,r6,r7
  1195. $UMULH r9,r6,r7
  1196. addc r11,r11,r8
  1197. adde r12,r12,r9
  1198. addze r10,r10
  1199. #mul_add_c(a[5],b[2],c2,c3,c1);
  1200. $LD r6,`5*$BNSZ`(r4)
  1201. $LD r7,`2*$BNSZ`(r5)
  1202. $UMULL r8,r6,r7
  1203. $UMULH r9,r6,r7
  1204. addc r11,r11,r8
  1205. adde r12,r12,r9
  1206. addze r10,r10
  1207. #mul_add_c(a[6],b[1],c2,c3,c1);
  1208. $LD r6,`6*$BNSZ`(r4)
  1209. $LD r7,`1*$BNSZ`(r5)
  1210. $UMULL r8,r6,r7
  1211. $UMULH r9,r6,r7
  1212. addc r11,r11,r8
  1213. adde r12,r12,r9
  1214. addze r10,r10
  1215. #mul_add_c(a[7],b[0],c2,c3,c1);
  1216. $LD r6,`7*$BNSZ`(r4)
  1217. $LD r7,`0*$BNSZ`(r5)
  1218. $UMULL r8,r6,r7
  1219. $UMULH r9,r6,r7
  1220. addc r11,r11,r8
  1221. adde r12,r12,r9
  1222. addze r10,r10
  1223. $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
  1224. #mul_add_c(a[7],b[1],c3,c1,c2);
  1225. $LD r7,`1*$BNSZ`(r5)
  1226. $UMULL r8,r6,r7
  1227. $UMULH r9,r6,r7
  1228. addc r12,r12,r8
  1229. adde r10,r10,r9
  1230. addze r11,r0
  1231. #mul_add_c(a[6],b[2],c3,c1,c2);
  1232. $LD r6,`6*$BNSZ`(r4)
  1233. $LD r7,`2*$BNSZ`(r5)
  1234. $UMULL r8,r6,r7
  1235. $UMULH r9,r6,r7
  1236. addc r12,r12,r8
  1237. adde r10,r10,r9
  1238. addze r11,r11
  1239. #mul_add_c(a[5],b[3],c3,c1,c2);
  1240. $LD r6,`5*$BNSZ`(r4)
  1241. $LD r7,`3*$BNSZ`(r5)
  1242. $UMULL r8,r6,r7
  1243. $UMULH r9,r6,r7
  1244. addc r12,r12,r8
  1245. adde r10,r10,r9
  1246. addze r11,r11
  1247. #mul_add_c(a[4],b[4],c3,c1,c2);
  1248. $LD r6,`4*$BNSZ`(r4)
  1249. $LD r7,`4*$BNSZ`(r5)
  1250. $UMULL r8,r6,r7
  1251. $UMULH r9,r6,r7
  1252. addc r12,r12,r8
  1253. adde r10,r10,r9
  1254. addze r11,r11
  1255. #mul_add_c(a[3],b[5],c3,c1,c2);
  1256. $LD r6,`3*$BNSZ`(r4)
  1257. $LD r7,`5*$BNSZ`(r5)
  1258. $UMULL r8,r6,r7
  1259. $UMULH r9,r6,r7
  1260. addc r12,r12,r8
  1261. adde r10,r10,r9
  1262. addze r11,r11
  1263. #mul_add_c(a[2],b[6],c3,c1,c2);
  1264. $LD r6,`2*$BNSZ`(r4)
  1265. $LD r7,`6*$BNSZ`(r5)
  1266. $UMULL r8,r6,r7
  1267. $UMULH r9,r6,r7
  1268. addc r12,r12,r8
  1269. adde r10,r10,r9
  1270. addze r11,r11
  1271. #mul_add_c(a[1],b[7],c3,c1,c2);
  1272. $LD r6,`1*$BNSZ`(r4)
  1273. $LD r7,`7*$BNSZ`(r5)
  1274. $UMULL r8,r6,r7
  1275. $UMULH r9,r6,r7
  1276. addc r12,r12,r8
  1277. adde r10,r10,r9
  1278. addze r11,r11
  1279. $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
  1280. #mul_add_c(a[2],b[7],c1,c2,c3);
  1281. $LD r6,`2*$BNSZ`(r4)
  1282. $UMULL r8,r6,r7
  1283. $UMULH r9,r6,r7
  1284. addc r10,r10,r8
  1285. adde r11,r11,r9
  1286. addze r12,r0
  1287. #mul_add_c(a[3],b[6],c1,c2,c3);
  1288. $LD r6,`3*$BNSZ`(r4)
  1289. $LD r7,`6*$BNSZ`(r5)
  1290. $UMULL r8,r6,r7
  1291. $UMULH r9,r6,r7
  1292. addc r10,r10,r8
  1293. adde r11,r11,r9
  1294. addze r12,r12
  1295. #mul_add_c(a[4],b[5],c1,c2,c3);
  1296. $LD r6,`4*$BNSZ`(r4)
  1297. $LD r7,`5*$BNSZ`(r5)
  1298. $UMULL r8,r6,r7
  1299. $UMULH r9,r6,r7
  1300. addc r10,r10,r8
  1301. adde r11,r11,r9
  1302. addze r12,r12
  1303. #mul_add_c(a[5],b[4],c1,c2,c3);
  1304. $LD r6,`5*$BNSZ`(r4)
  1305. $LD r7,`4*$BNSZ`(r5)
  1306. $UMULL r8,r6,r7
  1307. $UMULH r9,r6,r7
  1308. addc r10,r10,r8
  1309. adde r11,r11,r9
  1310. addze r12,r12
  1311. #mul_add_c(a[6],b[3],c1,c2,c3);
  1312. $LD r6,`6*$BNSZ`(r4)
  1313. $LD r7,`3*$BNSZ`(r5)
  1314. $UMULL r8,r6,r7
  1315. $UMULH r9,r6,r7
  1316. addc r10,r10,r8
  1317. adde r11,r11,r9
  1318. addze r12,r12
  1319. #mul_add_c(a[7],b[2],c1,c2,c3);
  1320. $LD r6,`7*$BNSZ`(r4)
  1321. $LD r7,`2*$BNSZ`(r5)
  1322. $UMULL r8,r6,r7
  1323. $UMULH r9,r6,r7
  1324. addc r10,r10,r8
  1325. adde r11,r11,r9
  1326. addze r12,r12
  1327. $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
  1328. #mul_add_c(a[7],b[3],c2,c3,c1);
  1329. $LD r7,`3*$BNSZ`(r5)
  1330. $UMULL r8,r6,r7
  1331. $UMULH r9,r6,r7
  1332. addc r11,r11,r8
  1333. adde r12,r12,r9
  1334. addze r10,r0
  1335. #mul_add_c(a[6],b[4],c2,c3,c1);
  1336. $LD r6,`6*$BNSZ`(r4)
  1337. $LD r7,`4*$BNSZ`(r5)
  1338. $UMULL r8,r6,r7
  1339. $UMULH r9,r6,r7
  1340. addc r11,r11,r8
  1341. adde r12,r12,r9
  1342. addze r10,r10
  1343. #mul_add_c(a[5],b[5],c2,c3,c1);
  1344. $LD r6,`5*$BNSZ`(r4)
  1345. $LD r7,`5*$BNSZ`(r5)
  1346. $UMULL r8,r6,r7
  1347. $UMULH r9,r6,r7
  1348. addc r11,r11,r8
  1349. adde r12,r12,r9
  1350. addze r10,r10
  1351. #mul_add_c(a[4],b[6],c2,c3,c1);
  1352. $LD r6,`4*$BNSZ`(r4)
  1353. $LD r7,`6*$BNSZ`(r5)
  1354. $UMULL r8,r6,r7
  1355. $UMULH r9,r6,r7
  1356. addc r11,r11,r8
  1357. adde r12,r12,r9
  1358. addze r10,r10
  1359. #mul_add_c(a[3],b[7],c2,c3,c1);
  1360. $LD r6,`3*$BNSZ`(r4)
  1361. $LD r7,`7*$BNSZ`(r5)
  1362. $UMULL r8,r6,r7
  1363. $UMULH r9,r6,r7
  1364. addc r11,r11,r8
  1365. adde r12,r12,r9
  1366. addze r10,r10
  1367. $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
  1368. #mul_add_c(a[4],b[7],c3,c1,c2);
  1369. $LD r6,`4*$BNSZ`(r4)
  1370. $UMULL r8,r6,r7
  1371. $UMULH r9,r6,r7
  1372. addc r12,r12,r8
  1373. adde r10,r10,r9
  1374. addze r11,r0
  1375. #mul_add_c(a[5],b[6],c3,c1,c2);
  1376. $LD r6,`5*$BNSZ`(r4)
  1377. $LD r7,`6*$BNSZ`(r5)
  1378. $UMULL r8,r6,r7
  1379. $UMULH r9,r6,r7
  1380. addc r12,r12,r8
  1381. adde r10,r10,r9
  1382. addze r11,r11
  1383. #mul_add_c(a[6],b[5],c3,c1,c2);
  1384. $LD r6,`6*$BNSZ`(r4)
  1385. $LD r7,`5*$BNSZ`(r5)
  1386. $UMULL r8,r6,r7
  1387. $UMULH r9,r6,r7
  1388. addc r12,r12,r8
  1389. adde r10,r10,r9
  1390. addze r11,r11
  1391. #mul_add_c(a[7],b[4],c3,c1,c2);
  1392. $LD r6,`7*$BNSZ`(r4)
  1393. $LD r7,`4*$BNSZ`(r5)
  1394. $UMULL r8,r6,r7
  1395. $UMULH r9,r6,r7
  1396. addc r12,r12,r8
  1397. adde r10,r10,r9
  1398. addze r11,r11
  1399. $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
  1400. #mul_add_c(a[7],b[5],c1,c2,c3);
  1401. $LD r7,`5*$BNSZ`(r5)
  1402. $UMULL r8,r6,r7
  1403. $UMULH r9,r6,r7
  1404. addc r10,r10,r8
  1405. adde r11,r11,r9
  1406. addze r12,r0
  1407. #mul_add_c(a[6],b[6],c1,c2,c3);
  1408. $LD r6,`6*$BNSZ`(r4)
  1409. $LD r7,`6*$BNSZ`(r5)
  1410. $UMULL r8,r6,r7
  1411. $UMULH r9,r6,r7
  1412. addc r10,r10,r8
  1413. adde r11,r11,r9
  1414. addze r12,r12
  1415. #mul_add_c(a[5],b[7],c1,c2,c3);
  1416. $LD r6,`5*$BNSZ`(r4)
  1417. $LD r7,`7*$BNSZ`(r5)
  1418. $UMULL r8,r6,r7
  1419. $UMULH r9,r6,r7
  1420. addc r10,r10,r8
  1421. adde r11,r11,r9
  1422. addze r12,r12
  1423. $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
  1424. #mul_add_c(a[6],b[7],c2,c3,c1);
  1425. $LD r6,`6*$BNSZ`(r4)
  1426. $UMULL r8,r6,r7
  1427. $UMULH r9,r6,r7
  1428. addc r11,r11,r8
  1429. adde r12,r12,r9
  1430. addze r10,r0
  1431. #mul_add_c(a[7],b[6],c2,c3,c1);
  1432. $LD r6,`7*$BNSZ`(r4)
  1433. $LD r7,`6*$BNSZ`(r5)
  1434. $UMULL r8,r6,r7
  1435. $UMULH r9,r6,r7
  1436. addc r11,r11,r8
  1437. adde r12,r12,r9
  1438. addze r10,r10
  1439. $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
  1440. #mul_add_c(a[7],b[7],c3,c1,c2);
  1441. $LD r7,`7*$BNSZ`(r5)
  1442. $UMULL r8,r6,r7
  1443. $UMULH r9,r6,r7
  1444. addc r12,r12,r8
  1445. adde r10,r10,r9
  1446. $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
  1447. $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
  1448. blr
  1449. .long 0
  1450. .byte 0,12,0x14,0,0,0,3,0
  1451. .long 0
  1452. .size .bn_mul_comba8,.-.bn_mul_comba8
  1453. #
  1454. # NOTE: The following label name should be changed to
  1455. # "bn_sub_words" i.e. remove the first dot
  1456. # for the gcc compiler. This should be automatically
  1457. # done in the build
  1458. #
  1459. #
  1460. .align 4
  1461. .bn_sub_words:
  1462. #
  1463. # Handcoded version of bn_sub_words
  1464. #
  1465. #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  1466. #
  1467. # r3 = r
  1468. # r4 = a
  1469. # r5 = b
  1470. # r6 = n
  1471. #
  1472. # Note: No loop unrolling done since this is not a performance
  1473. # critical loop.
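#
# Semantics, roughly: r[i] = a[i] - b[i] - borrow, returning the final
# borrow (0 or 1). On PowerPC the carry bit CA is an inverted borrow,
# which is why the code below seeds CA with subfc. and then propagates
# it with subfe/subfze rather than handling the borrow explicitly.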
  1474. xor r0,r0,r0 #set r0 = 0
  1475. #
  1476. # check for r6 = 0 AND set carry bit.
  1477. #
  1478. subfc. r7,r0,r6 # If r6 is 0 then result is 0.
  1479. # if r6 > 0 then result !=0
  1480. # In either case carry bit is set.
  1481. beq Lppcasm_sub_adios
  1482. addi r4,r4,-$BNSZ
  1483. addi r3,r3,-$BNSZ
  1484. addi r5,r5,-$BNSZ
  1485. mtctr r6
  1486. Lppcasm_sub_mainloop:
  1487. $LDU r7,$BNSZ(r4)
  1488. $LDU r8,$BNSZ(r5)
  1489. subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
  1490. # if carry = 1 this is r7-r8. Else it
  1491. # is r7-r8 -1 as we need.
  1492. $STU r6,$BNSZ(r3)
  1493. bdnz Lppcasm_sub_mainloop
  1494. Lppcasm_sub_adios:
  1495. subfze r3,r0 # if carry bit is set then r3 = 0 else -1
  1496. andi. r3,r3,1 # keep only last bit.
  1497. blr
  1498. .long 0
  1499. .byte 0,12,0x14,0,0,0,4,0
  1500. .long 0
  1501. .size .bn_sub_words,.-.bn_sub_words
  1502. #
  1503. # NOTE: The following label name should be changed to
  1504. # "bn_add_words" i.e. remove the first dot
  1505. # for the gcc compiler. This should be automatically
  1506. # done in the build
  1507. #
  1508. .align 4
  1509. .bn_add_words:
  1510. #
  1511. # Handcoded version of bn_add_words
  1512. #
  1513. #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  1514. #
  1515. # r3 = r
  1516. # r4 = a
  1517. # r5 = b
  1518. # r6 = n
  1519. #
  1520. # Note: No loop unrolling done since this is not a performance
  1521. # critical loop.
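#
# Semantics, roughly: r[i] = a[i] + b[i] + carry, returning the final
# carry bit (0 or 1). addic. below clears CA while testing n, and the
# adde/addze pair then threads the carry through the loop.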
  1522. xor r0,r0,r0
  1523. #
  1524. # check for r6 = 0. Is this needed?
  1525. #
  1526. addic. r6,r6,0 #test r6 and clear carry bit.
  1527. beq Lppcasm_add_adios
  1528. addi r4,r4,-$BNSZ
  1529. addi r3,r3,-$BNSZ
  1530. addi r5,r5,-$BNSZ
  1531. mtctr r6
  1532. Lppcasm_add_mainloop:
  1533. $LDU r7,$BNSZ(r4)
  1534. $LDU r8,$BNSZ(r5)
  1535. adde r8,r7,r8
  1536. $STU r8,$BNSZ(r3)
  1537. bdnz Lppcasm_add_mainloop
  1538. Lppcasm_add_adios:
  1539. addze r3,r0 #return carry bit.
  1540. blr
  1541. .long 0
  1542. .byte 0,12,0x14,0,0,0,4,0
  1543. .long 0
  1544. .size .bn_add_words,.-.bn_add_words
  1545. #
  1546. # NOTE: The following label name should be changed to
  1547. # "bn_div_words" i.e. remove the first dot
  1548. # for the gcc compiler. This should be automatically
  1549. # done in the build
  1550. #
  1551. .align 4
  1552. .bn_div_words:
  1553. #
  1554. # This is a cleaned-up version of code generated by
  1555. # the AIX compiler. The only optimization is to use
  1556. # the PPC instruction to count leading zeros instead
  1557. # of a call to num_bits_word. Since this was compiled
  1558. # only at level -O2, it can probably be squeezed further.
  1559. #
  1560. # r3 = h
  1561. # r4 = l
  1562. # r5 = d
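#
# Roughly: returns the one-word quotient of the two-word value h*2^BN_BITS2+l
# divided by d. d is first normalized so that its most significant bit is
# set (h and l are shifted to match), then each of the two passes of the
# outer loop produces BN_BITS4 quotient bits from a half-word estimate
# q = h/dh which the inner loop corrects downward.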
  1563. $UCMPI 0,r5,0 # compare r5 and 0
  1564. bne Lppcasm_div1 # proceed if d!=0
  1565. li r3,-1 # d=0 return -1
  1566. blr
  1567. Lppcasm_div1:
  1568. xor r0,r0,r0 #r0=0
  1569. li r8,$BITS
  1570. $CNTLZ. r7,r5 #r7 = num leading 0s in d.
  1571. beq Lppcasm_div2 #proceed if no leading zeros
  1572. subf r8,r7,r8 #r8 = BN_num_bits_word(d)
  1573. $SHR. r9,r3,r8 #are there any bits above r8'th?
  1574. $TR 16,r9,r0 #if there are, signal to dump core...
  1575. Lppcasm_div2:
  1576. $UCMP 0,r3,r5 #h>=d?
  1577. blt Lppcasm_div3 #goto Lppcasm_div3 if not
  1578. subf r3,r5,r3 #h-=d ;
  1579. Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
  1580. cmpi 0,0,r7,0 # is (i == 0)?
  1581. beq Lppcasm_div4
  1582. $SHL r3,r3,r7 # h = (h<< i)
  1583. $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
  1584. $SHL r5,r5,r7 # d<<=i
  1585. or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
  1586. $SHL r4,r4,r7 # l <<=i
  1587. Lppcasm_div4:
  1588. $SHRI r9,r5,`$BITS/2` # r9 = dh
  1589. # dl will be computed when needed
  1590. # as it saves registers.
  1591. li r6,2 #r6=2
  1592. mtctr r6 #loop count goes into the CTR register.
  1593. Lppcasm_divouterloop:
  1594. $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
  1595. $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
  1596. # compute here for innerloop.
  1597. $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
  1598. bne Lppcasm_div5 # goto Lppcasm_div5 if not
  1599. li r8,-1
  1600. $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
  1601. b Lppcasm_div6
  1602. Lppcasm_div5:
  1603. $UDIV r8,r3,r9 #q = h/dh
  1604. Lppcasm_div6:
  1605. $UMULL r12,r9,r8 #th = q*dh
  1606. $CLRU r10,r5,`$BITS/2` #r10=dl
  1607. $UMULL r6,r8,r10 #tl = q*dl
  1608. Lppcasm_divinnerloop:
  1609. subf r10,r12,r3 #t = h -th
  1610. $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
  1611. addic. r7,r7,0 #test if r7 == 0. used below.
  1612. # now want to compute
  1613. # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
  1614. # the following 2 instructions do that
  1615. $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
  1616. or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
  1617. $UCMP cr1,r6,r7 # compare (tl <= r7)
  1618. bne Lppcasm_divinnerexit
  1619. ble cr1,Lppcasm_divinnerexit
  1620. addi r8,r8,-1 #q--
  1621. subf r12,r9,r12 #th -=dh
  1622. $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
  1623. subf r6,r10,r6 #tl -=dl
  1624. b Lppcasm_divinnerloop
  1625. Lppcasm_divinnerexit:
  1626. $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
  1627. $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
  1628. $UCMP cr1,r4,r11 # compare l and tl
  1629. add r12,r12,r10 # th+=t
  1630. bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
  1631. addi r12,r12,1 # th++
  1632. Lppcasm_div7:
  1633. subf r11,r11,r4 #r11=l-tl
  1634. $UCMP cr1,r3,r12 #compare h and th
  1635. bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
  1636. addi r8,r8,-1 # q--
  1637. add r3,r5,r3 # h+=d
  1638. Lppcasm_div8:
  1639. subf r12,r12,r3 #r12 = h-th
  1640. $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
  1641. # want to compute
  1642. # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
  1643. # the following 2 instructions will do this.
  1644. $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
  1645. $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
  1646. bdz Lppcasm_div9 #if (count==0) break ;
  1647. $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
  1648. b Lppcasm_divouterloop
  1649. Lppcasm_div9:
  1650. or r3,r8,r0
  1651. blr
  1652. .long 0
  1653. .byte 0,12,0x14,0,0,0,3,0
  1654. .long 0
  1655. .size .bn_div_words,.-.bn_div_words
  1656. #
  1657. # NOTE: The following label name should be changed to
  1658. # "bn_sqr_words" i.e. remove the first dot
  1659. # for the gcc compiler. This should be automatically
  1660. # done in the build
  1661. #
  1662. .align 4
  1663. .bn_sqr_words:
  1664. #
  1665. # Optimized version of bn_sqr_words
  1666. #
  1667. # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
  1668. #
  1669. # r3 = r
  1670. # r4 = a
  1671. # r5 = n
  1672. #
  1673. # r6 = a[i].
  1674. # r7,r8 = product.
  1675. #
  1676. # No unrolling done here. Not performance critical.
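# Semantics, roughly: each input word is squared into two result words,
# r[2*i] = low half of a[i]*a[i] and r[2*i+1] = high half.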
  1677. addic. r5,r5,0 #test r5.
  1678. beq Lppcasm_sqr_adios
  1679. addi r4,r4,-$BNSZ
  1680. addi r3,r3,-$BNSZ
  1681. mtctr r5
  1682. Lppcasm_sqr_mainloop:
  1683. #sqr(r[0],r[1],a[0]);
  1684. $LDU r6,$BNSZ(r4)
  1685. $UMULL r7,r6,r6
  1686. $UMULH r8,r6,r6
  1687. $STU r7,$BNSZ(r3)
  1688. $STU r8,$BNSZ(r3)
  1689. bdnz Lppcasm_sqr_mainloop
  1690. Lppcasm_sqr_adios:
  1691. blr
  1692. .long 0
  1693. .byte 0,12,0x14,0,0,0,3,0
  1694. .long 0
  1695. .size .bn_sqr_words,.-.bn_sqr_words
  1696. #
  1697. # NOTE: The following label name should be changed to
  1698. # "bn_mul_words" i.e. remove the first dot
  1699. # for the gcc compiler. This should be automatically
  1700. # done in the build
  1701. #
  1702. .align 4
  1703. .bn_mul_words:
  1704. #
  1705. # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  1706. #
  1707. # r3 = rp
  1708. # r4 = ap
  1709. # r5 = num
  1710. # r6 = w
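#
# Semantics, roughly: rp[i] = low word of ap[i]*w + carry, with the new
# carry taken from the high word; returns the final carry. The main loop
# below is unrolled four times and keeps the running carry in the CA bit
# between limbs instead of folding it into a register on every step.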
  1711. xor r0,r0,r0
  1712. xor r12,r12,r12 # used for carry
  1713. rlwinm. r7,r5,30,2,31 # num >> 2
  1714. beq Lppcasm_mw_REM
  1715. mtctr r7
  1716. Lppcasm_mw_LOOP:
  1717. #mul(rp[0],ap[0],w,c1);
  1718. $LD r8,`0*$BNSZ`(r4)
  1719. $UMULL r9,r6,r8
  1720. $UMULH r10,r6,r8
  1721. addc r9,r9,r12
  1722. #addze r10,r10 #carry is NOT ignored.
  1723. #will be taken care of
  1724. #in second spin below
  1725. #using adde.
  1726. $ST r9,`0*$BNSZ`(r3)
  1727. #mul(rp[1],ap[1],w,c1);
  1728. $LD r8,`1*$BNSZ`(r4)
  1729. $UMULL r11,r6,r8
  1730. $UMULH r12,r6,r8
  1731. adde r11,r11,r10
  1732. #addze r12,r12
  1733. $ST r11,`1*$BNSZ`(r3)
  1734. #mul(rp[2],ap[2],w,c1);
  1735. $LD r8,`2*$BNSZ`(r4)
  1736. $UMULL r9,r6,r8
  1737. $UMULH r10,r6,r8
  1738. adde r9,r9,r12
  1739. #addze r10,r10
  1740. $ST r9,`2*$BNSZ`(r3)
  1741. #mul_add(rp[3],ap[3],w,c1);
  1742. $LD r8,`3*$BNSZ`(r4)
  1743. $UMULL r11,r6,r8
  1744. $UMULH r12,r6,r8
  1745. adde r11,r11,r10
  1746. addze r12,r12 #this spin we collect carry into
  1747. #r12
  1748. $ST r11,`3*$BNSZ`(r3)
  1749. addi r3,r3,`4*$BNSZ`
  1750. addi r4,r4,`4*$BNSZ`
  1751. bdnz Lppcasm_mw_LOOP
  1752. Lppcasm_mw_REM:
  1753. andi. r5,r5,0x3
  1754. beq Lppcasm_mw_OVER
  1755. #mul(rp[0],ap[0],w,c1);
  1756. $LD r8,`0*$BNSZ`(r4)
  1757. $UMULL r9,r6,r8
  1758. $UMULH r10,r6,r8
  1759. addc r9,r9,r12
  1760. addze r10,r10
  1761. $ST r9,`0*$BNSZ`(r3)
  1762. addi r12,r10,0
  1763. addi r5,r5,-1
  1764. cmpli 0,0,r5,0
  1765. beq Lppcasm_mw_OVER
  1766. #mul(rp[1],ap[1],w,c1);
  1767. $LD r8,`1*$BNSZ`(r4)
  1768. $UMULL r9,r6,r8
  1769. $UMULH r10,r6,r8
  1770. addc r9,r9,r12
  1771. addze r10,r10
  1772. $ST r9,`1*$BNSZ`(r3)
  1773. addi r12,r10,0
  1774. addi r5,r5,-1
  1775. cmpli 0,0,r5,0
  1776. beq Lppcasm_mw_OVER
  1777. #mul_add(rp[2],ap[2],w,c1);
  1778. $LD r8,`2*$BNSZ`(r4)
  1779. $UMULL r9,r6,r8
  1780. $UMULH r10,r6,r8
  1781. addc r9,r9,r12
  1782. addze r10,r10
  1783. $ST r9,`2*$BNSZ`(r3)
  1784. addi r12,r10,0
  1785. Lppcasm_mw_OVER:
  1786. addi r3,r12,0
  1787. blr
  1788. .long 0
  1789. .byte 0,12,0x14,0,0,0,4,0
  1790. .long 0
  1791. .size .bn_mul_words,.-.bn_mul_words
  1792. #
  1793. # NOTE: The following label name should be changed to
  1794. # "bn_mul_add_words" i.e. remove the first dot
  1795. # for the gcc compiler. This should be automatically
  1796. # done in the build
  1797. #
  1798. .align 4
  1799. .bn_mul_add_words:
  1800. #
  1801. # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  1802. #
  1803. # r3 = rp
  1804. # r4 = ap
  1805. # r5 = num
  1806. # r6 = w
  1807. #
  1808. # empirical evidence suggests that the unrolled version performs best!!
  1809. #
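# Semantics, roughly: rp[i] = rp[i] + ap[i]*w + carry, with the new carry
# taken from the high word of the product plus any addition overflow;
# returns the final carry word.
#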
  1810. xor r0,r0,r0 #r0 = 0
  1811. xor r12,r12,r12 #r12 = 0 . used for carry
  1812. rlwinm. r7,r5,30,2,31 # num >> 2
  1813. beq Lppcasm_maw_leftover # if (num < 4) goto Lppcasm_maw_leftover
  1814. mtctr r7
  1815. Lppcasm_maw_mainloop:
  1816. #mul_add(rp[0],ap[0],w,c1);
  1817. $LD r8,`0*$BNSZ`(r4)
  1818. $LD r11,`0*$BNSZ`(r3)
  1819. $UMULL r9,r6,r8
  1820. $UMULH r10,r6,r8
  1821. addc r9,r9,r12 #r12 is carry.
  1822. addze r10,r10
  1823. addc r9,r9,r11
  1824. #addze r10,r10
  1825. #the above instruction addze
  1826. #is NOT needed. Carry will NOT
  1827. #be ignored. It's not affected
  1828. #by multiply and will be collected
  1829. #in the next spin
  1830. $ST r9,`0*$BNSZ`(r3)
  1831. #mul_add(rp[1],ap[1],w,c1);
  1832. $LD r8,`1*$BNSZ`(r4)
  1833. $LD r9,`1*$BNSZ`(r3)
  1834. $UMULL r11,r6,r8
  1835. $UMULH r12,r6,r8
  1836. adde r11,r11,r10 #r10 is carry.
  1837. addze r12,r12
  1838. addc r11,r11,r9
  1839. #addze r12,r12
  1840. $ST r11,`1*$BNSZ`(r3)
  1841. #mul_add(rp[2],ap[2],w,c1);
  1842. $LD r8,`2*$BNSZ`(r4)
  1843. $UMULL r9,r6,r8
  1844. $LD r11,`2*$BNSZ`(r3)
  1845. $UMULH r10,r6,r8
  1846. adde r9,r9,r12
  1847. addze r10,r10
  1848. addc r9,r9,r11
  1849. #addze r10,r10
  1850. $ST r9,`2*$BNSZ`(r3)
  1851. #mul_add(rp[3],ap[3],w,c1);
  1852. $LD r8,`3*$BNSZ`(r4)
  1853. $UMULL r11,r6,r8
  1854. $LD r9,`3*$BNSZ`(r3)
  1855. $UMULH r12,r6,r8
  1856. adde r11,r11,r10
  1857. addze r12,r12
  1858. addc r11,r11,r9
  1859. addze r12,r12
  1860. $ST r11,`3*$BNSZ`(r3)
  1861. addi r3,r3,`4*$BNSZ`
  1862. addi r4,r4,`4*$BNSZ`
  1863. bdnz Lppcasm_maw_mainloop
  1864. Lppcasm_maw_leftover:
  1865. andi. r5,r5,0x3
  1866. beq Lppcasm_maw_adios
  1867. addi r3,r3,-$BNSZ
  1868. addi r4,r4,-$BNSZ
  1869. #mul_add(rp[0],ap[0],w,c1);
  1870. mtctr r5
  1871. $LDU r8,$BNSZ(r4)
  1872. $UMULL r9,r6,r8
  1873. $UMULH r10,r6,r8
  1874. $LDU r11,$BNSZ(r3)
  1875. addc r9,r9,r11
  1876. addze r10,r10
  1877. addc r9,r9,r12
  1878. addze r12,r10
  1879. $ST r9,0(r3)
  1880. bdz Lppcasm_maw_adios
  1881. #mul_add(rp[1],ap[1],w,c1);
  1882. $LDU r8,$BNSZ(r4)
  1883. $UMULL r9,r6,r8
  1884. $UMULH r10,r6,r8
  1885. $LDU r11,$BNSZ(r3)
  1886. addc r9,r9,r11
  1887. addze r10,r10
  1888. addc r9,r9,r12
  1889. addze r12,r10
  1890. $ST r9,0(r3)
  1891. bdz Lppcasm_maw_adios
  1892. #mul_add(rp[2],ap[2],w,c1);
  1893. $LDU r8,$BNSZ(r4)
  1894. $UMULL r9,r6,r8
  1895. $UMULH r10,r6,r8
  1896. $LDU r11,$BNSZ(r3)
  1897. addc r9,r9,r11
  1898. addze r10,r10
  1899. addc r9,r9,r12
  1900. addze r12,r10
  1901. $ST r9,0(r3)
  1902. Lppcasm_maw_adios:
  1903. addi r3,r12,0
  1904. blr
  1905. .long 0
  1906. .byte 0,12,0x14,0,0,0,4,0
  1907. .long 0
  1908. .size .bn_mul_add_words,.-.bn_mul_add_words
  1909. .align 4
  1910. EOF
  1911. $data =~ s/\`([^\`]*)\`/eval $1/gem;
  1912. print $data;
  1913. close STDOUT or die "error closing STDOUT: $!";