/* mips3.s */
  /* Version and authorship banner, emitted as two NUL-terminated strings
   * into the read-only data section. */
	.rdata
	.asciiz	"mips3.s, Version 1.1"
	.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
  4. /*
  5. * ====================================================================
  6. * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  7. * project.
  8. *
  9. * Rights for redistribution and usage in source and binary forms are
  10. * granted according to the OpenSSL license. Warranty of any kind is
  11. * disclaimed.
  12. * ====================================================================
  13. */
  14. /*
  15. * This is my modest contribution to the OpenSSL project (see
  16. * http://www.openssl.org/ for more information about it) and is
  17. * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
  18. * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  19. *
  20. * The module is designed to work with either of the "new" MIPS ABI(5),
  21. * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
  22. * IRIX 5.x not only because it doesn't support new ABIs but also
  23. * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
  24. * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
  25. * cause illegal instruction exception:-(
  26. *
  27. * In addition the code depends on preprocessor flags set up by MIPSpro
  28. * compiler driver (either as or cc) and therefore (probably?) can't be
  29. * compiled by the GNU assembler. GNU C driver manages fine though...
  30. * I mean as long as -mmips-as is specified or is the default option,
  31. * because then it simply invokes /usr/bin/as which in turn takes
  32. * perfect care of the preprocessor definitions. Another neat feature
  33. * offered by the MIPSpro assembler is an optimization pass. This gave
  34. * me the opportunity to have the code looking more regular as all those
  35. * architecture dependent instruction rescheduling details were left to
  36. * the assembler. Cool, huh?
  37. *
  38. * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
  39. * goes way over 3 times faster!
  40. *
  41. * <appro@fy.chalmers.se>
  42. */
  43. #include <asm.h>
  44. #include <regdef.h>
  /*
   * MOVNZ(cond,dst,src): copy src into dst when cond is non-zero and
   * leave dst untouched otherwise.
   */
#if _MIPS_ISA>=4
/* MIPS IV and later: a single conditional-move instruction. */
#define	MOVNZ(cond,dst,src)	movn	dst,src,cond
#else
/*
 * Earlier ISAs: a branch-likely whose delay slot holds the move.  The
 * branch targets the instruction that follows the delay slot, so taken
 * versus not-taken only decides whether the move executes (a
 * branch-likely annuls its delay slot when it is not taken).
 */
#define	MOVNZ(cond,dst,src)	\
	.set	noreorder;	\
	bnezl	cond,.+8;	\
	move	dst,src;	\
	.set	reorder
#endif
  /*
   * Code section.  AT is used explicitly as a scratch register by the
   * routines below, hence "noat".  Instruction scheduling is left to the
   * assembler ("reorder") except where a routine switches to "noreorder"
   * locally to place delay-slot instructions by hand.
   */
	.text
	.set	noat
	.set	reorder

/* v1 holds the constant -4 inside the word-loop routines; AND-ing the
 * word count with it gives the count rounded down to a multiple of 4. */
#define	MINUS4	v1
  /*
   * bn_mul_add_words: r[i] += a[i] * w for i in [0, num), propagating a
   * carry word from each position to the next.
   *
   * In:   a0 = r (read and written), a1 = a (read only),
   *       a2 = num (signed word count), a3 = w
   * Out:  v0 = carry word out of the last position; 0 when num <= 0
   * Uses: t0-t3, ta0-ta3, v1 (MINUS4), AT, HI/LO; a0, a1, a2 are modified
   *
   * The main loop handles four words per pass; the tail handles the
   * remaining one to three words.
   */
	.align	5
LEAF(bn_mul_add_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_add_words_proceed
	ld	t0,0(a1)		/* delay slot, taken path only: t0 = a[0] */
	jr	ra			/* num <= 0: nothing to do */
	move	v0,zero			/* delay slot: return 0 */
	.set	reorder

.L_bn_mul_add_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* non-zero iff at least 4 words remain */
	move	v0,zero			/* carry = 0 */
	beqz	ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	/* word 0; operands for words 1 and 2 are loaded while the
	 * multiplier is busy */
	dmultu	t0,a3
	ld	t1,0(a0)
	ld	t2,8(a1)
	ld	t3,8(a0)
	ld	ta0,16(a1)
	ld	ta1,16(a0)
	daddu	t1,t1,v0		/* r[0] + carry-in */
	sltu	v0,t1,v0		/* carry out of that addition.  Author's
					 * note: the manuals describe sltu as a
					 * 32-bit compare, but it seems to work
					 * fine on 64-bit registers too. */
	mflo	AT
	mfhi	t0
	daddu	t1,t1,AT		/* + low half of the product */
	daddu	v0,v0,t0		/* carry += high half of the product */
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,v0,AT		/* carry += carry out of the low add */

	/* word 1 */
	dmultu	t2,a3
	ld	ta2,24(a1)
	ld	ta3,24(a0)
	daddu	t3,t3,v0
	sltu	v0,t3,v0
	mflo	AT
	mfhi	t2
	daddu	t3,t3,AT
	daddu	v0,v0,t2
	sltu	AT,t3,AT
	sd	t3,8(a0)
	daddu	v0,v0,AT

	/* word 2; the count and both pointers are advanced here, so the
	 * remaining stores use negative offsets */
	dmultu	ta0,a3
	subu	a2,a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	daddu	ta1,ta1,v0
	sltu	v0,ta1,v0
	mflo	AT
	mfhi	ta0
	daddu	ta1,ta1,AT
	daddu	v0,v0,ta0
	sltu	AT,ta1,AT
	sd	ta1,-16(a0)
	daddu	v0,v0,AT

	/* word 3 */
	dmultu	ta2,a3
	and	ta0,a2,MINUS4		/* another full group of four left? */
	daddu	ta3,ta3,v0
	sltu	v0,ta3,v0
	mflo	AT
	mfhi	ta2
	daddu	ta3,ta3,AT
	daddu	v0,v0,ta2
	sltu	AT,ta3,AT
	sd	ta3,-8(a0)
	daddu	v0,v0,AT

	.set	noreorder
	bgtzl	ta0,.L_bn_mul_add_words_loop
	ld	t0,0(a1)		/* delay slot, taken path only */
	bnezl	a2,.L_bn_mul_add_words_tail
	ld	t0,0(a1)		/* delay slot, taken path only */
	.set	reorder

.L_bn_mul_add_words_return:
	jr	ra

.L_bn_mul_add_words_tail:
	/* first leftover word */
	dmultu	t0,a3
	ld	t1,0(a0)
	subu	a2,a2,1
	daddu	t1,t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,t1,AT
	daddu	v0,v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,v0,AT
	beqz	a2,.L_bn_mul_add_words_return

	/* second leftover word */
	ld	t0,8(a1)
	dmultu	t0,a3
	ld	t1,8(a0)
	subu	a2,a2,1
	daddu	t1,t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,t1,AT
	daddu	v0,v0,t0
	sltu	AT,t1,AT
	sd	t1,8(a0)
	daddu	v0,v0,AT
	beqz	a2,.L_bn_mul_add_words_return

	/* third and last leftover word */
	ld	t0,16(a1)
	dmultu	t0,a3
	ld	t1,16(a0)
	daddu	t1,t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,t1,AT
	daddu	v0,v0,t0
	sltu	AT,t1,AT
	sd	t1,16(a0)
	daddu	v0,v0,AT
	jr	ra
END(bn_mul_add_words)
  /*
   * bn_mul_words: r[i] = low 64 bits of (a[i] * w + carry) for i in
   * [0, num); the high part becomes the carry into the next word.
   *
   * In:   a0 = r (written), a1 = a (read), a2 = num (signed word count),
   *       a3 = w
   * Out:  v0 = carry word out of the last position; 0 when num <= 0
   * Uses: t0-t3, ta0-ta3, v1 (MINUS4), AT, HI/LO; a0, a1, a2 are modified
   *
   * Four words per pass in the main loop, one to three in the tail.
   */
	.align	5
LEAF(bn_mul_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_words_proceed
	ld	t0,0(a1)		/* delay slot, taken path only: t0 = a[0] */
	jr	ra			/* num <= 0 */
	move	v0,zero			/* delay slot: return 0 */
	.set	reorder

.L_bn_mul_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* non-zero iff at least 4 words remain */
	move	v0,zero			/* carry = 0 */
	beqz	ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	/* word 0; a[1..3] are fetched while the multiplier is busy */
	dmultu	t0,a3
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	AT
	mfhi	t0
	daddu	v0,v0,AT		/* carry-in + low half */
	sltu	t1,v0,AT		/* carry out of that addition */
	sd	v0,0(a0)
	daddu	v0,t1,t0		/* new carry = high half + carry bit */

	/* word 1; count and pointers are advanced here, later stores use
	 * negative offsets */
	dmultu	t2,a3
	subu	a2,a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	mflo	AT
	mfhi	t2
	daddu	v0,v0,AT
	sltu	t3,v0,AT
	sd	v0,-24(a0)
	daddu	v0,t3,t2

	/* word 2 */
	dmultu	ta0,a3
	mflo	AT
	mfhi	ta0
	daddu	v0,v0,AT
	sltu	ta1,v0,AT
	sd	v0,-16(a0)
	daddu	v0,ta1,ta0

	/* word 3 */
	dmultu	ta2,a3
	and	ta0,a2,MINUS4		/* another full group of four left? */
	mflo	AT
	mfhi	ta2
	daddu	v0,v0,AT
	sltu	ta3,v0,AT
	sd	v0,-8(a0)
	daddu	v0,ta3,ta2

	.set	noreorder
	bgtzl	ta0,.L_bn_mul_words_loop
	ld	t0,0(a1)		/* delay slot, taken path only */
	bnezl	a2,.L_bn_mul_words_tail
	ld	t0,0(a1)		/* delay slot, taken path only */
	.set	reorder

.L_bn_mul_words_return:
	jr	ra

.L_bn_mul_words_tail:
	/* first leftover word */
	dmultu	t0,a3
	subu	a2,a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,v0,AT
	sltu	t1,v0,AT
	sd	v0,0(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_return

	/* second leftover word */
	ld	t0,8(a1)
	dmultu	t0,a3
	subu	a2,a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,v0,AT
	sltu	t1,v0,AT
	sd	v0,8(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_return

	/* third and last leftover word */
	ld	t0,16(a1)
	dmultu	t0,a3
	mflo	AT
	mfhi	t0
	daddu	v0,v0,AT
	sltu	t1,v0,AT
	sd	v0,16(a0)
	daddu	v0,t1,t0
	jr	ra
END(bn_mul_words)
  /*
   * bn_sqr_words: for i in [0, num), store the 128-bit square of a[i] as
   * two words: r[2*i] = low half, r[2*i+1] = high half.
   *
   * In:   a0 = r (written, two words per input word), a1 = a (read),
   *       a2 = num (signed word count)
   * Out:  v0 = 0 on every return path
   * Uses: t0-t3, ta0-ta3, v1 (MINUS4), HI/LO; a0, a1, a2 are modified
   *
   * Four input words per pass in the main loop, one to three in the tail.
   */
	.align	5
LEAF(bn_sqr_words)
	.set	noreorder
	bgtzl	a2,.L_bn_sqr_words_proceed
	ld	t0,0(a1)		/* delay slot, taken path only: t0 = a[0] */
	jr	ra			/* num <= 0 */
	move	v0,zero			/* delay slot */
	.set	reorder

.L_bn_sqr_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* non-zero iff at least 4 words remain */
	move	v0,zero
	beqz	ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	/* a[0]^2; a[1..3] are fetched while the multiplier is busy */
	dmultu	t0,t0
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)

	/* a[1]^2; r advances by 8 words and a by 4, so the remaining
	 * stores use negative offsets */
	dmultu	t2,t2
	subu	a2,a2,4
	PTR_ADD	a0,64
	PTR_ADD	a1,32
	mflo	t3
	mfhi	t2
	sd	t3,-48(a0)
	sd	t2,-40(a0)

	/* a[2]^2 */
	dmultu	ta0,ta0
	mflo	ta1
	mfhi	ta0
	sd	ta1,-32(a0)
	sd	ta0,-24(a0)

	/* a[3]^2 */
	dmultu	ta2,ta2
	and	ta0,a2,MINUS4		/* another full group of four left? */
	mflo	ta3
	mfhi	ta2
	sd	ta3,-16(a0)
	sd	ta2,-8(a0)

	.set	noreorder
	bgtzl	ta0,.L_bn_sqr_words_loop
	ld	t0,0(a1)		/* delay slot, taken path only */
	bnezl	a2,.L_bn_sqr_words_tail
	ld	t0,0(a1)		/* delay slot, taken path only */
	.set	reorder

.L_bn_sqr_words_return:
	move	v0,zero
	jr	ra

.L_bn_sqr_words_tail:
	/* first leftover word */
	dmultu	t0,t0
	subu	a2,a2,1
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)
	beqz	a2,.L_bn_sqr_words_return

	/* second leftover word */
	ld	t0,8(a1)
	dmultu	t0,t0
	subu	a2,a2,1
	mflo	t1
	mfhi	t0
	sd	t1,16(a0)
	sd	t0,24(a0)
	beqz	a2,.L_bn_sqr_words_return

	/* third and last leftover word */
	ld	t0,16(a1)
	dmultu	t0,t0
	mflo	t1
	mfhi	t0
	sd	t1,32(a0)
	sd	t0,40(a0)
	jr	ra			/* v0 is still 0 from the prologue */
END(bn_sqr_words)
  /*
   * bn_add_words: r[i] = a[i] + b[i] + carry for i in [0, num).
   *
   * In:   a0 = r (written), a1 = a (read), a2 = b (read),
   *       a3 = num (signed word count)
   * Out:  v0 = carry out of the last position (0 or 1); 0 when num <= 0
   * Uses: t0-t3, ta0-ta3, t8, t9, v1 (MINUS4), AT; a0-a3 are modified
   *
   * Each word needs two additions (a+b, then +carry); at most one of
   * them can overflow, so their two carry bits are simply summed.
   */
	.align	5
LEAF(bn_add_words)
	.set	noreorder
	bgtzl	a3,.L_bn_add_words_proceed
	ld	t0,0(a1)		/* delay slot, taken path only: t0 = a[0] */
	jr	ra			/* num <= 0 */
	move	v0,zero			/* delay slot: return 0 */
	.set	reorder

.L_bn_add_words_proceed:
	li	MINUS4,-4
	and	AT,a3,MINUS4		/* non-zero iff at least 4 words remain */
	move	v0,zero			/* carry = 0 */
	beqz	AT,.L_bn_add_words_tail

.L_bn_add_words_loop:
	/* Fetch the four operand pairs, interleaved with the count and
	 * pointer updates; loads issued after a pointer has been advanced
	 * use negative offsets. */
	ld	ta0,0(a2)
	subu	a3,a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4		/* another full group of four left? */
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)

	/* word 0 */
	daddu	ta0,ta0,t0		/* b + a */
	sltu	t8,ta0,t0		/* carry out of b + a */
	daddu	t0,ta0,v0		/* + carry-in */
	sltu	v0,t0,ta0		/* carry out of the carry-in add */
	sd	t0,-32(a0)
	daddu	v0,v0,t8

	/* word 1 */
	daddu	ta1,ta1,t1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,-24(a0)
	daddu	v0,v0,t9

	/* word 2 */
	daddu	ta2,ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,-16(a0)
	daddu	v0,v0,t8

	/* word 3 */
	daddu	ta3,ta3,t3
	sltu	t9,ta3,t3
	daddu	t3,ta3,v0
	sltu	v0,t3,ta3
	sd	t3,-8(a0)
	daddu	v0,v0,t9

	.set	noreorder
	bgtzl	AT,.L_bn_add_words_loop
	ld	t0,0(a1)		/* delay slot, taken path only */
	bnezl	a3,.L_bn_add_words_tail
	ld	t0,0(a1)		/* delay slot, taken path only */
	.set	reorder

.L_bn_add_words_return:
	jr	ra

.L_bn_add_words_tail:
	/* first leftover word */
	ld	ta0,0(a2)
	daddu	ta0,ta0,t0
	subu	a3,a3,1
	sltu	t8,ta0,t0
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0
	sd	t0,0(a0)
	daddu	v0,v0,t8
	beqz	a3,.L_bn_add_words_return

	/* second leftover word */
	ld	t1,8(a1)
	ld	ta1,8(a2)
	daddu	ta1,ta1,t1
	subu	a3,a3,1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,8(a0)
	daddu	v0,v0,t9
	beqz	a3,.L_bn_add_words_return

	/* third and last leftover word */
	ld	t2,16(a1)
	ld	ta2,16(a2)
	daddu	ta2,ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,16(a0)
	daddu	v0,v0,t8
	jr	ra
END(bn_add_words)
  /*
   * bn_sub_words: r[i] = a[i] - b[i] - borrow for i in [0, num).
   *
   * In:   a0 = r (written), a1 = a (read), a2 = b (read),
   *       a3 = num (signed word count)
   * Out:  v0 = borrow out of the last position (0 or 1); 0 when num <= 0
   * Uses: t0-t3, ta0-ta3, t8, t9, v1 (MINUS4), AT; a0-a3 are modified
   *
   * Borrow rule per word: with d = a - b, the new borrow is (a < b) when
   * d != 0; when d == 0 the incoming borrow is passed through unchanged.
   * That is what MOVNZ(d, v0, a<b) implements.
   */
	.align	5
LEAF(bn_sub_words)
	.set	noreorder
	bgtzl	a3,.L_bn_sub_words_proceed
	ld	t0,0(a1)		/* delay slot, taken path only: t0 = a[0] */
	jr	ra			/* num <= 0 */
	move	v0,zero			/* delay slot: return 0 */
	.set	reorder

.L_bn_sub_words_proceed:
	li	MINUS4,-4
	and	AT,a3,MINUS4		/* non-zero iff at least 4 words remain */
	move	v0,zero			/* borrow = 0 */
	beqz	AT,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	/* Fetch the four operand pairs, interleaved with the count and
	 * pointer updates; loads issued after a pointer has been advanced
	 * use negative offsets. */
	ld	ta0,0(a2)
	subu	a3,a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4		/* another full group of four left? */
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)

	/* word 0 */
	sltu	t8,t0,ta0		/* a < b */
	dsubu	t0,t0,ta0		/* d = a - b */
	dsubu	ta0,t0,v0		/* result = d - borrow-in */
	sd	ta0,-32(a0)
	MOVNZ(t0,v0,t8)			/* d != 0: borrow = (a < b) */

	/* word 1 */
	sltu	t9,t1,ta1
	dsubu	t1,t1,ta1
	dsubu	ta1,t1,v0
	sd	ta1,-24(a0)
	MOVNZ(t1,v0,t9)

	/* word 2 */
	sltu	t8,t2,ta2
	dsubu	t2,t2,ta2
	dsubu	ta2,t2,v0
	sd	ta2,-16(a0)
	MOVNZ(t2,v0,t8)

	/* word 3 */
	sltu	t9,t3,ta3
	dsubu	t3,t3,ta3
	dsubu	ta3,t3,v0
	sd	ta3,-8(a0)
	MOVNZ(t3,v0,t9)

	.set	noreorder
	bgtzl	AT,.L_bn_sub_words_loop
	ld	t0,0(a1)		/* delay slot, taken path only */
	bnezl	a3,.L_bn_sub_words_tail
	ld	t0,0(a1)		/* delay slot, taken path only */
	.set	reorder

.L_bn_sub_words_return:
	jr	ra

.L_bn_sub_words_tail:
	/* first leftover word */
	ld	ta0,0(a2)
	subu	a3,a3,1
	sltu	t8,t0,ta0
	dsubu	t0,t0,ta0
	dsubu	ta0,t0,v0
	MOVNZ(t0,v0,t8)
	sd	ta0,0(a0)
	beqz	a3,.L_bn_sub_words_return

	/* second leftover word */
	ld	t1,8(a1)
	subu	a3,a3,1
	ld	ta1,8(a2)
	sltu	t9,t1,ta1
	dsubu	t1,t1,ta1
	dsubu	ta1,t1,v0
	MOVNZ(t1,v0,t9)
	sd	ta1,8(a0)
	beqz	a3,.L_bn_sub_words_return

	/* third and last leftover word */
	ld	t2,16(a1)
	ld	ta2,16(a2)
	sltu	t8,t2,ta2
	dsubu	t2,t2,ta2
	dsubu	ta2,t2,v0
	MOVNZ(t2,v0,t8)
	sd	ta2,16(a0)
	jr	ra
END(bn_sub_words)

/* v1 is an ordinary register again from here on. */
#undef	MINUS4
  /*
   * bn_div_3_words: estimate one quotient word from the top three words
   * of a dividend and the top two words of a divisor.
   *
   * In:   a0 = m, pointer to the most significant of the three dividend
   *            words (m[0], m[-1] and m[-2] are read)
   *       a1 = d1, the second divisor word
   *       a2 = d0, the leading divisor word
   * Out:  v0 = quotient estimate; all ones when m[0] == d0
   *
   * The routine first calls bn_div_words(m[0], m[-1], d0), then lowers
   * the quotient while q*d1 still exceeds remainder:m[-2], adding d0 back
   * to the remainder each time.
   */
	.align	5
LEAF(bn_div_3_words)
	.set	reorder
	move	a3,a0			/* bn_div_words is known not to touch
					 * a3, ta2 and ta3 and to preserve a2,
					 * so two arguments and the return
					 * address can be parked in registers
					 * rather than on the stack */
	ld	a0,0(a3)		/* a0 = m[0]  (high dividend word) */
	move	ta2,a1			/* ta2 = d1 */
	ld	a1,-8(a3)		/* a1 = m[-1] (low dividend word) */
	bne	a0,a2,.L_bn_div_3_words_proceed
	li	v0,-1			/* m[0] == d0: answer is all ones */
	jr	ra

.L_bn_div_3_words_proceed:
	move	ta3,ra			/* park the return address */
	bal	bn_div_words		/* v0 = quotient, v1 = remainder */
	move	ra,ta3

	dmultu	ta2,v0			/* d1 * q */
	ld	t2,-16(a3)		/* t2 = m[-2] */
	move	ta0,zero
	mfhi	t1			/* t1:t0 = d1 * q */
	mflo	t0
	sltu	t8,t1,v1		/* high half already below remainder? */

.L_bn_div_3_words_inner_loop:
	bnez	t8,.L_bn_div_3_words_inner_loop_done
	sgeu	AT,t2,t0		/* m[-2] >= low half ... */
	seq	t9,t1,v1		/* ... and high half == remainder */
	and	AT,AT,t9		/* AT != 0: q is acceptable as it is */

	/* Prepare the next iteration's state: remainder += d0 and
	 * t1:t0 -= d1.  These values only matter if the loop continues. */
	sltu	t3,t0,ta2		/* borrow out of the low-half subtract */
	daddu	v1,v1,a2
	dsubu	t1,t1,t3
	dsubu	t0,t0,ta2
	sltu	t8,t1,v1
	sltu	ta0,v1,a2		/* remainder + d0 wrapped around */
	or	t8,t8,ta0		/* either condition ends the loop */
	.set	noreorder
	beqzl	AT,.L_bn_div_3_words_inner_loop
	dsubu	v0,v0,1			/* delay slot, taken path only: q-- */
	.set	reorder

.L_bn_div_3_words_inner_loop_done:
	jr	ra
END(bn_div_3_words)
  /*
   * bn_div_words: divide the two-word value a0:a1 by a2.
   *
   * In:   a0 = h (high dividend word), a1 = l (low dividend word),
   *       a2 = d (divisor)
   * Out:  v0 = quotient; v1 = remainder.  When d == 0 the routine
   *       returns v0 = -1 without doing anything else.
   * Uses: t0-t3, t8, t9, ta0, ta1, AT, HI/LO; a0 and a1 are modified;
   *       a2 is shifted during the computation and shifted back before
   *       returning.
   *
   * Method: shift d left until its top bit is set (count kept in t9),
   * apply the same shift to h:l, then produce the quotient as two
   * 32-bit digits.  Each digit is estimated by dividing by the upper
   * half of d and is then decremented until digit*d fits.
   */
	.align	5
LEAF(bn_div_words)
	.set	noreorder
	bnezl	a2,.L_bn_div_words_proceed
	move	v1,zero			/* delay slot, taken path only */
	jr	ra
	li	v0,-1			/* d == 0.  Author's note: signalling
					 * div-by-zero with 'break 7' would be
					 * preferable. */

.L_bn_div_words_proceed:
	bltz	a2,.L_bn_div_words_body	/* top bit already set: no shifting */
	move	t9,v1			/* delay slot: shift count = 0 */
	dsll	a2,a2,1
	bgtz	a2,.-4			/* back to the dsll until bit 63 is set */
	addu	t9,t9,1			/* delay slot: one per shift performed */

	.set	reorder
	negu	t1,t9			/* as a shift amount this acts as 64-t9 */
	li	t2,-1
	dsll	t2,t2,t1		/* mask of the top t9 bits */
	and	t2,t2,a0		/* bits of h that the shift would lose */
	dsrl	AT,a1,t1		/* bits of l that move up into h */
	.set	noreorder
	bnezl	t2,.+8			/* the target is the instruction after
					 * the break, so the branch only decides
					 * whether the delay slot runs */
	break	6			/* signal overflow */
	.set	reorder
	dsll	a0,a0,t9
	dsll	a1,a1,t9
	or	a0,a0,AT

#define	QT	ta0
#define	HH	ta1
#define	DH	v1
.L_bn_div_words_body:
	dsrl	DH,a2,32		/* DH = upper half of d */
	sgeu	AT,a0,a2
	.set	noreorder
	bnezl	AT,.+8			/* same trick as above: */
	dsubu	a0,a0,a2		/* h -= d, only when h >= d */
	.set	reorder

	/* upper quotient digit */
	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,QT,32		/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div1
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div1:
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,t3,AT		/* t3 = low(h):high(l) */
	mflo	t0
	mfhi	t1			/* t1:t0 = d * q */
.L_bn_div_words_inner_loop1:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t2,t8
	sltu	v0,t0,a2		/* borrow for the t1:t0 -= d below */
	or	AT,AT,t2		/* AT != 0 iff t1:t0 > HH:t3 */
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop1_done
	dsubu	t1,t1,v0		/* delay slot, runs on both paths; t1 is
					 * not read again after the loop exits */
	dsubu	t0,t0,a2
	b	.L_bn_div_words_inner_loop1
	dsubu	QT,QT,1			/* delay slot: q-- */
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	/* lower quotient digit */
	dsll	a1,a1,32
	dsubu	a0,t3,t0		/* partial remainder */
	dsll	v0,QT,32		/* upper digit into its place */
	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,QT,32		/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div2
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div2:
#undef	DH
	/* v1 is plain scratch from here on */
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop2:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t2,t8
	sltu	v1,t0,a2
	or	AT,AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop2_done
	dsubu	t1,t1,v1		/* delay slot, runs on both paths */
	dsubu	t0,t0,a2
	b	.L_bn_div_words_inner_loop2
	dsubu	QT,QT,1			/* delay slot: q-- */
	.set	reorder
.L_bn_div_words_inner_loop2_done:
#undef	HH

	dsubu	a0,t3,t0
	or	v0,v0,QT		/* combine the two digits */
	dsrl	v1,a0,t9		/* v1 contains remainder if anybody wants it */
	dsrl	a2,a2,t9		/* restore a2 */
	jr	ra
#undef	QT
END(bn_div_words)
  /*
   * Register aliases for the comba routines that follow.  a_N and b_N
   * hold the input words, c_1..c_3 form the rotating accumulator and
   * t_1/t_2 receive the halves of each product.
   */

/* input words 0..3 */
#define	a_0	t0
#define	a_1	t1
#define	a_2	t2
#define	a_3	t3
#define	b_0	ta0
#define	b_1	ta1
#define	b_2	ta2
#define	b_3	ta3

/* input words 4..7; the s-registers are saved on the stack by the user */
#define	a_4	s0
#define	a_5	s2
#define	a_6	s4
#define	a_7	a1	/* once we load a[7] we don't need a anymore */
#define	b_4	s1
#define	b_5	s3
#define	b_6	s5
#define	b_7	a2	/* once we load b[7] we don't need b anymore */

/* product halves */
#define	t_1	t8
#define	t_2	t9

/* accumulator words */
#define	c_1	v0
#define	c_2	v1
#define	c_3	a3

/* bytes of stack reserved for the six saved s-registers (6 * 8) */
#define	FRAME_SIZE	48
  678. .align 5
  679. LEAF(bn_mul_comba8)
  680. .set noreorder
  681. PTR_SUB sp,FRAME_SIZE
  682. .frame sp,64,ra
  683. .set reorder
  684. ld a_0,0(a1) /* If compiled with -mips3 option on
  685. * R5000 box assembler barks on this
  686. * line with "shouldn't have mult/div
  687. * as last instruction in bb (R10K
  688. * bug)" warning. If anybody out there
  689. * has a clue about how to circumvent
  690. * this do send me a note.
  691. * <appro@fy.chalmers.se>
  692. */
  693. ld b_0,0(a2)
  694. ld a_1,8(a1)
  695. ld a_2,16(a1)
  696. ld a_3,24(a1)
  697. ld b_1,8(a2)
  698. ld b_2,16(a2)
  699. ld b_3,24(a2)
  700. dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
  701. sd s0,0(sp)
  702. sd s1,8(sp)
  703. sd s2,16(sp)
  704. sd s3,24(sp)
  705. sd s4,32(sp)
  706. sd s5,40(sp)
  707. mflo c_1
  708. mfhi c_2
  709. dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
  710. ld a_4,32(a1)
  711. ld a_5,40(a1)
  712. ld a_6,48(a1)
  713. ld a_7,56(a1)
  714. ld b_4,32(a2)
  715. ld b_5,40(a2)
  716. mflo t_1
  717. mfhi t_2
  718. daddu c_2,t_1
  719. sltu AT,c_2,t_1
  720. daddu c_3,t_2,AT
  721. dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
  722. ld b_6,48(a2)
  723. ld b_7,56(a2)
  724. sd c_1,0(a0) /* r[0]=c1; */
  725. mflo t_1
  726. mfhi t_2
  727. daddu c_2,t_1
  728. sltu AT,c_2,t_1
  729. daddu t_2,AT
  730. daddu c_3,t_2
  731. sltu c_1,c_3,t_2
  732. sd c_2,8(a0) /* r[1]=c2; */
  733. dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
  734. mflo t_1
  735. mfhi t_2
  736. daddu c_3,t_1
  737. sltu AT,c_3,t_1
  738. daddu t_2,AT
  739. daddu c_1,t_2
  740. dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
  741. mflo t_1
  742. mfhi t_2
  743. daddu c_3,t_1
  744. sltu AT,c_3,t_1
  745. daddu t_2,AT
  746. daddu c_1,t_2
  747. sltu c_2,c_1,t_2
  748. dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
  749. mflo t_1
  750. mfhi t_2
  751. daddu c_3,t_1
  752. sltu AT,c_3,t_1
  753. daddu t_2,AT
  754. daddu c_1,t_2
  755. sltu AT,c_1,t_2
  756. daddu c_2,AT
  757. sd c_3,16(a0) /* r[2]=c3; */
  758. dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
  759. mflo t_1
  760. mfhi t_2
  761. daddu c_1,t_1
  762. sltu AT,c_1,t_1
  763. daddu t_2,AT
  764. daddu c_2,t_2
  765. sltu c_3,c_2,t_2
  766. dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
  767. mflo t_1
  768. mfhi t_2
  769. daddu c_1,t_1
  770. sltu AT,c_1,t_1
  771. daddu t_2,AT
  772. daddu c_2,t_2
  773. sltu AT,c_2,t_2
  774. daddu c_3,AT
  775. dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
  776. mflo t_1
  777. mfhi t_2
  778. daddu c_1,t_1
  779. sltu AT,c_1,t_1
  780. daddu t_2,AT
  781. daddu c_2,t_2
  782. sltu AT,c_2,t_2
  783. daddu c_3,AT
  784. dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
  785. mflo t_1
  786. mfhi t_2
  787. daddu c_1,t_1
  788. sltu AT,c_1,t_1
  789. daddu t_2,AT
  790. daddu c_2,t_2
  791. sltu AT,c_2,t_2
  792. daddu c_3,AT
  793. sd c_1,24(a0) /* r[3]=c1; */
  794. dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
  795. mflo t_1
  796. mfhi t_2
  797. daddu c_2,t_1
  798. sltu AT,c_2,t_1
  799. daddu t_2,AT
  800. daddu c_3,t_2
  801. sltu c_1,c_3,t_2
  802. dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
  803. mflo t_1
  804. mfhi t_2
  805. daddu c_2,t_1
  806. sltu AT,c_2,t_1
  807. daddu t_2,AT
  808. daddu c_3,t_2
  809. sltu AT,c_3,t_2
  810. daddu c_1,AT
  811. dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
  812. mflo t_1
  813. mfhi t_2
  814. daddu c_2,t_1
  815. sltu AT,c_2,t_1
  816. daddu t_2,AT
  817. daddu c_3,t_2
  818. sltu AT,c_3,t_2
  819. daddu c_1,AT
  820. dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
  821. mflo t_1
  822. mfhi t_2
  823. daddu c_2,t_1
  824. sltu AT,c_2,t_1
  825. daddu t_2,AT
  826. daddu c_3,t_2
  827. sltu AT,c_3,t_2
  828. daddu c_1,AT
  829. dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
  830. mflo t_1
  831. mfhi t_2
  832. daddu c_2,t_1
  833. sltu AT,c_2,t_1
  834. daddu t_2,AT
  835. daddu c_3,t_2
  836. sltu AT,c_3,t_2
  837. daddu c_1,AT
  838. sd c_2,32(a0) /* r[4]=c2; */
  839. dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
  840. mflo t_1
  841. mfhi t_2
  842. daddu c_3,t_1
  843. sltu AT,c_3,t_1
  844. daddu t_2,AT
  845. daddu c_1,t_2
  846. sltu c_2,c_1,t_2
  847. dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
  848. mflo t_1
  849. mfhi t_2
  850. daddu c_3,t_1
  851. sltu AT,c_3,t_1
  852. daddu t_2,AT
  853. daddu c_1,t_2
  854. sltu AT,c_1,t_2
  855. daddu c_2,AT
  856. dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
  857. mflo t_1
  858. mfhi t_2
  859. daddu c_3,t_1
  860. sltu AT,c_3,t_1
  861. daddu t_2,AT
  862. daddu c_1,t_2
  863. sltu AT,c_1,t_2
  864. daddu c_2,AT
  865. dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
  866. mflo t_1
  867. mfhi t_2
  868. daddu c_3,t_1
  869. sltu AT,c_3,t_1
  870. daddu t_2,AT
  871. daddu c_1,t_2
  872. sltu AT,c_1,t_2
  873. daddu c_2,AT
  874. dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
  875. mflo t_1
  876. mfhi t_2
  877. daddu c_3,t_1
  878. sltu AT,c_3,t_1
  879. daddu t_2,AT
  880. daddu c_1,t_2
  881. sltu AT,c_1,t_2
  882. daddu c_2,AT
  883. dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
  884. mflo t_1
  885. mfhi t_2
  886. daddu c_3,t_1
  887. sltu AT,c_3,t_1
  888. daddu t_2,AT
  889. daddu c_1,t_2
  890. sltu AT,c_1,t_2
  891. daddu c_2,AT
  892. sd c_3,40(a0) /* r[5]=c3; */
  893. dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
  894. mflo t_1
  895. mfhi t_2
  896. daddu c_1,t_1
  897. sltu AT,c_1,t_1
  898. daddu t_2,AT
  899. daddu c_2,t_2
  900. sltu c_3,c_2,t_2
  901. dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
  902. mflo t_1
  903. mfhi t_2
  904. daddu c_1,t_1
  905. sltu AT,c_1,t_1
  906. daddu t_2,AT
  907. daddu c_2,t_2
  908. sltu AT,c_2,t_2
  909. daddu c_3,AT
  910. dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
  911. mflo t_1
  912. mfhi t_2
  913. daddu c_1,t_1
  914. sltu AT,c_1,t_1
  915. daddu t_2,AT
  916. daddu c_2,t_2
  917. sltu AT,c_2,t_2
  918. daddu c_3,AT
  919. dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
  920. mflo t_1
  921. mfhi t_2
  922. daddu c_1,t_1
  923. sltu AT,c_1,t_1
  924. daddu t_2,AT
  925. daddu c_2,t_2
  926. sltu AT,c_2,t_2
  927. daddu c_3,AT
  928. dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
  929. mflo t_1
  930. mfhi t_2
  931. daddu c_1,t_1
  932. sltu AT,c_1,t_1
  933. daddu t_2,AT
  934. daddu c_2,t_2
  935. sltu AT,c_2,t_2
  936. daddu c_3,AT
  937. dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
  938. mflo t_1
  939. mfhi t_2
  940. daddu c_1,t_1
  941. sltu AT,c_1,t_1
  942. daddu t_2,AT
  943. daddu c_2,t_2
  944. sltu AT,c_2,t_2
  945. daddu c_3,AT
  946. dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
  947. mflo t_1
  948. mfhi t_2
  949. daddu c_1,t_1
  950. sltu AT,c_1,t_1
  951. daddu t_2,AT
  952. daddu c_2,t_2
  953. sltu AT,c_2,t_2
  954. daddu c_3,AT
  955. sd c_1,48(a0) /* r[6]=c1; */
  956. dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
  957. mflo t_1
  958. mfhi t_2
  959. daddu c_2,t_1
  960. sltu AT,c_2,t_1
  961. daddu t_2,AT
  962. daddu c_3,t_2
  963. sltu c_1,c_3,t_2
  964. dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
  965. mflo t_1
  966. mfhi t_2
  967. daddu c_2,t_1
  968. sltu AT,c_2,t_1
  969. daddu t_2,AT
  970. daddu c_3,t_2
  971. sltu AT,c_3,t_2
  972. daddu c_1,AT
  973. dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
  974. mflo t_1
  975. mfhi t_2
  976. daddu c_2,t_1
  977. sltu AT,c_2,t_1
  978. daddu t_2,AT
  979. daddu c_3,t_2
  980. sltu AT,c_3,t_2
  981. daddu c_1,AT
  982. dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
  983. mflo t_1
  984. mfhi t_2
  985. daddu c_2,t_1
  986. sltu AT,c_2,t_1
  987. daddu t_2,AT
  988. daddu c_3,t_2
  989. sltu AT,c_3,t_2
  990. daddu c_1,AT
  991. dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
  992. mflo t_1
  993. mfhi t_2
  994. daddu c_2,t_1
  995. sltu AT,c_2,t_1
  996. daddu t_2,AT
  997. daddu c_3,t_2
  998. sltu AT,c_3,t_2
  999. daddu c_1,AT
  1000. dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
  1001. mflo t_1
  1002. mfhi t_2
  1003. daddu c_2,t_1
  1004. sltu AT,c_2,t_1
  1005. daddu t_2,AT
  1006. daddu c_3,t_2
  1007. sltu AT,c_3,t_2
  1008. daddu c_1,AT
  1009. dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
  1010. mflo t_1
  1011. mfhi t_2
  1012. daddu c_2,t_1
  1013. sltu AT,c_2,t_1
  1014. daddu t_2,AT
  1015. daddu c_3,t_2
  1016. sltu AT,c_3,t_2
  1017. daddu c_1,AT
  1018. dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
  1019. mflo t_1
  1020. mfhi t_2
  1021. daddu c_2,t_1
  1022. sltu AT,c_2,t_1
  1023. daddu t_2,AT
  1024. daddu c_3,t_2
  1025. sltu AT,c_3,t_2
  1026. daddu c_1,AT
  1027. sd c_2,56(a0) /* r[7]=c2; */
  1028. dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
  1029. mflo t_1
  1030. mfhi t_2
  1031. daddu c_3,t_1
  1032. sltu AT,c_3,t_1
  1033. daddu t_2,AT
  1034. daddu c_1,t_2
  1035. sltu c_2,c_1,t_2
  1036. dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
  1037. mflo t_1
  1038. mfhi t_2
  1039. daddu c_3,t_1
  1040. sltu AT,c_3,t_1
  1041. daddu t_2,AT
  1042. daddu c_1,t_2
  1043. sltu AT,c_1,t_2
  1044. daddu c_2,AT
  1045. dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
  1046. mflo t_1
  1047. mfhi t_2
  1048. daddu c_3,t_1
  1049. sltu AT,c_3,t_1
  1050. daddu t_2,AT
  1051. daddu c_1,t_2
  1052. sltu AT,c_1,t_2
  1053. daddu c_2,AT
  1054. dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
  1055. mflo t_1
  1056. mfhi t_2
  1057. daddu c_3,t_1
  1058. sltu AT,c_3,t_1
  1059. daddu t_2,AT
  1060. daddu c_1,t_2
  1061. sltu AT,c_1,t_2
  1062. daddu c_2,AT
  1063. dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
  1064. mflo t_1
  1065. mfhi t_2
  1066. daddu c_3,t_1
  1067. sltu AT,c_3,t_1
  1068. daddu t_2,AT
  1069. daddu c_1,t_2
  1070. sltu AT,c_1,t_2
  1071. daddu c_2,AT
  1072. dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
  1073. mflo t_1
  1074. mfhi t_2
  1075. daddu c_3,t_1
  1076. sltu AT,c_3,t_1
  1077. daddu t_2,AT
  1078. daddu c_1,t_2
  1079. sltu AT,c_1,t_2
  1080. daddu c_2,AT
  1081. dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
  1082. mflo t_1
  1083. mfhi t_2
  1084. daddu c_3,t_1
  1085. sltu AT,c_3,t_1
  1086. daddu t_2,AT
  1087. daddu c_1,t_2
  1088. sltu AT,c_1,t_2
  1089. daddu c_2,AT
  1090. sd c_3,64(a0) /* r[8]=c3; */
  1091. dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
  1092. mflo t_1
  1093. mfhi t_2
  1094. daddu c_1,t_1
  1095. sltu AT,c_1,t_1
  1096. daddu t_2,AT
  1097. daddu c_2,t_2
  1098. sltu c_3,c_2,t_2
  1099. dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
  1100. mflo t_1
  1101. mfhi t_2
  1102. daddu c_1,t_1
  1103. sltu AT,c_1,t_1
  1104. daddu t_2,AT
  1105. daddu c_2,t_2
  1106. sltu AT,c_2,t_2
  1107. daddu c_3,AT
  1108. dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
  1109. mflo t_1
  1110. mfhi t_2
  1111. daddu c_1,t_1
  1112. sltu AT,c_1,t_1
  1113. daddu t_2,AT
  1114. daddu c_2,t_2
  1115. sltu AT,c_2,t_2
  1116. daddu c_3,AT
  1117. dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
  1118. mflo t_1
  1119. mfhi t_2
  1120. daddu c_1,t_1
  1121. sltu AT,c_1,t_1
  1122. daddu t_2,AT
  1123. daddu c_2,t_2
  1124. sltu AT,c_2,t_2
  1125. daddu c_3,AT
  1126. dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
  1127. mflo t_1
  1128. mfhi t_2
  1129. daddu c_1,t_1
  1130. sltu AT,c_1,t_1
  1131. daddu t_2,AT
  1132. daddu c_2,t_2
  1133. sltu AT,c_2,t_2
  1134. daddu c_3,AT
  1135. dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
  1136. mflo t_1
  1137. mfhi t_2
  1138. daddu c_1,t_1
  1139. sltu AT,c_1,t_1
  1140. daddu t_2,AT
  1141. daddu c_2,t_2
  1142. sltu AT,c_2,t_2
  1143. daddu c_3,AT
  1144. sd c_1,72(a0) /* r[9]=c1; */
  1145. dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
  1146. mflo t_1
  1147. mfhi t_2
  1148. daddu c_2,t_1
  1149. sltu AT,c_2,t_1
  1150. daddu t_2,AT
  1151. daddu c_3,t_2
  1152. sltu c_1,c_3,t_2
  1153. dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
  1154. mflo t_1
  1155. mfhi t_2
  1156. daddu c_2,t_1
  1157. sltu AT,c_2,t_1
  1158. daddu t_2,AT
  1159. daddu c_3,t_2
  1160. sltu AT,c_3,t_2
  1161. daddu c_1,AT
  1162. dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
  1163. mflo t_1
  1164. mfhi t_2
  1165. daddu c_2,t_1
  1166. sltu AT,c_2,t_1
  1167. daddu t_2,AT
  1168. daddu c_3,t_2
  1169. sltu AT,c_3,t_2
  1170. daddu c_1,AT
  1171. dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
  1172. mflo t_1
  1173. mfhi t_2
  1174. daddu c_2,t_1
  1175. sltu AT,c_2,t_1
  1176. daddu t_2,AT
  1177. daddu c_3,t_2
  1178. sltu AT,c_3,t_2
  1179. daddu c_1,AT
  1180. dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
  1181. mflo t_1
  1182. mfhi t_2
  1183. daddu c_2,t_1
  1184. sltu AT,c_2,t_1
  1185. daddu t_2,AT
  1186. daddu c_3,t_2
  1187. sltu AT,c_3,t_2
  1188. daddu c_1,AT
  1189. sd c_2,80(a0) /* r[10]=c2; */
  1190. dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
  1191. mflo t_1
  1192. mfhi t_2
  1193. daddu c_3,t_1
  1194. sltu AT,c_3,t_1
  1195. daddu t_2,AT
  1196. daddu c_1,t_2
  1197. sltu c_2,c_1,t_2
  1198. dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
  1199. mflo t_1
  1200. mfhi t_2
  1201. daddu c_3,t_1
  1202. sltu AT,c_3,t_1
  1203. daddu t_2,AT
  1204. daddu c_1,t_2
  1205. sltu AT,c_1,t_2
  1206. daddu c_2,AT
  1207. dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
  1208. mflo t_1
  1209. mfhi t_2
  1210. daddu c_3,t_1
  1211. sltu AT,c_3,t_1
  1212. daddu t_2,AT
  1213. daddu c_1,t_2
  1214. sltu AT,c_1,t_2
  1215. daddu c_2,AT
  1216. dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
  1217. mflo t_1
  1218. mfhi t_2
  1219. daddu c_3,t_1
  1220. sltu AT,c_3,t_1
  1221. daddu t_2,AT
  1222. daddu c_1,t_2
  1223. sltu AT,c_1,t_2
  1224. daddu c_2,AT
  1225. sd c_3,88(a0) /* r[11]=c3; */
  1226. dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
  1227. mflo t_1
  1228. mfhi t_2
  1229. daddu c_1,t_1
  1230. sltu AT,c_1,t_1
  1231. daddu t_2,AT
  1232. daddu c_2,t_2
  1233. sltu c_3,c_2,t_2
  1234. dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
  1235. mflo t_1
  1236. mfhi t_2
  1237. daddu c_1,t_1
  1238. sltu AT,c_1,t_1
  1239. daddu t_2,AT
  1240. daddu c_2,t_2
  1241. sltu AT,c_2,t_2
  1242. daddu c_3,AT
  1243. dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
  1244. mflo t_1
  1245. mfhi t_2
  1246. daddu c_1,t_1
  1247. sltu AT,c_1,t_1
  1248. daddu t_2,AT
  1249. daddu c_2,t_2
  1250. sltu AT,c_2,t_2
  1251. daddu c_3,AT
  1252. sd c_1,96(a0) /* r[12]=c1; */
  1253. dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
  1254. mflo t_1
  1255. mfhi t_2
  1256. daddu c_2,t_1
  1257. sltu AT,c_2,t_1
  1258. daddu t_2,AT
  1259. daddu c_3,t_2
  1260. sltu c_1,c_3,t_2
  1261. dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
  1262. mflo t_1
  1263. mfhi t_2
  1264. daddu c_2,t_1
  1265. sltu AT,c_2,t_1
  1266. daddu t_2,AT
  1267. daddu c_3,t_2
  1268. sltu AT,c_3,t_2
  1269. daddu c_1,AT
  1270. sd c_2,104(a0) /* r[13]=c2; */
  1271. dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
  1272. ld s0,0(sp)
  1273. ld s1,8(sp)
  1274. ld s2,16(sp)
  1275. ld s3,24(sp)
  1276. ld s4,32(sp)
  1277. ld s5,40(sp)
  1278. mflo t_1
  1279. mfhi t_2
  1280. daddu c_3,t_1
  1281. sltu AT,c_3,t_1
  1282. daddu t_2,AT
  1283. daddu c_1,t_2
  1284. sd c_3,112(a0) /* r[14]=c3; */
  1285. sd c_1,120(a0) /* r[15]=c1; */
  1286. PTR_ADD sp,FRAME_SIZE
  1287. jr ra
  1288. END(bn_mul_comba8)
  /*
   * bn_mul_comba4 -- 4x4 word comba multiplication.
   *
   * Loads a[0..3] through a1 and b[0..3] through a2 and stores the eight
   * result words r[0..7] through a0.  Partial products are summed one
   * result column at a time into the rotating accumulator triple
   * c_1/c_2/c_3: the lowest word of the triple is stored, and the other
   * two become the low and middle words of the next column.
   *
   * t_1/t_2 receive the lo/hi halves of each dmultu result and AT holds
   * the carry between words.  The register behind each of these names is
   * chosen by #defines outside this routine.
   */
  .align 5
  LEAF(bn_mul_comba4)
  .set reorder
        ld      a_0,0(a1)
        ld      b_0,0(a2)
        ld      a_1,8(a1)
        ld      a_2,16(a1)
        dmultu  a_0,b_0                 /* r[0] column: a[0]*b[0] */
        ld      a_3,24(a1)
        ld      b_1,8(a2)
        ld      b_2,16(a2)
        ld      b_3,24(a2)
        mflo    c_1
        mfhi    c_2
        sd      c_1,0(a0)               /* r[0] */

        /* ---- r[1] column, accumulator (c_2,c_3,c_1) ---- */
        dmultu  a_0,b_1                 /* a[0]*b[1] */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,c_2,t_1
        sltu    AT,c_2,t_1              /* carry out of the low word */
        daddu   c_3,t_2,AT              /* c_3 starts here: hi + carry */
        dmultu  a_1,b_0                 /* a[1]*b[0] */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,t_2,AT
        daddu   c_3,c_3,t_2
        sltu    c_1,c_3,t_2             /* c_1 starts as the carry out of c_3 */
        sd      c_2,8(a0)               /* r[1] */

        /* ---- r[2] column, accumulator (c_3,c_1,c_2) ---- */
        dmultu  a_2,b_0                 /* a[2]*b[0] */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,t_2,AT
        daddu   c_1,c_1,t_2             /* no carry is taken out of c_1 here */
        dmultu  a_1,b_1                 /* a[1]*b[1] */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,t_2,AT
        daddu   c_1,c_1,t_2
        sltu    c_2,c_1,t_2             /* c_2 starts as the carry out of c_1 */
        dmultu  a_0,b_2                 /* a[0]*b[2] */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,t_2,AT
        daddu   c_1,c_1,t_2
        sltu    AT,c_1,t_2
        daddu   c_2,c_2,AT
        sd      c_3,16(a0)              /* r[2] */

        /* ---- r[3] column, accumulator (c_1,c_2,c_3) ---- */
        dmultu  a_0,b_3                 /* a[0]*b[3] */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,t_2,AT
        daddu   c_2,c_2,t_2
        sltu    c_3,c_2,t_2             /* c_3 starts as the carry out of c_2 */
        dmultu  a_1,b_2                 /* a[1]*b[2] */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,t_2,AT
        daddu   c_2,c_2,t_2
        sltu    AT,c_2,t_2
        daddu   c_3,c_3,AT
        dmultu  a_2,b_1                 /* a[2]*b[1] */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,t_2,AT
        daddu   c_2,c_2,t_2
        sltu    AT,c_2,t_2
        daddu   c_3,c_3,AT
        dmultu  a_3,b_0                 /* a[3]*b[0] */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,t_2,AT
        daddu   c_2,c_2,t_2
        sltu    AT,c_2,t_2
        daddu   c_3,c_3,AT
        sd      c_1,24(a0)              /* r[3] */

        /* ---- r[4] column, accumulator (c_2,c_3,c_1) ---- */
        dmultu  a_3,b_1                 /* a[3]*b[1] */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,t_2,AT
        daddu   c_3,c_3,t_2
        sltu    c_1,c_3,t_2             /* c_1 starts as the carry out of c_3 */
        dmultu  a_2,b_2                 /* a[2]*b[2] */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,t_2,AT
        daddu   c_3,c_3,t_2
        sltu    AT,c_3,t_2
        daddu   c_1,c_1,AT
        dmultu  a_1,b_3                 /* a[1]*b[3] */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,t_2,AT
        daddu   c_3,c_3,t_2
        sltu    AT,c_3,t_2
        daddu   c_1,c_1,AT
        sd      c_2,32(a0)              /* r[4] */

        /* ---- r[5] column, accumulator (c_3,c_1,c_2) ---- */
        dmultu  a_2,b_3                 /* a[2]*b[3] */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,t_2,AT
        daddu   c_1,c_1,t_2
        sltu    c_2,c_1,t_2             /* c_2 starts as the carry out of c_1 */
        dmultu  a_3,b_2                 /* a[3]*b[2] */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,t_2,AT
        daddu   c_1,c_1,t_2
        sltu    AT,c_1,t_2
        daddu   c_2,c_2,AT
        sd      c_3,40(a0)              /* r[5] */

        /* ---- r[6]/r[7]: last product, only two words remain ---- */
        dmultu  a_3,b_3                 /* a[3]*b[3] */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,t_2,AT
        daddu   c_2,c_2,t_2
        sd      c_1,48(a0)              /* r[6] */
        sd      c_2,56(a0)              /* r[7] */
        jr      ra
  END(bn_mul_comba4)
  /*
   * Rebind the names a_4..a_7 to whatever b_0..b_3 expand to.
   * bn_sqr_comba8 below loads a[4..7] into a_4..a_7 and never refers to a
   * b_N name directly, so after this point a[4..7] share the registers
   * that the multiply routines use for b[0..3].
   * NOTE(review): the earlier definitions of a_4..a_7 are outside this
   * view.  The bn_mul_comba8 epilogue above reloads s0-s5 from the stack,
   * so presumably they were callee-saved registers and this remap is what
   * lets bn_sqr_comba8 run without a stack frame -- confirm.
   */
  1436. #undef a_4
  1437. #undef a_5
  1438. #undef a_6
  1439. #undef a_7
  1440. #define a_4 b_0
  1441. #define a_5 b_1
  1442. #define a_6 b_2
  1443. #define a_7 b_3
  /*
   * bn_sqr_comba8 -- 8-word comba squaring.
   *
   * Loads a[0..7] through a1 and stores the sixteen result words r[0..15]
   * through a0.  The incoming value of a2 is not used; a2 serves as a
   * scratch register.  Each result column is summed into the rotating
   * accumulator triple c_1/c_2/c_3 (low, middle, top word), t_1/t_2 take
   * the lo/hi halves of each dmultu result, and AT holds carries.
   *
   * Two step shapes are used (names follow the inline comments):
   *
   *   mul_add_c(a,a,lo,mid,top)   top:mid:lo += a*a
   *     The high half of a 64x64 product is at most 2^64-2, so folding
   *     the low-word carry into t_2 before adding it to mid cannot wrap.
   *
   *   mul_add_c2(x,y,lo,mid,top)  top:mid:lo += 2*x*y
   *     The product is doubled in registers: bit 127 goes straight to top,
   *     t_2 becomes 2*hi + bit 63 of lo, and t_1 becomes 2*lo.  The doubled
   *     t_2 can be 2^64-1, so the low-word carry must NOT be folded into
   *     t_2 (it would wrap to 0 and the carry into top would be lost).
   *     Instead t_2 and the low-word carry are added to mid separately and
   *     each addition propagates its own carry into top.
   *
   * The first step of every column initialises top instead of adding to it.
   */
  .align 5
  LEAF(bn_sqr_comba8)
  .set reorder
        ld      a_0,0(a1)
        ld      a_1,8(a1)
        ld      a_2,16(a1)
        ld      a_3,24(a1)
        dmultu  a_0,a_0         /* mul_add_c(a[0],b[0],c1,c2,c3); */
        ld      a_4,32(a1)
        ld      a_5,40(a1)
        ld      a_6,48(a1)
        ld      a_7,56(a1)
        mflo    c_1
        mfhi    c_2
        sd      c_1,0(a0)       /* r[0] */

        dmultu  a_0,a_1         /* mul_add_c2(a[0],b[1],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero    /* bit 127 of the product starts c_1 */
        dsll    t_2,1
        slt     a2,t_1,zero     /* bit 63 of lo moves into the doubled hi */
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2,AT      /* c_3 starts here; may wrap when t_2 is all ones */
        sltu    AT,c_3,AT       /* 1 only if that addition wrapped */
        daddu   c_1,AT
        sd      c_2,8(a0)       /* r[1] */

        dmultu  a_2,a_0         /* mul_add_c2(a[2],b[0],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     c_2,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1      /* carry out of the low word */
        daddu   c_1,t_2
        sltu    a2,c_1,t_2      /* carry from adding the doubled hi */
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT       /* carry from adding the low-word carry */
        daddu   c_2,AT
        dmultu  a_1,a_1         /* mul_add_c(a[1],b[1],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,AT
        daddu   c_1,t_2
        sltu    AT,c_1,t_2
        daddu   c_2,AT
        sd      c_3,16(a0)      /* r[2] */

        dmultu  a_0,a_3         /* mul_add_c2(a[0],b[3],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     c_3,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_1,a_2         /* mul_add_c2(a[1],b[2],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_3,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        sd      c_1,24(a0)      /* r[3] */

        dmultu  a_4,a_0         /* mul_add_c2(a[4],b[0],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_3,a_1         /* mul_add_c2(a[3],b[1],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_1,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_2,a_2         /* mul_add_c(a[2],b[2],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,AT
        daddu   c_3,t_2
        sltu    AT,c_3,t_2
        daddu   c_1,AT
        sd      c_2,32(a0)      /* r[4] */

        dmultu  a_0,a_5         /* mul_add_c2(a[0],b[5],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     c_2,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        dmultu  a_1,a_4         /* mul_add_c2(a[1],b[4],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_2,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        dmultu  a_2,a_3         /* mul_add_c2(a[2],b[3],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_2,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        sd      c_3,40(a0)      /* r[5] */

        dmultu  a_6,a_0         /* mul_add_c2(a[6],b[0],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     c_3,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_5,a_1         /* mul_add_c2(a[5],b[1],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_3,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_4,a_2         /* mul_add_c2(a[4],b[2],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_3,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_3,a_3         /* mul_add_c(a[3],b[3],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,AT
        daddu   c_2,t_2
        sltu    AT,c_2,t_2
        daddu   c_3,AT
        sd      c_1,48(a0)      /* r[6] */

        dmultu  a_0,a_7         /* mul_add_c2(a[0],b[7],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_1,a_6         /* mul_add_c2(a[1],b[6],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_1,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_2,a_5         /* mul_add_c2(a[2],b[5],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_1,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_3,a_4         /* mul_add_c2(a[3],b[4],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_1,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        sd      c_2,56(a0)      /* r[7] */

        dmultu  a_7,a_1         /* mul_add_c2(a[7],b[1],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     c_2,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        dmultu  a_6,a_2         /* mul_add_c2(a[6],b[2],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_2,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        dmultu  a_5,a_3         /* mul_add_c2(a[5],b[3],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_2,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        dmultu  a_4,a_4         /* mul_add_c(a[4],b[4],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,AT
        daddu   c_1,t_2
        sltu    AT,c_1,t_2
        daddu   c_2,AT
        sd      c_3,64(a0)      /* r[8] */

        dmultu  a_2,a_7         /* mul_add_c2(a[2],b[7],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     c_3,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_3,a_6         /* mul_add_c2(a[3],b[6],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_3,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_4,a_5         /* mul_add_c2(a[4],b[5],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_3,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        sd      c_1,72(a0)      /* r[9] */

        dmultu  a_7,a_3         /* mul_add_c2(a[7],b[3],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_6,a_4         /* mul_add_c2(a[6],b[4],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_1,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_5,a_5         /* mul_add_c(a[5],b[5],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,AT
        daddu   c_3,t_2
        sltu    AT,c_3,t_2
        daddu   c_1,AT
        sd      c_2,80(a0)      /* r[10] */

        dmultu  a_4,a_7         /* mul_add_c2(a[4],b[7],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     c_2,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        dmultu  a_5,a_6         /* mul_add_c2(a[5],b[6],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_2,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        sd      c_3,88(a0)      /* r[11] */

        dmultu  a_7,a_5         /* mul_add_c2(a[7],b[5],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     c_3,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_6,a_6         /* mul_add_c(a[6],b[6],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,AT
        daddu   c_2,t_2
        sltu    AT,c_2,t_2
        daddu   c_3,AT
        sd      c_1,96(a0)      /* r[12] */

        dmultu  a_6,a_7         /* mul_add_c2(a[6],b[7],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        sd      c_2,104(a0)     /* r[13] */

        dmultu  a_7,a_7         /* mul_add_c(a[7],b[7],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,AT
        daddu   c_1,t_2         /* last column: no carry is taken out of c_1 */
        sd      c_3,112(a0)     /* r[14] */
        sd      c_1,120(a0)     /* r[15] */
        jr      ra
  END(bn_sqr_comba8)
  /*
   * bn_sqr_comba4 -- 4-word comba squaring.
   *
   * Loads a[0..3] through a1 and stores the eight result words r[0..7]
   * through a0.  The incoming value of a2 is not used; a2 serves as a
   * scratch register.  Each result column is summed into the rotating
   * accumulator triple c_1/c_2/c_3 (low, middle, top word), t_1/t_2 take
   * the lo/hi halves of each dmultu result, and AT holds carries.
   *
   * mul_add_c adds a square a[i]*a[i] once; mul_add_c2 adds a cross
   * product a[i]*a[j] twice by doubling it in registers.  After doubling,
   * t_2 (2*hi + bit 63 of lo) can be 2^64-1, so the carry out of the low
   * word is added to the middle word on its own rather than being folded
   * into t_2 first: folding would wrap t_2 to 0 and drop the carry into
   * the top word.  For a plain product hi is at most 2^64-2, so mul_add_c
   * may fold the carry into t_2 safely.
   */
  .align 5
  LEAF(bn_sqr_comba4)
  .set reorder
        ld      a_0,0(a1)
        ld      a_1,8(a1)
        ld      a_2,16(a1)
        ld      a_3,24(a1)
        dmultu  a_0,a_0         /* mul_add_c(a[0],b[0],c1,c2,c3); */
        mflo    c_1
        mfhi    c_2
        sd      c_1,0(a0)       /* r[0] */

        dmultu  a_0,a_1         /* mul_add_c2(a[0],b[1],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero    /* bit 127 of the product starts c_1 */
        dsll    t_2,1
        slt     a2,t_1,zero     /* bit 63 of lo moves into the doubled hi */
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2,AT      /* c_3 starts here; may wrap when t_2 is all ones */
        sltu    AT,c_3,AT       /* 1 only if that addition wrapped */
        daddu   c_1,AT
        sd      c_2,8(a0)       /* r[1] */

        dmultu  a_2,a_0         /* mul_add_c2(a[2],b[0],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     c_2,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1      /* carry out of the low word */
        daddu   c_1,t_2
        sltu    a2,c_1,t_2      /* carry from adding the doubled hi */
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT       /* carry from adding the low-word carry */
        daddu   c_2,AT
        dmultu  a_1,a_1         /* mul_add_c(a[1],b[1],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   t_2,AT
        daddu   c_1,t_2
        sltu    AT,c_1,t_2
        daddu   c_2,AT
        sd      c_3,16(a0)      /* r[2] */

        dmultu  a_0,a_3         /* mul_add_c2(a[0],b[3],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     c_3,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        dmultu  a_1,a_2         /* mul_add_c2(a[1],b[2],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        slt     AT,t_2,zero
        daddu   c_3,AT
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   c_2,t_2
        sltu    a2,c_2,t_2
        daddu   c_3,a2
        daddu   c_2,AT
        sltu    AT,c_2,AT
        daddu   c_3,AT
        sd      c_1,24(a0)      /* r[3] */

        dmultu  a_3,a_1         /* mul_add_c2(a[3],b[1],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        slt     c_1,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   c_3,t_2
        sltu    a2,c_3,t_2
        daddu   c_1,a2
        daddu   c_3,AT
        sltu    AT,c_3,AT
        daddu   c_1,AT
        dmultu  a_2,a_2         /* mul_add_c(a[2],b[2],c2,c3,c1); */
        mflo    t_1
        mfhi    t_2
        daddu   c_2,t_1
        sltu    AT,c_2,t_1
        daddu   t_2,AT
        daddu   c_3,t_2
        sltu    AT,c_3,t_2
        daddu   c_1,AT
        sd      c_2,32(a0)      /* r[4] */

        dmultu  a_2,a_3         /* mul_add_c2(a[2],b[3],c3,c1,c2); */
        mflo    t_1
        mfhi    t_2
        slt     c_2,t_2,zero
        dsll    t_2,1
        slt     a2,t_1,zero
        daddu   t_2,a2
        dsll    t_1,1
        daddu   c_3,t_1
        sltu    AT,c_3,t_1
        daddu   c_1,t_2
        sltu    a2,c_1,t_2
        daddu   c_2,a2
        daddu   c_1,AT
        sltu    AT,c_1,AT
        daddu   c_2,AT
        sd      c_3,40(a0)      /* r[5] */

        dmultu  a_3,a_3         /* mul_add_c(a[3],b[3],c1,c2,c3); */
        mflo    t_1
        mfhi    t_2
        daddu   c_1,t_1
        sltu    AT,c_1,t_1
        daddu   t_2,AT
        daddu   c_2,t_2         /* last column: no carry is taken out of c_2 */
        sd      c_1,48(a0)      /* r[6] */
        sd      c_2,56(a0)      /* r[7] */
        jr      ra
  END(bn_sqr_comba4)