pa-risc2.s 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618
  1. ;
  2. ; PA-RISC 2.0 implementation of bn_asm code, based on the
  3. ; 64-bit version of the code. This code is effectively the
  4. ; same as the 64-bit version except the register model is
  5. ; slightly different given all values must be 32-bit between
  6. ; function calls. Thus the 64-bit return values are returned
  7. ; in %ret0 and %ret1, rather than in just %ret0 as in the 64-bit version.
  8. ;
  9. ;
  10. ; This code is approximately 2x faster than the C version
  11. ; for RSA/DSA.
  12. ;
  13. ; See http://devresource.hp.com/ for more details on the PA-RISC
  14. ; architecture. Also see the book "PA-RISC 2.0 Architecture"
  15. ; by Gerry Kane for information on the instruction set architecture.
  16. ;
  17. ; Code written by Chris Ruemmler (with some help from the HP C
  18. ; compiler).
  19. ;
  20. ; The code compiles with HP's assembler
  21. ;
  22. .level 2.0N ; PA-RISC 2.0 instructions; this file is the 32-bit-ABI variant (see header)
  23. .space $TEXT$
  24. .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
  25. ;
  26. ; Global Register definitions used for the routines.
  27. ;
  28. ; Some information about HP's runtime architecture for 32-bit mode.
  29. ;
  30. ; "Caller save" means the calling function must save the register
  31. ; if it wants the register to be preserved.
  32. ; "Callee save" means if a function uses the register, it must save
  33. ; the value before using it.
  34. ;
  35. ; For the floating point registers
  36. ;
  37. ; "caller save" registers: fr4-fr11, fr22-fr31
  38. ; "callee save" registers: fr12-fr21
  39. ; "special" registers: fr0-fr3 (status and exception registers)
  40. ;
  41. ; For the integer registers
  42. ; value zero : r0
  43. ; "caller save" registers: r1,r19-r26
  44. ; "callee save" registers: r3-r18
  45. ; return register : r2 (rp)
  46. ; return values : r28,r29 (ret0,ret1)
  47. ; Stack pointer : r30 (sp)
  48. ; millicode return ptr : r31 (also a caller save register)
  49. ;
  50. ; Arguments to the routines (arg0..arg3 = %r26..%r23)
  51. ;
  52. r_ptr .reg %r26 ; arg0: result array
  53. a_ptr .reg %r25 ; arg1: first source array
  54. b_ptr .reg %r24 ; arg2 in bn_add_words/bn_sub_words: second source array
  55. num .reg %r24 ; arg2 in the mul/sqr routines: word count (same register as b_ptr)
  56. n .reg %r23 ; arg3 in bn_add_words/bn_sub_words: word count
  57. ;
  58. ; Note that the "w" argument for bn_mul_add_words and bn_mul_words
  59. ; is passed on the stack at a delta of -56 from the top of stack
  60. ; as the routine is entered.
  61. ;
  62. ;
  63. ; Globals used in some routines
  64. ;
  65. top_overflow .reg %r23 ; set to 1 << 32 by the mul routines (same register as n)
  66. high_mask .reg %r22 ; value 0xffffffff80000000L (built in bn_sqr_words)
  67. ;------------------------------------------------------------------------------
  68. ;
  69. ; bn_mul_add_words
  70. ; Multiply-accumulate: for each of the num 64-bit words, r_ptr[i] becomes the low 64 bits of (a_ptr[i]*w + r_ptr[i] + carry); the high part is the next carry.
  71. ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
  72. ; int num, BN_ULONG w)
  73. ; Returns the final carry: the full value is left in %ret1 and its upper 32 bits are copied to %ret0.
  74. ; arg0 = r_ptr
  75. ; arg1 = a_ptr
  76. ; arg2 = num (%r24)
  77. ; -56(sp) = w
  78. ; Each 64x64 product is assembled from four 32x32 XMPYU partial products that are moved from the FP registers to the integer registers through stack scratch slots.
  79. ; Local register definitions
  80. ;
  81. fm1 .reg %fr22 ; word 0: ht*fw_l
  82. fm .reg %fr23 ; word 0: lt*fw_h
  83. ht_temp .reg %fr24 ; word 0: ht*fw_h
  84. ht_temp_1 .reg %fr25 ; word 1: ht*fw_h
  85. lt_temp .reg %fr26 ; word 0: lt*fw_l
  86. lt_temp_1 .reg %fr27 ; word 1: lt*fw_l
  87. fm1_1 .reg %fr28 ; word 1: ht*fw_l
  88. fm_1 .reg %fr29 ; word 1: lt*fw_h
  89. fw_h .reg %fr7L ; upper 32 bits of w
  90. fw_l .reg %fr7R ; lower 32 bits of w
  91. fw .reg %fr7 ; w as loaded from the stack
  92. fht_0 .reg %fr8L ; upper 32 bits of a[0]
  93. flt_0 .reg %fr8R ; lower 32 bits of a[0]
  94. t_float_0 .reg %fr8 ; a[0]
  95. fht_1 .reg %fr9L ; upper 32 bits of a[1]
  96. flt_1 .reg %fr9R ; lower 32 bits of a[1]
  97. t_float_1 .reg %fr9 ; a[1]
  98. tmp_0 .reg %r31
  99. tmp_1 .reg %r21
  100. m_0 .reg %r20
  101. m_1 .reg %r19
  102. ht_0 .reg %r1
  103. ht_1 .reg %r3 ; r3-r9 are callee-save; the prologues below save the ones they use
  104. lt_0 .reg %r4
  105. lt_1 .reg %r5
  106. m1_0 .reg %r6
  107. m1_1 .reg %r7
  108. rp_val .reg %r8
  109. rp_val_1 .reg %r9
  110. bn_mul_add_words
  111. .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
  112. .proc
  113. .callinfo frame=128
  114. .entry
  115. .align 64
  116. STD %r3,0(%sp) ; save r3
  117. STD %r4,8(%sp) ; save r4
  118. NOP ; Needed to make the loop 16-byte aligned
  119. NOP ; needed to make the loop 16-byte aligned
  120. STD %r5,16(%sp) ; save r5
  121. NOP
  122. STD %r6,24(%sp) ; save r6
  123. STD %r7,32(%sp) ; save r7
  124. STD %r8,40(%sp) ; save r8
  125. STD %r9,48(%sp) ; save r9
  126. COPY %r0,%ret1 ; return 0 by default (ret1 is also the running carry)
  127. DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
  128. CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
  129. LDO 128(%sp),%sp ; bump stack (delay slot: done on both paths, the exit code expects it)
  130. ;
  131. ; The loop is unrolled twice, so if there is only 1 number
  132. ; then go straight to the cleanup code.
  133. ;
  134. CMPIB,= 1,num,bn_mul_add_words_single_top
  135. FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l); delay slot, done on both paths
  136. ;
  137. ; This loop is unrolled 2 times (64-byte aligned as well)
  138. ;
  139. ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
  140. ; two 32-bit multiplies can be issued per cycle.
  141. ;
  142. bn_mul_add_words_unroll2
  143. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value a[0] (fr8) ht(L)/lt(R)
  144. FLDD 8(a_ptr),t_float_1 ; load up 64-bit value a[1] (fr9) ht(L)/lt(R)
  145. LDD 0(r_ptr),rp_val ; rp[0]
  146. LDD 8(r_ptr),rp_val_1 ; rp[1]
  147. XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
  148. XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
  149. FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
  150. FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
  151. XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
  152. XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
  153. FSTD fm,-8(%sp) ; -8(sp) = m[0]
  154. FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
  155. XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
  156. XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
  157. FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
  158. FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
  159. XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
  160. XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
  161. FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
  162. FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
  163. LDD -8(%sp),m_0 ; m[0]
  164. LDD -40(%sp),m_1 ; m[1]
  165. LDD -16(%sp),m1_0 ; m1[0]
  166. LDD -48(%sp),m1_1 ; m1[1]
  167. LDD -24(%sp),ht_0 ; ht[0]
  168. LDD -56(%sp),ht_1 ; ht[1]
  169. ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
  170. ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
  171. LDD -32(%sp),lt_0 ; lt[0]
  172. LDD -64(%sp),lt_1 ; lt[1]
  173. CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) i.e. the sum wrapped; otherwise skip the next instruction
  174. ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
  175. CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) i.e. the sum wrapped; otherwise skip the next instruction
  176. ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
  177. EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
  178. DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
  179. EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
  180. DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
  181. ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
  182. ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
  183. ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
  184. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
  185. ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
  186. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of the add above
  187. ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
  188. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
  189. ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
  190. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
  191. LDO -2(num),num ; num = num - 2;
  192. ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
  193. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of the add above
  194. STD lt_0,0(r_ptr) ; rp[0] = lt[0]
  195. ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
  196. ADD,DC ht_1,%r0,%ret1 ; c = ht[1] + carry out of the add above
  197. LDO 16(a_ptr),a_ptr ; a_ptr += 2
  198. STD lt_1,8(r_ptr) ; rp[1] = lt[1]
  199. CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
  200. LDO 16(r_ptr),r_ptr ; r_ptr += 2 (delay slot: done whether or not the branch is taken)
  201. CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
  202. ;
  203. ; Top of loop aligned on 64-byte boundary
  204. ;
  205. bn_mul_add_words_single_top
  206. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
  207. LDD 0(r_ptr),rp_val ; rp[0]
  208. LDO 8(a_ptr),a_ptr ; a_ptr++
  209. XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
  210. FSTD fm1,-16(%sp) ; -16(sp) = m1
  211. XMPYU flt_0,fw_h,fm ; m = lt*fw_h
  212. FSTD fm,-8(%sp) ; -8(sp) = m
  213. XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
  214. FSTD ht_temp,-24(%sp) ; -24(sp) = ht
  215. XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
  216. FSTD lt_temp,-32(%sp) ; -32(sp) = lt
  217. LDD -8(%sp),m_0 ; m
  218. LDD -16(%sp),m1_0 ; m1 = temp1
  219. ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
  220. LDD -24(%sp),ht_0 ; ht
  221. LDD -32(%sp),lt_0 ; lt
  222. CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) i.e. the sum wrapped; otherwise skip the next instruction
  223. ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
  224. EXTRD,U tmp_0,31,32,m_0 ; m>>32
  225. DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
  226. ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
  227. ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
  228. ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
  229. ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
  230. ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
  231. ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
  232. ADD,DC ht_0,%r0,%ret1 ; c = ht + carry out of the add above
  233. STD lt_0,0(r_ptr) ; rp[0] = lt
  234. bn_mul_add_words_exit
  235. .EXIT
  236. EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (ret0 = upper 32 bits of the carry)
  237. LDD -80(%sp),%r9 ; restore r9 (saved at 48 before the 128-byte bump)
  238. LDD -88(%sp),%r8 ; restore r8
  239. LDD -96(%sp),%r7 ; restore r7
  240. LDD -104(%sp),%r6 ; restore r6
  241. LDD -112(%sp),%r5 ; restore r5
  242. LDD -120(%sp),%r4 ; restore r4
  243. BVE (%rp) ; return
  244. LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot of the return)
  245. .PROCEND ;in=23,24,25,26,29;out=28;
  246. ;----------------------------------------------------------------------------
  247. ; For each of the num 64-bit words, rp[i] becomes the low 64 bits of (ap[i]*w + carry); the high part is the next carry.
  248. ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  249. ; Returns the final carry: the full value is left in %ret1 and its upper 32 bits are copied to %ret0.
  250. ; arg0 = rp
  251. ; arg1 = ap
  252. ; arg2 = num (%r24)
  253. ; w on stack at -56(sp)
  254. bn_mul_words
  255. .proc
  256. .callinfo frame=128
  257. .entry
  258. .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  259. .align 64
  260. STD %r3,0(%sp) ; save r3
  261. STD %r4,8(%sp) ; save r4
  262. NOP
  263. STD %r5,16(%sp) ; save r5
  264. STD %r6,24(%sp) ; save r6
  265. STD %r7,32(%sp) ; save r7
  266. COPY %r0,%ret1 ; return 0 by default (ret1 is also the running carry)
  267. DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
  268. CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
  269. LDO 128(%sp),%sp ; bump stack (delay slot: done on both paths, the exit code expects it)
  270. ;
  271. ; See if only 1 word to do, thus just do cleanup
  272. ;
  273. CMPIB,= 1,num,bn_mul_words_single_top
  274. FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l); delay slot, done on both paths
  275. ;
  276. ; This loop is unrolled 2 times (64-byte aligned as well)
  277. ;
  278. ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
  279. ; two 32-bit multiplies can be issued per cycle.
  280. ;
  281. bn_mul_words_unroll2
  282. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value a[0] (fr8) ht(L)/lt(R)
  283. FLDD 8(a_ptr),t_float_1 ; load up 64-bit value a[1] (fr9) ht(L)/lt(R)
  284. XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
  285. XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
  286. FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
  287. FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
  288. XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
  289. XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
  290. FSTD fm,-8(%sp) ; -8(sp) = m[0]
  291. FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
  292. XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
  293. XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
  294. FSTD ht_temp,-24(%sp) ; -24(sp) = ht[0]
  295. FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht[1]
  296. XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
  297. XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
  298. FSTD lt_temp,-32(%sp) ; -32(sp) = lt[0]
  299. FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt[1]
  300. LDD -8(%sp),m_0 ; m[0]
  301. LDD -40(%sp),m_1 ; m[1]
  302. LDD -16(%sp),m1_0 ; m1[0]
  303. LDD -48(%sp),m1_1 ; m1[1]
  304. LDD -24(%sp),ht_0 ; ht[0]
  305. LDD -56(%sp),ht_1 ; ht[1]
  306. ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
  307. ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
  308. LDD -32(%sp),lt_0 ; lt[0]
  309. LDD -64(%sp),lt_1 ; lt[1]
  310. CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) i.e. the sum wrapped; otherwise skip the next instruction
  311. ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
  312. CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) i.e. the sum wrapped; otherwise skip the next instruction
  313. ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
  314. EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
  315. DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
  316. EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
  317. DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
  318. ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
  319. ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
  320. ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
  321. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
  322. ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
  323. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of the add above
  324. ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c (ret1);
  325. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
  326. ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + c (ht_0)
  327. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of the add above
  328. STD lt_0,0(r_ptr) ; rp[0] = lt[0]
  329. STD lt_1,8(r_ptr) ; rp[1] = lt[1]
  330. COPY ht_1,%ret1 ; carry = ht[1]
  331. LDO -2(num),num ; num = num - 2;
  332. LDO 16(a_ptr),a_ptr ; ap += 2
  333. CMPIB,<= 2,num,bn_mul_words_unroll2 ; go again if at least 2 words remain
  334. LDO 16(r_ptr),r_ptr ; rp += 2 (delay slot: done whether or not the branch is taken)
  335. CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
  336. ;
  337. ; Top of loop aligned on 64-byte boundary
  338. ;
  339. bn_mul_words_single_top
  340. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
  341. XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
  342. FSTD fm1,-16(%sp) ; -16(sp) = m1
  343. XMPYU flt_0,fw_h,fm ; m = lt*fw_h
  344. FSTD fm,-8(%sp) ; -8(sp) = m
  345. XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
  346. FSTD ht_temp,-24(%sp) ; -24(sp) = ht
  347. XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
  348. FSTD lt_temp,-32(%sp) ; -32(sp) = lt
  349. LDD -8(%sp),m_0 ; m
  350. LDD -16(%sp),m1_0 ; m1
  351. ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
  352. LDD -24(%sp),ht_0 ; ht
  353. LDD -32(%sp),lt_0 ; lt
  354. CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) i.e. the sum wrapped; otherwise skip the next instruction
  355. ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
  356. EXTRD,U tmp_0,31,32,m_0 ; m>>32
  357. DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
  358. ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
  359. ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
  360. ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
  361. ADD %ret1,lt_0,lt_0 ; lt = lt + c;
  362. ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
  363. COPY ht_0,%ret1 ; copy carry
  364. STD lt_0,0(r_ptr) ; rp[0] = lt
  365. bn_mul_words_exit
  366. .EXIT
  367. EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (ret0 = upper 32 bits of the carry)
  368. LDD -96(%sp),%r7 ; restore r7 (saved at 32 before the 128-byte bump)
  369. LDD -104(%sp),%r6 ; restore r6
  370. LDD -112(%sp),%r5 ; restore r5
  371. LDD -120(%sp),%r4 ; restore r4
  372. BVE (%rp) ; return
  373. LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot of the return)
  374. .PROCEND
  375. ;----------------------------------------------------------------------------
  376. ; For each of the num 64-bit words ap[i], stores the 128-bit square: low 64 bits in rp[2*i], high 64 bits in rp[2*i+1].
  377. ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
  378. ; With a = ht*2^32 + lt, the square is ht*ht*2^64 + (ht*lt)*2^33 + lt*lt; the middle term m = ht*lt is split into m<<33 (low word) and m>>31 (high word).
  379. ; arg0 = rp
  380. ; arg1 = ap
  381. ; arg2 = num
  382. ;
  383. bn_sqr_words
  384. .proc
  385. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  386. .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  387. .entry
  388. .align 64
  389. STD %r3,0(%sp) ; save r3
  390. STD %r4,8(%sp) ; save r4
  391. NOP
  392. STD %r5,16(%sp) ; save r5
  393. CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
  394. LDO 128(%sp),%sp ; bump stack (delay slot: done on both paths, the exit code expects it)
  395. ;
  396. ; If only 1, then go straight to cleanup
  397. ;
  398. CMPIB,= 1,num,bn_sqr_words_single_top
  399. DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (delay slot: done on both paths)
  400. ;
  401. ; This loop is unrolled 2 times (64-byte aligned as well)
  402. ;
  403. bn_sqr_words_unroll2
  404. FLDD 0(a_ptr),t_float_0 ; a[0]
  405. FLDD 8(a_ptr),t_float_1 ; a[1]
  406. XMPYU fht_0,flt_0,fm ; m[0] = ht[0]*lt[0]
  407. XMPYU fht_1,flt_1,fm_1 ; m[1] = ht[1]*lt[1]
  408. FSTD fm,-24(%sp) ; store m[0]
  409. FSTD fm_1,-56(%sp) ; store m[1]
  410. XMPYU flt_0,flt_0,lt_temp ; lt[0] = lt[0]*lt[0]
  411. XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] = lt[1]*lt[1]
  412. FSTD lt_temp,-16(%sp) ; store lt[0]
  413. FSTD lt_temp_1,-48(%sp) ; store lt[1]
  414. XMPYU fht_0,fht_0,ht_temp ; ht[0] = ht[0]*ht[0]
  415. XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] = ht[1]*ht[1]
  416. FSTD ht_temp,-8(%sp) ; store ht[0]
  417. FSTD ht_temp_1,-40(%sp) ; store ht[1]
  418. LDD -24(%sp),m_0 ; m[0]
  419. LDD -56(%sp),m_1 ; m[1]
  420. AND m_0,high_mask,tmp_0 ; m[0] & Mask
  421. AND m_1,high_mask,tmp_1 ; m[1] & Mask
  422. DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
  423. DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
  424. LDD -16(%sp),lt_0 ; lt[0]
  425. LDD -48(%sp),lt_1 ; lt[1]
  426. EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
  427. EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
  428. LDD -8(%sp),ht_0 ; ht[0]
  429. LDD -40(%sp),ht_1 ; ht[1]
  430. ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
  431. ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
  432. ADD lt_0,m_0,lt_0 ; lt[0] = lt[0]+m[0]
  433. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
  434. STD lt_0,0(r_ptr) ; rp[0] = lt[0]
  435. STD ht_0,8(r_ptr) ; rp[1] = ht[0]
  436. ADD lt_1,m_1,lt_1 ; lt[1] = lt[1]+m[1]
  437. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of the add above
  438. STD lt_1,16(r_ptr) ; rp[2] = lt[1]
  439. STD ht_1,24(r_ptr) ; rp[3] = ht[1]
  440. LDO -2(num),num ; num = num - 2;
  441. LDO 16(a_ptr),a_ptr ; ap += 2
  442. CMPIB,<= 2,num,bn_sqr_words_unroll2 ; go again if at least 2 words remain
  443. LDO 32(r_ptr),r_ptr ; rp += 4 (delay slot: done whether or not the branch is taken)
  444. CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
  445. ;
  446. ; Top of loop aligned on 64-byte boundary
  447. ;
  448. bn_sqr_words_single_top
  449. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
  450. XMPYU fht_0,flt_0,fm ; m = ht*lt
  451. FSTD fm,-24(%sp) ; store m
  452. XMPYU flt_0,flt_0,lt_temp ; lt = lt*lt
  453. FSTD lt_temp,-16(%sp) ; store lt
  454. XMPYU fht_0,fht_0,ht_temp ; ht = ht*ht
  455. FSTD ht_temp,-8(%sp) ; store ht
  456. LDD -24(%sp),m_0 ; load m
  457. AND m_0,high_mask,tmp_0 ; m & Mask
  458. DEPD,Z m_0,30,31,m_0 ; m << 32+1
  459. LDD -16(%sp),lt_0 ; lt
  460. LDD -8(%sp),ht_0 ; ht
  461. EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
  462. ADD m_0,lt_0,lt_0 ; lt = lt+m
  463. ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (ADD,L leaves the carry from the add above untouched)
  464. ADD,DC ht_0,%r0,ht_0 ; ht += carry out of lt+m
  465. STD lt_0,0(r_ptr) ; rp[0] = lt
  466. STD ht_0,8(r_ptr) ; rp[1] = ht
  467. bn_sqr_words_exit
  468. .EXIT
  469. LDD -112(%sp),%r5 ; restore r5 (saved at 16 before the 128-byte bump)
  470. LDD -120(%sp),%r4 ; restore r4
  471. BVE (%rp) ; return
  472. LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot of the return)
  473. .PROCEND ;in=23,24,25,26,29;out=28;
  474. ;----------------------------------------------------------------------------
  475. ; r[i] = a[i] + b[i] + carry for each of the n 64-bit words; returns the final carry (%ret1, upper 32 bits copied to %ret0).
  476. ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  477. ; Uses no stack and only the registers named below plus %ret0/%ret1.
  478. ; arg0 = rp
  479. ; arg1 = ap
  480. ; arg2 = bp
  481. ; arg3 = n
  482. t .reg %r22 ; current word of a (then a + carry-in)
  483. b .reg %r21 ; current word of b
  484. l .reg %r20 ; sum written to r
  485. bn_add_words
  486. .proc
  487. .entry
  488. .callinfo
  489. .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  490. .align 64
  491. CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
  492. COPY %r0,%ret1 ; return 0 by default (delay slot: done on both paths; ret1 is the running carry)
  493. ;
  494. ; If 2 or more numbers do the loop
  495. ;
  496. CMPIB,= 1,n,bn_add_words_single_top
  497. NOP
  498. ;
  499. ; This loop is unrolled 2 times (64-byte aligned as well)
  500. ;
  501. bn_add_words_unroll2
  502. LDD 0(a_ptr),t ; t = a[0]
  503. LDD 0(b_ptr),b ; b = b[0]
  504. ADD t,%ret1,t ; t = t+c;
  505. ADD,DC %r0,%r0,%ret1 ; set c to carry
  506. ADD t,b,l ; l = t + b[0]
  507. ADD,DC %ret1,%r0,%ret1 ; c+= carry
  508. STD l,0(r_ptr) ; r[0] = l
  509. LDD 8(a_ptr),t ; t = a[1]
  510. LDD 8(b_ptr),b ; b = b[1]
  511. ADD t,%ret1,t ; t = t+c;
  512. ADD,DC %r0,%r0,%ret1 ; set c to carry
  513. ADD t,b,l ; l = t + b[1]
  514. ADD,DC %ret1,%r0,%ret1 ; c+= carry
  515. STD l,8(r_ptr) ; r[1] = l
  516. LDO -2(n),n ; n = n - 2
  517. LDO 16(a_ptr),a_ptr ; ap += 2
  518. LDO 16(b_ptr),b_ptr ; bp += 2
  519. CMPIB,<= 2,n,bn_add_words_unroll2 ; go again if at least 2 words remain
  520. LDO 16(r_ptr),r_ptr ; rp += 2 (delay slot: done whether or not the branch is taken)
  521. CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
  522. bn_add_words_single_top
  523. LDD 0(a_ptr),t ; t = a[0]
  524. LDD 0(b_ptr),b ; b = b[0]
  525. ADD t,%ret1,t ; t = t+c;
  526. ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
  527. ADD t,b,l ; l = t + b[0]
  528. ADD,DC %ret1,%r0,%ret1 ; c+= carry
  529. STD l,0(r_ptr) ; r[0] = l
  530. bn_add_words_exit
  531. .EXIT
  532. BVE (%rp) ; return
  533. EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (delay slot of the return)
  534. .PROCEND ;in=23,24,25,26,29;out=28;
  535. ;----------------------------------------------------------------------------
  536. ; r[i] = a[i] - b[i] - borrow for each of the n 64-bit words; returns the final borrow (%ret1, upper 32 bits copied to %ret0).
  537. ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  538. ; Borrow rule per word: if a[i] != b[i] the new borrow is (a[i] < b[i]); if they are equal the incoming borrow is kept.
  539. ; arg0 = rp
  540. ; arg1 = ap
  541. ; arg2 = bp
  542. ; arg3 = n
  543. t1 .reg %r22 ; current word of a
  544. t2 .reg %r21 ; current word of b
  545. sub_tmp1 .reg %r20 ; difference written to r
  546. sub_tmp2 .reg %r19 ; candidate borrow: 0 if t1 > t2, else 1
  547. bn_sub_words
  548. .proc
  549. .callinfo
  550. .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  551. .entry
  552. .align 64
  553. CMPIB,>= 0,n,bn_sub_words_exit ; if (n <= 0) then exit
  554. COPY %r0,%ret1 ; return 0 by default (delay slot: done on both paths; ret1 is the running borrow)
  555. ;
  556. ; If 2 or more numbers do the loop
  557. ;
  558. CMPIB,= 1,n,bn_sub_words_single_top
  559. NOP
  560. ;
  561. ; This loop is unrolled 2 times (64-byte aligned as well)
  562. ;
  563. bn_sub_words_unroll2
  564. LDD 0(a_ptr),t1 ; t1 = a[0]
  565. LDD 0(b_ptr),t2 ; t2 = b[0]
  566. SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
  567. SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
  568. CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip the next instruction if t1 > t2
  569. LDO 1(%r0),sub_tmp2 ; t1 <= t2: sub_tmp2 = 1
  570. CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY and keep the old borrow
  571. COPY sub_tmp2,%ret1 ; c = (t1 < t2)
  572. STD sub_tmp1,0(r_ptr) ; r[0] = t3
  573. LDD 8(a_ptr),t1 ; t1 = a[1]
  574. LDD 8(b_ptr),t2 ; t2 = b[1]
  575. SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
  576. SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
  577. CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip the next instruction if t1 > t2
  578. LDO 1(%r0),sub_tmp2 ; t1 <= t2: sub_tmp2 = 1
  579. CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY and keep the old borrow
  580. COPY sub_tmp2,%ret1 ; c = (t1 < t2)
  581. STD sub_tmp1,8(r_ptr) ; r[1] = t3
  582. LDO -2(n),n ; n = n - 2
  583. LDO 16(a_ptr),a_ptr ; ap += 2
  584. LDO 16(b_ptr),b_ptr ; bp += 2
  585. CMPIB,<= 2,n,bn_sub_words_unroll2 ; go again if at least 2 words remain
  586. LDO 16(r_ptr),r_ptr ; rp += 2 (delay slot: done whether or not the branch is taken)
  587. CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
  588. bn_sub_words_single_top
  589. LDD 0(a_ptr),t1 ; t1 = a[0]
  590. LDD 0(b_ptr),t2 ; t2 = b[0]
  591. SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
  592. SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
  593. CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip the next instruction if t1 > t2
  594. LDO 1(%r0),sub_tmp2 ; t1 <= t2: sub_tmp2 = 1
  595. CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY and keep the old borrow
  596. COPY sub_tmp2,%ret1 ; c = (t1 < t2)
  597. STD sub_tmp1,0(r_ptr) ; r[0] = t3
  598. bn_sub_words_exit
  599. .EXIT
  600. BVE (%rp) ; return
  601. EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (delay slot of the return)
  602. .PROCEND ;in=23,24,25,26,29;out=28;
  603. ;------------------------------------------------------------------------------
  604. ;
  605. ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
  606. ;
  607. ; arg0 = h
  608. ; arg1 = l
  609. ; arg2 = d
  610. ; NOTE(review): the code below reads the operands as 64-bit values: %r25 (upper) : %r26 (lower), %r23 (upper) : %r24 (lower), and a doubleword at -56 from the entry sp - presumably h, l and d in that order; confirm against the caller.
  611. ; This is mainly just output from the HP C compiler.
  612. ; The result is returned in %r28 (upper 32 bits) / %r29; a zero value in the third operand returns all ones. Calls BN_num_bits_word, the $$div2U millicode, and abort.
  613. ;------------------------------------------------------------------------------
  614. bn_div_words
  615. .PROC
  616. .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
  617. .IMPORT BN_num_bits_word,CODE
  618. ;--- not PIC .IMPORT __iob,DATA
  619. ;--- not PIC .IMPORT fprintf,CODE
  620. .IMPORT abort,CODE
  621. .IMPORT $$div2U,MILLICODE
  622. .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
  623. .ENTRY
  624. STW %r2,-20(%r30) ;offset 0x8ec - save the return pointer (reloaded from -212 after the 192-byte bump)
  625. STW,MA %r3,192(%r30) ;offset 0x8f0 - save r3 and advance the stack pointer by 192
  626. STW %r4,-188(%r30) ;offset 0x8f4 - save r4
  627. DEPD %r5,31,32,%r6 ;offset 0x8f8 - put the low word of r5 in the upper half of r6 so one STD saves both
  628. STD %r6,-184(%r30) ;offset 0x8fc - save r5/r6
  629. DEPD %r7,31,32,%r8 ;offset 0x900 - put the low word of r7 in the upper half of r8 so one STD saves both
  630. STD %r8,-176(%r30) ;offset 0x904 - save r7/r8
  631. STW %r9,-168(%r30) ;offset 0x908 - save r9
  632. LDD -248(%r30),%r3 ;offset 0x90c - r3 = doubleword at -56 from the entry sp (presumably d - confirm)
  633. COPY %r26,%r4 ;offset 0x910 - r4 = %r26 (upper half replaced below)
  634. COPY %r24,%r5 ;offset 0x914 - r5 = %r24 (upper half replaced below)
  635. DEPD %r25,31,32,%r4 ;offset 0x918 - upper half of r4 = %r25 (r4 presumably h - confirm)
  636. CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c - if (r3 != 0) go do the division
  637. DEPD %r23,31,32,%r5 ;offset 0x920 - delay slot: upper half of r5 = %r23 (r5 presumably l - confirm)
  638. MOVIB,TR -1,%r29,$00060002 ;offset 0x924 - r3 == 0: result = -1, branch to the exit
  639. EXTRD,U %r29,31,32,%r28 ;offset 0x928 - delay slot: %r28 = upper 32 bits of the result
  640. $0006002A
  641. LDO -1(%r29),%r29 ;offset 0x92c
  642. SUB %r23,%r7,%r23 ;offset 0x930
  643. $00060024
  644. SUB %r4,%r31,%r25 ;offset 0x934
  645. AND %r25,%r19,%r26 ;offset 0x938
  646. CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
  647. DEPD,Z %r25,31,32,%r20 ;offset 0x940
  648. OR %r20,%r24,%r21 ;offset 0x944
  649. CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
  650. SUB %r31,%r2,%r31 ;offset 0x94c
  651. $00060046
  652. $0006002E
  653. DEPD,Z %r23,31,32,%r25 ;offset 0x950
  654. EXTRD,U %r23,31,32,%r26 ;offset 0x954
  655. AND %r25,%r19,%r24 ;offset 0x958
  656. ADD,L %r31,%r26,%r31 ;offset 0x95c
  657. CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
  658. LDO 1(%r31),%r31 ;offset 0x964
  659. $00060032
  660. CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
  661. LDO -1(%r29),%r29 ;offset 0x96c
  662. ADD,L %r4,%r3,%r4 ;offset 0x970
  663. $00060036
  664. ADDIB,=,N -1,%r8,$D0 ;offset 0x974 - decrement the pass counter in r8 (set to 2 at 0xa18); leave the loop when it reaches 0
  665. SUB %r5,%r24,%r28 ;offset 0x978
  666. $0006003A
  667. SUB %r4,%r31,%r24 ;offset 0x97c
  668. SHRPD %r24,%r28,32,%r4 ;offset 0x980
  669. DEPD,Z %r29,31,32,%r9 ;offset 0x984
  670. DEPD,Z %r28,31,32,%r5 ;offset 0x988
  671. $0006001C
  672. EXTRD,U %r4,31,32,%r31 ;offset 0x98c - r31 = upper 32 bits of r4
  673. CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 - if (r31 != r2) compute the quotient estimate with $$div2U
  674. MOVB,TR %r6,%r29,$D1 ;offset 0x994 - otherwise use r6 as the estimate
  675. STD %r29,-152(%r30) ;offset 0x998
  676. $0006000C
  677. EXTRD,U %r3,31,32,%r25 ;offset 0x99c - %r25 = upper 32 bits of r3 (argument word)
  678. COPY %r3,%r26 ;offset 0x9a0 - %r26 = r3 (argument word)
  679. EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 - keep the upper 32 bits of r3 in r9 across the call
  680. EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 - keep the upper 32 bits of r4 in r8 across the call
  681. .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
  682. B,L BN_num_bits_word,%r2 ;offset 0x9ac - call BN_num_bits_word on r3
  683. EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 - delay slot: keep the upper 32 bits of r5 in r7 across the call
  684. LDI 64,%r20 ;offset 0x9b4
  685. DEPD %r7,31,32,%r5 ;offset 0x9b8 - re-insert the upper 32 bits of r5
  686. DEPD %r8,31,32,%r4 ;offset 0x9bc - re-insert the upper 32 bits of r4
  687. DEPD %r9,31,32,%r3 ;offset 0x9c0 - re-insert the upper 32 bits of r3
  688. CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 - if the returned bit count is 64, skip the range check
  689. COPY %r28,%r24 ;offset 0x9c8 - delay slot: r24 = returned bit count
  690. MTSARCM %r24 ;offset 0x9cc
  691. DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
  692. CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 - if (r4 > r19) take the abort path
  693. $00060012
  694. SUBI 64,%r24,%r31 ;offset 0x9d8 - r31 = 64 - bit count
  695. CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc - if (r4 < r3) skip the SUB
  696. SUB %r4,%r3,%r4 ;offset 0x9e0 - otherwise r4 -= r3
  697. $00060016
  698. CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 - if r31 == 0, skip the shifting below
  699. COPY %r0,%r9 ;offset 0x9e8
  700. MTSARCM %r31 ;offset 0x9ec
  701. DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
  702. SUBI 64,%r31,%r26 ;offset 0x9f4
  703. MTSAR %r26 ;offset 0x9f8
  704. SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
  705. MTSARCM %r31 ;offset 0xa00
  706. DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
  707. $0006001A
  708. DEPDI,Z -1,31,32,%r19 ;offset 0xa08
  709. AND %r3,%r19,%r29 ;offset 0xa0c
  710. EXTRD,U %r29,31,32,%r2 ;offset 0xa10
  711. DEPDI,Z -1,63,32,%r6 ;offset 0xa14
  712. MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 - r8 = 2 (pass counter), enter the loop
  713. EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
  714. $D2
  715. ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
  716. ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
  717. ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
  718. ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
  719. ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
  720. ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
  721. .CALL ;
  722. B,L abort,%r2 ;offset 0xa34 - call abort() (the fprintf diagnostic above is commented out)
  723. NOP ;offset 0xa38
  724. B $D3 ;offset 0xa3c - go to the register-restore code
  725. LDW -212(%r30),%r2 ;offset 0xa40 - delay slot: reload the return pointer
  726. $00060020
  727. COPY %r4,%r26 ;offset 0xa44
  728. EXTRD,U %r4,31,32,%r25 ;offset 0xa48
  729. COPY %r2,%r24 ;offset 0xa4c
  730. .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
  731. B,L $$div2U,%r31 ;offset 0xa50 - $$div2U millicode call, return address in %r31
  732. EXTRD,U %r2,31,32,%r23 ;offset 0xa54
  733. DEPD %r28,31,32,%r29 ;offset 0xa58 - combine %r28 (upper) and %r29 (lower) into one 64-bit value
  734. $00060022
  735. STD %r29,-152(%r30) ;offset 0xa5c
  736. $D1
  737. AND %r5,%r19,%r24 ;offset 0xa60
  738. EXTRD,U %r24,31,32,%r24 ;offset 0xa64
  739. STW %r2,-160(%r30) ;offset 0xa68
  740. STW %r7,-128(%r30) ;offset 0xa6c
  741. FLDD -152(%r30),%fr4 ;offset 0xa70
  742. FLDD -152(%r30),%fr7 ;offset 0xa74
  743. FLDW -160(%r30),%fr8L ;offset 0xa78
  744. FLDW -128(%r30),%fr5L ;offset 0xa7c
  745. XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
  746. FSTD %fr10,-136(%r30) ;offset 0xa84
  747. XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
  748. FSTD %fr22,-144(%r30) ;offset 0xa8c
  749. XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
  750. XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
  751. FSTD %fr11,-112(%r30) ;offset 0xa98
  752. FSTD %fr23,-120(%r30) ;offset 0xa9c
  753. LDD -136(%r30),%r28 ;offset 0xaa0
  754. DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
  755. LDD -144(%r30),%r20 ;offset 0xaa8
  756. ADD,L %r20,%r31,%r31 ;offset 0xaac
  757. LDD -112(%r30),%r22 ;offset 0xab0
  758. DEPD,Z %r22,31,32,%r22 ;offset 0xab4
  759. LDD -120(%r30),%r21 ;offset 0xab8
  760. B $00060024 ;offset 0xabc
  761. ADD,L %r21,%r22,%r23 ;offset 0xac0
  762. $D0
  763. OR %r9,%r29,%r29 ;offset 0xac4 - result = r9 | r29
  764. $00060040
  765. EXTRD,U %r29,31,32,%r28 ;offset 0xac8 - %r28 = upper 32 bits of the result
  766. $00060002
  767. $L2
  768. LDW -212(%r30),%r2 ;offset 0xacc - reload the return pointer saved at entry
  769. $D3
  770. LDW -168(%r30),%r9 ;offset 0xad0 - restore r9
  771. LDD -176(%r30),%r8 ;offset 0xad4 - restore r8 (r7 is in its upper half)
  772. EXTRD,U %r8,31,32,%r7 ;offset 0xad8 - restore r7
  773. LDD -184(%r30),%r6 ;offset 0xadc - restore r6 (r5 is in its upper half)
  774. EXTRD,U %r6,31,32,%r5 ;offset 0xae0 - restore r5
  775. LDW -188(%r30),%r4 ;offset 0xae4 - restore r4
  776. BVE (%r2) ;offset 0xae8 - return
  777. .EXIT
  778. LDW,MB -192(%r30),%r3 ;offset 0xaec - delay slot: restore r3 and pop the 192-byte frame
  779. .PROCEND ;in=23,25;out=28,29;fpin=105,107;
  780. ;----------------------------------------------------------------------------
  781. ;
  782. ; Register aliases shared by the comba routines. Each floating-point
  783. ; alias names a 64-bit value: the "L" half is the upper 32 bits and
  784. ; the "R" half is the lower 32 bits.
  785. ;
  786. ; Note: b6 and b7 are callee-save registers, so a routine that uses
  787. ; them must save them on entry and restore them before returning.
  788. ;
  789. ;
  790. ; a0-a7, b0 and b1 hold operand words. They do not overlap the
  791. ; ftemp scratch registers defined further down and are all
  792. ; caller-save registers.
  793. ;
  794. a0 .reg %fr22
  795. a0L .reg %fr22L
  796. a0R .reg %fr22R
  797. a1 .reg %fr23
  798. a1L .reg %fr23L
  799. a1R .reg %fr23R
  800. a2 .reg %fr24
  801. a2L .reg %fr24L
  802. a2R .reg %fr24R
  803. a3 .reg %fr25
  804. a3L .reg %fr25L
  805. a3R .reg %fr25R
  806. a4 .reg %fr26
  807. a4L .reg %fr26L
  808. a4R .reg %fr26R
  809. a5 .reg %fr27
  810. a5L .reg %fr27L
  811. a5R .reg %fr27R
  812. a6 .reg %fr28
  813. a6L .reg %fr28L
  814. a6R .reg %fr28R
  815. a7 .reg %fr29
  816. a7L .reg %fr29L
  817. a7R .reg %fr29R
  818. b0 .reg %fr30
  819. b0L .reg %fr30L
  820. b0R .reg %fr30R
  821. b1 .reg %fr31
  822. b1L .reg %fr31L
  823. b1R .reg %fr31R
  824. ;
  825. ; Scratch floating-point registers that receive the XMPYU partial
  826. ; products inside the macros; all of them are caller-save registers.
  827. ;
  828. ftemp1 .reg %fr4
  829. ftemp2 .reg %fr5
  830. ftemp3 .reg %fr6
  831. ftemp4 .reg %fr7
  832. ;
  833. ; Remaining b operand words (b2-b7), used only by the multiply routines.
  834. ;
  835. b2 .reg %fr8
  836. b2L .reg %fr8L
  837. b2R .reg %fr8R
  838. b3 .reg %fr9
  839. b3L .reg %fr9L
  840. b3R .reg %fr9R
  841. b4 .reg %fr10
  842. b4L .reg %fr10L
  843. b4R .reg %fr10R
  844. b5 .reg %fr11
  845. b5L .reg %fr11L
  846. b5R .reg %fr11R
  847. b6 .reg %fr12
  848. b6L .reg %fr12L
  849. b6R .reg %fr12R
  850. b7 .reg %fr13
  851. b7L .reg %fr13L
  852. b7R .reg %fr13R
  ;
  ; General registers used by the macros and routines. %r3-%r6 (c3, m,
  ; lt, ht) are saved on entry and restored on exit by every routine below.
  ;
  853. c1 .reg %r21 ; accumulator word, zeroed at routine entry
  854. temp1 .reg %r20 ; macro scratch: part of the cross term added to ht
  855. temp2 .reg %r19 ; macro scratch: masked copy of m (SQR_ADD_C)
  856. temp3 .reg %r31 ; macro scratch: part of the cross term added to lt
  857. m1 .reg %r28 ; second 32x32 cross product
  858. c2 .reg %r23 ; accumulator word, zeroed at routine entry
  859. high_one .reg %r1 ; constant 1 << 32, set up by each routine
  860. ht .reg %r6 ; high word of the current 128-bit product
  861. lt .reg %r5 ; low word of the current 128-bit product
  862. m .reg %r4 ; 32x32 cross product (or sum of both cross products)
  863. c3 .reg %r3 ; accumulator word, zeroed at routine entry
  ;
  ; SQR_ADD_C A0L,A0R,C1,C2,C3
  ;   Squares the 64-bit word whose upper/lower 32-bit halves are A0L/A0R
  ;   and adds the 128-bit square into the three-word accumulator C3:C2:C1
  ;   (C1 is the least significant word).
  ;   The square is built from three 32x32 products:
  ;     lt = lo*lo, ht = hi*hi, m = hi*lo, result = ht:lt + ((2*m) << 32).
  ;   Clobbers ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the scratch slots
  ;   at -24(%sp), -16(%sp) and -8(%sp). Reads high_mask, which the calling
  ;   routine must have loaded with 0xffffffff80000000.
  ;
  864. SQR_ADD_C .macro A0L,A0R,C1,C2,C3
  865. XMPYU A0L,A0R,ftemp1 ; m = hi * lo (cross product)
  866. FSTD ftemp1,-24(%sp) ; spill m so it can be reloaded into a general register
  867. XMPYU A0R,A0R,ftemp2 ; lt = lo * lo
  868. FSTD ftemp2,-16(%sp) ; spill lt
  869. XMPYU A0L,A0L,ftemp3 ; ht = hi * hi
  870. FSTD ftemp3,-8(%sp) ; spill ht
  871. LDD -24(%sp),m ; reload m
  872. AND m,high_mask,temp2 ; temp2 = upper 33 bits of m, still in place
  873. DEPD,Z m,30,31,temp3 ; temp3 = m << 33, the low word of (2*m) << 32
  874. LDD -16(%sp),lt ; reload lt
  875. LDD -8(%sp),ht ; reload ht
  876. EXTRD,U temp2,32,33,temp1 ; temp1 = m >> 31, the high word of (2*m) << 32
  877. ADD temp3,lt,lt ; lt += low word of the doubled cross term
  878. ADD,L ht,temp1,ht ; ht += high word of the doubled cross term (ADD,L keeps the carry from the previous ADD)
  879. ADD,DC ht,%r0,ht ; ht += carry out of the lt addition
  880. ADD C1,lt,C1 ; C1 += lt
  881. ADD,DC ht,%r0,ht ; ht += carry out of C1
  882. ADD C2,ht,C2 ; C2 += ht
  883. ADD,DC C3,%r0,C3 ; C3 += carry out of C2
  884. .endm
  ;
  ; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
  ;   Multiplies two different 64-bit words (halves A0L/A0R and A1L/A1R),
  ;   doubles the 128-bit product and adds it into the three-word
  ;   accumulator C3:C2:C1 (C1 is the least significant word).
  ;   Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the scratch
  ;   slots at -32(%sp) .. -8(%sp). Reads high_one, which the calling
  ;   routine must have loaded with 1 << 32.
  ;
  885. SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
  886. XMPYU A0L,A1R,ftemp1 ; m1 = hi(A0) * lo(A1)
  887. FSTD ftemp1,-16(%sp) ; spill m1
  888. XMPYU A0R,A1L,ftemp2 ; m = lo(A0) * hi(A1)
  889. FSTD ftemp2,-8(%sp) ; spill m
  890. XMPYU A0R,A1R,ftemp3 ; lt = lo(A0) * lo(A1)
  891. FSTD ftemp3,-32(%sp) ; spill lt
  892. XMPYU A0L,A1L,ftemp4 ; ht = hi(A0) * hi(A1)
  893. FSTD ftemp4,-24(%sp) ; spill ht
  894. LDD -8(%sp),m ; reload m
  895. LDD -16(%sp),m1 ; reload m1
  896. ADD,L m,m1,m ; m = m + m1 (may wrap past 64 bits)
  897. DEPD,Z m,31,32,temp3 ; temp3 = m << 32
  898. LDD -24(%sp),ht ; reload ht
  899. CMPCLR,*>>= m,m1,%r0 ; skip the next instruction unless m < m1 (i.e. the sum wrapped)
  900. ADD,L ht,high_one,ht ; wrapped: the lost bit is worth 1 << 32 in ht
  901. EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
  902. LDD -32(%sp),lt ; reload lt
  903. ADD,L ht,temp1,ht ; ht += m >> 32
  904. ADD lt,temp3,lt ; lt += m << 32
  905. ADD,DC ht,%r0,ht ; ht += carry out of lt
  906. ADD ht,ht,ht ; double the product: ht = 2*ht
  907. ADD,DC C3,%r0,C3 ; C3 += bit shifted out of ht
  908. ADD lt,lt,lt ; lt = 2*lt
  909. ADD,DC ht,%r0,ht ; ht += bit shifted out of lt
  910. ADD C1,lt,C1 ; C1 += lt
  911. ADD,DC,*NUV ht,%r0,ht ; ht += carry out of C1; skip the next instruction if ht did not overflow
  912. LDO 1(C3),C3 ; ht overflowed: propagate into C3
  913. ADD C2,ht,C2 ; C2 += ht
  914. ADD,DC C3,%r0,C3 ; C3 += carry out of C2
  915. .endm
  916. ;
  917. ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
  918. ; arg0 = r_ptr (result, 16 words written: offsets 0..120)
  919. ; arg1 = a_ptr (operand, 8 words read: offsets 0..56)
  920. ;
  ; Computes r[0..15] = a[0..7]^2 one result column at a time. For column
  ; k every product a[i]*a[j] with i+j = k is accumulated: SQR_ADD_C for
  ; the diagonal term (i == j), SQR_ADD_C2 for each off-diagonal pair
  ; (added twice). The low accumulator word is then stored to r[k],
  ; cleared, and the roles of c1/c2/c3 rotate for the next column.
  ;
  ; r_ptr, a_ptr and high_mask are register aliases defined elsewhere in
  ; the file.
  ;
  ; Frame (128 bytes): %r3-%r6 are saved in the lowest 32 bytes and
  ; restored on exit; the macros use the highest 32 bytes as scratch.
  ;
  921. bn_sqr_comba8
  922. .PROC
  923. .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  924. .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  925. .ENTRY
  926. .align 64
  927. STD %r3,0(%sp) ; save r3 (c3)
  928. STD %r4,8(%sp) ; save r4 (m)
  929. STD %r5,16(%sp) ; save r5 (lt)
  930. STD %r6,24(%sp) ; save r6 (ht)
  931. ;
  932. ; Zero out the three accumulator words
  933. ;
  934. COPY %r0,c1
  935. COPY %r0,c2
  936. COPY %r0,c3
  937. LDO 128(%sp),%sp ; allocate the 128-byte frame
  938. DEPDI,Z -1,32,33,high_mask ; high_mask = 0xffffffff80000000 (used by SQR_ADD_C)
  939. DEPDI,Z 1,31,1,high_one ; high_one = 1 << 32 (used by SQR_ADD_C2)
  940. ;
  941. ; Load all eight operand words into floating-point registers
  942. ;
  943. FLDD 0(a_ptr),a0
  944. FLDD 8(a_ptr),a1
  945. FLDD 16(a_ptr),a2
  946. FLDD 24(a_ptr),a3
  947. FLDD 32(a_ptr),a4
  948. FLDD 40(a_ptr),a5
  949. FLDD 48(a_ptr),a6
  950. FLDD 56(a_ptr),a7
  951. SQR_ADD_C a0L,a0R,c1,c2,c3
  952. STD c1,0(r_ptr) ; r[0] = c1;
  953. COPY %r0,c1
  954. SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
  955. STD c2,8(r_ptr) ; r[1] = c2;
  956. COPY %r0,c2
  957. SQR_ADD_C a1L,a1R,c3,c1,c2
  958. SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
  959. STD c3,16(r_ptr) ; r[2] = c3;
  960. COPY %r0,c3
  961. SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
  962. SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
  963. STD c1,24(r_ptr) ; r[3] = c1;
  964. COPY %r0,c1
  965. SQR_ADD_C a2L,a2R,c2,c3,c1
  966. SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
  967. SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
  968. STD c2,32(r_ptr) ; r[4] = c2;
  969. COPY %r0,c2
  970. SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
  971. SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
  972. SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
  973. STD c3,40(r_ptr) ; r[5] = c3;
  974. COPY %r0,c3
  975. SQR_ADD_C a3L,a3R,c1,c2,c3
  976. SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
  977. SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
  978. SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
  979. STD c1,48(r_ptr) ; r[6] = c1;
  980. COPY %r0,c1
  981. SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
  982. SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
  983. SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
  984. SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
  985. STD c2,56(r_ptr) ; r[7] = c2;
  986. COPY %r0,c2
  987. SQR_ADD_C a4L,a4R,c3,c1,c2
  988. SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
  989. SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
  990. SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
  991. STD c3,64(r_ptr) ; r[8] = c3;
  992. COPY %r0,c3
  993. SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
  994. SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
  995. SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
  996. STD c1,72(r_ptr) ; r[9] = c1;
  997. COPY %r0,c1
  998. SQR_ADD_C a5L,a5R,c2,c3,c1
  999. SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
  1000. SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
  1001. STD c2,80(r_ptr) ; r[10] = c2;
  1002. COPY %r0,c2
  1003. SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
  1004. SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
  1005. STD c3,88(r_ptr) ; r[11] = c3;
  1006. COPY %r0,c3
  1007. SQR_ADD_C a6L,a6R,c1,c2,c3
  1008. SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
  1009. STD c1,96(r_ptr) ; r[12] = c1;
  1010. COPY %r0,c1
  1011. SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
  1012. STD c2,104(r_ptr) ; r[13] = c2;
  1013. COPY %r0,c2
  1014. SQR_ADD_C a7L,a7R,c3,c1,c2
  1015. STD c3, 112(r_ptr) ; r[14] = c3
  1016. STD c1, 120(r_ptr) ; r[15] = c1 (final carry word)
  1017. .EXIT
  1018. LDD -104(%sp),%r6 ; restore r6
  1019. LDD -112(%sp),%r5 ; restore r5
  1020. LDD -120(%sp),%r4 ; restore r4
  1021. BVE (%rp) ; return to caller
  1022. LDD,MB -128(%sp),%r3 ; branch delay slot: pop the 128-byte frame and restore r3
  1023. .PROCEND
  1024. ;-----------------------------------------------------------------------------
  1025. ;
  1026. ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
  1027. ; arg0 = r_ptr (result, 8 words written: offsets 0..56)
  1028. ; arg1 = a_ptr (operand, 4 words read: offsets 0..24)
  1029. ;
  ; Computes r[0..7] = a[0..3]^2 one result column at a time. For column
  ; k every product a[i]*a[j] with i+j = k is accumulated: SQR_ADD_C for
  ; the diagonal term (i == j), SQR_ADD_C2 for each off-diagonal pair
  ; (added twice). The low accumulator word is then stored to r[k],
  ; cleared, and the roles of c1/c2/c3 rotate for the next column.
  ;
  ; r_ptr, a_ptr and high_mask are register aliases defined elsewhere in
  ; the file.
  ;
  ; Frame (128 bytes): %r3-%r6 are saved in the lowest 32 bytes and
  ; restored on exit; the macros use the highest 32 bytes as scratch.
  ;
  1030. bn_sqr_comba4
  1031. .proc
  1032. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  1033. .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  1034. .entry
  1035. .align 64
  1036. STD %r3,0(%sp) ; save r3 (c3)
  1037. STD %r4,8(%sp) ; save r4 (m)
  1038. STD %r5,16(%sp) ; save r5 (lt)
  1039. STD %r6,24(%sp) ; save r6 (ht)
  1040. ;
  1041. ; Zero out the three accumulator words
  1042. ;
  1043. COPY %r0,c1
  1044. COPY %r0,c2
  1045. COPY %r0,c3
  1046. LDO 128(%sp),%sp ; allocate the 128-byte frame
  1047. DEPDI,Z -1,32,33,high_mask ; high_mask = 0xffffffff80000000 (used by SQR_ADD_C)
  1048. DEPDI,Z 1,31,1,high_one ; high_one = 1 << 32 (used by SQR_ADD_C2)
  1049. ;
  1050. ; Load the four operand words. Only a[0..3] belong to the operand of
  1051. ; this 4-word variant, so nothing beyond offset 24 may be read.
  ;
  1052. FLDD 0(a_ptr),a0
  1053. FLDD 8(a_ptr),a1
  1054. FLDD 16(a_ptr),a2
  1055. FLDD 24(a_ptr),a3
  1060. SQR_ADD_C a0L,a0R,c1,c2,c3
  1061. STD c1,0(r_ptr) ; r[0] = c1;
  1062. COPY %r0,c1
  1063. SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
  1064. STD c2,8(r_ptr) ; r[1] = c2;
  1065. COPY %r0,c2
  1066. SQR_ADD_C a1L,a1R,c3,c1,c2
  1067. SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
  1068. STD c3,16(r_ptr) ; r[2] = c3;
  1069. COPY %r0,c3
  1070. SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
  1071. SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
  1072. STD c1,24(r_ptr) ; r[3] = c1;
  1073. COPY %r0,c1
  1074. SQR_ADD_C a2L,a2R,c2,c3,c1
  1075. SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
  1076. STD c2,32(r_ptr) ; r[4] = c2;
  1077. COPY %r0,c2
  1078. SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
  1079. STD c3,40(r_ptr) ; r[5] = c3;
  1080. COPY %r0,c3
  1081. SQR_ADD_C a3L,a3R,c1,c2,c3
  1082. STD c1,48(r_ptr) ; r[6] = c1;
  1083. STD c2,56(r_ptr) ; r[7] = c2 (final carry word)
  1084. .EXIT
  1085. LDD -104(%sp),%r6 ; restore r6
  1086. LDD -112(%sp),%r5 ; restore r5
  1087. LDD -120(%sp),%r4 ; restore r4
  1088. BVE (%rp) ; return to caller
  1089. LDD,MB -128(%sp),%r3 ; branch delay slot: pop the 128-byte frame and restore r3
  1090. .PROCEND
  1091. ;---------------------------------------------------------------------------
  ;
  ; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
  ;   Multiplies the 64-bit word with halves A0L/A0R by the 64-bit word
  ;   with halves B0L/B0R and adds the 128-bit product into the three-word
  ;   accumulator C3:C2:C1 (C1 is the least significant word).
  ;   Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the scratch
  ;   slots at -32(%sp) .. -8(%sp). Reads high_one, which the calling
  ;   routine must have loaded with 1 << 32.
  ;
  1092. MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
  1093. XMPYU A0L,B0R,ftemp1 ; m1 = hi(A) * lo(B)
  1094. FSTD ftemp1,-16(%sp) ; spill m1
  1095. XMPYU A0R,B0L,ftemp2 ; m = lo(A) * hi(B)
  1096. FSTD ftemp2,-8(%sp) ; spill m
  1097. XMPYU A0R,B0R,ftemp3 ; lt = lo(A) * lo(B)
  1098. FSTD ftemp3,-32(%sp) ; spill lt
  1099. XMPYU A0L,B0L,ftemp4 ; ht = hi(A) * hi(B)
  1100. FSTD ftemp4,-24(%sp) ; spill ht
  1101. LDD -8(%sp),m ; reload m
  1102. LDD -16(%sp),m1 ; reload m1
  1103. ADD,L m,m1,m ; m = m + m1 (may wrap past 64 bits)
  1104. DEPD,Z m,31,32,temp3 ; temp3 = m << 32
  1105. LDD -24(%sp),ht ; reload ht
  1106. CMPCLR,*>>= m,m1,%r0 ; skip the next instruction unless m < m1 (i.e. the sum wrapped)
  1107. ADD,L ht,high_one,ht ; wrapped: the lost bit is worth 1 << 32 in ht
  1108. EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
  1109. LDD -32(%sp),lt ; reload lt
  1110. ADD,L ht,temp1,ht ; ht += m >> 32
  1111. ADD lt,temp3,lt ; lt += m << 32
  1112. ADD,DC ht,%r0,ht ; ht += carry out of lt
  1113. ADD C1,lt,C1 ; C1 += lt
  1114. ADD,DC ht,%r0,ht ; ht += carry out of C1
  1115. ADD C2,ht,C2 ; C2 += ht
  1116. ADD,DC C3,%r0,C3 ; C3 += carry out of C2
  1117. .endm
  1118. ;
  1119. ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  1120. ; arg0 = r_ptr (result, 16 words written: offsets 0..120)
  1121. ; arg1 = a_ptr (operand, 8 words read: offsets 0..56)
  1122. ; arg2 = b_ptr (operand, 8 words read: offsets 0..56)
  1123. ;
  ; Computes r[0..15] = a[0..7] * b[0..7] one result column at a time. For
  ; column k every product a[i]*b[j] with i+j = k is accumulated with
  ; MUL_ADD_C; the low accumulator word is then stored to r[k], cleared,
  ; and the roles of c1/c2/c3 rotate for the next column.
  ;
  ; r_ptr, a_ptr and b_ptr are register aliases defined elsewhere in the
  ; file.
  ;
  ; Frame (128 bytes): %r3-%r6 and %fr12/%fr13 (b6/b7) are saved in the
  ; lowest 48 bytes and restored on exit; MUL_ADD_C uses the highest 32
  ; bytes as scratch.
  ;
  1124. bn_mul_comba8
  1125. .proc
  1126. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  1127. .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  1128. .entry
  1129. .align 64
  1130. STD %r3,0(%sp) ; save r3 (c3)
  1131. STD %r4,8(%sp) ; save r4 (m)
  1132. STD %r5,16(%sp) ; save r5 (lt)
  1133. STD %r6,24(%sp) ; save r6 (ht)
  1134. FSTD %fr12,32(%sp) ; save fr12 (b6)
  1135. FSTD %fr13,40(%sp) ; save fr13 (b7)
  1136. ;
  1137. ; Zero out the three accumulator words
  1138. ;
  1139. COPY %r0,c1
  1140. COPY %r0,c2
  1141. COPY %r0,c3
  1142. LDO 128(%sp),%sp ; allocate the 128-byte frame
  1143. DEPDI,Z 1,31,1,high_one ; high_one = 1 << 32 (used by MUL_ADD_C)
  1144. ;
  1145. ; Load all sixteen operand words into floating-point registers
  1146. ;
  1147. FLDD 0(a_ptr),a0
  1148. FLDD 8(a_ptr),a1
  1149. FLDD 16(a_ptr),a2
  1150. FLDD 24(a_ptr),a3
  1151. FLDD 32(a_ptr),a4
  1152. FLDD 40(a_ptr),a5
  1153. FLDD 48(a_ptr),a6
  1154. FLDD 56(a_ptr),a7
  1155. FLDD 0(b_ptr),b0
  1156. FLDD 8(b_ptr),b1
  1157. FLDD 16(b_ptr),b2
  1158. FLDD 24(b_ptr),b3
  1159. FLDD 32(b_ptr),b4
  1160. FLDD 40(b_ptr),b5
  1161. FLDD 48(b_ptr),b6
  1162. FLDD 56(b_ptr),b7
  1163. MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
  1164. STD c1,0(r_ptr) ; r[0] = c1
  1165. COPY %r0,c1
  1166. MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
  1167. MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
  1168. STD c2,8(r_ptr) ; r[1] = c2
  1169. COPY %r0,c2
  1170. MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
  1171. MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
  1172. MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
  1173. STD c3,16(r_ptr) ; r[2] = c3
  1174. COPY %r0,c3
  1175. MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
  1176. MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
  1177. MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
  1178. MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
  1179. STD c1,24(r_ptr) ; r[3] = c1
  1180. COPY %r0,c1
  1181. MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
  1182. MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
  1183. MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
  1184. MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
  1185. MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
  1186. STD c2,32(r_ptr) ; r[4] = c2
  1187. COPY %r0,c2
  1188. MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
  1189. MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
  1190. MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
  1191. MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
  1192. MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
  1193. MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
  1194. STD c3,40(r_ptr) ; r[5] = c3
  1195. COPY %r0,c3
  1196. MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
  1197. MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
  1198. MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
  1199. MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
  1200. MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
  1201. MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
  1202. MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
  1203. STD c1,48(r_ptr) ; r[6] = c1
  1204. COPY %r0,c1
  1205. MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
  1206. MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
  1207. MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
  1208. MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
  1209. MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
  1210. MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
  1211. MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
  1212. MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
  1213. STD c2,56(r_ptr) ; r[7] = c2
  1214. COPY %r0,c2
  1215. MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
  1216. MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
  1217. MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
  1218. MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
  1219. MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
  1220. MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
  1221. MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
  1222. STD c3,64(r_ptr) ; r[8] = c3
  1223. COPY %r0,c3
  1224. MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
  1225. MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
  1226. MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
  1227. MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
  1228. MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
  1229. MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
  1230. STD c1,72(r_ptr) ; r[9] = c1
  1231. COPY %r0,c1
  1232. MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
  1233. MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
  1234. MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
  1235. MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
  1236. MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
  1237. STD c2,80(r_ptr) ; r[10] = c2
  1238. COPY %r0,c2
  1239. MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
  1240. MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
  1241. MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
  1242. MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
  1243. STD c3,88(r_ptr) ; r[11] = c3
  1244. COPY %r0,c3
  1245. MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
  1246. MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
  1247. MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
  1248. STD c1,96(r_ptr) ; r[12] = c1
  1249. COPY %r0,c1
  1250. MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
  1251. MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
  1252. STD c2,104(r_ptr) ; r[13] = c2
  1253. COPY %r0,c2
  1254. MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
  1255. STD c3,112(r_ptr) ; r[14] = c3
  1256. STD c1,120(r_ptr) ; r[15] = c1 (final carry word)
  1257. .EXIT
  1258. FLDD -88(%sp),%fr13 ; restore fr13 (b7)
  1259. FLDD -96(%sp),%fr12 ; restore fr12 (b6)
  1260. LDD -104(%sp),%r6 ; restore r6
  1261. LDD -112(%sp),%r5 ; restore r5
  1262. LDD -120(%sp),%r4 ; restore r4
  1263. BVE (%rp) ; return to caller
  1264. LDD,MB -128(%sp),%r3 ; branch delay slot: pop the 128-byte frame and restore r3
  1265. .PROCEND
  1266. ;-----------------------------------------------------------------------------
  1267. ;
  1268. ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  1269. ; arg0 = r_ptr (result, 8 words written: offsets 0..56)
  1270. ; arg1 = a_ptr (operand, 4 words read: offsets 0..24)
  1271. ; arg2 = b_ptr (operand, 4 words read: offsets 0..24)
  1272. ;
  ; Computes r[0..7] = a[0..3] * b[0..3] one result column at a time. For
  ; column k every product a[i]*b[j] with i+j = k is accumulated with
  ; MUL_ADD_C; the low accumulator word is then stored to r[k], cleared,
  ; and the roles of c1/c2/c3 rotate for the next column.
  ;
  ; r_ptr, a_ptr and b_ptr are register aliases defined elsewhere in the
  ; file.
  ;
  ; Frame (128 bytes): %r3-%r6 and %fr12/%fr13 are saved in the lowest 48
  ; bytes and restored on exit; MUL_ADD_C uses the highest 32 bytes as
  ; scratch.
  ; NOTE(review): %fr12/%fr13 (b6/b7) are saved and restored here although
  ; this routine only references b0-b3; harmless, but apparently not needed.
  ;
  1273. bn_mul_comba4
  1274. .proc
  1275. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  1276. .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  1277. .entry
  1278. .align 64
  1279. STD %r3,0(%sp) ; save r3 (c3)
  1280. STD %r4,8(%sp) ; save r4 (m)
  1281. STD %r5,16(%sp) ; save r5 (lt)
  1282. STD %r6,24(%sp) ; save r6 (ht)
  1283. FSTD %fr12,32(%sp) ; save fr12
  1284. FSTD %fr13,40(%sp) ; save fr13
  1285. ;
  1286. ; Zero out the three accumulator words
  1287. ;
  1288. COPY %r0,c1
  1289. COPY %r0,c2
  1290. COPY %r0,c3
  1291. LDO 128(%sp),%sp ; allocate the 128-byte frame
  1292. DEPDI,Z 1,31,1,high_one ; high_one = 1 << 32 (used by MUL_ADD_C)
  1293. ;
  1294. ; Load all eight operand words into floating-point registers
  1295. ;
  1296. FLDD 0(a_ptr),a0
  1297. FLDD 8(a_ptr),a1
  1298. FLDD 16(a_ptr),a2
  1299. FLDD 24(a_ptr),a3
  1300. FLDD 0(b_ptr),b0
  1301. FLDD 8(b_ptr),b1
  1302. FLDD 16(b_ptr),b2
  1303. FLDD 24(b_ptr),b3
  1304. MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
  1305. STD c1,0(r_ptr) ; r[0] = c1
  1306. COPY %r0,c1
  1307. MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
  1308. MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
  1309. STD c2,8(r_ptr) ; r[1] = c2
  1310. COPY %r0,c2
  1311. MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
  1312. MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
  1313. MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
  1314. STD c3,16(r_ptr) ; r[2] = c3
  1315. COPY %r0,c3
  1316. MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
  1317. MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
  1318. MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
  1319. MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
  1320. STD c1,24(r_ptr) ; r[3] = c1
  1321. COPY %r0,c1
  1322. MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
  1323. MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
  1324. MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
  1325. STD c2,32(r_ptr) ; r[4] = c2
  1326. COPY %r0,c2
  1327. MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
  1328. MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
  1329. STD c3,40(r_ptr) ; r[5] = c3
  1330. COPY %r0,c3
  1331. MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
  1332. STD c1,48(r_ptr) ; r[6] = c1
  1333. STD c2,56(r_ptr) ; r[7] = c2 (final carry word)
  1334. .EXIT
  1335. FLDD -88(%sp),%fr13 ; restore fr13
  1336. FLDD -96(%sp),%fr12 ; restore fr12
  1337. LDD -104(%sp),%r6 ; restore r6
  1338. LDD -112(%sp),%r5 ; restore r5
  1339. LDD -120(%sp),%r4 ; restore r4
  1340. BVE (%rp) ; return to caller
  1341. LDD,MB -128(%sp),%r3 ; branch delay slot: pop the 128-byte frame and restore r3
  1342. .PROCEND
  1343. ;--- not PIC .SPACE $TEXT$
  1344. ;--- not PIC .SUBSPA $CODE$
  1345. ;--- not PIC .SPACE $PRIVATE$,SORT=16
  1346. ;--- not PIC .IMPORT $global$,DATA
  1347. ;--- not PIC .SPACE $TEXT$
  1348. ;--- not PIC .SUBSPA $CODE$
  1349. ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
  1350. ;--- not PIC C$7
  1351. ;--- not PIC .ALIGN 8
  1352. ;--- not PIC .STRINGZ "Division would overflow (%d)\n"
  1353. .END