2
0

pa-risc2W.s 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605
  1. ;
  2. ; PA-RISC 64-bit implementation of bn_asm code
  3. ;
  4. ; This code is approximately 2x faster than the C version
  5. ; for RSA/DSA.
  6. ;
  7. ; See http://devresource.hp.com/ for more details on the PA-RISC
  8. ; architecture. Also see the book "PA-RISC 2.0 Architecture"
  9. ; by Gerry Kane for information on the instruction set architecture.
  10. ;
  11. ; Code written by Chris Ruemmler (with some help from the HP C
  12. ; compiler).
  13. ;
  14. ; The code compiles with HP's assembler
  15. ;
  16. .level 2.0W ; assemble at the PA-RISC 2.0 wide (64-bit) level
  17. .space $TEXT$ ; everything below goes into the $TEXT$ space
  18. .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY ; code-only $CODE$ subspace, 8-byte aligned
  19. ;
  20. ; Global Register definitions used for the routines.
  21. ;
  22. ; Some information about HP's runtime architecture for 64-bits.
  23. ;
  24. ; "Caller save" means the calling function must save the register
  25. ; if it wants the register to be preserved.
  26. ; "Callee save" means if a function uses the register, it must save
  27. ; the value before using it.
  28. ;
  29. ; For the floating point registers
  30. ;
  31. ; "caller save" registers: fr4-fr11, fr22-fr31
  32. ; "callee save" registers: fr12-fr21
  33. ; "special" registers: fr0-fr3 (status and exception registers)
  34. ;
  35. ; For the integer registers
  36. ; value zero : r0
  37. ; "caller save" registers: r1,r19-r26
  38. ; "callee save" registers: r3-r18
  39. ; return register : r2 (rp)
  40. ; return values : r28 (ret0,ret1)
  41. ; Stack pointer : r30 (sp)
  42. ; global data pointer : r27 (dp)
  43. ; argument pointer : r29 (ap)
  44. ; millicode return ptr : r31 (also a caller save register)
  45. ;
  46. ; Arguments to the routines
  47. ;
  48. r_ptr .reg %r26 ; arg0: result array pointer
  49. a_ptr .reg %r25 ; arg1: first source array pointer
  50. b_ptr .reg %r24 ; arg2 when it is a second source pointer (bn_add_words, bn_sub_words)
  51. num .reg %r24 ; arg2 when it is a word count (same register as b_ptr)
  52. w .reg %r23 ; arg3 when it is the 64-bit multiplier word
  53. n .reg %r23 ; arg3 when it is a word count (same register as w)
  54. ;
  55. ; Globals used in some routines
  56. ;
  57. top_overflow .reg %r29 ; loaded with 1 << 32 by the multiply routines
  58. high_mask .reg %r22 ; value 0xffffffff80000000L
  59. ;------------------------------------------------------------------------------
  60. ;
  61. ; bn_mul_add_words
  62. ; For each of the num words: rp[i] = low 64 bits of (ap[i]*w + rp[i] + carry).
  63. ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
  64. ; int num, BN_ULONG w)
  65. ; Returns the final carry word in ret0 (0 when num <= 0).
  66. ; arg0 = r_ptr
  67. ; arg1 = a_ptr
  68. ; arg2 = num
  69. ; arg3 = w
  70. ; Each 64x64 product is built from four 32x32 XMPYU partial products.
  71. ; Local register definitions
  72. ;
  73. fm1 .reg %fr22 ; m1[0] = ht[0]*w_l product
  74. fm .reg %fr23 ; m[0] = lt[0]*w_h product
  75. ht_temp .reg %fr24 ; ht[0]*w_h product
  76. ht_temp_1 .reg %fr25 ; ht[1]*w_h product
  77. lt_temp .reg %fr26 ; lt[0]*w_l product
  78. lt_temp_1 .reg %fr27 ; lt[1]*w_l product
  79. fm1_1 .reg %fr28 ; m1[1] = ht[1]*w_l product
  80. fm_1 .reg %fr29 ; m[1] = lt[1]*w_h product
  81. fw_h .reg %fr7L ; upper 32 bits of w
  82. fw_l .reg %fr7R ; lower 32 bits of w
  83. fw .reg %fr7 ; all 64 bits of w
  84. fht_0 .reg %fr8L ; upper 32 bits of a[0]
  85. flt_0 .reg %fr8R ; lower 32 bits of a[0]
  86. t_float_0 .reg %fr8 ; all 64 bits of a[0]
  87. fht_1 .reg %fr9L ; upper 32 bits of a[1]
  88. flt_1 .reg %fr9R ; lower 32 bits of a[1]
  89. t_float_1 .reg %fr9 ; all 64 bits of a[1]
  90. tmp_0 .reg %r31
  91. tmp_1 .reg %r21
  92. m_0 .reg %r20
  93. m_1 .reg %r19
  94. ht_0 .reg %r1
  95. ht_1 .reg %r3 ; r3-r9 are callee save: saved in the prologue
  96. lt_0 .reg %r4
  97. lt_1 .reg %r5
  98. m1_0 .reg %r6
  99. m1_1 .reg %r7
  100. rp_val .reg %r8
  101. rp_val_1 .reg %r9
  102. bn_mul_add_words
  103. .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
  104. .proc
  105. .callinfo frame=128
  106. .entry
  107. .align 64
  108. STD %r3,0(%sp) ; save r3
  109. STD %r4,8(%sp) ; save r4
  110. NOP ; Needed to make the loop 16-byte aligned
  111. NOP ; Needed to make the loop 16-byte aligned
  112. STD %r5,16(%sp) ; save r5
  113. STD %r6,24(%sp) ; save r6
  114. STD %r7,32(%sp) ; save r7
  115. STD %r8,40(%sp) ; save r8
  116. STD %r9,48(%sp) ; save r9
  117. COPY %r0,%ret0 ; carry c = 0; also the return value when num <= 0
  118. DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
  119. STD w,56(%sp) ; store w on stack (read back at -72(%sp) after the bump)
  120. CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
  121. LDO 128(%sp),%sp ; bump stack (delay slot: the exit code expects the bumped sp)
  122. ;
  123. ; The loop is unrolled twice, so if there is only 1 number
  124. ; then go straight to the cleanup code.
  125. ;
  126. CMPIB,= 1,num,bn_mul_add_words_single_top
  127. FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
  128. ;
  129. ; This loop is unrolled 2 times (64-byte aligned as well)
  130. ;
  131. ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
  132. ; two 32-bit multiplies can be issued per cycle.
  133. ;
  134. bn_mul_add_words_unroll2
  135. FLDD 0(a_ptr),t_float_0 ; load a[0] into fr8: ht = fr8L, lt = fr8R
  136. FLDD 8(a_ptr),t_float_1 ; load a[1] into fr9: ht = fr9L, lt = fr9R
  137. LDD 0(r_ptr),rp_val ; rp[0]
  138. LDD 8(r_ptr),rp_val_1 ; rp[1]
  139. XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
  140. XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
  141. FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
  142. FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
  143. XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
  144. XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
  145. FSTD fm,-8(%sp) ; -8(sp) = m[0]
  146. FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
  147. XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
  148. XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
  149. FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
  150. FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
  151. XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
  152. XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
  153. FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
  154. FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
  155. LDD -8(%sp),m_0 ; m[0]
  156. LDD -40(%sp),m_1 ; m[1]
  157. LDD -16(%sp),m1_0 ; m1[0]
  158. LDD -48(%sp),m1_1 ; m1[1]
  159. LDD -24(%sp),ht_0 ; ht[0]
  160. LDD -56(%sp),ht_1 ; ht[1]
  161. ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
  162. ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
  163. LDD -32(%sp),lt_0 ; lt[0]
  164. LDD -64(%sp),lt_1 ; lt[1]
  165. CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (tmp_0 < m1[0]), i.e. the sum wrapped
  166. ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
  167. CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (tmp_1 < m1[1]), i.e. the sum wrapped
  168. ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
  169. EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
  170. DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
  171. EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
  172. DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
  173. ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
  174. ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
  175. ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
  176. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the previous ADD
  177. ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
  178. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
  179. ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
  180. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
  181. ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
  182. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
  183. LDO -2(num),num ; num = num - 2;
  184. ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
  185. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
  186. STD lt_0,0(r_ptr) ; rp[0] = lt[0]
  187. ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
  188. ADD,DC ht_1,%r0,%ret0 ; c (ret0) = ht[1] + carry
  189. LDO 16(a_ptr),a_ptr ; a_ptr += 2
  190. STD lt_1,8(r_ptr) ; rp[1] = lt[1]
  191. CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
  192. LDO 16(r_ptr),r_ptr ; r_ptr += 2 (delay slot)
  193. CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
  194. ;
  195. ; Single-word cleanup. Top of loop aligned on 64-byte boundary
  196. ;
  197. bn_mul_add_words_single_top
  198. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
  199. LDD 0(r_ptr),rp_val ; rp[0]
  200. LDO 8(a_ptr),a_ptr ; a_ptr++
  201. XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
  202. FSTD fm1,-16(%sp) ; -16(sp) = m1
  203. XMPYU flt_0,fw_h,fm ; m = lt*fw_h
  204. FSTD fm,-8(%sp) ; -8(sp) = m
  205. XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
  206. FSTD ht_temp,-24(%sp) ; -24(sp) = ht
  207. XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
  208. FSTD lt_temp,-32(%sp) ; -32(sp) = lt
  209. LDD -8(%sp),m_0 ; m
  210. LDD -16(%sp),m1_0 ; m1 = temp1
  211. ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
  212. LDD -24(%sp),ht_0 ; ht
  213. LDD -32(%sp),lt_0 ; lt
  214. CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (tmp_0 < m1), i.e. the sum wrapped
  215. ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
  216. EXTRD,U tmp_0,31,32,m_0 ; m>>32
  217. DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
  218. ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
  219. ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
  220. ADD,DC ht_0,%r0,ht_0 ; ht += carry
  221. ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
  222. ADD,DC ht_0,%r0,ht_0 ; ht += carry
  223. ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
  224. ADD,DC ht_0,%r0,%ret0 ; c (ret0) = ht + carry
  225. STD lt_0,0(r_ptr) ; rp[0] = lt
  226. bn_mul_add_words_exit
  227. .EXIT
  228. LDD -80(%sp),%r9 ; restore r9
  229. LDD -88(%sp),%r8 ; restore r8
  230. LDD -96(%sp),%r7 ; restore r7
  231. LDD -104(%sp),%r6 ; restore r6
  232. LDD -112(%sp),%r5 ; restore r5
  233. LDD -120(%sp),%r4 ; restore r4
  234. BVE (%rp)
  235. LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
  236. .PROCEND ;in=23,24,25,26,29;out=28;
  237. ;----------------------------------------------------------------------------
  238. ;
  239. ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  240. ; For each of the num words: rp[i] = low 64 bits of (ap[i]*w + carry); returns the final carry word in ret0.
  241. ; arg0 = rp
  242. ; arg1 = ap
  243. ; arg2 = num
  244. ; arg3 = w
  245. bn_mul_words
  246. .proc
  247. .callinfo frame=128
  248. .entry
  249. .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  250. .align 64
  251. STD %r3,0(%sp) ; save r3
  252. STD %r4,8(%sp) ; save r4
  253. STD %r5,16(%sp) ; save r5
  254. STD %r6,24(%sp) ; save r6
  255. STD %r7,32(%sp) ; save r7
  256. COPY %r0,%ret0 ; carry c = 0; also the return value when num <= 0
  257. DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
  258. STD w,56(%sp) ; w on stack (read back at -72(%sp) after the bump)
  259. CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
  260. LDO 128(%sp),%sp ; bump stack (delay slot: the exit code expects the bumped sp)
  261. ;
  262. ; See if only 1 word to do, thus just do cleanup
  263. ;
  264. CMPIB,= 1,num,bn_mul_words_single_top
  265. FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
  266. ;
  267. ; This loop is unrolled 2 times (64-byte aligned as well)
  268. ;
  269. ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
  270. ; two 32-bit multiplies can be issued per cycle.
  271. ;
  272. bn_mul_words_unroll2
  273. FLDD 0(a_ptr),t_float_0 ; load a[0] into fr8: ht = fr8L, lt = fr8R
  274. FLDD 8(a_ptr),t_float_1 ; load a[1] into fr9: ht = fr9L, lt = fr9R
  275. XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
  276. XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
  277. FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
  278. FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
  279. XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
  280. XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
  281. FSTD fm,-8(%sp) ; -8(sp) = m[0]
  282. FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
  283. XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
  284. XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
  285. FSTD ht_temp,-24(%sp) ; -24(sp) = ht[0]
  286. FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht[1]
  287. XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
  288. XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
  289. FSTD lt_temp,-32(%sp) ; -32(sp) = lt[0]
  290. FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt[1]
  291. LDD -8(%sp),m_0 ; m[0]
  292. LDD -40(%sp),m_1 ; m[1]
  293. LDD -16(%sp),m1_0 ; m1[0]
  294. LDD -48(%sp),m1_1 ; m1[1]
  295. LDD -24(%sp),ht_0 ; ht[0]
  296. LDD -56(%sp),ht_1 ; ht[1]
  297. ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
  298. ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
  299. LDD -32(%sp),lt_0 ; lt[0]
  300. LDD -64(%sp),lt_1 ; lt[1]
  301. CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (tmp_0 < m1[0]), i.e. the sum wrapped
  302. ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
  303. CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (tmp_1 < m1[1]), i.e. the sum wrapped
  304. ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
  305. EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
  306. DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
  307. EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
  308. DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
  309. ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
  310. ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
  311. ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
  312. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the previous ADD
  313. ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
  314. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
  315. ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c (ret0);
  316. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
  317. ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + c (ht_0)
  318. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
  319. STD lt_0,0(r_ptr) ; rp[0] = lt[0]
  320. STD lt_1,8(r_ptr) ; rp[1] = lt[1]
  321. COPY ht_1,%ret0 ; carry = ht[1]
  322. LDO -2(num),num ; num = num - 2;
  323. LDO 16(a_ptr),a_ptr ; ap += 2
  324. CMPIB,<= 2,num,bn_mul_words_unroll2 ; go again if at least 2 words remain
  325. LDO 16(r_ptr),r_ptr ; rp += 2 (delay slot)
  326. CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
  327. ;
  328. ; Single-word cleanup. Top of loop aligned on 64-byte boundary
  329. ;
  330. bn_mul_words_single_top
  331. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
  332. XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
  333. FSTD fm1,-16(%sp) ; -16(sp) = m1
  334. XMPYU flt_0,fw_h,fm ; m = lt*fw_h
  335. FSTD fm,-8(%sp) ; -8(sp) = m
  336. XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
  337. FSTD ht_temp,-24(%sp) ; -24(sp) = ht
  338. XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
  339. FSTD lt_temp,-32(%sp) ; -32(sp) = lt
  340. LDD -8(%sp),m_0 ; m
  341. LDD -16(%sp),m1_0 ; m1
  342. ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
  343. LDD -24(%sp),ht_0 ; ht
  344. LDD -32(%sp),lt_0 ; lt
  345. CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (tmp_0 < m1), i.e. the sum wrapped
  346. ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
  347. EXTRD,U tmp_0,31,32,m_0 ; m>>32
  348. DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
  349. ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
  350. ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
  351. ADD,DC ht_0,%r0,ht_0 ; ht += carry
  352. ADD %ret0,lt_0,lt_0 ; lt = lt + c;
  353. ADD,DC ht_0,%r0,ht_0 ; ht += carry
  354. COPY ht_0,%ret0 ; copy carry
  355. STD lt_0,0(r_ptr) ; rp[0] = lt
  356. bn_mul_words_exit
  357. .EXIT
  358. LDD -96(%sp),%r7 ; restore r7
  359. LDD -104(%sp),%r6 ; restore r6
  360. LDD -112(%sp),%r5 ; restore r5
  361. LDD -120(%sp),%r4 ; restore r4
  362. BVE (%rp)
  363. LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
  364. .PROCEND ;in=23,24,25,26,29;out=28;
  365. ;----------------------------------------------------------------------------
  366. ;
  367. ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
  368. ; For each of the num words: rp[2*i] and rp[2*i+1] get the low and high 64 bits of ap[i]*ap[i].
  369. ; arg0 = rp
  370. ; arg1 = ap
  371. ; arg2 = num
  372. ; With a = ht*2^32 + lt and m = ht*lt: a*a = ht*ht*2^64 + (m << 33) + lt*lt.
  373. bn_sqr_words
  374. .proc
  375. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  376. .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  377. .entry
  378. .align 64
  379. STD %r3,0(%sp) ; save r3
  380. STD %r4,8(%sp) ; save r4
  381. NOP
  382. STD %r5,16(%sp) ; save r5
  383. CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
  384. LDO 128(%sp),%sp ; bump stack (delay slot: the exit code expects the bumped sp)
  385. ;
  386. ; If only 1, then go straight to cleanup
  387. ;
  388. CMPIB,= 1,num,bn_sqr_words_single_top
  389. DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
  390. ;
  391. ; This loop is unrolled 2 times (64-byte aligned as well)
  392. ;
  393. bn_sqr_words_unroll2
  394. FLDD 0(a_ptr),t_float_0 ; a[0]
  395. FLDD 8(a_ptr),t_float_1 ; a[1]
  396. XMPYU fht_0,flt_0,fm ; m[0] = ht[0]*lt[0]
  397. XMPYU fht_1,flt_1,fm_1 ; m[1] = ht[1]*lt[1]
  398. FSTD fm,-24(%sp) ; store m[0]
  399. FSTD fm_1,-56(%sp) ; store m[1]
  400. XMPYU flt_0,flt_0,lt_temp ; lt[0] = lt[0]*lt[0]
  401. XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] = lt[1]*lt[1]
  402. FSTD lt_temp,-16(%sp) ; store lt[0]
  403. FSTD lt_temp_1,-48(%sp) ; store lt[1]
  404. XMPYU fht_0,fht_0,ht_temp ; ht[0] = ht[0]*ht[0]
  405. XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] = ht[1]*ht[1]
  406. FSTD ht_temp,-8(%sp) ; store ht[0]
  407. FSTD ht_temp_1,-40(%sp) ; store ht[1]
  408. LDD -24(%sp),m_0 ; m[0]
  409. LDD -56(%sp),m_1 ; m[1]
  410. AND m_0,high_mask,tmp_0 ; m[0] & Mask
  411. AND m_1,high_mask,tmp_1 ; m[1] & Mask
  412. DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
  413. DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
  414. LDD -16(%sp),lt_0 ; lt[0]
  415. LDD -48(%sp),lt_1 ; lt[1]
  416. EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
  417. EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
  418. LDD -8(%sp),ht_0 ; ht[0]
  419. LDD -40(%sp),ht_1 ; ht[1]
  420. ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
  421. ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
  422. ADD lt_0,m_0,lt_0 ; lt[0] = lt[0]+m[0]
  423. ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the previous ADD
  424. STD lt_0,0(r_ptr) ; rp[0] = lt[0]
  425. STD ht_0,8(r_ptr) ; rp[1] = ht[0]
  426. ADD lt_1,m_1,lt_1 ; lt[1] = lt[1]+m[1]
  427. ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
  428. STD lt_1,16(r_ptr) ; rp[2] = lt[1]
  429. STD ht_1,24(r_ptr) ; rp[3] = ht[1]
  430. LDO -2(num),num ; num = num - 2;
  431. LDO 16(a_ptr),a_ptr ; ap += 2
  432. CMPIB,<= 2,num,bn_sqr_words_unroll2 ; go again if at least 2 words remain
  433. LDO 32(r_ptr),r_ptr ; rp += 4 (delay slot)
  434. CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
  435. ;
  436. ; Single-word cleanup. Top of loop aligned on 64-byte boundary
  437. ;
  438. bn_sqr_words_single_top
  439. FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
  440. XMPYU fht_0,flt_0,fm ; m
  441. FSTD fm,-24(%sp) ; store m
  442. XMPYU flt_0,flt_0,lt_temp ; lt
  443. FSTD lt_temp,-16(%sp) ; store lt
  444. XMPYU fht_0,fht_0,ht_temp ; ht
  445. FSTD ht_temp,-8(%sp) ; store ht
  446. LDD -24(%sp),m_0 ; load m
  447. AND m_0,high_mask,tmp_0 ; m & Mask
  448. DEPD,Z m_0,30,31,m_0 ; m << 32+1
  449. LDD -16(%sp),lt_0 ; lt
  450. LDD -8(%sp),ht_0 ; ht
  451. EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
  452. ADD m_0,lt_0,lt_0 ; lt = lt+m
  453. ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
  454. ADD,DC ht_0,%r0,ht_0 ; ht += carry from lt = lt+m
  455. STD lt_0,0(r_ptr) ; rp[0] = lt
  456. STD ht_0,8(r_ptr) ; rp[1] = ht
  457. bn_sqr_words_exit
  458. .EXIT
  459. LDD -112(%sp),%r5 ; restore r5
  460. LDD -120(%sp),%r4 ; restore r4
  461. BVE (%rp)
  462. LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
  463. .PROCEND ;in=23,24,25,26,29;out=28;
  464. ;----------------------------------------------------------------------------
  465. ;
  466. ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  467. ; For each of the n words: r[i] = a[i] + b[i] + carry; returns the final carry in ret0.
  468. ; arg0 = rp
  469. ; arg1 = ap
  470. ; arg2 = bp
  471. ; arg3 = n
  472. t .reg %r22 ; a[i], then a[i] + c
  473. b .reg %r21 ; b[i]
  474. l .reg %r20 ; sum word stored to r[i]
  475. bn_add_words
  476. .proc
  477. .entry
  478. .callinfo
  479. .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  480. .align 64
  481. CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
  482. COPY %r0,%ret0 ; return 0 by default
  483. ;
  484. ; If 2 or more numbers do the loop
  485. ;
  486. CMPIB,= 1,n,bn_add_words_single_top
  487. NOP
  488. ;
  489. ; This loop is unrolled 2 times (64-byte aligned as well)
  490. ;
  491. bn_add_words_unroll2
  492. LDD 0(a_ptr),t ; t = a[0]
  493. LDD 0(b_ptr),b ; b = b[0]
  494. ADD t,%ret0,t ; t = t+c;
  495. ADD,DC %r0,%r0,%ret0 ; set c to carry
  496. ADD t,b,l ; l = t + b[0]
  497. ADD,DC %ret0,%r0,%ret0 ; c+= carry
  498. STD l,0(r_ptr) ; r[0] = l
  499. LDD 8(a_ptr),t ; t = a[1]
  500. LDD 8(b_ptr),b ; b = b[1]
  501. ADD t,%ret0,t ; t = t+c;
  502. ADD,DC %r0,%r0,%ret0 ; set c to carry
  503. ADD t,b,l ; l = t + b[1]
  504. ADD,DC %ret0,%r0,%ret0 ; c+= carry
  505. STD l,8(r_ptr) ; r[1] = l
  506. LDO -2(n),n ; n = n - 2
  507. LDO 16(a_ptr),a_ptr ; a_ptr += 2
  508. LDO 16(b_ptr),b_ptr ; b_ptr += 2
  509. CMPIB,<= 2,n,bn_add_words_unroll2 ; go again if at least 2 words remain
  510. LDO 16(r_ptr),r_ptr ; r_ptr += 2 (delay slot)
  511. CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
  512. bn_add_words_single_top
  513. LDD 0(a_ptr),t ; t = a[0]
  514. LDD 0(b_ptr),b ; b = b[0]
  515. ADD t,%ret0,t ; t = t+c;
  516. ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
  517. ADD t,b,l ; l = t + b[0]
  518. ADD,DC %ret0,%r0,%ret0 ; c+= carry
  519. STD l,0(r_ptr) ; r[0] = l
  520. bn_add_words_exit
  521. .EXIT
  522. BVE (%rp)
  523. NOP
  524. .PROCEND ;in=23,24,25,26,29;out=28;
  525. ;----------------------------------------------------------------------------
  526. ;
  527. ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  528. ; For each of the n words: r[i] = a[i] - b[i] - borrow; returns the final borrow in ret0.
  529. ; arg0 = rp
  530. ; arg1 = ap
  531. ; arg2 = bp
  532. ; arg3 = n
  533. t1 .reg %r22 ; a[i]
  534. t2 .reg %r21 ; b[i]
  535. sub_tmp1 .reg %r20 ; difference word stored to r[i]
  536. sub_tmp2 .reg %r19 ; candidate borrow: 0 if t1 > t2, else 1
  537. bn_sub_words
  538. .proc
  539. .callinfo
  540. .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  541. .entry
  542. .align 64
  543. CMPIB,>= 0,n,bn_sub_words_exit ; if (n <= 0) then exit
  544. COPY %r0,%ret0 ; return 0 by default
  545. ;
  546. ; If 2 or more numbers do the loop
  547. ;
  548. CMPIB,= 1,n,bn_sub_words_single_top
  549. NOP
  550. ;
  551. ; This loop is unrolled 2 times (64-byte aligned as well)
  552. ;
  553. bn_sub_words_unroll2
  554. LDD 0(a_ptr),t1 ; t1 = a[0]
  555. LDD 0(b_ptr),t2 ; t2 = b[0]
  556. SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
  557. SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
  558. CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2
  559. LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
  560. CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip next: the old borrow is kept
  561. COPY sub_tmp2,%ret0 ; c = (t1 < t2)
  562. STD sub_tmp1,0(r_ptr) ; r[0] = t3
  563. LDD 8(a_ptr),t1 ; t1 = a[1]
  564. LDD 8(b_ptr),t2 ; t2 = b[1]
  565. SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
  566. SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
  567. CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2
  568. LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
  569. CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip next: the old borrow is kept
  570. COPY sub_tmp2,%ret0 ; c = (t1 < t2)
  571. STD sub_tmp1,8(r_ptr) ; r[1] = t3
  572. LDO -2(n),n ; n = n - 2
  573. LDO 16(a_ptr),a_ptr ; a_ptr += 2
  574. LDO 16(b_ptr),b_ptr ; b_ptr += 2
  575. CMPIB,<= 2,n,bn_sub_words_unroll2 ; go again if at least 2 words remain
  576. LDO 16(r_ptr),r_ptr ; r_ptr += 2 (delay slot)
  577. CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
  578. bn_sub_words_single_top
  579. LDD 0(a_ptr),t1 ; t1 = a[0]
  580. LDD 0(b_ptr),t2 ; t2 = b[0]
  581. SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
  582. SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
  583. CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2
  584. LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
  585. CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip next: the old borrow is kept
  586. COPY sub_tmp2,%ret0 ; c = (t1 < t2)
  587. STD sub_tmp1,0(r_ptr) ; r[0] = t3
  588. bn_sub_words_exit
  589. .EXIT
  590. BVE (%rp)
  591. NOP
  592. .PROCEND ;in=23,24,25,26,29;out=28;
  593. ;------------------------------------------------------------------------------
  594. ;
  595. ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
  596. ; Builds a 64-bit result from two 32-bit quotient digits of (h:l) / d; returns -1 when d == 0.
  597. ; arg0 = h
  598. ; arg1 = l
  599. ; arg2 = d
  600. ; NOTE(review): the error path prints via fprintf and then calls abort.
  601. ; This is mainly just modified assembly from the compiler, thus the
  602. ; lack of variable names.
  603. ;
  604. ;------------------------------------------------------------------------------
  605. bn_div_words
  606. .proc
  607. .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
  608. .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  609. .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
  610. .IMPORT __iob,DATA
  611. .IMPORT fprintf,CODE,NO_RELOCATION
  612. .IMPORT abort,CODE,NO_RELOCATION
  613. .IMPORT $$div2U,MILLICODE
  614. .entry
  615. STD %r2,-16(%r30) ; save return pointer
  616. STD,MA %r3,352(%r30) ; save r3, then sp += 352
  617. STD %r4,-344(%r30) ; save r4
  618. STD %r5,-336(%r30) ; save r5
  619. STD %r6,-328(%r30) ; save r6
  620. STD %r7,-320(%r30) ; save r7
  621. STD %r8,-312(%r30) ; save r8
  622. STD %r9,-304(%r30) ; save r9
  623. STD %r10,-296(%r30) ; save r10
  624. STD %r27,-288(%r30) ; save gp
  625. COPY %r24,%r3 ; save d
  626. COPY %r26,%r4 ; save h (high 64-bits)
  627. LDO -1(%r0),%ret0 ; return -1 by default
  628. CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
  629. COPY %r25,%r5 ; save l (low 64-bits)
  630. LDO -48(%r30),%r29 ; create ap
  631. .CALL ;in=26,29;out=28;
  632. B,L BN_num_bits_word,%r2 ; i = BN_num_bits_word(d)
  633. COPY %r3,%r26 ; arg0 = d (delay slot)
  634. LDD -288(%r30),%r27 ; restore gp
  635. LDI 64,%r21
  636. CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
  637. COPY %ret0,%r24 ; i
  638. MTSARCM %r24 ; i to shift
  639. DEPDI,Z -1,%sar,1,%r29 ; r29 = 1 << i
  640. CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
  641. $00000012
  642. SUBI 64,%r24,%r31 ; i = 64 - i;
  643. CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
  644. SUB %r4,%r3,%r4 ; h -= d
  645. CMPB,= %r31,%r0,$0000001A ; if (i)
  646. COPY %r0,%r10 ; ret = 0
  647. MTSARCM %r31 ; i to shift
  648. DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
  649. SUBI 64,%r31,%r19 ; 64 - i; redundant
  650. MTSAR %r19 ; (64 -i) to shift
  651. SHRPD %r4,%r5,%sar,%r4 ; h = (h<<i) | (l>>(64-i))
  652. MTSARCM %r31 ; i to shift
  653. DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
  654. $0000001A
  655. DEPDI,Z -1,31,32,%r19 ; r19 = 0xffffffff00000000
  656. EXTRD,U %r3,31,32,%r6 ; dh = d >> 32 (upper 32 bits)
  657. EXTRD,U %r3,63,32,%r8 ; dl = d & 0xffffffff
  658. LDO 2(%r0),%r9 ; count = 2
  659. STD %r3,-280(%r30) ; "d" to stack
  660. $0000001C
  661. DEPDI,Z -1,63,32,%r29 ; q = 0xffffffff
  662. EXTRD,U %r4,31,32,%r31 ; h >> 32
  663. CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
  664. COPY %r4,%r26
  665. EXTRD,U %r4,31,32,%r25
  666. COPY %r6,%r24
  667. .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
  668. B,L $$div2U,%r2
  669. EXTRD,U %r6,31,32,%r23
  670. DEPD %r28,31,32,%r29 ; NOTE(review): presumably merges the two result halves from $$div2U into q; confirm
  671. $D2
  672. STD %r29,-272(%r30) ; q
  673. AND %r5,%r19,%r24 ; l & 0xffffffff00000000
  674. EXTRD,U %r24,31,32,%r24 ; r24 = upper 32 bits of l
  675. FLDD -272(%r30),%fr7 ; q
  676. FLDD -280(%r30),%fr8 ; d
  677. XMPYU %fr8L,%fr7L,%fr10 ; d_hi * q_hi
  678. FSTD %fr10,-256(%r30)
  679. XMPYU %fr8L,%fr7R,%fr22 ; d_hi * q_lo
  680. FSTD %fr22,-264(%r30)
  681. XMPYU %fr8R,%fr7L,%fr11 ; d_lo * q_hi
  682. XMPYU %fr8R,%fr7R,%fr23 ; d_lo * q_lo
  683. FSTD %fr11,-232(%r30)
  684. FSTD %fr23,-240(%r30)
  685. LDD -256(%r30),%r28
  686. DEPD,Z %r28,31,32,%r2 ; (d_hi*q_hi) << 32; r2 is scratch here, rp was saved on entry
  687. LDD -264(%r30),%r20
  688. ADD,L %r20,%r2,%r31 ; th = dh * q (low 64 bits)
  689. LDD -232(%r30),%r22
  690. DEPD,Z %r22,31,32,%r22 ; (d_lo*q_hi) << 32
  691. LDD -240(%r30),%r21
  692. B $00000024 ; enter loop
  693. ADD,L %r21,%r22,%r23 ; tl = dl * q (low 64 bits) (delay slot)
  694. $0000002A
  695. LDO -1(%r29),%r29 ; q--
  696. SUB %r23,%r8,%r23 ; tl -= dl
  697. $00000024
  698. SUB %r4,%r31,%r25 ; t = h - th
  699. AND %r25,%r19,%r26 ; t & 0xffffffff00000000
  700. CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
  701. DEPD,Z %r25,31,32,%r20 ; t << 32
  702. OR %r20,%r24,%r21 ; (t << 32) | upper 32 bits of l
  703. CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
  704. SUB %r31,%r6,%r31 ; th -= dh (delay slot of the backward branch)
  705. ;-------------Break path---------------------
  706. $00000046
  707. DEPD,Z %r23,31,32,%r25 ;tl
  708. EXTRD,U %r23,31,32,%r26 ;t
  709. AND %r25,%r19,%r24 ;tl = (tl<<32)&0xffffffff00000000
  710. ADD,L %r31,%r26,%r31 ;th += t;
  711. CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
  712. LDO 1(%r31),%r31 ; th++;
  713. CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward)
  714. LDO -1(%r29),%r29 ;q--;
  715. ADD,L %r4,%r3,%r4 ;h += d;
  716. $00000036
  717. ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
  718. SUB %r5,%r24,%r28 ; l -= tl;
  719. SUB %r4,%r31,%r24 ; h -= th;
  720. SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
  721. DEPD,Z %r29,31,32,%r10 ; ret = q<<32
  722. b $0000001C
  723. DEPD,Z %r28,31,32,%r5 ; l = l << 32
  724. $D1
  725. OR %r10,%r29,%r28 ; ret |= q
  726. $D3
  727. LDD -368(%r30),%r2 ; reload the return pointer saved at entry
  728. $D0
  729. LDD -296(%r30),%r10 ; restore r10
  730. LDD -304(%r30),%r9 ; restore r9
  731. LDD -312(%r30),%r8 ; restore r8
  732. LDD -320(%r30),%r7 ; restore r7
  733. LDD -328(%r30),%r6 ; restore r6
  734. LDD -336(%r30),%r5 ; restore r5
  735. LDD -344(%r30),%r4 ; restore r4
  736. BVE (%r2)
  737. .EXIT
  738. LDD,MB -352(%r30),%r3 ; restore r3 and pop the frame (delay slot)
  739. bn_div_err_case
  740. MFIA %r6 ; r6 = address of this instruction
  741. ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
  742. LDO R'bn_div_words-bn_div_err_case(%r1),%r6 ; r6 = address of bn_div_words
  743. ADDIL LT'__iob,%r27,%r1
  744. LDD RT'__iob(%r1),%r26 ; r26 = address of __iob
  745. ADDIL L'C$4-bn_div_words,%r6,%r1
  746. LDO R'C$4-bn_div_words(%r1),%r25 ; r25 = address of C$4 (defined outside this excerpt; presumably the format string)
  747. LDO 64(%r26),%r26 ; r26 = __iob + 64 bytes (presumably the stderr entry; confirm)
  748. .CALL ;in=24,25,26,29;out=28;
  749. B,L fprintf,%r2
  750. LDO -48(%r30),%r29 ; create ap (delay slot)
  751. LDD -288(%r30),%r27 ; restore gp
  752. .CALL ;in=29;
  753. B,L abort,%r2
  754. LDO -48(%r30),%r29 ; create ap (delay slot)
  755. LDD -288(%r30),%r27 ; restore gp
  756. B $D0
  757. LDD -368(%r30),%r2 ; reload the return pointer (delay slot)
  758. .PROCEND ;in=24,25,26,29;out=28;
  759. ;----------------------------------------------------------------------------
  760. ;
  761. ; Register aliases for operands that are manipulated as 64-bit values.
  762. ; The "L" alias of a register names its upper 32 bits, while the "R"
  763. ; alias names its lower 32 bits.
  764. ;
  765. ; Note that code using b6 and b7 must save them before use, because
  766. ; they are callee-save registers.
  767. ;
  768. ;
  769. ; Floating-point registers that hold the operand words being
  770. ; manipulated. These don't collide with ftemp1-6 (only ftemp1-ftemp4
  771. ; are defined in this part of the file) and are all caller-save registers.
  772. ;
  773. a0 .reg %fr22
  774. a0L .reg %fr22L
  775. a0R .reg %fr22R
  776. a1 .reg %fr23
  777. a1L .reg %fr23L
  778. a1R .reg %fr23R
  779. a2 .reg %fr24
  780. a2L .reg %fr24L
  781. a2R .reg %fr24R
  782. a3 .reg %fr25
  783. a3L .reg %fr25L
  784. a3R .reg %fr25R
  785. a4 .reg %fr26
  786. a4L .reg %fr26L
  787. a4R .reg %fr26R
  788. a5 .reg %fr27
  789. a5L .reg %fr27L
  790. a5R .reg %fr27R
  791. a6 .reg %fr28
  792. a6L .reg %fr28L
  793. a6R .reg %fr28R
  794. a7 .reg %fr29
  795. a7L .reg %fr29L
  796. a7R .reg %fr29R
  797. b0 .reg %fr30
  798. b0L .reg %fr30L
  799. b0R .reg %fr30R
  800. b1 .reg %fr31
  801. b1L .reg %fr31L
  802. b1R .reg %fr31R
  803. ;
  804. ; Temporary floating-point registers that receive the XMPYU products
  805. ; inside the *_ADD_C macros; these are all caller-save registers.
  806. ;
  807. ftemp1 .reg %fr4
  808. ftemp2 .reg %fr5
  809. ftemp3 .reg %fr6
  810. ftemp4 .reg %fr7
  811. ;
  812. ; The remaining B operand registers (b0 and b1 are defined above).
  813. ;
  814. b2 .reg %fr8
  815. b2L .reg %fr8L
  816. b2R .reg %fr8R
  817. b3 .reg %fr9
  818. b3L .reg %fr9L
  819. b3R .reg %fr9R
  820. b4 .reg %fr10
  821. b4L .reg %fr10L
  822. b4R .reg %fr10R
  823. b5 .reg %fr11
  824. b5L .reg %fr11L
  825. b5R .reg %fr11R
  826. b6 .reg %fr12
  827. b6L .reg %fr12L
  828. b6R .reg %fr12R
  829. b7 .reg %fr13
  830. b7L .reg %fr13L
  831. b7R .reg %fr13R
  832. c1 .reg %r21 ; only reg; accumulator word, passed as C1/C2/C3 to the *_ADD_C macros
  833. temp1 .reg %r20 ; only reg; scratch inside the *_ADD_C macros
  834. temp2 .reg %r19 ; only reg; scratch (used by SQR_ADD_C)
  835. temp3 .reg %r31 ; only reg; scratch inside the *_ADD_C macros
  836. m1 .reg %r28 ; cross product in SQR_ADD_C2 / MUL_ADD_C
  837. c2 .reg %r23 ; accumulator word, passed as C1/C2/C3 to the *_ADD_C macros
  838. high_one .reg %r1 ; the comba routines load this with 1 << 32
  839. ht .reg %r6 ; product of the upper halves; %r6 is saved by the comba routines
  840. lt .reg %r5 ; product of the lower halves; %r5 is saved by the comba routines
  841. m .reg %r4 ; cross product; %r4 is saved by the comba routines
  842. c3 .reg %r3 ; accumulator word; %r3 is saved by the comba routines
  ;
  ; SQR_ADD_C A0L,A0R,C1,C2,C3
  ;   Squares the 64-bit word whose upper/lower 32-bit halves are A0L/A0R
  ;   and adds the result into the accumulator words C1 (lowest), C2, C3.
  ;   The three XMPYU products are moved from floating-point to general
  ;   registers through the doublewords at -24(%sp), -16(%sp) and -8(%sp).
  ;   Expects high_mask to already hold the mask that the comba routines
  ;   create before invoking the macro.
  ;   Overwrites m, lt, ht, temp1, temp2, temp3 and ftemp1-ftemp3.
  ;   NOTE(review): the carry chain relies on ADD,L not updating the carry
  ;   bits -- confirm against the PA-RISC 2.0 instruction reference.
  ;
  843. SQR_ADD_C .macro A0L,A0R,C1,C2,C3
  844. XMPYU A0L,A0R,ftemp1 ; m = upper half * lower half (cross product)
  845. FSTD ftemp1,-24(%sp) ; store m
  846. XMPYU A0R,A0R,ftemp2 ; lt = lower half * lower half
  847. FSTD ftemp2,-16(%sp) ; store lt
  848. XMPYU A0L,A0L,ftemp3 ; ht = upper half * upper half
  849. FSTD ftemp3,-8(%sp) ; store ht
  850. LDD -24(%sp),m ; load m
  851. AND m,high_mask,temp2 ; temp2 = m & Mask
  852. DEPD,Z m,30,31,temp3 ; temp3 = m << 32+1 (the cross product counts twice in a square)
  853. LDD -16(%sp),lt ; load lt
  854. LDD -8(%sp),ht ; load ht
  855. EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
  856. ADD temp3,lt,lt ; lt = lt + temp3 (low part of the doubled cross product)
  857. ADD,L ht,temp1,ht ; ht += temp1 (high part of the doubled cross product)
  858. ADD,DC ht,%r0,ht ; add in carry from the lt addition (ht++)
  859. ADD C1,lt,C1 ; c1=c1+lt
  860. ADD,DC ht,%r0,ht ; add in carry from the c1 addition (ht++)
  861. ADD C2,ht,C2 ; c2=c2+ht
  862. ADD,DC C3,%r0,C3 ; add in carry from the c2 addition (c3++)
  863. .endm
  ;
  ; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
  ;   Multiplies the 64-bit word A0L:A0R by the 64-bit word A1L:A1R,
  ;   doubles the product, and adds it into the accumulator words
  ;   C1 (lowest), C2, C3.
  ;   The four XMPYU partial products are moved to general registers
  ;   through the doublewords at -32(%sp) .. -8(%sp).
  ;   Expects high_one to hold 1 << 32 (set up by the comba routines).
  ;   Overwrites m, m1, lt, ht, temp1, temp3 and ftemp1-ftemp4.
  ;   NOTE(review): the carry chain relies on ADD,L not updating the carry
  ;   bits -- confirm against the PA-RISC 2.0 instruction reference.
  ;
  864. SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
  865. XMPYU A0L,A1R,ftemp1 ; m1 = upper(A0) * lower(A1)
  866. FSTD ftemp1,-16(%sp) ; store m1
  867. XMPYU A0R,A1L,ftemp2 ; m = lower(A0) * upper(A1)
  868. FSTD ftemp2,-8(%sp) ; store m
  869. XMPYU A0R,A1R,ftemp3 ; lt = lower(A0) * lower(A1)
  870. FSTD ftemp3,-32(%sp) ; store lt
  871. XMPYU A0L,A1L,ftemp4 ; ht = upper(A0) * upper(A1)
  872. FSTD ftemp4,-24(%sp) ; store ht
  873. LDD -8(%sp),m ; load m
  874. LDD -16(%sp),m1 ; load m1
  875. ADD,L m,m1,m ; m = m + m1 (sum of the two cross products)
  876. DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32
  877. LDD -24(%sp),ht ; load ht
  878. CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. the cross-product sum wrapped
  879. ADD,L ht,high_one,ht ; ht+=high_one (skipped when m >= m1)
  880. EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
  881. LDD -32(%sp),lt ; load lt
  882. ADD,L ht,temp1,ht ; ht+= m>>32
  883. ADD lt,temp3,lt ; lt = lt + temp3
  884. ADD,DC ht,%r0,ht ; add in carry from the lt addition (ht++)
  885. ADD ht,ht,ht ; ht=ht+ht; (double the high word)
  886. ADD,DC C3,%r0,C3 ; add in carry (c3++)
  887. ADD lt,lt,lt ; lt=lt+lt; (double the low word)
  888. ADD,DC ht,%r0,ht ; add in carry (ht++)
  889. ADD C1,lt,C1 ; c1=c1+lt
  890. ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
  891. LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
  892. ADD C2,ht,C2 ; c2 = c2 + ht
  893. ADD,DC C3,%r0,C3 ; add in carry (c3++)
  894. .endm
  895. ;
  896. ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
  897. ; arg0 = r_ptr (receives the 16 result words r[0]..r[15])
  898. ; arg1 = a_ptr (the 8 input words a[0]..a[7])
  899. ;
  ; The result is produced one word at a time: each group of
  ; SQR_ADD_C / SQR_ADD_C2 invocations accumulates the products that
  ; belong to one result word into the c1/c2/c3 triple, the lowest
  ; accumulator word is stored, and that register is cleared so it can
  ; serve as the top accumulator word of the next group.
  ; %r3-%r6 are saved in the first 32 bytes of the 128-byte frame; the
  ; macros use the doublewords just below the bumped %sp as scratch.
  ;
  900. bn_sqr_comba8
  901. .PROC
  902. .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  903. .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  904. .ENTRY
  905. .align 64
  906. STD %r3,0(%sp) ; save r3 (c3)
  907. STD %r4,8(%sp) ; save r4 (m)
  908. STD %r5,16(%sp) ; save r5 (lt)
  909. STD %r6,24(%sp) ; save r6 (ht)
  910. ;
  911. ; Zero out carries
  912. ;
  913. COPY %r0,c1
  914. COPY %r0,c2
  915. COPY %r0,c3
  916. LDO 128(%sp),%sp ; bump stack by the 128-byte frame
  917. DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (used by SQR_ADD_C)
  918. DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by SQR_ADD_C2)
  919. ;
  920. ; Load up all of the values we are going to use
  921. ;
  922. FLDD 0(a_ptr),a0
  923. FLDD 8(a_ptr),a1
  924. FLDD 16(a_ptr),a2
  925. FLDD 24(a_ptr),a3
  926. FLDD 32(a_ptr),a4
  927. FLDD 40(a_ptr),a5
  928. FLDD 48(a_ptr),a6
  929. FLDD 56(a_ptr),a7
  930. SQR_ADD_C a0L,a0R,c1,c2,c3
  931. STD c1,0(r_ptr) ; r[0] = c1;
  932. COPY %r0,c1
  933. SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
  934. STD c2,8(r_ptr) ; r[1] = c2;
  935. COPY %r0,c2
  936. SQR_ADD_C a1L,a1R,c3,c1,c2
  937. SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
  938. STD c3,16(r_ptr) ; r[2] = c3;
  939. COPY %r0,c3
  940. SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
  941. SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
  942. STD c1,24(r_ptr) ; r[3] = c1;
  943. COPY %r0,c1
  944. SQR_ADD_C a2L,a2R,c2,c3,c1
  945. SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
  946. SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
  947. STD c2,32(r_ptr) ; r[4] = c2;
  948. COPY %r0,c2
  949. SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
  950. SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
  951. SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
  952. STD c3,40(r_ptr) ; r[5] = c3;
  953. COPY %r0,c3
  954. SQR_ADD_C a3L,a3R,c1,c2,c3
  955. SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
  956. SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
  957. SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
  958. STD c1,48(r_ptr) ; r[6] = c1;
  959. COPY %r0,c1
  960. SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
  961. SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
  962. SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
  963. SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
  964. STD c2,56(r_ptr) ; r[7] = c2;
  965. COPY %r0,c2
  966. SQR_ADD_C a4L,a4R,c3,c1,c2
  967. SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
  968. SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
  969. SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
  970. STD c3,64(r_ptr) ; r[8] = c3;
  971. COPY %r0,c3
  972. SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
  973. SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
  974. SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
  975. STD c1,72(r_ptr) ; r[9] = c1;
  976. COPY %r0,c1
  977. SQR_ADD_C a5L,a5R,c2,c3,c1
  978. SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
  979. SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
  980. STD c2,80(r_ptr) ; r[10] = c2;
  981. COPY %r0,c2
  982. SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
  983. SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
  984. STD c3,88(r_ptr) ; r[11] = c3;
  985. COPY %r0,c3
  986. SQR_ADD_C a6L,a6R,c1,c2,c3
  987. SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
  988. STD c1,96(r_ptr) ; r[12] = c1;
  989. COPY %r0,c1
  990. SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
  991. STD c2,104(r_ptr) ; r[13] = c2;
  992. COPY %r0,c2
  993. SQR_ADD_C a7L,a7R,c3,c1,c2
  994. STD c3, 112(r_ptr) ; r[14] = c3
  995. STD c1, 120(r_ptr) ; r[15] = c1
  996. .EXIT
  997. LDD -104(%sp),%r6 ; restore r6 (saved at 24 from the entry %sp)
  998. LDD -112(%sp),%r5 ; restore r5 (saved at 16 from the entry %sp)
  999. LDD -120(%sp),%r4 ; restore r4 (saved at 8 from the entry %sp)
  1000. BVE (%rp) ; return to caller
  1001. LDD,MB -128(%sp),%r3 ; restore r3 and release the 128-byte frame
  1002. .PROCEND
  1003. ;-----------------------------------------------------------------------------
  1004. ;
  1005. ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
  1006. ; arg0 = r_ptr (receives the 8 result words r[0]..r[7])
  1007. ; arg1 = a_ptr (the 4 input words a[0]..a[3])
  1008. ;
  ; The result is produced one word at a time: each group of
  ; SQR_ADD_C / SQR_ADD_C2 invocations accumulates the products that
  ; belong to one result word into the c1/c2/c3 triple, the lowest
  ; accumulator word is stored, and that register is cleared so it can
  ; serve as the top accumulator word of the next group.
  ; %r3-%r6 are saved in the first 32 bytes of the 128-byte frame; the
  ; macros use the doublewords just below the bumped %sp as scratch.
  ;
  ; Only a[0]..a[3] are loaded. Loading 32(a_ptr)..56(a_ptr) into a4..a7
  ; would read past the end of the 4-word input, and nothing in this
  ; routine references a4..a7.
  ;
  1009. bn_sqr_comba4
  1010. .proc
  1011. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  1012. .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  1013. .entry
  1014. .align 64
  1015. STD %r3,0(%sp) ; save r3 (c3)
  1016. STD %r4,8(%sp) ; save r4 (m)
  1017. STD %r5,16(%sp) ; save r5 (lt)
  1018. STD %r6,24(%sp) ; save r6 (ht)
  1019. ;
  1020. ; Zero out carries
  1021. ;
  1022. COPY %r0,c1
  1023. COPY %r0,c2
  1024. COPY %r0,c3
  1025. LDO 128(%sp),%sp ; bump stack by the 128-byte frame
  1026. DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (used by SQR_ADD_C)
  1027. DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by SQR_ADD_C2)
  1028. ;
  1029. ; Load up the four input words we are going to use
  1030. ;
  1031. FLDD 0(a_ptr),a0
  1032. FLDD 8(a_ptr),a1
  1033. FLDD 16(a_ptr),a2
  1034. FLDD 24(a_ptr),a3
  1039. SQR_ADD_C a0L,a0R,c1,c2,c3
  1040. STD c1,0(r_ptr) ; r[0] = c1;
  1041. COPY %r0,c1
  1042. SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
  1043. STD c2,8(r_ptr) ; r[1] = c2;
  1044. COPY %r0,c2
  1045. SQR_ADD_C a1L,a1R,c3,c1,c2
  1046. SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
  1047. STD c3,16(r_ptr) ; r[2] = c3;
  1048. COPY %r0,c3
  1049. SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
  1050. SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
  1051. STD c1,24(r_ptr) ; r[3] = c1;
  1052. COPY %r0,c1
  1053. SQR_ADD_C a2L,a2R,c2,c3,c1
  1054. SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
  1055. STD c2,32(r_ptr) ; r[4] = c2;
  1056. COPY %r0,c2
  1057. SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
  1058. STD c3,40(r_ptr) ; r[5] = c3;
  1059. COPY %r0,c3
  1060. SQR_ADD_C a3L,a3R,c1,c2,c3
  1061. STD c1,48(r_ptr) ; r[6] = c1;
  1062. STD c2,56(r_ptr) ; r[7] = c2;
  1063. .EXIT
  1064. LDD -104(%sp),%r6 ; restore r6 (saved at 24 from the entry %sp)
  1065. LDD -112(%sp),%r5 ; restore r5 (saved at 16 from the entry %sp)
  1066. LDD -120(%sp),%r4 ; restore r4 (saved at 8 from the entry %sp)
  1067. BVE (%rp) ; return to caller
  1068. LDD,MB -128(%sp),%r3 ; restore r3 and release the 128-byte frame
  1069. .PROCEND
  1070. ;---------------------------------------------------------------------------
  ;
  ; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
  ;   Multiplies the 64-bit word A0L:A0R by the 64-bit word B0L:B0R and
  ;   adds the product into the accumulator words C1 (lowest), C2, C3.
  ;   The four XMPYU partial products are moved to general registers
  ;   through the doublewords at -32(%sp) .. -8(%sp).
  ;   Expects high_one to hold 1 << 32 (set up by the comba routines).
  ;   Overwrites m, m1, lt, ht, temp1, temp3 and ftemp1-ftemp4.
  ;   NOTE(review): the carry chain relies on ADD,L not updating the carry
  ;   bits -- confirm against the PA-RISC 2.0 instruction reference.
  ;
  1071. MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
  1072. XMPYU A0L,B0R,ftemp1 ; m1 = upper(A0) * lower(B0)
  1073. FSTD ftemp1,-16(%sp) ; store m1
  1074. XMPYU A0R,B0L,ftemp2 ; m = lower(A0) * upper(B0)
  1075. FSTD ftemp2,-8(%sp) ; store m
  1076. XMPYU A0R,B0R,ftemp3 ; lt = lower(A0) * lower(B0)
  1077. FSTD ftemp3,-32(%sp) ; store lt
  1078. XMPYU A0L,B0L,ftemp4 ; ht = upper(A0) * upper(B0)
  1079. FSTD ftemp4,-24(%sp) ; store ht
  1080. LDD -8(%sp),m ; load m
  1081. LDD -16(%sp),m1 ; load m1
  1082. ADD,L m,m1,m ; m = m + m1 (sum of the two cross products)
  1083. DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32
  1084. LDD -24(%sp),ht ; load ht
  1085. CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. the cross-product sum wrapped
  1086. ADD,L ht,high_one,ht ; ht+=high_one (skipped when m >= m1)
  1087. EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
  1088. LDD -32(%sp),lt ; load lt
  1089. ADD,L ht,temp1,ht ; ht+= m>>32
  1090. ADD lt,temp3,lt ; lt = lt + temp3
  1091. ADD,DC ht,%r0,ht ; add in carry from the lt addition (ht++)
  1092. ADD C1,lt,C1 ; c1=c1+lt
  1093. ADD,DC ht,%r0,ht ; add in carry from the c1 addition (ht++)
  1094. ADD C2,ht,C2 ; c2 = c2 + ht
  1095. ADD,DC C3,%r0,C3 ; add in carry (c3++)
  1096. .endm
  1097. ;
  1098. ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  1099. ; arg0 = r_ptr (receives the 16 result words r[0]..r[15])
  1100. ; arg1 = a_ptr (the 8 input words a[0]..a[7])
  1101. ; arg2 = b_ptr (the 8 input words b[0]..b[7])
  1102. ;
  ; The result is produced one word at a time: each group of MUL_ADD_C
  ; invocations accumulates the a[i]*b[j] products that belong to one
  ; result word into the c1/c2/c3 triple, the lowest accumulator word is
  ; stored, and that register is cleared so it can serve as the top
  ; accumulator word of the next group.
  ; %r3-%r6 and %fr12/%fr13 (aliased as b6/b7) are saved in the first
  ; 48 bytes of the 128-byte frame; the macro uses the doublewords just
  ; below the bumped %sp as scratch.
  ;
  1103. bn_mul_comba8
  1104. .proc
  1105. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  1106. .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  1107. .entry
  1108. .align 64
  1109. STD %r3,0(%sp) ; save r3 (c3)
  1110. STD %r4,8(%sp) ; save r4 (m)
  1111. STD %r5,16(%sp) ; save r5 (lt)
  1112. STD %r6,24(%sp) ; save r6 (ht)
  1113. FSTD %fr12,32(%sp) ; save fr12 (b6)
  1114. FSTD %fr13,40(%sp) ; save fr13 (b7)
  1115. ;
  1116. ; Zero out carries
  1117. ;
  1118. COPY %r0,c1
  1119. COPY %r0,c2
  1120. COPY %r0,c3
  1121. LDO 128(%sp),%sp ; bump stack by the 128-byte frame
  1122. DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by MUL_ADD_C)
  1123. ;
  1124. ; Load up all of the values we are going to use
  1125. ;
  1126. FLDD 0(a_ptr),a0
  1127. FLDD 8(a_ptr),a1
  1128. FLDD 16(a_ptr),a2
  1129. FLDD 24(a_ptr),a3
  1130. FLDD 32(a_ptr),a4
  1131. FLDD 40(a_ptr),a5
  1132. FLDD 48(a_ptr),a6
  1133. FLDD 56(a_ptr),a7
  1134. FLDD 0(b_ptr),b0
  1135. FLDD 8(b_ptr),b1
  1136. FLDD 16(b_ptr),b2
  1137. FLDD 24(b_ptr),b3
  1138. FLDD 32(b_ptr),b4
  1139. FLDD 40(b_ptr),b5
  1140. FLDD 48(b_ptr),b6
  1141. FLDD 56(b_ptr),b7
  1142. MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
  1143. STD c1,0(r_ptr) ; r[0] = c1
  1144. COPY %r0,c1
  1145. MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
  1146. MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
  1147. STD c2,8(r_ptr) ; r[1] = c2
  1148. COPY %r0,c2
  1149. MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
  1150. MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
  1151. MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
  1152. STD c3,16(r_ptr) ; r[2] = c3
  1153. COPY %r0,c3
  1154. MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
  1155. MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
  1156. MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
  1157. MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
  1158. STD c1,24(r_ptr) ; r[3] = c1
  1159. COPY %r0,c1
  1160. MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
  1161. MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
  1162. MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
  1163. MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
  1164. MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
  1165. STD c2,32(r_ptr) ; r[4] = c2
  1166. COPY %r0,c2
  1167. MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
  1168. MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
  1169. MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
  1170. MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
  1171. MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
  1172. MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
  1173. STD c3,40(r_ptr) ; r[5] = c3
  1174. COPY %r0,c3
  1175. MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
  1176. MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
  1177. MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
  1178. MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
  1179. MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
  1180. MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
  1181. MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
  1182. STD c1,48(r_ptr) ; r[6] = c1
  1183. COPY %r0,c1
  1184. MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
  1185. MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
  1186. MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
  1187. MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
  1188. MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
  1189. MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
  1190. MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
  1191. MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
  1192. STD c2,56(r_ptr) ; r[7] = c2
  1193. COPY %r0,c2
  1194. MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
  1195. MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
  1196. MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
  1197. MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
  1198. MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
  1199. MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
  1200. MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
  1201. STD c3,64(r_ptr) ; r[8] = c3
  1202. COPY %r0,c3
  1203. MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
  1204. MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
  1205. MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
  1206. MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
  1207. MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
  1208. MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
  1209. STD c1,72(r_ptr) ; r[9] = c1
  1210. COPY %r0,c1
  1211. MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
  1212. MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
  1213. MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
  1214. MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
  1215. MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
  1216. STD c2,80(r_ptr) ; r[10] = c2
  1217. COPY %r0,c2
  1218. MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
  1219. MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
  1220. MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
  1221. MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
  1222. STD c3,88(r_ptr) ; r[11] = c3
  1223. COPY %r0,c3
  1224. MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
  1225. MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
  1226. MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
  1227. STD c1,96(r_ptr) ; r[12] = c1
  1228. COPY %r0,c1
  1229. MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
  1230. MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
  1231. STD c2,104(r_ptr) ; r[13] = c2
  1232. COPY %r0,c2
  1233. MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
  1234. STD c3,112(r_ptr) ; r[14] = c3
  1235. STD c1,120(r_ptr) ; r[15] = c1
  1236. .EXIT
  1237. FLDD -88(%sp),%fr13 ; restore fr13 (saved at 40 from the entry %sp)
  1238. FLDD -96(%sp),%fr12 ; restore fr12 (saved at 32 from the entry %sp)
  1239. LDD -104(%sp),%r6 ; restore r6
  1240. LDD -112(%sp),%r5 ; restore r5
  1241. LDD -120(%sp),%r4 ; restore r4
  1242. BVE (%rp) ; return to caller
  1243. LDD,MB -128(%sp),%r3 ; restore r3 and release the 128-byte frame
  1244. .PROCEND
  1245. ;-----------------------------------------------------------------------------
  1246. ;
  1247. ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  1248. ; arg0 = r_ptr (receives the 8 result words r[0]..r[7])
  1249. ; arg1 = a_ptr (the 4 input words a[0]..a[3])
  1250. ; arg2 = b_ptr (the 4 input words b[0]..b[3])
  1251. ;
  ; The result is produced one word at a time: each group of MUL_ADD_C
  ; invocations accumulates the a[i]*b[j] products that belong to one
  ; result word into the c1/c2/c3 triple, the lowest accumulator word is
  ; stored, and that register is cleared so it can serve as the top
  ; accumulator word of the next group.
  ; %r3-%r6 and %fr12/%fr13 are saved in the first 48 bytes of the
  ; 128-byte frame. Only a0-a3 and b0-b3 are referenced below, so
  ; %fr12/%fr13 (b6/b7) are saved and restored without being used here.
  ;
  1252. bn_mul_comba4
  1253. .proc
  1254. .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
  1255. .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
  1256. .entry
  1257. .align 64
  1258. STD %r3,0(%sp) ; save r3 (c3)
  1259. STD %r4,8(%sp) ; save r4 (m)
  1260. STD %r5,16(%sp) ; save r5 (lt)
  1261. STD %r6,24(%sp) ; save r6 (ht)
  1262. FSTD %fr12,32(%sp) ; save fr12 (b6)
  1263. FSTD %fr13,40(%sp) ; save fr13 (b7)
  1264. ;
  1265. ; Zero out carries
  1266. ;
  1267. COPY %r0,c1
  1268. COPY %r0,c2
  1269. COPY %r0,c3
  1270. LDO 128(%sp),%sp ; bump stack by the 128-byte frame
  1271. DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by MUL_ADD_C)
  1272. ;
  1273. ; Load up all of the values we are going to use
  1274. ;
  1275. FLDD 0(a_ptr),a0
  1276. FLDD 8(a_ptr),a1
  1277. FLDD 16(a_ptr),a2
  1278. FLDD 24(a_ptr),a3
  1279. FLDD 0(b_ptr),b0
  1280. FLDD 8(b_ptr),b1
  1281. FLDD 16(b_ptr),b2
  1282. FLDD 24(b_ptr),b3
  1283. MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
  1284. STD c1,0(r_ptr) ; r[0] = c1
  1285. COPY %r0,c1
  1286. MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
  1287. MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
  1288. STD c2,8(r_ptr) ; r[1] = c2
  1289. COPY %r0,c2
  1290. MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
  1291. MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
  1292. MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
  1293. STD c3,16(r_ptr) ; r[2] = c3
  1294. COPY %r0,c3
  1295. MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
  1296. MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
  1297. MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
  1298. MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
  1299. STD c1,24(r_ptr) ; r[3] = c1
  1300. COPY %r0,c1
  1301. MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
  1302. MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
  1303. MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
  1304. STD c2,32(r_ptr) ; r[4] = c2
  1305. COPY %r0,c2
  1306. MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
  1307. MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
  1308. STD c3,40(r_ptr) ; r[5] = c3
  1309. COPY %r0,c3
  1310. MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
  1311. STD c1,48(r_ptr) ; r[6] = c1
  1312. STD c2,56(r_ptr) ; r[7] = c2
  1313. .EXIT
  1314. FLDD -88(%sp),%fr13 ; restore fr13 (saved at 40 from the entry %sp)
  1315. FLDD -96(%sp),%fr12 ; restore fr12 (saved at 32 from the entry %sp)
  1316. LDD -104(%sp),%r6 ; restore r6
  1317. LDD -112(%sp),%r5 ; restore r5
  1318. LDD -120(%sp),%r4 ; restore r4
  1319. BVE (%rp) ; return to caller
  1320. LDD,MB -128(%sp),%r3 ; restore r3 and release the 128-byte frame
  1321. .PROCEND
  ;
  ; Space/subspace declarations and the literal data used by the code above.
  ;
  1322. .SPACE $TEXT$
  1323. .SUBSPA $CODE$
  1324. .SPACE $PRIVATE$,SORT=16
  1325. .IMPORT $global$,DATA
  1326. .SPACE $TEXT$
  1327. .SUBSPA $CODE$
  1328. .SUBSPA $LIT$,ACCESS=0x2c
  ; C$4: format string whose address bn_div_err_case loads into %r25
  ; before calling fprintf.
  1329. C$4
  1330. .ALIGN 8
  1331. .STRINGZ "Division would overflow (%d)\n"
  1332. .END