  1. /* asm.c
  2. *
  3. * Copyright (C) 2006-2023 wolfSSL Inc.
  4. *
  5. * This file is part of wolfSSL.
  6. *
  7. * wolfSSL is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * wolfSSL is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
  20. */
  21. #ifdef HAVE_CONFIG_H
  22. #include <config.h>
  23. #endif
  24. #include <wolfssl/wolfcrypt/settings.h>
  25. /*
  26. * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
  27. * http://math.libtomcrypt.com
  28. */
  29. /******************************************************************/
  30. /* fp_montgomery_reduce.c asm or generic */
  31. /* Each platform needs to query cpuid (leaf 7) to see if MULX/ADX (BMI2 and
  32. * ADX) are supported. Also, set up a macro for proper linkage w/o ABI conflicts
  33. */
  34. #if defined(HAVE_INTEL_MULX)
  35. #ifndef _MSC_VER
  36. #define cpuid(reg, leaf, sub)\
  37. __asm__ __volatile__ ("cpuid":\
  38. "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
  39. "a" (leaf), "c"(sub));
  40. #else
  41. #include <intrin.h>
  42. #define cpuid(a,b,c) __cpuidex((int*)a,b,c)
  43. #endif /* _MSC_VER */
  44. #define EAX 0
  45. #define EBX 1
  46. #define ECX 2
  47. #define EDX 3
  48. #define CPUID_AVX1 0x1
  49. #define CPUID_AVX2 0x2
  50. #define CPUID_RDRAND 0x4
  51. #define CPUID_RDSEED 0x8
  52. #define CPUID_BMI2 0x10 /* MULX, RORX */
  53. #define CPUID_ADX 0x20 /* ADCX, ADOX */
  54. #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
  55. #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
  56. #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
  57. #define IS_INTEL_ADX (cpuid_flags&CPUID_ADX)
  58. #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
  59. #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
  60. #define SET_FLAGS
  61. static word32 cpuid_check = 0 ;
  62. static word32 cpuid_flags = 0 ;
  63. static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
  64. int got_intel_cpu = 0;
  65. int got_amd_cpu = 0;
  66. unsigned int reg[5];
  67. reg[4] = '\0' ;
  68. cpuid(reg, 0, 0);
  69. /* check for intel cpu */
  70. if( memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
  71. memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
  72. memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
  73. got_intel_cpu = 1;
  74. }
  75. /* check for AMD cpu */
  76. if( memcmp((char *)&(reg[EBX]), "Auth", 4) == 0 &&
  77. memcmp((char *)&(reg[EDX]), "enti", 4) == 0 &&
  78. memcmp((char *)&(reg[ECX]), "cAMD", 4) == 0) {
  79. got_amd_cpu = 1;
  80. }
  81. if (got_intel_cpu || got_amd_cpu) {
  82. cpuid(reg, leaf, sub);
  83. return((reg[num]>>bit)&0x1) ;
  84. }
  85. return 0 ;
  86. }
  87. WC_INLINE static int set_cpuid_flags(void) {
  88. if(cpuid_check == 0) {
  89. if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; }
  90. if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; }
  91. cpuid_check = 1 ;
  92. return 0 ;
  93. }
  94. return 1 ;
  95. }
  96. #define RETURN return
  97. #define IF_HAVE_INTEL_MULX(func, ret) \
  98. if(cpuid_check==0)set_cpuid_flags() ; \
  99. if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }
  100. #else
  101. #define IF_HAVE_INTEL_MULX(func, ret) WC_DO_NOTHING
  102. #endif
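/* Editorial note, illustrative only (not part of the original file): a sketch
 * of how a caller is expected to use the runtime check above. The name
 * fp_mul_comba_mulx() is a hypothetical stand-in for a MULX/ADX-accelerated
 * multiply; fp_mul_comba() is the generic fallback. Guarded by "#if 0" so it
 * is documentation, not compiled code. */
#if 0
static void fp_mul_dispatch_example(fp_int* A, fp_int* B, fp_int* C)
{
    /* Detect BMI2/ADX once, take the fast path and return early if present. */
    IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), return);
    /* Otherwise fall back to the portable comba multiplier. */
    fp_mul_comba(A, B, C);
}
#endif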
  103. #if defined(TFM_X86) && !defined(TFM_SSE2)
  104. /* x86-32 code */
  105. #define MONT_START
  106. #define MONT_FINI
  107. #define LOOP_END
  108. #define LOOP_START \
  109. mu = c[x] * mp
  110. #define INNERMUL \
  111. __asm__( \
  112. "movl %5,%%eax \n\t" \
  113. "mull %4 \n\t" \
  114. "addl %1,%%eax \n\t" \
  115. "adcl $0,%%edx \n\t" \
  116. "addl %%eax,%0 \n\t" \
  117. "adcl $0,%%edx \n\t" \
  118. "movl %%edx,%1 \n\t" \
  119. :"=g"(_c[LO]), "=r"(cy) \
  120. :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
  121. : "%eax", "%edx", "cc")
  122. #define PROPCARRY \
  123. __asm__( \
  124. "addl %1,%0 \n\t" \
  125. "setb %%al \n\t" \
  126. "movzbl %%al,%1 \n\t" \
  127. :"=g"(_c[LO]), "=r"(cy) \
  128. :"0"(_c[LO]), "1"(cy) \
  129. : "%eax", "cc")
  130. /******************************************************************/
  131. #elif defined(TFM_X86_64)
  132. /* x86-64 code */
  133. #define MONT_START
  134. #define MONT_FINI
  135. #define LOOP_END
  136. #define LOOP_START \
  137. mu = c[x] * mp
  138. #define INNERMUL \
  139. __asm__( \
  140. "movq %5,%%rax \n\t" \
  141. "mulq %4 \n\t" \
  142. "addq %1,%%rax \n\t" \
  143. "adcq $0,%%rdx \n\t" \
  144. "addq %%rax,%0 \n\t" \
  145. "adcq $0,%%rdx \n\t" \
  146. "movq %%rdx,%1 \n\t" \
  147. :"=g"(_c[LO]), "=r"(cy) \
  148. :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
  149. : "%rax", "%rdx", "cc")
  150. #if defined(HAVE_INTEL_MULX)
  151. #define MULX_INNERMUL8(x,y,z,cy) \
  152. __asm__ volatile ( \
  153. "movq %[yn], %%rdx\n\t" \
  154. "xorq %%rcx, %%rcx\n\t" \
  155. "movq 0(%[c]), %%r8\n\t" \
  156. "movq 8(%[c]), %%r9\n\t" \
  157. "movq 16(%[c]), %%r10\n\t" \
  158. "movq 24(%[c]), %%r11\n\t" \
  159. "movq 32(%[c]), %%r12\n\t" \
  160. "movq 40(%[c]), %%r13\n\t" \
  161. "movq 48(%[c]), %%r14\n\t" \
  162. "movq 56(%[c]), %%r15\n\t" \
  163. \
  164. "mulx 0(%[xp]), %%rax, %%rcx\n\t" \
  165. "adcxq %[cy], %%r8\n\t" \
  166. "adoxq %%rax, %%r8\n\t" \
  167. "mulx 8(%[xp]), %%rax, %[cy]\n\t" \
  168. "adcxq %%rcx, %%r9\n\t" \
  169. "adoxq %%rax, %%r9\n\t" \
  170. "mulx 16(%[xp]), %%rax, %%rcx\n\t" \
  171. "adcxq %[cy], %%r10\n\t" \
  172. "adoxq %%rax, %%r10\n\t" \
  173. "mulx 24(%[xp]), %%rax, %[cy]\n\t" \
  174. "adcxq %%rcx, %%r11\n\t" \
  175. "adoxq %%rax, %%r11\n\t" \
  176. "mulx 32(%[xp]), %%rax, %%rcx\n\t" \
  177. "adcxq %[cy], %%r12\n\t" \
  178. "adoxq %%rax, %%r12\n\t" \
  179. "mulx 40(%[xp]), %%rax, %[cy]\n\t" \
  180. "adcxq %%rcx, %%r13\n\t" \
  181. "adoxq %%rax, %%r13\n\t" \
  182. "mulx 48(%[xp]), %%rax, %%rcx\n\t" \
  183. "adcxq %[cy], %%r14\n\t" \
  184. "adoxq %%rax, %%r14\n\t" \
  185. "adcxq %%rcx, %%r15\n\t" \
  186. "mulx 56(%[xp]), %%rax, %[cy]\n\t" \
  187. "movq $0, %%rdx\n\t" \
  188. "adoxq %%rdx, %%rax\n\t" \
  189. "adcxq %%rdx, %[cy]\n\t" \
  190. "adoxq %%rdx, %[cy]\n\t" \
  191. "addq %%rax, %%r15\n\t" \
  192. "adcq $0, %[cy]\n\t" \
  193. \
  194. "movq %%r8, 0(%[c])\n\t" \
  195. "movq %%r9, 8(%[c])\n\t" \
  196. "movq %%r10, 16(%[c])\n\t" \
  197. "movq %%r11, 24(%[c])\n\t" \
  198. "movq %%r12, 32(%[c])\n\t" \
  199. "movq %%r13, 40(%[c])\n\t" \
  200. "movq %%r14, 48(%[c])\n\t" \
  201. "movq %%r15, 56(%[c])\n\t" \
  202. : [cy] "+r" (cy) \
  203. : [xp] "r" (x), [c] "r" (c_mulx), [yn] "rm" (y) \
  204. :"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", \
  205. "%rdx", "%rax", "%rcx" \
  206. )
  207. #define INNERMUL8_MULX \
  208. {\
  209. MULX_INNERMUL8(tmpm, mu, _c, cy);\
  210. }
  211. #endif
  212. #define INNERMUL8 \
  213. __asm__( \
  214. "movq 0(%5),%%rax \n\t" \
  215. "movq 0(%2),%%r10 \n\t" \
  216. "movq 0x8(%5),%%r11 \n\t" \
  217. "mulq %4 \n\t" \
  218. "addq %%r10,%%rax \n\t" \
  219. "adcq $0,%%rdx \n\t" \
  220. "movq 0x8(%2),%%r10 \n\t" \
  221. "addq %3,%%rax \n\t" \
  222. "adcq $0,%%rdx \n\t" \
  223. "movq %%rax,0(%0) \n\t" \
  224. "movq %%rdx,%1 \n\t" \
  225. \
  226. "movq %%r11,%%rax \n\t" \
  227. "movq 0x10(%5),%%r11 \n\t" \
  228. "mulq %4 \n\t" \
  229. "addq %%r10,%%rax \n\t" \
  230. "adcq $0,%%rdx \n\t" \
  231. "movq 0x10(%2),%%r10 \n\t" \
  232. "addq %3,%%rax \n\t" \
  233. "adcq $0,%%rdx \n\t" \
  234. "movq %%rax,0x8(%0) \n\t" \
  235. "movq %%rdx,%1 \n\t" \
  236. \
  237. "movq %%r11,%%rax \n\t" \
  238. "movq 0x18(%5),%%r11 \n\t" \
  239. "mulq %4 \n\t" \
  240. "addq %%r10,%%rax \n\t" \
  241. "adcq $0,%%rdx \n\t" \
  242. "movq 0x18(%2),%%r10 \n\t" \
  243. "addq %3,%%rax \n\t" \
  244. "adcq $0,%%rdx \n\t" \
  245. "movq %%rax,0x10(%0) \n\t" \
  246. "movq %%rdx,%1 \n\t" \
  247. \
  248. "movq %%r11,%%rax \n\t" \
  249. "movq 0x20(%5),%%r11 \n\t" \
  250. "mulq %4 \n\t" \
  251. "addq %%r10,%%rax \n\t" \
  252. "adcq $0,%%rdx \n\t" \
  253. "movq 0x20(%2),%%r10 \n\t" \
  254. "addq %3,%%rax \n\t" \
  255. "adcq $0,%%rdx \n\t" \
  256. "movq %%rax,0x18(%0) \n\t" \
  257. "movq %%rdx,%1 \n\t" \
  258. \
  259. "movq %%r11,%%rax \n\t" \
  260. "movq 0x28(%5),%%r11 \n\t" \
  261. "mulq %4 \n\t" \
  262. "addq %%r10,%%rax \n\t" \
  263. "adcq $0,%%rdx \n\t" \
  264. "movq 0x28(%2),%%r10 \n\t" \
  265. "addq %3,%%rax \n\t" \
  266. "adcq $0,%%rdx \n\t" \
  267. "movq %%rax,0x20(%0) \n\t" \
  268. "movq %%rdx,%1 \n\t" \
  269. \
  270. "movq %%r11,%%rax \n\t" \
  271. "movq 0x30(%5),%%r11 \n\t" \
  272. "mulq %4 \n\t" \
  273. "addq %%r10,%%rax \n\t" \
  274. "adcq $0,%%rdx \n\t" \
  275. "movq 0x30(%2),%%r10 \n\t" \
  276. "addq %3,%%rax \n\t" \
  277. "adcq $0,%%rdx \n\t" \
  278. "movq %%rax,0x28(%0) \n\t" \
  279. "movq %%rdx,%1 \n\t" \
  280. \
  281. "movq %%r11,%%rax \n\t" \
  282. "movq 0x38(%5),%%r11 \n\t" \
  283. "mulq %4 \n\t" \
  284. "addq %%r10,%%rax \n\t" \
  285. "adcq $0,%%rdx \n\t" \
  286. "movq 0x38(%2),%%r10 \n\t" \
  287. "addq %3,%%rax \n\t" \
  288. "adcq $0,%%rdx \n\t" \
  289. "movq %%rax,0x30(%0) \n\t" \
  290. "movq %%rdx,%1 \n\t" \
  291. \
  292. "movq %%r11,%%rax \n\t" \
  293. "mulq %4 \n\t" \
  294. "addq %%r10,%%rax \n\t" \
  295. "adcq $0,%%rdx \n\t" \
  296. "addq %3,%%rax \n\t" \
  297. "adcq $0,%%rdx \n\t" \
  298. "movq %%rax,0x38(%0) \n\t" \
  299. "movq %%rdx,%1 \n\t" \
  300. \
  301. :"=r"(_c), "=r"(cy) \
  302. : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
  303. : "%rax", "%rdx", "%r10", "%r11", "cc")
  304. #define PROPCARRY \
  305. __asm__( \
  306. "addq %1,%0 \n\t" \
  307. "setb %%al \n\t" \
  308. "movzbq %%al,%1 \n\t" \
  309. :"=g"(_c[LO]), "=r"(cy) \
  310. :"0"(_c[LO]), "1"(cy) \
  311. : "%rax", "cc")
  312. /******************************************************************/
  313. #elif defined(TFM_SSE2)
  314. /* SSE2 code (assumes 32-bit fp_digits) */
  315. /* MMX register assignments:
  316. * mm0 *tmpm++, then Mu * (*tmpm++)
  317. * mm1 c[x], then Mu
  318. * mm2 mp
  319. * mm3 cy
  320. * mm4 _c[LO]
  321. */
  322. #define MONT_START \
  323. __asm__("movd %0,%%mm2"::"g"(mp))
  324. #define MONT_FINI \
  325. __asm__("emms")
  326. #define LOOP_START \
  327. __asm__( \
  328. "movd %0,%%mm1 \n\t" \
  329. "pxor %%mm3,%%mm3 \n\t" \
  330. "pmuludq %%mm2,%%mm1 \n\t" \
  331. :: "g"(c[x]))
  332. /* pmuludq on mmx registers does a 32x32->64 multiply. */
  333. #define INNERMUL \
  334. __asm__( \
  335. "movd %1,%%mm4 \n\t" \
  336. "movd %2,%%mm0 \n\t" \
  337. "paddq %%mm4,%%mm3 \n\t" \
  338. "pmuludq %%mm1,%%mm0 \n\t" \
  339. "paddq %%mm0,%%mm3 \n\t" \
  340. "movd %%mm3,%0 \n\t" \
  341. "psrlq $32, %%mm3 \n\t" \
  342. :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
  343. #define INNERMUL8 \
  344. __asm__( \
  345. "movd 0(%1),%%mm4 \n\t" \
  346. "movd 0(%2),%%mm0 \n\t" \
  347. "paddq %%mm4,%%mm3 \n\t" \
  348. "pmuludq %%mm1,%%mm0 \n\t" \
  349. "movd 4(%2),%%mm5 \n\t" \
  350. "paddq %%mm0,%%mm3 \n\t" \
  351. "movd 4(%1),%%mm6 \n\t" \
  352. "movd %%mm3,0(%0) \n\t" \
  353. "psrlq $32, %%mm3 \n\t" \
  354. \
  355. "paddq %%mm6,%%mm3 \n\t" \
  356. "pmuludq %%mm1,%%mm5 \n\t" \
  357. "movd 8(%2),%%mm6 \n\t" \
  358. "paddq %%mm5,%%mm3 \n\t" \
  359. "movd 8(%1),%%mm7 \n\t" \
  360. "movd %%mm3,4(%0) \n\t" \
  361. "psrlq $32, %%mm3 \n\t" \
  362. \
  363. "paddq %%mm7,%%mm3 \n\t" \
  364. "pmuludq %%mm1,%%mm6 \n\t" \
  365. "movd 12(%2),%%mm7 \n\t" \
  366. "paddq %%mm6,%%mm3 \n\t" \
  367. "movd 12(%1),%%mm5 \n\t" \
  368. "movd %%mm3,8(%0) \n\t" \
  369. "psrlq $32, %%mm3 \n\t" \
  370. \
  371. "paddq %%mm5,%%mm3 \n\t" \
  372. "pmuludq %%mm1,%%mm7 \n\t" \
  373. "movd 16(%2),%%mm5 \n\t" \
  374. "paddq %%mm7,%%mm3 \n\t" \
  375. "movd 16(%1),%%mm6 \n\t" \
  376. "movd %%mm3,12(%0) \n\t" \
  377. "psrlq $32, %%mm3 \n\t" \
  378. \
  379. "paddq %%mm6,%%mm3 \n\t" \
  380. "pmuludq %%mm1,%%mm5 \n\t" \
  381. "movd 20(%2),%%mm6 \n\t" \
  382. "paddq %%mm5,%%mm3 \n\t" \
  383. "movd 20(%1),%%mm7 \n\t" \
  384. "movd %%mm3,16(%0) \n\t" \
  385. "psrlq $32, %%mm3 \n\t" \
  386. \
  387. "paddq %%mm7,%%mm3 \n\t" \
  388. "pmuludq %%mm1,%%mm6 \n\t" \
  389. "movd 24(%2),%%mm7 \n\t" \
  390. "paddq %%mm6,%%mm3 \n\t" \
  391. "movd 24(%1),%%mm5 \n\t" \
  392. "movd %%mm3,20(%0) \n\t" \
  393. "psrlq $32, %%mm3 \n\t" \
  394. \
  395. "paddq %%mm5,%%mm3 \n\t" \
  396. "pmuludq %%mm1,%%mm7 \n\t" \
  397. "movd 28(%2),%%mm5 \n\t" \
  398. "paddq %%mm7,%%mm3 \n\t" \
  399. "movd 28(%1),%%mm6 \n\t" \
  400. "movd %%mm3,24(%0) \n\t" \
  401. "psrlq $32, %%mm3 \n\t" \
  402. \
  403. "paddq %%mm6,%%mm3 \n\t" \
  404. "pmuludq %%mm1,%%mm5 \n\t" \
  405. "paddq %%mm5,%%mm3 \n\t" \
  406. "movd %%mm3,28(%0) \n\t" \
  407. "psrlq $32, %%mm3 \n\t" \
  408. :"=r"(_c) : "0"(_c), "r"(tmpm) );
  409. /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
  410. pointer */
  411. #define LOOP_END \
  412. __asm__( "movd %%mm3,%0 \n" :"=r"(cy))
  413. #define PROPCARRY \
  414. __asm__( \
  415. "addl %1,%0 \n\t" \
  416. "setb %%al \n\t" \
  417. "movzbl %%al,%1 \n\t" \
  418. :"=g"(_c[LO]), "=r"(cy) \
  419. :"0"(_c[LO]), "1"(cy) \
  420. : "%eax", "cc")
  421. /******************************************************************/
  422. #elif defined(TFM_ARM)
  423. /* ARMv4 code */
  424. #define MONT_START
  425. #define MONT_FINI
  426. #define LOOP_END
  427. #define LOOP_START \
  428. mu = c[x] * mp
  429. #ifdef __thumb__
  430. #define INNERMUL \
  431. __asm__( \
  432. " LDR r0,%1 \n\t" \
  433. " ADDS r0,r0,%0 \n\t" \
  434. " ITE CS \n\t" \
  435. " MOVCS %0,#1 \n\t" \
  436. " MOVCC %0,#0 \n\t" \
  437. " UMLAL r0,%0,%3,%4 \n\t" \
  438. " STR r0,%1 \n\t" \
  439. :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
  440. #define PROPCARRY \
  441. __asm__( \
  442. " LDR r0,%1 \n\t" \
  443. " ADDS r0,r0,%0 \n\t" \
  444. " STR r0,%1 \n\t" \
  445. " ITE CS \n\t" \
  446. " MOVCS %0,#1 \n\t" \
  447. " MOVCC %0,#0 \n\t" \
  448. :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
  449. /* TAO thumb mode uses ite (if then else) to detect carry directly
  450. * fixed unmatched constraint warning by changing 1 to m */
  451. #else /* __thumb__ */
  452. #define INNERMUL \
  453. __asm__( \
  454. " LDR r0,%1 \n\t" \
  455. " ADDS r0,r0,%0 \n\t" \
  456. " MOVCS %0,#1 \n\t" \
  457. " MOVCC %0,#0 \n\t" \
  458. " UMLAL r0,%0,%3,%4 \n\t" \
  459. " STR r0,%1 \n\t" \
  460. :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
  461. #define PROPCARRY \
  462. __asm__( \
  463. " LDR r0,%1 \n\t" \
  464. " ADDS r0,r0,%0 \n\t" \
  465. " STR r0,%1 \n\t" \
  466. " MOVCS %0,#1 \n\t" \
  467. " MOVCC %0,#0 \n\t" \
  468. :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
  469. #endif /* __thumb__ */
  470. #elif defined(TFM_PPC32)
  471. /* PPC32 */
  472. #define MONT_START
  473. #define MONT_FINI
  474. #define LOOP_END
  475. #define LOOP_START \
  476. mu = c[x] * mp
  477. #define INNERMUL \
  478. __asm__( \
  479. " mullw 16,%3,%4 \n\t" \
  480. " mulhwu 17,%3,%4 \n\t" \
  481. " addc 16,16,%2 \n\t" \
  482. " addze 17,17 \n\t" \
  483. " addc %1,16,%5 \n\t" \
  484. " addze %0,17 \n\t" \
  485. :"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;
  486. #define PROPCARRY \
  487. __asm__( \
  488. " addc %1,%3,%2 \n\t" \
  489. " xor %0,%2,%2 \n\t" \
  490. " addze %0,%2 \n\t" \
  491. :"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");
  492. #elif defined(TFM_PPC64)
  493. /* PPC64 */
  494. #define MONT_START
  495. #define MONT_FINI
  496. #define LOOP_END
  497. #define LOOP_START \
  498. mu = c[x] * mp
  499. #define INNERMUL \
  500. __asm__( \
  501. " mulld r16,%3,%4 \n\t" \
  502. " mulhdu r17,%3,%4 \n\t" \
  503. " addc r16,16,%0 \n\t" \
  504. " addze r17,r17 \n\t" \
  505. " ldx r18,0,%1 \n\t" \
  506. " addc r16,r16,r18 \n\t" \
  507. " addze %0,r17 \n\t" \
  508. " sdx r16,0,%1 \n\t" \
  509. :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;
  510. #define PROPCARRY \
  511. __asm__( \
  512. " ldx r16,0,%1 \n\t" \
  513. " addc r16,r16,%0 \n\t" \
  514. " sdx r16,0,%1 \n\t" \
  515. " xor %0,%0,%0 \n\t" \
  516. " addze %0,%0 \n\t" \
  517. :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");
  518. /******************************************************************/
  519. #elif defined(TFM_AVR32)
  520. /* AVR32 */
  521. #define MONT_START
  522. #define MONT_FINI
  523. #define LOOP_END
  524. #define LOOP_START \
  525. mu = c[x] * mp
  526. #define INNERMUL \
  527. __asm__( \
  528. " ld.w r2,%1 \n\t" \
  529. " add r2,%0 \n\t" \
  530. " eor r3,r3 \n\t" \
  531. " acr r3 \n\t" \
  532. " macu.d r2,%3,%4 \n\t" \
  533. " st.w %1,r2 \n\t" \
  534. " mov %0,r3 \n\t" \
  535. :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
  536. #define PROPCARRY \
  537. __asm__( \
  538. " ld.w r2,%1 \n\t" \
  539. " add r2,%0 \n\t" \
  540. " st.w %1,r2 \n\t" \
  541. " eor %0,%0 \n\t" \
  542. " acr %0 \n\t" \
  543. :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
  544. /******************************************************************/
  545. #elif defined(TFM_MIPS)
  546. /* MIPS */
  547. #define MONT_START
  548. #define MONT_FINI
  549. #define LOOP_END
  550. #define LOOP_START \
  551. mu = c[x] * mp
  552. #define INNERMUL \
  553. __asm__( \
  554. " multu %3,%4 \n\t" \
  555. " mflo $12 \n\t" \
  556. " mfhi $13 \n\t" \
  557. " addu $12,$12,%0 \n\t" \
  558. " sltu $10,$12,%0 \n\t" \
  559. " addu $13,$13,$10 \n\t" \
  560. " lw $10,%1 \n\t" \
  561. " addu $12,$12,$10 \n\t" \
  562. " sltu $10,$12,$10 \n\t" \
  563. " addu %0,$13,$10 \n\t" \
  564. " sw $12,%1 \n\t" \
  565. :"+r"(cy),"+m"(_c[0]):""(cy),"r"(mu),"r"(tmpm[0]),""(_c[0]):"$10","$12","$13"); ++tmpm;
  566. #define PROPCARRY \
  567. __asm__( \
  568. " lw $10,%1 \n\t" \
  569. " addu $10,$10,%0 \n\t" \
  570. " sw $10,%1 \n\t" \
  571. " sltu %0,$10,%0 \n\t" \
  572. :"+r"(cy),"+m"(_c[0]):""(cy),""(_c[0]):"$10");
  573. /******************************************************************/
  574. #else
  575. /* ISO C code */
  576. #define MONT_START
  577. #define MONT_FINI
  578. #define LOOP_END
  579. #define LOOP_START \
  580. mu = c[x] * mp
  581. #define INNERMUL \
  582. do { fp_word t; \
  583. t = ((fp_word)_c[0] + (fp_word)cy) + \
  584. (((fp_word)mu) * ((fp_word)*tmpm++)); \
  585. _c[0] = (fp_digit)t; \
  586. cy = (fp_digit)(t >> DIGIT_BIT); \
  587. } while (0)
  588. #define PROPCARRY \
  589. do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
  590. #endif
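/* Editorial note, illustrative only (assumption, not from the original file):
 * the rough shape of the per-digit loop in fp_montgomery_reduce() that
 * LOOP_START, INNERMUL, LOOP_END and PROPCARRY plug into. The names c, _c,
 * tmpm, mu, mp, cy, x, y and pa mirror what the macro bodies above expect. */
#if 0
for (x = 0; x < pa; x++) {
    fp_digit cy = 0;
    LOOP_START;              /* mu = c[x] * mp (mod 2^DIGIT_BIT) */
    _c   = c + x;
    tmpm = m->dp;
    for (y = 0; y < m->used; y++) {
        INNERMUL;            /* _c[0] += mu * *tmpm++, carry out into cy */
        ++_c;
    }
    LOOP_END;                /* SSE2 path: pull the pending carry into cy */
    while (cy) {
        PROPCARRY;           /* ripple cy through the remaining digits */
        ++_c;
    }
}
#endif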
  591. /******************************************************************/
  592. #define LO 0
  593. /* end fp_montgomery_reduce.c asm */
  594. /* start fp_sqr_comba.c asm */
  595. #if defined(TFM_X86)
  596. /* x86-32 optimized */
  597. #define COMBA_START
  598. #define CLEAR_CARRY \
  599. c0 = c1 = c2 = 0;
  600. #define COMBA_STORE(x) \
  601. x = c0;
  602. #define COMBA_STORE2(x) \
  603. x = c1;
  604. #define CARRY_FORWARD \
  605. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  606. #define COMBA_FINI
  607. #define SQRADD(i, j) \
  608. __asm__( \
  609. "movl %3,%%eax \n\t" \
  610. "mull %%eax \n\t" \
  611. "addl %%eax,%0 \n\t" \
  612. "adcl %%edx,%1 \n\t" \
  613. "adcl $0,%2 \n\t" \
  614. :"+m"(c0), "+m"(c1), "+m"(c2) \
  615. : "m"(i) \
  616. :"%eax","%edx","cc");
  617. #define SQRADD2(i, j) \
  618. __asm__( \
  619. "movl %3,%%eax \n\t" \
  620. "mull %4 \n\t" \
  621. "addl %%eax,%0 \n\t" \
  622. "adcl %%edx,%1 \n\t" \
  623. "adcl $0,%2 \n\t" \
  624. "addl %%eax,%0 \n\t" \
  625. "adcl %%edx,%1 \n\t" \
  626. "adcl $0,%2 \n\t" \
  627. :"+m"(c0), "+m"(c1), "+m"(c2) \
  628. : "m"(i), "m"(j) \
  629. :"%eax","%edx", "cc");
  630. #define SQRADDSC(i, j) \
  631. __asm__( \
  632. "movl %3,%%eax \n\t" \
  633. "mull %4 \n\t" \
  634. "movl %%eax,%0 \n\t" \
  635. "movl %%edx,%1 \n\t" \
  636. "xorl %2,%2 \n\t" \
  637. :"=r"(sc0), "=r"(sc1), "=r"(sc2) \
  638. : "g"(i), "g"(j) \
  639. :"%eax","%edx","cc");
  640. #define SQRADDAC(i, j) \
  641. __asm__( \
  642. "movl %6,%%eax \n\t" \
  643. "mull %7 \n\t" \
  644. "addl %%eax,%0 \n\t" \
  645. "adcl %%edx,%1 \n\t" \
  646. "adcl $0,%2 \n\t" \
  647. :"=r"(sc0), "=r"(sc1), "=r"(sc2) \
  648. : "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) \
  649. :"%eax","%edx","cc");
  650. #define SQRADDDB \
  651. __asm__( \
  652. "addl %6,%0 \n\t" \
  653. "adcl %7,%1 \n\t" \
  654. "adcl %8,%2 \n\t" \
  655. "addl %6,%0 \n\t" \
  656. "adcl %7,%1 \n\t" \
  657. "adcl %8,%2 \n\t" \
  658. :"=r"(c0), "=r"(c1), "=r"(c2) \
  659. : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), \
  660. "r"(sc2) \
  661. : "cc");
  662. #elif defined(TFM_X86_64)
  663. /* x86-64 optimized */
  664. #define COMBA_START
  665. #define CLEAR_CARRY \
  666. c0 = c1 = c2 = 0;
  667. #define COMBA_STORE(x) \
  668. x = c0;
  669. #define COMBA_STORE2(x) \
  670. x = c1;
  671. #define CARRY_FORWARD \
  672. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  673. #define COMBA_FINI
  674. #define SQRADD(i, j) \
  675. __asm__( \
  676. "movq %6,%%rax \n\t" \
  677. "mulq %%rax \n\t" \
  678. "addq %%rax,%0 \n\t" \
  679. "adcq %%rdx,%1 \n\t" \
  680. "adcq $0,%2 \n\t" \
  681. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");
  682. #define SQRADD2(i, j) \
  683. __asm__( \
  684. "movq %6,%%rax \n\t" \
  685. "mulq %7 \n\t" \
  686. "addq %%rax,%0 \n\t" \
  687. "adcq %%rdx,%1 \n\t" \
  688. "adcq $0,%2 \n\t" \
  689. "addq %%rax,%0 \n\t" \
  690. "adcq %%rdx,%1 \n\t" \
  691. "adcq $0,%2 \n\t" \
  692. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
  693. #define SQRADDSC(i, j) \
  694. __asm__( \
  695. "movq %3,%%rax \n\t" \
  696. "mulq %4 \n\t" \
  697. "movq %%rax,%0 \n\t" \
  698. "movq %%rdx,%1 \n\t" \
  699. "xorq %2,%2 \n\t" \
  700. :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
  701. #define SQRADDAC(i, j) \
  702. __asm__( \
  703. "movq %6,%%rax \n\t" \
  704. "mulq %7 \n\t" \
  705. "addq %%rax,%0 \n\t" \
  706. "adcq %%rdx,%1 \n\t" \
  707. "adcq $0,%2 \n\t" \
  708. :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
  709. #define SQRADDDB \
  710. __asm__( \
  711. "addq %6,%0 \n\t" \
  712. "adcq %7,%1 \n\t" \
  713. "adcq %8,%2 \n\t" \
  714. "addq %6,%0 \n\t" \
  715. "adcq %7,%1 \n\t" \
  716. "adcq %8,%2 \n\t" \
  717. :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
  718. #elif defined(TFM_SSE2)
  719. /* SSE2 Optimized */
  720. #define COMBA_START
  721. #define CLEAR_CARRY \
  722. c0 = c1 = c2 = 0;
  723. #define COMBA_STORE(x) \
  724. x = c0;
  725. #define COMBA_STORE2(x) \
  726. x = c1;
  727. #define CARRY_FORWARD \
  728. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  729. #define COMBA_FINI \
  730. __asm__("emms");
  731. #define SQRADD(i, j) \
  732. __asm__( \
  733. "movd %6,%%mm0 \n\t" \
  734. "pmuludq %%mm0,%%mm0\n\t" \
  735. "movd %%mm0,%%eax \n\t" \
  736. "psrlq $32,%%mm0 \n\t" \
  737. "addl %%eax,%0 \n\t" \
  738. "movd %%mm0,%%eax \n\t" \
  739. "adcl %%eax,%1 \n\t" \
  740. "adcl $0,%2 \n\t" \
  741. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
  742. #define SQRADD2(i, j) \
  743. __asm__( \
  744. "movd %6,%%mm0 \n\t" \
  745. "movd %7,%%mm1 \n\t" \
  746. "pmuludq %%mm1,%%mm0\n\t" \
  747. "movd %%mm0,%%eax \n\t" \
  748. "psrlq $32,%%mm0 \n\t" \
  749. "movd %%mm0,%%edx \n\t" \
  750. "addl %%eax,%0 \n\t" \
  751. "adcl %%edx,%1 \n\t" \
  752. "adcl $0,%2 \n\t" \
  753. "addl %%eax,%0 \n\t" \
  754. "adcl %%edx,%1 \n\t" \
  755. "adcl $0,%2 \n\t" \
  756. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
  757. #define SQRADDSC(i, j) \
  758. __asm__( \
  759. "movd %3,%%mm0 \n\t" \
  760. "movd %4,%%mm1 \n\t" \
  761. "pmuludq %%mm1,%%mm0\n\t" \
  762. "movd %%mm0,%0 \n\t" \
  763. "psrlq $32,%%mm0 \n\t" \
  764. "movd %%mm0,%1 \n\t" \
  765. "xorl %2,%2 \n\t" \
  766. :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
  767. /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
  768. #define SQRADDAC(i, j) \
  769. __asm__( \
  770. "movd %6,%%mm0 \n\t" \
  771. "movd %7,%%mm1 \n\t" \
  772. "pmuludq %%mm1,%%mm0\n\t" \
  773. "movd %%mm0,%%eax \n\t" \
  774. "psrlq $32,%%mm0 \n\t" \
  775. "movd %%mm0,%%edx \n\t" \
  776. "addl %%eax,%0 \n\t" \
  777. "adcl %%edx,%1 \n\t" \
  778. "adcl $0,%2 \n\t" \
  779. :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
  780. #define SQRADDDB \
  781. __asm__( \
  782. "addl %6,%0 \n\t" \
  783. "adcl %7,%1 \n\t" \
  784. "adcl %8,%2 \n\t" \
  785. "addl %6,%0 \n\t" \
  786. "adcl %7,%1 \n\t" \
  787. "adcl %8,%2 \n\t" \
  788. :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
  789. #elif defined(TFM_ARM)
  790. /* ARM code */
  791. #define COMBA_START
  792. #define CLEAR_CARRY \
  793. c0 = c1 = c2 = 0;
  794. #define COMBA_STORE(x) \
  795. x = c0;
  796. #define COMBA_STORE2(x) \
  797. x = c1;
  798. #define CARRY_FORWARD \
  799. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  800. #define COMBA_FINI
  801. /* multiplies point i and j, updates carry "c1" and digit c2 */
  802. #define SQRADD(i, j) \
  803. __asm__( \
  804. " UMULL r0,r1,%6,%6 \n\t" \
  805. " ADDS %0,%0,r0 \n\t" \
  806. " ADCS %1,%1,r1 \n\t" \
  807. " ADC %2,%2,#0 \n\t" \
  808. :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
  809. /* for squaring some of the terms are doubled... */
  810. #define SQRADD2(i, j) \
  811. __asm__( \
  812. " UMULL r0,r1,%6,%7 \n\t" \
  813. " ADDS %0,%0,r0 \n\t" \
  814. " ADCS %1,%1,r1 \n\t" \
  815. " ADC %2,%2,#0 \n\t" \
  816. " ADDS %0,%0,r0 \n\t" \
  817. " ADCS %1,%1,r1 \n\t" \
  818. " ADC %2,%2,#0 \n\t" \
  819. :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
  820. #define SQRADDSC(i, j) \
  821. __asm__( \
  822. " UMULL %0,%1,%3,%4 \n\t" \
  823. " SUB %2,%2,%2 \n\t" \
  824. :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");
  825. /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
  826. #define SQRADDAC(i, j) \
  827. __asm__( \
  828. " UMULL r0,r1,%6,%7 \n\t" \
  829. " ADDS %0,%0,r0 \n\t" \
  830. " ADCS %1,%1,r1 \n\t" \
  831. " ADC %2,%2,#0 \n\t" \
  832. :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
  833. #define SQRADDDB \
  834. __asm__( \
  835. " ADDS %0,%0,%3 \n\t" \
  836. " ADCS %1,%1,%4 \n\t" \
  837. " ADC %2,%2,%5 \n\t" \
  838. " ADDS %0,%0,%3 \n\t" \
  839. " ADCS %1,%1,%4 \n\t" \
  840. " ADC %2,%2,%5 \n\t" \
  841. :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
  842. #elif defined(TFM_PPC32)
  843. /* PPC32 */
  844. #define COMBA_START
  845. #define CLEAR_CARRY \
  846. c0 = c1 = c2 = 0;
  847. #define COMBA_STORE(x) \
  848. x = c0;
  849. #define COMBA_STORE2(x) \
  850. x = c1;
  851. #define CARRY_FORWARD \
  852. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  853. #define COMBA_FINI
  854. /* multiplies point i and j, updates carry "c1" and digit c2 */
  855. #define SQRADD(i, j) \
  856. __asm__( \
  857. " mullw 16,%6,%6 \n\t" \
  858. " addc %0,%0,16 \n\t" \
  859. " mulhwu 16,%6,%6 \n\t" \
  860. " adde %1,%1,16 \n\t" \
  861. " addze %2,%2 \n\t" \
  862. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
  863. /* for squaring some of the terms are doubled... */
  864. #define SQRADD2(i, j) \
  865. __asm__( \
  866. " mullw 16,%6,%7 \n\t" \
  867. " mulhwu 17,%6,%7 \n\t" \
  868. " addc %0,%0,16 \n\t" \
  869. " adde %1,%1,17 \n\t" \
  870. " addze %2,%2 \n\t" \
  871. " addc %0,%0,16 \n\t" \
  872. " adde %1,%1,17 \n\t" \
  873. " addze %2,%2 \n\t" \
  874. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
  875. #define SQRADDSC(i, j) \
  876. __asm__( \
  877. " mullw %0,%6,%7 \n\t" \
  878. " mulhwu %1,%6,%7 \n\t" \
  879. " xor %2,%2,%2 \n\t" \
  880. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
  881. #define SQRADDAC(i, j) \
  882. __asm__( \
  883. " mullw 16,%6,%7 \n\t" \
  884. " addc %0,%0,16 \n\t" \
  885. " mulhwu 16,%6,%7 \n\t" \
  886. " adde %1,%1,16 \n\t" \
  887. " addze %2,%2 \n\t" \
  888. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
  889. #define SQRADDDB \
  890. __asm__( \
  891. " addc %0,%0,%3 \n\t" \
  892. " adde %1,%1,%4 \n\t" \
  893. " adde %2,%2,%5 \n\t" \
  894. " addc %0,%0,%3 \n\t" \
  895. " adde %1,%1,%4 \n\t" \
  896. " adde %2,%2,%5 \n\t" \
  897. :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
  898. #elif defined(TFM_PPC64)
  899. /* PPC64 */
  900. #define COMBA_START
  901. #define CLEAR_CARRY \
  902. c0 = c1 = c2 = 0;
  903. #define COMBA_STORE(x) \
  904. x = c0;
  905. #define COMBA_STORE2(x) \
  906. x = c1;
  907. #define CARRY_FORWARD \
  908. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  909. #define COMBA_FINI
  910. /* multiplies point i and j, updates carry "c1" and digit c2 */
  911. #define SQRADD(i, j) \
  912. __asm__( \
  913. " mulld r16,%6,%6 \n\t" \
  914. " addc %0,%0,r16 \n\t" \
  915. " mulhdu r16,%6,%6 \n\t" \
  916. " adde %1,%1,r16 \n\t" \
  917. " addze %2,%2 \n\t" \
  918. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");
  919. /* for squaring some of the terms are doubled... */
  920. #define SQRADD2(i, j) \
  921. __asm__( \
  922. " mulld r16,%6,%7 \n\t" \
  923. " mulhdu r17,%6,%7 \n\t" \
  924. " addc %0,%0,r16 \n\t" \
  925. " adde %1,%1,r17 \n\t" \
  926. " addze %2,%2 \n\t" \
  927. " addc %0,%0,r16 \n\t" \
  928. " adde %1,%1,r17 \n\t" \
  929. " addze %2,%2 \n\t" \
  930. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");
  931. #define SQRADDSC(i, j) \
  932. __asm__( \
  933. " mulld %0,%6,%7 \n\t" \
  934. " mulhdu %1,%6,%7 \n\t" \
  935. " xor %2,%2,%2 \n\t" \
  936. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
  937. #define SQRADDAC(i, j) \
  938. __asm__( \
  939. " mulld r16,%6,%7 \n\t" \
  940. " addc %0,%0,r16 \n\t" \
  941. " mulhdu r16,%6,%7 \n\t" \
  942. " adde %1,%1,r16 \n\t" \
  943. " addze %2,%2 \n\t" \
  944. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");
  945. #define SQRADDDB \
  946. __asm__( \
  947. " addc %0,%0,%3 \n\t" \
  948. " adde %1,%1,%4 \n\t" \
  949. " adde %2,%2,%5 \n\t" \
  950. " addc %0,%0,%3 \n\t" \
  951. " adde %1,%1,%4 \n\t" \
  952. " adde %2,%2,%5 \n\t" \
  953. :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
  954. #elif defined(TFM_AVR32)
  955. /* AVR32 */
  956. #define COMBA_START
  957. #define CLEAR_CARRY \
  958. c0 = c1 = c2 = 0;
  959. #define COMBA_STORE(x) \
  960. x = c0;
  961. #define COMBA_STORE2(x) \
  962. x = c1;
  963. #define CARRY_FORWARD \
  964. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  965. #define COMBA_FINI
  966. /* multiplies point i and j, updates carry "c1" and digit c2 */
  967. #define SQRADD(i, j) \
  968. __asm__( \
  969. " mulu.d r2,%6,%6 \n\t" \
  970. " add %0,%0,r2 \n\t" \
  971. " adc %1,%1,r3 \n\t" \
  972. " acr %2 \n\t" \
  973. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
  974. /* for squaring some of the terms are doubled... */
  975. #define SQRADD2(i, j) \
  976. __asm__( \
  977. " mulu.d r2,%6,%7 \n\t" \
  978. " add %0,%0,r2 \n\t" \
  979. " adc %1,%1,r3 \n\t" \
  980. " acr %2, \n\t" \
  981. " add %0,%0,r2 \n\t" \
  982. " adc %1,%1,r3 \n\t" \
  983. " acr %2, \n\t" \
  984. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
  985. #define SQRADDSC(i, j) \
  986. __asm__( \
  987. " mulu.d r2,%6,%7 \n\t" \
  988. " mov %0,r2 \n\t" \
  989. " mov %1,r3 \n\t" \
  990. " eor %2,%2 \n\t" \
  991. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
  992. #define SQRADDAC(i, j) \
  993. __asm__( \
  994. " mulu.d r2,%6,%7 \n\t" \
  995. " add %0,%0,r2 \n\t" \
  996. " adc %1,%1,r3 \n\t" \
  997. " acr %2 \n\t" \
  998. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
  999. #define SQRADDDB \
  1000. __asm__( \
  1001. " add %0,%0,%3 \n\t" \
  1002. " adc %1,%1,%4 \n\t" \
  1003. " adc %2,%2,%5 \n\t" \
  1004. " add %0,%0,%3 \n\t" \
  1005. " adc %1,%1,%4 \n\t" \
  1006. " adc %2,%2,%5 \n\t" \
  1007. :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
  1008. #elif defined(TFM_MIPS)
  1009. /* MIPS */
  1010. #define COMBA_START
  1011. #define CLEAR_CARRY \
  1012. c0 = c1 = c2 = 0;
  1013. #define COMBA_STORE(x) \
  1014. x = c0;
  1015. #define COMBA_STORE2(x) \
  1016. x = c1;
  1017. #define CARRY_FORWARD \
  1018. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1019. #define COMBA_FINI
  1020. /* multiplies point i and j, updates carry "c1" and digit c2 */
  1021. #define SQRADD(i, j) \
  1022. __asm__( \
  1023. " multu %6,%6 \n\t" \
  1024. " mflo $12 \n\t" \
  1025. " mfhi $13 \n\t" \
  1026. " addu %0,%0,$12 \n\t" \
  1027. " sltu $12,%0,$12 \n\t" \
  1028. " addu %1,%1,$13 \n\t" \
  1029. " sltu $13,%1,$13 \n\t" \
  1030. " addu %1,%1,$12 \n\t" \
  1031. " sltu $12,%1,$12 \n\t" \
  1032. " addu %2,%2,$13 \n\t" \
  1033. " addu %2,%2,$12 \n\t" \
  1034. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
  1035. /* for squaring some of the terms are doubled... */
  1036. #define SQRADD2(i, j) \
  1037. __asm__( \
  1038. " multu %6,%7 \n\t" \
  1039. " mflo $12 \n\t" \
  1040. " mfhi $13 \n\t" \
  1041. \
  1042. " addu %0,%0,$12 \n\t" \
  1043. " sltu $14,%0,$12 \n\t" \
  1044. " addu %1,%1,$13 \n\t" \
  1045. " sltu $15,%1,$13 \n\t" \
  1046. " addu %1,%1,$14 \n\t" \
  1047. " sltu $14,%1,$14 \n\t" \
  1048. " addu %2,%2,$15 \n\t" \
  1049. " addu %2,%2,$14 \n\t" \
  1050. \
  1051. " addu %0,%0,$12 \n\t" \
  1052. " sltu $14,%0,$12 \n\t" \
  1053. " addu %1,%1,$13 \n\t" \
  1054. " sltu $15,%1,$13 \n\t" \
  1055. " addu %1,%1,$14 \n\t" \
  1056. " sltu $14,%1,$14 \n\t" \
  1057. " addu %2,%2,$15 \n\t" \
  1058. " addu %2,%2,$14 \n\t" \
  1059. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
  1060. #define SQRADDSC(i, j) \
  1061. __asm__( \
  1062. " multu %6,%7 \n\t" \
  1063. " mflo %0 \n\t" \
  1064. " mfhi %1 \n\t" \
  1065. " xor %2,%2,%2 \n\t" \
  1066. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
  1067. #define SQRADDAC(i, j) \
  1068. __asm__( \
  1069. " multu %6,%7 \n\t" \
  1070. " mflo $12 \n\t" \
  1071. " mfhi $13 \n\t" \
  1072. " addu %0,%0,$12 \n\t" \
  1073. " sltu $12,%0,$12 \n\t" \
  1074. " addu %1,%1,$13 \n\t" \
  1075. " sltu $13,%1,$13 \n\t" \
  1076. " addu %1,%1,$12 \n\t" \
  1077. " sltu $12,%1,$12 \n\t" \
  1078. " addu %2,%2,$13 \n\t" \
  1079. " addu %2,%2,$12 \n\t" \
  1080. :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
  1081. #define SQRADDDB \
  1082. __asm__( \
  1083. " addu %0,%0,%3 \n\t" \
  1084. " sltu $10,%0,%3 \n\t" \
  1085. " addu %1,%1,$10 \n\t" \
  1086. " sltu $10,%1,$10 \n\t" \
  1087. " addu %1,%1,%4 \n\t" \
  1088. " sltu $11,%1,%4 \n\t" \
  1089. " addu %2,%2,$10 \n\t" \
  1090. " addu %2,%2,$11 \n\t" \
  1091. " addu %2,%2,%5 \n\t" \
  1092. \
  1093. " addu %0,%0,%3 \n\t" \
  1094. " sltu $10,%0,%3 \n\t" \
  1095. " addu %1,%1,$10 \n\t" \
  1096. " sltu $10,%1,$10 \n\t" \
  1097. " addu %1,%1,%4 \n\t" \
  1098. " sltu $11,%1,%4 \n\t" \
  1099. " addu %2,%2,$10 \n\t" \
  1100. " addu %2,%2,$11 \n\t" \
  1101. " addu %2,%2,%5 \n\t" \
  1102. :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
  1103. #else
  1104. #define TFM_ISO
  1105. /* ISO C portable code */
  1106. #define COMBA_START
  1107. #define CLEAR_CARRY \
  1108. c0 = c1 = c2 = 0;
  1109. #define COMBA_STORE(x) \
  1110. x = c0;
  1111. #define COMBA_STORE2(x) \
  1112. x = c1;
  1113. #define CARRY_FORWARD \
  1114. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1115. #define COMBA_FINI
  1116. /* multiplies point i and j, updates carry "c1" and digit c2 */
  1117. #define SQRADD(i, j) \
  1118. do { fp_word t; \
  1119. t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
  1120. t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \
  1121. c2 +=(fp_digit) (t >> DIGIT_BIT); \
  1122. } while (0);
  1123. /* for squaring some of the terms are doubled... */
  1124. #define SQRADD2(i, j) \
  1125. do { fp_word t; \
  1126. t = ((fp_word)i) * ((fp_word)j); \
  1127. tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
  1128. tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
  1129. c2 +=(fp_digit)(tt >> DIGIT_BIT); \
  1130. tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
  1131. tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
  1132. c2 +=(fp_digit)(tt >> DIGIT_BIT); \
  1133. } while (0);
  1134. #define SQRADDSC(i, j) \
  1135. do { fp_word t; \
  1136. t = ((fp_word)i) * ((fp_word)j); \
  1137. sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
  1138. } while (0);
  1139. #define SQRADDAC(i, j) \
  1140. do { fp_word t; \
  1141. t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \
  1142. t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \
  1143. sc2 += (fp_digit)(t >> DIGIT_BIT); \
  1144. } while (0);
  1145. #define SQRADDDB \
  1146. do { fp_word t; \
  1147. t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \
  1148. t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \
  1149. c1 = (fp_digit)t; \
  1150. c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
  1151. } while (0);
  1152. #endif
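/* Editorial note, illustrative only (assumption, not from the original file):
 * how the squaring macros above compose for a 2-digit operand, in the style
 * of the generated fp_sqr_comba_N.i files. a[] is the input, b[] the 4-digit
 * result; c0/c1/c2 are the column accumulators the macros operate on. */
#if 0
COMBA_START;
CLEAR_CARRY;
SQRADD(a[0], a[0]);  COMBA_STORE(b[0]);   /* column 0: a0*a0       */
CARRY_FORWARD;
SQRADD2(a[0], a[1]); COMBA_STORE(b[1]);   /* column 1: 2*a0*a1     */
CARRY_FORWARD;
SQRADD(a[1], a[1]);  COMBA_STORE(b[2]);   /* column 2: a1*a1       */
COMBA_STORE2(b[3]);                       /* top word: final carry */
COMBA_FINI;
#endif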
  1153. #ifdef TFM_SMALL_SET
  1154. #include "fp_sqr_comba_small_set.i"
  1155. #endif
  1156. #if defined(TFM_SQR3) && FP_SIZE >= 6
  1157. #include "fp_sqr_comba_3.i"
  1158. #endif
  1159. #if defined(TFM_SQR4) && FP_SIZE >= 8
  1160. #include "fp_sqr_comba_4.i"
  1161. #endif
  1162. #if defined(TFM_SQR6) && FP_SIZE >= 12
  1163. #include "fp_sqr_comba_6.i"
  1164. #endif
  1165. #if defined(TFM_SQR7) && FP_SIZE >= 14
  1166. #include "fp_sqr_comba_7.i"
  1167. #endif
  1168. #if defined(TFM_SQR8) && FP_SIZE >= 16
  1169. #include "fp_sqr_comba_8.i"
  1170. #endif
  1171. #if defined(TFM_SQR9) && FP_SIZE >= 18
  1172. #include "fp_sqr_comba_9.i"
  1173. #endif
  1174. #if defined(TFM_SQR12) && FP_SIZE >= 24
  1175. #include "fp_sqr_comba_12.i"
  1176. #endif
  1177. #if defined(TFM_SQR17) && FP_SIZE >= 34
  1178. #include "fp_sqr_comba_17.i"
  1179. #endif
  1180. #if defined(TFM_SQR20) && FP_SIZE >= 40
  1181. #include "fp_sqr_comba_20.i"
  1182. #endif
  1183. #if defined(TFM_SQR24) && FP_SIZE >= 48
  1184. #include "fp_sqr_comba_24.i"
  1185. #endif
  1186. #if defined(TFM_SQR28) && FP_SIZE >= 56
  1187. #include "fp_sqr_comba_28.i"
  1188. #endif
  1189. #if defined(TFM_SQR32) && FP_SIZE >= 64
  1190. #include "fp_sqr_comba_32.i"
  1191. #endif
  1192. #if defined(TFM_SQR48) && FP_SIZE >= 96
  1193. #include "fp_sqr_comba_48.i"
  1194. #endif
  1195. #if defined(TFM_SQR64) && FP_SIZE >= 128
  1196. #include "fp_sqr_comba_64.i"
  1197. #endif
  1198. /* end fp_sqr_comba.c asm */
  1199. /* start fp_mul_comba.c asm */
  1200. /* these are the combas. Worship them. */
  1201. #if defined(TFM_X86)
  1202. /* Generic x86 optimized code */
  1203. /* anything you need at the start */
  1204. #define COMBA_START
  1205. /* clear the chaining variables */
  1206. #define COMBA_CLEAR \
  1207. c0 = c1 = c2 = 0;
  1208. /* forward the carry to the next digit */
  1209. #define COMBA_FORWARD \
  1210. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1211. /* store the first sum */
  1212. #define COMBA_STORE(x) \
  1213. x = c0;
  1214. /* store the second sum [carry] */
  1215. #define COMBA_STORE2(x) \
  1216. x = c1;
  1217. /* anything you need at the end */
  1218. #define COMBA_FINI
  1219. /* this should multiply i and j */
  1220. #define MULADD(i, j) \
  1221. __asm__( \
  1222. "movl %6,%%eax \n\t" \
  1223. "mull %7 \n\t" \
  1224. "addl %%eax,%0 \n\t" \
  1225. "adcl %%edx,%1 \n\t" \
  1226. "adcl $0,%2 \n\t" \
  1227. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
  1228. #elif defined(TFM_X86_64)
  1229. /* x86-64 optimized */
  1230. /* anything you need at the start */
  1231. #define COMBA_START
  1232. /* clear the chaining variables */
  1233. #define COMBA_CLEAR \
  1234. c0 = c1 = c2 = 0;
  1235. /* forward the carry to the next digit */
  1236. #define COMBA_FORWARD \
  1237. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1238. /* store the first sum */
  1239. #define COMBA_STORE(x) \
  1240. x = c0;
  1241. /* store the second sum [carry] */
  1242. #define COMBA_STORE2(x) \
  1243. x = c1;
  1244. /* anything you need at the end */
  1245. #define COMBA_FINI
  1246. /* this should multiply i and j */
  1247. #define MULADD(i, j) \
  1248. __asm__ ( \
  1249. "movq %6,%%rax \n\t" \
  1250. "mulq %7 \n\t" \
  1251. "addq %%rax,%0 \n\t" \
  1252. "adcq %%rdx,%1 \n\t" \
  1253. "adcq $0,%2 \n\t" \
  1254. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
  1255. #if defined(HAVE_INTEL_MULX)
  1256. #define MULADD_BODY(a,b,carry,c) \
  1257. __asm__ volatile( \
  1258. "movq %[a0],%%rdx\n\t" \
  1259. "xorq %%rcx, %%rcx\n\t" \
  1260. "movq 0(%[cp]),%%r8\n\t" \
  1261. "movq 8(%[cp]),%%r9\n\t" \
  1262. "movq 16(%[cp]),%%r10\n\t" \
  1263. "movq 24(%[cp]),%%r11\n\t" \
  1264. \
  1265. "mulx (%[bp]),%%rax, %%rbx\n\t" \
  1266. "adcxq %[ca], %%r8\n\t" \
  1267. "adoxq %%rax, %%r8\n\t" \
  1268. "mulx 8(%[bp]),%%rax, %%rcx\n\t" \
  1269. "adcxq %%rbx, %%r9\n\t" \
  1270. "adoxq %%rax, %%r9\n\t" \
  1271. "mulx 16(%[bp]),%%rax, %%rbx\n\t" \
  1272. "adcxq %%rcx, %%r10\n\t" \
  1273. "adoxq %%rax, %%r10\n\t" \
  1274. "mulx 24(%[bp]),%%rax, %%rcx\n\t" \
  1275. "adcxq %%rbx, %%r11\n\t" \
  1276. "mov $0, %[ca]\n\t" \
  1277. "adoxq %%rax, %%r11\n\t" \
  1278. "adcxq %%rcx, %[ca]\n\t" \
  1279. "mov $0, %%rdx\n\t" \
  1280. "adoxq %%rdx, %[ca]\n\t" \
  1281. \
  1282. "movq %%r8, 0(%[cp])\n\t" \
  1283. "movq %%r9, 8(%[cp])\n\t" \
  1284. "movq %%r10, 16(%[cp])\n\t" \
  1285. "movq %%r11, 24(%[cp])\n\t" \
  1286. : [ca] "+r" (carry) \
  1287. : [a0] "r" (a->dp[ix]), [bp] "r" (&(b->dp[iy])), \
  1288. [cp] "r" (&(c->dp[iz])) \
  1289. : "%r8", "%r9", "%r10", "%r11", \
  1290. "%rdx", "%rax", "%rcx", "%rbx" \
  1291. )
  1292. #define TFM_INTEL_MUL_COMBA(a, b, ca, c) \
  1293. for (iz=0; iz<pa; iz++) c->dp[iz] = 0; \
  1294. for (ix=0; ix<a->used; ix++) { \
  1295. ca = 0; \
  1296. for (iy=0; iy<b->used; iy+=4) { \
  1297. iz = ix + iy; \
  1298. MULADD_BODY(a, b, ca, c); \
  1299. } \
  1300. c->dp[ix + iy] = ca; \
  1301. }
  1302. #endif
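/* Editorial note, illustrative only (assumption): how TFM_INTEL_MUL_COMBA is
 * meant to be dispatched from inside fp_mul_comba(), guarded by the runtime
 * CPU check. ix, iy, iz, pa and carry are locals the macro body expects. */
#if 0
    /* inside fp_mul_comba(A, B, C), after computing pa = A->used + B->used: */
    IF_HAVE_INTEL_MULX(TFM_INTEL_MUL_COMBA(A, B, carry, C), return FP_OKAY);
    /* otherwise fall through to the generic MULADD-based comba below */
#endif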
  1303. #elif defined(TFM_SSE2)
  1304. /* use SSE2 optimizations */
  1305. /* anything you need at the start */
  1306. #define COMBA_START
  1307. /* clear the chaining variables */
  1308. #define COMBA_CLEAR \
  1309. c0 = c1 = c2 = 0;
  1310. /* forward the carry to the next digit */
  1311. #define COMBA_FORWARD \
  1312. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1313. /* store the first sum */
  1314. #define COMBA_STORE(x) \
  1315. x = c0;
  1316. /* store the second sum [carry] */
  1317. #define COMBA_STORE2(x) \
  1318. x = c1;
  1319. /* anything you need at the end */
  1320. #define COMBA_FINI \
  1321. __asm__("emms");
  1322. /* this should multiply i and j */
  1323. #define MULADD(i, j) \
  1324. __asm__( \
  1325. "movd %6,%%mm0 \n\t" \
  1326. "movd %7,%%mm1 \n\t" \
  1327. "pmuludq %%mm1,%%mm0\n\t" \
  1328. "movd %%mm0,%%eax \n\t" \
  1329. "psrlq $32,%%mm0 \n\t" \
  1330. "addl %%eax,%0 \n\t" \
  1331. "movd %%mm0,%%eax \n\t" \
  1332. "adcl %%eax,%1 \n\t" \
  1333. "adcl $0,%2 \n\t" \
  1334. :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
  1335. #elif defined(TFM_ARM)
  1336. /* ARM code */
  1337. #define COMBA_START
  1338. #define COMBA_CLEAR \
  1339. c0 = c1 = c2 = 0;
  1340. #define COMBA_FORWARD \
  1341. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1342. #define COMBA_STORE(x) \
  1343. x = c0;
  1344. #define COMBA_STORE2(x) \
  1345. x = c1;
  1346. #define COMBA_FINI
  1347. #define MULADD(i, j) \
  1348. __asm__( \
  1349. " UMULL r0,r1,%6,%7 \n\t" \
  1350. " ADDS %0,%0,r0 \n\t" \
  1351. " ADCS %1,%1,r1 \n\t" \
  1352. " ADC %2,%2,#0 \n\t" \
  1353. :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
  1354. #elif defined(TFM_PPC32)
  1355. /* For 32-bit PPC */
  1356. #define COMBA_START
  1357. #define COMBA_CLEAR \
  1358. c0 = c1 = c2 = 0;
  1359. #define COMBA_FORWARD \
  1360. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1361. #define COMBA_STORE(x) \
  1362. x = c0;
  1363. #define COMBA_STORE2(x) \
  1364. x = c1;
  1365. #define COMBA_FINI
  1366. /* untested: will mulhwu change the flags? Docs say no */
  1367. #define MULADD(i, j) \
  1368. __asm__( \
  1369. " mullw 16,%6,%7 \n\t" \
  1370. " addc %0,%0,16 \n\t" \
  1371. " mulhwu 16,%6,%7 \n\t" \
  1372. " adde %1,%1,16 \n\t" \
  1373. " addze %2,%2 \n\t" \
  1374. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
  1375. #elif defined(TFM_PPC64)
  1376. /* For 64-bit PPC */
  1377. #define COMBA_START
  1378. #define COMBA_CLEAR \
  1379. c0 = c1 = c2 = 0;
  1380. #define COMBA_FORWARD \
  1381. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1382. #define COMBA_STORE(x) \
  1383. x = c0;
  1384. #define COMBA_STORE2(x) \
  1385. x = c1;
  1386. #define COMBA_FINI
  1387. /* untested: will mulhdu change the flags? Docs say no */
  1388. #define MULADD(i, j) \
  1389. __asm__( \
  1390. " mulld r16,%6,%7 \n\t" \
  1391. " addc %0,%0,16 \n\t" \
  1392. " mulhdu r16,%6,%7 \n\t" \
  1393. " adde %1,%1,16 \n\t" \
  1394. " addze %2,%2 \n\t" \
  1395. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16");
  1396. #elif defined(TFM_AVR32)
  1397. /* AVR32 */
  1398. #define COMBA_START
  1399. #define COMBA_CLEAR \
  1400. c0 = c1 = c2 = 0;
  1401. #define COMBA_FORWARD \
  1402. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1403. #define COMBA_STORE(x) \
  1404. x = c0;
  1405. #define COMBA_STORE2(x) \
  1406. x = c1;
  1407. #define COMBA_FINI
  1408. #define MULADD(i, j) \
  1409. __asm__( \
  1410. " mulu.d r2,%6,%7 \n\t"\
  1411. " add %0,r2 \n\t"\
  1412. " adc %1,%1,r3 \n\t"\
  1413. " acr %2 \n\t"\
  1414. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
  1415. #elif defined(TFM_MIPS)
  1416. /* MIPS */
  1417. #define COMBA_START
  1418. #define COMBA_CLEAR \
  1419. c0 = c1 = c2 = 0;
  1420. #define COMBA_FORWARD \
  1421. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1422. #define COMBA_STORE(x) \
  1423. x = c0;
  1424. #define COMBA_STORE2(x) \
  1425. x = c1;
  1426. #define COMBA_FINI
  1427. #define MULADD(i, j) \
  1428. __asm__( \
  1429. " multu %6,%7 \n\t" \
  1430. " mflo $12 \n\t" \
  1431. " mfhi $13 \n\t" \
  1432. " addu %0,%0,$12 \n\t" \
  1433. " sltu $12,%0,$12 \n\t" \
  1434. " addu %1,%1,$13 \n\t" \
  1435. " sltu $13,%1,$13 \n\t" \
  1436. " addu %1,%1,$12 \n\t" \
  1437. " sltu $12,%1,$12 \n\t" \
  1438. " addu %2,%2,$13 \n\t" \
  1439. " addu %2,%2,$12 \n\t" \
  1440. :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
  1441. #else
  1442. /* ISO C code */
  1443. #define COMBA_START
  1444. #define COMBA_CLEAR \
  1445. c0 = c1 = c2 = 0;
  1446. #define COMBA_FORWARD \
  1447. do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  1448. #define COMBA_STORE(x) \
  1449. x = c0;
  1450. #define COMBA_STORE2(x) \
  1451. x = c1;
  1452. #define COMBA_FINI
  1453. #define MULADD(i, j) \
  1454. do { fp_word t; \
  1455. t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); \
  1456. c0 = (fp_digit)t; \
  1457. t = (fp_word)c1 + (t >> DIGIT_BIT); \
  1458. c1 = (fp_digit)t; \
  1459. c2 += (fp_digit)(t >> DIGIT_BIT); \
  1460. } while (0);
  1461. #endif
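/* Editorial note, illustrative only (assumption, not from the original file):
 * how MULADD and the COMBA_* macros compose for a 2x2-digit multiply, in the
 * style of the generated fp_mul_comba_N.i files. a[] and b[] are the inputs,
 * out[] the 4-digit product; c0/c1/c2 are the column accumulators. */
#if 0
COMBA_START;
COMBA_CLEAR;
MULADD(a[0], b[0]);                      COMBA_STORE(out[0]); /* column 0 */
COMBA_FORWARD;
MULADD(a[0], b[1]); MULADD(a[1], b[0]);  COMBA_STORE(out[1]); /* column 1 */
COMBA_FORWARD;
MULADD(a[1], b[1]);                      COMBA_STORE(out[2]); /* column 2 */
COMBA_STORE2(out[3]);                    /* top word: final carry */
COMBA_FINI;
#endif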
  1462. #ifdef TFM_SMALL_SET
  1463. #include "fp_mul_comba_small_set.i"
  1464. #endif
  1465. #if defined(TFM_MUL3) && FP_SIZE >= 6
  1466. #include "fp_mul_comba_3.i"
  1467. #endif
  1468. #if defined(TFM_MUL4) && FP_SIZE >= 8
  1469. #include "fp_mul_comba_4.i"
  1470. #endif
  1471. #if defined(TFM_MUL6) && FP_SIZE >= 12
  1472. #include "fp_mul_comba_6.i"
  1473. #endif
  1474. #if defined(TFM_MUL7) && FP_SIZE >= 14
  1475. #include "fp_mul_comba_7.i"
  1476. #endif
  1477. #if defined(TFM_MUL8) && FP_SIZE >= 16
  1478. #include "fp_mul_comba_8.i"
  1479. #endif
  1480. #if defined(TFM_MUL9) && FP_SIZE >= 18
  1481. #include "fp_mul_comba_9.i"
  1482. #endif
  1483. #if defined(TFM_MUL12) && FP_SIZE >= 24
  1484. #include "fp_mul_comba_12.i"
  1485. #endif
  1486. #if defined(TFM_MUL17) && FP_SIZE >= 34
  1487. #include "fp_mul_comba_17.i"
  1488. #endif
  1489. #if defined(TFM_MUL20) && FP_SIZE >= 40
  1490. #include "fp_mul_comba_20.i"
  1491. #endif
  1492. #if defined(TFM_MUL24) && FP_SIZE >= 48
  1493. #include "fp_mul_comba_24.i"
  1494. #endif
  1495. #if defined(TFM_MUL28) && FP_SIZE >= 56
  1496. #include "fp_mul_comba_28.i"
  1497. #endif
  1498. #if defined(TFM_MUL32) && FP_SIZE >= 64
  1499. #include "fp_mul_comba_32.i"
  1500. #endif
  1501. #if defined(TFM_MUL48) && FP_SIZE >= 96
  1502. #include "fp_mul_comba_48.i"
  1503. #endif
  1504. #if defined(TFM_MUL64) && FP_SIZE >= 128
  1505. #include "fp_mul_comba_64.i"
  1506. #endif
  1507. /* end fp_mul_comba.c asm */