
all: import amd64 from Bell Labs

David du Colombier committed 2 years ago
commit 19028bdd04
50 changed files with 1572 additions and 2 deletions
  1. amd64/include/ape/float.h (+80 -0)
  2. amd64/include/ape/math.h (+74 -0)
  3. amd64/include/ape/stdarg.h (+18 -0)
  4. amd64/include/ape/ureg.h (+38 -0)
  5. sys/src/ape/lib/9/amd64/getcallerpc.s (+3 -0)
  6. sys/src/ape/lib/9/amd64/getfcr.s (+38 -0)
  7. sys/src/ape/lib/ap/amd64/_seek.c (+11 -0)
  8. sys/src/ape/lib/ap/amd64/cycles.s (+5 -0)
  9. sys/src/ape/lib/ap/amd64/lock.c (+26 -0)
  10. sys/src/ape/lib/ap/amd64/main9.s (+12 -0)
  11. sys/src/ape/lib/ap/amd64/main9p.s (+45 -0)
  12. sys/src/ape/lib/ap/amd64/mkfile (+20 -0)
  13. sys/src/ape/lib/ap/amd64/notetramp.c (+81 -0)
  14. sys/src/ape/lib/ap/amd64/setjmp.s (+27 -0)
  15. sys/src/ape/lib/ap/amd64/strchr.s (+38 -0)
  16. sys/src/ape/lib/ap/amd64/strlen.s (+16 -0)
  17. sys/src/ape/lib/ap/amd64/tas.s (+5 -0)
  18. sys/src/libc/amd64/_seek.c (+14 -0)
  19. sys/src/libc/amd64/argv0.s (+4 -0)
  20. sys/src/libc/amd64/atom.s (+66 -0)
  21. sys/src/libc/amd64/cycles.s (+5 -0)
  22. sys/src/libc/amd64/getcallerpc.s (+3 -0)
  23. sys/src/libc/amd64/getfcr.s (+38 -0)
  24. sys/src/libc/amd64/main9.s (+19 -0)
  25. sys/src/libc/amd64/main9p.s (+41 -0)
  26. sys/src/libc/amd64/memccpy.s (+58 -0)
  27. sys/src/libc/amd64/memchr.s (+23 -0)
  28. sys/src/libc/amd64/memcmp.s (+52 -0)
  29. sys/src/libc/amd64/memcpy.s (+81 -0)
  30. sys/src/libc/amd64/memmove.s (+81 -0)
  31. sys/src/libc/amd64/memset.s (+41 -0)
  32. sys/src/libc/amd64/mkfile (+42 -0)
  33. sys/src/libc/amd64/muldiv.s (+12 -0)
  34. sys/src/libc/amd64/notejmp.c (+16 -0)
  35. sys/src/libc/amd64/rdpmc.s (+9 -0)
  36. sys/src/libc/amd64/setjmp.s (+17 -0)
  37. sys/src/libc/amd64/sqrt.s (+4 -0)
  38. sys/src/libc/amd64/strcat.s (+48 -0)
  39. sys/src/libc/amd64/strchr.s (+38 -0)
  40. sys/src/libc/amd64/strcpy.s (+40 -0)
  41. sys/src/libc/amd64/strlen.s (+16 -0)
  42. sys/src/libc/amd64/tas.s (+8 -0)
  43. sys/src/libmp/amd64/mkfile (+20 -0)
  44. sys/src/libmp/amd64/mpdigdiv.s (+21 -0)
  45. sys/src/libmp/amd64/mpvecadd.s (+54 -0)
  46. sys/src/libmp/amd64/mpvecdigmuladd.s (+53 -0)
  47. sys/src/libmp/amd64/mpvecdigmulsub.s (+53 -0)
  48. sys/src/libmp/amd64/mpvecsub.s (+45 -0)
  49. sys/src/libsec/amd64/mkfile (+11 -0)
  50. sys/src/mkfile.proto (+2 -2)

+ 80 - 0
amd64/include/ape/float.h

@@ -0,0 +1,80 @@
+#ifndef __FLOAT
+#define __FLOAT
+/* IEEE, default rounding */
+
+#define FLT_ROUNDS	1
+#define FLT_RADIX	2
+
+#define FLT_DIG		6
+#define FLT_EPSILON	1.19209290e-07
+#define FLT_MANT_DIG	24
+#define FLT_MAX		3.40282347e+38
+#define FLT_MAX_10_EXP	38
+#define FLT_MAX_EXP	128
+#define FLT_MIN		1.17549435e-38
+#define FLT_MIN_10_EXP	-37
+#define FLT_MIN_EXP	-125
+
+#define DBL_DIG		15
+#define DBL_EPSILON	2.2204460492503131e-16
+#define DBL_MANT_DIG	53
+#define DBL_MAX		1.797693134862315708145e+308
+#define DBL_MAX_10_EXP	308
+#define DBL_MAX_EXP	1024
+#define DBL_MIN		2.225073858507201383090233e-308
+#define DBL_MIN_10_EXP	-307
+#define DBL_MIN_EXP	-1021
+#define LDBL_MANT_DIG	DBL_MANT_DIG
+#define LDBL_EPSILON	DBL_EPSILON
+#define LDBL_DIG	DBL_DIG
+#define LDBL_MIN_EXP	DBL_MIN_EXP
+#define LDBL_MIN	DBL_MIN
+#define LDBL_MIN_10_EXP	DBL_MIN_10_EXP
+#define LDBL_MAX_EXP	DBL_MAX_EXP
+#define LDBL_MAX	DBL_MAX
+#define LDBL_MAX_10_EXP	DBL_MAX_10_EXP
+
+typedef 	union FPdbleword FPdbleword;
+union FPdbleword
+{
+	double	x;
+	struct {	/* little endian */
+		long lo;
+		long hi;
+	};
+};
+
+#ifdef _RESEARCH_SOURCE
+/* define stuff needed for floating conversion */
+#define IEEE_8087	1
+#define Sudden_Underflow 1
+#endif
+#ifdef _PLAN9_SOURCE
+/* MXCSR */
+/* fcr */
+#define	FPFTZ	(1<<15)	/* amd64 */
+#define	FPINEX	(1<<12)
+#define	FPUNFL	(1<<11)
+#define	FPOVFL	(1<<10)
+#define	FPZDIV	(1<<9)
+#define	FPDNRM	(1<<8)	/* amd64 */
+#define	FPINVAL	(1<<7)
+#define	FPDAZ	(1<<6)	/* amd64 */
+#define	FPRNR	(0<<13)
+#define	FPRZ	(3<<13)
+#define	FPRPINF	(2<<13)
+#define	FPRNINF	(1<<13)
+#define	FPRMASK	(3<<13)
+#define	FPPEXT	0
+#define	FPPSGL	0
+#define	FPPDBL	0
+#define	FPPMASK	0
+/* fsr */
+#define	FPAINEX	(1<<5)
+#define	FPAUNFL	(1<<4)
+#define	FPAOVFL	(1<<3)
+#define	FPAZDIV	(1<<2)
+#define	FPADNRM	(1<<1)	/* not in plan 9 */
+#define	FPAINVAL	(1<<0)
+#endif
+#endif /* __FLOAT */
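
Aside (not part of the commit): FPdbleword exposes the two 32-bit halves of an IEEE double, ordered lo/hi for little-endian amd64. A minimal sketch of the usual idiom, for an APE program; dblexp is a hypothetical helper:

	#include <float.h>

	/* sketch: return the biased 11-bit exponent of d,
	 * relying on the little-endian lo/hi layout in FPdbleword */
	int
	dblexp(double d)
	{
		FPdbleword w;

		w.x = d;
		return (w.hi >> 20) & 0x7FF;
	}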

+ 74 - 0
amd64/include/ape/math.h

@@ -0,0 +1,74 @@
+#ifndef __MATH
+#define __MATH
+#pragma lib "/$M/lib/ape/libap.a"
+
+/* a HUGE_VAL appropriate for IEEE double-precision */
+/* the correct value, 1.797693134862316e+308, causes a ken overflow */
+#define HUGE_VAL 1.79769313486231e+308
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern double acos(double);
+extern double asin(double);
+extern double atan(double);
+extern double atan2(double, double);
+extern double cos(double);
+extern double sin(double);
+extern double tan(double);
+extern double cosh(double);
+extern double sinh(double);
+extern double tanh(double);
+extern double exp(double);
+extern double frexp(double, int *);
+extern double ldexp(double, int);
+extern double log(double);
+extern double log10(double);
+extern double modf(double, double *);
+extern double pow(double, double);
+extern double sqrt(double);
+extern double ceil(double);
+extern double fabs(double);
+extern double floor(double);
+extern double fmod(double, double);
+extern double NaN(void);
+extern int isNaN(double);
+extern double Inf(int);
+extern int isInf(double, int);
+
+#ifdef _RESEARCH_SOURCE
+/* does >> treat left operand as unsigned ? */
+#define Unsigned_Shifts 1
+#define	M_E		2.7182818284590452354	/* e */
+#define	M_LOG2E		1.4426950408889634074	/* log 2e */
+#define	M_LOG10E	0.43429448190325182765	/* log 10e */
+#define	M_LN2		0.69314718055994530942	/* log e2 */
+#define	M_LN10		2.30258509299404568402	/* log e10 */
+#define	M_PI		3.14159265358979323846	/* pi */
+#define	M_PI_2		1.57079632679489661923	/* pi/2 */
+#define	M_PI_4		0.78539816339744830962	/* pi/4 */
+#define	M_1_PI		0.31830988618379067154	/* 1/pi */
+#define	M_2_PI		0.63661977236758134308	/* 2/pi */
+#define	M_2_SQRTPI	1.12837916709551257390	/* 2/sqrt(pi) */
+#define	M_SQRT2		1.41421356237309504880	/* sqrt(2) */
+#define	M_SQRT1_2	0.70710678118654752440	/* 1/sqrt(2) */
+
+extern double hypot(double, double);
+extern double erf(double);
+extern double erfc(double);
+extern double j0(double);
+extern double y0(double);
+extern double j1(double);
+extern double y1(double);
+extern double jn(int, double);
+extern double yn(int, double);
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __MATH */

+ 18 - 0
amd64/include/ape/stdarg.h

@@ -0,0 +1,18 @@
+#ifndef __STDARG
+#define __STDARG
+
+typedef char *va_list;
+
+#define va_start(list, start) list = (sizeof(start)<8 ? (char *)((long long *)&(start)+1) : \
+(char *)(&(start)+1))
+#define va_end(list)
+#define va_arg(list, mode)\
+	((sizeof(mode) == 1)?\
+		((mode*)(list += 8))[-8]:\
+	(sizeof(mode) == 2)?\
+		((mode*)(list += 8))[-4]:\
+	(sizeof(mode) == 4)?\
+		((mode*)(list += 8))[-2]:\
+		((mode*)(list += sizeof(mode)))[-1])
+
+#endif /* __STDARG */
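
Aside (not part of the commit): in this ABI every argument occupies at least one 8-byte stack slot, so va_arg always advances the list pointer by at least 8 and indexes backwards in units of the requested type. A usage sketch with a hypothetical sum helper:

	#include <stdarg.h>

	/* sketch: sum n ints passed variadically; each int sits in
	 * an 8-byte slot, hence the [-2] of an int* after the
	 * pointer is bumped by 8 in va_arg */
	int
	sum(int n, ...)
	{
		va_list ap;
		int i, t;

		t = 0;
		va_start(ap, n);
		for(i = 0; i < n; i++)
			t += va_arg(ap, int);
		va_end(ap);
		return t;
	}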

+ 38 - 0
amd64/include/ape/ureg.h

@@ -0,0 +1,38 @@
+#ifndef __UREG_H
+#define __UREG_H
+#if !defined(_PLAN9_SOURCE)
+    This header file is an extension to ANSI/POSIX
+#endif
+
+struct Ureg {
+	unsigned long long	ax;
+	unsigned long long	bx;
+	unsigned long long	cx;
+	unsigned long long	dx;
+	unsigned long long	si;
+	unsigned long long	di;
+	unsigned long long	bp;
+	unsigned long long	r8;
+	unsigned long long	r9;
+	unsigned long long	r10;
+	unsigned long long	r11;
+	unsigned long long	r12;
+	unsigned long long	r13;
+	unsigned long long	r14;
+	unsigned long long	r15;
+
+	unsigned short		ds;
+	unsigned short		es;
+	unsigned short		fs;
+	unsigned short		gs;
+
+	unsigned long long	type;
+	unsigned long long	error;		/* error code (or zero) */
+	unsigned long long	ip;		/* pc */
+	unsigned long long	cs;		/* old context */
+	unsigned long long	flags;		/* old flags */
+	unsigned long long	sp;		/* sp */
+	unsigned long long	ss;		/* old stack segment */
+};
+
+#endif

+ 3 - 0
sys/src/ape/lib/9/amd64/getcallerpc.s

@@ -0,0 +1,3 @@
+TEXT getcallerpc(SB), $0
+	MOVQ	-8(RARG), AX
+	RET

+ 38 - 0
sys/src/ape/lib/9/amd64/getfcr.s

@@ -0,0 +1,38 @@
+
+TEXT	setfcr(SB), $4
+	XORL	$(0x3F<<7),RARG	/* bits are cleared in csr to enable them */
+	ANDL	$0xFFC0, RARG	/* just the fcr bits */
+	WAIT	/* is this needed? */
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
+
+TEXT	getfcr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVWLZX	0(SP), AX
+	ANDL	$0xFFC0, AX
+	XORL	$(0x3F<<7),AX
+	RET
+
+TEXT	getfsr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$0x3F, AX
+	RET
+
+TEXT	setfsr(SB), $4
+	ANDL	$0x3F, RARG
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET

+ 11 - 0
sys/src/ape/lib/ap/amd64/_seek.c

@@ -0,0 +1,11 @@
+extern long __SEEK(long long*, int, long long, int);
+
+long long
+_SEEK(int fd, long long o, int p)
+{
+	long long l;
+
+	if(__SEEK(&l, fd, o, p) < 0)
+		l = -1;
+	return l;
+}

+ 5 - 0
sys/src/ape/lib/ap/amd64/cycles.s

@@ -0,0 +1,5 @@
+TEXT _cycles(SB),1,$0				/* time stamp counter; cycles since power up */
+	RDTSC
+	MOVL	AX, 0(RARG)			/* lo */
+	MOVL	DX, 4(RARG)			/* hi */
+	RET

+ 26 - 0
sys/src/ape/lib/ap/amd64/lock.c

@@ -0,0 +1,26 @@
+#define _LOCK_EXTENSION
+#include "../plan9/sys9.h"
+#include <lock.h>
+
+int	tas(int*);
+
+void
+lock(Lock *lk)
+{
+	while(tas(&lk->val))
+		_SLEEP(0);
+}
+
+int
+canlock(Lock *lk)
+{
+	if(tas(&lk->val))
+		return 0;
+	return 1;
+}
+
+void
+unlock(Lock *lk)
+{
+	lk->val = 0;
+}
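
Aside (not part of the commit): tas returns the previous contents of the lock word, so lock spins until it sees the lock free, calling _SLEEP(0) to yield the processor between attempts. A usage sketch with hypothetical names, assuming processes that share memory:

	#define _LOCK_EXTENSION
	#include <lock.h>

	Lock cntlock;
	int counter;

	/* sketch: serialise updates to a counter shared
	 * between processes */
	void
	bump(void)
	{
		lock(&cntlock);
		counter++;
		unlock(&cntlock);
	}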

+ 12 - 0
sys/src/ape/lib/ap/amd64/main9.s

@@ -0,0 +1,12 @@
+	TEXT	_main(SB), 1, $(3*8)
+
+	CALL	_envsetup(SB)
+	MOVL	inargc-8(FP), RARG
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	MOVQ	environ(SB), AX
+	MOVQ	AX, 16(SP)
+	CALL	main(SB)
+	MOVQ	AX, RARG
+	CALL	exit(SB)
+	RET

+ 45 - 0
sys/src/ape/lib/ap/amd64/main9p.s

@@ -0,0 +1,45 @@
+#define NPRIVATES	16
+
+GLOBL	_tos(SB), $8
+GLOBL	_privates(SB), $8
+GLOBL	_nprivates(SB), $8
+
+TEXT	_mainp(SB), 1, $(3*8+NPRIVATES*8)
+
+	/* _tos = arg */
+	MOVQ	AX, _tos(SB)
+	LEAQ	8(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVQ	$NPRIVATES, _nprivates(SB)
+
+	/* _profmain(); */
+	CALL	_profmain(SB)
+
+	/* _tos->prof.pp = _tos->prof.next; */
+	MOVQ	_tos+0(SB),DX
+	MOVQ	8(DX),CX
+	MOVQ	CX,(DX)
+
+	CALL	_envsetup(SB)
+
+	/* main(argc, argv, environ); */
+	MOVL	inargc-8(FP), RARG
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	MOVQ	environ(SB), AX
+	MOVQ	AX, 16(SP)
+	CALL	main(SB)
+
+loop:
+	MOVL	AX, RARG
+	CALL	exit(SB)
+	MOVQ	$_profin(SB), AX	/* force loading of profile */
+	MOVL	$0, AX
+	JMP	loop
+
+TEXT	_savearg(SB), 1, $0
+	RET
+
+TEXT	_callpc(SB), 1, $0
+	MOVQ	8(RARG), AX
+	RET

+ 20 - 0
sys/src/ape/lib/ap/amd64/mkfile

@@ -0,0 +1,20 @@
+APE=/sys/src/ape
+objtype=amd64
+<$APE/config
+LIB=/$objtype/lib/ape/libap.a
+OFILES=\
+	_seek.$O\
+	cycles.$O\
+	lock.$O\
+	main9.$O\
+	main9p.$O\
+	notetramp.$O\
+	setjmp.$O\
+	strchr.$O\
+	strlen.$O\
+	tas.$O\
+
+</sys/src/cmd/mksyslib
+
+CFLAGS=-c -D_POSIX_SOURCE -D_PLAN9_SOURCE
+

+ 81 - 0
sys/src/ape/lib/ap/amd64/notetramp.c

@@ -0,0 +1,81 @@
+#include "../plan9/lib.h"
+#include "../plan9/sys9.h"
+#include <signal.h>
+#include <setjmp.h>
+
+/* A stack to hold pcs when signals nest */
+#define MAXSIGSTACK 20
+typedef struct Pcstack Pcstack;
+static struct Pcstack {
+	int sig;
+	void (*hdlr)(int, char*, Ureg*);
+	unsigned long long restorepc;
+	Ureg *u;
+} pcstack[MAXSIGSTACK];
+static int nstack = 0;
+
+static void notecont(Ureg*, char*);
+
+void
+_notetramp(int sig, void (*hdlr)(int, char*, Ureg*), Ureg *u)
+{
+	Pcstack *p;
+
+	if(nstack >= MAXSIGSTACK)
+		_NOTED(1);	/* nesting too deep; just do system default */
+	p = &pcstack[nstack];
+	p->restorepc = u->ip;
+	p->sig = sig;
+	p->hdlr = hdlr;
+	p->u = u;
+	nstack++;
+	u->ip = (unsigned long long) notecont;
+	_NOTED(2);	/* NSAVE: clear note but hold state */
+}
+
+static void
+notecont(Ureg *u, char *s)
+{
+	Pcstack *p;
+	void(*f)(int, char*, Ureg*);
+
+	p = &pcstack[nstack-1];
+	f = p->hdlr;
+	u->ip = p->restorepc;
+	nstack--;
+	(*f)(p->sig, s, u);
+	_NOTED(3);	/* NRSTR */
+}
+
+#define JMPBUFPC 1
+#define JMPBUFSP 0
+
+extern sigset_t	_psigblocked;
+
+typedef struct {
+	sigset_t set;
+	sigset_t blocked;
+	unsigned long long jmpbuf[2];
+} sigjmp_buf_amd64;
+
+void
+siglongjmp(sigjmp_buf j, int ret)
+{
+	struct Ureg *u;
+	sigjmp_buf_amd64 *jb;
+
+	jb = (sigjmp_buf_amd64*)j;
+
+	if(jb->set)
+		_psigblocked = jb->blocked;
+	if(nstack == 0 || pcstack[nstack-1].u->sp > jb->jmpbuf[JMPBUFSP])
+		longjmp((void*)jb->jmpbuf, ret);
+	u = pcstack[nstack-1].u;
+	nstack--;
+	u->ax = ret;
+	if(ret == 0)
+		u->ax = 1;
+	u->ip = jb->jmpbuf[JMPBUFPC];
+	u->sp = jb->jmpbuf[JMPBUFSP] + 8;
+	_NOTED(3);	/* NRSTR */
+}
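
Aside (not part of the commit): the trampoline swaps the interrupted ip for notecont, runs the POSIX handler with the saved Ureg, and lets siglongjmp unwind either by a plain longjmp or, when the target frame lies under a nested note, by rewriting the outermost Ureg before _NOTED(3). A conventional usage sketch of the pair, with hypothetical names:

	#define _POSIX_SOURCE
	#include <signal.h>
	#include <setjmp.h>
	#include <stdio.h>

	static sigjmp_buf env;

	/* sketch: escape from a signal handler back to main;
	 * siglongjmp pops the note stack as needed */
	static void
	onintr(int sig)
	{
		siglongjmp(env, 1);
	}

	int
	main(void)
	{
		signal(SIGINT, onintr);
		if(sigsetjmp(env, 1) == 0)
			for(;;)
				;	/* wait for the signal */
		printf("interrupted\n");
		return 0;
	}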

+ 27 - 0
sys/src/ape/lib/ap/amd64/setjmp.s

@@ -0,0 +1,27 @@
+TEXT	longjmp(SB), $0
+	MOVL	r+8(FP), AX
+	CMPL	AX, $0
+	JNE	ok		/* ansi: "longjmp(0) => longjmp(1)" */
+	MOVL	$1, AX		/* bless their pointed heads */
+ok:
+	MOVQ	0(RARG), SP	/* restore sp */
+	MOVQ	8(RARG), BX	/* put return pc on the stack */
+	MOVQ	BX, 0(SP)
+	RET
+
+TEXT	setjmp(SB), $0
+	MOVQ	SP, 0(RARG)	/* store sp */
+	MOVQ	0(SP), BX	/* store return pc */
+	MOVQ	BX, 8(RARG)
+	MOVL	$0, AX		/* return 0 */
+	RET
+
+TEXT	sigsetjmp(SB), $0
+	MOVL	savemask+8(FP), BX
+	MOVL	BX, 0(RARG)
+	MOVL	$_psigblocked(SB), 4(RARG)
+	MOVQ	SP, 8(RARG)	/* store sp */
+	MOVQ	0(SP), BX	/* store return pc */
+	MOVQ	BX, 16(RARG)
+	MOVL	$0, AX	/* return 0 */
+	RET

+ 38 - 0
sys/src/ape/lib/ap/amd64/strchr.s

@@ -0,0 +1,38 @@
+	TEXT	strchr(SB), $0
+
+	MOVQ	RARG, DI
+	MOVB	c+8(FP), AX
+	CMPB	AX, $0
+	JEQ	l2	/**/
+
+/*
+ * char is not null
+ */
+l1:
+	MOVB	(DI), BX
+	CMPB	BX, $0
+	JEQ	ret0
+	ADDQ	$1, DI
+	CMPB	AX, BX
+	JNE	l1
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+/*
+ * char is null
+ */
+l2:
+	MOVQ	$-1, CX
+	CLD
+
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+ret0:
+	MOVQ	$0, AX
+	RET

+ 16 - 0
sys/src/ape/lib/ap/amd64/strlen.s

@@ -0,0 +1,16 @@
+	TEXT	strlen(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+/*
+ * look for end of string
+ */
+
+	MOVQ	RARG, DI
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	RARG, AX
+	SUBQ	$1, AX
+	RET

+ 5 - 0
sys/src/ape/lib/ap/amd64/tas.s

@@ -0,0 +1,5 @@
+TEXT	tas(SB),$0
+
+	MOVL	$0xdeadead,AX
+	XCHGL	AX,(RARG)
+	RET

+ 14 - 0
sys/src/libc/amd64/_seek.c

@@ -0,0 +1,14 @@
+#include <u.h>
+#include <libc.h>
+
+extern int _seek(vlong*, int, vlong, int);
+
+vlong
+seek(int fd, vlong o, int p)
+{
+	vlong l;
+
+	if(_seek(&l, fd, o, p) < 0)
+		l = -1LL;
+	return l;
+}

+ 4 - 0
sys/src/libc/amd64/argv0.s

@@ -0,0 +1,4 @@
+GLOBL	argv0(SB), $8
+GLOBL	_tos(SB), $8
+GLOBL	_privates(SB), $8
+GLOBL	_nprivates(SB), $4

+ 66 - 0
sys/src/libc/amd64/atom.s

@@ -0,0 +1,66 @@
+TEXT ainc(SB), 1, $0	/* long ainc(long *); */
+ainclp:
+	MOVL	(RARG), AX	/* exp */
+	MOVL	AX, BX
+	INCL	BX		/* new */
+	LOCK; CMPXCHGL BX, (RARG)
+	JNZ	ainclp
+	MOVL	BX, AX
+	RET
+
+TEXT adec(SB), 1, $0	/* long adec(long*); */
+adeclp:
+	MOVL	(RARG), AX
+	MOVL	AX, BX
+	DECL	BX
+	LOCK; CMPXCHGL BX, (RARG)
+	JNZ	adeclp
+	MOVL	BX, AX
+	RET
+
+/*
+ * int cas32(u32int *p, u32int ov, u32int nv);
+ * int cas(uint *p, int ov, int nv);
+ * int casul(ulong *p, ulong ov, ulong nv);
+ */
+
+TEXT cas32(SB), 1, $0
+TEXT cas(SB), 1, $0
+TEXT casul(SB), 1, $0
+TEXT casl(SB), 1, $0			/* back compat */
+	MOVL	exp+8(FP), AX
+	MOVL	new+16(FP), BX
+	LOCK; CMPXCHGL BX, (RARG)
+	MOVL	$1, AX				/* use CMOVLEQ etc. here? */
+	JNZ	_cas32r0
+_cas32r1:
+	RET
+_cas32r0:
+	DECL	AX
+	RET
+
+/*
+ * int cas64(u64int *p, u64int ov, u64int nv);
+ * int casp(void **p, void *ov, void *nv);
+ */
+
+TEXT cas64(SB), 1, $0
+TEXT casp(SB), 1, $0
+	MOVQ	exp+8(FP), AX
+	MOVQ	new+16(FP), BX
+	LOCK; CMPXCHGQ BX, (RARG)
+	MOVL	$1, AX				/* use CMOVLEQ etc. here? */
+	JNZ	_cas64r0
+_cas64r1:
+	RET
+_cas64r0:
+	DECL	AX
+	RET
+
+/*
+ * void mfence(void);
+ */
+TEXT mfence(SB),0,$0
+	MFENCE
+	RET
+
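
Aside (not part of the commit): ainc and adec are LOCK CMPXCHG retry loops that return the new value, and the cas family returns nonzero on success. The usual compare-and-swap retry pattern, as a sketch with a hypothetical atomicmax helper:

	#include <u.h>
	#include <libc.h>

	/* sketch: lock-free monotonic maximum built on cas();
	 * retry until *p is already >= v or our store wins */
	void
	atomicmax(uint *p, uint v)
	{
		uint o;

		do{
			o = *p;
			if(o >= v)
				return;
		}while(!cas(p, o, v));
	}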

+ 5 - 0
sys/src/libc/amd64/cycles.s

@@ -0,0 +1,5 @@
+TEXT cycles(SB),1,$0				/* time stamp counter; cycles since power up */
+	RDTSC
+	MOVL	AX, 0(RARG)			/* lo */
+	MOVL	DX, 4(RARG)			/* hi */
+	RET

+ 3 - 0
sys/src/libc/amd64/getcallerpc.s

@@ -0,0 +1,3 @@
+TEXT getcallerpc(SB), $0
+	MOVQ	-8(RARG), AX
+	RET
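
Aside (not part of the commit): getcallerpc wants the address of its caller's first argument; with all arguments on the stack, the return pc sits one word below it, at -8(RARG). A usage sketch with a hypothetical f:

	#include <u.h>
	#include <libc.h>

	/* sketch: report who called f; pass the address
	 * of f's first argument, per the convention above */
	void
	f(int x)
	{
		print("f called from %p\n", (void*)getcallerpc(&x));
	}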

+ 38 - 0
sys/src/libc/amd64/getfcr.s

@@ -0,0 +1,38 @@
+
+TEXT	setfcr(SB), $4
+	XORL	$(0x3F<<7),RARG	/* bits are cleared in csr to enable them */
+	ANDL	$0xFFC0, RARG	/* just the fcr bits */
+	WAIT	/* is this needed? */
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
+
+TEXT	getfcr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVWLZX	0(SP), AX
+	ANDL	$0xFFC0, AX
+	XORL	$(0x3F<<7),AX
+	RET
+
+TEXT	getfsr(SB), $4
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$0x3F, AX
+	RET
+
+TEXT	setfsr(SB), $4
+	ANDL	$0x3F, RARG
+	WAIT
+	STMXCSR	0(SP)
+	MOVL	0(SP), AX
+	ANDL	$~0x3F, AX
+	ORL	RARG, AX
+	MOVL	AX, 0(SP)
+	LDMXCSR	0(SP)
+	RET
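
Aside (not part of the commit): MXCSR uses a set bit to mask an exception, while Plan 9's fcr convention uses a set bit to enable one, hence the XORL of the six mask bits (0x3F<<7, bits 7-12). A sketch of enabling traps, with a hypothetical fptraps helper:

	#include <u.h>
	#include <libc.h>

	/* sketch: make invalid operations and divide-by-zero
	 * deliver a floating point note instead of quietly
	 * producing NaN or Inf */
	void
	fptraps(void)
	{
		setfcr(getfcr() | FPINVAL | FPZDIV);
	}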

+ 19 - 0
sys/src/libc/amd64/main9.s

@@ -0,0 +1,19 @@
+#define NPRIVATES	16
+
+TEXT	_main(SB), 1, $(2*8+NPRIVATES*8)
+	MOVQ	AX, _tos(SB)
+	LEAQ	16(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVL	$NPRIVATES, _nprivates(SB)
+	MOVL	inargc-8(FP), RARG
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	CALL	main(SB)
+
+loop:
+	MOVQ	$_exits<>(SB), RARG
+	CALL	exits(SB)
+	JMP	loop
+
+DATA	_exits<>+0(SB)/4, $"main"
+GLOBL	_exits<>+0(SB), $5

+ 41 - 0
sys/src/libc/amd64/main9p.s

@@ -0,0 +1,41 @@
+#define NPRIVATES	16
+
+TEXT _mainp(SB), 1, $(2*8+NPRIVATES*8)
+	MOVQ	AX, _tos(SB)		/* _tos = arg */
+	LEAQ	16(SP), AX
+	MOVQ	AX, _privates(SB)
+	MOVL	$NPRIVATES, _nprivates(SB)
+
+	CALL	_profmain(SB)		/* _profmain(); */
+
+	MOVQ	_tos+0(SB), DX		/* _tos->prof.pp = _tos->prof.next; */
+	MOVQ	8(DX), CX
+	MOVQ	CX, (DX)
+
+	MOVL	inargc-8(FP), RARG	/* main(argc, argv); */
+	LEAQ	inargv+0(FP), AX
+	MOVQ	AX, 8(SP)
+	CALL	main(SB)
+
+loop:
+	MOVQ	$_exits<>(SB), RARG
+	CALL	exits(SB)
+	MOVQ	$_profin(SB), AX	/* force loading of profile */
+	JMP	loop
+
+TEXT	_savearg(SB), 1, $0
+	MOVQ	RARG, AX
+	RET
+
+TEXT	_saveret(SB), 1, $0
+	RET
+
+TEXT	_restorearg(SB), 1, $0
+	RET				/* we want RARG in RARG */
+
+TEXT	_callpc(SB), 1, $0
+	MOVQ	8(RARG), AX
+	RET
+
+DATA	_exits<>+0(SB)/4, $"main"
+GLOBL	_exits<>+0(SB), $5

+ 58 - 0
sys/src/libc/amd64/memccpy.s

@@ -0,0 +1,58 @@
+	TEXT	memccpy(SB),$0
+
+	MOVL	n+24(FP), CX
+	CMPL	CX, $0
+	JEQ	none
+	MOVQ	p2+8(FP), DI
+	MOVBLZX	c+16(FP), AX
+	CLD
+/*
+ * find the character in the second string
+ */
+
+	REPN;	SCASB
+	JEQ	found
+
+/*
+ * if not found, set count to 'n'
+ */
+none:
+	MOVL	$0, AX
+	MOVL	n+24(FP), BX
+	JMP	memcpy
+
+/*
+ * if found, set count to bytes thru character
+ */
+found:
+	MOVQ	DI, AX
+	SUBQ	p2+8(FP), AX
+	MOVQ	AX, BX
+	ADDQ	RARG, AX
+
+/*
+ * copy the memory
+ */
+
+memcpy:
+	MOVQ	RARG, DI
+	MOVQ	p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, DX
+	ORQ	SI, DX
+	ANDL	$3, DX
+	JNE	c3
+	MOVL	BX, CX
+	SHRQ	$2, CX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	RET
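
Aside (not part of the commit): memccpy copies at most n bytes, stops after the first occurrence of c, and returns a pointer just past the copied c in the destination, or nil if c never appeared. A sketch with a hypothetical copystr helper:

	#include <u.h>
	#include <libc.h>

	/* sketch: bounded string copy on top of memccpy;
	 * always NUL-terminates dst (assumes n > 0) */
	void
	copystr(char *dst, char *src, long n)
	{
		if(memccpy(dst, src, '\0', n) == nil)
			dst[n-1] = '\0';	/* source truncated */
	}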

+ 23 - 0
sys/src/libc/amd64/memchr.s

@@ -0,0 +1,23 @@
+	TEXT	memchr(SB),$0
+
+	MOVL	n+16(FP), CX
+	CMPL	CX, $0
+	JEQ	none
+	MOVQ	RARG, DI
+	MOVBLZX	c+8(FP), AX
+	CLD
+/*
+ * SCASB is the memchr instruction
+ */
+
+	REPN;	SCASB
+	JEQ	found
+
+none:
+	MOVL	$0, AX
+	RET
+
+found:
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET

+ 52 - 0
sys/src/libc/amd64/memcmp.s

@@ -0,0 +1,52 @@
+	TEXT	memcmp(SB),$0
+
+	MOVL	n+16(FP), BX
+	CMPL	BX, $0
+	JEQ	none
+	MOVQ	RARG, DI
+	MOVQ	p2+8(FP), SI
+	CLD
+	MOVQ	DI, CX
+	ORQ	SI, CX
+	ANDL	$3, CX
+	JNE	c3
+/*
+ * first by longs
+ */
+
+	MOVL	BX, CX
+	SHRQ	$2, CX
+
+	REP;	CMPSL
+	JNE	found
+
+/*
+ * then by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVL	BX, CX
+	REP;	CMPSB
+	JNE	found1
+
+none:
+	MOVQ	$0, AX
+	RET
+
+/*
+ * if long found,
+ * back up and look by bytes
+ */
+found:
+	MOVL	$4, CX
+	SUBQ	CX, DI
+	SUBQ	CX, SI
+	REP;	CMPSB
+
+found1:
+	JLS	lt
+	MOVQ	$-1, AX
+	RET
+lt:
+	MOVQ	$1, AX
+	RET

+ 81 - 0
sys/src/libc/amd64/memcpy.s

@@ -0,0 +1,81 @@
+TEXT memcpy(SB), $0
+	MOVQ	RARG, DI
+	MOVQ	DI, AX			/* return value */
+	MOVQ	p2+8(FP), SI
+	MOVL	n+16(FP), BX
+	CMPL	BX, $0
+	JGT	_ok
+	JEQ	_return			/* nothing to do if n == 0 */
+	MOVL	$0, SI			/* fault if n < 0 */
+
+/*
+ * check and set for backwards:
+ *	(p2 < p1) && ((p2+n) > p1)
+ */
+_ok:
+	CMPQ	SI, DI
+	JGT	_forward
+	JEQ	_return			/* nothing to do if p2 == p1 */
+	MOVQ	SI, DX
+	ADDQ	BX, DX
+	CMPQ	DX, DI
+	JGT	_back
+
+/*
+ * copy whole longs if aligned
+ */
+_forward:
+	CLD
+	MOVQ	SI, DX
+	ORQ	DI, DX
+	ANDL	$3, DX
+	JNE	c3f
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+c3f:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	RET
+
+/*
+ * whole thing backwards has
+ * adjusted addresses
+ */
+_back:
+	ADDQ	BX, DI
+	ADDQ	BX, SI
+	STD
+	SUBQ	$4, DI
+	SUBQ	$4, SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, DX
+	ORQ	SI, DX
+	ANDL	$3, DX
+	JNE	c3b
+	MOVL	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+
+c3b:
+	ADDQ	$3, DI
+	ADDQ	$3, SI
+	MOVL	BX, CX
+	REP;	MOVSB
+
+_return:
+	RET

+ 81 - 0
sys/src/libc/amd64/memmove.s

@@ -0,0 +1,81 @@
+TEXT memmove(SB), $0
+	MOVQ	RARG, DI
+	MOVQ	DI, AX			/* return value */
+	MOVQ	p2+8(FP), SI
+	MOVL	n+16(FP), BX
+	CMPL	BX, $0
+	JGT	_ok
+	JEQ	_return			/* nothing to do if n == 0 */
+	MOVL	$0, SI			/* fault if n < 0 */
+
+/*
+ * check and set for backwards:
+ *	(p2 < p1) && ((p2+n) > p1)
+ */
+_ok:
+	CMPQ	SI, DI
+	JGT	_forward
+	JEQ	_return			/* nothing to do if p2 == p1 */
+	MOVQ	SI, DX
+	ADDQ	BX, DX
+	CMPQ	DX, DI
+	JGT	_back
+
+/*
+ * copy whole longs if aligned
+ */
+_forward:
+	CLD
+	MOVQ	SI, DX
+	ORQ	DI, DX
+	ANDL	$3, DX
+	JNE	c3f
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+c3f:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	RET
+
+/*
+ * whole thing backwards has
+ * adjusted addresses
+ */
+_back:
+	ADDQ	BX, DI
+	ADDQ	BX, SI
+	STD
+	SUBQ	$4, DI
+	SUBQ	$4, SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, DX
+	ORQ	SI, DX
+	ANDL	$3, DX
+	JNE	c3b
+	MOVL	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	JEQ	_return			/* flags set by above ANDL */
+
+c3b:
+	ADDQ	$3, DI
+	ADDQ	$3, SI
+	MOVL	BX, CX
+	REP;	MOVSB
+
+_return:
+	RET

+ 41 - 0
sys/src/libc/amd64/memset.s

@@ -0,0 +1,41 @@
+	TEXT	memset(SB),$0
+
+	CLD
+	MOVQ	RARG, DI
+	MOVBLZX	c+8(FP), AX
+	MOVL	n+16(FP), BX
+/*
+ * if not enough bytes, just set bytes
+ */
+	CMPL	BX, $9
+	JLS	c3
+/*
+ * if not aligned, just set bytes
+ */
+	MOVQ	RARG, CX
+	ANDL	$3,CX
+	JNE	c3
+/*
+ * build word in AX
+ */
+	MOVB	AL, AH
+	MOVL	AX, CX
+	SHLL	$16, CX
+	ORL	CX, AX
+/*
+ * set whole longs
+ */
+c1:
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	ANDL	$3, BX
+	REP;	STOSL
+/*
+ * set the rest, by bytes
+ */
+c3:
+	MOVL	BX, CX
+	REP;	STOSB
+ret:
+	MOVQ	RARG,AX
+	RET

+ 42 - 0
sys/src/libc/amd64/mkfile

@@ -0,0 +1,42 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libc.a
+SFILES=\
+	argv0.s\
+	atom.s\
+	cycles.s\
+	getcallerpc.s\
+	getfcr.s\
+	main9.s\
+	main9p.s\
+	memccpy.s\
+	memchr.s\
+	memcmp.s\
+	memcpy.s\
+	memmove.s\
+	memset.s\
+	muldiv.s\
+	rdpmc.s\
+	setjmp.s\
+	sqrt.s\
+	strcat.s\
+	strchr.s\
+	strcpy.s\
+	strlen.s\
+	tas.s\
+
+CFILES=\
+	_seek.c\
+	notejmp.c\
+
+HFILES=/sys/include/libc.h
+
+OFILES=${CFILES:%.c=%.$O} ${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$CFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib

+ 12 - 0
sys/src/libc/amd64/muldiv.s

@@ -0,0 +1,12 @@
+TEXT	umuldiv(SB), $0
+	MOVL	RARG, AX
+	MULL	b+8(FP)
+	DIVL	c+16(FP)
+	RET
+
+TEXT	muldiv(SB), $0
+	MOVL	RARG, AX
+	IMULL	b+8(FP)
+	IDIVL	c+16(FP)
+	RET
+	END
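
Aside (not part of the commit): MULL leaves a 64-bit product in DX:AX and DIVL divides that by the third argument, so muldiv(a, b, c) evaluates a*b/c without the intermediate product overflowing 32 bits. A sketch with a hypothetical ticks2ms helper:

	#include <u.h>
	#include <libc.h>

	/* sketch: ticks*1000 can exceed 32 bits, but the 64-bit
	 * DX:AX intermediate keeps umuldiv exact; the quotient
	 * must still fit in 32 bits */
	ulong
	ticks2ms(ulong ticks, ulong hz)
	{
		return umuldiv(ticks, 1000, hz);
	}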

+ 16 - 0
sys/src/libc/amd64/notejmp.c

@@ -0,0 +1,16 @@
+#include <u.h>
+#include <libc.h>
+#include <ureg.h>
+
+void
+notejmp(void *vr, jmp_buf j, int ret)
+{
+	struct Ureg *r = vr;
+
+	r->ax = ret;
+	if(ret == 0)
+		r->ax = 1;
+	r->ip = j[JMPBUFPC];
+	r->sp = j[JMPBUFSP] + 8;
+	noted(NCONT);
+}

+ 9 - 0
sys/src/libc/amd64/rdpmc.s

@@ -0,0 +1,9 @@
+MODE $64
+
+TEXT rdpmc(SB), 1, $-4				/* performance monitor counter */
+	MOVL	RARG, CX
+	RDPMC						/* read CX performance counter */
+	XCHGL	DX, AX				/* swap lo/hi, zero-extend */
+	SHLQ	$32, AX				/* hi<<32 */
+	ORQ	DX, AX					/* (hi<<32)|lo */
+	RET

+ 17 - 0
sys/src/libc/amd64/setjmp.s

@@ -0,0 +1,17 @@
+TEXT	longjmp(SB), $0
+	MOVL	r+8(FP), AX
+	CMPL	AX, $0
+	JNE	ok		/* ansi: "longjmp(0) => longjmp(1)" */
+	MOVL	$1, AX		/* bless their pointed heads */
+ok:
+	MOVQ	0(RARG), SP	/* restore sp */
+	MOVQ	8(RARG), BX	/* put return pc on the stack */
+	MOVQ	BX, 0(SP)
+	RET
+
+TEXT	setjmp(SB), $0
+	MOVQ	SP, 0(RARG)	/* store sp */
+	MOVQ	0(SP), BX	/* store return pc */
+	MOVQ	BX, 8(RARG)
+	MOVL	$0, AX		/* return 0 */
+	RET
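
Aside (not part of the commit): the jmp_buf here is just two words, sp then pc; longjmp reloads SP and plants the saved pc as the return address. The classic recovery pattern, as a sketch with hypothetical fail/run helpers:

	#include <u.h>
	#include <libc.h>

	static jmp_buf errjmp;

	/* sketch: error recovery over the two-word (sp, pc)
	 * jmp_buf saved by setjmp */
	void
	fail(char *msg)
	{
		fprint(2, "error: %s\n", msg);
		longjmp(errjmp, 1);
	}

	void
	run(void)
	{
		if(setjmp(errjmp)){
			/* resumed here after fail() */
			return;
		}
		/* ... work that may call fail() ... */
	}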

+ 4 - 0
sys/src/libc/amd64/sqrt.s

@@ -0,0 +1,4 @@
+TEXT	sqrt(SB), $0
+	MOVSD	a+0(FP), X0
+	SQRTSD	X0, X0
+	RET

+ 48 - 0
sys/src/libc/amd64/strcat.s

@@ -0,0 +1,48 @@
+	TEXT	strcat(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+
+/*
+ * find length of second string
+ */
+
+	MOVQ	p2+8(FP), DI
+	REPN;	SCASB
+
+	MOVQ	DI, BX
+	SUBQ	p2+8(FP), BX
+
+/*
+ * find end of first string
+ */
+
+	MOVQ	RARG, DI
+	REPN;	SCASB
+
+/*
+ * copy the memory
+ */
+	SUBQ	$1, DI
+	MOVQ	p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, CX
+	ORQ	SI, CX
+	ANDL	$3, CX
+	JNE	c3
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVQ	BX, CX
+	REP;	MOVSB
+
+	MOVQ	RARG, AX
+	RET

+ 38 - 0
sys/src/libc/amd64/strchr.s

@@ -0,0 +1,38 @@
+	TEXT	strchr(SB), $0
+
+	MOVQ	RARG, DI
+	MOVB	c+8(FP), AX
+	CMPB	AX, $0
+	JEQ	l2	/**/
+
+/*
+ * char is not null
+ */
+l1:
+	MOVB	(DI), BX
+	CMPB	BX, $0
+	JEQ	ret0
+	ADDQ	$1, DI
+	CMPB	AX, BX
+	JNE	l1
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+/*
+ * char is null
+ */
+l2:
+	MOVQ	$-1, CX
+	CLD
+
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	$1, AX
+	RET
+
+ret0:
+	MOVQ	$0, AX
+	RET

+ 40 - 0
sys/src/libc/amd64/strcpy.s

@@ -0,0 +1,40 @@
+	TEXT	strcpy(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+/*
+ * find end of second string
+ */
+
+	MOVQ	p2+8(FP), DI
+	REPN;	SCASB
+
+	MOVQ	DI, BX
+	SUBQ	p2+8(FP), BX
+
+/*
+ * copy the memory
+ */
+	MOVQ	RARG, DI
+	MOVQ	p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+	MOVQ	DI, CX
+	ORQ		SI, CX
+	ANDL	$3, CX
+	JNE	c3
+	MOVQ	BX, CX
+	SHRQ	$2, CX
+	REP;	MOVSL
+/*
+ * copy the rest, by bytes
+ */
+	ANDL	$3, BX
+c3:
+	MOVL	BX, CX
+	REP;	MOVSB
+
+	MOVQ	RARG, AX
+	RET

+ 16 - 0
sys/src/libc/amd64/strlen.s

@@ -0,0 +1,16 @@
+	TEXT	strlen(SB),$0
+
+	MOVL	$0, AX
+	MOVQ	$-1, CX
+	CLD
+/*
+ * look for end of string
+ */
+
+	MOVQ	RARG, DI
+	REPN;	SCASB
+
+	MOVQ	DI, AX
+	SUBQ	RARG, AX
+	SUBQ	$1, AX
+	RET

+ 8 - 0
sys/src/libc/amd64/tas.s

@@ -0,0 +1,8 @@
+/*
+ * The kernel and the libc use the same constant for TAS
+ */
+TEXT	_tas(SB),$0
+
+	MOVL	$0xdeaddead,AX
+	XCHGL	AX,(RARG)
+	RET

+ 20 - 0
sys/src/libmp/amd64/mkfile

@@ -0,0 +1,20 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libmp.a
+SFILES=\
+	mpdigdiv.s\
+	mpvecadd.s\
+	mpvecdigmuladd.s\
+	mpvecdigmulsub.s\
+	mpvecsub.s\
+
+HFILES=/$objtype/include/u.h /sys/include/mp.h ../port/dat.h
+
+OFILES=${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib

+ 21 - 0
sys/src/libmp/amd64/mpdigdiv.s

@@ -0,0 +1,21 @@
+TEXT	mpdigdiv(SB),$0
+
+/*	MOVL	dividend+0(FP),BX */
+	MOVL	0(RARG),AX
+	MOVL	4(RARG),DX
+	MOVL	divisor+8(FP),BX
+	MOVQ	quotient+16(FP),DI
+	XORL	CX,CX
+	CMPL	DX,BX		/* dividend >= 2^32 * divisor */
+	JHS	_divovfl
+	CMPL	BX,CX		/* divisor == 0 */
+	JE	_divovfl
+	DIVL	BX		/* AX = DX:AX/BX */
+	MOVL	AX,0(DI)
+	RET
+
+	/* return all 1's */
+_divovfl:
+	NOTL	CX
+	MOVL	CX,0(DI)
+	RET
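
Aside (not part of the commit): the routine divides the two-digit value dividend[1]:dividend[0] by divisor with DIVL and answers all 1s when the quotient would not fit in one digit or the divisor is zero. The same contract in portable C, assuming 32-bit mpdigits; mpdigdiv_ref is a hypothetical helper, not part of libmp:

	#include <u.h>
	#include <libc.h>
	#include <mp.h>

	/* sketch: C restatement of the DIVL routine above */
	void
	mpdigdiv_ref(mpdigit *dividend, mpdigit divisor, mpdigit *quotient)
	{
		uvlong d;

		d = ((uvlong)dividend[1]<<32) | dividend[0];
		if(divisor == 0 || dividend[1] >= divisor)
			*quotient = ~(mpdigit)0;	/* overflow: all 1s */
		else
			*quotient = d / divisor;
	}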

+ 54 - 0
sys/src/libmp/amd64/mpvecadd.s

@@ -0,0 +1,54 @@
+/*
+ *	mpvecadd(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *sum)
+ *
+ *		sum[0:alen] = a[0:alen-1] + b[0:blen-1]
+ *
+ *	prereq: alen >= blen, sum has room for alen+1 digits
+ */
+TEXT	mpvecadd(SB),$0
+
+	MOVL	alen+8(FP),DX
+	MOVL	blen+24(FP),CX
+/*	MOVL	a+0(FP),SI */
+	MOVQ	RARG, SI
+	MOVQ	b+16(FP),BX
+	SUBL	CX,DX
+	MOVQ	sum+32(FP),DI
+	XORL	BP,BP			/* this also sets carry to 0 */
+
+	/* skip addition if b is zero */
+	TESTL	CX,CX
+	JZ	_add1
+
+	/* sum[0:blen-1],carry = a[0:blen-1] + b[0:blen-1] */
+_addloop1:
+	MOVL	(SI)(BP*4), AX
+	ADCL	(BX)(BP*4), AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_addloop1
+
+_add1:
+	/* jump if alen > blen */
+	INCL	DX
+	MOVL	DX,CX
+	LOOP	_addloop2
+
+	/* sum[alen] = carry */
+_addend:
+	JC	_addcarry
+	MOVL	$0,(DI)(BP*4)
+	RET
+_addcarry:
+	MOVL	$1,(DI)(BP*4)
+	RET
+
+	/* sum[blen:alen-1],carry = a[blen:alen-1] + 0 */
+_addloop2:
+	MOVL	(SI)(BP*4),AX
+	ADCL	$0,AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_addloop2
+	JMP	_addend
+

+ 53 - 0
sys/src/libmp/amd64/mpvecdigmuladd.s

@@ -0,0 +1,53 @@
+/*
+ *	mpvecdigmul(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *	p += b*m
+ *
+ *	each step looks like:
+ *		hi,lo = m*b[i]
+ *		lo += oldhi + carry
+ *		hi += carry
+ *		p[i] += lo
+ *		oldhi = hi
+ *
+ *	the registers are:
+ *		hi = DX		- constrained by hardware
+ *		lo = AX		- constrained by hardware
+ *		b+n = SI	- can't be BP
+ *		p+n = DI	- can't be BP
+ *		i-n = BP
+ *		m = BX
+ *		oldhi = CX
+ *		
+ */
+TEXT	mpvecdigmuladd(SB),$0
+
+/*	MOVQ	b+0(FP),SI	*/
+	MOVQ	RARG,SI
+	MOVL	n+8(FP),CX
+	MOVL	m+16(FP),BX
+	MOVQ	p+24(FP),DI
+	MOVL	CX,BP
+	NEGQ	BP		/* BP = -n */
+	SHLL	$2,CX
+	ADDQ	CX,SI		/* SI = b + n */
+	ADDQ	CX,DI		/* DI = p + n */
+	XORL	CX,CX
+_muladdloop:
+	MOVL	(SI)(BP*4),AX	/* lo = b[i] */
+	MULL	BX		/* hi, lo = b[i] * m */
+	ADDL	CX,AX		/* lo += oldhi */
+	JCC	_muladdnocarry1
+	INCL	DX		/* hi += carry */
+_muladdnocarry1:
+	ADDL	AX,(DI)(BP*4)	/* p[i] += lo */
+	JCC	_muladdnocarry2
+	INCL	DX		/* hi += carry */
+_muladdnocarry2:
+	MOVL	DX,CX		/* oldhi = hi */
+	INCQ	BP		/* i++ */
+	JNZ	_muladdloop
+	XORL	AX,AX
+	ADDL	CX,(DI)(BP*4)	/* p[n] += oldhi */
+	ADCL	AX,AX		/* return carry out of p[n] */
+	RET
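
Aside (not part of the commit): each pass forms hi,lo = b[i]*m in DX:AX, folds the previous hi plus the carries from the two 32-bit adds into the running total, and the final ADCL returns the carry out of p[n]. The same computation with a 64-bit accumulator; mpvecdigmuladd_ref is a hypothetical helper, not part of libmp:

	#include <u.h>
	#include <libc.h>
	#include <mp.h>

	/* sketch: p[0..n] += b[0..n-1] * m, returning the
	 * carry out of p[n]; assumes 32-bit mpdigits */
	int
	mpvecdigmuladd_ref(mpdigit *b, int n, mpdigit m, mpdigit *p)
	{
		int i;
		uvlong hi, t;

		hi = 0;
		for(i = 0; i < n; i++){
			t = (uvlong)b[i]*m + hi + p[i];
			p[i] = (mpdigit)t;
			hi = t>>32;
		}
		t = (uvlong)p[n] + hi;
		p[n] = (mpdigit)t;
		return t>>32;
	}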

+ 53 - 0
sys/src/libmp/amd64/mpvecdigmulsub.s

@@ -0,0 +1,53 @@
+/*
+ *	mpvecdigmulsub(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *	p -= b*m
+ *
+ *	each step looks like:
+ *		hi,lo = m*b[i]
+ *		lo += oldhi + carry
+ *		hi += carry
+ *		p[i] += lo
+ *		oldhi = hi
+ *
+ *	the registers are:
+ *		hi = DX		- constrained by hardware
+ *		lo = AX		- constrained by hardware
+ *		b = SI		- can't be BP
+ *		p = DI		- can't be BP
+ *		i = BP
+ *		n = CX		- constrained by LOOP instr
+ *		m = BX
+ *		oldhi = R8
+ *		
+ */
+TEXT	mpvecdigmulsub(SB),$0
+
+/*	MOVL	b+0(FP),SI	*/
+	MOVQ	RARG,SI
+	MOVL	n+8(FP),CX
+	MOVL	m+16(FP),BX
+	MOVQ	p+24(FP),DI
+	XORL	BP,BP
+	MOVL	BP,R8
+_mulsubloop:
+	MOVL	(SI)(BP*4),AX		/* lo = b[i] */
+	MULL	BX			/* hi, lo = b[i] * m */
+	ADDL	R8,AX		/* lo += oldhi */
+	JCC	_mulsubnocarry1
+	INCL	DX			/* hi += carry */
+_mulsubnocarry1:
+	SUBL	AX,(DI)(BP*4)
+	JCC	_mulsubnocarry2
+	INCL	DX			/* hi += carry */
+_mulsubnocarry2:
+	MOVL	DX,R8
+	INCL	BP
+	LOOP	_mulsubloop
+	SUBL	R8,(DI)(BP*4)
+	JCC	_mulsubnocarry3
+	MOVQ	$-1,AX
+	RET
+_mulsubnocarry3:
+	MOVQ	$1,AX
+	RET

+ 45 - 0
sys/src/libmp/amd64/mpvecsub.s

@@ -0,0 +1,45 @@
+/*
+ *	mpvecsub(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *diff)
+ *
+ *		diff[0:alen-1] = a[0:alen-1] - b[0:blen-1]
+ *
+ *	prereq: alen >= blen, diff has room for alen digits
+ */
+TEXT	mpvecsub(SB),$0
+
+/*	MOVQ	a+0(FP),SI */
+	MOVQ	RARG, SI
+	MOVQ	b+16(FP),BX
+	MOVL	alen+8(FP),DX
+	MOVL	blen+24(FP),CX
+	MOVQ	diff+32(FP),DI
+	SUBL	CX,DX
+	XORL	BP,BP			/* this also sets carry to 0 */
+
+	/* skip subtraction if b is zero */
+	TESTL	CX,CX
+	JZ	_sub1
+
+	/* diff[0:blen-1],borrow = a[0:blen-1] - b[0:blen-1] */
+_subloop1:
+	MOVL	(SI)(BP*4),AX
+	SBBL	(BX)(BP*4),AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_subloop1
+
+_sub1:
+	INCL	DX
+	MOVL	DX,CX
+	LOOP	_subloop2
+	RET
+
+	/* diff[blen:alen-1] = a[blen:alen-1] - 0 */
+_subloop2:
+	MOVL	(SI)(BP*4),AX
+	SBBL	$0,AX
+	MOVL	AX,(DI)(BP*4)
+	INCL	BP
+	LOOP	_subloop2
+	RET
+

+ 11 - 0
sys/src/libsec/amd64/mkfile

@@ -0,0 +1,11 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libsec.a
+OFILES=	\
+
+HFILES=/sys/include/libsec.h
+
+UPDATE=mkfile
+
+</sys/src/cmd/mksyslib

+ 2 - 2
sys/src/mkfile.proto

@@ -2,8 +2,8 @@
 # common mkfile parameters shared by all architectures
 #
 
-OS=58qv
-CPUS=arm 386 power mips
+OS=568qv
+CPUS=arm amd64 386 power mips
 CFLAGS=-FTVw
 LEX=lex
 YACC=yacc