Browse Source

all: import spim support from bitbucket.org/cherry9/plan9-loongson (thanks Cherry Zhang)

David du Colombier 2 years ago
parent
commit
356639b147
71 changed files with 4970 additions and 11 deletions
  1. 2 0
      spim/bin/ape/psh
  2. 73 0
      spim/include/ape/float.h
  3. 78 0
      spim/include/ape/math.h
  4. 18 0
      spim/include/ape/stdarg.h
  5. 52 0
      spim/include/ape/ureg.h
  6. 69 0
      spim/include/u.h
  7. 44 0
      spim/include/ureg.h
  8. 9 0
      spim/mkfile
  9. 3 0
      sys/src/ape/lib/9/spim/getcallerpc.s
  10. 15 0
      sys/src/ape/lib/9/spim/getfcr.s
  11. 52 0
      sys/src/ape/lib/ap/spim/atom.s
  12. 3 0
      sys/src/ape/lib/ap/spim/c_fcr0.s
  13. 5 0
      sys/src/ape/lib/ap/spim/cycles.c
  14. 15 0
      sys/src/ape/lib/ap/spim/getfcr.s
  15. 36 0
      sys/src/ape/lib/ap/spim/lock.c
  16. 177 0
      sys/src/ape/lib/ap/spim/lock.pre-sema.c
  17. 12 0
      sys/src/ape/lib/ap/spim/main9.s
  18. 54 0
      sys/src/ape/lib/ap/spim/main9p.s
  19. 39 0
      sys/src/ape/lib/ap/spim/memchr.s
  20. 116 0
      sys/src/ape/lib/ap/spim/memcmp.s
  21. 161 0
      sys/src/ape/lib/ap/spim/memmove.s
  22. 88 0
      sys/src/ape/lib/ap/spim/memset.s
  23. 27 0
      sys/src/ape/lib/ap/spim/mkfile
  24. 72 0
      sys/src/ape/lib/ap/spim/notetramp.c
  25. 24 0
      sys/src/ape/lib/ap/spim/setjmp.s
  26. 63 0
      sys/src/ape/lib/ap/spim/strchr.s
  27. 21 0
      sys/src/ape/lib/ap/spim/strcmp.s
  28. 92 0
      sys/src/ape/lib/ap/spim/strcpy.s
  29. 30 0
      sys/src/ape/lib/ap/spim/tas.s
  30. 17 0
      sys/src/ape/lib/ap/spim/vlop.s
  31. 719 0
      sys/src/ape/lib/ap/spim/vlrt.c
  32. 26 0
      sys/src/ape/lib/mp/spim/mkfile
  33. 23 0
      sys/src/ape/lib/sec/spim/mkfile
  34. 2 0
      sys/src/cmd/gs/arch.h
  35. 46 0
      sys/src/cmd/gs/spim.h
  36. 2 0
      sys/src/cmd/vc/swt.c
  37. 32 9
      sys/src/cmd/vl/asm.c
  38. 4 0
      sys/src/libc/spim/argv0.s
  39. 52 0
      sys/src/libc/spim/atom.s
  40. 3 0
      sys/src/libc/spim/c_fcr0.s
  41. 10 0
      sys/src/libc/spim/cycles.c
  42. 3 0
      sys/src/libc/spim/getcallerpc.s
  43. 15 0
      sys/src/libc/spim/getfcr.s
  44. 25 0
      sys/src/libc/spim/main9.s
  45. 41 0
      sys/src/libc/spim/main9p.s
  46. 20 0
      sys/src/libc/spim/memccpy.s
  47. 39 0
      sys/src/libc/spim/memchr.s
  48. 116 0
      sys/src/libc/spim/memcmp.s
  49. 237 0
      sys/src/libc/spim/memmove.s
  50. 88 0
      sys/src/libc/spim/memset.s
  51. 40 0
      sys/src/libc/spim/mkfile
  52. 16 0
      sys/src/libc/spim/notejmp.c
  53. 14 0
      sys/src/libc/spim/setjmp.s
  54. 103 0
      sys/src/libc/spim/sqrt.c
  55. 63 0
      sys/src/libc/spim/strchr.s
  56. 21 0
      sys/src/libc/spim/strcmp.s
  57. 92 0
      sys/src/libc/spim/strcpy.s
  58. 30 0
      sys/src/libc/spim/tas.s
  59. 17 0
      sys/src/libc/spim/vlop.s
  60. 722 0
      sys/src/libc/spim/vlrt.c
  61. 2 2
      sys/src/libmach/vdb.c
  62. 21 0
      sys/src/libmp/spim/mkfile
  63. 67 0
      sys/src/libmp/spim/mpvecadd.s
  64. 58 0
      sys/src/libmp/spim/mpvecdigmuladd.s
  65. 61 0
      sys/src/libmp/spim/mpvecdigmulsub.s
  66. 66 0
      sys/src/libmp/spim/mpvecsub.s
  67. 296 0
      sys/src/libsec/spim/md5block.s
  68. 19 0
      sys/src/libsec/spim/mkfile
  69. 220 0
      sys/src/libsec/spim/sha1block.s
  70. 26 0
      sys/src/libthread/spim.c
  71. 46 0
      sys/src/libthread/xincspim.s

+ 2 - 0
spim/bin/ape/psh

@@ -0,0 +1,2 @@
+#!/bin/rc
+exec /rc/bin/ape/psh $*

+ 73 - 0
spim/include/ape/float.h

@@ -0,0 +1,73 @@
+#ifndef __FLOAT
+#define __FLOAT
+/* IEEE, default rounding */
+
+#define FLT_ROUNDS	1
+#define FLT_RADIX	2
+
+#define FLT_DIG		6
+#define FLT_EPSILON	1.19209290e-07
+#define FLT_MANT_DIG	24
+#define FLT_MAX		3.40282347e+38
+#define FLT_MAX_10_EXP	38
+#define FLT_MAX_EXP	128
+#define FLT_MIN		1.17549435e-38
+#define FLT_MIN_10_EXP	-37
+#define FLT_MIN_EXP	-125
+
+#define DBL_DIG		15
+#define DBL_EPSILON	2.2204460492503131e-16
+#define DBL_MANT_DIG	53
+#define DBL_MAX		1.797693134862315708145e+308
+#define DBL_MAX_10_EXP	308
+#define DBL_MAX_EXP	1024
+#define DBL_MIN		2.225073858507201383090233e-308
+#define DBL_MIN_10_EXP	-307
+#define DBL_MIN_EXP	-1021
+#define LDBL_MANT_DIG	DBL_MANT_DIG
+#define LDBL_EPSILON	DBL_EPSILON
+#define LDBL_DIG	DBL_DIG
+#define LDBL_MIN_EXP	DBL_MIN_EXP
+#define LDBL_MIN	DBL_MIN
+#define LDBL_MIN_10_EXP	DBL_MIN_10_EXP
+#define LDBL_MAX_EXP	DBL_MAX_EXP
+#define LDBL_MAX	DBL_MAX
+#define LDBL_MAX_10_EXP	DBL_MAX_10_EXP
+
+typedef 	union FPdbleword FPdbleword;
+union FPdbleword
+{
+	double	x;
+	struct {	/* little endian */
+		long lo;
+		long hi;
+	};
+};
+
+#ifdef _RESEARCH_SOURCE
+/* define stuff needed for floating conversion */
+#define IEEE_8087	1
+#define Sudden_Underflow 1
+#endif
+#ifdef _PLAN9_SOURCE
+/* FCR */
+#define	FPINEX	(1<<7)
+#define	FPOVFL	(1<<9)
+#define	FPUNFL	(1<<8)
+#define	FPZDIV	(1<<10)
+#define	FPRNR	(0<<0)
+#define	FPRZ	(1<<0)
+#define	FPRPINF	(2<<0)
+#define	FPRNINF	(3<<0)
+#define	FPRMASK	(3<<0)
+#define	FPPEXT	0
+#define	FPPSGL	0
+#define	FPPDBL	0
+#define	FPPMASK	0
+/* FSR */
+#define	FPAINEX	(1<<2)
+#define	FPAOVFL	(1<<4)
+#define	FPAUNFL	(1<<3)
+#define	FPAZDIV	(1<<5)
+#endif
+#endif /* __FLOAT */

+ 78 - 0
spim/include/ape/math.h

@@ -0,0 +1,78 @@
+#ifndef __MATH
+#define __MATH
+#pragma lib "/$M/lib/ape/libap.a"
+
+/* a HUGE_VAL appropriate for IEEE double-precision */
+/* the correct value, 1.797693134862316e+308, causes a ken overflow */
+#define HUGE_VAL 1.79769313486231e+308
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern double acos(double);
+extern double asin(double);
+extern double atan(double);
+extern double atan2(double, double);
+extern double cos(double);
+extern double hypot(double, double);
+extern double sin(double);
+extern double tan(double);
+extern double cosh(double);
+extern double sinh(double);
+extern double tanh(double);
+extern double exp(double);
+extern double frexp(double, int *);
+extern double ldexp(double, int);
+extern double log(double);
+extern double log10(double);
+extern double modf(double, double *);
+extern double pow(double, double);
+extern double sqrt(double);
+extern double ceil(double);
+extern double fabs(double);
+extern double floor(double);
+extern double fmod(double, double);
+extern double NaN(void);
+extern int isNaN(double);
+extern double Inf(int);
+extern int isInf(double, int);
+
+#ifdef _RESEARCH_SOURCE
+/* does >> treat left operand as unsigned ? */
+#define Unsigned_Shifts 1
+#define	M_E		2.7182818284590452354	/* e */
+#define	M_LOG2E		1.4426950408889634074	/* log 2e */
+#define	M_LOG10E	0.43429448190325182765	/* log 10e */
+#define	M_LN2		0.69314718055994530942	/* log e2 */
+#define	M_LN10		2.30258509299404568402	/* log e10 */
+#define	M_PI		3.14159265358979323846	/* pi */
+#define	M_PI_2		1.57079632679489661923	/* pi/2 */
+#define	M_PI_4		0.78539816339744830962	/* pi/4 */
+#define	M_1_PI		0.31830988618379067154	/* 1/pi */
+#define	M_2_PI		0.63661977236758134308	/* 2/pi */
+#define	M_2_SQRTPI	1.12837916709551257390	/* 2/sqrt(pi) */
+#define	M_SQRT2		1.41421356237309504880	/* sqrt(2) */
+#define	M_SQRT1_2	0.70710678118654752440	/* 1/sqrt(2) */
+
+extern double hypot(double, double);
+extern double erf(double);
+extern double erfc(double);
+extern double j0(double);
+extern double y0(double);
+extern double j1(double);
+extern double y1(double);
+extern double jn(int, double);
+extern double yn(int, double);
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#define isnan(x) isNaN(x)
+#define isinf(x) isInf(x, 0)
+
+#endif /* __MATH */

+ 18 - 0
spim/include/ape/stdarg.h

@@ -0,0 +1,18 @@
+#ifndef __STDARG
+#define __STDARG
+
+typedef char *va_list;
+
+#define va_start(list, start) list =\
+	(sizeof(start) < 4?\
+		(char*)((int*)&(start)+1):\
+		(char*)(&(start)+1))
+#define va_end(list)
+#define va_arg(list, mode)\
+	((sizeof(mode) == 1)?\
+		((list += 4), (mode*)list)[-4]:\
+	(sizeof(mode) == 2)?\
+		((list += 4), (mode*)list)[-2]:\
+		((list += sizeof(mode)), (mode*)list)[-1])
+
+#endif /* __STDARG */

+ 52 - 0
spim/include/ape/ureg.h

@@ -0,0 +1,52 @@
+#ifndef __UREG_H
+#define __UREG_H
+#if !defined(_PLAN9_SOURCE)
+    This header file is an extension to ANSI/POSIX
+#endif
+
+struct Ureg
+{
+	unsigned long	status;
+	unsigned long	pc;
+	union{
+		unsigned long	sp;		/* r29 */
+		unsigned long	usp;		/* r29 */
+	};
+	unsigned long	cause;
+	unsigned long	badvaddr;
+	unsigned long	tlbvirt;
+	unsigned long	hi;
+	unsigned long	lo;
+	unsigned long	r31;
+	unsigned long	r30;
+	unsigned long	r28;
+	unsigned long	r27;		/* unused */
+	unsigned long	r26;		/* unused */
+	unsigned long	r25;
+	unsigned long	r24;
+	unsigned long	r23;
+	unsigned long	r22;
+	unsigned long	r21;
+	unsigned long	r20;
+	unsigned long	r19;
+	unsigned long	r18;
+	unsigned long	r17;
+	unsigned long	r16;
+	unsigned long	r15;
+	unsigned long	r14;
+	unsigned long	r13;
+	unsigned long	r12;
+	unsigned long	r11;
+	unsigned long	r10;
+	unsigned long	r9;
+	unsigned long	r8;
+	unsigned long	r7;
+	unsigned long	r6;
+	unsigned long	r5;
+	unsigned long	r4;
+	unsigned long	r3;
+	unsigned long	r2;
+	unsigned long	r1;
+};
+
+#endif

+ 69 - 0
spim/include/u.h

@@ -0,0 +1,69 @@
+#define nil		((void*)0)
+typedef	unsigned short	ushort;
+typedef	unsigned char	uchar;
+typedef	unsigned long	ulong;
+typedef	unsigned int	uint;
+typedef	signed char	schar;
+typedef	long long	vlong;
+typedef	unsigned long long uvlong;
+typedef unsigned long	uintptr;
+typedef unsigned long	usize;
+typedef	uint		Rune;
+typedef 	union FPdbleword FPdbleword;
+typedef long	jmp_buf[2];
+#define	JMPBUFSP	0
+#define	JMPBUFPC	1
+#define	JMPBUFDPC	0
+typedef unsigned int	mpdigit;	/* for /sys/include/mp.h */
+typedef unsigned char u8int;
+typedef unsigned short u16int;
+typedef unsigned int	u32int;
+typedef unsigned long long u64int;
+
+/* FCR (FCR31) */
+#define	FPINEX	(1<<7)		/* enables */
+#define	FPUNFL	(1<<8)
+#define	FPOVFL	(1<<9)
+#define	FPZDIV	(1<<10)
+#define	FPINVAL	(1<<11)
+#define	FPRNR	(0<<0)		/* rounding modes */
+#define	FPRZ	(1<<0)
+#define	FPRPINF	(2<<0)
+#define	FPRNINF	(3<<0)
+#define	FPRMASK	(3<<0)
+#define	FPPEXT	0
+#define	FPPSGL	0
+#define	FPPDBL	0
+#define	FPPMASK	0
+#define FPCOND	(1<<23)
+
+/* FSR (also FCR31) */
+#define	FPAINEX	(1<<2)		/* flags */
+#define	FPAOVFL	(1<<4)
+#define	FPAUNFL	(1<<3)
+#define	FPAZDIV	(1<<5)
+#define	FPAINVAL (1<<6)
+
+union FPdbleword
+{
+	double	x;
+	struct {	/* little endian */
+		ulong lo;
+		ulong hi;
+	};
+};
+
+/* stdarg */
+typedef	char*	va_list;
+#define va_start(list, start) list =\
+	(sizeof(start) < 4?\
+		(char*)((int*)&(start)+1):\
+		(char*)(&(start)+1))
+#define va_end(list)\
+	USED(list)
+#define va_arg(list, mode)\
+	((sizeof(mode) == 1)?\
+		((list += 4), (mode*)list)[-4]:\
+	(sizeof(mode) == 2)?\
+		((list += 4), (mode*)list)[-2]:\
+		((list += sizeof(mode)), (mode*)list)[-1])

+ 44 - 0
spim/include/ureg.h

@@ -0,0 +1,44 @@
+struct Ureg
+{
+	ulong	status;
+	ulong	pc;
+	union{
+		ulong	sp;		/* r29 */
+		ulong	usp;		/* r29 */
+	};
+	ulong	cause;
+	ulong	badvaddr;
+	ulong	tlbvirt;
+	ulong	hi;
+	ulong	lo;
+	ulong	r31;
+	ulong	r30;
+	ulong	r28;
+	ulong	r27;		/* unused */
+	ulong	r26;		/* unused */
+	ulong	r25;
+	ulong	r24;
+	ulong	r23;
+	ulong	r22;
+	ulong	r21;
+	ulong	r20;
+	ulong	r19;
+	ulong	r18;
+	ulong	r17;
+	ulong	r16;
+	ulong	r15;
+	ulong	r14;
+	ulong	r13;
+	ulong	r12;
+	ulong	r11;
+	ulong	r10;
+	ulong	r9;
+	ulong	r8;
+	ulong	r7;
+	ulong	r6;
+	ulong	r5;
+	ulong	r4;
+	ulong	r3;
+	ulong	r2;
+	ulong	r1;
+};

+ 9 - 0
spim/mkfile

@@ -0,0 +1,9 @@
+</sys/src/mkfile.proto
+
+OS=0$OS
+CPUS=$CPUS spim
+
+CC=0c
+LD=0l
+O=0
+AS=0a

+ 3 - 0
sys/src/ape/lib/9/spim/getcallerpc.s

@@ -0,0 +1,3 @@
+TEXT	getcallerpc(SB), $0
+	MOVW	0(SP), R1
+	RET

+ 15 - 0
sys/src/ape/lib/9/spim/getfcr.s

@@ -0,0 +1,15 @@
+TEXT	getfsr(SB), $0
+	MOVW	FCR31, R1
+	RET
+
+TEXT	setfsr(SB), $0
+	MOVW	R1, FCR31
+	RET
+
+TEXT	getfcr(SB), $0
+	MOVW	FCR31, R1
+	RET
+
+TEXT	setfcr(SB), $0
+	MOVW	R1, FCR31
+	RET

+ 52 - 0
sys/src/ape/lib/ap/spim/atom.s

@@ -0,0 +1,52 @@
+/*
+ *	R4000 user-level atomic operations
+ */
+
+#define	LL(base, rt)	WORD	$((060<<26)|((base)<<21)|((rt)<<16))
+#define	SC(base, rt)	WORD	$((070<<26)|((base)<<21)|((rt)<<16))
+#define	NOOP		WORD	$0x27
+
+TEXT ainc(SB), 1, $-4			/* long ainc(long *); */
+TEXT _xinc(SB), 1, $-4			/* void _xinc(long *); */
+	MOVW	R1, R2			/* address of counter */
+loop:	MOVW	$1, R3
+	LL(2, 1)
+	NOOP
+	ADDU	R1, R3
+	MOVW	R3, R1			/* return new value */
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop
+	RET
+
+TEXT adec(SB), 1, $-4			/* long adec(long*); */
+TEXT _xdec(SB), 1, $-4			/* long _xdec(long *); */
+	MOVW	R1, R2			/* address of counter */
+loop1:	MOVW	$-1, R3
+	LL(2, 1)
+	NOOP
+	ADDU	R1, R3
+	MOVW	R3, R1			/* return new value */
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop1
+	RET
+
+/*
+ * int cas(uint* p, int ov, int nv);
+ */
+TEXT cas(SB), 1, $-4
+	MOVW	ov+4(FP), R2
+	MOVW	nv+8(FP), R3
+spincas:
+	LL(1, 4)			/* R4 = *R1 */
+	NOOP
+	BNE	R2, R4, fail
+	SC(1, 3)			/* *R1 = R3 */
+	NOOP
+	BEQ	R3, spincas		/* R3 == 0 means store failed */
+	MOVW	$1, R1
+	RET
+fail:
+	MOVW	$0, R1
+	RET

+ 3 - 0
sys/src/ape/lib/ap/spim/c_fcr0.s

@@ -0,0 +1,3 @@
+	TEXT	C_fcr0(SB), $0
+	MOVW	FCR0, R1
+	RET

+ 5 - 0
sys/src/ape/lib/ap/spim/cycles.c

@@ -0,0 +1,5 @@
+void
+_cycles(unsigned long long *u)
+{
+	*u = 0;
+}

+ 15 - 0
sys/src/ape/lib/ap/spim/getfcr.s

@@ -0,0 +1,15 @@
+TEXT	getfsr(SB), $0
+	MOVW	FCR31, R1
+	RET
+
+TEXT	setfsr(SB), $0
+	MOVW	R1, FCR31
+	RET
+
+TEXT	getfcr(SB), $0
+	MOVW	FCR31, R1
+	RET
+
+TEXT	setfcr(SB), $0
+	MOVW	R1, FCR31
+	RET

+ 36 - 0
sys/src/ape/lib/ap/spim/lock.c

@@ -0,0 +1,36 @@
+#include "../plan9/lib.h"
+#include "../plan9/sys9.h"
+#define _LOCK_EXTENSION
+#include <lock.h>
+//#include <lib9.h>
+
+void
+lock(Lock *l)
+{
+	if(ainc(&l->key) == 1)
+		return;	/* changed from 0 -> 1: we hold lock */
+	/* otherwise wait in kernel */
+	while(_SEMACQUIRE(&l->sem, 1) < 0){
+		/* interrupted; try again */
+	}
+}
+
+void
+unlock(Lock *l)
+{
+	if(adec(&l->key) == 0)
+		return;	/* changed from 1 -> 0: no contention */
+	_SEMRELEASE(&l->sem, 1);
+}
+
+int
+canlock(Lock *l)
+{
+	if(ainc(&l->key) == 1)
+		return 1;	/* changed from 0 -> 1: success */
+	/* Undo increment (but don't miss wakeup) */
+	if(adec(&l->key) == 0)
+		return 0;	/* changed from 1 -> 0: no contention */
+	_SEMRELEASE(&l->sem, 1);
+	return 0;
+}

+ 177 - 0
sys/src/ape/lib/ap/spim/lock.pre-sema.c

@@ -0,0 +1,177 @@
+#define _LOCK_EXTENSION
+#include <stdlib.h>
+#include <string.h>
+#include "../plan9/sys9.h"
+#include <lock.h>
+
+enum
+{
+	Pagesize	= 4096,
+	Semperpg	= Pagesize/(16*sizeof(unsigned int)),
+	Lockaddr	= 0x60000000,
+
+	POWER		= 0x320,
+	MAGNUM		= 0x330,
+	MAGNUMII	= 0x340,
+	R4K		= 0x500,
+};
+
+static	int arch;
+extern	int C_3ktas(int*);
+extern	int C_4ktas(int*);
+extern	int C_fcr0(void);
+
+static void
+lockinit(void)
+{
+	int n;
+
+	if(arch != 0)
+		return;	/* allow multiple calls */
+	arch = C_fcr0();
+	switch(arch) {
+	case POWER:
+		n = _SEGATTACH(0,  "lock", (void*)Lockaddr, Pagesize);
+		if(n < 0) {
+			arch = MAGNUM;
+			break;
+		}
+		memset((void*)Lockaddr, 0, Pagesize);
+		break;
+	case MAGNUM:
+	case MAGNUMII:
+	case R4K:
+		break;
+	default:
+		arch = R4K;
+		break;
+	}
+
+}
+
+void
+lock(Lock *lk)
+{
+	int *hwsem;
+	int hash;
+
+retry:
+	switch(arch) {
+	case 0:
+		lockinit();
+		goto retry;
+	case MAGNUM:
+	case MAGNUMII:
+		while(C_3ktas(&lk->val))
+			_SLEEP(0);
+		return;
+	case R4K:
+		for(;;){
+			while(lk->val)
+				;
+			if(C_4ktas(&lk->val) == 0)
+				return;
+		}
+		break;
+	case POWER:
+		/* Use low order lock bits to generate hash */
+		hash = ((int)lk/sizeof(int)) & (Semperpg-1);
+		hwsem = (int*)Lockaddr+hash;
+
+		for(;;) {
+			if((*hwsem & 1) == 0) {
+				if(lk->val)
+					*hwsem = 0;
+				else {
+					lk->val = 1;
+					*hwsem = 0;
+					return;
+				}
+			}
+			while(lk->val)
+				;
+		}
+	}
+}
+
+int
+canlock(Lock *lk)
+{
+	int *hwsem;
+	int hash;
+
+retry:
+	switch(arch) {
+	case 0:
+		lockinit();
+		goto retry;
+	case MAGNUM:
+	case MAGNUMII:
+		if(C_3ktas(&lk->val))
+			return 0;
+		return 1;
+	case R4K:
+		if(C_4ktas(&lk->val))
+			return 0;
+		return 1;
+	case POWER:
+		/* Use low order lock bits to generate hash */
+		hash = ((int)lk/sizeof(int)) & (Semperpg-1);
+		hwsem = (int*)Lockaddr+hash;
+
+		if((*hwsem & 1) == 0) {
+			if(lk->val)
+				*hwsem = 0;
+			else {
+				lk->val = 1;
+				*hwsem = 0;
+				return 1;
+			}
+		}
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+void
+unlock(Lock *lk)
+{
+	lk->val = 0;
+}
+
+int
+tas(int *p)
+{
+	int *hwsem;
+	int hash;
+
+retry:
+	switch(arch) {
+	case 0:
+		lockinit();
+		goto retry;
+	case MAGNUM:
+	case MAGNUMII:
+		return C_3ktas(p);
+	case R4K:
+		return C_4ktas(p);
+	case POWER:
+		/* Use low order lock bits to generate hash */
+		hash = ((int)p/sizeof(int)) & (Semperpg-1);
+		hwsem = (int*)Lockaddr+hash;
+
+		if((*hwsem & 1) == 0) {
+			if(*p)
+				*hwsem = 0;
+			else {
+				*p = 1;
+				*hwsem = 0;
+				return 0;
+			}
+		}
+		return 1;
+	default:
+		return 0;
+	}
+}

+ 12 - 0
sys/src/ape/lib/ap/spim/main9.s

@@ -0,0 +1,12 @@
+	TEXT	_main(SB), $16
+	MOVW	$setR30(SB), R30
+	JAL	_envsetup(SB)
+	MOVW	inargc-4(FP), R1
+	MOVW	$inargv+0(FP), R2
+	MOVW	R1, 4(R29)
+	MOVW	R2, 8(R29)
+	JAL	main(SB)
+loop:
+	MOVW	R1, 4(R29)
+	JAL	exit(SB)
+	JMP	loop

+ 54 - 0
sys/src/ape/lib/ap/spim/main9p.s

@@ -0,0 +1,54 @@
+#define NPRIVATES	16
+
+GLOBL	_tos(SB), $4
+GLOBL	_privates(SB), $4
+GLOBL	_nprivates(SB), $4
+
+TEXT	_mainp(SB), 1, $(3*4+NPRIVATES*4)
+	MOVW	$setR30(SB), R30
+
+	/* _tos = arg */
+	MOVW	R1, _tos(SB)
+/*
+	MOVW	$0,FCR31
+	NOR	R0,R0
+	MOVD	$0.5, F26
+	SUBD	F26, F26, F24
+	ADDD	F26, F26, F28
+	ADDD	F28, F28, F30
+*/
+	MOVW	$8(SP), R1
+	MOVW	R1, _privates(SB)
+	MOVW	$NPRIVATES, R1
+	MOVW	R1, _nprivates(SB)
+
+	/* _profmain(); */
+	JAL	_profmain(SB)
+
+	/* _tos->prof.pp = _tos->prof.next; */
+	MOVW	_tos+0(SB),R1
+	MOVW	4(R1),R2
+	MOVW	R2,(R1)
+
+	JAL	_envsetup(SB)
+
+	/* main(argc, argv, environ); */
+	MOVW	inargc-4(FP), R1
+	MOVW	$inargv+0(FP), R2
+	MOVW	environ(SB), R3
+	MOVW	R1, 4(R29)
+	MOVW	R2, 8(R29)
+	MOVW	R3, 12(R29)
+	JAL	main(SB)
+loop:
+	MOVW	R1, 4(R29)
+	JAL	exit(SB)
+	MOVW	$_profin(SB), R0	/* force loading of profile */
+	JMP	loop
+
+TEXT	_savearg(SB), 1, $0
+	RET
+
+TEXT	_callpc(SB), 1, $0
+	MOVW	argp-4(FP), R1
+	RET

+ 39 - 0
sys/src/ape/lib/ap/spim/memchr.s

@@ -0,0 +1,39 @@
+	TEXT	memchr(SB), $0
+MOVW R1, 0(FP)
+
+	MOVW	n+8(FP), R1
+	MOVW	s1+0(FP), R2
+	MOVBU	c+4(FP), R3	// little endian, 4(FP) instead of 7(FP)
+	ADDU	R1, R2, R6
+
+	AND	$(~1), R1, R5
+	ADDU	R2, R5
+	BEQ	R2, R5, lt2
+
+l1:
+	MOVBU	0(R2), R4
+	MOVBU	1(R2), R7
+	BEQ	R3, R4, eq0
+	ADDU	$2, R2
+	BEQ	R3, R7, eq
+	BNE	R2, R5, l1
+
+lt2:
+	BEQ	R2, R6, zret
+
+l2:
+	MOVBU	(R2), R4
+	ADDU	$1, R2
+	BEQ	R3, R4, eq
+	BNE	R2, R6, l2
+zret:
+	MOVW	R0, R1
+	RET
+
+eq0:
+	MOVW	R2, R1
+	RET
+
+eq:
+	SUBU	$1,R2, R1
+	RET

+ 116 - 0
sys/src/ape/lib/ap/spim/memcmp.s

@@ -0,0 +1,116 @@
+	TEXT	memcmp(SB), $0
+MOVW R1, 0(FP)
+
+/*
+ * performance:
+ *	alligned about 1.0us/call and 17.4mb/sec
+ *	unalligned is about 3.1mb/sec
+ */
+
+	MOVW	n+8(FP), R3		/* R3 is count */
+	MOVW	s1+0(FP), R4		/* R4 is pointer1 */
+	MOVW	s2+4(FP), R5		/* R5 is pointer2 */
+	ADDU	R3,R4, R6		/* R6 is end pointer1 */
+
+	JMP	out		// XXX little endian
+
+/*
+ * if not at least 4 chars,
+ * dont even mess around.
+ * 3 chars to guarantee any
+ * rounding up to a word
+ * boundary and 4 characters
+ * to get at least maybe one
+ * full word cmp.
+ */
+	SGT	$4,R3, R1
+	BNE	R1, out
+
+/*
+ * test if both pointers
+ * are similarly word alligned
+ */
+	XOR	R4,R5, R1
+	AND	$3, R1
+	BNE	R1, out
+
+/*
+ * byte at a time to word allign
+ */
+l1:
+	AND	$3,R4, R1
+	BEQ	R1, l2
+	MOVB	0(R4), R8
+	MOVB	0(R5), R9
+	ADDU	$1, R4
+	BNE	R8,R9, ne
+	ADDU	$1, R5
+	JMP	l1
+
+/*
+ * turn R3 into end pointer1-15
+ * cmp 16 at a time while theres room
+ */
+l2:
+	ADDU	$-15,R6, R3
+l3:
+	SGTU	R3,R4, R1
+	BEQ	R1, l4
+	MOVW	0(R4), R8
+	MOVW	0(R5), R9
+	MOVW	4(R4), R10
+	BNE	R8,R9, ne
+	MOVW	4(R5), R11
+	MOVW	8(R4), R8
+	BNE	R10,R11, ne1
+	MOVW	8(R5), R9
+	MOVW	12(R4), R10
+	BNE	R8,R9, ne
+	MOVW	12(R5), R11
+	ADDU	$16, R4
+	BNE	R10,R11, ne1
+	BNE	R8,R9, ne
+	ADDU	$16, R5
+	JMP	l3
+
+/*
+ * turn R3 into end pointer1-3
+ * cmp 4 at a time while theres room
+ */
+l4:
+	ADDU	$-3,R6, R3
+l5:
+	SGTU	R3,R4, R1
+	BEQ	R1, out
+	MOVW	0(R4), R8
+	MOVW	0(R5), R9
+	ADDU	$4, R4
+	BNE	R8,R9, ne	/* only works because big endian */
+	ADDU	$4, R5
+	JMP	l5
+
+/*
+ * last loop, cmp byte at a time
+ */
+out:
+	SGTU	R6,R4, R1
+	BEQ	R1, ret
+	MOVB	0(R4), R8
+	MOVB	0(R5), R9
+	ADDU	$1, R4
+	BNE	R8,R9, ne
+	ADDU	$1, R5
+	JMP	out
+
+ne1:
+	SGTU	R10,R11, R1
+	BNE	R1, ret
+	MOVW	$-1,R1
+	RET
+ne:
+	SGTU	R8,R9, R1
+	BNE	R1, ret
+	MOVW	$-1,R1
+ret:
+	RET
+	END

+ 161 - 0
sys/src/ape/lib/ap/spim/memmove.s

@@ -0,0 +1,161 @@
+	TEXT	memmove(SB), $0
+
+	JMP	move
+
+	TEXT	memcpy(SB), $0
+move:
+	MOVW	R1, s1+0(FP)
+
+	MOVW	n+8(FP), R3		/* R3 is count */
+	MOVW	R1, R4			/* R4 is to-pointer */
+	SGT	R0, R3, R5
+	BEQ	R5, ok
+	MOVW	(R0), R0		/* abort if negative count */
+ok:
+	MOVW	s2+4(FP), R5		/* R5 is from-pointer */
+	ADDU	R3,R5, R7		/* R7 is end from-pointer */
+	ADDU	R3,R4, R6		/* R6 is end to-pointer */
+
+/*
+ * easiest test is copy backwards if
+ * destination string has higher mem address
+ */
+	SGT	$4,R3, R2
+	SGTU	R4,R5, R1
+	BNE	R1, back
+
+/*
+ * if not at least 4 chars,
+ * don't even mess around.
+ * 3 chars to guarantee any
+ * rounding up to a word
+ * boundary and 4 characters
+ * to get at least maybe one
+ * full word store.
+ */
+	BNE	R2, fout
+
+/*
+ * test if both pointers
+ * are similarly word aligned
+ */
+	XOR	R4,R5, R1
+	AND	$3, R1
+	BNE	R1, fout
+
+/*
+ * byte at a time to word align
+ */
+f1:
+	AND	$3,R4, R1
+	BEQ	R1, f2
+	MOVB	0(R5), R8
+	ADDU	$1, R5
+	MOVB	R8, 0(R4)
+	ADDU	$1, R4
+	JMP	f1
+
+/*
+ * turn R3 into to-end pointer-15
+ * copy 16 at a time while theres room.
+ * R6 is smaller than R7 --
+ * there are problems if R7 is 0.
+ */
+f2:
+	ADDU	$-15,R6, R3
+f3:
+	SGTU	R3,R4, R1
+	BEQ	R1, f4
+	MOVW	0(R5), R8
+	MOVW	4(R5), R9
+	MOVW	R8, 0(R4)
+	MOVW	8(R5), R8
+	MOVW	R9, 4(R4)
+	MOVW	12(R5), R9
+	ADDU	$16, R5
+	MOVW	R8, 8(R4)
+	MOVW	R9, 12(R4)
+	ADDU	$16, R4
+	JMP	f3
+
+/*
+ * turn R3 into to-end pointer-3
+ * copy 4 at a time while theres room
+ */
+f4:
+	ADDU	$-3,R6, R3
+f5:
+	SGTU	R3,R4, R1
+	BEQ	R1, fout
+	MOVW	0(R5), R8
+	ADDU	$4, R5
+	MOVW	R8, 0(R4)
+	ADDU	$4, R4
+	JMP	f5
+
+/*
+ * last loop, copy byte at a time
+ */
+fout:
+	BEQ	R7,R5, ret
+	MOVB	0(R5), R8
+	ADDU	$1, R5
+	MOVB	R8, 0(R4)
+	ADDU	$1, R4
+	JMP	fout
+
+/*
+ * whole thing repeated for backwards
+ */
+back:
+	BNE	R2, bout
+	XOR	R6,R7, R1
+	AND	$3, R1
+	BNE	R1, bout
+b1:
+	AND	$3,R7, R1
+	BEQ	R1, b2
+	MOVB	-1(R7), R8
+	ADDU	$-1, R7
+	MOVB	R8, -1(R6)
+	ADDU	$-1, R6
+	JMP	b1
+b2:
+	ADDU	$15,R5, R3
+b3:
+	SGTU	R7,R3, R1
+	BEQ	R1, b4
+	MOVW	-4(R7), R8
+	MOVW	-8(R7), R9
+	MOVW	R8, -4(R6)
+	MOVW	-12(R7), R8
+	MOVW	R9, -8(R6)
+	MOVW	-16(R7), R9
+	ADDU	$-16, R7
+	MOVW	R8, -12(R6)
+	MOVW	R9, -16(R6)
+	ADDU	$-16, R6
+	JMP	b3
+b4:
+	ADDU	$3,R5, R3
+b5:
+	SGTU	R7,R3, R1
+	BEQ	R1, bout
+	MOVW	-4(R7), R8
+	ADDU	$-4, R7
+	MOVW	R8, -4(R6)
+	ADDU	$-4, R6
+	JMP	b5
+
+bout:
+	BEQ	R7,R5, ret
+	MOVB	-1(R7), R8
+	ADDU	$-1, R7
+	MOVB	R8, -1(R6)
+	ADDU	$-1, R6
+	JMP	bout
+
+ret:
+	MOVW	s1+0(FP), R1
+	RET
+	END

+ 88 - 0
sys/src/ape/lib/ap/spim/memset.s

@@ -0,0 +1,88 @@
+	TEXT	memset(SB),$12
+MOVW R1, 0(FP)
+
+/*
+ * performance:
+ *	about 1us/call and 28mb/sec
+ */
+
+	MOVW	n+8(FP), R3		/* R3 is count */
+	MOVW	p+0(FP), R4		/* R4 is pointer */
+	MOVW	c+4(FP), R5		/* R5 is char */
+	ADDU	R3,R4, R6		/* R6 is end pointer */
+
+/*
+ * if not at least 4 chars,
+ * dont even mess around.
+ * 3 chars to guarantee any
+ * rounding up to a word
+ * boundary and 4 characters
+ * to get at least maybe one
+ * full word store.
+ */
+	SGT	$4,R3, R1
+	BNE	R1, out
+
+/*
+ * turn R5 into a word of characters
+ */
+	AND	$0xff, R5
+	SLL	$8,R5, R1
+	OR	R1, R5
+	SLL	$16,R5, R1
+	OR	R1, R5
+
+/*
+ * store one byte at a time until pointer
+ * is alligned on a word boundary
+ */
+l1:
+	AND	$3,R4, R1
+	BEQ	R1, l2
+	MOVB	R5, 0(R4)
+	ADDU	$1, R4
+	JMP	l1
+
+/*
+ * turn R3 into end pointer-15
+ * store 16 at a time while theres room
+ */
+l2:
+	ADDU	$-15,R6, R3
+l3:
+	SGTU	R3,R4, R1
+	BEQ	R1, l4
+	MOVW	R5, 0(R4)
+	MOVW	R5, 4(R4)
+	ADDU	$16, R4
+	MOVW	R5, -8(R4)
+	MOVW	R5, -4(R4)
+	JMP	l3
+
+/*
+ * turn R3 into end pointer-3
+ * store 4 at a time while theres room
+ */
+l4:
+	ADDU	$-3,R6, R3
+l5:
+	SGTU	R3,R4, R1
+	BEQ	R1, out
+	MOVW	R5, 0(R4)
+	ADDU	$4, R4
+	JMP	l5
+
+/*
+ * last loop, store byte at a time
+ */
+out:
+	SGTU	R6,R4 ,R1
+	BEQ	R1, ret
+	MOVB	R5, 0(R4)
+	ADDU	$1, R4
+	JMP	out
+
+ret:
+	MOVW	s1+0(FP), R1
+	RET
+	END

+ 27 - 0
sys/src/ape/lib/ap/spim/mkfile

@@ -0,0 +1,27 @@
+APE=/sys/src/ape
+<$APE/config
+LIB=/$objtype/lib/ape/libap.a
+OFILES=\
+	atom.$O\
+	c_fcr0.$O\
+	cycles.$O\
+	getfcr.$O\
+	lock.$O\
+	main9.$O\
+	main9p.$O\
+	memchr.$O\
+	memcmp.$O\
+	memmove.$O\
+	memset.$O\
+	notetramp.$O\
+	setjmp.$O\
+	strchr.$O\
+	strcmp.$O\
+	strcpy.$O\
+	tas.$O\
+	vlop.$O\
+	vlrt.$O\
+
+</sys/src/cmd/mksyslib
+
+CFLAGS=$CFLAGS -c -D_POSIX_SOURCE -D_PLAN9_SOURCE

+ 72 - 0
sys/src/ape/lib/ap/spim/notetramp.c

@@ -0,0 +1,72 @@
+#include "../plan9/lib.h"
+#include "../plan9/sys9.h"
+#include <signal.h>
+#include <setjmp.h>
+
+/* A stack to hold pcs when signals nest */
+#define MAXSIGSTACK 20
+typedef struct Pcstack Pcstack;
+static struct Pcstack {
+	int sig;
+	void (*hdlr)(int, char*, Ureg*);
+	unsigned long restorepc;
+	Ureg *u;
+} pcstack[MAXSIGSTACK];
+static int nstack = 0;
+
+static void notecont(Ureg*, char*);
+
+void
+_notetramp(int sig, void (*hdlr)(int, char*, Ureg*), Ureg *u)
+{
+	Pcstack *p;
+
+	if(nstack >= MAXSIGSTACK)
+		_NOTED(1);	/* nesting too deep; just do system default */
+	p = &pcstack[nstack];
+	p->restorepc = u->pc;
+	p->sig = sig;
+	p->hdlr = hdlr;
+	p->u = u;
+	nstack++;
+	u->pc = (unsigned long) notecont;
+	_NOTED(2);	/* NSAVE: clear note but hold state */
+}
+
+static void
+notecont(Ureg *u, char *s)
+{
+	Pcstack *p;
+	void(*f)(int, char*, Ureg*);
+
+	p = &pcstack[nstack-1];
+	f = p->hdlr;
+	u->pc = p->restorepc;
+	nstack--;
+	(*f)(p->sig, s, u);
+	_NOTED(3);	/* NRSTR */
+}
+
+#define JMPBUFPC 1
+#define JMPBUFSP 0
+
+extern sigset_t	_psigblocked;
+
+void
+siglongjmp(sigjmp_buf j, int ret)
+{
+	struct Ureg *u;
+
+	if(j[0])
+		_psigblocked = j[1];
+	if(nstack == 0 || pcstack[nstack-1].u->sp > j[2+JMPBUFSP])
+		longjmp(j+2, ret);
+	u = pcstack[nstack-1].u;
+	nstack--;
+	u->r1 = ret;
+	if(ret == 0)
+		u->r1 = 1;
+	u->pc = j[2+JMPBUFPC];
+	u->sp = j[2+JMPBUFSP];
+	_NOTED(3);	/* NRSTR */
+}

+ 24 - 0
sys/src/ape/lib/ap/spim/setjmp.s

@@ -0,0 +1,24 @@
+TEXT	setjmp(SB), 1, $-4
+	MOVW	R29, (R1)
+	MOVW	R31, 4(R1)
+	MOVW	$0, R1
+	RET
+
+TEXT	sigsetjmp(SB), 1, $-4
+	MOVW	savemask+4(FP), R2
+	MOVW	R2, 0(R1)
+	MOVW	$_psigblocked(SB), R2
+	MOVW	R2, 4(R1)
+	MOVW	R29, 8(R1)
+	MOVW	R31, 12(R1)
+	MOVW	$0, R1
+	RET
+
+TEXT	longjmp(SB), 1, $-4
+	MOVW	r+4(FP), R3
+	BNE	R3, ok		/* ansi: "longjmp(0) => longjmp(1)" */
+	MOVW	$1, R3		/* bless their pointed heads */
+ok:	MOVW	(R1), R29
+	MOVW	4(R1), R31
+	MOVW	R3, R1
+	RET

+ 63 - 0
sys/src/ape/lib/ap/spim/strchr.s

@@ -0,0 +1,63 @@
+	TEXT	strchr(SB), $0
+MOVW R1, 0(FP)
+	MOVB	c+4(FP), R4	// little endian, 4(FP) instead of 7(FP)
+	MOVW	s+0(FP), R3
+
+	BEQ	R4, l2
+
+/*
+ * char is not null
+ */
+l1:
+	MOVB	(R3), R1
+	ADDU	$1, R3
+	BEQ	R1, ret
+	BNE	R1,R4, l1
+	JMP	rm1
+
+/*
+ * char is null
+ * align to word
+ */
+l2:
+	AND	$3,R3, R1
+	BEQ	R1, l3
+	MOVB	(R3), R1
+	ADDU	$1, R3
+	BNE	R1, l2
+	JMP	rm1
+
+l3:
+	MOVW	$0xff000000, R6
+	MOVW	$0x00ff0000, R7
+
+l4:
+	MOVW	(R3), R5
+	ADDU	$4, R3
+	AND	$0xff,R5, R1	/* byte 0 */
+	AND	$0xff00,R5, R2	/* byte 1 */
+	BEQ	R1, b0
+	AND	R7,R5, R1	/* byte 2 */
+	BEQ	R2, b1
+	AND	R6,R5, R2	/* byte 3 */
+	BEQ	R1, b2
+	BNE	R2, l4
+
+rm1:
+	ADDU	$-1,R3, R1
+	JMP	ret
+
+b2:
+	ADDU	$-2,R3, R1
+	JMP	ret
+
+b1:
+	ADDU	$-3,R3, R1
+	JMP	ret
+
+b0:
+	ADDU	$-4,R3, R1
+	JMP	ret
+
+ret:
+	RET

+ 21 - 0
sys/src/ape/lib/ap/spim/strcmp.s

@@ -0,0 +1,21 @@
+TEXT	strcmp(SB), $0
+
+	MOVW	s2+4(FP), R2
+
+l1:
+	MOVB	(R2), R3
+	MOVB	(R1), R4
+	ADDU	$1, R1
+	BEQ	R3, end
+	ADDU	$1, R2
+	BEQ	R3, R4, l1
+
+	SGTU	R4, R3, R1
+	BNE	R1, ret
+	MOVW	$-1, R1
+	RET
+
+end:
+	SGTU	R4, R3, R1
+ret:
+	RET

+ 92 - 0
sys/src/ape/lib/ap/spim/strcpy.s

@@ -0,0 +1,92 @@
+TEXT	strcpy(SB), $0
+
+	MOVW	s2+4(FP),R2		/* R2 is from pointer */
+	MOVW	R1, R3			/* R3 is to pointer */
+
+/*
+ * align 'from' pointer
+ */
+l1:
+	AND	$3, R2, R5
+	ADDU	$1, R2
+	BEQ	R5, l2
+	MOVB	-1(R2), R5
+	ADDU	$1, R3
+	MOVB	R5, -1(R3)
+	BNE	R5, l1
+	RET
+
+/*
+ * test if 'to' is also alligned
+ */
+l2:
+	AND	$3,R3, R5
+	BEQ	R5, l4
+
+/*
+ * copy 4 at a time, 'to' not aligned
+ */
+l3:
+	MOVW	-1(R2), R4
+	ADD	$4, R2
+	ADD	$4, R3
+	AND	$0xff,R4, R5
+	MOVB	R5, -4(R3)
+	BEQ	R5, out
+
+	SRL	$8,R4, R5
+	AND	$0xff, R5
+	MOVB	R5, -3(R3)
+	BEQ	R5, out
+
+	SRL	$16,R4, R5
+	AND	$0xff, R5
+	MOVB	R5, -2(R3)
+	BEQ	R5, out
+
+	SRL	$24,R4, R5
+	MOVB	R5, -1(R3)
+	BNE	R5, l3
+
+out:
+	RET
+
+/*
+ * word at a time both aligned
+ */
+l4:
+	MOVW	$0xff000000, R7
+	MOVW	$0x00ff0000, R8
+
+l5:
+	ADDU	$4, R3
+	MOVW	-1(R2), R4	/* fetch */
+
+	ADDU	$4, R2
+	AND	$0xff,R4, R5	/* is it byte 0 */
+	AND	$0xff00,R4, R6	/* is it byte 1 */
+	BEQ	R5, b0
+
+	AND	R8,R4, R5	/* is it byte 2 */
+	BEQ	R6, b1
+
+	AND	R7,R4, R6	/* is it byte 3 */
+	BEQ	R5, b2
+
+	MOVW	R4, -4(R3)	/* store */
+	BNE	R6, l5
+	JMP	out
+
+b0:
+	MOVB	$0, -4(R3)
+	JMP	out
+
+b1:
+	MOVB	R4, -4(R3)
+	MOVB	$0, -3(R3)
+	JMP	out
+
+b2:
+	MOVH	R4, -4(R3)
+	MOVB	$0, -2(R3)
+	JMP	out

+ 30 - 0
sys/src/ape/lib/ap/spim/tas.s

@@ -0,0 +1,30 @@
+/*
+ *	mips user level lock code
+ */
+
+#define	LL(base, rt)	WORD	$((060<<26)|((base)<<21)|((rt)<<16))
+#define	SC(base, rt)	WORD	$((070<<26)|((base)<<21)|((rt)<<16))
+#define	NOOP		WORD	$0x27
+#define COP3		WORD	$(023<<26)
+
+	TEXT	C_3ktas(SB),$0
+	MOVW	R1, R21
+btas:
+	MOVW	R21, R1
+	MOVB	R0, 1(R1)
+	NOOP
+	COP3
+	BLTZ	R1, btas
+	RET
+
+	TEXT	tas(SB), $0
+	TEXT	C_4ktas(SB), $0
+	MOVW	R1, R2		/* address of key */
+tas1:
+	MOVW	$1, R3
+	LL(2, 1)
+	NOOP
+	SC(2, 3)
+	NOOP
+	BEQ	R3, tas1
+	RET

+ 17 - 0
sys/src/ape/lib/ap/spim/vlop.s

@@ -0,0 +1,17 @@
+TEXT	_mulv(SB), $0
+	MOVW	8(FP), R2	/* hi1 */
+	MOVW	4(FP), R3	/* lo1 */
+	MOVW	16(FP), R4	/* hi2 */
+	MOVW	12(FP), R5	/* lo2 */
+	MULU	R5, R3	/* lo1*lo2 -> hi:lo*/
+	MOVW	LO, R6
+	MOVW	HI, R7
+	MULU	R3, R4	/* lo1*hi2 -> _:hi */
+	MOVW	LO, R8
+	ADDU	R8, R7
+	MULU	R2, R5	/* hi1*lo2 -> _:hi */
+	MOVW	LO, R8
+	ADDU	R8, R7
+	MOVW	R6, 0(R1)	/* lo */
+	MOVW	R7, 4(R1)	/* hi */
+	RET

+ 719 - 0
sys/src/ape/lib/ap/spim/vlrt.c

@@ -0,0 +1,719 @@
+typedef	unsigned long	ulong;
+typedef	unsigned int	uint;
+typedef	unsigned short	ushort;
+typedef	unsigned char	uchar;
+typedef	signed char	schar;
+
+/* SIGN(n): mask with only bit n-1 set, i.e. the sign bit of an n-bit value */
+#define	SIGN(n)	(1UL<<(n-1))
+
+/*
+ * Vlong: compiler runtime representation of a 64-bit (vlong) value
+ * on a 32-bit target.  lo is the low 32 bits, hi the high 32 bits,
+ * laid out lo-first — little-endian word order (this is the spim,
+ * i.e. little-endian MIPS, variant).
+ * NOTE(review): in the 16-bit view the "ms" members precede the "ls"
+ * ones in memory, which looks byte-order-suspect for little endian —
+ * the halfword view appears unused here; confirm before relying on it.
+ */
+typedef	struct	Vlong	Vlong;
+struct	Vlong
+{
+	union
+	{
+		struct
+		{
+			ulong	lo;
+			ulong	hi;
+		};
+		struct
+		{
+			ushort	loms;
+			ushort	lols;
+			ushort	hims;
+			ushort	hils;
+		};
+	};
+};
+
+void	abort(void);
+
+/*
+ * _addv: 64-bit add, *r = a + b.
+ * The low words are added with unsigned wraparound; a wrapped result
+ * (lo < a.lo) means a carry must propagate into the high word.
+ */
+void
+_addv(Vlong *r, Vlong a, Vlong b)
+{
+	ulong lo, hi;
+
+	lo = a.lo + b.lo;
+	hi = a.hi + b.hi;
+	if(lo < a.lo)		/* unsigned wrap => carry into hi */
+		hi++;
+	r->lo = lo;
+	r->hi = hi;
+}
+
+/*
+ * _subv: 64-bit subtract, *r = a - b.
+ * If the unsigned low-word subtraction wrapped (lo > a.lo), a borrow
+ * must be taken from the high word.
+ */
+void
+_subv(Vlong *r, Vlong a, Vlong b)
+{
+	ulong lo, hi;
+
+	lo = a.lo - b.lo;
+	hi = a.hi - b.hi;
+	if(lo > a.lo)		/* unsigned wrap => borrow from hi */
+		hi--;
+	r->lo = lo;
+	r->hi = hi;
+}
+
+/*
+ * _d2v: convert a double to a (signed) 64-bit integer, truncating.
+ * Works directly on the IEEE 754 bit pattern: the union overlays the
+ * double with a Vlong (anonymous struct), so x.hi holds the sign,
+ * exponent and top mantissa bits, x.lo the low mantissa bits —
+ * assumes little-endian word order (spim); TODO confirm on target.
+ */
+void
+_d2v(Vlong *y, double d)
+{
+	union { double d; struct Vlong; } x;
+	ulong xhi, xlo, ylo, yhi;
+	int sh;
+
+	x.d = d;
+
+	/* mantissa with the implicit leading 1 restored at bit 52 */
+	xhi = (x.hi & 0xfffff) | 0x100000;
+	xlo = x.lo;
+	/* 1075 = bias(1023) + mantissa bits(52): right-shift count */
+	sh = 1075 - ((x.hi >> 20) & 0x7ff);
+
+	ylo = 0;
+	yhi = 0;
+	if(sh >= 0) {
+		/* v = (hi||lo) >> sh */
+		if(sh < 32) {
+			if(sh == 0) {
+				ylo = xlo;
+				yhi = xhi;
+			} else {
+				ylo = (xlo >> sh) | (xhi << (32-sh));
+				yhi = xhi >> sh;
+			}
+		} else {
+			if(sh == 32) {
+				ylo = xhi;
+			} else
+			if(sh < 64) {
+				ylo = xhi >> (sh-32);
+			}
+			/* sh >= 64: magnitude < 1, result stays 0 */
+		}
+	} else {
+		/* v = (hi||lo) << -sh */
+		sh = -sh;
+		if(sh <= 10) {	/* at most 10 spare high bits before overflow */
+			ylo = xlo << sh;
+			yhi = (xhi << sh) | (xlo >> (32-sh));
+		} else {
+			/* overflow */
+			yhi = d;	/* causes something awful */
+		}
+	}
+	/* apply the sign: two's-complement negate of the 64-bit pair */
+	if(x.hi & SIGN(32)) {
+		if(ylo != 0) {
+			ylo = -ylo;
+			yhi = ~yhi;
+		} else
+			yhi = -yhi;
+	}
+
+	y->hi = yhi;
+	y->lo = ylo;
+}
+
+/*
+ * _f2v: convert a float to 64-bit integer by widening to double
+ * first, then reusing _d2v.
+ */
+void
+_f2v(Vlong *y, float f)
+{
+
+	_d2v(y, f);
+}
+
+/*
+ * _v2d: convert a signed 64-bit integer to double.
+ * Negative values are negated first (two's-complement on the word
+ * pair) so the magnitude can be assembled as hi*2^32 + lo, then the
+ * sign is reapplied.
+ */
+double
+_v2d(Vlong x)
+{
+	if(x.hi & SIGN(32)) {
+		/* negate the pair: borrow handling depends on lo == 0 */
+		if(x.lo) {
+			x.lo = -x.lo;
+			x.hi = ~x.hi;
+		} else
+			x.hi = -x.hi;
+		return -((long)x.hi*4294967296. + x.lo);
+	}
+	return (long)x.hi*4294967296. + x.lo;	/* 4294967296 = 2^32 */
+}
+
+/* _v2f: convert a signed 64-bit integer to float via _v2d. */
+float
+_v2f(Vlong x)
+{
+	return _v2d(x);
+}
+
+/*
+ * dodiv: unsigned 64-bit shift-and-subtract division.
+ * Computes num / den into *qp (if non-nil) and num % den into *rp
+ * (if non-nil).  The divisor is first shifted left until it is just
+ * below the dividend (i counts the shifts), then one quotient bit is
+ * produced per iteration while shifting the divisor back down.
+ */
+static void
+dodiv(Vlong num, Vlong den, Vlong *qp, Vlong *rp)
+{
+	ulong numlo, numhi, denhi, denlo, quohi, quolo, t;
+	int i;
+
+	numhi = num.hi;
+	numlo = num.lo;
+	denhi = den.hi;
+	denlo = den.lo;
+
+	/*
+	 * get a divide by zero
+	 */
+	if(denlo==0 && denhi==0) {
+		/* deliberate hardware divide-by-zero trap, not a typo */
+		numlo = numlo / denlo;
+	}
+
+	/*
+	 * set up the divisor and find the number of iterations needed
+	 */
+	if(numhi >= SIGN(32)) {
+		/* cap the alignment target so the shift loop terminates */
+		quohi = SIGN(32);
+		quolo = 0;
+	} else {
+		quohi = numhi;
+		quolo = numlo;
+	}
+	i = 0;
+	while(denhi < quohi || (denhi == quohi && denlo < quolo)) {
+		/* 64-bit left shift of the divisor by one */
+		denhi = (denhi<<1) | (denlo>>31);
+		denlo <<= 1;
+		i++;
+	}
+
+	quohi = 0;
+	quolo = 0;
+	for(; i >= 0; i--) {
+		/* shift quotient left, making room for the next bit */
+		quohi = (quohi<<1) | (quolo>>31);
+		quolo <<= 1;
+		if(numhi > denhi || (numhi == denhi && numlo >= denlo)) {
+			/* divisor fits: subtract and set quotient bit */
+			t = numlo;
+			numlo -= denlo;
+			if(numlo > t)	/* borrow from high word */
+				numhi--;
+			numhi -= denhi;
+			quolo |= 1;
+		}
+		/* 64-bit right shift of the divisor by one */
+		denlo = (denlo>>1) | (denhi<<31);
+		denhi >>= 1;
+	}
+
+	if(qp) {
+		qp->lo = quolo;
+		qp->hi = quohi;
+	}
+	if(rp) {
+		/* what is left of num is the remainder */
+		rp->lo = numlo;
+		rp->hi = numhi;
+	}
+}
+
+/*
+ * _divvu: unsigned 64-bit divide, *q = n / d.
+ * Fast path: when both values fit in 32 bits, use the native divide
+ * (which also traps on d.lo == 0); otherwise fall through to dodiv.
+ */
+void
+_divvu(Vlong *q, Vlong n, Vlong d)
+{
+
+	if(n.hi == 0 && d.hi == 0) {
+		q->hi = 0;
+		q->lo = n.lo / d.lo;
+		return;
+	}
+	dodiv(n, d, q, 0);
+}
+
+/*
+ * _modvu: unsigned 64-bit modulus, *r = n % d.
+ * 32-bit fast path mirrors _divvu; otherwise dodiv computes only the
+ * remainder (quotient pointer nil).
+ */
+void
+_modvu(Vlong *r, Vlong n, Vlong d)
+{
+
+	if(n.hi == 0 && d.hi == 0) {
+		r->hi = 0;
+		r->lo = n.lo % d.lo;
+		return;
+	}
+	dodiv(n, d, 0, r);
+}
+
+/*
+ * vneg: two's-complement negate *v in place.
+ * When lo is zero there is no borrow, so only hi is negated;
+ * otherwise negate lo and take the one's complement of hi.
+ */
+static void
+vneg(Vlong *v)
+{
+
+	if(v->lo == 0) {
+		v->hi = -v->hi;
+		return;
+	}
+	v->lo = -v->lo;
+	v->hi = ~v->hi;
+}
+
+/*
+ * _divv: signed 64-bit divide, *q = n / d, truncating toward zero.
+ * Fast path: both operands are sign-extended 32-bit values (hi is
+ * the sign extension of lo), so native signed divide suffices.
+ * Otherwise divide magnitudes with dodiv and negate the quotient
+ * when exactly one operand was negative.
+ */
+void
+_divv(Vlong *q, Vlong n, Vlong d)
+{
+	long nneg, dneg;
+
+	if(n.hi == (((long)n.lo)>>31) && d.hi == (((long)d.lo)>>31)) {
+		q->lo = (long)n.lo / (long)d.lo;
+		q->hi = ((long)q->lo) >> 31;	/* sign-extend result */
+		return;
+	}
+	nneg = n.hi >> 31;
+	if(nneg)
+		vneg(&n);
+	dneg = d.hi >> 31;
+	if(dneg)
+		vneg(&d);
+	dodiv(n, d, q, 0);
+	if(nneg != dneg)	/* signs differed => negative quotient */
+		vneg(q);
+}
+
+/*
+ * _modv: signed 64-bit modulus, *r = n % d.
+ * Same structure as _divv; the remainder takes the sign of the
+ * dividend only (hence "if(nneg)", not a sign comparison).
+ */
+void
+_modv(Vlong *r, Vlong n, Vlong d)
+{
+	long nneg, dneg;
+
+	if(n.hi == (((long)n.lo)>>31) && d.hi == (((long)d.lo)>>31)) {
+		r->lo = (long)n.lo % (long)d.lo;
+		r->hi = ((long)r->lo) >> 31;	/* sign-extend result */
+		return;
+	}
+	nneg = n.hi >> 31;
+	if(nneg)
+		vneg(&n);
+	dneg = d.hi >> 31;
+	if(dneg)
+		vneg(&d);
+	dodiv(n, d, 0, r);
+	if(nneg)	/* remainder follows the dividend's sign */
+		vneg(r);
+}
+
+/*
+ * _rshav: arithmetic (sign-propagating) 64-bit right shift,
+ * *r = a >> b.  Shift counts >= 64 saturate to all-sign-bits
+ * (noted below as outside the C standard); counts <= 0 copy a.
+ */
+void
+_rshav(Vlong *r, Vlong a, int b)
+{
+	long t;
+
+	t = a.hi;
+	if(b >= 32) {
+		r->hi = t>>31;	/* hi becomes pure sign bits */
+		if(b >= 64) {
+			/* this is illegal re C standard */
+			r->lo = t>>31;
+			return;
+		}
+		r->lo = t >> (b-32);
+		return;
+	}
+	if(b <= 0) {
+		r->hi = t;
+		r->lo = a.lo;
+		return;
+	}
+	r->hi = t >> b;
+	/* low word receives bits shifted down out of hi */
+	r->lo = (t << (32-b)) | (a.lo >> b);
+}
+
+/*
+ * _rshlv: logical (zero-filling) 64-bit right shift, *r = a >> b.
+ * Same shape as _rshav but t is unsigned, so vacated bits are zero.
+ */
+void
+_rshlv(Vlong *r, Vlong a, int b)
+{
+	ulong t;
+
+	t = a.hi;
+	if(b >= 32) {
+		r->hi = 0;
+		if(b >= 64) {
+			/* this is illegal re C standard */
+			r->lo = 0;
+			return;
+		}
+		r->lo = t >> (b-32);
+		return;
+	}
+	if(b <= 0) {
+		r->hi = t;
+		r->lo = a.lo;
+		return;
+	}
+	r->hi = t >> b;
+	/* low word receives bits shifted down out of hi */
+	r->lo = (t << (32-b)) | (a.lo >> b);
+}
+
+/*
+ * _lshv: 64-bit left shift, *r = a << b.
+ * Mirror image of _rshlv: bits move from lo up into hi.
+ */
+void
+_lshv(Vlong *r, Vlong a, int b)
+{
+	ulong t;
+
+	t = a.lo;
+	if(b >= 32) {
+		r->lo = 0;
+		if(b >= 64) {
+			/* this is illegal re C standard */
+			r->hi = 0;
+			return;
+		}
+		r->hi = t << (b-32);
+		return;
+	}
+	if(b <= 0) {
+		r->lo = t;
+		r->hi = a.hi;
+		return;
+	}
+	r->lo = t << b;
+	/* high word receives bits shifted up out of lo */
+	r->hi = (t >> (32-b)) | (a.hi << b);
+}
+
+void
+_andv(Vlong *r, Vlong a, Vlong b)
+{
+	r->hi = a.hi & b.hi;
+	r->lo = a.lo & b.lo;
+}
+
+void
+_orv(Vlong *r, Vlong a, Vlong b)
+{
+	r->hi = a.hi | b.hi;
+	r->lo = a.lo | b.lo;
+}
+
+void
+_xorv(Vlong *r, Vlong a, Vlong b)
+{
+	r->hi = a.hi ^ b.hi;
+	r->lo = a.lo ^ b.lo;
+}
+
+/*
+ * _vpp: 64-bit post-increment, *l = (*r)++.
+ * The old value is copied out first, then r is incremented with
+ * carry (lo wrapping to 0 propagates into hi).
+ */
+void
+_vpp(Vlong *l, Vlong *r)
+{
+
+	l->hi = r->hi;
+	l->lo = r->lo;
+	r->lo++;
+	if(r->lo == 0)	/* lo wrapped => carry */
+		r->hi++;
+}
+
+/*
+ * _vmm: 64-bit post-decrement, *l = (*r)--.
+ * Note the borrow test runs BEFORE lo is decremented: lo == 0 is
+ * exactly the case where the decrement will wrap and hi must drop.
+ */
+void
+_vmm(Vlong *l, Vlong *r)
+{
+
+	l->hi = r->hi;
+	l->lo = r->lo;
+	if(r->lo == 0)	/* about to wrap => borrow from hi */
+		r->hi--;
+	r->lo--;
+}
+
+void
+_ppv(Vlong *l, Vlong *r)
+{
+
+	r->lo++;
+	if(r->lo == 0)
+		r->hi++;
+	l->hi = r->hi;
+	l->lo = r->lo;
+}
+
+void
+_mmv(Vlong *l, Vlong *r)
+{
+
+	if(r->lo == 0)
+		r->hi--;
+	r->lo--;
+	l->hi = r->hi;
+	l->lo = r->lo;
+}
+
+/*
+ * _vasop: runtime support for vlong assignment-operators (+=, -=,
+ * etc.) whose left side is a narrower integer lvalue.
+ * Widens the lvalue at *lv (interpreted per the compiler-supplied
+ * type code) to a Vlong, applies fn(result, lvalue, rv), stores the
+ * (truncated) result back through lv, and returns the full 64-bit
+ * result in *ret.  Unknown type codes abort.
+ */
+void
+_vasop(Vlong *ret, void *lv, void fn(Vlong*, Vlong, Vlong), int type, Vlong rv)
+{
+	Vlong t, u;
+
+	u.lo = 0;
+	u.hi = 0;
+	switch(type) {
+	default:
+		abort();
+		break;
+
+	case 1:	/* schar */
+		t.lo = *(schar*)lv;
+		t.hi = t.lo >> 31;	/* sign-extend into hi */
+		fn(&u, t, rv);
+		*(schar*)lv = u.lo;
+		break;
+
+	case 2:	/* uchar */
+		t.lo = *(uchar*)lv;
+		t.hi = 0;		/* zero-extend */
+		fn(&u, t, rv);
+		*(uchar*)lv = u.lo;
+		break;
+
+	case 3:	/* short */
+		t.lo = *(short*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(short*)lv = u.lo;
+		break;
+
+	case 4:	/* ushort */
+		t.lo = *(ushort*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(ushort*)lv = u.lo;
+		break;
+
+	case 9:	/* int */
+		t.lo = *(int*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(int*)lv = u.lo;
+		break;
+
+	case 10:	/* uint */
+		t.lo = *(uint*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(uint*)lv = u.lo;
+		break;
+
+	case 5:	/* long */
+		t.lo = *(long*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(long*)lv = u.lo;
+		break;
+
+	case 6:	/* ulong */
+		t.lo = *(ulong*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(ulong*)lv = u.lo;
+		break;
+
+	case 7:	/* vlong */
+	case 8:	/* uvlong */
+		/* already 64-bit: operate in place, no extension needed */
+		fn(&u, *(Vlong*)lv, rv);
+		*(Vlong*)lv = u;
+		break;
+	}
+	*ret = u;
+}
+
+void
+_p2v(Vlong *ret, void *p)
+{
+	long t;
+
+	t = (ulong)p;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_sl2v(Vlong *ret, long sl)
+{
+	long t;
+
+	t = sl;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_ul2v(Vlong *ret, ulong ul)
+{
+	long t;
+
+	t = ul;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_si2v(Vlong *ret, int si)
+{
+	long t;
+
+	t = si;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_ui2v(Vlong *ret, uint ui)
+{
+	long t;
+
+	t = ui;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_sh2v(Vlong *ret, long sh)
+{
+	long t;
+
+	t = (sh << 16) >> 16;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_uh2v(Vlong *ret, ulong ul)
+{
+	long t;
+
+	t = ul & 0xffff;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_sc2v(Vlong *ret, long uc)
+{
+	long t;
+
+	t = (uc << 24) >> 24;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_uc2v(Vlong *ret, ulong ul)
+{
+	long t;
+
+	t = ul & 0xff;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+long
+_v2sc(Vlong rv)
+{
+	long t;
+
+	t = rv.lo & 0xff;
+	return (t << 24) >> 24;
+}
+
+long
+_v2uc(Vlong rv)
+{
+
+	return rv.lo & 0xff;
+}
+
+long
+_v2sh(Vlong rv)
+{
+	long t;
+
+	t = rv.lo & 0xffff;
+	return (t << 16) >> 16;
+}
+
+long
+_v2uh(Vlong rv)
+{
+
+	return rv.lo & 0xffff;
+}
+
+long
+_v2sl(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+long
+_v2ul(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+long
+_v2si(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+long
+_v2ui(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+int
+_testv(Vlong rv)
+{
+	return rv.lo || rv.hi;
+}
+
+int
+_eqv(Vlong lv, Vlong rv)
+{
+	return lv.lo == rv.lo && lv.hi == rv.hi;
+}
+
+int
+_nev(Vlong lv, Vlong rv)
+{
+	return lv.lo != rv.lo || lv.hi != rv.hi;
+}
+
+int
+_ltv(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi < (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo < rv.lo);
+}
+
+int
+_lev(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi < (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo <= rv.lo);
+}
+
+int
+_gtv(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi > (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo > rv.lo);
+}
+
+int
+_gev(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi > (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo >= rv.lo);
+}
+
+int
+_lov(Vlong lv, Vlong rv)
+{
+	return lv.hi < rv.hi ||
+		(lv.hi == rv.hi && lv.lo < rv.lo);
+}
+
+int
+_lsv(Vlong lv, Vlong rv)
+{
+	return lv.hi < rv.hi ||
+		(lv.hi == rv.hi && lv.lo <= rv.lo);
+}
+
+int
+_hiv(Vlong lv, Vlong rv)
+{
+	return lv.hi > rv.hi ||
+		(lv.hi == rv.hi && lv.lo > rv.lo);
+}
+
+int
+_hsv(Vlong lv, Vlong rv)
+{
+	return lv.hi > rv.hi ||
+		(lv.hi == rv.hi && lv.lo >= rv.lo);
+}

+ 26 - 0
sys/src/ape/lib/mp/spim/mkfile

@@ -0,0 +1,26 @@
+APE=/sys/src/ape
+<$APE/config
+
+LIB=/$objtype/lib/ape/libmp.a
+
+SFILES=\
+	mpvecadd.s\
+	mpvecdigmuladd.s\
+	mpvecdigmulsub.s\
+	mpvecsub.s\
+#	mpdigdiv.s\
+
+HFILES=\
+	/sys/include/ape/mp.h\
+	../../../../libmp/port/dat.h
+
+OFILES=${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib
+
+%.$O:	../../../../libmp/spim/%.s
+	$AS ../../../../libmp/spim/$stem.s

+ 23 - 0
sys/src/ape/lib/sec/spim/mkfile

@@ -0,0 +1,23 @@
+APE=/sys/src/ape
+<$APE/config
+
+LIB=/$objtype/lib/ape/libsec.a
+
+FILES=\
+	md5block\
+	sha1block\
+
+HFILES=/sys/include/ape/libsec.h
+
+SFILES=${FILES:%=%.s}
+
+OFILES=${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib
+
+%.$O:	../../../../libsec/spim/%.s
+	$AS ../../../../libsec/spim/$stem.s

+ 2 - 0
sys/src/cmd/gs/arch.h

@@ -4,6 +4,8 @@
 #include "386.h"
 #elif Tmips
 #include "mips.h"
+#elif Tspim
+#include "spim.h"
 #elif Tpower
 #include "mips.h"
 #elif Talpha

+ 46 - 0
sys/src/cmd/gs/spim.h

@@ -0,0 +1,46 @@
+/* Parameters derived from machine and compiler architecture. */
+/* This file is generated mechanically by genarch.c. */
+
+	 /* ---------------- Scalar alignments ---------------- */
+
+#define ARCH_ALIGN_SHORT_MOD 2
+#define ARCH_ALIGN_INT_MOD 4
+#define ARCH_ALIGN_LONG_MOD 4
+#define ARCH_ALIGN_PTR_MOD 4
+#define ARCH_ALIGN_FLOAT_MOD 4
+#define ARCH_ALIGN_DOUBLE_MOD 4
+#define ARCH_ALIGN_STRUCT_MOD 4
+
+	 /* ---------------- Scalar sizes ---------------- */
+
+#define ARCH_LOG2_SIZEOF_CHAR 0
+#define ARCH_LOG2_SIZEOF_SHORT 1
+#define ARCH_LOG2_SIZEOF_INT 2
+#define ARCH_LOG2_SIZEOF_LONG 2
+#define ARCH_LOG2_SIZEOF_LONG_LONG 3
+#define ARCH_SIZEOF_PTR 4
+#define ARCH_SIZEOF_FLOAT 4
+#define ARCH_SIZEOF_DOUBLE 8
+#define ARCH_FLOAT_MANTISSA_BITS 24
+#define ARCH_DOUBLE_MANTISSA_BITS 53
+
+	 /* ---------------- Unsigned max values ---------------- */
+
+#define ARCH_MAX_UCHAR ((unsigned char)0xff + (unsigned char)0)
+#define ARCH_MAX_USHORT ((unsigned short)0xffff + (unsigned short)0)
+#define ARCH_MAX_UINT ((unsigned int)~0 + (unsigned int)0)
+#define ARCH_MAX_ULONG ((unsigned long)~0L + (unsigned long)0)
+
+	 /* ---------------- Cache sizes ---------------- */
+
+#define ARCH_CACHE1_SIZE 1048576
+#define ARCH_CACHE2_SIZE 4194304
+
+	 /* ---------------- Miscellaneous ---------------- */
+
+#define ARCH_IS_BIG_ENDIAN 0
+#define ARCH_PTRS_ARE_SIGNED 0
+#define ARCH_FLOATS_ARE_IEEE 1
+#define ARCH_ARITH_RSHIFT 2
+#define ARCH_CAN_SHIFT_FULL_LONG 0
+#define ARCH_DIV_NEG_POS_TRUNCATES 1

+ 2 - 0
sys/src/cmd/vc/swt.c

@@ -124,6 +124,8 @@ outstring(char *s, long n)
 {
 	long r;
 
+	if(suppress)
+		return nstring;
 	r = nstring;
 	while(n) {
 		string[mnstring] = *s++;

+ 32 - 9
sys/src/cmd/vl/asm.c

@@ -675,7 +675,10 @@ datblk(long s, long n, int str)
 				fl = ieeedtof(p->to.ieee);
 				cast = (char*)&fl;
 				for(; i<c; i++) {
-					buf.dbuf[l] = cast[fnuxi8[i+4]];
+					if(little)
+						buf.dbuf[l] = cast[fnuxi8[i]];
+					else
+						buf.dbuf[l] = cast[fnuxi8[i+4]];
 					l++;
 				}
 				break;
@@ -1025,8 +1028,13 @@ asmout(Prog *p, Optab *o, int aflag)
 			o1 = OP_IRR(opirr(ALAST), v>>16, REGZERO, REGTMP);
 			o2 = OP_IRR(opirr(AOR), v, REGTMP, REGTMP);
 			o3 = OP_RRR(oprrr(AADDU), r, REGTMP, REGTMP);
-			o4 = OP_IRR(opirr(AMOVF+ALAST), 0, REGTMP, p->to.reg+1);
-			o5 = OP_IRR(opirr(AMOVF+ALAST), 4, REGTMP, p->to.reg);
+			if(little) {
+				o4 = OP_IRR(opirr(AMOVF+ALAST), 0, REGTMP, p->to.reg);
+				o5 = OP_IRR(opirr(AMOVF+ALAST), 4, REGTMP, p->to.reg+1);
+			} else {
+				o4 = OP_IRR(opirr(AMOVF+ALAST), 0, REGTMP, p->to.reg+1);
+				o5 = OP_IRR(opirr(AMOVF+ALAST), 4, REGTMP, p->to.reg);
+			}
 			break;
 		case 16:
 			o1 = OP_IRR(opirr(ALAST), v>>16, REGZERO, REGTMP);
@@ -1035,8 +1043,13 @@ asmout(Prog *p, Optab *o, int aflag)
 			o4 = OP_IRR(opirr(AMOVF+ALAST), 0, REGTMP, p->to.reg);
 			break;
 		case 8:
-			o1 = OP_IRR(opirr(AMOVF+ALAST), v, r, p->to.reg+1);
-			o2 = OP_IRR(opirr(AMOVF+ALAST), v+4, r, p->to.reg);
+			if(little) {
+				o1 = OP_IRR(opirr(AMOVF+ALAST), v, r, p->to.reg);
+				o2 = OP_IRR(opirr(AMOVF+ALAST), v+4, r, p->to.reg+1);
+			} else {
+				o1 = OP_IRR(opirr(AMOVF+ALAST), v, r, p->to.reg+1);
+				o2 = OP_IRR(opirr(AMOVF+ALAST), v+4, r, p->to.reg);
+			}
 			break;
 		case 4:
 			o1 = OP_IRR(opirr(AMOVF+ALAST), v, r, p->to.reg);
@@ -1056,8 +1069,13 @@ asmout(Prog *p, Optab *o, int aflag)
 			o1 = OP_IRR(opirr(ALAST), v>>16, REGZERO, REGTMP);
 			o2 = OP_IRR(opirr(AOR), v, REGTMP, REGTMP);
 			o3 = OP_RRR(oprrr(AADDU), r, REGTMP, REGTMP);
-			o4 = OP_IRR(opirr(AMOVF), 0, REGTMP, p->from.reg+1);
-			o5 = OP_IRR(opirr(AMOVF), 4, REGTMP, p->from.reg);
+			if(little) {
+				o4 = OP_IRR(opirr(AMOVF), 0, REGTMP, p->from.reg);
+				o5 = OP_IRR(opirr(AMOVF), 4, REGTMP, p->from.reg+1);
+			} else {
+				o4 = OP_IRR(opirr(AMOVF), 0, REGTMP, p->from.reg+1);
+				o5 = OP_IRR(opirr(AMOVF), 4, REGTMP, p->from.reg);
+			}
 			break;
 		case 16:
 			if(r == REGTMP)
@@ -1068,8 +1086,13 @@ asmout(Prog *p, Optab *o, int aflag)
 			o4 = OP_IRR(opirr(AMOVF), 0, REGTMP, p->from.reg);
 			break;
 		case 8:
-			o1 = OP_IRR(opirr(AMOVF), v, r, p->from.reg+1);
-			o2 = OP_IRR(opirr(AMOVF), v+4, r, p->from.reg);
+			if(little) {
+				o1 = OP_IRR(opirr(AMOVF), v, r, p->from.reg);
+				o2 = OP_IRR(opirr(AMOVF), v+4, r, p->from.reg+1);
+			} else {
+				o1 = OP_IRR(opirr(AMOVF), v, r, p->from.reg+1);
+				o2 = OP_IRR(opirr(AMOVF), v+4, r, p->from.reg);
+			}
 			break;
 		case 4:
 			o1 = OP_IRR(opirr(AMOVF), v, r, p->from.reg);

+ 4 - 0
sys/src/libc/spim/argv0.s

@@ -0,0 +1,4 @@
+GLOBL	argv0(SB), $4
+GLOBL	_tos(SB), $4
+GLOBL	_privates(SB), $4
+GLOBL	_nprivates(SB), $4

+ 52 - 0
sys/src/libc/spim/atom.s

@@ -0,0 +1,52 @@
+/*
+ *	R4000 user-level atomic operations
+ */
+
+#define	LL(base, rt)	WORD	$((060<<26)|((base)<<21)|((rt)<<16))
+#define	SC(base, rt)	WORD	$((070<<26)|((base)<<21)|((rt)<<16))
+#define	NOOP		WORD	$0x27
+
+TEXT ainc(SB), 1, $-4			/* long ainc(long *); */
+TEXT _xinc(SB), 1, $-4			/* void _xinc(long *); */
+	MOVW	R1, R2			/* address of counter */
+loop:	MOVW	$1, R3
+	LL(2, 1)
+	NOOP
+	ADDU	R1, R3
+	MOVW	R3, R1			/* return new value */
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop
+	RET
+
+TEXT adec(SB), 1, $-4			/* long adec(long*); */
+TEXT _xdec(SB), 1, $-4			/* long _xdec(long *); */
+	MOVW	R1, R2			/* address of counter */
+loop1:	MOVW	$-1, R3
+	LL(2, 1)
+	NOOP
+	ADDU	R1, R3
+	MOVW	R3, R1			/* return new value */
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop1
+	RET
+
+/*
+ * int cas(uint* p, int ov, int nv);
+ */
+TEXT cas(SB), 1, $-4
+	MOVW	ov+4(FP), R2
+	MOVW	nv+8(FP), R3
+spincas:
+	LL(1, 4)			/* R4 = *R1 */
+	NOOP
+	BNE	R2, R4, fail
+	SC(1, 3)			/* *R1 = R3 */
+	NOOP
+	BEQ	R3, spincas		/* R3 == 0 means store failed */
+	MOVW	$1, R1
+	RET
+fail:
+	MOVW	$0, R1
+	RET

+ 3 - 0
sys/src/libc/spim/c_fcr0.s

@@ -0,0 +1,3 @@
+	TEXT	C_fcr0(SB), $0
+	MOVW	FCR0, R1
+	RET

+ 10 - 0
sys/src/libc/spim/cycles.c

@@ -0,0 +1,10 @@
+#include <u.h>
+#include <libc.h>
+
+#pragma profile off
+
+void
+cycles(uvlong*u)
+{
+	*u = 0LL;
+}

+ 3 - 0
sys/src/libc/spim/getcallerpc.s

@@ -0,0 +1,3 @@
+TEXT	getcallerpc(SB), $0
+	MOVW	0(SP), R1
+	RET

+ 15 - 0
sys/src/libc/spim/getfcr.s

@@ -0,0 +1,15 @@
+TEXT	getfsr(SB), $0
+	MOVW	FCR31, R1
+	RET
+
+TEXT	setfsr(SB), $0
+	MOVW	R1, FCR31
+	RET
+
+TEXT	getfcr(SB), $0
+	MOVW	FCR31, R1
+	RET
+
+TEXT	setfcr(SB), $0
+	MOVW	R1, FCR31
+	RET

+ 25 - 0
sys/src/libc/spim/main9.s

@@ -0,0 +1,25 @@
+#define NPRIVATES	16
+
+TEXT	_main(SB), 1, $(16 + NPRIVATES*4)
+
+	MOVW	$setR30(SB), R30
+	MOVW	R1, _tos(SB)
+
+	MOVW	$p-64(SP), R1
+	MOVW	R1, _privates(SB)
+	MOVW	$NPRIVATES, R1
+	MOVW	R1, _nprivates(SB)
+
+	MOVW	inargc-4(FP), R1
+	MOVW	$inargv+0(FP), R2
+	MOVW	R1, 4(R29)
+	MOVW	R2, 8(R29)
+	JAL	main(SB)
+loop:
+	MOVW	$_exitstr<>(SB), R1
+	MOVW	R1, 4(R29)
+	JAL	exits(SB)
+	JMP	loop
+
+DATA	_exitstr<>+0(SB)/4, $"main"
+GLOBL	_exitstr<>+0(SB), $5

+ 41 - 0
sys/src/libc/spim/main9p.s

@@ -0,0 +1,41 @@
+#define NPRIVATES	16
+
+TEXT	_mainp(SB), 1, $(16 + NPRIVATES*4)
+
+	MOVW	$setR30(SB), R30
+	/* _tos = arg */
+	MOVW	R1, _tos(SB)
+
+	MOVW	$p-64(SP), R1
+	MOVW	R1, _privates(SB)
+	MOVW	$NPRIVATES, R1
+	MOVW	R1, _nprivates(SB)
+
+	/* _profmain(); */
+	JAL	_profmain(SB)
+	/* _tos->prof.pp = _tos->prof.next; */
+	MOVW	_tos(SB), R1
+	MOVW	4(R1), R2
+	MOVW	R2, 0(R1)
+	/* main(argc, argv); */
+	MOVW	inargc-4(FP), R1
+	MOVW	$inargv+0(FP), R2
+	MOVW	R1, 4(R29)
+	MOVW	R2, 8(R29)
+	JAL	main(SB)
+loop:
+	MOVW	$exits<>(SB), R1
+	MOVW	R1, 4(R29)
+	JAL	exits(SB)
+	MOVW	$_profin(SB), R0	/* force loading of profile */
+	JMP	loop
+
+TEXT	_savearg(SB), 1, $0
+	RET
+
+TEXT	_callpc(SB), 1, $0
+	MOVW	argp-4(FP), R1
+	RET
+
+DATA	exits<>+0(SB)/4, $"main"
+GLOBL	exits<>+0(SB), $5

+ 20 - 0
sys/src/libc/spim/memccpy.s

@@ -0,0 +1,20 @@
+	TEXT	memccpy(SB), $0
+MOVW R1, 0(FP)
+	MOVW	n+12(FP), R1
+	BEQ	R1, ret
+	MOVW	s1+0(FP), R3
+	MOVW	s2+4(FP), R2
+	MOVBU	c+8(FP), R4	/* little endian */
+	ADDU	R1, R2, R5
+
+l1:	MOVBU	(R2), R6
+	ADDU	$1, R2
+	MOVBU	R6, (R3)
+	ADDU	$1, R3
+	BEQ	R4, R6, eq
+	BNE	R2, R5, l1
+	MOVW	$0, R1
+	RET
+
+eq:	MOVW	R3, R1
+ret:	RET

+ 39 - 0
sys/src/libc/spim/memchr.s

@@ -0,0 +1,39 @@
+	TEXT	memchr(SB), $0
+MOVW R1, 0(FP)
+
+	MOVW	n+8(FP), R1
+	MOVW	s1+0(FP), R2
+	MOVBU	c+4(FP), R3	// little endian, 4(FP) instead of 7(FP)
+	ADDU	R1, R2, R6
+
+	AND	$(~1), R1, R5
+	ADDU	R2, R5
+	BEQ	R2, R5, lt2
+
+l1:
+	MOVBU	0(R2), R4
+	MOVBU	1(R2), R7
+	BEQ	R3, R4, eq0
+	ADDU	$2, R2
+	BEQ	R3, R7, eq
+	BNE	R2, R5, l1
+
+lt2:
+	BEQ	R2, R6, zret
+
+l2:
+	MOVBU	(R2), R4
+	ADDU	$1, R2
+	BEQ	R3, R4, eq
+	BNE	R2, R6, l2
+zret:
+	MOVW	R0, R1
+	RET
+
+eq0:
+	MOVW	R2, R1
+	RET
+
+eq:
+	SUBU	$1,R2, R1
+	RET

+ 116 - 0
sys/src/libc/spim/memcmp.s

@@ -0,0 +1,116 @@
+	TEXT	memcmp(SB), $0
+MOVW R1, 0(FP)
+
+/*
+ * performance:
+ *	alligned about 1.0us/call and 17.4mb/sec
+ *	unalligned is about 3.1mb/sec
+ */
+
+	MOVW	n+8(FP), R3		/* R3 is count */
+	MOVW	s1+0(FP), R4		/* R4 is pointer1 */
+	MOVW	s2+4(FP), R5		/* R5 is pointer2 */
+	ADDU	R3,R4, R6		/* R6 is end pointer1 */
+
+	JMP	out		// XXX little endian
+
+/*
+ * if not at least 4 chars,
+ * dont even mess around.
+ * 3 chars to guarantee any
+ * rounding up to a word
+ * boundary and 4 characters
+ * to get at least maybe one
+ * full word cmp.
+ */
+	SGT	$4,R3, R1
+	BNE	R1, out
+
+/*
+ * test if both pointers
+ * are similarly word alligned
+ */
+	XOR	R4,R5, R1
+	AND	$3, R1
+	BNE	R1, out
+
+/*
+ * byte at a time to word allign
+ */
+l1:
+	AND	$3,R4, R1
+	BEQ	R1, l2
+	MOVBU	0(R4), R8
+	MOVBU	0(R5), R9
+	ADDU	$1, R4
+	BNE	R8,R9, ne
+	ADDU	$1, R5
+	JMP	l1
+
+/*
+ * turn R3 into end pointer1-15
+ * cmp 16 at a time while theres room
+ */
+l2:
+	ADDU	$-15,R6, R3
+l3:
+	SGTU	R3,R4, R1
+	BEQ	R1, l4
+	MOVW	0(R4), R8
+	MOVW	0(R5), R9
+	MOVW	4(R4), R10
+	BNE	R8,R9, ne
+	MOVW	4(R5), R11
+	MOVW	8(R4), R8
+	BNE	R10,R11, ne1
+	MOVW	8(R5), R9
+	MOVW	12(R4), R10
+	BNE	R8,R9, ne
+	MOVW	12(R5), R11
+	ADDU	$16, R4
+	BNE	R10,R11, ne1
+	BNE	R8,R9, ne
+	ADDU	$16, R5
+	JMP	l3
+
+/*
+ * turn R3 into end pointer1-3
+ * cmp 4 at a time while theres room
+ */
+l4:
+	ADDU	$-3,R6, R3
+l5:
+	SGTU	R3,R4, R1
+	BEQ	R1, out
+	MOVW	0(R4), R8
+	MOVW	0(R5), R9
+	ADDU	$4, R4
+	BNE	R8,R9, ne	/* only works because big endian */
+	ADDU	$4, R5
+	JMP	l5
+
+/*
+ * last loop, cmp byte at a time
+ */
+out:
+	SGTU	R6,R4, R1
+	BEQ	R1, ret
+	MOVBU	0(R4), R8
+	MOVBU	0(R5), R9
+	ADDU	$1, R4
+	BNE	R8,R9, ne
+	ADDU	$1, R5
+	JMP	out
+
+ne1:
+	SGTU	R10,R11, R1
+	BNE	R1, ret
+	MOVW	$-1,R1
+	RET
+ne:
+	SGTU	R8,R9, R1
+	BNE	R1, ret
+	MOVW	$-1,R1
+ret:
+	RET
+	END

+ 237 - 0
sys/src/libc/spim/memmove.s

@@ -0,0 +1,237 @@
+	TEXT	memmove(SB), $0
+
+	JMP	move
+
+	TEXT	memcpy(SB), $0
+move:
+	MOVW	R1, s1+0(FP)
+
+	MOVW	n+8(FP), R3		/* R3 is count */
+	MOVW	R1, R4			/* R4 is to-pointer */
+	SGT	R0, R3, R5
+	BEQ	R5, ok
+	MOVW	(R0), R0		/* abort if negative count */
+ok:
+	MOVW	s2+4(FP), R5		/* R5 is from-pointer */
+	ADDU	R3,R5, R7		/* R7 is end from-pointer */
+	ADDU	R3,R4, R6		/* R6 is end to-pointer */
+
+/*
+ * easiest test is copy backwards if
+ * destination string has higher mem address
+ */
+	SGT	$4,R3, R2
+	SGTU	R4,R5, R1
+	BNE	R1, back
+
+/*
+ * if not at least 4 chars,
+ * don't even mess around.
+ * 3 chars to guarantee any
+ * rounding up to a word
+ * boundary and 4 characters
+ * to get at least maybe one
+ * full word store.
+ */
+	BNE	R2, fout
+
+
+/*
+ * byte at a time to word align destination
+ */
+f1:
+	AND	$3,R4, R1
+	BEQ	R1, f2
+	MOVB	0(R5), R8
+	ADDU	$1, R5
+	MOVB	R8, 0(R4)
+	ADDU	$1, R4
+	JMP	f1
+
+/*
+ * test if source is now word aligned
+ */
+f2:
+	AND	$3, R5, R1
+	BNE	R1, fun2
+/*
+ * turn R3 into to-end pointer-15
+ * copy 16 at a time while theres room.
+ * R6 is smaller than R7 --
+ * there are problems if R7 is 0.
+ */
+	ADDU	$-15,R6, R3
+f3:
+	SGTU	R3,R4, R1
+	BEQ	R1, f4
+	MOVW	0(R5), R8
+	MOVW	4(R5), R9
+	MOVW	R8, 0(R4)
+	MOVW	8(R5), R8
+	MOVW	R9, 4(R4)
+	MOVW	12(R5), R9
+	ADDU	$16, R5
+	MOVW	R8, 8(R4)
+	MOVW	R9, 12(R4)
+	ADDU	$16, R4
+	JMP	f3
+
+/*
+ * turn R3 into to-end pointer-3
+ * copy 4 at a time while theres room
+ */
+f4:
+	ADDU	$-3,R6, R3
+f5:
+	SGTU	R3,R4, R1
+	BEQ	R1, fout
+	MOVW	0(R5), R8
+	ADDU	$4, R5
+	MOVW	R8, 0(R4)
+	ADDU	$4, R4
+	JMP	f5
+
+/*
+ * forward copy, unaligned
+ * turn R3 into to-end pointer-15
+ * copy 16 at a time while theres room.
+ * R6 is smaller than R7 --
+ * there are problems if R7 is 0.
+ */
+fun2:
+	ADDU	$-15,R6, R3
+fun3:
+	SGTU	R3,R4, R1
+	BEQ	R1, fun4
+	MOVWR	0(R5), R8
+	MOVWL	3(R5), R8
+	MOVWR	4(R5), R9
+	MOVWL	7(R5), R9
+	MOVW	R8, 0(R4)
+	MOVWR	8(R5), R8
+	MOVWL	11(R5), R8
+	MOVW	R9, 4(R4)
+	MOVWR	12(R5), R9
+	MOVWL	15(R5), R9
+	ADDU	$16, R5
+	MOVW	R8, 8(R4)
+	MOVW	R9, 12(R4)
+	ADDU	$16, R4
+	JMP	fun3
+
+/*
+ * turn R3 into to-end pointer-3
+ * copy 4 at a time while theres room
+ */
+fun4:
+	ADDU	$-3,R6, R3
+fun5:
+	SGTU	R3,R4, R1
+	BEQ	R1, fout
+	MOVWR	0(R5), R8
+	MOVWL	3(R5), R8
+	ADDU	$4, R5
+	MOVW	R8, 0(R4)
+	ADDU	$4, R4
+	JMP	fun5
+
+/*
+ * last loop, copy byte at a time
+ */
+fout:
+	BEQ	R7,R5, ret
+	MOVB	0(R5), R8
+	ADDU	$1, R5
+	MOVB	R8, 0(R4)
+	ADDU	$1, R4
+	JMP	fout
+
+/*
+ * whole thing repeated for backwards
+ */
+back:
+	BNE	R2, bout
+b1:
+	AND	$3,R6, R1
+	BEQ	R1, b2
+	MOVB	-1(R7), R8
+	ADDU	$-1, R7
+	MOVB	R8, -1(R6)
+	ADDU	$-1, R6
+	JMP	b1
+
+b2:
+	AND	$3, R7, R1
+	BNE	R1, bun2
+
+	ADDU	$15,R5, R3
+b3:
+	SGTU	R7,R3, R1
+	BEQ	R1, b4
+	MOVW	-4(R7), R8
+	MOVW	-8(R7), R9
+	MOVW	R8, -4(R6)
+	MOVW	-12(R7), R8
+	MOVW	R9, -8(R6)
+	MOVW	-16(R7), R9
+	ADDU	$-16, R7
+	MOVW	R8, -12(R6)
+	MOVW	R9, -16(R6)
+	ADDU	$-16, R6
+	JMP	b3
+b4:
+	ADDU	$3,R5, R3
+b5:
+	SGTU	R7,R3, R1
+	BEQ	R1, bout
+	MOVW	-4(R7), R8
+	ADDU	$-4, R7
+	MOVW	R8, -4(R6)
+	ADDU	$-4, R6
+	JMP	b5
+
+bun2:
+	ADDU	$15,R5, R3
+bun3:
+	SGTU	R7,R3, R1
+	BEQ	R1, bun4
+	MOVWR	-4(R7), R8
+	MOVWL	-1(R7), R8
+	MOVWR	-8(R7), R9
+	MOVWL	-5(R7), R9
+	MOVW	R8, -4(R6)
+	MOVWR	-12(R7), R8
+	MOVWL	-9(R7), R8
+	MOVW	R9, -8(R6)
+	MOVWR	-16(R7), R9
+	MOVWL	-13(R7), R9
+	ADDU	$-16, R7
+	MOVW	R8, -12(R6)
+	MOVW	R9, -16(R6)
+	ADDU	$-16, R6
+	JMP	bun3
+
+bun4:
+	ADDU	$3,R5, R3
+bun5:
+	SGTU	R7,R3, R1
+	BEQ	R1, bout
+	MOVWR	-4(R7), R8
+	MOVWL	-1(R7), R8
+	ADDU	$-4, R7
+	MOVW	R8, -4(R6)
+	ADDU	$-4, R6
+	JMP	bun5
+
+bout:
+	BEQ	R7,R5, ret
+	MOVB	-1(R7), R8
+	ADDU	$-1, R7
+	MOVB	R8, -1(R6)
+	ADDU	$-1, R6
+	JMP	bout
+
+ret:
+	MOVW	s1+0(FP), R1
+	RET
+	END

+ 88 - 0
sys/src/libc/spim/memset.s

@@ -0,0 +1,88 @@
+	TEXT	memset(SB),$12
+MOVW R1, 0(FP)
+
+/*
+ * performance:
+ *	about 1us/call and 28mb/sec
+ */
+
+	MOVW	n+8(FP), R3		/* R3 is count */
+	MOVW	p+0(FP), R4		/* R4 is pointer */
+	MOVW	c+4(FP), R5		/* R5 is char */
+	ADDU	R3,R4, R6		/* R6 is end pointer */
+
+/*
+ * if not at least 4 chars,
+ * dont even mess around.
+ * 3 chars to guarantee any
+ * rounding up to a word
+ * boundary and 4 characters
+ * to get at least maybe one
+ * full word store.
+ */
+	SGT	$4,R3, R1
+	BNE	R1, out
+
+/*
+ * turn R5 into a word of characters
+ */
+	AND	$0xff, R5
+	SLL	$8,R5, R1
+	OR	R1, R5
+	SLL	$16,R5, R1
+	OR	R1, R5
+
+/*
+ * store one byte at a time until pointer
+ * is alligned on a word boundary
+ */
+l1:
+	AND	$3,R4, R1
+	BEQ	R1, l2
+	MOVB	R5, 0(R4)
+	ADDU	$1, R4
+	JMP	l1
+
+/*
+ * turn R3 into end pointer-15
+ * store 16 at a time while theres room
+ */
+l2:
+	ADDU	$-15,R6, R3
+l3:
+	SGTU	R3,R4, R1
+	BEQ	R1, l4
+	MOVW	R5, 0(R4)
+	MOVW	R5, 4(R4)
+	ADDU	$16, R4
+	MOVW	R5, -8(R4)
+	MOVW	R5, -4(R4)
+	JMP	l3
+
+/*
+ * turn R3 into end pointer-3
+ * store 4 at a time while theres room
+ */
+l4:
+	ADDU	$-3,R6, R3
+l5:
+	SGTU	R3,R4, R1
+	BEQ	R1, out
+	MOVW	R5, 0(R4)
+	ADDU	$4, R4
+	JMP	l5
+
+/*
+ * last loop, store byte at a time
+ */
+out:
+	SGTU	R6,R4 ,R1
+	BEQ	R1, ret
+	MOVB	R5, 0(R4)
+	ADDU	$1, R4
+	JMP	out
+
+ret:
+	MOVW	s1+0(FP), R1
+	RET
+	END

+ 40 - 0
sys/src/libc/spim/mkfile

@@ -0,0 +1,40 @@
+objtype=spim
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libc.a
+SFILES=\
+	argv0.s\
+	atom.s\
+	c_fcr0.s\
+	getcallerpc.s\
+	getfcr.s\
+	main9.s\
+	main9p.s\
+	memccpy.s\
+	memchr.s\
+	memcmp.s\
+	memmove.s\
+	memset.s\
+	setjmp.s\
+	strchr.s\
+	strcmp.s\
+	strcpy.s\
+	tas.s\
+	vlop.s\
+
+CFILES=\
+	cycles.c\
+	notejmp.c\
+	sqrt.c\
+	vlrt.c\
+
+HFILES=/sys/include/libc.h
+
+OFILES=${CFILES:%.c=%.$O} ${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$CFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib

+ 16 - 0
sys/src/libc/spim/notejmp.c

@@ -0,0 +1,16 @@
+#include <u.h>
+#include <libc.h>
+#include <ureg.h>
+
+void
+notejmp(void *vr, jmp_buf j, int ret)
+{
+	struct Ureg *r = vr;
+
+	r->r1 = ret;
+	if(ret == 0)
+		r->r1 = 1;
+	r->pc = j[JMPBUFPC];
+	r->sp = j[JMPBUFSP];
+	noted(NCONT);
+}

+ 14 - 0
sys/src/libc/spim/setjmp.s

@@ -0,0 +1,14 @@
+TEXT	setjmp(SB), 1, $-4
+	MOVW	R29, (R1)
+	MOVW	R31, 4(R1)
+	MOVW	$0, R1
+	RET
+
+TEXT	longjmp(SB), 1, $-4
+	MOVW	r+4(FP), R3
+	BNE	R3, ok		/* ansi: "longjmp(0) => longjmp(1)" */
+	MOVW	$1, R3		/* bless their pointed heads */
+ok:	MOVW	(R1), R29
+	MOVW	4(R1), R31
+	MOVW	R3, R1
+	RET

+ 103 - 0
sys/src/libc/spim/sqrt.c

@@ -0,0 +1,103 @@
+#include <u.h>
+#include <libc.h>
+
+static	long	sqtab[64] =
+{
+	0x6cdb2, 0x726d4, 0x77ea3, 0x7d52f, 0x82a85, 0x87eb1, 0x8d1c0, 0x923bd,
+	0x974b2, 0x9c4a8, 0xa13a9, 0xa61be, 0xaaeee, 0xafb41, 0xb46bf, 0xb916e,
+	0xbdb55, 0xc247a, 0xc6ce3, 0xcb495, 0xcfb95, 0xd41ea, 0xd8796, 0xdcca0,
+	0xe110c, 0xe54dd, 0xe9818, 0xedac0, 0xf1cd9, 0xf5e67, 0xf9f6e, 0xfdfef,
+	0x01fe0, 0x05ee6, 0x09cfd, 0x0da30, 0x11687, 0x1520c, 0x18cc8, 0x1c6c1,
+	0x20000, 0x2388a, 0x27068, 0x2a79e, 0x2de32, 0x3142b, 0x3498c, 0x37e5b,
+	0x3b29d, 0x3e655, 0x41989, 0x44c3b, 0x47e70, 0x4b02b, 0x4e16f, 0x51241,
+	0x542a2, 0x57296, 0x5a220, 0x5d142, 0x60000, 0x62e5a, 0x65c55, 0x689f2,
+};
+
+double
+sqrt(double arg)
+{
+	int e, ms;
+	double a, t;
+	union
+	{
+		double	d;
+		struct
+		{
+			long	ls;
+			long	ms;
+		};
+	} u;
+
+	u.d = arg;
+	ms = u.ms;
+
+	/*
+	 * sign extend the mantissa with
+	 * exponent. result should be > 0 for
+	 * normal case.
+	 */
+	e = ms >> 20;
+	if(e <= 0) {
+		if(e == 0)
+			return 0;
+		return NaN();
+	}
+
+	/*
+	 * pick up arg/4 by adjusting exponent
+	 */
+	u.ms = ms - (2 << 20);
+	a = u.d;
+
+	/*
+	 * use 5 bits of mantissa and 1 bit
+	 * of exponent to form table index.
+	 * insert exponent/2 - 1.
+	 */
+	e = (((e - 1023) >> 1) + 1022) << 20;
+	u.ms = *(long*)((char*)sqtab + ((ms >> 13) & 0xfc)) | e;
+	u.ls = 0;
+
+	/*
+	 * three laps of newton
+	 */
+	e = 1 << 20;
+	t = u.d;
+	u.d = t + a/t;
+	u.ms -= e;		/* u.d /= 2; */
+	t = u.d;
+	u.d = t + a/t;
+	u.ms -= e;		/* u.d /= 2; */
+	t = u.d;
+
+	return t + a/t;
+}
+
+/*
+ * this is the program that generated the table.
+ * it calls sqrt by some other means.
+ *
+ * void
+ * main(void)
+ * {
+ * 	int i;
+ * 	union	U
+ * 	{
+ * 		double	d;
+ * 		struct
+ * 		{
+ * 			long	ls;
+ * 			long	ms;
+ * 		};
+ * 	} u;
+ *
+ * 	for(i=0; i<64; i++) {
+ * 		u.ms = (i<<15) | 0x3fe04000;
+ * 		u.ls = 0;
+ * 		u.d = sqrt(u.d);
+ * 		print(" 0x%.5lux,", u.ms & 0xfffff);
+ * 	}
+ * 	print("\n");
+ * 	exits(0);
+ * }
+ */

+ 63 - 0
sys/src/libc/spim/strchr.s

@@ -0,0 +1,63 @@
+	TEXT	strchr(SB), $0
+MOVW R1, 0(FP)
+	MOVB	c+4(FP), R4	// little endian, 4(FP) instead of 7(FP)
+	MOVW	s+0(FP), R3
+
+	BEQ	R4, l2
+
+/*
+ * char is not null
+ */
+l1:
+	MOVB	(R3), R1
+	ADDU	$1, R3
+	BEQ	R1, ret
+	BNE	R1,R4, l1
+	JMP	rm1
+
+/*
+ * char is null
+ * align to word
+ */
+l2:
+	AND	$3,R3, R1
+	BEQ	R1, l3
+	MOVB	(R3), R1
+	ADDU	$1, R3
+	BNE	R1, l2
+	JMP	rm1
+
+l3:
+	MOVW	$0xff000000, R6
+	MOVW	$0x00ff0000, R7
+
+l4:
+	MOVW	(R3), R5
+	ADDU	$4, R3
+	AND	$0xff,R5, R1	/* byte 0 */
+	AND	$0xff00,R5, R2	/* byte 1 */
+	BEQ	R1, b0
+	AND	R7,R5, R1	/* byte 2 */
+	BEQ	R2, b1
+	AND	R6,R5, R2	/* byte 3 */
+	BEQ	R1, b2
+	BNE	R2, l4
+
+rm1:
+	ADDU	$-1,R3, R1
+	JMP	ret
+
+b2:
+	ADDU	$-2,R3, R1
+	JMP	ret
+
+b1:
+	ADDU	$-3,R3, R1
+	JMP	ret
+
+b0:
+	ADDU	$-4,R3, R1
+	JMP	ret
+
+ret:
+	RET

+ 21 - 0
sys/src/libc/spim/strcmp.s

@@ -0,0 +1,21 @@
+TEXT	strcmp(SB), $0
+
+	MOVW	s2+4(FP), R2
+
+l1:
+	MOVB	(R2), R3
+	MOVB	(R1), R4
+	ADDU	$1, R1
+	BEQ	R3, end
+	ADDU	$1, R2
+	BEQ	R3, R4, l1
+
+	SGTU	R4, R3, R1
+	BNE	R1, ret
+	MOVW	$-1, R1
+	RET
+
+end:
+	SGTU	R4, R3, R1
+ret:
+	RET

+ 92 - 0
sys/src/libc/spim/strcpy.s

@@ -0,0 +1,92 @@
+TEXT	strcpy(SB), $0
+
+	MOVW	s2+4(FP),R2		/* R2 is from pointer */
+	MOVW	R1, R3			/* R3 is to pointer */
+
+/*
+ * align 'from' pointer
+ */
+l1:
+	AND	$3, R2, R5
+	ADDU	$1, R2
+	BEQ	R5, l2
+	MOVB	-1(R2), R5
+	ADDU	$1, R3
+	MOVB	R5, -1(R3)
+	BNE	R5, l1
+	RET
+
+/*
+ * test if 'to' is also alligned
+ */
+l2:
+	AND	$3,R3, R5
+	BEQ	R5, l4
+
+/*
+ * copy 4 at a time, 'to' not aligned
+ */
+l3:
+	MOVW	-1(R2), R4
+	ADD	$4, R2
+	ADD	$4, R3
+	AND	$0xff,R4, R5
+	MOVB	R5, -4(R3)
+	BEQ	R5, out
+
+	SRL	$8,R4, R5
+	AND	$0xff, R5
+	MOVB	R5, -3(R3)
+	BEQ	R5, out
+
+	SRL	$16,R4, R5
+	AND	$0xff, R5
+	MOVB	R5, -2(R3)
+	BEQ	R5, out
+
+	SRL	$24,R4, R5
+	MOVB	R5, -1(R3)
+	BNE	R5, l3
+
+out:
+	RET
+
+/*
+ * word at a time both aligned
+ */
+l4:
+	MOVW	$0xff000000, R7
+	MOVW	$0x00ff0000, R8
+
+l5:
+	ADDU	$4, R3
+	MOVW	-1(R2), R4	/* fetch */
+
+	ADDU	$4, R2
+	AND	$0xff,R4, R5	/* is it byte 0 */
+	AND	$0xff00,R4, R6	/* is it byte 1 */
+	BEQ	R5, b0
+
+	AND	R8,R4, R5	/* is it byte 2 */
+	BEQ	R6, b1
+
+	AND	R7,R4, R6	/* is it byte 3 */
+	BEQ	R5, b2
+
+	MOVW	R4, -4(R3)	/* store */
+	BNE	R6, l5
+	JMP	out
+
+b0:
+	MOVB	$0, -4(R3)
+	JMP	out
+
+b1:
+	MOVB	R4, -4(R3)
+	MOVB	$0, -3(R3)
+	JMP	out
+
+b2:
+	MOVH	R4, -4(R3)
+	MOVB	$0, -2(R3)
+	JMP	out

+ 30 - 0
sys/src/libc/spim/tas.s

@@ -0,0 +1,30 @@
+/*
+ *	mips user level lock code
+ */
+
+#define	LL(base, rt)	WORD	$((060<<26)|((base)<<21)|((rt)<<16))
+#define	SC(base, rt)	WORD	$((070<<26)|((base)<<21)|((rt)<<16))
+#define	NOOP		WORD	$0x27
+#define COP3		WORD	$(023<<26)
+
+	TEXT	C_3ktas(SB),$0
+	MOVW	R1, R21
+btas:
+	MOVW	R21, R1
+	MOVB	R0, 1(R1)
+	NOOP
+	COP3
+	BLTZ	R1, btas
+	RET
+
+	TEXT	_tas(SB), $0
+	TEXT	C_4ktas(SB), $0
+	MOVW	R1, R2		/* address of key */
+tas1:
+	MOVW	$1, R3
+	LL(2, 1)
+	NOOP
+	SC(2, 3)
+	NOOP
+	BEQ	R3, tas1
+	RET

+ 17 - 0
sys/src/libc/spim/vlop.s

@@ -0,0 +1,17 @@
+TEXT	_mulv(SB), $0
+	MOVW	8(FP), R2	/* hi1 */
+	MOVW	4(FP), R3	/* lo1 */
+	MOVW	16(FP), R4	/* hi2 */
+	MOVW	12(FP), R5	/* lo2 */
+	MULU	R5, R3	/* lo1*lo2 -> hi:lo*/
+	MOVW	LO, R6
+	MOVW	HI, R7
+	MULU	R3, R4	/* lo1*hi2 -> _:hi */
+	MOVW	LO, R8
+	ADDU	R8, R7
+	MULU	R2, R5	/* hi1*lo2 -> _:hi */
+	MOVW	LO, R8
+	ADDU	R8, R7
+	MOVW	R6, 0(R1)	/* lo */
+	MOVW	R7, 4(R1)	/* hi */
+	RET

+ 722 - 0
sys/src/libc/spim/vlrt.c

@@ -0,0 +1,722 @@
+typedef	unsigned long	ulong;
+typedef	unsigned int	uint;
+typedef	unsigned short	ushort;
+typedef	unsigned char	uchar;
+typedef	signed char	schar;
+
+#define	SIGN(n)	(1UL<<(n-1))
+
+typedef	struct	Vlong	Vlong;
+struct	Vlong
+{
+	union
+	{
+		struct
+		{
+			ulong	lo;
+			ulong	hi;
+		};
+		struct
+		{
+			ushort	loms;
+			ushort	lols;
+			ushort	hims;
+			ushort	hils;
+		};
+	};
+};
+
+void	abort(void);
+
+/* needed by profiler; can't be profiled. */
+#pragma profile off
+
+void
+_addv(Vlong *r, Vlong a, Vlong b)
+{
+	ulong lo, hi;
+
+	lo = a.lo + b.lo;
+	hi = a.hi + b.hi;
+	if(lo < a.lo)
+		hi++;
+	r->lo = lo;
+	r->hi = hi;
+}
+
+void
+_subv(Vlong *r, Vlong a, Vlong b)
+{
+	ulong lo, hi;
+
+	lo = a.lo - b.lo;
+	hi = a.hi - b.hi;
+	if(lo > a.lo)
+		hi--;
+	r->lo = lo;
+	r->hi = hi;
+}
+
+void
+_d2v(Vlong *y, double d)
+{
+	union { double d; struct Vlong; } x;
+	ulong xhi, xlo, ylo, yhi;
+	int sh;
+
+	x.d = d;
+
+	xhi = (x.hi & 0xfffff) | 0x100000;
+	xlo = x.lo;
+	sh = 1075 - ((x.hi >> 20) & 0x7ff);
+
+	ylo = 0;
+	yhi = 0;
+	if(sh >= 0) {
+		/* v = (hi||lo) >> sh */
+		if(sh < 32) {
+			if(sh == 0) {
+				ylo = xlo;
+				yhi = xhi;
+			} else {
+				ylo = (xlo >> sh) | (xhi << (32-sh));
+				yhi = xhi >> sh;
+			}
+		} else {
+			if(sh == 32) {
+				ylo = xhi;
+			} else
+			if(sh < 64) {
+				ylo = xhi >> (sh-32);
+			}
+		}
+	} else {
+		/* v = (hi||lo) << -sh */
+		sh = -sh;
+		if(sh <= 10) {
+			ylo = xlo << sh;
+			yhi = (xhi << sh) | (xlo >> (32-sh));
+		} else {
+			/* overflow */
+			yhi = d;	/* causes something awful */
+		}
+	}
+	if(x.hi & SIGN(32)) {
+		if(ylo != 0) {
+			ylo = -ylo;
+			yhi = ~yhi;
+		} else
+			yhi = -yhi;
+	}
+
+	y->hi = yhi;
+	y->lo = ylo;
+}
+
+void
+_f2v(Vlong *y, float f)
+{
+
+	_d2v(y, f);
+}
+
+double
+_v2d(Vlong x)
+{
+	if(x.hi & SIGN(32)) {
+		if(x.lo) {
+			x.lo = -x.lo;
+			x.hi = ~x.hi;
+		} else
+			x.hi = -x.hi;
+		return -((long)x.hi*4294967296. + x.lo);
+	}
+	return (long)x.hi*4294967296. + x.lo;
+}
+
+float
+_v2f(Vlong x)
+{
+	return _v2d(x);
+}
+
+static void
+dodiv(Vlong num, Vlong den, Vlong *qp, Vlong *rp)
+{
+	ulong numlo, numhi, denhi, denlo, quohi, quolo, t;
+	int i;
+
+	numhi = num.hi;
+	numlo = num.lo;
+	denhi = den.hi;
+	denlo = den.lo;
+
+	/*
+	 * get a divide by zero
+	 */
+	if(denlo==0 && denhi==0) {
+		numlo = numlo / denlo;
+	}
+
+	/*
+	 * set up the divisor and find the number of iterations needed
+	 */
+	if(numhi >= SIGN(32)) {
+		quohi = SIGN(32);
+		quolo = 0;
+	} else {
+		quohi = numhi;
+		quolo = numlo;
+	}
+	i = 0;
+	while(denhi < quohi || (denhi == quohi && denlo < quolo)) {
+		denhi = (denhi<<1) | (denlo>>31);
+		denlo <<= 1;
+		i++;
+	}
+
+	quohi = 0;
+	quolo = 0;
+	for(; i >= 0; i--) {
+		quohi = (quohi<<1) | (quolo>>31);
+		quolo <<= 1;
+		if(numhi > denhi || (numhi == denhi && numlo >= denlo)) {
+			t = numlo;
+			numlo -= denlo;
+			if(numlo > t)
+				numhi--;
+			numhi -= denhi;
+			quolo |= 1;
+		}
+		denlo = (denlo>>1) | (denhi<<31);
+		denhi >>= 1;
+	}
+
+	if(qp) {
+		qp->lo = quolo;
+		qp->hi = quohi;
+	}
+	if(rp) {
+		rp->lo = numlo;
+		rp->hi = numhi;
+	}
+}
+
+void
+_divvu(Vlong *q, Vlong n, Vlong d)
+{
+
+	if(n.hi == 0 && d.hi == 0) {
+		q->hi = 0;
+		q->lo = n.lo / d.lo;
+		return;
+	}
+	dodiv(n, d, q, 0);
+}
+
+void
+_modvu(Vlong *r, Vlong n, Vlong d)
+{
+
+	if(n.hi == 0 && d.hi == 0) {
+		r->hi = 0;
+		r->lo = n.lo % d.lo;
+		return;
+	}
+	dodiv(n, d, 0, r);
+}
+
+static void
+vneg(Vlong *v)
+{
+
+	if(v->lo == 0) {
+		v->hi = -v->hi;
+		return;
+	}
+	v->lo = -v->lo;
+	v->hi = ~v->hi;
+}
+
+void
+_divv(Vlong *q, Vlong n, Vlong d)
+{
+	long nneg, dneg;
+
+	if(n.hi == (((long)n.lo)>>31) && d.hi == (((long)d.lo)>>31)) {
+		q->lo = (long)n.lo / (long)d.lo;
+		q->hi = ((long)q->lo) >> 31;
+		return;
+	}
+	nneg = n.hi >> 31;
+	if(nneg)
+		vneg(&n);
+	dneg = d.hi >> 31;
+	if(dneg)
+		vneg(&d);
+	dodiv(n, d, q, 0);
+	if(nneg != dneg)
+		vneg(q);
+}
+
+void
+_modv(Vlong *r, Vlong n, Vlong d)
+{
+	long nneg, dneg;
+
+	if(n.hi == (((long)n.lo)>>31) && d.hi == (((long)d.lo)>>31)) {
+		r->lo = (long)n.lo % (long)d.lo;
+		r->hi = ((long)r->lo) >> 31;
+		return;
+	}
+	nneg = n.hi >> 31;
+	if(nneg)
+		vneg(&n);
+	dneg = d.hi >> 31;
+	if(dneg)
+		vneg(&d);
+	dodiv(n, d, 0, r);
+	if(nneg)
+		vneg(r);
+}
+
+void
+_rshav(Vlong *r, Vlong a, int b)
+{
+	long t;
+
+	t = a.hi;
+	if(b >= 32) {
+		r->hi = t>>31;
+		if(b >= 64) {
+			/* this is illegal re C standard */
+			r->lo = t>>31;
+			return;
+		}
+		r->lo = t >> (b-32);
+		return;
+	}
+	if(b <= 0) {
+		r->hi = t;
+		r->lo = a.lo;
+		return;
+	}
+	r->hi = t >> b;
+	r->lo = (t << (32-b)) | (a.lo >> b);
+}
+
+void
+_rshlv(Vlong *r, Vlong a, int b)
+{
+	ulong t;
+
+	t = a.hi;
+	if(b >= 32) {
+		r->hi = 0;
+		if(b >= 64) {
+			/* this is illegal re C standard */
+			r->lo = 0;
+			return;
+		}
+		r->lo = t >> (b-32);
+		return;
+	}
+	if(b <= 0) {
+		r->hi = t;
+		r->lo = a.lo;
+		return;
+	}
+	r->hi = t >> b;
+	r->lo = (t << (32-b)) | (a.lo >> b);
+}
+
+void
+_lshv(Vlong *r, Vlong a, int b)
+{
+	ulong t;
+
+	t = a.lo;
+	if(b >= 32) {
+		r->lo = 0;
+		if(b >= 64) {
+			/* this is illegal re C standard */
+			r->hi = 0;
+			return;
+		}
+		r->hi = t << (b-32);
+		return;
+	}
+	if(b <= 0) {
+		r->lo = t;
+		r->hi = a.hi;
+		return;
+	}
+	r->lo = t << b;
+	r->hi = (t >> (32-b)) | (a.hi << b);
+}
+
+void
+_andv(Vlong *r, Vlong a, Vlong b)
+{
+	r->hi = a.hi & b.hi;
+	r->lo = a.lo & b.lo;
+}
+
+void
+_orv(Vlong *r, Vlong a, Vlong b)
+{
+	r->hi = a.hi | b.hi;
+	r->lo = a.lo | b.lo;
+}
+
+void
+_xorv(Vlong *r, Vlong a, Vlong b)
+{
+	r->hi = a.hi ^ b.hi;
+	r->lo = a.lo ^ b.lo;
+}
+
+void
+_vpp(Vlong *l, Vlong *r)
+{
+
+	l->hi = r->hi;
+	l->lo = r->lo;
+	r->lo++;
+	if(r->lo == 0)
+		r->hi++;
+}
+
+void
+_vmm(Vlong *l, Vlong *r)
+{
+
+	l->hi = r->hi;
+	l->lo = r->lo;
+	if(r->lo == 0)
+		r->hi--;
+	r->lo--;
+}
+
+void
+_ppv(Vlong *l, Vlong *r)
+{
+
+	r->lo++;
+	if(r->lo == 0)
+		r->hi++;
+	l->hi = r->hi;
+	l->lo = r->lo;
+}
+
+void
+_mmv(Vlong *l, Vlong *r)
+{
+
+	if(r->lo == 0)
+		r->hi--;
+	r->lo--;
+	l->hi = r->hi;
+	l->lo = r->lo;
+}
+
+void
+_vasop(Vlong *ret, void *lv, void fn(Vlong*, Vlong, Vlong), int type, Vlong rv)
+{
+	Vlong t, u;
+
+	u.lo = 0;
+	u.hi = 0;
+	switch(type) {
+	default:
+		abort();
+		break;
+
+	case 1:	/* schar */
+		t.lo = *(schar*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(schar*)lv = u.lo;
+		break;
+
+	case 2:	/* uchar */
+		t.lo = *(uchar*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(uchar*)lv = u.lo;
+		break;
+
+	case 3:	/* short */
+		t.lo = *(short*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(short*)lv = u.lo;
+		break;
+
+	case 4:	/* ushort */
+		t.lo = *(ushort*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(ushort*)lv = u.lo;
+		break;
+
+	case 9:	/* int */
+		t.lo = *(int*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(int*)lv = u.lo;
+		break;
+
+	case 10:	/* uint */
+		t.lo = *(uint*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(uint*)lv = u.lo;
+		break;
+
+	case 5:	/* long */
+		t.lo = *(long*)lv;
+		t.hi = t.lo >> 31;
+		fn(&u, t, rv);
+		*(long*)lv = u.lo;
+		break;
+
+	case 6:	/* ulong */
+		t.lo = *(ulong*)lv;
+		t.hi = 0;
+		fn(&u, t, rv);
+		*(ulong*)lv = u.lo;
+		break;
+
+	case 7:	/* vlong */
+	case 8:	/* uvlong */
+		fn(&u, *(Vlong*)lv, rv);
+		*(Vlong*)lv = u;
+		break;
+	}
+	*ret = u;
+}
+
+void
+_p2v(Vlong *ret, void *p)
+{
+	long t;
+
+	t = (ulong)p;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_sl2v(Vlong *ret, long sl)
+{
+	long t;
+
+	t = sl;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_ul2v(Vlong *ret, ulong ul)
+{
+	long t;
+
+	t = ul;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_si2v(Vlong *ret, int si)
+{
+	long t;
+
+	t = si;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_ui2v(Vlong *ret, uint ui)
+{
+	long t;
+
+	t = ui;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_sh2v(Vlong *ret, long sh)
+{
+	long t;
+
+	t = (sh << 16) >> 16;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_uh2v(Vlong *ret, ulong ul)
+{
+	long t;
+
+	t = ul & 0xffff;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+void
+_sc2v(Vlong *ret, long uc)
+{
+	long t;
+
+	t = (uc << 24) >> 24;
+	ret->lo = t;
+	ret->hi = t >> 31;
+}
+
+void
+_uc2v(Vlong *ret, ulong ul)
+{
+	long t;
+
+	t = ul & 0xff;
+	ret->lo = t;
+	ret->hi = 0;
+}
+
+long
+_v2sc(Vlong rv)
+{
+	long t;
+
+	t = rv.lo & 0xff;
+	return (t << 24) >> 24;
+}
+
+long
+_v2uc(Vlong rv)
+{
+
+	return rv.lo & 0xff;
+}
+
+long
+_v2sh(Vlong rv)
+{
+	long t;
+
+	t = rv.lo & 0xffff;
+	return (t << 16) >> 16;
+}
+
+long
+_v2uh(Vlong rv)
+{
+
+	return rv.lo & 0xffff;
+}
+
+long
+_v2sl(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+long
+_v2ul(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+long
+_v2si(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+long
+_v2ui(Vlong rv)
+{
+
+	return rv.lo;
+}
+
+int
+_testv(Vlong rv)
+{
+	return rv.lo || rv.hi;
+}
+
+int
+_eqv(Vlong lv, Vlong rv)
+{
+	return lv.lo == rv.lo && lv.hi == rv.hi;
+}
+
+int
+_nev(Vlong lv, Vlong rv)
+{
+	return lv.lo != rv.lo || lv.hi != rv.hi;
+}
+
+int
+_ltv(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi < (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo < rv.lo);
+}
+
+int
+_lev(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi < (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo <= rv.lo);
+}
+
+int
+_gtv(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi > (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo > rv.lo);
+}
+
+int
+_gev(Vlong lv, Vlong rv)
+{
+	return (long)lv.hi > (long)rv.hi ||
+		(lv.hi == rv.hi && lv.lo >= rv.lo);
+}
+
+int
+_lov(Vlong lv, Vlong rv)
+{
+	return lv.hi < rv.hi ||
+		(lv.hi == rv.hi && lv.lo < rv.lo);
+}
+
+int
+_lsv(Vlong lv, Vlong rv)
+{
+	return lv.hi < rv.hi ||
+		(lv.hi == rv.hi && lv.lo <= rv.lo);
+}
+
+int
+_hiv(Vlong lv, Vlong rv)
+{
+	return lv.hi > rv.hi ||
+		(lv.hi == rv.hi && lv.lo > rv.lo);
+}
+
+int
+_hsv(Vlong lv, Vlong rv)
+{
+	return lv.hi > rv.hi ||
+		(lv.hi == rv.hi && lv.lo >= rv.lo);
+}

+ 2 - 2
sys/src/libmach/vdb.c

@@ -42,7 +42,7 @@ Machdata mipsmach =
 
 Machdata mipsmachle =
 {
-	{0, 0, 0, 0xD},		/* break point */
+	{0xD, 0, 0, 0},		/* break point */
 	4,			/* break point size */
 
 	leswab,			/* short to local byte order */
@@ -65,7 +65,7 @@ Machdata mipsmachle =
  */
 Machdata mipsmach2le =
 {
-	{0, 0, 0, 0xD},		/* break point */
+	{0xD, 0, 0, 0},		/* break point */
 	4,			/* break point size */
 
 	leswab,			/* short to local byte order */

+ 21 - 0
sys/src/libmp/spim/mkfile

@@ -0,0 +1,21 @@
+objtype=spim
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libmp.a
+# can't use this mpdigdiv.s on mips32 cpus
+SFILES=\
+	mpvecadd.s\
+	mpvecsub.s\
+	mpvecdigmuladd.s\
+	mpvecdigmulsub.s\
+#	mpdigdiv.s\
+
+HFILES=/$objtype/include/u.h /sys/include/mp.h ../port/dat.h
+
+OFILES=${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib

+ 67 - 0
sys/src/libmp/spim/mpvecadd.s

@@ -0,0 +1,67 @@
+#define	BDNZ	BC	16,0,
+#define	BDNE	BC	0,2,
+
+/*
+ *	mpvecadd(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *sum)
+ *
+ *		sum[0:alen] = a[0:alen-1] + b[0:blen-1]
+ *
+ *	prereq: alen >= blen, sum has room for alen+1 digits
+ *
+ *		R1 == a	(first arg passed in R1)
+ *		R3 == carry
+ *		R4 == alen
+ *		R5 == b
+ *		R6 == blen
+ *		R7 == sum
+ *		R2 == temporary
+ *		R8 == temporary
+ *		R9 == temporary
+ */
+TEXT	mpvecadd(SB),$-4
+
+	MOVW	alen+4(FP), R4
+	MOVW	b+8(FP), R5
+	MOVW	blen+12(FP), R6
+	MOVW	sum+16(FP), R7
+	SUBU	R6, R4		/* calculate counter for second loop (alen > blen) */
+	MOVW	R0, R3
+
+	/* if blen == 0, don't need to add it in */
+	BEQ	R6,_add1
+
+	/* sum[0:blen-1],carry = a[0:blen-1] + b[0:blen-1] */
+_addloop1:
+	MOVW	0(R1), R8
+	ADDU	$4, R1
+	MOVW	0(R5), R9
+	ADDU	$4, R5
+	ADDU	R3, R8
+	SGTU	R3, R8, R3
+	ADDU	R8, R9
+	SGTU	R8, R9, R2
+	ADDU	R2, R3
+	MOVW	R9, 0(R7)
+	ADDU	$4, R7
+	SUBU	$1, R6
+	BNE	R6, _addloop1
+
+_add1:
+	/* if alen == blen, we're done */
+	BEQ	R4, _addend
+
+	/* sum[blen:alen-1],carry = a[blen:alen-1] + 0 + carry */
+_addloop2:
+	MOVW	0(R1), R8
+	ADDU	$4, R1
+	ADDU	R3, R8
+	SGTU	R3, R8, R3
+	MOVW	R8, 0(R7)
+	ADDU	$4, R7
+	SUBU	$1, R4
+	BNE	R4, _addloop2
+
+	/* sum[alen] = carry */
+_addend:
+	MOVW	R3, 0(R7)
+	RET

+ 58 - 0
sys/src/libmp/spim/mpvecdigmuladd.s

@@ -0,0 +1,58 @@
+/*
+ *	mpvecdigmuladd(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *	p += b*m
+ *
+ *	each step looks like:
+ *		hi,lo = m*b[i]
+ *		lo += oldhi + carry
+ *		hi += carry
+ *		p[i] += lo
+ *		oldhi = hi
+ *
+ *	the registers are:
+ *		b = R1
+ *		n = R4
+ *		m = R5
+ *		p = R6
+ *		i = R7
+ *		hi = R8		- constrained by hardware
+ *		lo = R9		- constrained by hardware
+ *		oldhi = R10
+ *		tmp = R11
+ *
+ */
+TEXT	mpvecdigmuladd(SB),$0
+
+	MOVW	n+4(FP),R4
+	MOVW	m+8(FP),R5
+	MOVW	p+12(FP),R6
+
+
+	MOVW	R0, R10		/* oldhi = 0 */
+	BEQ	R6, _muladd1
+_muladdloop:
+	MOVW	0(R1), R9	/* lo = b[i] */
+	ADDU	$4, R1
+	MOVW	0(R6), R11	/* tmp = p[i] */
+	MULU	R9, R5
+	MOVW	HI, R8		/* hi = (b[i] * m)>>32 */
+	MOVW	LO, R9		/* lo = b[i] * m */
+	ADDU	R10, R9		/* lo += oldhi */
+	SGTU	R10, R9, R2
+	ADDU	R2, R8		/* hi += carry */
+	ADDU	R9, R11		/* tmp += lo */
+	SGTU	R9, R11, R2
+	ADDU	R2, R8		/* hi += carry */
+	MOVW	R11, 0(R6)	/* p[i] = tmp */
+	ADDU	$4, R6
+	MOVW	R8, R10		/* oldhi = hi */
+	SUBU	$1, R4
+	BNE	R4, _muladdloop
+
+_muladd1:
+	MOVW	0(R6), R11	/* tmp = p[i] */
+	ADDU	R10, R11	/* tmp += oldhi */
+	MOVW	R11, 0(R6)	/* p[i] = tmp */
+
+	RET

+ 61 - 0
sys/src/libmp/spim/mpvecdigmulsub.s

@@ -0,0 +1,61 @@
+/*
+ *	mpvecdigmulsub(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *	p -= b*m
+ *
+ *	each step looks like:
+ *		hi,lo = m*b[i]
+ *		lo += oldhi + carry
+ *		hi += carry
+ *		p[i] += lo
+ *		oldhi = hi
+ *
+ *	the registers are:
+ *		b = R1
+ *		n = R4
+ *		m = R5
+ *		p = R6
+ *		i = R7
+ *		hi = R8		- constrained by hardware
+ *		lo = R9		- constrained by hardware
+ *		oldhi = R10
+ *		tmp = R11
+ *
+ */
+TEXT	mpvecdigmulsub(SB),$0
+
+	MOVW	n+4(FP),R4
+	MOVW	m+8(FP),R5
+	MOVW	p+12(FP),R6
+
+	MOVW	R0, R10		/* oldhi = 0 */
+_mulsubloop:
+	MOVW	0(R1), R9	/* lo = b[i] */
+	ADDU	$4, R1
+	MOVW	0(R6), R11	/* tmp = p[i] */
+	MULU	R9, R5
+	MOVW	HI, R8		/* hi = (b[i] * m)>>32 */
+	MOVW	LO, R9		/* lo = b[i] * m */
+	ADDU	R10, R9		/* lo += oldhi */
+	SGTU	R10, R9, R2
+	ADDU	R2, R8		/* hi += carry */
+	SUBU	R9, R11, R3	/* tmp -= lo */
+	SGTU	R3, R11, R2
+	ADDU	R2, R8		/* hi += carry */
+	MOVW	R3, 0(R6)	/* p[i] = tmp */
+	ADDU	$4, R6
+	MOVW	R8, R10		/* oldhi = hi */
+	SUBU	$1, R4
+	BNE	R4, _mulsubloop
+
+	MOVW	0(R6), R11	/* tmp = p[i] */
+	SUBU	R10, R11, R3	/* tmp -= oldhi */
+	MOVW	R3, 0(R6)	/* p[i] = tmp */
+	SGTU	R3, R11, R1
+	BNE	R1, _mulsub2
+	MOVW	$1, R1		/* return +1 for positive result */
+	RET
+
+_mulsub2:
+	MOVW	$-1, R1		/* return -1 for negative result */
+	RET

+ 66 - 0
sys/src/libmp/spim/mpvecsub.s

@@ -0,0 +1,66 @@
+#define	BDNZ	BC	16,0,
+#define	BDNE	BC	0,2,
+
+/*
+ *	mpvecsub(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *sum)
+ *
+ *		sum[0:alen] = a[0:alen-1] - b[0:blen-1]
+ *
+ *	prereq: alen >= blen, sum has room for alen+1 digits
+ *
+ *		R1 == a	(first arg passed in R1)
+ *		R3 == carry
+ *		R4 == alen
+ *		R5 == b
+ *		R6 == blen
+ *		R7 == sum
+ *		R2 == temporary
+ *		R8 == temporary
+ *		R9 == temporary
+ */
+TEXT	mpvecsub(SB),$-4
+
+	MOVW	alen+4(FP), R4
+	MOVW	b+8(FP), R5
+	MOVW	blen+12(FP), R6
+	MOVW	sum+16(FP), R7
+	SUBU	R6, R4		/* calculate counter for second loop (alen > blen) */
+	MOVW	R0, R3
+
+	/* if blen == 0, don't need to subtract it */
+	BEQ	R6,_sub1
+
+	/* sum[0:blen-1],carry = a[0:blen-1] - b[0:blen-1] */
+_subloop1:
+	MOVW	0(R1), R8
+	ADDU	$4, R1
+	MOVW	0(R5), R9
+	ADDU	$4, R5
+	SUBU	R3, R8, R2
+	SGTU	R2, R8, R3
+	SUBU	R9, R2, R8
+	SGTU	R8, R2, R9
+	ADDU	R9, R3
+	MOVW	R8, 0(R7)
+	ADDU	$4, R7
+	SUBU	$1, R6
+	BNE	R6, _subloop1
+
+_sub1:
+	/* if alen == blen, we're done */
+	BEQ	R4, _subend
+
+	/* sum[blen:alen-1],carry = a[blen:alen-1] - 0 - carry */
+_subloop2:
+	MOVW	0(R1), R8
+	ADDU	$4, R1
+	SUBU	R3, R8, R2
+	SGTU	R2, R8, R3
+	MOVW	R2, 0(R7)
+	ADDU	$4, R7
+	SUBU	$1, R4
+	BNE	R4, _subloop2
+
+	/* final borrow is discarded (caller ensures a >= b) */
+_subend:
+	RET

+ 296 - 0
sys/src/libsec/spim/md5block.s

@@ -0,0 +1,296 @@
+/*
+ *  rfc1321 requires that I include this.  The code is new.  The constants
+ *  all come from the rfc (hence the copyright).  We trade a table for the
+ *  macros in rfc.  The total size is a lot less. -- presotto
+ *
+ *	Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ *	rights reserved.
+ *
+ *	License to copy and use this software is granted provided that it
+ *	is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ *	Algorithm" in all material mentioning or referencing this software
+ *	or this function.
+ *
+ *	License is also granted to make and use derivative works provided
+ *	that such works are identified as "derived from the RSA Data
+ *	Security, Inc. MD5 Message-Digest Algorithm" in all material
+ *	mentioning or referencing the derived work.
+ *
+ *	RSA Data Security, Inc. makes no representations concerning either
+ *	the merchantability of this software or the suitability of this
+ *	software for any particular purpose. It is provided "as is"
+ *	without express or implied warranty of any kind.
+ *	These notices must be retained in any copies of any part of this
+ *	documentation and/or software.
+ */
+
+	/* round 1 */
+	DATA	md5tab<>+( 0*4)(SB)/4,$0xd76aa478
+	DATA	md5tab<>+( 1*4)(SB)/4,$0xe8c7b756
+	DATA	md5tab<>+( 2*4)(SB)/4,$0x242070db
+	DATA	md5tab<>+( 3*4)(SB)/4,$0xc1bdceee
+	DATA	md5tab<>+( 4*4)(SB)/4,$0xf57c0faf
+	DATA	md5tab<>+( 5*4)(SB)/4,$0x4787c62a
+	DATA	md5tab<>+( 6*4)(SB)/4,$0xa8304613
+	DATA	md5tab<>+( 7*4)(SB)/4,$0xfd469501
+	DATA	md5tab<>+( 8*4)(SB)/4,$0x698098d8
+	DATA	md5tab<>+( 9*4)(SB)/4,$0x8b44f7af
+	DATA	md5tab<>+(10*4)(SB)/4,$0xffff5bb1
+	DATA	md5tab<>+(11*4)(SB)/4,$0x895cd7be
+	DATA	md5tab<>+(12*4)(SB)/4,$0x6b901122
+	DATA	md5tab<>+(13*4)(SB)/4,$0xfd987193
+	DATA	md5tab<>+(14*4)(SB)/4,$0xa679438e
+	DATA	md5tab<>+(15*4)(SB)/4,$0x49b40821
+
+	/* round 2 */
+	DATA	md5tab<>+(16*4)(SB)/4,$0xf61e2562
+	DATA	md5tab<>+(17*4)(SB)/4,$0xc040b340
+	DATA	md5tab<>+(18*4)(SB)/4,$0x265e5a51
+	DATA	md5tab<>+(19*4)(SB)/4,$0xe9b6c7aa
+	DATA	md5tab<>+(20*4)(SB)/4,$0xd62f105d
+	DATA	md5tab<>+(21*4)(SB)/4,$0x02441453
+	DATA	md5tab<>+(22*4)(SB)/4,$0xd8a1e681
+	DATA	md5tab<>+(23*4)(SB)/4,$0xe7d3fbc8
+	DATA	md5tab<>+(24*4)(SB)/4,$0x21e1cde6
+	DATA	md5tab<>+(25*4)(SB)/4,$0xc33707d6
+	DATA	md5tab<>+(26*4)(SB)/4,$0xf4d50d87
+	DATA	md5tab<>+(27*4)(SB)/4,$0x455a14ed
+	DATA	md5tab<>+(28*4)(SB)/4,$0xa9e3e905
+	DATA	md5tab<>+(29*4)(SB)/4,$0xfcefa3f8
+	DATA	md5tab<>+(30*4)(SB)/4,$0x676f02d9
+	DATA	md5tab<>+(31*4)(SB)/4,$0x8d2a4c8a
+
+	/* round 3 */
+	DATA	md5tab<>+(32*4)(SB)/4,$0xfffa3942
+	DATA	md5tab<>+(33*4)(SB)/4,$0x8771f681
+	DATA	md5tab<>+(34*4)(SB)/4,$0x6d9d6122
+	DATA	md5tab<>+(35*4)(SB)/4,$0xfde5380c
+	DATA	md5tab<>+(36*4)(SB)/4,$0xa4beea44
+	DATA	md5tab<>+(37*4)(SB)/4,$0x4bdecfa9
+	DATA	md5tab<>+(38*4)(SB)/4,$0xf6bb4b60
+	DATA	md5tab<>+(39*4)(SB)/4,$0xbebfbc70
+	DATA	md5tab<>+(40*4)(SB)/4,$0x289b7ec6
+	DATA	md5tab<>+(41*4)(SB)/4,$0xeaa127fa
+	DATA	md5tab<>+(42*4)(SB)/4,$0xd4ef3085
+	DATA	md5tab<>+(43*4)(SB)/4,$0x04881d05
+	DATA	md5tab<>+(44*4)(SB)/4,$0xd9d4d039
+	DATA	md5tab<>+(45*4)(SB)/4,$0xe6db99e5
+	DATA	md5tab<>+(46*4)(SB)/4,$0x1fa27cf8
+	DATA	md5tab<>+(47*4)(SB)/4,$0xc4ac5665
+
+	/* round 4 */
+	DATA	md5tab<>+(48*4)(SB)/4,$0xf4292244
+	DATA	md5tab<>+(49*4)(SB)/4,$0x432aff97
+	DATA	md5tab<>+(50*4)(SB)/4,$0xab9423a7
+	DATA	md5tab<>+(51*4)(SB)/4,$0xfc93a039
+	DATA	md5tab<>+(52*4)(SB)/4,$0x655b59c3
+	DATA	md5tab<>+(53*4)(SB)/4,$0x8f0ccc92
+	DATA	md5tab<>+(54*4)(SB)/4,$0xffeff47d
+	DATA	md5tab<>+(55*4)(SB)/4,$0x85845dd1
+	DATA	md5tab<>+(56*4)(SB)/4,$0x6fa87e4f
+	DATA	md5tab<>+(57*4)(SB)/4,$0xfe2ce6e0
+	DATA	md5tab<>+(58*4)(SB)/4,$0xa3014314
+	DATA	md5tab<>+(59*4)(SB)/4,$0x4e0811a1
+	DATA	md5tab<>+(60*4)(SB)/4,$0xf7537e82
+	DATA	md5tab<>+(61*4)(SB)/4,$0xbd3af235
+	DATA	md5tab<>+(62*4)(SB)/4,$0x2ad7d2bb
+	DATA	md5tab<>+(63*4)(SB)/4,$0xeb86d391
+
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+#define	AREG		R5
+#define BREG		R6
+#define CREG		R7
+#define DREG		R8
+#define DATAREG		R1
+#define TABREG		R10
+#define STREG		R11
+#define XREG		R12
+#define ELOOPREG	R13
+#define EDREG		R14
+#define IREG		R15
+
+#define TMP1		R9
+#define TMP2		R2
+#define TMP3		R3
+#define TMP4		R4
+
+/*
+ * decode little endian data into x[off], then the body
+ * bodies have this form:
+ *	a += FN(B,C,D);
+ *	a += x[off] + t[off];
+ *	a = (a << S11) | (a >> (32 - S11));
+ *	a += b;
+ */
+#define BODY1(off,FN,SH,A,B,C,D)\
+	MOVW off(DATAREG),TMP2;\
+	MOVW off(TABREG),TMP3;\
+	FN(B,C,D)\
+	ADDU TMP1,A;\
+	MOVW TMP2,off(XREG);\
+	ADDU TMP2,A;\
+	ADDU TMP3,A;\
+	SLL $SH,A,TMP1;\
+	SRL $(32-SH),A;\
+	OR TMP1,A;\
+	ADDU B,A;\
+
+#define BODY(off,inc,FN,SH,A,B,C,D)\
+	MOVW off(TABREG),TMP3;\
+	ADDU XREG,IREG,TMP4;\
+	MOVW (TMP4),TMP2;\
+	ADDU $(inc*4),IREG;\
+	AND $63,IREG;\
+	FN(B,C,D)\
+	ADDU TMP1,A;\
+	ADDU TMP2,A;\
+	ADDU TMP3,A;\
+	SLL $SH,A,TMP1;\
+	SRL $(32-SH),A;\
+	OR  TMP1,A;\
+	ADDU B,A;\
+
+/*
+ * fn1 = ((c ^ d) & b) ^ d
+ */
+#define FN1(B,C,D)\
+	XOR C,D,TMP1;\
+	AND B,TMP1;\
+	XOR D,TMP1;\
+
+/*
+ * fn2 = ((b ^ c) & d) ^ c;
+ */
+#define FN2(B,C,D)\
+	XOR B,C,TMP1;\
+	AND D,TMP1;\
+	XOR C,TMP1;\
+
+/*
+ * fn3 = b ^ c ^ d;
+ */
+#define FN3(B,C,D)\
+	XOR B,C,TMP1;\
+	XOR D,TMP1;\
+
+/*
+ * fn4 = c ^ (b | ~d);
+ */
+#define FN4(B,C,D)\
+	XOR $-1,D,TMP1;\
+	OR B,TMP1;\
+	XOR C,TMP1;\
+
+#define	DATA	0
+#define	LEN	4
+#define	STATE	8
+
+#define XOFF	(-4-16*4)
+
+	TEXT	_md5block+0(SB),$68
+
+	MOVW	len+LEN(FP),TMP1
+	ADDU	DATAREG,TMP1,EDREG
+	MOVW	state+STATE(FP),STREG
+
+	MOVW 0(STREG),AREG
+	MOVW 4(STREG),BREG
+	MOVW 8(STREG),CREG
+	MOVW 12(STREG),DREG
+
+mainloop:
+
+	MOVW $md5tab<>+0(SB),TABREG
+	ADDU $(16*4),DATAREG,ELOOPREG
+	MOVW $x+XOFF(SP),XREG
+
+loop1:
+	BODY1(0,FN1,S11,AREG,BREG,CREG,DREG)
+	BODY1(4,FN1,S12,DREG,AREG,BREG,CREG)
+	BODY1(8,FN1,S13,CREG,DREG,AREG,BREG)
+	BODY1(12,FN1,S14,BREG,CREG,DREG,AREG)
+
+	ADDU $16,DATAREG
+	ADDU $16,TABREG
+	ADDU $16,XREG
+
+	BNE DATAREG,ELOOPREG,loop1
+
+
+	MOVW $x+XOFF(SP),XREG
+	MOVW $(1*4),IREG
+	MOVW $(1*4),ELOOPREG
+loop2:
+	BODY(0,5,FN2,S21,AREG,BREG,CREG,DREG)
+	BODY(4,5,FN2,S22,DREG,AREG,BREG,CREG)
+	BODY(8,5,FN2,S23,CREG,DREG,AREG,BREG)
+	BODY(12,5,FN2,S24,BREG,CREG,DREG,AREG)
+
+	ADDU $16,TABREG
+
+	BNE IREG,ELOOPREG,loop2
+
+
+	MOVW $(5*4),IREG
+	MOVW $(5*4),ELOOPREG
+loop3:
+	BODY(0,3,FN3,S31,AREG,BREG,CREG,DREG)
+	BODY(4,3,FN3,S32,DREG,AREG,BREG,CREG)
+	BODY(8,3,FN3,S33,CREG,DREG,AREG,BREG)
+	BODY(12,3,FN3,S34,BREG,CREG,DREG,AREG)
+
+	ADDU $16,TABREG
+
+	BNE IREG,ELOOPREG,loop3
+
+
+	MOVW $0,IREG
+loop4:
+	BODY(0,7,FN4,S41,AREG,BREG,CREG,DREG)
+	BODY(4,7,FN4,S42,DREG,AREG,BREG,CREG)
+	BODY(8,7,FN4,S43,CREG,DREG,AREG,BREG)
+	BODY(12,7,FN4,S44,BREG,CREG,DREG,AREG)
+
+	ADDU $16,TABREG
+
+	BNE IREG,R0,loop4
+
+	MOVW 0(STREG),TMP1
+	MOVW 4(STREG),TMP2
+	MOVW 8(STREG),TMP3
+	MOVW 12(STREG),TMP4
+	ADDU TMP1,AREG
+	ADDU TMP2,BREG
+	ADDU TMP3,CREG
+	ADDU TMP4,DREG
+	MOVW AREG,0(STREG)
+	MOVW BREG,4(STREG)
+	MOVW CREG,8(STREG)
+	MOVW DREG,12(STREG)
+
+	BNE DATAREG,EDREG,mainloop
+
+	RET
+
+	GLOBL	md5tab<>+0(SB),$256
+
+	END

+ 19 - 0
sys/src/libsec/spim/mkfile

@@ -0,0 +1,19 @@
+objtype=spim
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libsec.a
+FILES=\
+	md5block\
+	sha1block\
+
+HFILES=/sys/include/libsec.h
+
+SFILES=${FILES:%=%.s}
+
+OFILES=${FILES:%=%.$O}
+
+UPDATE=mkfile\
+	$HFILES\
+	$SFILES\
+
+</sys/src/cmd/mksyslib

+ 220 - 0
sys/src/libsec/spim/sha1block.s

@@ -0,0 +1,220 @@
+	TEXT	_sha1block+0(SB),$328
+
+/*
+ * wp[off] = x;
+ * x += A <<< 5;
+ * E += 0xca62c1d6 + x;
+ * x = FN(B,C,D);
+ * E += x;
+ * B >>> 2
+ */
+#define BODYX(off,FN,V,A,B,C,D,E)\
+	FN(B,C,D)\
+	ADDU TMP1,E;\
+	ADDU V,E;\
+	MOVW TMP2,off(WREG);\
+	ADDU TMP2,E;\
+	SLL $5,A,TMP3;\
+	SRL $27,A,TMP4;\
+	OR TMP3,TMP4;\
+	ADDU TMP4,E;\
+	SLL $30,B,TMP4;\
+	SRL $2,B;\
+	OR TMP4,B
+
+/*
+ * x = data[i]
+ * BODYX
+ */
+#define BODY1(off,FN,V,A,B,C,D,E)\
+	MOVBU off(DATAREG),TMP2;\
+	MOVBU (off+1)(DATAREG),TMP3;\
+	MOVBU (off+2)(DATAREG),TMP1;\
+	MOVBU (off+3)(DATAREG),TMP4;\
+	SLL $24,TMP2;\
+	SLL $16,TMP3;\
+	OR TMP3,TMP2;\
+	SLL $8,TMP1;\
+	OR TMP1,TMP2;\
+	OR TMP4,TMP2;\
+	BODYX(off,FN,V,A,B,C,D,E)
+
+/*
+ * x = (wp[off-3] ^ wp[off-8] ^ wp[off-14] ^ wp[off-16]) <<< 1;
+ * BODYX
+ */
+#define BODY(off,FN,V,A,B,C,D,E)\
+	MOVW (off-64)(WREG),TMP1;\
+	MOVW (off-56)(WREG),TMP2;\
+	MOVW (off-32)(WREG),TMP3;\
+	MOVW (off-12)(WREG),TMP4;\
+	XOR TMP1,TMP2;\
+	XOR TMP3,TMP2;\
+	XOR TMP4,TMP2;\
+	SLL $1,TMP2,TMP1;\
+	SRL $31,TMP2;\
+	OR TMP1,TMP2;\
+	BODYX(off,FN,V,A,B,C,D,E)
+
+/*
+ * fn1 = (((C^D)&B)^D);
+ */
+#define FN1(B,C,D)\
+	XOR C,D,TMP1;\
+	AND B,TMP1;\
+	XOR D,TMP1;
+
+/*
+ * fn24 = B ^ C ^ D
+ */
+#define FN24(B,C,D)\
+	XOR B,C,TMP1;\
+	XOR D,TMP1;
+
+/*
+ * fn3 = ((B ^ C) & (D ^ B)) ^ B
+ */
+#define FN3(B,C,D)\
+	XOR B,C,TMP1;\
+	XOR B,D,TMP4;\
+	AND TMP4,TMP1;\
+	XOR B,TMP1;
+
+/*
+ * stack offsets
+ * void vtSha1Block(ulong *STATE, uchar *DATA, int LEN)
+ */
+#define	DATA	0
+#define	LEN	4
+#define	STATE	8
+
+/*
+ * stack offsets for locals
+ * ulong w[80];
+ * uchar *edata;
+ * ulong *w15, *w40, *w60, *w80;
+ * register local
+ * ulong *wp = BP
+ * ulong a = eax, b = ebx, c = ecx, d = edx, e = esi
+ * ulong tmp = edi
+ */
+#define WARRAY	(-4-(80*4))
+
+#define	AREG		R5
+#define BREG		R6
+#define CREG		R7
+#define DREG		R8
+#define EREG		R9
+#define DATAREG		R1
+#define STREG		R11
+#define WREG		R12
+#define W15REG		R13
+#define W60REG		R14
+#define W40REG		R15
+#define W80REG		R16
+#define EDREG		R17
+#define VREG		R18
+
+#define TMP1		R10
+#define TMP2		R2
+#define TMP3		R3
+#define TMP4		R4
+#define TMP5		R19
+
+	MOVW len+LEN(FP),TMP1
+	MOVW state+STATE(FP),STREG
+	ADDU DATAREG,TMP1,EDREG
+
+	MOVW 0(STREG),AREG
+	MOVW 4(STREG),BREG
+	MOVW 8(STREG),CREG
+	MOVW 12(STREG),DREG
+	MOVW 16(STREG),EREG
+
+	MOVW $warray+WARRAY(SP),WREG
+	ADDU $(15*4),WREG,W15REG
+	ADDU $(40*4),WREG,W40REG
+	ADDU $(60*4),WREG,W60REG
+	ADDU $(80*4),WREG,W80REG
+
+mainloop:
+	MOVW $warray+WARRAY(SP),WREG
+
+	MOVW $0x5a827999,VREG
+loop1:
+	BODY1(0,FN1,VREG,AREG,BREG,CREG,DREG,EREG)
+	BODY1(4,FN1,VREG,EREG,AREG,BREG,CREG,DREG)
+	BODY1(8,FN1,VREG,DREG,EREG,AREG,BREG,CREG)
+	BODY1(12,FN1,VREG,CREG,DREG,EREG,AREG,BREG)
+	BODY1(16,FN1,VREG,BREG,CREG,DREG,EREG,AREG)
+
+	ADDU $20,DATAREG
+	ADDU $20,WREG
+	BNE WREG,W15REG,loop1
+
+	BODY1(0,FN1,VREG,AREG,BREG,CREG,DREG,EREG)
+	ADDU $4,DATAREG
+
+	BODY(4,FN1,VREG,EREG,AREG,BREG,CREG,DREG)
+	BODY(8,FN1,VREG,DREG,EREG,AREG,BREG,CREG)
+	BODY(12,FN1,VREG,CREG,DREG,EREG,AREG,BREG)
+	BODY(16,FN1,VREG,BREG,CREG,DREG,EREG,AREG)
+
+	ADDU $20,WREG
+
+	MOVW $0x6ed9eba1,VREG
+loop2:
+	BODY(0,FN24,VREG,AREG,BREG,CREG,DREG,EREG)
+	BODY(4,FN24,VREG,EREG,AREG,BREG,CREG,DREG)
+	BODY(8,FN24,VREG,DREG,EREG,AREG,BREG,CREG)
+	BODY(12,FN24,VREG,CREG,DREG,EREG,AREG,BREG)
+	BODY(16,FN24,VREG,BREG,CREG,DREG,EREG,AREG)
+
+	ADDU $20,WREG
+	BNE WREG,W40REG,loop2
+
+	MOVW $0x8f1bbcdc,VREG
+loop3:
+	BODY(0,FN3,VREG,AREG,BREG,CREG,DREG,EREG)
+	BODY(4,FN3,VREG,EREG,AREG,BREG,CREG,DREG)
+	BODY(8,FN3,VREG,DREG,EREG,AREG,BREG,CREG)
+	BODY(12,FN3,VREG,CREG,DREG,EREG,AREG,BREG)
+	BODY(16,FN3,VREG,BREG,CREG,DREG,EREG,AREG)
+
+	ADDU $20,WREG
+	BNE WREG,W60REG,loop3
+
+	MOVW $0xca62c1d6,VREG
+loop4:
+	BODY(0,FN24,VREG,AREG,BREG,CREG,DREG,EREG)
+	BODY(4,FN24,VREG,EREG,AREG,BREG,CREG,DREG)
+	BODY(8,FN24,VREG,DREG,EREG,AREG,BREG,CREG)
+	BODY(12,FN24,VREG,CREG,DREG,EREG,AREG,BREG)
+	BODY(16,FN24,VREG,BREG,CREG,DREG,EREG,AREG)
+
+	ADDU $20,WREG
+	BNE WREG,W80REG,loop4
+
+	MOVW 0(STREG),TMP1
+	MOVW 4(STREG),TMP2
+	MOVW 8(STREG),TMP3
+	MOVW 12(STREG),TMP4
+	MOVW 16(STREG),TMP5
+
+	ADDU TMP1,AREG
+	ADDU TMP2,BREG
+	ADDU TMP3,CREG
+	ADDU TMP4,DREG
+	ADDU TMP5,EREG
+
+	MOVW AREG,0(STREG)
+	MOVW BREG,4(STREG)
+	MOVW CREG,8(STREG)
+	MOVW DREG,12(STREG)
+	MOVW EREG,16(STREG)
+
+	BNE DATAREG,EDREG,mainloop
+
+	RET
+
+	END

+ 26 - 0
sys/src/libthread/spim.c

@@ -0,0 +1,26 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include "threadimpl.h"
+
+/* first argument goes in a register; simplest just to ignore it */
+static void
+launcherspim(int, void (*f)(void *arg), void *arg)
+{
+	(*f)(arg);
+	threadexits(nil);
+}
+
+void
+_threadinitstack(Thread *t, void (*f)(void*), void *arg)
+{
+	ulong *tos;
+
+	tos = (ulong*)&t->stk[t->stksize&~7];
+	*--tos = (ulong)arg;
+	*--tos = (ulong)f;
+	*--tos = 0;	/* first arg to launcherspim */
+	*--tos = 0;	/* place to store return PC */
+	t->sched[JMPBUFPC] = (ulong)launcherspim+JMPBUFDPC;
+	t->sched[JMPBUFSP] = (ulong)tos;
+}

+ 46 - 0
sys/src/libthread/xincspim.s

@@ -0,0 +1,46 @@
+/*
+ *	R4000 user level lock code
+ */
+
+#define	LL(base, rt)	WORD	$((060<<26)|((base)<<21)|((rt)<<16))
+#define	SC(base, rt)	WORD	$((070<<26)|((base)<<21)|((rt)<<16))
+#define	NOOP		WORD	$0x27
+
+#ifdef oldstyle
+TEXT	xadd(SB), $0
+
+	MOVW	R1, R2		/* address of counter */
+loop:	MOVW	n+4(FP), R3	/* increment */
+	LL(2, 1)
+	NOOP
+	ADD	R1,R3,R3
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop
+	RET
+#endif
+
+TEXT	_xinc(SB), $0
+
+	MOVW	R1, R2		/* address of counter */
+loop:	MOVW	$1, R3
+	LL(2, 1)
+	NOOP
+	ADD	R1,R3,R3
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop
+	RET
+
+TEXT	_xdec(SB), $0
+
+	MOVW	R1, R2		/* address of counter */
+loop1:	MOVW	$-1, R3
+	LL(2, 1)
+	NOOP
+	ADD	R1,R3,R3
+	MOVW	R3, R1
+	SC(2, 3)
+	NOOP
+	BEQ	R3,loop1
+	RET