Browse Source

Plan 9 from Bell Labs 2012-05-02

David du Colombier 7 years ago
parent
commit
c0318f7435
59 changed files with 18774 additions and 1 deletion
  1. 3 0
      sys/src/9/mkfile
  2. 58 0
      sys/src/9/teg2/_announce
  3. 165 0
      sys/src/9/teg2/arch.c
  4. 869 0
      sys/src/9/teg2/archtegra.c
  5. 311 0
      sys/src/9/teg2/arm.h
  6. 135 0
      sys/src/9/teg2/arm.s
  7. 56 0
      sys/src/9/teg2/atom.s
  8. 456 0
      sys/src/9/teg2/cache-l2-pl310.c
  9. 240 0
      sys/src/9/teg2/cache.v7.s
  10. 106 0
      sys/src/9/teg2/caches-v7.c
  11. 198 0
      sys/src/9/teg2/caches.c
  12. 138 0
      sys/src/9/teg2/clock-tegra.c
  13. 623 0
      sys/src/9/teg2/clock.c
  14. 200 0
      sys/src/9/teg2/coproc.c
  15. 478 0
      sys/src/9/teg2/dat.h
  16. 192 0
      sys/src/9/teg2/devarch.c
  17. 1366 0
      sys/src/9/teg2/devcons.c
  18. 528 0
      sys/src/9/teg2/devether.c
  19. 796 0
      sys/src/9/teg2/devuart.c
  20. 1675 0
      sys/src/9/teg2/ether8169.c
  21. 42 0
      sys/src/9/teg2/etherif.h
  22. 235 0
      sys/src/9/teg2/ethermii.c
  23. 116 0
      sys/src/9/teg2/ethermii.h
  24. 233 0
      sys/src/9/teg2/fns.h
  25. 300 0
      sys/src/9/teg2/fpi.c
  26. 61 0
      sys/src/9/teg2/fpi.h
  27. 502 0
      sys/src/9/teg2/fpiarm.c
  28. 136 0
      sys/src/9/teg2/fpimem.c
  29. 25 0
      sys/src/9/teg2/init9.s
  30. 219 0
      sys/src/9/teg2/io.h
  31. 410 0
      sys/src/9/teg2/kbd.c
  32. 869 0
      sys/src/9/teg2/l.s
  33. 325 0
      sys/src/9/teg2/lexception.s
  34. 38 0
      sys/src/9/teg2/lproc.s
  35. 985 0
      sys/src/9/teg2/main.c
  36. 150 0
      sys/src/9/teg2/mem.h
  37. 155 0
      sys/src/9/teg2/mkfile
  38. 750 0
      sys/src/9/teg2/mmu.c
  39. 4 0
      sys/src/9/teg2/notes/assumes-hz-under-1000
  40. 41 0
      sys/src/9/teg2/notes/bug.rfe
  41. 59 0
      sys/src/9/teg2/notes/byte-order
  42. 19 0
      sys/src/9/teg2/notes/clks
  43. 22 0
      sys/src/9/teg2/notes/movm.w
  44. 29 0
      sys/src/9/teg2/notes/pci
  45. 78 0
      sys/src/9/teg2/notes/pci.2.buses
  46. BIN
      sys/src/9/teg2/nvram
  47. 853 0
      sys/src/9/teg2/pci.c
  48. 138 0
      sys/src/9/teg2/random.c
  49. 208 0
      sys/src/9/teg2/rebootcode.s
  50. 129 0
      sys/src/9/teg2/softfpu.c
  51. 366 0
      sys/src/9/teg2/syscall.c
  52. 1068 0
      sys/src/9/teg2/trap.c
  53. 91 0
      sys/src/9/teg2/ts
  54. 821 0
      sys/src/9/teg2/uarti8250.c
  55. 104 0
      sys/src/9/teg2/usbehci.h
  56. 51 0
      sys/src/9/teg2/v7-arch.c
  57. 489 0
      sys/src/9/teg2/vfp3.c
  58. 60 0
      sys/src/9/teg2/words
  59. 0 1
      sys/src/cmd/ip/ipconfig/ipconfig.h

+ 3 - 0
sys/src/9/mkfile

@@ -1,9 +1,12 @@
 ARCH=\
 	alphapc\
 	bitsy\
+	kw\
 	mtx\
+	omap\
 	pc\
 	ppc\
+	teg2\
 	
 all:V:
 	for(i in $ARCH)@{

+ 58 - 0
sys/src/9/teg2/_announce

@@ -0,0 +1,58 @@
+This is a preliminary Plan 9 port to the Compulab Trimslice,
+containing a Tegra 2 SoC: a dual-core, (truly) dual-issue 1GHz
+Cortex-A9 v7a-architecture ARM system, *and* it comes in a case.  VFP
+3 floating-point hardware is present, but 5l doesn't yet generate
+those instructions.  This is the first multiprocessor ARM port we've
+done, and much of the code should be reusable in future ports.  There
+are still things to be done but it can run both processors and is
+believed to have adequate kernel support for VFP 3 floating-point.
+
+
+What's implemented.
+
+Two cpus running concurrently with level 1 and 2 caches enabled.
+
+Realtek 8168 Ethernet.  A slightly dimmer 8169.  Has to be jabbed with
+an electric cattle prod by software about once per day when it wedges.
+
+Profiling.  Charles Forsyth fixed various bugs to make user-mode
+profiling on ARMs work for the first time ever.
+
+
+What's not (yet) implemented.
+
+USB.  It probably just needs initialisation.
+
+NOR flash.
+
+Video.
+
+VFP3 floating point.  The Go toolchain's 5l generates VFP 3 floating-point
+instructions (among other changes).  Attempts to transplant just that
+code into our 5l failed to generate correct code.  Eventually someone
+will get this to work, and then we'll be able to use the hardware
+floating-point.  Even with only software emulation of floating-point,
+astro runs in under 3 seconds.
+
+In-line 64-bit arithmetic in 5[cl].
+
+And the really horrid peripherals: NAND flash and MMC.
+
+
+Known problems.
+
+kprof.  kprof profiling doesn't work correctly, charging all CPU time
+to _start.
+
+Reboot.  After an fshalt -r reboot (or two) with cpu1 enabled,
+accesses to pci registers (notably 0x80015000) in the newly-loaded
+kernel often hang.  One of three watchdogs' reset should jolt the
+system back to life and force a reboot through u-boot when this
+happens.  Sometimes the ethernet goes dead instead ("waiting for
+dhcp..." forever); this could be a different symptom of pci illness.
+
+Also following a reboot, cpu1's local (not tegra SoC shared) timers
+don't interrupt.  Since the local watchdogs don't seem to actually
+interrupt nor generate resets when used in anger (as opposed to
+boot-time check-out), their loss is merely a mystery.  The local timer
+not interrupting is more worrying.

+ 165 - 0
sys/src/9/teg2/arch.c

@@ -0,0 +1,165 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include <tos.h>
+#include "ureg.h"
+
+#include "arm.h"
+
+/*
+ * A lot of this stuff doesn't belong here
+ * but this is a convenient dumping ground for
+ * later sorting into the appropriate buckets.
+ */
+
+/* Give enough context in the ureg to produce a kernel stack for
+ * a sleeping process
+ */
+void
+setkernur(Ureg* ureg, Proc* p)
+{
+	ureg->pc = p->sched.pc;
+	/* +4: presumably skips a word saved below sched.sp — TODO confirm against setlabel in l.s */
+	ureg->sp = p->sched.sp+4;
+	ureg->r14 = PTR2UINT(sched);	/* fake return address into the scheduler */
+}
+
+/*
+ * called in sysfile.c.
+ * despite the name, enforces 4-byte (word) alignment of addr;
+ * on failure, posts a note to the current process and raises Ebadarg.
+ */
+void
+evenaddr(uintptr addr)
+{
+	if(addr & 3){
+		postnote(up, 1, "sys: odd address", NDebug);
+		error(Ebadarg);
+	}
+}
+
+/* go to user space: refresh the per-process Tos cycle counters on the way out */
+void
+kexit(Ureg*)
+{
+	uvlong t;
+	Tos *tos;
+
+	/* precise time accounting, kernel exit */
+	tos = (Tos*)(USTKTOP-sizeof(Tos));	/* Tos sits at the very top of the user stack */
+	cycles(&t);
+	tos->kcycles += t - up->kentry;
+	tos->pcycles = up->pcycles;
+	tos->cyclefreq = m->cpuhz;
+	tos->pid = up->pid;
+
+	/* make visible immediately to user phase */
+	l1cache->wbse(tos, sizeof *tos);
+}
+
+/*
+ *  return the userpc the last exception happened at
+ *  (from the current process's saved exception registers)
+ */
+uintptr
+userpc(void)
+{
+	Ureg *ureg = up->dbgreg;
+	return ureg->pc;
+}
+
+/* This routine must save the values of registers the user is not permitted
+ * to write from devproc and then restore the saved values before returning.
+ * currently a no-op on this port: register writes via devproc are ignored.
+ */
+void
+setregisters(Ureg* ureg, char* pureg, char* uva, int n)
+{
+	USED(ureg, pureg, uva, n);
+}
+
+/*
+ *  this is the body for all kproc's:
+ *  lower the priority level (enable interrupts), run the kproc's
+ *  function, and exit if it ever returns.
+ */
+static void
+linkproc(void)
+{
+	spllo();
+	up->kpfun(up->kparg);
+	pexit("kproc exiting", 0);
+}
+
+/*
+ *  setup stack and initial PC for a new kernel proc.  This is architecture
+ *  dependent because of the starting stack location
+ */
+void
+kprocchild(Proc *p, void (*func)(void*), void *arg)
+{
+	p->sched.pc = PTR2UINT(linkproc);
+	p->sched.sp = PTR2UINT(p->kstack+KSTACK);	/* stack grows down from the top */
+
+	p->kpfun = func;
+	p->kparg = arg;
+}
+
+/*
+ *  pc output by dumpaproc; 0 if the proc has no saved registers yet
+ */
+uintptr
+dbgpc(Proc* p)
+{
+	Ureg *ureg;
+
+	ureg = p->dbgreg;
+	if(ureg == 0)
+		return 0;
+
+	return ureg->pc;
+}
+
+/*
+ *  set mach dependent process state for a new process
+ */
+void
+procsetup(Proc* p)
+{
+	fpusysprocsetup(p);
+}
+
+/*
+ *  Save the mach dependent part of the process state.
+ *  the cycles() added here and subtracted in procrestore make
+ *  p->pcycles accumulate cycles spent while scheduled in.
+ *  NOTE(review): kprocs are charged here but procrestore returns early
+ *  for p->kp, so their pcycles inflate; harmless if unused — confirm.
+ */
+void
+procsave(Proc* p)
+{
+	uvlong t;
+
+	cycles(&t);
+	p->pcycles += t;
+
+	fpuprocsave(p);
+	l1cache->wbse(p, sizeof *p);		/* is this needed? */
+	l1cache->wb();				/* is this needed? */
+}
+
+void
+procrestore(Proc* p)
+{
+	uvlong t;
+
+	/* kernel procs get no cycle accounting, wfi prod nor cache sync */
+	if(p->kp)
+		return;
+	cycles(&t);
+	p->pcycles -= t;
+	wakewfi();		/* in case there's another runnable proc */
+
+	/* let it fault in at first use */
+//	fpuprocrestore(p);
+	l1cache->wb();			/* system is more stable with this */
+}
+
+/* true iff the saved psr says the trap came from user mode */
+int
+userureg(Ureg* ureg)
+{
+	return (ureg->psr & PsrMask) == PsrMusr;
+}

+ 869 - 0
sys/src/9/teg2/archtegra.c

@@ -0,0 +1,869 @@
+/*
+ * nvidia tegra 2 architecture-specific stuff
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+#include "io.h"
+#include "arm.h"
+
+#include "../port/netif.h"
+#include "etherif.h"
+#include "../port/flashif.h"
+#include "../port/usb.h"
+#include "../port/portusbehci.h"
+#include "usbehci.h"
+
+enum {
+	/* hardware limits imposed by register contents or layouts */
+	Maxcpus		= 4,
+	Maxflowcpus	= 2,
+
+	Debug	= 0,
+};
+
+typedef struct Clkrst Clkrst;
+typedef struct Diag Diag;
+typedef struct Flow Flow;
+typedef struct Scu Scu;
+typedef struct Power Power;
+
+/*
+ * clock & reset controller registers (overlaid at soc.clkrst).
+ * layout must match the hardware exactly; the _pad* members cover
+ * unused register offsets (spans given by the pad array sizes).
+ */
+struct Clkrst {
+	ulong	rstsrc;
+	ulong	rstdevl;
+	ulong	rstdevh;
+	ulong	rstdevu;
+
+	ulong	clkoutl;
+	ulong	clkouth;
+	ulong	clkoutu;
+
+	uchar	_pad0[0x24-0x1c];
+	ulong	supcclkdiv;		/* super cclk divider */
+	ulong	_pad1;
+	ulong	supsclkdiv;		/* super sclk divider */
+
+	uchar	_pad4[0x4c-0x30];
+	ulong	clkcpu;
+
+	uchar	_pad3[0xe0-0x50];	/* was a second `_pad1': duplicate member name */
+	ulong	pllxbase;		/* pllx controls CPU clock speed */
+	ulong	pllxmisc;
+	ulong	pllebase;		/* plle is dedicated to pcie */
+	ulong	pllemisc;
+
+	uchar	_pad2[0x340-0xf0];
+	ulong	cpuset;
+	ulong	cpuclr;
+};
+
+enum {
+	/* rstsrc bits */
+	Wdcpurst =	1<<0,
+	Wdcoprst =	1<<1,
+	Wdsysrst =	1<<2,
+	Wdsel =		1<<4,		/* tmr1 or tmr2? */
+	Wdena =		1<<5,
+
+	/* devl bits */
+	Sysreset =	1<<2,
+
+	/* clkcpu bits */
+	Cpu1stop =	1<<9,
+	Cpu0stop =	1<<8,
+
+	/* cpu* bits */
+	Cpu1dbgreset =	1<<13,
+	Cpu0dbgreset =	1<<12,
+	Cpu1wdreset =	1<<9,
+	Cpu0wdreset =	1<<8,
+	Cpu1dereset =	1<<5,
+	Cpu0dereset =	1<<4,
+	Cpu1reset =	1<<1,
+	Cpu0reset =	1<<0,
+};
+
+struct Power {
+	ulong	ctl;			/* mainly for rtc clock signals */
+	ulong	secregdis;
+	ulong	swrst;
+
+	ulong	wakevmask;
+	ulong	waklvl;
+	ulong	waksts;
+	ulong	swwaksts;
+
+	ulong	dpdpadsovr;		/* deep power down pads override */
+	ulong	dpdsample;
+	ulong	dpden;
+
+	ulong	gatetimroff;
+	ulong	gatetimron;
+	ulong	toggle;
+	ulong	unclamp;
+	ulong	gatests;		/* ro */
+
+	ulong	goodtmr;
+	ulong	blinktmr;
+
+	ulong	noiopwr;
+	ulong	detect;
+	ulong	detlatch;
+
+	ulong	scratch[24];
+	ulong	secscratch[6];
+
+	ulong	cpupwrgoodtmr;
+	ulong	cpupwrofftmr;
+
+	ulong	pgmask[2];
+
+	ulong	autowaklvl;
+	ulong	autowaklvlmask;
+	ulong	wakdelay;
+
+	ulong	detval;
+	ulong	ddr;
+	ulong	usbdebdel;	/* usb de-bounce delay */
+	ulong	usbao;
+	ulong	cryptoop;
+	ulong	pllpwb0ovr;
+	ulong	scratch24[42-24+1];
+	ulong	boundoutmirr[3];
+	ulong	sys33ven;
+	ulong	boundoutmirracc;
+	ulong	gate;
+};
+
+enum {
+	/* toggle bits */
+	Start	= 1<<8,
+	/* partition ids */
+	Partpcie= 3,
+	Partl2	= 4,
+};
+
+struct Scu {
+	ulong	ctl;
+	ulong	cfg;			/* ro */
+	ulong	cpupwrsts;
+	ulong	inval;
+
+	uchar	_pad0[0x40-0x10];
+	ulong	filtstart;
+	ulong	filtend;
+
+	uchar	_pad1[0x50-0x48];
+	ulong	accctl;			/* initially 0 */
+	ulong	nsaccctl;
+};
+
+enum {
+	/* ctl bits */
+	Scuenable =	1<<0,
+	Filter =	1<<1,
+	Scuparity =	1<<2,
+	Specfill =	1<<3,		/* only for PL310 */
+	Allport0 =	1<<4,
+	Standby =	1<<5,
+	Icstandby =	1<<6,
+};
+
+struct Flow {
+	ulong	haltcpu0;
+	ulong	haltcop;
+	ulong	cpu0;
+	ulong	cop;
+	ulong	xrq;
+	ulong	haltcpu1;
+	ulong	cpu1;
+};
+
+enum {
+	/* haltcpu* bits */
+	Stop =	2<<29,
+
+	/* cpu* bits */
+	Event =			1<<14,	/* w1c */
+	Waitwfebitsshift =	4,
+	Waitwfebitsmask =	MASK(2),
+	Eventenable =		1<<1,
+	Cpuenable =		1<<0,
+};
+
+struct Diag {
+	Cacheline c0;
+	Lock;
+	long	cnt;
+	long	sync;
+	Cacheline c1;
+};
+
+extern ulong testmem;
+
+/*
+ * number of cpus available.  contrast with conf.nmach, which is number
+ * of running cpus.
+ */
+int navailcpus;
+Isolated l1ptstable;
+
+Soc soc = {
+	.clkrst	= 0x60006000,		/* clock & reset signals */
+	.power	= 0x7000e400,
+	.exceptvec = PHYSEVP,		/* undocumented magic */
+	.sema	= 0x60001000,
+	.l2cache= PHYSL2BAG,		/* pl310 bag on the side */
+	.flow	= 0x60007000,
+
+	/* 4 non-gic controllers */
+//	.intr	= { 0x60004000, 0x60004100, 0x60004200, 0x60004300, },
+
+	/* private memory region */
+	.scu	= 0x50040000,
+	/* we got this address from the `cortex-a series programmer's guide'. */
+	.intr	= 0x50040100,		/* per-cpu interface */
+	.glbtmr	= 0x50040200,
+	.loctmr	= 0x50040600,
+	.intrdist=0x50041000,
+
+	.uart	= { 0x70006000, 0x70006040,
+		    0x70006200, 0x70006300, 0x70006400, },
+
+	.rtc	= 0x7000e000,
+	.tmr	= { 0x60005000, 0x60005008, 0x60005050, 0x60005058, },
+	.µs	= 0x60005010,
+
+	.pci	= 0x80000000,
+	.ether	= 0xa0024000,
+
+	.nand	= 0x70008000,
+	.nor	= 0x70009000,		/* also VIRTNOR */
+
+	.ehci	= P2VAHB(0xc5000000),	/* 1st of 3 */
+	.ide	= P2VAHB(0xc3000000),
+
+	.gpio	= { 0x6000d000, 0x6000d080, 0x6000d100, 0x6000d180,
+			    0x6000d200, 0x6000d280, 0x6000d300, },
+	.spi	= { 0x7000d400, 0x7000d600, 0x7000d800, 0x7000da00, },
+ 	.twsi	= 0x7000c000,
+	.mmc	= { P2VAHB(0xc8000000), P2VAHB(0xc8000200),
+		    P2VAHB(0xc8000400), P2VAHB(0xc8000600), },
+};
+
+static volatile Diag diag;
+static int missed;
+
+void
+dumpcpuclks(void)		/* print CPU pll & divider configuration */
+{
+	Clkrst *clk = (Clkrst *)soc.clkrst;
+
+	iprint("pllx base %#lux misc %#lux\n", clk->pllxbase, clk->pllxmisc);
+	iprint("plle base %#lux misc %#lux\n", clk->pllebase, clk->pllemisc);
+	iprint("super cclk divider %#lux\n", clk->supcclkdiv);
+	iprint("super sclk divider %#lux\n", clk->supsclkdiv);
+}
+
+/* canned name; the id register is decoded properly by cputype2name */
+static char *
+devidstr(ulong)
+{
+	return "ARM Cortex-A9";
+}
+
+void
+archtegralink(void)
+{
+}
+
+/* convert AddrDevid register to a string in buf and return buf */
+char *
+cputype2name(char *buf, int size)
+{
+	ulong r;
+
+	r = cpidget();			/* main id register */
+	assert((r >> 24) == 'A');	/* implementer must be ARM */
+	seprint(buf, buf + size, "Cortex-A9 r%ldp%ld",
+		(r >> 20) & MASK(4), r & MASK(4));	/* major, minor revision */
+	return buf;
+}
+
+/*
+ * enable workarounds for known cortex-a9 errata (numbers in the
+ * comments below), keyed on the rNpM chip revision.
+ */
+static void
+errata(void)
+{
+	ulong reg, r, p;
+
+	/* apply cortex-a9 errata workarounds */
+	r = cpidget();			/* main id register */
+	assert((r >> 24) == 'A');
+	p = r & MASK(4);		/* minor revision */
+	r >>= 20;
+	r &= MASK(4);			/* major revision */
+
+	/* this is an undocumented `diagnostic register' that linux knows */
+	reg = cprdsc(0, CpDTLB, 0, 1);
+	if (r < 2 || r == 2 && p <= 2)
+		reg |= 1<<4;			/* 742230 */
+	if (r == 2 && p <= 2)
+		reg |= 1<<6 | 1<<12 | 1<<22;	/* 743622, 2×742231 */
+	if (r < 3)
+		reg |= 1<<11;			/* 751472 */
+	cpwrsc(0, CpDTLB, 0, 1, reg);
+}
+
+void
+archconfinit(void)
+{
+	char *p;
+	ulong hz;
+
+	assert(m != nil);
+	m->cpuhz = 1000 * Mhz;			/* trimslice speed */
+	/* allow the *cpumhz configuration variable to override, within sanity bounds */
+	p = getconf("*cpumhz");
+	if (p) {
+		hz = atoi(p) * Mhz;
+		if (hz >= 100*Mhz && hz <= 3600UL*Mhz)
+			m->cpuhz = hz;
+	}
+	m->delayloop = m->cpuhz/2000;		/* initial estimate */
+	errata();
+}
+
+/*
+ * describe ether controller ctlrno in *ether; return 1 if it exists,
+ * -1 if not.  only ctlr 0 (the pci-e rtl8169) exists on this board.
+ */
+int
+archether(unsigned ctlrno, Ether *ether)
+{
+	switch(ctlrno) {
+	case 0:
+		ether->type = "rtl8169";		/* pci-e ether */
+		ether->ctlrno = ctlrno;
+		ether->irq = Pcieirq;			/* non-msi pci-e intr */
+		ether->nopt = 0;
+		ether->mbps = 1000;
+		return 1;
+	}
+	return -1;
+}
+
+void
+dumpscustate(void)
+{
+	Scu *scu = (Scu *)soc.scu;
+
+	print("cpu%d scu: accctl %#lux\n", m->machno, scu->accctl);
+	print("cpu%d scu: smp cpu bit map %#lo for %ld cpus; ", m->machno,
+		(scu->cfg >> 4) & MASK(4), (scu->cfg & MASK(2)) + 1);
+	print("cpus' power %#lux\n", scu->cpupwrsts);
+}
+
+/*
+ * turn on the snoop control unit (l1 coherence).
+ * idempotent: returns at once if already enabled.
+ */
+void
+scuon(void)
+{
+	Scu *scu = (Scu *)soc.scu;
+
+	if (scu->ctl & Scuenable)
+		return;
+	scu->inval = MASK(16);		/* invalidate all scu tag ways first */
+	coherence();
+	scu->ctl = Scuparity | Scuenable | Specfill;
+	coherence();
+}
+
+/*
+ * number of cpus to use: the hardware count from the scu config
+ * register, clamped to MAXMACH and optionally reduced by the *ncpu
+ * configuration variable.  computed once and cached in navailcpus.
+ */
+int
+getncpus(void)
+{
+	int n;
+	char *p;
+	Scu *scu;
+
+	if (navailcpus == 0) {
+		scu = (Scu *)soc.scu;
+		navailcpus = (scu->cfg & MASK(2)) + 1;
+		if (navailcpus > MAXMACH)
+			navailcpus = MAXMACH;
+
+		p = getconf("*ncpu");
+		if (p && *p) {
+			n = atoi(p);
+			if (n > 0 && n < navailcpus)
+				navailcpus = n;
+		}
+	}
+	return navailcpus;
+}
+
+void
+cpuidprint(void)
+{
+	char name[64];
+
+	cputype2name(name, sizeof name);
+	delay(50);				/* let uart catch up */
+	iprint("cpu%d: %lldMHz ARM %s %s-endian\n",
+		m->machno, m->cpuhz / Mhz, name,
+		getpsr() & PsrBigend? "big": "little");
+}
+
+/*
+ * enable all module clocks and take all modules out of reset,
+ * then arm the watchdog reset sources.
+ */
+static void
+clockson(void)
+{
+	Clkrst *clk = (Clkrst *)soc.clkrst;
+
+	/* enable all by clearing resets */
+	clk->rstdevl = clk->rstdevh = clk->rstdevu = 0;
+	coherence();
+	clk->clkoutl = clk->clkouth = clk->clkoutu = ~0; /* enable all clocks */
+	coherence();
+
+	clk->rstsrc = Wdcpurst | Wdcoprst | Wdsysrst | Wdena;
+	coherence();
+}
+
+/* we could be shutting down ourself (if cpu == m->machno), so take care. */
+void
+stopcpu(uint cpu)
+{
+	Flow *flow = (Flow *)soc.flow;
+	Clkrst *clk = (Clkrst *)soc.clkrst;
+
+	if (cpu == 0) {
+		iprint("stopcpu: may not stop cpu0\n");
+		return;
+	}
+
+	/* take cpu out of the scheduling mix and record it as stopped */
+	machoff(cpu);
+	lock(&active);
+	active.stopped |= 1 << cpu;
+	unlock(&active);
+	l1cache->wb();
+
+	/* shut down arm7 avp coproc so it can't cause mischief. */
+	/* could try watchdog without stopping avp. */
+	flow->haltcop = Stop;
+	coherence();
+	flow->cop = 0;					/* no Cpuenable */
+	coherence();
+	delay(10);
+
+	/* halt cpu via the flow controller.  cpu != 0 here, so the
+	 * cpu0 arms of these ternaries are defensive only. */
+	assert(cpu < Maxflowcpus);
+	*(cpu == 0? &flow->haltcpu0: &flow->haltcpu1) = Stop;
+	coherence();
+	*(cpu == 0? &flow->cpu0: &flow->cpu1) = 0;	/* no Cpuenable */
+	coherence();
+	delay(10);
+
+	/* cold reset */
+	assert(cpu < Maxcpus);
+	clk->cpuset = (Cpu0reset | Cpu0dbgreset | Cpu0dereset) << cpu;
+	coherence();
+	delay(1);
+
+	l1cache->wb();
+}
+
+/*
+ * spin barrier: bump *cntp and busy-wait until n cpus have arrived.
+ * the count is not reset here; callers adec it back down afterwards.
+ */
+static void
+synccpus(volatile long *cntp, int n)
+{
+	ainc(cntp);
+	while (*cntp < n)
+		;
+	/* all cpus should now be here */
+}
+
+/*
+ * one pass of the l1 coherence test: each cpu atomically increments
+ * and decrements the shared counter ~10⁶ times; if the caches are
+ * coherent, the counter must be exactly zero once all cpus finish.
+ */
+static void
+pass1(int pass, volatile Diag *dp)
+{
+	int i;
+
+	if(m->machno == 0)
+		iprint(" %d", pass);
+	for (i = 1000*1000; --i > 0; ) {
+		ainc(&dp->cnt);
+		adec(&dp->cnt);
+	}
+
+	synccpus(&dp->sync, navailcpus);
+	/* all cpus are now here */
+
+	ilock(dp);
+	if(dp->cnt != 0)
+		panic("cpu%d: diag: failed w count %ld", m->machno, dp->cnt);
+	iunlock(dp);
+
+	synccpus(&dp->sync, 2 * navailcpus);
+	/* all cpus are now here */
+	adec(&dp->sync);
+	adec(&dp->sync);
+}
+
+/*
+ * try to confirm coherence of l1 caches.
+ * assume that all available cpus will be started.
+ * (only active when Debug is set.)
+ */
+void
+l1diag(void)
+{
+	int pass;
+	volatile Diag *dp;
+
+	if (!Debug)
+		return;
+
+	l1cache->wb();
+
+	/*
+	 * synchronise and print
+	 */
+	dp = &diag;
+	ilock(dp);
+	if (m->machno == 0)
+		iprint("l1: waiting for %d cpus... ", navailcpus);
+	iunlock(dp);
+
+	synccpus(&dp->sync, navailcpus);
+
+	ilock(dp);
+	if (m->machno == 0)
+		iprint("cache coherency pass");
+	iunlock(dp);
+
+	/* second barrier lets every cpu see the message before testing */
+	synccpus(&dp->sync, 2 * navailcpus);
+	adec(&dp->sync);
+	adec(&dp->sync);
+
+	/*
+	 * cpus contend
+	 */
+	for (pass = 0; pass < 3; pass++)
+		pass1(pass, dp);
+
+	/*
+	 * synchronise and check sanity
+	 */
+	synccpus(&dp->sync, navailcpus);
+
+	if(dp->sync < navailcpus || dp->sync >= 2 * navailcpus)
+		panic("cpu%d: diag: failed w dp->sync %ld", m->machno,
+			dp->sync);
+	if(dp->cnt != 0)
+		panic("cpu%d: diag: failed w dp->cnt %ld", m->machno,
+			dp->cnt);
+
+	ilock(dp);
+	iprint(" cpu%d ok", m->machno);
+	iunlock(dp);
+
+	synccpus(&dp->sync, 2 * navailcpus);
+	adec(&dp->sync);
+	adec(&dp->sync);
+	l1cache->wb();
+
+	/*
+	 * all done, print
+	 */
+	ilock(dp);
+	if (m->machno == 0)
+		iprint("\n");
+	iunlock(dp);
+}
+
+/* release cpu from reset and let it run (reverses stopcpu) */
+static void
+unfreeze(uint cpu)
+{
+	Clkrst *clk = (Clkrst *)soc.clkrst;
+	Flow *flow = (Flow *)soc.flow;
+
+	assert(cpu < Maxcpus);
+
+	clk->clkcpu &= ~(Cpu0stop << cpu);	/* restart cpu's clock */
+	coherence();
+	/* out of reset */
+	clk->cpuclr = (Cpu0reset | Cpu0wdreset | Cpu0dbgreset | Cpu0dereset) <<
+		cpu;
+	coherence();
+
+	assert(cpu < Maxflowcpus);
+	*(cpu == 0? &flow->cpu0: &flow->cpu1) = 0;
+	coherence();
+	*(cpu == 0? &flow->haltcpu0: &flow->haltcpu1) = 0; /* normal operat'n */
+	coherence();
+}
+
+/*
+ * this is all a bit magic.  the soc.exceptvec register is effectively
+ * undocumented.  we had to look at linux and experiment, alas.  this is the
+ * sort of thing that should be standardised as part of the cortex mpcore spec.
+ * even intel document their equivalent procedure.
+ *
+ * protocol: write the physical address of the reset entry point into the
+ * exception-vector register and unfreeze the cpu; the new cpu starts at
+ * _vrst and, once up, writes its machno back into the same register
+ * (see cpustart).  returns 0 on success, -1 on failure.
+ */
+int
+startcpu(uint cpu)
+{
+	int i, r;
+	ulong oldvec, rstaddr;
+	ulong *evp = (ulong *)soc.exceptvec;	/* magic */
+
+	r = 0;
+	if (getncpus() < 2 || cpu == m->machno ||
+	    cpu >= MAXMACH || cpu >= navailcpus)
+		return -1;
+
+	oldvec = *evp;
+	l1cache->wb();			/* start next cpu w same view of ram */
+	*evp = rstaddr = PADDR(_vrst);	/* will start cpu executing at _vrst */
+	coherence();
+	l1cache->wb();
+	unfreeze(cpu);
+
+	/* wait up to 2 s for the new cpu to announce itself */
+	for (i = 2000; i > 0 && *evp == rstaddr; i--)
+		delay(1);
+	if (i <= 0 || *evp != cpu) {
+		iprint("cpu%d: didn't start!\n", cpu);
+		stopcpu(cpu);		/* make sure it's stopped */
+		r = -1;
+	}
+	*evp = oldvec;
+	return r;
+}
+
+/* verify that we're in the secure world; report any debug enables */
+static void
+cksecure(void)
+{
+	ulong db;
+	extern ulong getdebug(void);
+
+	if (getscr() & 1)		/* NS bit — presumably; verify vs. arm docs */
+		panic("cpu%d: running non-secure", m->machno);
+	db = getdebug();
+	if (db)
+		iprint("cpu%d: debug enable reg %#lux\n", m->machno, db);
+}
+
+/*
+ * join this cpu to the coherence domain (needed for ldrex/strex and
+ * broadcast cache/tlb maintenance).  returns the previous aux ctl value.
+ */
+ulong
+smpon(void)
+{
+	ulong aux;
+
+	/* cortex-a9 model-specific configuration */
+	aux = getauxctl();
+	putauxctl(aux | CpACsmp | CpACmaintbcast);
+	return aux;
+}
+
+void
+cortexa9cachecfg(void)
+{
+	/* cortex-a9 model-specific configuration */
+	putauxctl(getauxctl() | CpACparity | CpAClwr0line | CpACl2pref);
+}
+
+/*
+ * called on a cpu other than 0 from cpureset in l.s,
+ * from _vrst in lexception.s.
+ * mmu and l1 (and system-wide l2) caches and coherency (smpon) are on,
+ * but interrupts are disabled.
+ * our mmu is using an exact copy of cpu0's l1 page table
+ * as it was after userinit ran.
+ */
+void
+cpustart(void)
+{
+	int ms;
+	ulong *evp;
+	Power *pwr;
+
+	up = nil;
+	if (active.machs & (1<<m->machno)) {
+		serialputc('?');
+		serialputc('r');
+		panic("cpu%d: resetting after start", m->machno);
+	}
+	assert(m->machno != 0);
+
+	errata();
+	cortexa9cachecfg();
+	memdiag(&testmem);
+
+	machinit();			/* bumps nmach, adds bit to machs */
+	machoff(m->machno);		/* not ready to go yet */
+
+	/* clock signals and scu are system-wide and already on */
+	clockshutdown();		/* kill any watch-dog timer */
+
+	trapinit();
+	clockinit();			/* sets loop delay */
+	timersinit();
+	cpuidprint();
+
+	/*
+	 * notify cpu0 that we're up so it can proceed to l1diag.
+	 * (cpu0 polls the exception vector reg for our machno; see startcpu)
+	 */
+	evp = (ulong *)soc.exceptvec;	/* magic */
+	*evp = m->machno;
+	coherence();
+
+	l1diag();		/* contend with other cpus to verify sanity */
+
+	/*
+	 * pwr->noiopwr == 0
+	 * pwr->detect == 0x1ff (default, all disabled)
+	 */
+	pwr = (Power *)soc.power;
+	assert(pwr->gatests == MASK(7)); /* everything has power */
+
+	/*
+	 * 8169 has to initialise before we get past this, thus cpu0
+	 * has to schedule processes first.
+	 */
+	if (Debug)
+		iprint("cpu%d: waiting for 8169\n", m->machno);
+	for (ms = 0; !l1ptstable.word && ms < 5000; ms += 10) {
+		delay(10);
+		cachedinvse(&l1ptstable.word, sizeof l1ptstable.word);
+	}
+	if (!l1ptstable.word)
+		iprint("cpu%d: 8169 unreasonably slow; proceeding\n", m->machno);
+	/* now safe to copy cpu0's l1 pt in mmuinit */
+
+	mmuinit();			/* update our l1 pt from cpu0's */
+	fpon();
+	machon(m->machno);		/* now ready to go and be scheduled */
+
+	if (Debug)
+		iprint("cpu%d: scheding\n", m->machno);
+	schedinit();
+	panic("cpu%d: schedinit returned", m->machno);
+}
+
+/* mainly used to break out of wfi */
+void
+sgintr(Ureg *ureg, void *)
+{
+	iprint("cpu%d: got sgi\n", m->machno);
+	/* try to prod cpu1 into life when it gets stuck */
+	if (m->machno != 0)
+		clockprod(ureg);
+}
+
+/*
+ * early, one-time board setup: clocks, cpu configuration, fpu,
+ * and the inter-cpu sgi interrupts.
+ */
+void
+archreset(void)
+{
+	static int beenhere;
+
+	if (beenhere)
+		return;
+	beenhere = 1;
+
+	/* conservative temporary values until archconfinit runs */
+	m->cpuhz = 1000 * Mhz;			/* trimslice speed */
+	m->delayloop = m->cpuhz/2000;		/* initial estimate */
+
+	prcachecfg();
+
+	clockson();
+	/* all partitions were powered up by u-boot, so needn't do anything */
+	archconfinit();
+//	resetusb();
+	fpon();
+
+	if (irqtooearly)
+		panic("archreset: too early for irqenable");
+	irqenable(Cpu0irq, sgintr, nil, "cpu0");
+	irqenable(Cpu1irq, sgintr, nil, "cpu1");
+	/* ... */
+}
+
+/* reboot the whole system via the clock module's system reset */
+void
+archreboot(void)
+{
+	Clkrst *clk = (Clkrst *)soc.clkrst;
+
+	assert(m->machno == 0);
+	iprint("archreboot: reset!\n");
+	delay(20);
+
+	clk->rstdevl |= Sysreset;
+	coherence();
+	delay(500);
+
+	/* shouldn't get here */
+	splhi();
+	iprint("awaiting reset");
+	for(;;) {
+		delay(1000);
+		print(".");
+	}
+}
+
+/* no keyboard hardware to set up on this board */
+void
+kbdinit(void)
+{
+}
+
+/*
+ * probe addr and report it (with "missing:" once, then continuation
+ * lines) if inaccessible.  addr == 0 is reported as a config error.
+ */
+static void
+missing(ulong addr, char *name)
+{
+	static int firstmiss = 1;
+
+	if (addr == 0) {
+		iprint("address zero for %s\n", name);
+		return;
+	}
+	if (probeaddr(addr) >= 0)
+		return;
+	missed++;
+	if (firstmiss) {
+		iprint("missing:");
+		firstmiss = 0;
+	} else
+		iprint(",\n\t");
+	iprint(" %s at %#lux", name, addr);
+}
+
+/* verify that all the necessary device registers are accessible */
+void
+chkmissing(void)
+{
+	delay(10);
+	missing(KZERO, "dram");
+	missing(soc.intr, "intr ctlr");
+	missing(soc.intrdist, "intr distrib");
+	missing(soc.tmr[0], "tegra timer1");
+	missing(soc.uart[0], "console uart");
+	missing(soc.pci, "pcie");
+	missing(soc.ether, "ether8169");
+	missing(soc.µs, "µs counter");
+	if (missed)
+		iprint("\n");
+	delay(10);
+}
+
+/* no flash write-protect control on this board */
+void
+archflashwp(Flash*, int)
+{
+}
+
+/*
+ * for ../port/devflash.c:/^flashreset
+ * retrieve flash type, virtual base and length and return 0;
+ * return -1 on error (no flash)
+ */
+int
+archflashreset(int bank, Flash *f)
+{
+	if(bank != 0)
+		return -1;
+panic("archflashreset: rewrite for nor & nand flash on ts");
+	/*
+	 * this is set up for the igepv2 board.
+	 * NOTE(review): unreachable after the panic above; leftover from
+	 * the omap port, kept as a template for the eventual rewrite.
+	 */
+	f->type = "onenand";
+	f->addr = (void*)VIRTNOR;		/* mapped here by archreset */
+	f->size = 0;				/* done by probe */
+	f->width = 1;
+	f->interleave = 0;
+	return 0;
+}

+ 311 - 0
sys/src/9/teg2/arm.h

@@ -0,0 +1,311 @@
+/*
+ * arm-specific definitions for cortex-a8 and -a9
+ * these are used in C and assembler
+ *
+ * `cortex' refers to the cortex-a8 or -a9.
+ */
+
+#define NREGS		15	/* general-purpose regs, R0 through R14 */
+
+/*
+ * Program Status Registers
+ */
+#define PsrMusr		0x00000010		/* mode */
+#define PsrMfiq		0x00000011
+#define PsrMirq		0x00000012
+#define PsrMsvc		0x00000013	/* `protected mode for OS' */
+#define PsrMmon		0x00000016	/* `secure monitor' (trustzone hyper) */
+#define PsrMabt		0x00000017
+#define PsrMund		0x0000001B
+#define PsrMsys		0x0000001F	/* `privileged user mode for OS' (trustzone) */
+#define PsrMask		0x0000001F
+
+#define PsrThumb	0x00000020		/* beware hammers */
+#define PsrDfiq		0x00000040		/* disable FIQ interrupts */
+#define PsrDirq		0x00000080		/* disable IRQ interrupts */
+#define PsrDasabt	0x00000100		/* disable asynch aborts */
+#define PsrBigend	0x00000200
+
+#define PsrJaz		0x01000000		/* java mode */
+
+#define PsrV		0x10000000		/* overflow */
+#define PsrC		0x20000000		/* carry/borrow/extend */
+#define PsrZ		0x40000000		/* zero */
+#define PsrN		0x80000000		/* negative/less than */
+
+#define PsrMbz		(PsrJaz|PsrThumb|PsrBigend) /* these bits must be 0 */
+
+/*
+ * MCR and MRC are anti-mnemonic.
+ *	MTCP	coproc, opcode1, Rd, CRn, CRm[, opcode2]	# arm -> coproc
+ *	MFCP	coproc, opcode1, Rd, CRn, CRm[, opcode2]	# coproc -> arm
+ */
+
+#define MTCP	MCR
+#define MFCP	MRC
+
+/* instruction decoding */
+#define ISCPOP(op)	((op) == 0xE || ((op) & ~1) == 0xC)
+#define ISFPAOP(cp, op)	((cp) == CpOFPA && ISCPOP(op))
+#define ISVFPOP(cp, op)	(((cp) == CpDFP || (cp) == CpFP) && ISCPOP(op))
+
+/*
+ * Coprocessors
+ *	MCR	coproc, opcode1, Rd, CRn, CRm[, opcode2]	# arm -> coproc
+ *	MRC	coproc, opcode1, Rd, CRn, CRm[, opcode2]	# coproc -> arm
+ */
+#define CpOFPA		1			/* ancient 7500 FPA */
+#define CpFP		10			/* float FP, VFP cfg. */
+#define CpDFP		11			/* double FP */
+#define CpSC		15			/* System Control */
+
+/*
+ * Primary (CRn) CpSC registers.
+ */
+#define	CpID		0			/* ID and cache type */
+#define	CpCONTROL	1			/* miscellaneous control */
+#define	CpTTB		2			/* Translation Table Base(s) */
+#define	CpDAC		3			/* Domain Access Control */
+#define	CpFSR		5			/* Fault Status */
+#define	CpFAR		6			/* Fault Address */
+#define	CpCACHE		7			/* cache/write buffer control */
+#define	CpTLB		8			/* TLB control */
+#define	CpCLD		9			/* L2 Cache Lockdown, op1==1 */
+#define CpTLD		10			/* TLB Lockdown, with op2 */
+#define CpVECS		12			/* vector bases, op1==0, Crm==0, op2s (cortex) */
+#define	CpPID		13			/* Process ID */
+#define CpDTLB		15			/* TLB, L1 cache stuff (cortex) */
+
+/*
+ * CpTTB op1==0, Crm==0 opcode2 values.
+ */
+#define CpTTB0		0			/* secure ttb */
+#define CpTTB1		1			/* non-secure ttb (v7) */
+#define CpTTBctl	2			/* v7 */
+
+/*
+ * CpFSR op1==0, Crm==0 opcode 2 values.
+ */
+#define CpDFSR		0			/* data fault status */
+#define CpIFSR		1			/* instruction fault status */
+
+/*
+ * CpFAR op1==0, Crm==0 opcode 2 values.
+ */
+#define CpDFAR		0			/* data fault address */
+#define CpIFAR		2			/* instruction fault address */
+
+/*
+ * CpID Secondary (CRm) registers.
+ */
+#define CpIDidct	0
+
+/*
+ * CpID CpIDidct op1==0 opcode2 fields.
+ */
+#define CpIDid		0			/* main ID */
+#define CpIDct		1			/* cache type */
+#define CpIDtlb		3			/* tlb type (cortex) */
+#define CpIDmpid	5			/* multiprocessor id (cortex) */
+
+/* CpIDid op1 values */
+#define CpIDcsize	1			/* cache size (cortex) */
+#define CpIDcssel	2			/* cache size select (cortex) */
+
+/*
+ * CpID CpIDidct op1==CpIDcsize opcode2 fields.
+ */
+#define CpIDcasize	0			/* cache size */
+#define CpIDclvlid	1			/* cache-level id */
+
+/*
+ * CpCONTROL op2 codes, op1==0, Crm==0.
+ */
+#define CpMainctl	0		/* sctlr */
+#define CpAuxctl	1
+#define CpCPaccess	2
+
+/*
+ * CpCONTROL: op1==0, CRm==0, op2==CpMainctl.
+ * main control register.
+ * cortex/armv7 has more ops and CRm values.
+ */
+#define CpCmmu		0x00000001	/* M: MMU enable */
+#define CpCalign	0x00000002	/* A: alignment fault enable */
+#define CpCdcache	0x00000004	/* C: data cache on */
+#define CpBigend	(1<<7)
+#define CpCsw		(1<<10)		/* SW: SWP(B) enable (deprecated in v7) */
+#define CpCpredict	0x00000800	/* Z: branch prediction (armv7) */
+#define CpCicache	0x00001000	/* I: instruction cache on */
+#define CpChv		0x00002000	/* V: high vectors */
+#define CpCrr		(1<<14)	/* RR: round robin vs random cache replacement */
+#define CpCha		(1<<17)		/* HA: hw access flag enable */
+#define CpCdz		(1<<19)		/* DZ: divide by zero fault enable (not cortex-a9) */
+#define CpCfi		(1<<21)		/* FI: fast intrs */
+#define CpCve		(1<<24)		/* VE: intr vectors enable */
+#define CpCee		(1<<25)		/* EE: exception endianness: big */
+#define CpCnmfi		(1<<27)		/* NMFI: non-maskable fast intrs. (RO) */
+#define CpCtre		(1<<28)		/* TRE: TEX remap enable */
+#define CpCafe		(1<<29)		/* AFE: access flag (ttb) enable */
+#define CpCte		(1<<30)		/* TE: thumb exceptions */
+
+#define CpCsbz (1<<31 | CpCte | CpCafe | CpCtre | 1<<26 | CpCee | CpCve | \
+	CpCfi | 3<<19 | CpCha | 1<<15 | 3<<8 | CpBigend) /* must be 0 (armv7) */
+#define CpCsbo (3<<22 | 1<<18 | 1<<16 | CpChv | CpCsw | 017<<3)	/* must be 1 (armv7) */
+
+/*
+ * CpCONTROL: op1==0, CRm==0, op2==CpAuxctl.
+ * Auxiliary control register on cortex-a9.
+ * these differ from even the cortex-a8 bits.
+ */
+#define CpACparity		(1<<9)
+#define CpACca1way		(1<<8)	/* cache in a single way */
+#define CpACcaexcl		(1<<7)	/* exclusive cache */
+#define CpACsmp			(1<<6)	/* SMP l1 caches coherence; needed for ldrex/strex */
+#define CpAClwr0line		(1<<3)	/* write full cache line of 0s; see Fullline0 */
+#define CpACl1pref		(1<<2)	/* l1 prefetch enable */
+#define CpACl2pref		(1<<1)	/* l2 prefetch enable */
+#define CpACmaintbcast		(1<<0)	/* broadcast cache & tlb maint. ops */
+
+/*
+ * CpCONTROL Secondary (CRm) registers and opcode2 fields.
+ */
+#define CpCONTROLscr	1
+
+#define CpSCRscr	0			/* secure configuration */
+
+/*
+ * CpCACHE Secondary (CRm) registers and opcode2 fields.  op1==0.
+ * In ARM-speak, 'flush' means invalidate and 'clean' means writeback.
+ */
+#define CpCACHEintr	0			/* interrupt (op2==4) */
+#define CpCACHEisi	1			/* inner-sharable I cache (v7) */
+#define CpCACHEpaddr	4			/* 0: phys. addr (cortex) */
+#define CpCACHEinvi	5			/* instruction, branch table */
+#define CpCACHEinvd	6			/* data or unified */
+// #define CpCACHEinvu	7			/* unified (not on cortex) */
+#define CpCACHEva2pa	8			/* va -> pa translation (cortex) */
+#define CpCACHEwb	10			/* writeback */
+#define CpCACHEinvdse	11			/* data or unified by mva */
+#define CpCACHEwbi	14			/* writeback+invalidate */
+
+#define CpCACHEall	0			/* entire (not for invd nor wb(i) on cortex) */
+#define CpCACHEse	1			/* single entry */
+#define CpCACHEsi	2			/* set/index (set/way) */
+#define CpCACHEtest	3			/* test loop */
+#define CpCACHEwait	4			/* wait (prefetch flush on cortex) */
+#define CpCACHEdmbarr	5			/* wb only (cortex) */
+#define CpCACHEflushbtc	6			/* flush branch-target cache (cortex) */
+#define CpCACHEflushbtse 7			/* ⋯ or just one entry in it (cortex) */
+
+/*
+ * CpTLB Secondary (CRm) registers and opcode2 fields.
+ */
+#define CpTLBinvi	5			/* instruction */
+#define CpTLBinvd	6			/* data */
+#define CpTLBinvu	7			/* unified */
+
+#define CpTLBinv	0			/* invalidate all */
+#define CpTLBinvse	1			/* invalidate single entry */
+#define CpTBLasid	2			/* by ASID (cortex); sic: presumably meant CpTLBasid — name kept as-is for existing references */
+
+/*
+ * CpCLD Secondary (CRm) registers and opcode2 fields for op1==0. (cortex)
+ */
+#define CpCLDena	12			/* enables */
+#define CpCLDcyc	13			/* cycle counter */
+#define CpCLDuser	14			/* user enable */
+
+#define CpCLDenapmnc	0
+#define CpCLDenacyc	1
+
+/*
+ * CpCLD Secondary (CRm) registers and opcode2 fields for op1==1.
+ */
+#define CpCLDl2		0			/* l2 cache */
+
+#define CpCLDl2aux	2			/* auxiliary control */
+
+/*
+ * l2 cache aux. control
+ */
+#define CpCl2ecc	(1<<28)			/* use ecc, not parity */
+#define CpCl2noldforw	(1<<27)			/* no ld forwarding */
+#define CpCl2nowrcomb	(1<<25)			/* no write combining */
+#define CpCl2nowralldel	(1<<24)			/* no write allocate delay */
+#define CpCl2nowrallcomb (1<<23)		/* no write allocate combine */
+#define CpCl2nowralloc	(1<<22)			/* no write allocate */
+#define CpCl2eccparity	(1<<21)			/* enable ecc or parity */
+#define CpCl2inner	(1<<16)			/* inner cacheability */
+/* other bits are tag ram & data ram latencies */
+
+/*
+ * CpTLD Secondary (CRm) registers and opcode2 fields.
+ */
+#define CpTLDlock	0			/* TLB lockdown registers */
+#define CpTLDpreload	1			/* TLB preload */
+
+#define CpTLDi		0			/* TLB instr. lockdown reg. */
+#define CpTLDd		1			/* " data " " */
+
+/*
+ * CpVECS Secondary (CRm) registers and opcode2 fields.
+ */
+#define CpVECSbase	0
+
+#define CpVECSnorm	0			/* (non-)secure base addr */
+#define CpVECSmon	1			/* secure monitor base addr */
+
+/*
+ * MMU page table entries.
+ * memory must be cached, buffered, sharable and wralloc to participate in
+ * automatic L1 cache coherency.
+ */
+#define Mbz		(0<<4)			/* L1 page tables: must be 0 */
+#define Noexecsect	(1<<4)			/* L1 sections: no execute */
+#define Fault		0x00000000		/* L[12] pte: unmapped */
+
+#define Coarse		(Mbz|1)			/* L1: page table */
+#define Section		(Mbz|2)			/* L1 1MB */
+/*
+ * next 2 bits (L1wralloc & L1sharable) and Buffered and Cached must be
+ * set in l1 ptes for LDREX/STREX to work.
+ */
+#define L1wralloc	(1<<12)			/* L1 TEX */
+#define L1sharable	(1<<16)
+#define L1nonglobal	(1<<17)			/* tied to asid */
+#define Nonsecuresect	(1<<19)			/* L1 sections */
+
+#define Large		0x00000001		/* L2 64KB */
+#define Noexecsmall	1			/* L2: no execute */
+#define Small		0x00000002		/* L2 4KB */
+/*
+ * next 4 bits (Buffered, Cached, L2wralloc & L2sharable) must be set in
+ * l2 ptes for memory containing locks because LDREX/STREX require them.
+ */
+#define Buffered	0x00000004		/* L[12]: 0 write-thru, 1 -back */
+#define Cached		0x00000008		/* L[12] */
+#define L2wralloc	(1<<6)			/* L2 TEX (small pages) */
+#define L2apro		(1<<9)			/* L2 AP: read only */
+#define L2sharable	(1<<10)
+#define L2nonglobal	(1<<11)			/* tied to asid */
+#define Dom0		0
+
+/* attributes for memory containing locks */
+#define L1ptedramattrs	(Cached | Buffered | L1wralloc | L1sharable)
+#define L2ptedramattrs	(Cached | Buffered | L2wralloc | L2sharable)
+
+#define Noaccess	0			/* AP, DAC */
+#define Krw		1			/* AP */
+/* armv7 deprecates AP[2] == 1 & AP[1:0] == 2 (Uro), prefers 3 (new in v7) */
+#define Uro		2			/* AP */
+#define Urw		3			/* AP */
+#define Client		1			/* DAC */
+#define Manager		3			/* DAC */
+
+#define AP(n, v)	F((v), ((n)*2)+4, 2)
+#define L1AP(ap)	(AP(3, (ap)))
+#define L2AP(ap)	(AP(0, (ap)))		/* armv7 */
+#define DAC(n, v)	F((v), (n)*2, 2)
+
+#define HVECTORS	0xffff0000

+ 135 - 0
sys/src/9/teg2/arm.s

@@ -0,0 +1,135 @@
+/*
+ * nvidia tegra 2 machine assist, definitions
+ * dual-core cortex-a9 processor
+ *
+ * R9 and R10 are used for `extern register' variables.
+ * R11 is used by the loader as a temporary, so avoid it.
+ */
+
+#include "mem.h"
+#include "arm.h"
+
+#undef B					/* B is for 'botch' */
+
+#define KADDR(pa)	(KZERO    | ((pa) & ~KSEGM))
+#define PADDR(va)	(PHYSDRAM | ((va) & ~KSEGM))
+
+#define L1X(va)		(((((va))>>20) & 0x0fff)<<2)
+
+#define MACHADDR	(L1-MACHSIZE)		/* only room for cpu0's */
+
+/* L1 pte values */
+#define PTEDRAM	(Dom0|L1AP(Krw)|Section|L1ptedramattrs)
+#define PTEIO	(Dom0|L1AP(Krw)|Section)
+
+#define DOUBLEMAPMBS	 512	/* megabytes of low dram to double-map */
+
+/* steps on R0 */
+#define DELAY(label, mloops) \
+	MOVW	$((mloops)*1000000), R0; \
+label: \
+	SUB.S	$1, R0; \
+	BNE	label
+
+/* print a byte on the serial console; clobbers R0 & R6; needs R12 (SB) set */
+#define PUTC(c) \
+	BARRIERS; \
+	MOVW	$(c), R0; \
+	MOVW	$PHYSCONS, R6; \
+	MOVW	R0, (R6); \
+	BARRIERS
+
+/*
+ * new instructions
+ */
+
+#define SMC	WORD	$0xe1600070	/* low 4-bits are call # (trustzone) */
+/* flush branch-target cache */
+#define FLBTC  MTCP CpSC, 0, PC, C(CpCACHE), C(CpCACHEinvi), CpCACHEflushbtc
+/* flush one entry of the branch-target cache, va in R0 (cortex) */
+#define FLBTSE MTCP CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEflushbtse
+
+/* arm v7 arch defines these */
+#define DSB	WORD	$0xf57ff04f	/* data synch. barrier; last f = SY */
+#define DMB	WORD	$0xf57ff05f	/* data mem. barrier; last f = SY */
+#define ISB	WORD	$0xf57ff06f	/* instr. sync. barrier; last f = SY */
+
+#define WFI	WORD	$0xe320f003	/* wait for interrupt */
+#define NOOP	WORD	$0xe320f000
+
+#define CLZ(s, d) WORD	$(0xe16f0f10 | (d) << 12 | (s))	/* count leading 0s */
+
+#define SETEND(o) WORD	$(0xf1010000 | (o) << 9)  /* o==0, little-endian */
+
+#define CPSIE	WORD	$0xf1080080	/* intr enable: zeroes I bit */
+#define CPSID	WORD	$0xf10c00c0	/* intr disable: sets I,F bits */
+#define CPSAE	WORD	$0xf1080100	/* async abt enable: zeroes A bit */
+#define CPSMODE(m) WORD $(0xf1020000 | (m)) /* switch to mode m (PsrM*) */
+
+#define	CLREX	WORD	$0xf57ff01f
+#define	LDREX(fp,t)   WORD $(0xe<<28|0x01900f9f | (fp)<<16 | (t)<<12)
+/* `The order of operands is from left to right in dataflow order' - asm man */
+#define	STREX(f,tp,r) WORD $(0xe<<28|0x01800f90 | (tp)<<16 | (r)<<12 | (f)<<0)
+
+/* floating point */
+#define VMRS(fp, cpu) WORD $(0xeef00a10 | (fp)<<16 | (cpu)<<12) /* FP → arm */
+#define VMSR(cpu, fp) WORD $(0xeee00a10 | (fp)<<16 | (cpu)<<12) /* arm → FP */
+
+/*
+ * a popular code sequence used to write a pte for va is:
+ *
+ *	MOVW	R(n), TTB[LnX(va)]
+ *	// clean the cache line
+ *	DSB
+ *	// invalidate tlb entry for va
+ *	FLBTC
+ *	DSB
+ * 	PFF (now ISB)
+ */
+#define	BARRIERS	FLBTC; DSB; ISB
+
+/*
+ * invoked with PTE bits in R2, pa in R3, PTE pointed to by R4.
+ * fill PTE pointed to by R4 and increment R4 past it.
+ * increment R3 by a MB.  clobbers R1.
+ */
+#define FILLPTE() \
+	ORR	R3, R2, R1;			/* pte bits in R2, pa in R3 */ \
+	MOVW	R1, (R4); \
+	ADD	$4, R4;				/* bump PTE address */ \
+	ADD	$MiB, R3;			/* bump pa */ \
+
+/* zero PTE pointed to by R4 and increment R4 past it. assumes R0 is 0. */
+#define ZEROPTE() \
+	MOVW	R0, (R4); \
+	ADD	$4, R4;				/* bump PTE address */
+
+/*
+ * set kernel SB for zero segment (instead of usual KZERO segment).
+ * NB: the next line puts rubbish in R12:
+ *	MOVW	$setR12-KZERO(SB), R12
+ */
+#define SETZSB \
+	MOVW	$setR12(SB), R12;		/* load kernel's SB */ \
+	SUB	$KZERO, R12; \
+	ADD	$PHYSDRAM, R12
+
+/*
+ * note that 5a's RFE is not the v6/7 arch. instruction (0xf8900a00),
+ * which loads CPSR from the word after the PC at (R13), but rather
+ * the pre-v6 simulation `MOVM.IA.S.W (R13), [R15]' (0xe8fd8000 since
+ * MOVM is LDM in this case), which loads CPSR not from memory but
+ * from SPSR due to `.S'.
+ */
+#define RFEV7(r)    WORD $(0xf8900a00 | (r) << 16)
+#define RFEV7W(r)   WORD $(0xf8900a00 | (r) << 16 | 0x00200000)	/* RFE.W */
+#define RFEV7DB(r)  WORD $(0xf9100a00 | (r) << 16)		/* RFE.DB */
+#define RFEV7DBW(r) WORD $(0xf9100a00 | (r) << 16 | 0x00200000)	/* RFE.DB.W */
+
+#define CKPSR(psr, tmp, bad)
+#define CKCPSR(psrtmp, tmp, bad)
+
+/* return with cpu id in r and condition codes set from "r == 0" */
+#define CPUID(r) \
+	MFCP	CpSC, 0, r, C(CpID), C(CpIDidct), CpIDmpid; \
+	AND.S	$(MAXMACH-1), r			/* mask out non-cpu-id bits */

+ 56 - 0
sys/src/9/teg2/atom.s

@@ -0,0 +1,56 @@
+#include "arm.s"
+
+/*
+ * int cas(ulong *p, ulong ov, ulong nv);
+ *
+ * atomic compare-and-swap via ldrex/strex: if *p == ov, set *p = nv and
+ * return 1; otherwise return 0.  retries on strex contention.
+ */
+
+TEXT	cas+0(SB),0,$12		/* r0 holds p */
+TEXT	casp+0(SB),0,$12	/* r0 holds p */
+	MOVW	ov+4(FP), R1
+	MOVW	nv+8(FP), R2
+spincas:
+	LDREX(0,3)	/*	LDREX	0(R0),R3	*/
+	CMP.S	R3, R1
+	BNE	fail
+	STREX(2,0,4)	/*	STREX	0(R0),R2,R4	*/
+	CMP.S	$0, R4		/* strex stores 0 on success */
+	BNE	spincas
+	MOVW	$1, R0
+	BARRIERS
+	RET
+fail:
+	CLREX			/* drop exclusive-monitor reservation */
+	MOVW	$0, R0
+	RET
+
+/* atomically increment *arg; return the new value */
+TEXT _xinc(SB), $0	/* void	_xinc(long *); */
+TEXT ainc(SB), $0	/* long ainc(long *); */
+spinainc:
+	LDREX(0,3)	/*	LDREX	0(R0),R3	*/
+	ADD	$1,R3
+	STREX(3,0,4)	/*	STREX	0(R0),R3,R4	*/
+	CMP.S	$0, R4
+	BNE	spinainc
+	MOVW	R3, R0
+	RET
+
+/* atomically decrement *arg; return the new value */
+TEXT _xdec(SB), $0	/* long _xdec(long *); */
+TEXT adec(SB), $0	/* long adec(long *); */
+spinadec:
+	LDREX(0,3)	/*	LDREX	0(R0),R3	*/
+	SUB	$1,R3
+	STREX(3,0,4)	/*	STREX	0(R0),R3,R4	*/
+	CMP.S	$0, R4
+	BNE	spinadec
+	MOVW	R3, R0
+	RET
+
+/* load *arg and set the exclusive monitor; pair with storecond */
+TEXT loadlinked(SB), $0	/* long loadlinked(long *); */
+	LDREX(0,0)	/*	LDREX	0(R0),R0	*/
+	RET
+
+/* conditionally store; returns 1 on success, 0 if reservation was lost */
+TEXT storecond(SB), $0	/* int storecond(long *, long); */
+	MOVW	ov+4(FP), R3
+	STREX(3,0,0)	/*	STREX	0(R0),R3,R0	*/
+	RSB	$1, R0		/* R0 = 1 - R0: map strex's 0==ok to 1==ok */
+	RET

+ 456 - 0
sys/src/9/teg2/cache-l2-pl310.c

@@ -0,0 +1,456 @@
+/*
+ * PL310 level 2 cache (non-architectural bag on the side)
+ *
+ * guaranteed to work incorrectly with default settings; must set Sharovr.
+ *
+ * clean & invalidate (wbinv) is buggy, so we work around erratum 588369
+ * by disabling write-back and cache line-fill before, and restoring after.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/error.h"
+#include "arm.h"
+
+#define NWAYS(l2p)	((l2p)->auxctl & Assoc16way? 16: 8)
+#define L2P		((L2pl310 *)soc.l2cache)
+
+enum {
+	L2size		= 1024 * 1024,	/* according to the tegra 2 manual */
+	Wayszgran	= 16 * KiB,	/* granularity of way sizes */
+};
+
+typedef struct L2pl310 L2pl310;
+typedef struct Pl310op Pl310op;
+
+struct Pl310op {
+	ulong	pa;
+	ulong	_pad;
+	ulong	indexway;
+	ulong	way;
+};
+
+/*
+ * pl310 register map.  pad sizes are (target offset - current offset);
+ * debug must land at 0xf40 and filtstart at 0xc00 per the pl310 TRM.
+ */
+struct L2pl310 {
+	ulong	id;
+	ulong	type;
+	uchar	_pad0[0x100 - 0x8];
+	ulong	ctl;
+	ulong	auxctl;
+
+	uchar	_pad1[0x730 - 0x108];	/* boring regs */
+	ulong	sync;
+	uchar	_pad2[0x740 - 0x734];
+	ulong	r3p0sync;		/* workaround for r3p0 bug */
+	uchar	_pad3[0x770 - 0x744];
+	Pl310op	inv;			/* inv.indexway doesn't exist */
+	uchar	_pad4[0x7b0 - 0x780];
+	Pl310op	clean;
+	uchar	_pad5[0x7f0 - 0x7c0];
+	Pl310op	cleaninv;		/* at 0x7f0, ends at 0x800 */
+	uchar	_pad6[0xc00 - 0x800];	/* was 0xc00 - 0x7d0: put filtstart at 0xc30, not 0xc00 */
+	ulong	filtstart;
+	ulong	filtend;
+	uchar	_pad7[0xf40 - 0xc08];	/* was a second `_pad6': duplicate member won't compile */
+	ulong	debug;
+	/* ... */
+};
+
+enum {
+	/* ctl bits */
+	L2enable = 1,
+
+	/* auxctl bits */
+	Ipref	= 1<<29,		/* prefetch enables */
+	Dpref	= 1<<28,
+	Mbo	= 1<<25,
+	Sharovr	= 1<<22, /* shared attribute override (i.e., work right!) */
+	Parity	= 1<<21,
+	Waycfgshift= 17,
+	Waycfgmask = (1<<3) - 1,
+	Assoc16way = 1<<16,
+	/*
+	 * optim'n to 0 cache lines; must be enabled in a9(?!).
+	 * set CpAClwr0line on all cpus 1st.
+	 */
+	Fullline0= 1<<0,
+
+	/* debug bits */
+	Wt	= 1<<1,			/* write-through, not write-back */
+	Nolinefill= 1<<0,
+
+	Basecfg = Wt | Nolinefill,
+};
+
+static Lock l2lock;
+static int disallowed;			/* by user: *l2off= in plan9.ini */
+static int l2ison;
+static int bg_op_running;	/* set while a whole-cache (way-mask) op is in flight */
+static ulong waysmask;		/* mask of implemented ways: MASK(8) or MASK(16) */
+
+static Cacheimpl l2cacheimpl;
+
+/* spin until any background whole-cache operation completes */
+static void
+awaitbgop(void)
+{
+	while (bg_op_running)
+		;
+}
+
+/* acquire l2lock with no background op running (checked again under the lock) */
+static void
+getlock(void)
+{
+	awaitbgop();		/* wait at normal PL first */
+	ilock(&l2lock);
+	awaitbgop();		/* wait under lock */
+}
+
+/* pl310 cache sync: drain the controller's buffers */
+static void
+l2pl310sync(void)
+{
+	L2P->sync = 0;
+	coherence();
+}
+
+/*
+ * call this first to set sets/ways configuration.
+ * probes whether associativity and way size are changeable by writing
+ * auxctl and reading it back; idempotent (guarded by `configed').
+ */
+void
+l2pl310init(void)
+{
+	int waysz, nways;
+	ulong new;
+	L2pl310 *l2p = L2P;
+	static int configed;
+
+	if (getconf("*l2off") != nil) {
+//		iprint("l2 cache (pl310) disabled\n");
+		disallowed = 1;
+		return;
+	}
+	if (l2ison || configed)
+		return;
+	l2cache = &l2cacheimpl;
+	cachedwb();
+
+	/*
+	 * default config is:
+	 * l2: ext unified, 8 ways 512 sets 32 bytes/line => 128KB
+	 * but the tegra 2 manual says there's 1MB available.
+	 * ways or way-size may be fixed by hardware; the only way to tell
+	 * is to try to change the setting and read it back.
+	 */
+	l2pl310sync();
+	l2cache->inv();
+
+	/* figure out number of ways */
+	l2pl310sync();
+	nways = NWAYS(l2p);
+	if (!(l2p->auxctl & Assoc16way)) {
+		l2p->auxctl |= Assoc16way;
+		coherence();
+		l2pl310sync();
+		nways = NWAYS(l2p);
+//		iprint("\nl2: was set for 8 ways, asked for 16, got %d\n", nways);
+	}
+	waysmask = MASK(nways);
+
+	/* figure out way size (and thus number of sets) */
+	waysz = L2size / nways;
+	/* NB: & binds tighter than |, so this is (auxctl & ~field) | newfield.
+	 * the way-size field encodes log2(waysz/Wayszgran)+1 (decoded as -1
+	 * in l2pl310info). */
+	new = l2p->auxctl & ~(Waycfgmask << Waycfgshift) |
+		(log2(waysz / Wayszgran) + 1) << Waycfgshift;
+	l2p->auxctl = new;
+	coherence();
+	l2pl310sync();
+	l2cache->inv();
+
+//	iprint("\nl2: configed %d ways, %d sets (way size %d)\n", nways,
+//		waysz / CACHELINESZ, waysz);
+	if (l2p->auxctl != new)
+		iprint("l2 config %#8.8lux didn't stick; is now %#8.8lux\n",
+			new, l2p->auxctl);
+	configed++;
+}
+
+/* fill *cp with this pl310's geometry (zeroed if the l2 is off) */
+void
+l2pl310info(Memcache *cp)
+{
+	int pow2;
+	ulong waysz;
+	L2pl310 *l2p = L2P;
+
+	memset(cp, 0, sizeof *cp);
+	if (!l2ison)
+		return;
+
+	l2pl310init();
+	assert((l2p->id >> 24) == 'A');		/* ARM implementer code */
+	cp->level = 2;
+	cp->type = Unified;
+	cp->external = Extcache;
+	cp->setsways = Cara | Cawa | Cawt | Cawb;
+	cp->l1ip = 3<<14;				/* PIPT */
+	cp->setsh = cp->waysh = 0;			/* bag on the side */
+
+	cp->linelen = CACHELINESZ;
+	cp->log2linelen = log2(CACHELINESZ);
+
+	cp->nways = NWAYS(l2p);
+	/* way-size field stores log2(waysz/Wayszgran)+1; undo the +1 */
+	pow2 = ((l2p->auxctl >> Waycfgshift) & Waycfgmask) - 1;
+	if (pow2 < 0)
+		pow2 = 0;
+	waysz = (1 << pow2) * Wayszgran;
+	cp->nsets = waysz / CACHELINESZ;
+}
+
+/*
+ * configure and enable the pl310.  all configuration (filter regs,
+ * debug reg, auxctl) must precede setting L2enable.
+ */
+void
+l2pl310on(void)
+{
+	ulong ctl;
+	L2pl310 *l2p = L2P;
+
+	if (getconf("*l2off") != nil) {
+//		iprint("l2 cache (pl310) disabled\n");
+		disallowed = 1;
+		return;
+	}
+	if (l2ison)
+		return;
+
+	l2pl310init();
+	l2cache->inv();
+
+	/*
+	 * drain l1.  can't turn it off (which would make locks not work)
+	 * because doing so makes references below to the l2 registers wedge
+	 * the system.
+	 */
+	cacheuwbinv();
+	cacheiinv();
+
+	/*
+	 * this is only called once, on cpu0 at startup,
+	 * so we don't need locks here.
+	 * must do all configuration before enabling l2 cache.
+	 */
+	l2p->filtend = 0;
+	coherence();
+	l2p->filtstart = 0;		/* no enable bit */
+	l2p->debug = 0;			/* write-back, line fills allowed */
+	coherence();
+
+	ctl = l2p->auxctl;
+	/* don't change number of sets & ways, but reset all else. */
+	ctl &= Waycfgmask << Waycfgshift | Assoc16way;
+	ctl |= Sharovr;		/* actually work correctly for a change */
+	ctl |= Mbo | Ipref | Dpref | Parity | Fullline0;
+	l2p->auxctl = ctl;
+	coherence();
+
+	l2p->ctl |= L2enable;
+	coherence();
+
+	l2ison = 1;
+
+//	iprint("l2 cache (pl310) now on\n");
+}
+
+/* write back the l2 and disable it.  the l1 is left alone. */
+void
+l2pl310off(void)
+{
+	if (!l2ison)
+		return;
+	l2cache->wbinv();
+	getlock();
+	L2P->ctl &= ~L2enable;
+	coherence();
+	l2ison = 0;
+	iunlock(&l2lock);
+}
+
+
+/*
+ * write PADDR(line) for each cache line of [ava, ava+len) into the given
+ * pl310 per-line operation register (inv.pa, clean.pa or cleaninv.pa),
+ * then sync.  caller must hold l2lock.
+ */
+static void
+applyrange(ulong *reg, void *ava, int len)
+{
+	uintptr va, endva;
+
+	if (disallowed || !l2ison)
+		return;
+	if (len < 0)
+		panic("l2cache*se called with negative length");
+	endva = (uintptr)ava + len;
+	for (va = (uintptr)ava & ~(CACHELINESZ-1); va < endva;
+	     va += CACHELINESZ)
+		*reg = PADDR(va);
+	l2pl310sync();
+}
+
+/* invalidate the l2-cached range [va, va+bytes) */
+void
+l2pl310invse(void *va, int bytes)
+{
+	uintptr start, end;
+	L2pl310 *l2p = L2P;
+
+	/*
+	 * if start & end addresses are not on cache-line boundaries,
+	 * flush first & last cachelines before invalidating, so that
+	 * unrelated data sharing those lines is not thrown away.
+	 */
+	start = (uintptr)va;
+	end = start + bytes;
+	getlock();
+	if (start % CACHELINESZ != 0) {
+//		iprint("l2pl310invse: unaligned start %#p from %#p\n", start,
+//			getcallerpc(&va));
+		applyrange(&l2p->clean.pa, va, 1);
+	}
+	if (end % CACHELINESZ != 0) {
+//		iprint("l2pl310invse: unaligned end %#p from %#p\n", end,
+//			getcallerpc(&va));
+		applyrange(&l2p->clean.pa, (char *)va + bytes, 1);
+	}
+
+	applyrange(&l2p->inv.pa, va, bytes);
+	iunlock(&l2lock);
+}
+
+/* write back (clean) the l2-cached range [va, va+bytes) */
+void
+l2pl310wbse(void *va, int bytes)
+{
+	getlock();
+	applyrange(&L2P->clean.pa, va, bytes);
+	iunlock(&l2lock);
+}
+
+/*
+ * assume that ldrex/strex (thus locks) won't work when Wt in is effect,
+ * so don't manipulate locks between setting and clearing Wt.
+ * (Wt|Nolinefill around clean+invalidate is the erratum 588369 workaround.)
+ */
+void
+l2pl310wbinvse(void *va, int bytes)
+{
+	int odb;
+	L2pl310 *l2p = L2P;
+
+	if (!l2ison)
+		return;
+	getlock();
+	applyrange(&l2p->clean.pa, va, bytes);	/* paranoia */
+
+	odb = l2p->debug;
+	l2p->debug |= Wt | Nolinefill;		/* erratum workaround */
+	coherence();
+
+	applyrange(&l2p->cleaninv.pa, va, bytes);
+
+	l2p->debug = odb;
+	iunlock(&l2lock);
+}
+
+
+/*
+ * whole-cache ops operate on all ways at once via the way-mask registers;
+ * the hardware clears the mask bits as each way finishes.
+ *
+ * we want to wait for completion at normal PL.
+ * if waiting is interrupted, interrupt code that calls
+ * these ops could deadlock on a uniprocessor, so we only
+ * give up l2lock before waiting on multiprocessors.
+ * in this port, only cpu 0 gets interrupts other than local timer ones.
+ */
+
+/* invalidate the entire l2 (background way-mask op) */
+void
+l2pl310inv(void)
+{
+	L2pl310 *l2p = L2P;
+
+	if (disallowed)
+		return;
+
+	getlock();
+	bg_op_running = 1;
+	l2p->inv.way = waysmask;
+	coherence();
+	if (conf.nmach > 1)
+		iunlock(&l2lock);
+
+	while (l2p->inv.way & waysmask)		/* hw clears bits as ways finish */
+		;
+
+	if (conf.nmach > 1)
+		ilock(&l2lock);
+	l2pl310sync();
+	bg_op_running = 0;
+	iunlock(&l2lock);
+}
+
+/*
+ * write back (clean) the entire l2.
+ * maximum time seen is 2542µs, typical is 625µs.
+ */
+void
+l2pl310wb(void)
+{
+	L2pl310 *l2p = L2P;
+
+	if (disallowed || !l2ison)
+		return;
+
+	getlock();
+	bg_op_running = 1;
+	l2p->clean.way = waysmask;
+	coherence();
+	if (conf.nmach > 1)
+		iunlock(&l2lock);
+
+	while (l2p->clean.way & waysmask)
+		;
+
+	if (conf.nmach > 1)
+		ilock(&l2lock);
+	l2pl310sync();
+	bg_op_running = 0;
+	iunlock(&l2lock);
+}
+
+/*
+ * write back & invalidate the entire l2, with the erratum 588369
+ * workaround (Wt|Nolinefill) around the clean+invalidate.
+ */
+void
+l2pl310wbinv(void)
+{
+	int odb;
+	L2pl310 *l2p = L2P;
+
+	if (disallowed || !l2ison)
+		return;
+
+	l2pl310wb();			/* paranoia */
+
+	getlock();
+	bg_op_running = 1;
+	odb = l2p->debug;
+	l2p->debug |= Wt | Nolinefill;	/* erratum workaround */
+	coherence();
+
+	l2p->cleaninv.way = waysmask;
+	coherence();
+	if (conf.nmach > 1)
+		iunlock(&l2lock);
+
+	while (l2p->cleaninv.way & waysmask)
+		;
+
+	if (conf.nmach > 1)
+		ilock(&l2lock);
+	l2pl310sync();
+	l2p->debug = odb;
+	bg_op_running = 0;
+	iunlock(&l2lock);
+}
+
+/* Cacheimpl vector for the pl310, installed as l2cache by l2pl310init */
+static Cacheimpl l2cacheimpl = {
+	.info	= l2pl310info,
+	.on	= l2pl310on,
+	.off	= l2pl310off,
+
+	.inv	= l2pl310inv,
+	.wb	= l2pl310wb,
+	.wbinv	= l2pl310wbinv,
+
+	.invse	= l2pl310invse,
+	.wbse	= l2pl310wbse,
+	.wbinvse= l2pl310wbinvse,
+};

+ 240 - 0
sys/src/9/teg2/cache.v7.s

@@ -0,0 +1,240 @@
+/*
+ * cortex arm arch v7 cache flushing and invalidation
+ * included by l.s and rebootcode.s
+ */
+
+TEXT cacheiinv(SB), $-4				/* I invalidate */
+	MOVW	$0, R0
+	MTCP	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall /* ok on cortex */
+	ISB
+	RET
+
+/*
+ * set/way operators, passed a suitable set/way value in R0.
+ */
+TEXT cachedwb_sw(SB), $-4
+	MTCP	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEsi
+	RET
+
+TEXT cachedwbinv_sw(SB), $-4
+	MTCP	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEsi
+	RET
+
+TEXT cachedinv_sw(SB), $-4
+	MTCP	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvd), CpCACHEsi
+	RET
+
+	/* set cache size select */
+TEXT setcachelvl(SB), $-4
+	MTCP	CpSC, CpIDcssel, R0, C(CpID), C(CpIDidct), 0
+	ISB
+	RET
+
+	/* return cache sizes */
+TEXT getwayssets(SB), $-4
+	MFCP	CpSC, CpIDcsize, R0, C(CpID), C(CpIDidct), 0
+	RET
+
+/*
+ * l1 cache operations.
+ * l1 and l2 ops are intended to be called from C, thus need save no
+ * caller's regs, only those we need to preserve across calls.
+ */
+
+TEXT cachedwb(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedwb_sw(SB), R0
+	MOVW	$1, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT cachedwbinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedwbinv_sw(SB), R0
+	MOVW	$1, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT cachedinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedinv_sw(SB), R0
+	MOVW	$1, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT cacheuwbinv(SB), $-4
+	MOVM.DB.W [R14], (R13)	/* save lr on stack */
+	MOVW	CPSR, R1
+	CPSID			/* splhi */
+
+	MOVM.DB.W [R1], (R13)	/* save R1 on stack */
+
+	BL	cachedwbinv(SB)
+	BL	cacheiinv(SB)
+
+	MOVM.IA.W (R13), [R1]	/* restore R1 (saved CPSR) */
+	MOVW	R1, CPSR
+	MOVM.IA.W (R13), [R14]	/* restore lr */
+	RET
+
+/*
+ * architectural l2 cache operations
+ */
+
+TEXT _l2cacheuwb(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedwb_sw(SB), R0
+	MOVW	$2, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15	/* return */
+
+TEXT _l2cacheuwbinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	CPSR, R1
+	CPSID			/* splhi */
+
+	MOVM.DB.W [R1], (R13)	/* save R1 on stack */
+
+	MOVW	$cachedwbinv_sw(SB), R0
+	MOVW	$2, R8
+	BL	wholecache(SB)
+
+	BL	_l2cacheuinv(SB)
+
+	MOVM.IA.W (R13), [R1]	/* restore R1 (saved CPSR) */
+	MOVW	R1, CPSR
+	MOVW.P	8(R13), R15	/* return */
+
+TEXT _l2cacheuinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedinv_sw(SB), R0
+	MOVW	$2, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15	/* return */
+
+/*
+ * callers are assumed to be the above l1 and l2 ops.
+ * R0 is the function to call in the innermost loop.
+ * R8 is the cache level (1-origin: 1 or 2).
+ *
+ * R0	func to call at entry
+ * R1	func to call after entry
+ * R2	nsets
+ * R3	way shift (computed from R8)
+ * R4	set shift (computed from R8)
+ * R5	nways
+ * R6	set scratch
+ * R7	way scratch
+ * R8	cache level, 0-origin
+ * R9	extern reg up
+ * R10	extern reg m
+ *
+ * initial translation by 5c, then massaged by hand.
+ */
+TEXT wholecache+0(SB), $-4
+	MOVW	CPSR, R2
+	MOVM.DB.W [R2,R14], (SP) /* save regs on stack */
+
+	MOVW	R0, R1		/* save argument for inner loop in R1 */
+	SUB	$1, R8		/* convert cache level to zero origin */
+
+	/* we might not have the MMU on yet, so map R1 (func) to R14's space */
+	MOVW	R14, R0		/* get R14's segment ... */
+	AND	$KSEGM, R0
+	BIC	$KSEGM,	R1	/* strip segment from func address */
+	ORR	R0, R1		/* combine them */
+
+	/* get cache sizes */
+	SLL	$1, R8, R0	/* R0 = (cache - 1) << 1 */
+	MTCP	CpSC, CpIDcssel, R0, C(CpID), C(CpIDidct), 0 /* set cache select */
+	ISB
+	MFCP	CpSC, CpIDcsize, R0, C(CpID), C(CpIDidct), 0 /* get cache sizes */
+
+	/* compute # of ways and sets for this cache level */
+	SRA	$3, R0, R5	/* R5 (ways) = R0 >> 3 */
+	AND	$((1<<10)-1), R5 /* R5 = (R0 >> 3) & MASK(10) */
+	ADD	$1, R5		/* R5 (ways) = ((R0 >> 3) & MASK(10)) + 1 */
+
+	SRA	$13, R0, R2	/* R2 = R0 >> 13 */
+	AND	$((1<<15)-1), R2 /* R2 = (R0 >> 13) & MASK(15) */
+	ADD	$1, R2		/* R2 (sets) = ((R0 >> 13) & MASK(15)) + 1 */
+
+	/* precompute set/way shifts for inner loop */
+	MOVW	$(CACHECONF+0), R3	/* +0 = l1waysh */
+	MOVW	$(CACHECONF+4), R4	/* +4 = l1setsh */
+	CMP	$0, R8		/* cache == 1? */
+	ADD.NE	$(4*2), R3	/* no, assume l2: +8 = l2waysh */
+	ADD.NE	$(4*2), R3	/* +12 = l2setsh */
+
+	MOVW	R14, R0		/* get R14's segment ... */
+	AND	$KSEGM, R0
+
+	BIC	$KSEGM,	R3	/* strip segment from address */
+	ORR	R0, R3		/* combine them */
+	BIC	$KSEGM,	R4	/* strip segment from address */
+	ORR	R0, R4		/* combine them */
+	MOVW	(R3), R3
+	MOVW	(R4), R4
+
+	CMP	$0, R3		/* sanity checks */
+	BEQ	wbuggery
+	CMP	$0, R4
+	BEQ	sbuggery
+
+	CPSID			/* splhi to make entire op atomic */
+	BARRIERS
+
+	/* iterate over ways */
+	MOVW	$0, R7		/* R7: way */
+outer:
+	/* iterate over sets */
+	MOVW	$0, R6		/* R6: set */
+inner:
+	/* compute set/way register contents */
+	SLL	R3, R7, R0 	/* R0 = way << R3 (L?WAYSH) */
+	ORR	R8<<1, R0	/* R0 = way << L?WAYSH | (cache - 1) << 1 */
+	ORR	R6<<R4, R0 	/* R0 = way<<L?WAYSH | (cache-1)<<1 |set<<R4 */
+
+	BL	(R1)		/* call set/way operation with R0 arg. */
+
+	ADD	$1, R6		/* set++ */
+	CMP	R2, R6		/* set >= sets? */
+	BLT	inner		/* no, do next set */
+
+	ADD	$1, R7		/* way++ */
+	CMP	R5, R7		/* way >= ways? */
+	BLT	outer		/* no, do next way */
+
+	MOVM.IA.W (SP), [R2,R14] /* restore regs */
+	BARRIERS
+	MOVW	R2, CPSR	/* splx */
+
+	RET
+
+wbuggery:
+	PUTC('?')
+	PUTC('c')
+	PUTC('w')
+	B	topanic
+sbuggery:
+	PUTC('?')
+	PUTC('c')
+	PUTC('s')
+topanic:
+	MOVW	$.string<>+0(SB), R0
+	BIC	$KSEGM,	R0	/* strip segment from address */
+	MOVW	R14, R1		/* get R14's segment ... */
+	AND	$KSEGM, R1
+	ORR	R1, R0		/* combine them */
+	SUB	$12, R13	/* not that it matters, since we're panicing */
+	MOVW	R14, 8(R13)
+	BL	panic(SB)	/* panic("msg %#p", LR) */
+bugloop:
+	WFI
+	B	bugloop
+
+	DATA	.string<>+0(SB)/8,$"bad cach"
+	DATA	.string<>+8(SB)/8,$"e params"
+	DATA	.string<>+16(SB)/8,$"\073 pc %\043p"
+	DATA	.string<>+24(SB)/1,$"\z"
+	GLOBL	.string<>+0(SB),$25

+ 106 - 0
sys/src/9/teg2/caches-v7.c

@@ -0,0 +1,106 @@
+/*
+ * caches defined by arm v7 architecture
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+#include "io.h"
+#include "arm.h"
+
+/* map an l1 i-cache policy field value (2 bits) to a printable name */
+static char *
+l1iptype(uint type)
+{
+	static char *types[] = {
+		"reserved",
+		"asid-tagged VIVT",
+		"VIPT",
+		"PIPT",
+	};
+
+	if (type >= nelem(types) || types[type] == nil)
+		return "GOK";
+	return types[type];
+}
+
+/* printable names for the 3-bit cache-type fields of the clidr */
+static char *catype[] = {
+	"none,",
+	"i,",
+	"d,",
+	"split i&d,",
+	"unified,",
+	"gok,",
+	"gok,",
+	"gok,",
+};
+
+/*
+ * describe the cache at the given 1-origin level in *cp.
+ * ext is Intcache/Extcache; type is a catype index (e.g., Unified).
+ * internal caches are read via the cache-size-selection register.
+ */
+void
+cacheinfo(int level, Memcache *cp, int ext, int type)
+{
+	ulong setsways;
+
+	memset(cp, 0, sizeof *cp);
+	if (type == Nocache)
+		return;
+	cp->level = level;
+	cp->type = type;
+	cp->external = ext;
+	if (level == 2) {			/* external PL310 */
+		/*
+		 * NOTE(review): allcache->info memsets *cp again, discarding
+		 * the level/type/external just set.  on the a9 the clidr only
+		 * reports l1, so this path may never be taken — confirm.
+		 */
+		allcache->info(cp);
+		setsways = cp->setsways;
+	} else {
+		/* select internal cache level */
+		cpwrsc(CpIDcssel, CpID, CpIDid, 0, (level - 1) << 1);
+
+		setsways = cprdsc(CpIDcsize, CpID, CpIDid, 0);
+		cp->l1ip = cpctget();
+		cp->nways = ((setsways >> 3)  & MASK(10)) + 1;
+		cp->nsets = ((setsways >> 13) & MASK(15)) + 1;
+		cp->log2linelen = (setsways & MASK(2)) + 2 + 2;
+	}
+	cp->linelen = 1 << cp->log2linelen;
+	cp->setsways = setsways;
+	cp->setsh = cp->log2linelen;
+	cp->waysh = 32 - log2(cp->nways);
+}
+
+/*
+ * populate mc[1..] from the cache-level-id register (clidr): one entry
+ * per implemented cache level (seven 3-bit type fields).
+ */
+void
+allcacheinfo(Memcache *mc)
+{
+	int n;
+	ulong lvl;
+
+	lvl = cprdsc(CpIDcsize, CpID, CpIDidct, CpIDclvlid);
+	n = 1;
+	for (lvl &= MASK(21); lvl; lvl >>= 3) {
+		cacheinfo(n, &mc[n], Intcache, lvl & MASK(3));
+		n++;	/* was missing: without it every level overwrote mc[1] */
+	}
+//	cacheinfo(2, &mc[2], Extcache, Unified);		/* PL310 */
+}
+
+/* print the configuration of every cache level found in cachel[] */
+void
+prcachecfg(void)
+{
+	int cache;
+	Memcache *mc;
+
+	for (cache = 1; cache < 8 && cachel[cache].type; cache++) {
+		mc = &cachel[cache];
+		iprint("l%d: %s %-10s %2d ways %4d sets %d bytes/line; can W[",
+			mc->level, mc->external? "ext": "int", catype[mc->type],
+			mc->nways, mc->nsets, mc->linelen);
+		if (mc->linelen != CACHELINESZ)
+			iprint(" *should* be %d", CACHELINESZ);
+		if (mc->setsways & Cawt)
+			iprint("T");
+		if (mc->setsways & Cawb)
+			iprint("B");
+		if (mc->setsways & Cawa)
+			iprint("A");
+		iprint("]");
+		if (cache == 1)
+			iprint("; l1-i %s", l1iptype((mc->l1ip >> 14) & MASK(2)));
+		iprint("\n");
+	}
+}

+ 198 - 0
sys/src/9/teg2/caches.c

@@ -0,0 +1,198 @@
+/*
+ * operations on all memory data or unified caches, a no-op cache,
+ * and an l1-only cache ops cache.
+ * i-caches are not handled here.
+ *
+ * there are only three cache operations that we care about:
+ * force cache contents to memory (before dma out or shutdown),
+ * ignore cache contents in favour of memory (initialisation, after dma in),
+ * both (update page tables and force cpu to read new contents).
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/error.h"
+
+static Cacheimpl allcaches, nullcaches, l1caches;
+
+/* describe the composite (l1+l2) cache presented as `allcache' */
+void
+cachesinfo(Memcache *cp)
+{
+	memset(cp, 0, sizeof *cp);
+	cp->setsways = Cara | Cawa | Cawt | Cawb;
+	cp->l1ip = 3<<14;				/* PIPT */
+	cp->log2linelen = log2(CACHELINESZ);
+}
+
+/* install the composite, null and l1-only Cacheimpls; configure the pl310 */
+void
+allcacheson(void)
+{
+	l2pl310init();
+	allcache = &allcaches;
+	nocache = &nullcaches;
+	l1cache = &l1caches;
+}
+
+/* turn off the l2 only; the l1 stays on (locks depend on it) */
+void
+cachesoff(void)
+{
+	l2cache->off();
+}
+
+/* invalidate range: outer (l2) first so l1 can't refill from stale l2 lines */
+void
+cachesinvse(void *va, int bytes)
+{
+	int s;
+
+	s = splhi();
+	l2cache->invse(va, bytes);
+	cachedinvse(va, bytes);
+	splx(s);
+}
+
+/* write back range: inner (l1) first so dirty l1 data reaches l2 */
+void
+cacheswbse(void *va, int bytes)
+{
+	int s;
+
+	s = splhi();
+	cachedwbse(va, bytes);
+	l2cache->wbse(va, bytes);
+	splx(s);
+}
+
+/* wb+inv range: l1 wb, then l2 wbinv, then l1 wbinv */
+void
+cacheswbinvse(void *va, int bytes)
+{
+	int s;
+
+	s = splhi();
+	cachedwbse(va, bytes);
+	l2cache->wbinvse(va, bytes);
+	cachedwbinvse(va, bytes);
+	splx(s);
+}
+
+
+/* invalidate both caches entirely, outer first */
+void
+cachesinv(void)
+{
+	int s;
+
+	s = splhi();
+	l2cache->inv();
+	cachedinv();
+	splx(s);
+}
+
+/* write back both caches entirely, inner first */
+void
+cacheswb(void)
+{
+	int s;
+
+	s = splhi();
+	cachedwb();
+	l2cache->wb();
+	splx(s);
+}
+
+/* wb+inv both caches: l1 wb, l2 wbinv, l1 wbinv */
+void
+cacheswbinv(void)
+{
+	int s;
+
+	s = splhi();
+	cachedwb();
+	l2cache->wbinv();
+	cachedwbinv();
+	splx(s);
+}
+
+/* combined l1+l2 operations */
+static Cacheimpl allcaches = {
+	.info	= cachesinfo,
+	.on	= allcacheson,
+	.off	= cachesoff,
+
+	.inv	= cachesinv,
+	.wb	= cacheswb,
+	.wbinv	= cacheswbinv,
+
+	.invse	= cachesinvse,
+	.wbse	= cacheswbse,
+	.wbinvse= cacheswbinvse,
+};
+
+
+/*
+ * null cache ops
+ */
+
+/* describe the (absent) cache: zeroed, minimal line length */
+void
+nullinfo(Memcache *cp)
+{
+	memset(cp, 0, sizeof *cp);
+	cp->log2linelen = 2;
+}
+
+/* install the no-op implementation as `nocache' */
+void
+nullon(void)
+{
+	nocache = &nullcaches;
+}
+
+/* whole-cache no-op */
+void
+nullop(void)
+{
+}
+
+/* ranged no-op */
+void
+nullse(void *, int)
+{
+}
+
+static Cacheimpl nullcaches = {
+	.info	= nullinfo,
+	.on	= nullon,
+	.off	= nullop,
+
+	.inv	= nullop,
+	.wb	= nullop,
+	.wbinv	= nullop,
+
+	.invse	= nullse,
+	.wbse	= nullse,
+	.wbinvse= nullse,
+};
+
+/*
+ * l1-only ops
+ */
+
+/* no geometry reported for the l1-only view */
+void
+l1cachesinfo(Memcache *)
+{
+}
+
+/* install the l1-only implementation as `l1cache' */
+void
+l1cacheson(void)
+{
+	l1cache = &l1caches;
+}
+
+static Cacheimpl l1caches = {
+	.info	= l1cachesinfo,
+	.on	= l1cacheson,
+	.off	= nullop,
+
+	.inv	= cachedinv,
+	.wb	= cachedwb,
+	.wbinv	= cachedwbinv,
+
+	.invse	= cachedinvse,
+	.wbse	= cachedwbse,
+	.wbinvse= cachedwbinvse,
+};

+ 138 - 0
sys/src/9/teg2/clock-tegra.c

@@ -0,0 +1,138 @@
+/*
+ * tegra 2 SoC clocks; excludes cortex-a timers.
+ *
+ * SoC provides these shared clocks:
+ * 4 29-bit count-down `timers' @ 1MHz,
+ * 1 32-bit count-up time-stamp counter @ 1MHz,
+ * and a real-time clock @ 32KHz.
+ * the tegra watchdog (tegra 2 ref man §5.4.1) is tied to timers, not rtc.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "arm.h"
+
+typedef struct Shrdtmr Shrdtmr;
+typedef struct µscnt µscnt;
+
+/* tegra2 shared-intr timer registers */
+struct Shrdtmr {		/* 29-bit count-down timer (4); unused */
+	ulong	trigger;	/* Enable | Periodintr | count (see enum below) */
+	ulong	prescnt;	/* write Intrclr here to ack an intr */
+};
+
+enum {
+	/* trigger bits */
+	Enable =	1u<<31,
+	Periodintr =	1<<30,
+	Countmask =	MASK(29),
+
+	/* prescnt bits */
+	Intrclr =	1<<30,
+	/* Countmask is ro */
+};
+
+struct µscnt {		/* tegra2 shared 32-bit count-up µs counter (1) */
+	ulong	cntr;
+	/*
+	 * oscillator clock fraction - 1; initially 0xb (11) from u-boot
+	 * for 12MHz periphclk.
+	 */
+	ulong	cfg;
+	uchar	_pad0[0x3c - 0x8];
+	ulong	freeze;		/* at offset 0x3c */
+};
+
+enum {
+	/* cfg bits */
+	Dividendshift =	8,
+	Dividendmask =	MASK(8),
+	Divisorshift =	0,
+	Divisormask =	MASK(8),
+};
+
+/*
+ * called on each clock tick (from clockintr/clockprod in clock.c);
+ * the read of trigger is apparently what placates the shared
+ * tegra watchdog (cf. the trigger read in tegwdogintr).
+ */
+void
+tegclockintr(void)
+{
+	int junk;
+	Shrdtmr *tmr;
+
+	/* appease the tegra dog */
+	tmr = (Shrdtmr *)soc.tmr[0];
+	junk = tmr->trigger;
+	USED(junk);
+}
+
+/*
+ * if on cpu0, shutdown the shared tegra2 watchdog timer.
+ * only cpu0 does this since the timer is shared by all cpus.
+ */
+void
+tegclockshutdown(void)
+{
+	Shrdtmr *tmr;
+
+	if (m->machno == 0) {
+		tmr = (Shrdtmr *)soc.tmr[0];
+		/* zeroing trigger clears Enable, stopping the timer */
+		tmr->prescnt = tmr->trigger = 0;
+		coherence();
+	}
+}
+
+/*
+ * shared watchdog timer interrupt handler (registered by
+ * tegclock0init; v is the Shrdtmr).  ack by setting Intrclr in
+ * prescnt, then read trigger as the documentation demands.
+ */
+void
+tegwdogintr(Ureg *, void *v)
+{
+	int junk;
+	Shrdtmr *tmr;
+
+	tmr = (Shrdtmr *)v;
+	tmr->prescnt |= Intrclr;
+	coherence();
+	/* the lousy documentation says we also have to read trigger */
+	junk = tmr->trigger;
+	USED(junk);
+}
+
+/*
+ * start tegra2 shared watch dog: enable timer 0 as a periodic
+ * interrupt source with tegwdogintr as its handler.
+ */
+void
+tegclock0init(void)
+{
+	Shrdtmr *tmr;
+
+	tmr = (Shrdtmr *)soc.tmr[0];
+	irqenable(Tn0irq, tegwdogintr, tmr, "tegra watchdog");
+
+	/*
+	 * tegra watchdog only fires on the second missed interrupt, thus /2.
+	 */
+	tmr->trigger = (Dogsectimeout * Mhz / 2 - 1) | Periodintr | Enable;
+	coherence();
+}
+
+/*
+ * µscnt is a freerunning timer (cycle counter); it needs no
+ * initialisation, wraps and does not dispatch interrupts.
+ * just sanity-check it: cfg must still hold u-boot's value and
+ * the counter must advance across a 1ms delay.
+ */
+void
+tegclockinit(void)
+{
+	ulong old;
+	µscnt *µs = (µscnt *)soc.µs;
+
+	/* verify µs counter sanity */
+	assert(µs->cfg == 0xb);			/* set by u-boot */
+	old = µs->cntr;
+	delay(1);
+	assert(old != µs->cntr);		/* it must be counting */
+}
+
+ulong
+perfticks(void)			/* MHz rate, assumed by timing loops */
+{
+	ulong t;
+
+	t = ((µscnt *)soc.µs)->cntr;
+	/* never return 0, so m->fastclock can't go to zero */
+	if (t == 0)
+		return 1;
+	return t;
+}

+ 623 - 0
sys/src/9/teg2/clock.c

@@ -0,0 +1,623 @@
+/*
+ * cortex-a clocks; excludes tegra 2 SoC clocks
+ *
+ * cortex-a processors include private `global' and local timers
+ * at soc.scu + 0x200 (global) and + 0x600 (local).
+ * the global timer is a single count-up timer shared by all cores
+ * but with per-cpu comparator and auto-increment registers.
+ * a local count-down timer can be used as a watchdog.
+ *
+ * v7 arch provides a 32-bit count-up cycle counter (at about 1GHz in our case)
+ * but it's unsuitable as our source of fastticks, because it stops advancing
+ * when the cpu is suspended by WFI.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "arm.h"
+
+enum {
+	Debug		= 0,
+
+	Basetickfreq	= Mhz,			/* soc.µs rate in Hz */
+	/* the local timers seem to run at half the expected rate */
+	Clockfreqbase	= 250*Mhz / 2,	/* private timer rate (PERIPHCLK/2) */
+	Tcycles		= Clockfreqbase / HZ,	/* cycles per clock tick */
+
+	/* NOTE(review): presumably bounds on programmable tick periods */
+	MinPeriod	= Tcycles / 100,
+	MaxPeriod	= Tcycles,
+
+	Dogtimeout	= Dogsectimeout * Clockfreqbase,	/* in timer cycles */
+};
+
+typedef struct Ltimer Ltimer;
+typedef struct Pglbtmr Pglbtmr;
+typedef struct Ploctmr Ploctmr;
+
+/*
+ * cortex-a private-intr local timer registers.  all cpus see their
+ * own local timers at the same base address.
+ */
+struct Ltimer {
+	ulong	load;		/* new value + 1 */
+	ulong	cnt;		/* counts down */
+	ulong	ctl;
+	ulong	isr;
+
+	/* watchdog only */
+	ulong	wdrst;
+	ulong	wddis;		/* wo */
+
+	ulong	_pad0[2];
+};
+/* a cpu's private timer pair: general-purpose timer and watchdog */
+struct Ploctmr {
+	Ltimer	loc;
+	Ltimer	wd;
+};
+
+enum {
+	/* ctl bits */
+	Tmrena	= 1<<0,		/* timer enabled */
+	Wdogena = Tmrena,	/* watchdog enabled */
+	Xreload	= 1<<1,		/* reload on intr; periodic interrupts */
+	Tintena	= 1<<2,		/* enable irq 29 at cnt==0 (30 for watchdog) */
+	Wdog	= 1<<3,		/* watchdog, not timer, mode */
+	Xsclrshift = 8,
+	Xsclrmask = MASK(8),
+
+	/* isr bits */
+	Xisrclk	= 1<<0,		/* write to clear */
+
+	/* wdrst bits */
+	Wdrst	= 1<<0,
+
+	/* wddis values */
+	Wdon	= 1,
+	Wdoff1	= 0x12345678,	/* send these two to switch to timer mode */
+	Wdoff2	= 0x87654321,
+};
+
+/* cortex-a private-intr global timer registers */
+struct Pglbtmr {
+	ulong	cnt[2];		/* counts up; little-endian uvlong */
+	ulong	ctl;
+	ulong	isr;
+	ulong	cmp[2];		/* little-endian uvlong */
+	ulong	inc;
+};
+
+enum {
+	/* unique ctl bits (otherwise see X* above) */
+	Gcmp	= 1<<1,
+//	Gtintena= 1<<2,		/* enable irq 27 */
+	Gincr	= 1<<3,
+};
+
+/*
+ * until 5[cal] inline vlong ops, avoid them where possible,
+ * they are currently slow function calls.
+ */
+typedef union Counter Counter;
+union Counter {
+	uvlong	uvl;
+	struct {			/* little-endian */
+		ulong	low;
+		ulong	high;
+	};
+};
+
+static int fired;
+/* ticking[cpu] is set once that cpu's ticks reach HZ (mpclocksanity) */
+static int ticking[MAXMACH];
+
+/*
+ * (re)start this cpu's private local timer tn to interrupt every
+ * `ticks' cycles, reloading automatically (Xreload).
+ * no lock is needed to update our local timer.  splhi keeps it tight.
+ */
+static void
+setltimer(Ltimer *tn, ulong ticks)
+{
+	int s;
+
+	assert(ticks <= Clockfreqbase);
+	s = splhi();
+	tn->load = ticks - 1;		/* hw counts load+1 cycles */
+	coherence();			/* load must land before ctl enables */
+	tn->ctl = Tmrena | Tintena | Xreload;
+	coherence();
+	splx(s);
+}
+
+/*
+ * called from mpclocksanity: if another cpu's tick count differs from
+ * ours by more than HZ and that cpu was never seen ticking (it never
+ * reached its first HZ ticks), its clock isn't interrupting.
+ */
+static void
+ckstuck(int cpu, long myticks, long histicks)
+{
+	if (labs(histicks - myticks) <= HZ)
+		return;
+	/* badly out of step; fatal only if it has never ticked */
+	if (!ticking[cpu])
+		panic("cpu%d: clock not interrupting", cpu);
+}
+
+/*
+ * cross-check the other cpus' tick counts against ours.  each cpu
+ * marks itself in ticking[] on reaching HZ ticks; once we pass
+ * 5*HZ ticks, a lagging cpu that never started ticking makes
+ * ckstuck panic.
+ */
+static void
+mpclocksanity(void)
+{
+	int cpu, mycpu;
+	long myticks, histicks;
+
+	/* pointless with one cpu, while exiting, or before cpus are counted */
+	if (conf.nmach <= 1 || active.exiting || navailcpus == 0)
+		return;
+
+	mycpu = m->machno;
+	myticks = m->ticks;
+	if (myticks == HZ)
+		ticking[mycpu] = 1;	/* we have demonstrably ticked */
+
+	if (myticks < 5*HZ)		/* grace period after boot */
+		return;
+
+	for (cpu = 0; cpu < navailcpus; cpu++) {
+		if (cpu == mycpu)
+			continue;
+		histicks = MACHP(cpu)->ticks;
+		/* full sweep once at exactly 5*HZ; after that, started cpus only */
+		if (myticks == 5*HZ || histicks > 1)
+			ckstuck(cpu, myticks, histicks);
+	}
+}
+
+/*
+ * local-timer interrupt handler.  arg is this cpu's Ploctmr.
+ * ack the intr, run the generic timerintr, placate the tegra
+ * watchdog and cross-check the other cpus' clocks.
+ */
+static void
+clockintr(Ureg* ureg, void *arg)
+{
+	Ltimer *wd, *tn;
+	Ploctmr *lt;
+
+	lt = (Ploctmr *)arg;
+	tn = &lt->loc;
+	tn->isr = Xisrclk;		/* ack: write to clear */
+	coherence();
+
+	timerintr(ureg, 0);
+
+#ifdef watchdog_not_bloody_useless
+	/* appease the dogs */
+	wd = &lt->wd;
+	if (wd->cnt == 0 &&
+	    (wd->ctl & (Wdog | Wdogena | Tintena)) == (Wdog | Wdogena))
+		panic("cpu%d: zero watchdog count but no system reset",
+			m->machno);
+	wd->load = Dogtimeout - 1;
+	coherence();
+#endif
+	/* wd is only used under the ifdef above; placate the compiler */
+	SET(wd); USED(wd);
+	tegclockintr();
+
+	mpclocksanity();
+}
+
+/*
+ * simulate a clock tick by hand: run the generic timer code, placate
+ * the tegra watchdog, and on secondary cpus restart the local timer
+ * ("cpu1 gets stuck").  NOTE(review): callers not visible here;
+ * presumably invoked when ticks appear to have stopped.
+ */
+void
+clockprod(Ureg *ureg)
+{
+	Ltimer *tn;
+
+	timerintr(ureg, 0);
+	tegclockintr();
+	if (m->machno != 0) {		/* cpu1 gets stuck */
+		tn = &((Ploctmr *)soc.loctmr)->loc;
+		setltimer(tn, Tcycles);
+	}
+}
+
+/*
+ * verify that the timer's registers are addressable (panic if not)
+ * and disable it by clearing ctl.
+ */
+static void
+clockreset(Ltimer *tn)
+{
+	if (probeaddr((uintptr)tn) < 0)
+		panic("no clock at %#p", tn);
+	tn->ctl = 0;
+	coherence();
+}
+
+/*
+ * disable a private watchdog: clear Wdogena, then write the magic
+ * Wdoff1, Wdoff2 sequence to wddis to put it back into timer mode.
+ * the coherence() calls keep the three writes in order.
+ */
+void
+watchdogoff(Ltimer *wd)
+{
+	wd->ctl &= ~Wdogena;
+	coherence();
+	wd->wddis = Wdoff1;
+	coherence();
+	wd->wddis = Wdoff2;
+	coherence();
+}
+
+/* clear any pending watchdog intrs or causes */
+void
+wdogclrintr(Ltimer *wd)
+{
+#ifdef watchdog_not_bloody_useless
+	wd->isr = Xisrclk;