Browse Source

import gpl nix into /sys/src/9

David du Colombier 9 years ago
parent
commit
f4a1a0881a
100 changed files with 42119 additions and 0 deletions
  1. 87 0
      sys/src/9/386/aoe.h
  2. 554 0
      sys/src/9/386/devether.c
  3. 469 0
      sys/src/9/386/devrtc.c
  4. 1215 0
      sys/src/9/386/ether8169.c
  5. 1342 0
      sys/src/9/386/ether82557.c
  6. 1745 0
      sys/src/9/386/ether82563.c
  7. 2003 0
      sys/src/9/386/etherigbe.c
  8. 1646 0
      sys/src/9/386/etherm10g.c
  9. 648 0
      sys/src/9/386/kbd.c
  10. 767 0
      sys/src/9/386/pci.c
  11. 146 0
      sys/src/9/386/random.c
  12. 794 0
      sys/src/9/386/uarti8250.c
  13. 189 0
      sys/src/9/386/uartpci.c
  14. 22 0
      sys/src/9/bench/1/kern
  15. 20 0
      sys/src/9/bench/1/output
  16. 26 0
      sys/src/9/bench/1/runbench
  17. 15 0
      sys/src/9/bench/Benchs
  18. 29 0
      sys/src/9/bench/Locks
  19. 5 0
      sys/src/9/bench/Mean
  20. 85 0
      sys/src/9/bench/README
  21. 23 0
      sys/src/9/bench/Time
  22. 42 0
      sys/src/9/bench/runbenchs
  23. 37 0
      sys/src/9/bench/tools
  24. 193 0
      sys/src/9/boot/aux.c
  25. 356 0
      sys/src/9/boot/boot.c
  26. 81 0
      sys/src/9/boot/boot.h
  27. 82 0
      sys/src/9/boot/bootauth.c
  28. 89 0
      sys/src/9/boot/bootcache.c
  29. 213 0
      sys/src/9/boot/bootip.c
  30. 135 0
      sys/src/9/boot/doauthenticate.c
  31. 83 0
      sys/src/9/boot/embed.c
  32. 52 0
      sys/src/9/boot/getpasswd.c
  33. 284 0
      sys/src/9/boot/local.c
  34. 61 0
      sys/src/9/boot/nopsession.c
  35. 76 0
      sys/src/9/boot/paq.c
  36. 31 0
      sys/src/9/boot/printstub.c
  37. 59 0
      sys/src/9/boot/sac.c
  38. 158 0
      sys/src/9/boot/settime.c
  39. 689 0
      sys/src/9/ip/arp.c
  40. 133 0
      sys/src/9/ip/chandial.c
  41. 1425 0
      sys/src/9/ip/devip.c
  42. 794 0
      sys/src/9/ip/ethermedium.c
  43. 290 0
      sys/src/9/ip/gre.c
  44. 501 0
      sys/src/9/ip/icmp.c
  45. 908 0
      sys/src/9/ip/icmp6.c
  46. 50 0
      sys/src/9/ip/inferno.c
  47. 814 0
      sys/src/9/ip/ip.c
  48. 654 0
      sys/src/9/ip/ip.h
  49. 377 0
      sys/src/9/ip/ipaux.c
  50. 1663 0
      sys/src/9/ip/ipifc.c
  51. 861 0
      sys/src/9/ip/iproute.c
  52. 738 0
      sys/src/9/ip/ipv6.c
  53. 194 0
      sys/src/9/ip/ipv6.h
  54. 129 0
      sys/src/9/ip/loopbackmedium.c
  55. 162 0
      sys/src/9/ip/netdevmedium.c
  56. 272 0
      sys/src/9/ip/netlog.c
  57. 48 0
      sys/src/9/ip/nullmedium.c
  58. 88 0
      sys/src/9/ip/pktmedium.c
  59. 81 0
      sys/src/9/ip/ptclbsum.c
  60. 3264 0
      sys/src/9/ip/tcp.c
  61. 660 0
      sys/src/9/ip/udp.c
  62. 2 0
      sys/src/9/k10/Linux
  63. 332 0
      sys/src/9/k10/acore.c
  64. 391 0
      sys/src/9/k10/acore.c.old
  65. 419 0
      sys/src/9/k10/acpi.h
  66. 205 0
      sys/src/9/k10/amd64.h
  67. 416 0
      sys/src/9/k10/apic.c
  68. 101 0
      sys/src/9/k10/apic.h
  69. 115 0
      sys/src/9/k10/arch.c
  70. 380 0
      sys/src/9/k10/archk10.c
  71. 256 0
      sys/src/9/k10/archk8.c
  72. 438 0
      sys/src/9/k10/asm.c
  73. 5 0
      sys/src/9/k10/boot.fs
  74. 172 0
      sys/src/9/k10/cga.c
  75. 25 0
      sys/src/9/k10/cpuidamd64.s
  76. 138 0
      sys/src/9/k10/crap.c
  77. 430 0
      sys/src/9/k10/dat.h
  78. 1721 0
      sys/src/9/k10/devacpi.c
  79. 632 0
      sys/src/9/k10/devarch.c
  80. 2160 0
      sys/src/9/k10/ether82563.c
  81. 895 0
      sys/src/9/k10/etherbcm.c
  82. 60 0
      sys/src/9/k10/etherif.h
  83. 261 0
      sys/src/9/k10/fns.h
  84. 549 0
      sys/src/9/k10/fpu.c
  85. 475 0
      sys/src/9/k10/fpu.c.old
  86. 176 0
      sys/src/9/k10/i8254.c
  87. 238 0
      sys/src/9/k10/i8259.c
  88. 16 0
      sys/src/9/k10/init9.c
  89. 275 0
      sys/src/9/k10/io.h
  90. 490 0
      sys/src/9/k10/ioapic.c
  91. 24 0
      sys/src/9/k10/iob.h
  92. 198 0
      sys/src/9/k10/k8cpu
  93. 193 0
      sys/src/9/k10/k8cpufs
  94. 202 0
      sys/src/9/k10/k8cpukexec
  95. 235 0
      sys/src/9/k10/l32p.s
  96. 341 0
      sys/src/9/k10/l64acidt.s
  97. 80 0
      sys/src/9/k10/l64acsyscall.s
  98. 26 0
      sys/src/9/k10/l64cpuid.s
  99. 46 0
      sys/src/9/k10/l64fpu.s
  100. 344 0
      sys/src/9/k10/l64idt.s

+ 87 - 0
sys/src/9/386/aoe.h

@@ -0,0 +1,87 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * ATA-over-Ethernet (AoE) protocol
+ */
+/*
+ * NOTE(review): the per-value meanings below follow the public AoE
+ * specification; nothing in this header uses the values, so confirm
+ * against the code that includes it.
+ */
+enum {
+	ACata,		/* 0: presumably Aoehdr.cmd value for an ATA command */
+	ACconfig,	/* 1: presumably Aoehdr.cmd value for query config */
+};
+
+/* presumably config-string sub-commands carried in Aoeqc.verccmd -- confirm */
+enum {
+	AQCread,
+	AQCtest,
+	AQCprefix,
+	AQCset,
+	AQCfset,
+};
+
+/* presumably values for Aoehdr.error; numbering starts at 1 */
+enum {
+	AEcmd	= 1,
+	AEarg,
+	AEdev,
+	AEcfg,
+	AEver,
+};
+
+enum {
+	Aoetype	= 0x88a2,		/* presumably the Ethernet type used for AoE frames */
+	Aoesectsz = 512,			/* standard sector size */
+	Aoever	= 1,
+
+	/* presumably flag bits in Aoehdr.verflag */
+	AFerr	= 1<<2,
+	AFrsp	= 1<<3,
+
+	/* presumably flag bits in Aoeata.aflag */
+	AAFwrite= 1,
+	AAFext	= 1<<6,
+};
+
+/*
+ * Common AoE frame header: Ethernet destination, source and type,
+ * followed by the AoE fields.  Multi-byte fields are declared as byte
+ * arrays, so the struct itself imposes no byte order or padding.
+ * payload is a flexible array member marking where command-specific
+ * data starts.
+ */
+typedef struct {
+	uchar	dst[Eaddrlen];
+	uchar	src[Eaddrlen];
+	uchar	type[2];
+	uchar	verflag;
+	uchar	error;
+	uchar	major[2];
+	uchar	minor;
+	uchar	cmd;
+	uchar	tag[4];
+	uchar	payload[];
+} Aoehdr;
+
+/* size of the fixed part of Aoehdr, i.e. the offset of payload */
+#define AOEHDRSZ	offsetof(Aoehdr, payload[0])
+
+/*
+ * ATA command frame: the common Aoehdr (embedded as an unnamed member)
+ * followed by the ATA-specific fields and a flexible payload.
+ */
+typedef struct {
+	Aoehdr;
+	uchar	aflag;
+	uchar	errfeat;
+	uchar	scnt;
+	uchar	cmdstat;
+	uchar	lba[6];
+	uchar	res[2];
+	uchar	payload[];
+} Aoeata;
+
+/* size of the fixed part of Aoeata, i.e. the offset of its payload */
+#define AOEATASZ	offsetof(Aoeata, payload[0])
+
+/*
+ * Query-config frame: the common Aoehdr (embedded as an unnamed member)
+ * followed by the config fields and a flexible payload.
+ */
+typedef struct {
+	Aoehdr;
+	uchar	bufcnt[2];
+	uchar	fwver[2];
+	uchar	scnt;
+	uchar	verccmd;
+	uchar	cslen[2];
+	uchar	payload[];
+} Aoeqc;
+
+/* size of the fixed part of Aoeqc, i.e. the offset of its payload */
+#define AOEQCSZ		offsetof(Aoeqc, payload[0])
+
+extern char Echange[];
+extern char Enotup[];

+ 554 - 0
sys/src/9/386/devether.c

@@ -0,0 +1,554 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+
+#include "etherif.h"
+
+static Ether *etherxx[MaxEther];
+
+/*
+ * Attach to the ethernet controller selected by spec, a controller
+ * number (a nil or empty spec selects controller 0).
+ * Raises Ebadarg if spec is not a complete number below MaxEther and
+ * Enodev if no controller is registered in that slot.
+ * Returns a new channel whose devno is the controller number, after
+ * calling the driver's attach hook when it has one.
+ */
+Chan*
+etherattach(char* spec)
+{
+	ulong ctlrno;
+	char *p;
+	Chan *chan;
+
+	ctlrno = 0;
+	if(spec && *spec){
+		ctlrno = strtoul(spec, &p, 0);
+		/* reject: no digits consumed, trailing junk, or out of range */
+		if((ctlrno == 0 && p == spec) || *p || (ctlrno >= MaxEther))
+			error(Ebadarg);
+	}
+	if(etherxx[ctlrno] == 0)
+		error(Enodev);
+
+	chan = devattach('l', spec);
+	/* release the channel if the driver's attach hook raises an error */
+	if(waserror()){
+		chanfree(chan);
+		nexterror();
+	}
+	chan->devno = ctlrno;
+	if(etherxx[ctlrno]->attach)
+		etherxx[ctlrno]->attach(etherxx[ctlrno]);
+	poperror();
+	return chan;
+}
+
+/* walk: hand off to netifwalk for the controller this channel is attached to */
+static Walkqid*
+etherwalk(Chan* chan, Chan* nchan, char** name, int nname)
+{
+	Ether *ether;
+
+	ether = etherxx[chan->devno];
+	return netifwalk(ether, chan, nchan, name, nname);
+}
+
+/* stat: hand off to netifstat for the controller this channel is attached to */
+static long
+etherstat(Chan* chan, uchar* dp, long n)
+{
+	Ether *ether;
+
+	ether = etherxx[chan->devno];
+	return netifstat(ether, chan, dp, n);
+}
+
+/* open: hand off to netifopen for the controller this channel is attached to */
+static Chan*
+etheropen(Chan* chan, int omode)
+{
+	Ether *ether;
+
+	ether = etherxx[chan->devno];
+	return netifopen(ether, chan, omode);
+}
+
+/* create: deliberately does nothing; all arguments are ignored */
+static void
+ethercreate(Chan*, char*, int, int)
+{
+}
+
+/* close: hand off to netifclose for the controller this channel is attached to */
+static void
+etherclose(Chan* chan)
+{
+	Ether *ether;
+
+	ether = etherxx[chan->devno];
+	netifclose(ether, chan);
+}
+
+/*
+ * read: for non-directory files on a controller with an ifstat hook,
+ * reads of the ifstats file are answered by the driver itself; for
+ * the stats file the hook is called with a zero length first
+ * (presumably so the driver can refresh its counters -- confirm)
+ * and the read then falls through to netifread like everything else.
+ * The vlong offset is truncated to ulong before use.
+ */
+static long
+etherread(Chan* chan, void* buf, long n, vlong off)
+{
+	Ether *ether;
+	ulong offset = off;
+
+	ether = etherxx[chan->devno];
+	if((chan->qid.type & QTDIR) == 0 && ether->ifstat){
+		/*
+		 * With some controllers it is necessary to reach
+		 * into the chip to extract statistics.
+		 */
+		if(NETTYPE(chan->qid.path) == Nifstatqid)
+			return ether->ifstat(ether, buf, n, offset);
+		else if(NETTYPE(chan->qid.path) == Nstatqid)
+			ether->ifstat(ether, buf, 0, offset);
+	}
+
+	return netifread(ether, chan, buf, n, offset);
+}
+
+/* bread: hand off to netifbread for the controller this channel is attached to */
+static Block*
+etherbread(Chan* chan, long n, vlong offset)
+{
+	Ether *ether;
+
+	ether = etherxx[chan->devno];
+	return netifbread(ether, chan, n, offset);
+}
+
+/* wstat: hand off to netifwstat for the controller this channel is attached to */
+static long
+etherwstat(Chan* chan, uchar* dp, long n)
+{
+	Ether *ether;
+
+	ether = etherxx[chan->devno];
+	return netifwstat(ether, chan, dp, n);
+}
+
+/*
+ * Queue a 64-byte trace record on f's input queue (used by etheriq
+ * for connections marked headersonly):
+ *	bytes 0-57	start of the packet, zero-padded if it is shorter
+ *	bytes 58-59	packet length, most significant byte first
+ *	bytes 60-63	TK2MS(sys->ticks), most significant byte first
+ * Nothing is queued when qwindow() reports no room or no block can be
+ * allocated.
+ */
+static void
+etherrtrace(Netfile* f, Etherpkt* pkt, int len)
+{
+	int i, n;
+	Block *bp;
+
+	if(qwindow(f->iq) <= 0)
+		return;
+	if(len > 58)
+		n = 58;
+	else
+		n = len;
+	bp = iallocb(64);
+	if(bp == nil)
+		return;
+	memmove(bp->wp, pkt->d, n);
+	/* short packet: don't pass stale block contents on to the reader */
+	if(n < 58)
+		memset(bp->wp+n, 0, 58-n);
+	i = TK2MS(sys->ticks);
+	bp->wp[58] = len>>8;
+	bp->wp[59] = len;
+	bp->wp[60] = i>>24;
+	bp->wp[61] = i>>16;
+	bp->wp[62] = i>>8;
+	bp->wp[63] = i;
+	bp->wp += 64;
+	qpass(f->iq, bp);
+}
+
+/*
+ * Deliver an incoming packet to every connection that wants it.
+ *
+ * fromwire != 0: the caller gives up bp.  It is either passed to one
+ * connection's queue or freed, and 0 is returned.
+ * fromwire == 0: bp still belongs to the caller.  Connections get
+ * copies and bp itself is returned untouched.
+ *
+ * Copies that cannot be allocated or queued are counted in
+ * ether->soverflows.
+ */
+Block*
+etheriq(Ether* ether, Block* bp, int fromwire)
+{
+	Etherpkt *pkt;
+	ushort type;
+	int len, multi, tome, fromme;
+	Netfile **ep, *f, **fp, *fx;
+	Block *xbp;
+
+	ether->inpackets++;
+
+	pkt = (Etherpkt*)bp->rp;
+	len = BLEN(bp);
+	type = (pkt->type[0]<<8)|pkt->type[1];
+	fx = 0;		/* connection that will receive bp itself, if any */
+	ep = &ether->f[Ntypes];
+
+	multi = pkt->d[0] & 1;
+	/* check for valid multicast addresses */
+	if(multi && memcmp(pkt->d, ether->bcast, sizeof(pkt->d)) != 0 && ether->prom == 0){
+		if(!activemulti(ether, pkt->d, sizeof(pkt->d))){
+			if(fromwire){
+				freeb(bp);
+				bp = 0;
+			}
+			return bp;
+		}
+	}
+
+	/* is it for me? */
+	tome = memcmp(pkt->d, ether->ea, sizeof(pkt->d)) == 0;
+	fromme = memcmp(pkt->s, ether->ea, sizeof(pkt->s)) == 0;
+
+	/*
+	 * Multiplex the packet to all the connections which want it.
+	 * If the packet is not to be used subsequently (fromwire != 0),
+	 * attempt to simply pass it into one of the connections, thereby
+	 * saving a copy of the data (usual case hopefully).
+	 */
+	for(fp = ether->f; fp < ep; fp++){
+		/* slot in use, type matches (negative type matches all), and wanted */
+		if(f = *fp)
+		if(f->type == type || f->type < 0)
+		if(tome || multi || f->prom || f->bridge & 2){
+			/* Don't want to hear bridged packets */
+			if(f->bridge && !fromwire && !fromme)
+				continue;
+			if(!f->headersonly){
+				/* first taker of a wire packet gets bp itself, after the loop */
+				if(fromwire && fx == 0)
+					fx = f;
+				else if(xbp = iallocb(len)){
+					memmove(xbp->wp, pkt, len);
+					xbp->wp += len;
+					if(qpass(f->iq, xbp) < 0)
+						ether->soverflows++;
+				}
+				else
+					ether->soverflows++;
+			}
+			else
+				etherrtrace(f, pkt, len);
+		}
+	}
+
+	if(fx){
+		if(qpass(fx->iq, bp) < 0)
+			ether->soverflows++;
+		return 0;
+	}
+	if(fromwire){
+		freeb(bp);
+		return 0;
+	}
+
+	return bp;
+}
+
+/*
+ * Send one packet.  Takes ownership of bp and returns its length.
+ * Packets addressed to this interface are only looped back through
+ * etheriq and then freed; everything else is queued on ether->oq and
+ * the driver's transmit hook, if any, is called.
+ */
+static int
+etheroq(Ether* ether, Block* bp)
+{
+	int len, loopback, s;
+	Etherpkt *pkt;
+
+	ether->outpackets++;
+
+	/*
+	 * Check if the packet has to be placed back onto the input queue,
+	 * i.e. if it's a loopback or broadcast packet or the interface is
+	 * in promiscuous mode.
+	 * If it's a loopback packet indicate to etheriq that the data isn't
+	 * needed and return, etheriq will pass-on or free the block.
+	 * To enable bridging to work, only packets that were originated
+	 * by this interface are fed back.
+	 */
+	pkt = (Etherpkt*)bp->rp;
+	len = BLEN(bp);
+	loopback = memcmp(pkt->d, ether->ea, sizeof(pkt->d)) == 0;
+	if(loopback || memcmp(pkt->d, ether->bcast, sizeof(pkt->d)) == 0 || ether->prom){
+		/* fromwire is 0 here, so etheriq only copies and bp stays ours */
+		s = splhi();
+		etheriq(ether, bp, 0);
+		splx(s);
+	}
+
+	if(!loopback){
+		qbwrite(ether->oq, bp);
+		if(ether->transmit != nil)
+			ether->transmit(ether);
+	} else
+		freeb(bp);
+
+	return len;
+}
+
+/*
+ * write: on anything other than a data file this is a control write.
+ * netifwrite gets the first try; if it returns a negative value the
+ * "nonblocking [n]" command is handled here, and any other text goes
+ * to the driver's ctl hook or raises Ebadctl when there is none.
+ *
+ * On a data file the buffer is one packet: its length must lie within
+ * [minmtu, mtu] (Etoosmall / Etoobig otherwise).  Unless the
+ * connection has bridge bit 2 set, the source address is overwritten
+ * with this interface's address before the packet is sent.
+ */
+static long
+etherwrite(Chan* chan, void* buf, long n, vlong)
+{
+	Ether *ether;
+	Block *bp;
+	int nn, onoff;
+	Cmdbuf *cb;
+
+	ether = etherxx[chan->devno];
+	if(NETTYPE(chan->qid.path) != Ndataqid) {
+		nn = netifwrite(ether, chan, buf, n);
+		if(nn >= 0)
+			return nn;
+		cb = parsecmd(buf, n);
+		if(cb->f[0] && strcmp(cb->f[0], "nonblocking") == 0){
+			/* no argument means "on" */
+			if(cb->nf <= 1)
+				onoff = 1;
+			else
+				onoff = atoi(cb->f[1]);
+			qnoblock(ether->oq, onoff);
+			free(cb);
+			return n;
+		}
+		free(cb);
+		if(ether->ctl != nil)
+			return ether->ctl(ether, buf, n);
+
+		error(Ebadctl);
+	}
+
+	if(n > ether->mtu)
+		error(Etoobig);
+	if(n < ether->minmtu)
+		error(Etoosmall);
+
+	bp = allocb(n);
+	/* free the block if copying from the caller's buffer raises an error */
+	if(waserror()){
+		freeb(bp);
+		nexterror();
+	}
+	memmove(bp->rp, buf, n);
+	if((ether->f[NETID(chan->qid.path)]->bridge & 2) == 0)
+		memmove(bp->rp+Eaddrlen, ether->ea, Eaddrlen);
+	poperror();
+	bp->wp += n;
+
+	return etheroq(ether, bp);
+}
+
+/*
+ * bwrite: block form of etherwrite.  Always consumes bp.
+ * Control writes are forwarded to etherwrite and the block is freed
+ * whether or not that raises.  Data blocks are length-checked against
+ * [minmtu, mtu] and handed to etheroq as they are; unlike etherwrite,
+ * the source address is not rewritten here.
+ */
+static long
+etherbwrite(Chan* chan, Block* bp, vlong)
+{
+	Ether *ether;
+	long n;
+
+	n = BLEN(bp);
+	if(NETTYPE(chan->qid.path) != Ndataqid){
+		if(waserror()) {
+			freeb(bp);
+			nexterror();
+		}
+		n = etherwrite(chan, bp->rp, n, 0);
+		poperror();
+		freeb(bp);
+		return n;
+	}
+	ether = etherxx[chan->devno];
+
+	if(n > ether->mtu){
+		freeb(bp);
+		error(Etoobig);
+	}
+	if(n < ether->minmtu){
+		freeb(bp);
+		error(Etoosmall);
+	}
+
+	return etheroq(ether, bp);
+}
+
+/*
+ * Table of known card types and their reset (probe) routines.
+ * It has one more slot than addethercard will ever fill, so there is
+ * always an entry with a nil type to end the scans in etherprobe and
+ * etherreset.
+ */
+static struct {
+	char*	type;
+	int	(*reset)(Ether*);
+} cards[MaxEther+1];
+
+/*
+ * Register card type t with reset routine r in the next free slot.
+ * Panics once MaxEther types have been registered.
+ */
+void
+addethercard(char* t, int (*r)(Ether*))
+{
+	static int ncard;
+
+	if(ncard == MaxEther)
+		panic("too many ether cards");
+	cards[ncard].type = t;
+	cards[ncard].reset = r;
+	ncard++;
+}
+
+/*
+ * Convert a textual ethernet address in from (Eaddrlen pairs of hex
+ * digits, each optionally followed by ':') into bytes in to.
+ * Returns 0 on success, or -1 if the string ends before all Eaddrlen
+ * bytes have been read.  Digits are not validated: strtoul decides
+ * the value of each pair.
+ */
+int
+parseether(uchar *to, char *from)
+{
+	char hex[4];
+	char *s;
+	int n;
+
+	s = from;
+	n = 0;
+	while(n < Eaddrlen){
+		/* need two more characters for this byte */
+		if(s[0] == 0 || s[1] == 0)
+			return -1;
+		hex[0] = s[0];
+		hex[1] = s[1];
+		hex[2] = 0;
+		s += 2;
+		to[n++] = strtoul(hex, 0, 16);
+		if(*s == ':')
+			s++;
+	}
+	return 0;
+}
+
+/*
+ * Try to bring up controller number ctlrno.
+ *
+ * cardno < 0: look for an "ether" configuration entry for ctlrno via
+ * isaconfig, find the registered card type it names, and take the MAC
+ * address from an "ea=" option if one is given (an unparsable address
+ * is reset to zeros).
+ * cardno >= 0: probe with that entry of the cards table directly.
+ *
+ * On success the interrupt is enabled (unless irq < 0), a one-line
+ * description is printed, the netif and output queue are set up, and
+ * the new Ether is returned.  Returns nil if memory is short, nothing
+ * matches, or the card's reset routine fails.
+ */
+static Ether*
+etherprobe(int cardno, int ctlrno)
+{
+	int i, j;
+	Ether *ether;
+	char buf[128], *p, *e, name[32];
+
+	ether = malloc(sizeof(Ether));
+	if(ether == nil){
+		print("etherprobe: no memory for Ether\n");
+		return nil;
+	}
+	memset(ether, 0, sizeof(Ether));
+	ether->ctlrno = ctlrno;
+	ether->tbdf = BUSUNKNOWN;
+	ether->mbps = 10;
+	ether->minmtu = ETHERMINTU;
+	ether->mtu = ETHERMAXTU;
+	ether->maxmtu = ETHERMAXTU;
+
+	if(cardno < 0){
+		if(isaconfig("ether", ctlrno, ether) == 0){
+			free(ether);
+			return nil;
+		}
+		for(cardno = 0; cards[cardno].type; cardno++){
+			if(cistrcmp(cards[cardno].type, ether->type))
+				continue;
+			for(i = 0; i < ether->nopt; i++){
+				if(strncmp(ether->opt[i], "ea=", 3))
+					continue;
+				if(parseether(ether->ea, &ether->opt[i][3]))
+					memset(ether->ea, 0, Eaddrlen);
+			}
+			break;
+		}
+	}
+
+	if(cardno >= MaxEther || cards[cardno].type == nil){
+		free(ether);
+		return nil;
+	}
+	if(cards[cardno].reset(ether) < 0){
+		free(ether);
+		return nil;
+	}
+
+	/*
+	 * IRQ2 doesn't really exist, it's used to gang the interrupt
+	 * controllers together. A device set to IRQ2 will appear on
+	 * the second interrupt controller as IRQ9.
+	 */
+	if(ether->irq == 2)
+		ether->irq = 9;
+	snprint(name, sizeof(name), "ether%d", ctlrno);
+
+	/*
+	 * If ether->irq is <0, it is a hack to indicate no interrupt
+	 * used by ethersink.
+	 */
+	if(ether->irq >= 0)
+		intrenable(ether->irq, ether->interrupt, ether, ether->tbdf, name);
+
+	/* build the banner with bounded writes; a long line is truncated */
+	e = buf + sizeof(buf);
+	p = seprint(buf, e, "#l%d: %s: %dMbps port %#p irq %d tu %d",
+		ctlrno, cards[cardno].type, ether->mbps, ether->port, ether->irq, ether->mtu);
+	if(ether->mem)
+		p = seprint(p, e, " addr %#p", ether->mem);
+	if(ether->size)
+		p = seprint(p, e, " size 0x%luX", ether->size);
+	p = seprint(p, e, ": %2.2ux%2.2ux%2.2ux%2.2ux%2.2ux%2.2ux",
+		ether->ea[0], ether->ea[1], ether->ea[2],
+		ether->ea[3], ether->ea[4], ether->ea[5]);
+	seprint(p, e, "\n");
+	/* buf is data, not a format string */
+	print("%s", buf);
+
+	/*
+	 * Queue limit grows with link speed: 128K at 10Mbps, doubling for
+	 * each further factor of ten (speeds above 1000 count one extra).
+	 */
+	j = ether->mbps;
+	if(j > 1000)
+		j *= 10;
+	for(i = 0; j >= 100; i++)
+		j /= 10;
+	i = (128<<i)*1024;
+	netifinit(ether, name, Ntypes, i);
+	if(ether->oq == 0)
+		ether->oq = qopen(i, Qmsg, 0, 0);
+	if(ether->oq == 0)
+		panic("etherreset %s", name);
+	ether->alen = Eaddrlen;
+	memmove(ether->addr, ether->ea, Eaddrlen);
+	memset(ether->bcast, 0xFF, Eaddrlen);
+
+	return ether;
+}
+
+/*
+ * Device reset: fill in etherxx[].
+ * Pass 1 brings up controllers that have an explicit configuration
+ * entry (etherprobe with cardno -1), one per controller number.
+ * Pass 2, skipped when "*noetherprobe" is configured, fills the
+ * remaining controller numbers by probing each registered card type
+ * in turn; a type is probed repeatedly until its probe fails.
+ */
+static void
+etherreset(void)
+{
+	Ether *ether;
+	int cardno, ctlrno;
+
+	for(ctlrno = 0; ctlrno < MaxEther; ctlrno++){
+		if((ether = etherprobe(-1, ctlrno)) == nil)
+			continue;
+		etherxx[ctlrno] = ether;
+	}
+
+	if(getconf("*noetherprobe"))
+		return;
+
+	cardno = ctlrno = 0;
+	while(cards[cardno].type != nil && ctlrno < MaxEther){
+		/* slot already taken in pass 1 */
+		if(etherxx[ctlrno] != nil){
+			ctlrno++;
+			continue;
+		}
+		/* no (more) cards of this type; try the next type in the same slot */
+		if((ether = etherprobe(cardno, ctlrno)) == nil){
+			cardno++;
+			continue;
+		}
+		etherxx[ctlrno] = ether;
+		ctlrno++;
+	}
+}
+
+/*
+ * Device shutdown: call the shutdown hook of every configured
+ * controller, printing a notice for those that have none.
+ * The interrupt is not disabled here: the intrdisable call is
+ * commented out, so name is built but currently unused.
+ */
+static void
+ethershutdown(void)
+{
+	char name[32];
+	int i;
+	Ether *ether;
+
+	for(i = 0; i < MaxEther; i++){
+		ether = etherxx[i];
+		if(ether == nil)
+			continue;
+		if(ether->shutdown == nil) {
+			print("#l%d: no shutdown function\n", i);
+			continue;
+		}
+		snprint(name, sizeof(name), "ether%d", i);
+		if(ether->irq >= 0){
+		//	intrdisable(ether->irq, ether->interrupt, ether, ether->tbdf, name);
+		}
+		(*ether->shutdown)(ether);
+	}
+}
+
+
+#define POLY 0xedb88320
+
+/*
+ * Bit-at-a-time 32-bit CRC over the len bytes at p (slow, but needs
+ * no table).  Starts from all ones, consumes each byte least
+ * significant bit first, and returns the register as is: no final
+ * inversion is applied here.
+ */
+ulong
+ethercrc(uchar *p, int len)
+{
+	int n, bit;
+	ulong crc, b;
+
+	crc = 0xffffffff;
+	for(n = 0; n < len; n++){
+		b = p[n];
+		for(bit = 8; bit > 0; bit--){
+			if((crc ^ b) & 1)
+				crc = (crc>>1) ^ POLY;
+			else
+				crc >>= 1;
+			b >>= 1;
+		}
+	}
+	return crc;
+}
+
+/*
+ * Device table entry for the ethernet device.
+ * NOTE(review): the initializer is positional; the order of the hooks
+ * must match the declaration of Dev, which is not visible here.
+ */
+Dev etherdevtab = {
+	'l',		/* same character etherattach passes to devattach */
+	"ether",
+
+	etherreset,
+	devinit,
+	ethershutdown,
+	etherattach,
+	etherwalk,
+	etherstat,
+	etheropen,
+	ethercreate,
+	etherclose,
+	etherread,
+	etherbread,
+	etherwrite,
+	etherbwrite,
+	devremove,
+	etherwstat,
+};

+ 469 - 0
sys/src/9/386/devrtc.c

@@ -0,0 +1,469 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+/*
+ *  real time clock and non-volatile ram
+ */
+
+enum {
+	Paddr=		0x70,	/* address port */
+	Pdata=		0x71,	/* data port */
+
+	/* register numbers written to Paddr to select a clock register */
+	Seconds=	0x00,
+	Minutes=	0x02,
+	Hours=		0x04,
+	Mday=		0x07,
+	Month=		0x08,
+	Year=		0x09,
+	Status=		0x0A,	/* rtcextract treats bit 0x80 as "clock busy" */
+
+	Nvoff=		128,	/* where usable nvram lives */
+	Nvsize=		256,
+
+	Nbcd=		6,	/* number of clock registers read or written as a set */
+};
+
+/*
+ * Broken-down time as used by rtc2sec and sec2rtc.
+ * mon and mday count from 1; year is the full year (e.g. 1970).
+ */
+typedef struct Rtc	Rtc;
+struct Rtc
+{
+	int	sec;
+	int	min;
+	int	hour;
+	int	mday;
+	int	mon;
+	int	year;
+};
+
+
+/* qid paths of the files served by this device */
+enum{
+	Qdir = 0,
+	Qrtc,
+	Qnvram,
+};
+
+/* directory table: name, qid, length, permissions */
+Dirtab rtcdir[]={
+	".",	{Qdir, 0, QTDIR},	0,	0555,
+	"nvram",	{Qnvram, 0},	Nvsize,	0664,
+	"rtc",		{Qrtc, 0},	0,	0664,
+};
+
+static ulong rtc2sec(Rtc*);
+static void sec2rtc(ulong, Rtc*);
+
+/*
+ * Reserve the two I/O ports (address and data) used by the clock and
+ * nvram; panics if they cannot be allocated.
+ */
+void
+rtcinit(void)
+{
+	if(ioalloc(Paddr, 2, 0, "rtc/nvr") < 0)
+		panic("rtcinit: ioalloc failed");
+}
+
+/* attach: generic devattach under device character 'r' */
+static Chan*
+rtcattach(char* spec)
+{
+	return devattach('r', spec);
+}
+
+/* walk: generic devwalk over the static rtcdir table */
+static Walkqid*	 
+rtcwalk(Chan* c, Chan *nc, char** name, int nname)
+{
+	return devwalk(c, nc, name, nname, rtcdir, nelem(rtcdir), devgen);
+}
+
+/* stat: generic devstat over the static rtcdir table */
+static long	 
+rtcstat(Chan* c, uchar* dp, long n)
+{
+	return devstat(c, dp, n, rtcdir, nelem(rtcdir), devgen);
+}
+
+/*
+ * open: permission checks on top of devopen.
+ * rtc may be opened read-only by anyone, but only eve may open it for
+ * anything else; nvram may only be opened by eve.  Raises Eperm
+ * otherwise.
+ */
+static Chan*
+rtcopen(Chan* c, int omode)
+{
+	omode = openmode(omode);
+	switch((ulong)c->qid.path){
+	case Qrtc:
+		if(strcmp(up->user, eve)!=0 && omode!=OREAD)
+			error(Eperm);
+		break;
+	case Qnvram:
+		if(strcmp(up->user, eve)!=0)
+			error(Eperm);
+	}
+	return devopen(c, omode, rtcdir, nelem(rtcdir), devgen);
+}
+
+/* close: nothing to release */
+static void	 
+rtcclose(Chan*)
+{
+}
+
+/* decode the two BCD digits held in bcdclock[o] */
+#define GETBCD(o) ((bcdclock[o]&0xf) + 10*(bcdclock[o]>>4))
+
+/*
+ * Read the clock registers once and return the time as seconds since
+ * Jan 1 1970.  A sample is only accepted if the busy bit of the status
+ * register is clear both before and after the registers are read; up
+ * to 10000 attempts are made.  The two-digit year is taken to be
+ * 20xx when below 70 and 19xx otherwise.
+ * The caller is expected to hold nvrtlock (rtctime does).
+ */
+static long
+rtcextract(void)
+{
+	uchar bcdclock[Nbcd];
+	Rtc rtc;
+	int i;
+
+	/*
+	 * If the chip reports busy on every attempt the registers are
+	 * never read; start from zeros rather than stack garbage.
+	 */
+	memset(bcdclock, 0, sizeof(bcdclock));
+
+	/* don't do the read until the clock is no longer busy */
+	for(i = 0; i < 10000; i++){
+		outb(Paddr, Status);
+		if(inb(Pdata) & 0x80)
+			continue;
+
+		/* read clock values */
+		outb(Paddr, Seconds);	bcdclock[0] = inb(Pdata);
+		outb(Paddr, Minutes);	bcdclock[1] = inb(Pdata);
+		outb(Paddr, Hours);	bcdclock[2] = inb(Pdata);
+		outb(Paddr, Mday);	bcdclock[3] = inb(Pdata);
+		outb(Paddr, Month);	bcdclock[4] = inb(Pdata);
+		outb(Paddr, Year);	bcdclock[5] = inb(Pdata);
+
+		/* still not busy: the values just read are consistent */
+		outb(Paddr, Status);
+		if((inb(Pdata) & 0x80) == 0)
+			break;
+	}
+
+	/*
+	 *  convert from BCD
+	 */
+	rtc.sec = GETBCD(0);
+	rtc.min = GETBCD(1);
+	rtc.hour = GETBCD(2);
+	rtc.mday = GETBCD(3);
+	rtc.mon = GETBCD(4);
+	rtc.year = GETBCD(5);
+
+	/*
+	 *  the world starts jan 1 1970
+	 */
+	if(rtc.year < 70)
+		rtc.year += 2000;
+	else
+		rtc.year += 1900;
+	return rtc2sec(&rtc);
+}
+
+static Lock nvrtlock;
+
+/*
+ * Return the current time in seconds since Jan 1 1970.
+ * The clock is sampled under nvrtlock until two consecutive samples
+ * agree, for at most 100 retries; if they never agree a complaint is
+ * printed and the most recent sample is returned.
+ */
+long
+rtctime(void)
+{
+	int i;
+	long t, ot;
+
+	ilock(&nvrtlock);
+
+	/* loop till we get two reads in a row the same */
+	t = rtcextract();
+	for(i = 0; i < 100; i++){
+		/* compare each sample with the one before it, not with the first */
+		ot = t;
+		t = rtcextract();
+		if(ot == t)
+			break;
+	}
+	iunlock(&nvrtlock);
+
+	if(i == 100) print("we are boofheads\n");
+
+	return t;
+}
+
+/*
+ * read:
+ *	directory - generic directory read over rtcdir;
+ *	rtc       - the current time formatted by readnum;
+ *	nvram     - up to n bytes (capped at Nvsize) starting at offset,
+ *	            stopping at Nvsize; returns the number of bytes read.
+ * Raises Ebadarg for any other qid.
+ */
+static long	 
+rtcread(Chan* c, void* buf, long n, vlong off)
+{
+	ulong t;
+	char *a, *start;
+	ulong offset = off;
+
+	if(c->qid.type & QTDIR)
+		return devdirread(c, buf, n, rtcdir, nelem(rtcdir), devgen);
+
+	switch((ulong)c->qid.path){
+	case Qrtc:
+		t = rtctime();
+		n = readnum(offset, buf, n, t, 12);
+		return n;
+	case Qnvram:
+		if(n == 0)
+			return 0;
+		if(n > Nvsize)
+			n = Nvsize;
+		a = start = smalloc(n);
+
+		/* gather into a kernel buffer first; buf is not touched under the lock */
+		ilock(&nvrtlock);
+		for(t = offset; t < offset + n; t++){
+			if(t >= Nvsize)
+				break;
+			outb(Paddr, Nvoff+t);
+			*a++ = inb(Pdata);
+		}
+		iunlock(&nvrtlock);
+
+		/* free the scratch buffer if the copy to buf raises an error */
+		if(waserror()){
+			free(start);
+			nexterror();
+		}
+		memmove(buf, start, t - offset);
+		poperror();
+
+		free(start);
+		return t - offset;
+	}
+	error(Ebadarg);
+	return 0;
+}
+
+/* store the low two decimal digits of n in bcdclock[o] as BCD */
+#define PUTBCD(n,o) bcdclock[o] = (((n) % 10) | ((((n) / 10) % 10)<<4))
+
+/*
+ * write (offset must be 0, else Ebadarg):
+ *	rtc   - the text is skipped up to its first decimal digit and the
+ *	        number found there (seconds since Jan 1 1970) is written
+ *	        to the clock registers; returns n.
+ *	nvram - up to n bytes (capped at Nvsize) are written to nvram;
+ *	        returns the number of bytes written.
+ * Raises Ebadarg for any other qid.
+ */
+static long
+rtcwrite(Chan* c, void* buf, long n, vlong off)
+{
+	int t, len;
+	char *a, *start;
+	Rtc rtc;
+	ulong secs;
+	uchar bcdclock[Nbcd];
+	char *cp, *ep;
+	char num[32];
+	ulong offset = off;
+
+	if(offset!=0)
+		error(Ebadarg);
+
+
+	switch((ulong)c->qid.path){
+	case Qrtc:
+		/*
+		 *  read the time
+		 */
+		cp = ep = buf;
+		ep += n;
+		while(cp < ep){
+			if(*cp>='0' && *cp<='9')
+				break;
+			cp++;
+		}
+		/*
+		 * buf is not NUL-terminated: convert from a bounded,
+		 * terminated copy so strtoul cannot run past buf+n.
+		 */
+		len = ep - cp;
+		if(len > sizeof(num)-1)
+			len = sizeof(num)-1;
+		memmove(num, cp, len);
+		num[len] = 0;
+		secs = strtoul(num, 0, 0);
+
+		/*
+		 *  convert to bcd
+		 */
+		sec2rtc(secs, &rtc);
+		PUTBCD(rtc.sec, 0);
+		PUTBCD(rtc.min, 1);
+		PUTBCD(rtc.hour, 2);
+		PUTBCD(rtc.mday, 3);
+		PUTBCD(rtc.mon, 4);
+		PUTBCD(rtc.year, 5);
+
+		/*
+		 *  write the clock
+		 */
+		ilock(&nvrtlock);
+		outb(Paddr, Seconds);	outb(Pdata, bcdclock[0]);
+		outb(Paddr, Minutes);	outb(Pdata, bcdclock[1]);
+		outb(Paddr, Hours);	outb(Pdata, bcdclock[2]);
+		outb(Paddr, Mday);	outb(Pdata, bcdclock[3]);
+		outb(Paddr, Month);	outb(Pdata, bcdclock[4]);
+		outb(Paddr, Year);	outb(Pdata, bcdclock[5]);
+		iunlock(&nvrtlock);
+		return n;
+	case Qnvram:
+		if(n == 0)
+			return 0;
+		if(n > Nvsize)
+			n = Nvsize;
+
+		/* copy in before taking the lock; free the copy if that raises */
+		start = a = smalloc(n);
+		if(waserror()){
+			free(start);
+			nexterror();
+		}
+		memmove(a, buf, n);
+		poperror();
+
+		ilock(&nvrtlock);
+		for(t = offset; t < offset + n; t++){
+			if(t >= Nvsize)
+				break;
+			outb(Paddr, Nvoff+t);
+			outb(Pdata, *a++);
+		}
+		iunlock(&nvrtlock);
+
+		free(start);
+		return t - offset;
+	}
+	error(Ebadarg);
+	return 0;
+}
+
+/*
+ * Device table entry for the real time clock / nvram device.
+ * NOTE(review): the initializer is positional; the order of the hooks
+ * must match the declaration of Dev, which is not visible here.
+ */
+Dev rtcdevtab = {
+	'r',		/* same character rtcattach passes to devattach */
+	"rtc",
+
+	devreset,
+	rtcinit,
+	devshutdown,
+	rtcattach,
+	rtcwalk,
+	rtcstat,
+	rtcopen,
+	devcreate,
+	rtcclose,
+	rtcread,
+	devbread,
+	rtcwrite,
+	devbwrite,
+	devremove,
+	devwstat,
+};
+
+#define SEC2MIN 60L
+#define SEC2HOUR (60L*SEC2MIN)
+#define SEC2DAY (24L*SEC2HOUR)
+
+/*
+ *  days per month plus days/year:
+ *  element 0 is the number of days in the year, elements 1-12 the
+ *  number of days in each month.  dmsize is for ordinary years,
+ *  ldmsize for leap years.
+ */
+static	int	dmsize[] =
+{
+	365, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+static	int	ldmsize[] =
+{
+	366, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+/*
+ *  pick the day-count table for year y: ldmsize for a leap year
+ *  (divisible by 4, and not by 100 unless also by 400), else dmsize
+ */
+static int*
+yrsize(int y)
+{
+	if(y%4 != 0)
+		return dmsize;
+	if(y%100 == 0 && y%400 != 0)
+		return dmsize;
+	return ldmsize;
+}
+
+/*
+ *  convert a broken-down time to seconds since Jan 1 1970
+ */
+static ulong
+rtc2sec(Rtc *rtc)
+{
+	ulong total;
+	int *days;
+	int y, m;
+
+	total = 0;
+
+	/*
+	 *  every whole year from 1970 up to the one given
+	 */
+	y = 1970;
+	while(y < rtc->year){
+		days = yrsize(y);
+		total += days[0] * SEC2DAY;
+		y++;
+	}
+
+	/*
+	 *  every whole month of the given year (months count from 1)
+	 */
+	days = yrsize(rtc->year);
+	m = 1;
+	while(m < rtc->mon){
+		total += days[m] * SEC2DAY;
+		m++;
+	}
+
+	/*
+	 *  what remains: days of this month, then the time of day
+	 */
+	total += (rtc->mday-1) * SEC2DAY;
+	total += rtc->hour * SEC2HOUR;
+	total += rtc->min * SEC2MIN;
+	total += rtc->sec;
+
+	return total;
+}
+
+/*
+ *  compute rtc from seconds since Jan 1 1970
+ *
+ *  Fills in every field of *rtc; mon and mday count from 1 and year
+ *  is the full year.
+ *  NOTE(review): secs is unsigned, so the two branches below that
+ *  handle negative values look unreachable -- confirm before relying
+ *  on them.
+ */
+static void
+sec2rtc(ulong secs, Rtc *rtc)
+{
+	int d;
+	long hms, day;
+	int *d2m;
+
+	/*
+	 * break initial number into days
+	 */
+	hms = secs % SEC2DAY;
+	day = secs / SEC2DAY;
+	if(hms < 0) {
+		hms += SEC2DAY;
+		day -= 1;
+	}
+
+	/*
+	 * generate hours:minutes:seconds
+	 */
+	rtc->sec = hms % 60;
+	d = hms / 60;
+	rtc->min = d % 60;
+	d /= 60;
+	rtc->hour = d;
+
+	/*
+	 * year number
+	 */
+	if(day >= 0)
+		/* element 0 of the yrsize table is the length of year d in days */
+		for(d = 1970; day >= *yrsize(d); d++)
+			day -= *yrsize(d);
+	else
+		for (d = 1970; day < 0; d--)
+			day += *yrsize(d-1);
+	rtc->year = d;
+
+	/*
+	 * generate month
+	 */
+	d2m = yrsize(rtc->year);
+	for(d = 1; day >= d2m[d]; d++)
+		day -= d2m[d];
+	rtc->mday = day + 1;
+	rtc->mon = d;
+
+	return;
+}
+
+/*
+ * Read one byte through the address/data ports under nvrtlock.
+ * addr is written to the address port as given: Nvoff is not added
+ * here, unlike in rtcread.
+ */
+uchar
+nvramread(int addr)
+{
+	uchar data;
+
+	ilock(&nvrtlock);
+	outb(Paddr, addr);
+	data = inb(Pdata);
+	iunlock(&nvrtlock);
+
+	return data;
+}
+
+/*
+ * Write one byte through the address/data ports under nvrtlock.
+ * addr is written to the address port as given: Nvoff is not added
+ * here, unlike in rtcwrite.
+ */
+void
+nvramwrite(int addr, uchar data)
+{
+	ilock(&nvrtlock);
+	outb(Paddr, addr);
+	outb(Pdata, data);
+	iunlock(&nvrtlock);
+}

+ 1215 - 0
sys/src/9/386/ether8169.c

@@ -0,0 +1,1215 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Realtek RTL8110S/8169S.
+ * Mostly there. There are some magic register values used
+ * which are not described in any datasheet or driver but seem
+ * to be necessary.
+ * No tuning has been done. Only tested on an RTL8110S, there
+ * are slight differences between the chips in the series so some
+ * tweaks may be needed.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/error.h"
+
+#include "../port/ethermii.h"
+#include "../port/netif.h"
+
+#include "etherif.h"
+
+/*
+ * Register offsets; the csr* macros add them to ctlr->port.
+ */
+enum {					/* registers */
+	Idr0		= 0x00,		/* MAC address */
+	Mar0		= 0x08,		/* Multicast address */
+	Dtccr		= 0x10,		/* Dump Tally Counter Command */
+	Tnpds		= 0x20,		/* Transmit Normal Priority Descriptors */
+	Thpds		= 0x28,		/* Transmit High Priority Descriptors */
+	Flash		= 0x30,		/* Flash Memory Read/Write */
+	Erbcr		= 0x34,		/* Early Receive Byte Count */
+	Ersr		= 0x36,		/* Early Receive Status */
+	Cr		= 0x37,		/* Command Register */
+	Tppoll		= 0x38,		/* Transmit Priority Polling */
+	Imr		= 0x3C,		/* Interrupt Mask */
+	Isr		= 0x3E,		/* Interrupt Status */
+	Tcr		= 0x40,		/* Transmit Configuration */
+	Rcr		= 0x44,		/* Receive Configuration */
+	Tctr		= 0x48,		/* Timer Count */
+	Mpc		= 0x4C,		/* Missed Packet Counter */
+	Cr9346		= 0x50,		/* 9346 Command Register */
+	Config0		= 0x51,		/* Configuration Register 0 */
+	Config1		= 0x52,		/* Configuration Register 1 */
+	Config2		= 0x53,		/* Configuration Register 2 */
+	Config3		= 0x54,		/* Configuration Register 3 */
+	Config4		= 0x55,		/* Configuration Register 4 */
+	Config5		= 0x56,		/* Configuration Register 5 */
+	Timerint	= 0x58,		/* Timer Interrupt */
+	Mulint		= 0x5C,		/* Multiple Interrupt Select */
+	Phyar		= 0x60,		/* PHY Access */
+	Tbicsr0		= 0x64,		/* TBI Control and Status */
+	Tbianar		= 0x68,		/* TBI Auto-Negotiation Advertisement */
+	Tbilpar		= 0x6A,		/* TBI Auto-Negotiation Link Partner */
+
+	Rms		= 0xDA,		/* Receive Packet Maximum Size */
+	Cplusc		= 0xE0,		/* C+ Command */
+	Rdsar		= 0xE4,		/* Receive Descriptor Start Address */
+	Mtps		= 0xEC,		/* Max. Transmit Packet Size */
+};
+
+/*
+ * Cmd is or'ed into the counter block address written to Dtccr;
+ * rtl8169ifstat then polls Dtccr until Cmd reads back clear.
+ */
+enum {					/* Dtccr */
+	Cmd		= 0x00000008,	/* Command */
+};
+
+/*
+ * Command register bits.  rtl8169reset writes Rst and polls until
+ * it clears; rtl8169halt writes 0; rtl8169init writes Te|Re.
+ */
+enum {					/* Cr */
+	Te		= 0x04,		/* Transmitter Enable */
+	Re		= 0x08,		/* Receiver Enable */
+	Rst		= 0x10,		/* Software Reset */
+};
+
+/*
+ * Transmit polling bits; rtl8169transmit writes Npq after queueing
+ * new descriptors.
+ */
+enum {					/* Tppoll */
+	Fswint		= 0x01,		/* Forced Software Interrupt */
+	Npq		= 0x40,		/* Normal Priority Queue polling */
+	Hpq		= 0x80,		/* High Priority Queue polling */
+};
+
+/*
+ * Interrupt cause bits, shared by the mask (Imr) and status (Isr)
+ * registers.  rtl8169interrupt acknowledges causes by writing the
+ * value it read back to Isr.
+ */
+enum {					/* Imr/Isr */
+	Rok		= 0x0001,	/* Receive OK */
+	Rer		= 0x0002,	/* Receive Error */
+	Tok		= 0x0004,	/* Transmit OK */
+	Ter		= 0x0008,	/* Transmit Error */
+	Rdu		= 0x0010,	/* Receive Descriptor Unavailable */
+	Punlc		= 0x0020,	/* Packet Underrun or Link Change */
+	Fovw		= 0x0040,	/* Receive FIFO Overflow */
+	Tdu		= 0x0080,	/* Transmit Descriptor Unavailable */
+	Swint		= 0x0100,	/* Software Interrupt */
+	Timeout		= 0x4000,	/* Timer */
+	Serr		= 0x8000,	/* System Error */
+};
+
+/*
+ * Transmit configuration bits.  rtl8169pci keeps Tcr&HwveridMASK in
+ * ctlr->macv and rtl8169init switches on it against the Macv* values.
+ */
+enum {					/* Tcr */
+	MtxdmaSHIFT	= 8,		/* Max. DMA Burst Size */
+	MtxdmaMASK	= 0x00000700,
+	Mtxdmaunlimited	= 0x00000700,
+	Acrc		= 0x00010000,	/* Append CRC (not) */
+	Lbk0		= 0x00020000,	/* Loopback Test 0 */
+	Lbk1		= 0x00040000,	/* Loopback Test 1 */
+	Ifg2		= 0x00080000,	/* Interframe Gap 2 */
+	HwveridSHIFT	= 23,		/* Hardware Version ID */
+	HwveridMASK	= 0x7C800000,
+	Macv01		= 0x00000000,	/* RTL8169 */
+	Macv02		= 0x00800000,	/* RTL8169S/8110S */
+	Macv03		= 0x04000000,	/* RTL8169S/8110S */
+	Macv04		= 0x10000000,	/* RTL8169SB/8110SB */
+	Macv05		= 0x18000000,	/* RTL8169SC/8110SC */
+	Macv11		= 0x30000000,	/* RTL8168B/8111B */
+	Macv12		= 0x38000000,	/* RTL8169B/8111B */
+	Macv13		= 0x34000000,	/* RTL8101E */
+	Macv14		= 0x30800000,	/* RTL8100E */
+	Macv15		= 0x38800000,	/* RTL8100E */
+	Ifg0		= 0x01000000,	/* Interframe Gap 0 */
+	Ifg1		= 0x02000000,	/* Interframe Gap 1 */
+};
+
+/*
+ * Receive configuration bits.  The driver keeps its copy in ctlr->rcr;
+ * rtl8169init starts it at Rxfthnone|Mrxdmaunlimited|Ab|Am|Apm and
+ * rtl8169promiscuous toggles Aap.
+ */
+enum {					/* Rcr */
+	Aap		= 0x00000001,	/* Accept All Packets */
+	Apm		= 0x00000002,	/* Accept Physical Match */
+	Am		= 0x00000004,	/* Accept Multicast */
+	Ab		= 0x00000008,	/* Accept Broadcast */
+	Ar		= 0x00000010,	/* Accept Runt */
+	Aer		= 0x00000020,	/* Accept Error */
+	Sel9356		= 0x00000040,	/* 9356 EEPROM used */
+	MrxdmaSHIFT	= 8,		/* Max. DMA Burst Size */
+	MrxdmaMASK	= 0x00000700,
+	Mrxdmaunlimited	= 0x00000700,
+	RxfthSHIFT	= 13,		/* Receive Buffer Length */
+	RxfthMASK	= 0x0000E000,
+	Rxfth256	= 0x00008000,
+	Rxfthnone	= 0x0000E000,
+	Rer8		= 0x00010000,	/* Accept Error Packets > 8 bytes */
+	MulERINT	= 0x01000000,	/* Multiple Early Interrupt Select */
+};
+
+/*
+ * 9346 command register bits.  rtl8169init writes Eem1|Eem0 before
+ * changing the MAC address and configuration registers and writes 0
+ * again when it is done.
+ */
+enum {					/* Cr9346 */
+	Eedo		= 0x01,		/* */
+	Eedi		= 0x02,		/* */
+	Eesk		= 0x04,		/* */
+	Eecs		= 0x08,		/* */
+	Eem0		= 0x40,		/* Operating Mode */
+	Eem1		= 0x80,
+};
+
+/*
+ * PHY access register fields.  A write is started with Flag set and
+ * is complete when Flag reads back clear; a read is started with
+ * Flag clear and the data is valid once Flag reads back set
+ * (see rtl8169miimiw and rtl8169miimir).
+ */
+enum {					/* Phyar */
+	DataMASK	= 0x0000FFFF,	/* 16-bit GMII/MII Register Data */
+	DataSHIFT	= 0,
+	RegaddrMASK	= 0x001F0000,	/* 5-bit GMII/MII Register Address */
+	RegaddrSHIFT	= 16,
+	Flag		= 0x80000000,	/* write/read handshake, see above */
+};
+
+/*
+ * C+ command register bits (a 16-bit register, accessed with
+ * csr16r/csr16w).  rtl8169init sets Mulrw.
+ */
+enum {					/* Cplusc */
+	Mulrw		= 0x0008,	/* PCI Multiple R/W Enable */
+	Dac		= 0x0010,	/* PCI Dual Address Cycle Enable */
+	Rxchksum	= 0x0020,	/* Receive Checksum Offload Enable */
+	Rxvlan		= 0x0040,	/* Receive VLAN De-tagging Enable */
+	Endian		= 0x0200,	/* Endian Mode */
+};
+
+/*
+ * One ring entry, used for both transmit and receive rings.
+ * control holds the Own/Eor/Fs/Ls bits plus the frame length and
+ * status bits; addrlo/addrhi hold the buffer address (this driver
+ * always writes addrhi as 0).
+ */
+typedef struct D D;			/* Transmit/Receive Descriptor */
+struct D {
+	u32int	control;
+	u32int	vlan;
+	u32int	addrlo;
+	u32int	addrhi;
+};
+
+/*
+ * Transmit-only bits of D.control; rtl8169transmit stores the block
+ * length in the Txfl field.
+ */
+enum {					/* Transmit Descriptor control */
+	TxflMASK	= 0x0000FFFF,	/* Transmit Frame Length */
+	TxflSHIFT	= 0,
+	Tcps		= 0x00010000,	/* TCP Checksum Offload */
+	Udpcs		= 0x00020000,	/* UDP Checksum Offload */
+	Ipcs		= 0x00040000,	/* IP Checksum Offload */
+	Lgsen		= 0x08000000,	/* Large Send */
+};
+
+/*
+ * Receive-only bits of D.control as examined by rtl8169receive:
+ * the Rxfl field gives the frame length, Pid1/Pid0 select which
+ * checksum failure bit (Tcpf, Udpf or Ipf) applies.
+ */
+enum {					/* Receive Descriptor control */
+	RxflMASK	= 0x00003FFF,	/* Receive Frame Length */
+	RxflSHIFT	= 0,
+	Tcpf		= 0x00004000,	/* TCP Checksum Failure */
+	Udpf		= 0x00008000,	/* UDP Checksum Failure */
+	Ipf		= 0x00010000,	/* IP Checksum Failure */
+	Pid0		= 0x00020000,	/* Protocol ID0 */
+	Pid1		= 0x00040000,	/* Protocol ID1 */
+	Crce		= 0x00080000,	/* CRC Error */
+	Runt		= 0x00100000,	/* Runt Packet */
+	Res		= 0x00200000,	/* Receive Error Summary */
+	Rwt		= 0x00400000,	/* Receive Watchdog Timer Expired */
+	Fovf		= 0x00800000,	/* FIFO Overflow */
+	Bovf		= 0x01000000,	/* Buffer Overflow */
+	Bar		= 0x02000000,	/* Broadcast Address Received */
+	Pam		= 0x04000000,	/* Physical Address Matched */
+	Mar		= 0x08000000,	/* Multicast Address Received */
+};
+
+/*
+ * D.control bits common to both rings.  The driver sets Own when it
+ * hands a descriptor to the chip and treats a descriptor with Own
+ * clear as finished; Eor is set on the last entry of each ring and
+ * is the only bit preserved when a descriptor is recycled.
+ */
+enum {					/* General Descriptor control */
+	Ls		= 0x10000000,	/* Last Segment Descriptor */
+	Fs		= 0x20000000,	/* First Segment Descriptor */
+	Eor		= 0x40000000,	/* End of Descriptor Ring */
+	Own		= 0x80000000,	/* Ownership */
+};
+
+/*
+ * Ring dimensions used by rtl8169attach, and Mps, the size of each
+ * receive buffer: ETHERMAXTU+4 rounded up to a multiple of 128.
+ * Mps is also written to the Rms register by rtl8169init.
+ */
+enum {					/* Ring sizes  (<= 1024) */
+	Ntd		= 32,		/* Transmit Ring */
+	Nrd		= 128,		/* Receive Ring */
+
+	Mps		= ROUNDUP(ETHERMAXTU+4, 128),
+};
+
+/*
+ * Tally counter block.  Its address is handed to the chip through
+ * Dtccr and rtl8169ifstat reads the fields back once the dump
+ * command has completed.
+ */
+typedef struct Dtcc Dtcc;
+struct Dtcc {
+	u64int	txok;
+	u64int	rxok;
+	u64int	txer;
+	u32int	rxer;
+	u16int	misspkt;
+	u16int	fae;
+	u32int	tx1col;
+	u32int	txmcol;
+	u64int	rxokph;
+	u64int	rxokbrd;
+	u32int	rxokmu;
+	u16int	txabt;
+	u16int	txundrn;
+};
+
+/*
+ * Supported devices, encoded as (did<<16)|vid to match the value
+ * rtl8169pci builds from each Pcidev; stored in ctlr->pciv.
+ */
+enum {						/* Variants */
+	Rtl8100e	= (0x8136<<16)|0x10EC,	/* RTL810[01]E: pci-e */
+	Rtl8169c	= (0x0116<<16)|0x16EC,	/* RTL8169C+ (USR997902) */
+	Rtl8169sc	= (0x8167<<16)|0x10EC,	/* RTL8169SC */
+	Rtl8168b	= (0x8168<<16)|0x10EC,	/* RTL8168B: pci-e */
+	Rtl8169		= (0x8169<<16)|0x10EC,	/* RTL8169 */
+};
+
+/*
+ * Per-adapter state.  Controllers found by rtl8169pci are chained
+ * through next; rtl8169pnp marks one active when it is claimed.
+ */
+typedef struct Ctlr Ctlr;
+typedef struct Ctlr {
+	int	port;
+	Pcidev*	pcidev;
+	Ctlr*	next;
+	int	active;
+
+	QLock	alock;			/* attach */
+	Lock	ilock;			/* init */
+	int	init;			/* set once attach has allocated the rings and run init */
+
+	int	pciv;			/* (did<<16)|vid variant, see rtl8169pci */
+	int	macv;			/* MAC version */
+	int	phyv;			/* PHY version */
+	int	pcie;			/* flag: pci-express device? */
+
+	uvlong	mchash;			/* multicast hash */
+
+	Mii*	mii;
+
+	Lock	tlock;			/* transmit */
+	D*	td;			/* descriptor ring */
+	Block**	tb;			/* transmit buffers */
+	int	ntd;
+
+	int	tdh;			/* head - next descriptor to reclaim once the NIC is done with it */
+	int	tdt;			/* tail - next descriptor the host fills */
+	int	ntdfree;
+	int	ntq;			/* descriptors queued and not yet reclaimed */
+
+	int	mtps;			/* Max. Transmit Packet Size */
+
+	Lock	rlock;			/* receive */
+	D*	rd;			/* descriptor ring */
+	Block**	rb;			/* receive buffers */
+	int	nrd;
+
+	int	rdh;			/* head - next descriptor the host checks for a received frame */
+	int	rdt;			/* tail - next descriptor replenish hands to the NIC */
+	int	nrdfree;
+
+	int	tcr;			/* transmit configuration register */
+	int	rcr;			/* receive configuration register */
+	int	imr;
+
+	QLock	slock;			/* statistics */
+	Dtcc*	dtcc;
+	uint	txdu;
+	uint	tcpf;
+	uint	udpf;
+	uint	ipf;
+	uint	fovf;
+	uint	ierrs;
+	uint	rer;
+	uint	rdu;
+	uint	punlc;
+	uint	fovw;
+	uint	mcast;
+} Ctlr;
+
+/* list of controllers found by rtl8169pci, in discovery order */
+static Ctlr* rtl8169ctlrhead;
+static Ctlr* rtl8169ctlrtail;
+
+/*
+ * Register access: 8-, 16- and 32-bit port I/O at ctlr->port plus
+ * the register offset.  The write forms cast the value to the
+ * register width.
+ */
+#define csr8r(c, r)	(inb((c)->port+(r)))
+#define csr16r(c, r)	(ins((c)->port+(r)))
+#define csr32r(c, r)	(inl((c)->port+(r)))
+#define csr8w(c, r, b)	(outb((c)->port+(r), (u8int)(b)))
+#define csr16w(c, r, w)	(outs((c)->port+(r), (u16int)(w)))
+#define csr32w(c, r, l)	(outl((c)->port+(r), (u32int)(l)))
+
+/*
+ * Read PHY register ra through Phyar.  Only PHY address 1 is
+ * accepted.  The register address is written with Flag clear and
+ * Phyar is polled (up to 2000 times, 100 microseconds apart) until
+ * Flag reads back set.  Returns the 16-bit data, or -1 for any
+ * other PHY address or on timeout.
+ */
+static int
+rtl8169miimir(Ctlr* ctlr, int pa, int ra)
+{
+	uint v;
+	int tries;
+
+	if(pa != 1)
+		return -1;
+
+	v = (ra<<16) & RegaddrMASK;
+	csr32w(ctlr, Phyar, v);
+	delay(1);
+	tries = 0;
+	while(tries < 2000){
+		v = csr32r(ctlr, Phyar);
+		if(v & Flag)
+			break;
+		microdelay(100);
+		tries++;
+	}
+	if((v & Flag) == 0)
+		return -1;
+
+	return (v & DataMASK)>>DataSHIFT;
+}
+
+/*
+ * Write data to PHY register ra through Phyar.  Only PHY address 1
+ * is accepted.  The address and data are written with Flag set and
+ * Phyar is polled (up to 2000 times, 100 microseconds apart) until
+ * Flag reads back clear.  Returns 0, or -1 for any other PHY
+ * address or on timeout.
+ */
+static int
+rtl8169miimiw(Ctlr* ctlr, int pa, int ra, int data)
+{
+	uint v;
+	int tries;
+
+	if(pa != 1)
+		return -1;
+
+	v = Flag|((ra<<16) & RegaddrMASK)|((data<<DataSHIFT) & DataMASK);
+	csr32w(ctlr, Phyar, v);
+	delay(1);
+	tries = 0;
+	while(tries < 2000){
+		v = csr32r(ctlr, Phyar);
+		if((v & Flag) == 0)
+			break;
+		microdelay(100);
+		tries++;
+	}
+
+	return (v & Flag) ? -1 : 0;
+}
+
+/*
+ * Read/write hook given to miiattach: dispatches to the register
+ * write or read routine using the controller stored in mii->ctlr.
+ */
+static int
+rtl8169miirw(Mii* mii, int write, int pa, int ra, int data)
+{
+	if(!write)
+		return rtl8169miimir(mii->ctlr, pa, ra);
+
+	return rtl8169miimiw(mii->ctlr, pa, ra, data);
+}
+
+/*
+ * Attach the MII layer to the controller and return the new Mii,
+ * or nil if miiattach fails.  Records the low 4 bits of Phyidr2 in
+ * ctlr->phyv, applies the Macv02 register tweaks, prints the PHY
+ * identification and, if miistatus reports an error, resets the
+ * PHY and restarts auto-negotiation.
+ */
+static Mii*
+rtl8169mii(Ctlr* ctlr)
+{
+	Mii* mii;
+	MiiPhy *phy;
+
+	/*
+	 * Link management.
+	 *
+	 * Get rev number out of Phyidr2 so can config properly.
+	 * There's probably more special stuff for Macv0[234] needed here.
+	 */
+	ctlr->phyv = rtl8169miimir(ctlr, 1, Phyidr2) & 0x0F;
+	if(ctlr->macv == Macv02){
+		csr8w(ctlr, 0x82, 1);				/* magic */
+		rtl8169miimiw(ctlr, 1, 0x0B, 0x0000);		/* magic */
+	}
+	/* (1<<1): presumably a mask selecting PHY address 1, the only one miimir/miimiw accept */
+	if((mii = miiattach(ctlr, (1<<1), rtl8169miirw)) == nil)
+		return nil;
+
+	phy = mii->curphy;
+	print("oui %#ux phyno %d, macv = %#8.8ux phyv = %#4.4ux\n",
+		phy->oui, phy->phyno, ctlr->macv, ctlr->phyv);
+
+	if(miistatus(mii) < 0){
+		miireset(mii);
+		miiane(mii, ~0, ~0, ~0);
+	}
+
+	return mii;
+}
+
+/*
+ * Turn promiscuous reception on or off by setting or clearing Aap
+ * in the driver's copy of Rcr and writing it to the chip, under
+ * the init lock.  arg is the Ether the interface was registered with.
+ */
+static void
+rtl8169promiscuous(void* arg, int on)
+{
+	Ctlr *c;
+
+	c = ((Ether*)arg)->ctlr;
+
+	ilock(&c->ilock);
+	if(!on)
+		c->rcr &= ~Aap;
+	else
+		c->rcr |= Aap;
+	csr32w(c, Rcr, c->rcr);
+	iunlock(&c->ilock);
+}
+
+enum {
+	/*
+	 * everyone else uses 0x04c11db7, but they both produce the same crc:
+	 * ethercrcbe or's the carry (1) into bit 0 whenever it applies the
+	 * polynomial, so the low bit of the constant does not matter.
+	 */
+	Etherpolybe = 0x04c11db6,
+	Bytemask = (1<<8) - 1,
+};
+
+/*
+ * CRC over len bytes at addr, shifting the register left and
+ * feeding each byte least-significant bit first.  The register
+ * starts as all ones and is returned without a final inversion.
+ * rtl8169multicast uses the top 6 bits as the multicast hash index.
+ */
+static ulong
+ethercrcbe(uchar *addr, long len)
+{
+	uchar *p, *end;
+	int bit;
+	ulong byte, crc, fb;
+
+	crc = ~0UL;
+	end = addr + len;
+	for (p = addr; p < end; p++) {
+		byte = *p;
+		for (bit = 0; bit < 8; bit++) {
+			/* feedback: bit 31 of the register xor the next data bit */
+			fb = ((crc >> 31) & 1) ^ (byte & 1);
+			crc <<= 1;
+			byte >>= 1;
+			if (fb != 0)
+				crc = (crc ^ Etherpolybe) | fb;
+		}
+	}
+	return crc;
+}
+
+/*
+ * Reverse the order of the low four bytes of l.
+ * Every byte is masked, so the result is confined to 32 bits even
+ * if ulong is wider than that; with a 32-bit ulong this is the
+ * same value the unmasked shifts produced.
+ */
+static ulong
+swabl(ulong l)
+{
+	return ((l>>24) & Bytemask) | ((l>>8) & (Bytemask<<8)) |
+		((l<<8) & (Bytemask<<16)) | ((l & Bytemask)<<24);
+}
+
+/*
+ * Add eaddr to the multicast filter.  The top 6 bits of the
+ * address's crc select one bit of the 64-bit hash kept in
+ * ctlr->mchash, which is then written to the two Mar0 registers.
+ * Removals are ignored, so bits are never cleared here.
+ */
+static void
+rtl8169multicast(void* ether, uchar *eaddr, int add)
+{
+	Ether *edev;
+	Ctlr *ctlr;
+
+	if (!add)
+		return;	/* ok to keep receiving on old mcast addrs */
+
+	edev = ether;
+	ctlr = edev->ctlr;
+	ilock(&ctlr->ilock);
+
+	ctlr->mchash |= 1ULL << (ethercrcbe(eaddr, Eaddrlen) >> 26);
+
+	/* make sure multicast reception is enabled */
+	ctlr->rcr |= Am;
+	csr32w(ctlr, Rcr, ctlr->rcr);
+
+	/* pci-e variants reverse the order of the hash byte registers */
+	if (ctlr->pcie) {
+		csr32w(ctlr, Mar0,   swabl(ctlr->mchash>>32));
+		csr32w(ctlr, Mar0+4, swabl(ctlr->mchash));
+	} else {
+		csr32w(ctlr, Mar0,   ctlr->mchash);
+		csr32w(ctlr, Mar0+4, ctlr->mchash>>32);
+	}
+
+	iunlock(&ctlr->ilock);
+}
+
+/*
+ * Statistics read routine.  Asks the chip to dump its tally
+ * counters into ctlr->dtcc, copies some of them into the generic
+ * Ether error counts, and, unless n is 0, formats the chip and
+ * driver counters (plus the PHY dump when a PHY is present) into a
+ * READSTR buffer and returns the slice selected by offset and n
+ * via readstr.  Raises Eio if the dump does not complete within
+ * about a second and Enomem if the buffer cannot be allocated;
+ * the error handler releases slock and the buffer.
+ */
+static long
+rtl8169ifstat(Ether* edev, void* a, long n, ulong offset)
+{
+	Ctlr *ctlr;
+	Dtcc *dtcc;
+	int timeo;
+	char *alloc, *e, *p;
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->slock);
+
+	alloc = nil;
+	if(waserror()){
+		qunlock(&ctlr->slock);
+		free(alloc);
+		nexterror();
+	}
+
+	/* start the dump: high address word first, then low word with Cmd */
+	csr32w(ctlr, Dtccr+4, 0);
+	csr32w(ctlr, Dtccr, PCIWADDR(ctlr->dtcc)|Cmd);
+	for(timeo = 0; timeo < 1000; timeo++){
+		if(!(csr32r(ctlr, Dtccr) & Cmd))
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Dtccr) & Cmd)
+		error(Eio);
+	dtcc = ctlr->dtcc;
+
+	edev->oerrs = dtcc->txer;
+	edev->crcs = dtcc->rxer;
+	edev->frames = dtcc->fae;
+	edev->buffs = dtcc->misspkt;
+	edev->overflows = ctlr->txdu+ctlr->rdu;
+
+	/* n == 0: only the counter refresh above was wanted */
+	if(n == 0){
+		qunlock(&ctlr->slock);
+		poperror();
+		return 0;
+	}
+
+	if((alloc = malloc(READSTR)) == nil)
+		error(Enomem);
+	e = alloc+READSTR;
+
+	p = seprint(alloc, e, "TxOk: %llud\n", dtcc->txok);
+	p = seprint(p, e, "RxOk: %llud\n", dtcc->rxok);
+	p = seprint(p, e, "TxEr: %llud\n", dtcc->txer);
+	p = seprint(p, e, "RxEr: %ud\n", dtcc->rxer);
+	p = seprint(p, e, "MissPkt: %ud\n", dtcc->misspkt);
+	p = seprint(p, e, "FAE: %ud\n", dtcc->fae);
+	p = seprint(p, e, "Tx1Col: %ud\n", dtcc->tx1col);
+	p = seprint(p, e, "TxMCol: %ud\n", dtcc->txmcol);
+	p = seprint(p, e, "RxOkPh: %llud\n", dtcc->rxokph);
+	p = seprint(p, e, "RxOkBrd: %llud\n", dtcc->rxokbrd);
+	p = seprint(p, e, "RxOkMu: %ud\n", dtcc->rxokmu);
+	p = seprint(p, e, "TxAbt: %ud\n", dtcc->txabt);
+	p = seprint(p, e, "TxUndrn: %ud\n", dtcc->txundrn);
+
+	p = seprint(p, e, "txdu: %ud\n", ctlr->txdu);
+	p = seprint(p, e, "tcpf: %ud\n", ctlr->tcpf);
+	p = seprint(p, e, "udpf: %ud\n", ctlr->udpf);
+	p = seprint(p, e, "ipf: %ud\n", ctlr->ipf);
+	p = seprint(p, e, "fovf: %ud\n", ctlr->fovf);
+	p = seprint(p, e, "ierrs: %ud\n", ctlr->ierrs);
+	p = seprint(p, e, "rer: %ud\n", ctlr->rer);
+	p = seprint(p, e, "rdu: %ud\n", ctlr->rdu);
+	p = seprint(p, e, "punlc: %ud\n", ctlr->punlc);
+	p = seprint(p, e, "fovw: %ud\n", ctlr->fovw);
+
+	p = seprint(p, e, "tcr: %#8.8ux\n", ctlr->tcr);
+	p = seprint(p, e, "rcr: %#8.8ux\n", ctlr->rcr);
+	p = seprint(p, e, "multicast: %ud\n", ctlr->mcast);
+
+	if(ctlr->mii != nil && ctlr->mii->curphy != nil)
+		miidumpphy(ctlr->mii, p, e);
+
+	n = readstr(offset, a, n, alloc);
+
+	qunlock(&ctlr->slock);
+	poperror();
+	free(alloc);
+
+	return n;
+}
+
+/*
+ * Quiesce the chip: clear the command register (so neither Te nor
+ * Re is set), mask every interrupt, and write all ones to Isr.
+ */
+static void
+rtl8169halt(Ctlr* ctlr)
+{
+	csr8w(ctlr, Cr, 0);
+	csr16w(ctlr, Imr, 0);
+	csr16w(ctlr, Isr, ~0);
+}
+
+/*
+ * Soft reset the controller: write Rst to the command register,
+ * poll up to 1000 times (1ms apart) for the bit to clear, then
+ * halt the chip.  Returns 0 if the reset completed, -1 if Rst was
+ * still set on the last poll.
+ */
+static int
+rtl8169reset(Ctlr* ctlr)
+{
+	u32int cr;
+	int ms;
+
+	csr8w(ctlr, Cr, Rst);
+	cr = 0;
+	for(ms = 0; ms < 1000; ms++){
+		cr = csr8r(ctlr, Cr);
+		if((cr & Rst) == 0)
+			break;
+		delay(1);
+	}
+	rtl8169halt(ctlr);
+
+	return (cr & Rst) ? -1 : 0;
+}
+
+/*
+ * Hand receive descriptors to the chip, starting at rdt and
+ * stopping one short of rdh.  A descriptor with no block attached
+ * gets a fresh Mps-byte block; one that still has a block (left
+ * there by rtl8169receive) is reused as is.  Each descriptor given
+ * out bumps nrdfree.  Stops early if iallocb fails.
+ */
+static void
+rtl8169replenish(Ctlr* ctlr)
+{
+	D *d;
+	int rdt;
+	Block *bp;
+
+	rdt = ctlr->rdt;
+	while(NEXT(rdt, ctlr->nrd) != ctlr->rdh){
+		d = &ctlr->rd[rdt];
+		if(ctlr->rb[rdt] == nil){
+			/*
+			 * Simple allocation for now.
+			 * This better be aligned on 8.
+			 */
+			bp = iallocb(Mps);
+			if(bp == nil){
+				iprint("no available buffers\n");
+				break;
+			}
+			ctlr->rb[rdt] = bp;
+			d->addrlo = PCIWADDR(bp->rp);
+			d->addrhi = 0;
+		}
+		/* make the address visible before ownership passes to the chip */
+		coherence();
+		d->control |= Own|Mps;
+		rdt = NEXT(rdt, ctlr->nrd);
+		ctlr->nrdfree++;
+	}
+	ctlr->rdt = rdt;
+}
+
+/*
+ * Bring the controller to its running state: halt it, program the
+ * station address, reset both descriptor rings, refill the receive
+ * ring, apply the per-MAC-version and per-variant settings, enable
+ * the transmitter and receiver and unmask interrupts.  Runs with
+ * ctlr->ilock held throughout.
+ * Returns 0, or -1 (with the lock released and the chip halted)
+ * when ctlr->macv is not a version this driver knows.
+ */
+static int
+rtl8169init(Ether* edev)
+{
+	int i;
+	u32int r;
+	Block *bp;
+	Ctlr *ctlr;
+	u16int cplusc;		/* Cplusc is a 16-bit register; bit 14 is used below */
+
+	ctlr = edev->ctlr;
+	ilock(&ctlr->ilock);
+
+	rtl8169halt(ctlr);
+
+	/*
+	 * MAC Address.
+	 * Must put chip into config register write enable mode.
+	 */
+	csr8w(ctlr, Cr9346, Eem1|Eem0);
+	r = (edev->ea[3]<<24)|(edev->ea[2]<<16)|(edev->ea[1]<<8)|edev->ea[0];
+	csr32w(ctlr, Idr0, r);
+	r = (edev->ea[5]<<8)|edev->ea[4];
+	csr32w(ctlr, Idr0+4, r);
+
+	/*
+	 * Transmitter.
+	 */
+	memset(ctlr->td, 0, sizeof(D)*ctlr->ntd);
+	ctlr->tdh = ctlr->tdt = 0;
+	ctlr->td[ctlr->ntd-1].control = Eor;
+
+	/*
+	 * Receiver.
+	 * Need to do something here about the multicast filter.
+	 */
+	memset(ctlr->rd, 0, sizeof(D)*ctlr->nrd);
+	ctlr->nrdfree = ctlr->rdh = ctlr->rdt = 0;
+	ctlr->rd[ctlr->nrd-1].control = Eor;
+
+	/* drop any blocks left from a previous run before refilling */
+	for(i = 0; i < ctlr->nrd; i++){
+		if((bp = ctlr->rb[i]) != nil){
+			ctlr->rb[i] = nil;
+			freeb(bp);
+		}
+	}
+	rtl8169replenish(ctlr);
+	ctlr->rcr = Rxfthnone|Mrxdmaunlimited|Ab|Am|Apm;
+
+	/*
+	 * Mtps is in units of 128 except for the RTL8169
+	 * where it is 32. If using jumbo frames should be
+	 * set to 0x3F.
+	 * Setting Mulrw in Cplusc disables the Tx/Rx DMA burst
+	 * settings in Tcr/Rcr; the (1<<14) is magic.
+	 */
+	ctlr->mtps = HOWMANY(Mps, 128);
+	cplusc = csr16r(ctlr, Cplusc) & ~(1<<14);
+	cplusc |= /*Rxchksum|*/Mulrw;
+	switch(ctlr->macv){
+	default:
+		/* unknown MAC version: give up, but not with the lock held */
+		iunlock(&ctlr->ilock);
+		return -1;
+	case Macv01:
+		ctlr->mtps = HOWMANY(Mps, 32);
+		break;
+	case Macv02:
+	case Macv03:
+		cplusc |= (1<<14);			/* magic */
+		break;
+	case Macv05:
+		/*
+		 * This is interpreted from clearly bogus code
+		 * in the manufacturer-supplied driver, it could
+		 * be wrong. Untested.
+		 */
+		r = csr8r(ctlr, Config2) & 0x07;
+		if(r == 0x01)				/* 66MHz PCI */
+			csr32w(ctlr, 0x7C, 0x0007FFFF);	/* magic */
+		else
+			csr32w(ctlr, 0x7C, 0x0007FF00);	/* magic */
+		pciclrmwi(ctlr->pcidev);
+		break;
+	case Macv13:
+		/*
+		 * This is interpreted from clearly bogus code
+		 * in the manufacturer-supplied driver, it could
+		 * be wrong. Untested.
+		 */
+		pcicfgw8(ctlr->pcidev, 0x68, 0x00);	/* magic */
+		pcicfgw8(ctlr->pcidev, 0x69, 0x08);	/* magic */
+		break;
+	case Macv04:
+	case Macv11:
+	case Macv12:
+	case Macv14:
+	case Macv15:
+		break;
+	}
+
+	/*
+	 * Enable receiver/transmitter.
+	 * Need to do this first or some of the settings below
+	 * won't take.
+	 * The Rtl8169sc and Rtl8168b variants are enabled later,
+	 * in the configuration switch below.
+	 */
+	switch(ctlr->pciv){
+	default:
+		csr8w(ctlr, Cr, Te|Re);
+		csr32w(ctlr, Tcr, Ifg1|Ifg0|Mtxdmaunlimited);
+		csr32w(ctlr, Rcr, ctlr->rcr);
+		csr32w(ctlr, Mar0,   0);
+		csr32w(ctlr, Mar0+4, 0);
+		ctlr->mchash = 0;
+		/* fall through */
+	case Rtl8169sc:
+	case Rtl8168b:
+		break;
+	}
+
+	/*
+	 * Interrupts.
+	 * Disable Tdu|Tok for now, the transmit routine will tidy.
+	 * Tdu means the NIC ran out of descriptors to send, so it
+	 * doesn't really need to ever be on.
+	 */
+	csr32w(ctlr, Timerint, 0);
+	ctlr->imr = Serr|Timeout|Fovw|Punlc|Rdu|Ter|Rer|Rok;
+	csr16w(ctlr, Imr, ctlr->imr);
+
+	/*
+	 * Clear missed-packet counter;
+	 * initial early transmit threshold value;
+	 * set the descriptor ring base addresses;
+	 * set the maximum receive packet size;
+	 * no early-receive interrupts.
+	 */
+	csr32w(ctlr, Mpc, 0);
+	csr8w(ctlr, Mtps, ctlr->mtps);
+	csr32w(ctlr, Tnpds+4, 0);
+	csr32w(ctlr, Tnpds, PCIWADDR(ctlr->td));
+	csr32w(ctlr, Rdsar+4, 0);
+	csr32w(ctlr, Rdsar, PCIWADDR(ctlr->rd));
+	csr16w(ctlr, Rms, Mps);
+	r = csr16r(ctlr, Mulint) & 0xF000;
+	csr16w(ctlr, Mulint, r);
+	csr16w(ctlr, Cplusc, cplusc);
+
+	/*
+	 * Set configuration.
+	 */
+	switch(ctlr->pciv){
+	default:
+		break;
+	case Rtl8169sc:
+		csr16w(ctlr, 0xE2, 0);			/* magic */
+		csr8w(ctlr, Cr, Te|Re);
+		csr32w(ctlr, Tcr, Ifg1|Ifg0|Mtxdmaunlimited);
+		csr32w(ctlr, Rcr, ctlr->rcr);
+		break;
+	case Rtl8168b:
+	case Rtl8169c:
+		csr16w(ctlr, 0xE2, 0);			/* magic */
+		csr16w(ctlr, Cplusc, 0x2000);		/* magic */
+		csr8w(ctlr, Cr, Te|Re);
+		csr32w(ctlr, Tcr, Ifg1|Ifg0|Mtxdmaunlimited);
+		csr32w(ctlr, Rcr, ctlr->rcr);
+		csr16w(ctlr, Rms, 0x0800);
+		csr8w(ctlr, Mtps, 0x3F);
+		break;
+	}
+	ctlr->tcr = csr32r(ctlr, Tcr);
+	/* leave config register write enable mode */
+	csr8w(ctlr, Cr9346, 0);
+
+	iunlock(&ctlr->ilock);
+
+//	rtl8169mii(ctlr);
+
+	return 0;
+}
+
+/*
+ * Release the descriptor rings, the block pointer arrays and the
+ * tally counter block allocated by rtl8169attach, and clear the
+ * pointers so a later attach starts from scratch.
+ */
+static void
+rtl8169freerings(Ctlr* ctlr)
+{
+	free(ctlr->td);
+	free(ctlr->tb);
+	free(ctlr->rd);
+	free(ctlr->rb);
+	free(ctlr->dtcc);
+	ctlr->td = nil;
+	ctlr->tb = nil;
+	ctlr->rd = nil;
+	ctlr->rb = nil;
+	ctlr->dtcc = nil;
+}
+
+/*
+ * Attach routine.  On first use it allocates the descriptor rings,
+ * block arrays and tally counter block and initialises the chip,
+ * all under alock; afterwards it waits up to 350 polls, 10ms
+ * apart, for the link and prints the PHY state.
+ * Raises Enomem if an allocation fails and Eio if rtl8169init
+ * rejects the chip; in both cases everything allocated here is
+ * released and ctlr->init stays 0.
+ */
+static void
+rtl8169attach(Ether* edev)
+{
+	int i, timeo;
+	Block *bp;
+	Ctlr *ctlr;
+	MiiPhy *phy;
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->alock);
+	if(ctlr->init == 0){
+		ctlr->td = mallocalign(sizeof(D)*Ntd, 256, 0, 0);
+		ctlr->tb = malloc(Ntd*sizeof(Block*));
+		ctlr->ntd = Ntd;
+		ctlr->rd = mallocalign(sizeof(D)*Nrd, 256, 0, 0);
+		ctlr->rb = malloc(Nrd*sizeof(Block*));
+		ctlr->nrd = Nrd;
+		ctlr->dtcc = mallocalign(sizeof(Dtcc), 64, 0, 0);
+		if(ctlr->td == nil || ctlr->tb == nil || ctlr->rd == nil
+		|| ctlr->rb == nil || ctlr->dtcc == nil){
+			rtl8169freerings(ctlr);
+			qunlock(&ctlr->alock);
+			error(Enomem);
+		}
+		if(rtl8169init(edev) < 0){
+			/*
+			 * init gives up with the chip halted and before
+			 * the ring addresses are written to it, so the
+			 * memory can go back; it may already have queued
+			 * receive blocks, which are released first.
+			 */
+			for(i = 0; i < Nrd; i++){
+				if((bp = ctlr->rb[i]) != nil){
+					ctlr->rb[i] = nil;
+					freeb(bp);
+				}
+			}
+			rtl8169freerings(ctlr);
+			qunlock(&ctlr->alock);
+			error(Eio);
+		}
+		ctlr->init = 1;
+	}
+	qunlock(&ctlr->alock);
+
+	/*
+	 * Wait for link to be ready.
+	 */
+	for(timeo = 0; timeo < 350; timeo++){
+		if(miistatus(ctlr->mii) == 0)
+			break;
+		tsleep(&up->sleep, return0, 0, 10);
+	}
+	phy = ctlr->mii->curphy;
+	print("%s: speed %d fd %d link %d rfc %d tfc %d\n",
+		edev->name, phy->speed, phy->fd, phy->link, phy->rfc, phy->tfc);
+}
+
+/*
+ * Refresh the link state from the PHY.  Does nothing when no PHY
+ * is attached.  When miistatus reports an error the link is marked
+ * down; otherwise it is marked up, edev->mbps follows a PHY speed
+ * of 10, 100 or 1000, and the output queue limit is set to 65K
+ * at 10Mb/s and 256K otherwise.
+ */
+static void
+rtl8169link(Ether* edev)
+{
+	Ctlr *ctlr;
+	Mii *mii;
+	MiiPhy *phy;
+	int limit;
+
+	ctlr = edev->ctlr;
+
+	/*
+	 * Maybe the link changed - do we care very much?
+	 * Could stall transmits if no link, maybe?
+	 */
+	mii = ctlr->mii;
+	if(mii == nil || mii->curphy == nil)
+		return;
+	phy = mii->curphy;
+
+	if(miistatus(mii) < 0){
+		iprint("%slink n: speed %d fd %d link %d rfc %d tfc %d\n",
+			edev->name, phy->speed, phy->fd, phy->link,
+			phy->rfc, phy->tfc);
+		edev->link = 0;
+		return;
+	}
+	edev->link = 1;
+
+	limit = 256*1024;
+	switch(phy->speed){
+	case 10:
+		edev->mbps = 10;
+		limit = 65*1024;
+		break;
+	case 100:
+		edev->mbps = 100;
+		break;
+	case 1000:
+		edev->mbps = 1000;
+		break;
+	}
+	iprint("%slink y: speed %d fd %d link %d rfc %d tfc %d\n",
+		edev->name, phy->speed, phy->fd, phy->link,
+		phy->rfc, phy->tfc);
+
+	if(edev->oq != nil)
+		qsetlimit(edev->oq, limit);
+}
+
+/*
+ * Transmit routine, run under tlock.  First reclaims descriptors
+ * from tdh onward that the chip no longer owns, freeing their
+ * blocks; then takes blocks from edev->oq and queues them from tdt
+ * onward while fewer than ntd-1 descriptors are outstanding.  If
+ * anything was queued the chip is told to poll the normal priority
+ * queue; if nothing was queued because the ring is full, txdu is
+ * counted.
+ */
+static void
+rtl8169transmit(Ether* edev)
+{
+	D *d;
+	Block *bp;
+	Ctlr *ctlr;
+	int control, x;
+
+	ctlr = edev->ctlr;
+
+	ilock(&ctlr->tlock);
+	for(x = ctlr->tdh; ctlr->ntq > 0; x = NEXT(x, ctlr->ntd)){
+		d = &ctlr->td[x];
+		if((control = d->control) & Own)
+			break;
+
+		/*
+		 * Check errors and log here.
+		 */
+		USED(control);
+
+		/*
+		 * Free it up.
+		 * Need to clean the descriptor here? Not really.
+		 * Simple freeb for now (no chain and freeblist).
+		 * Use ntq count for now.
+		 */
+		freeb(ctlr->tb[x]);
+		ctlr->tb[x] = nil;
+		/* keep only the end-of-ring marker */
+		d->control &= Eor;
+
+		ctlr->ntq--;
+	}
+	ctlr->tdh = x;
+
+	x = ctlr->tdt;
+	while(ctlr->ntq < (ctlr->ntd-1)){
+		if((bp = qget(edev->oq)) == nil)
+			break;
+
+		d = &ctlr->td[x];
+		d->addrlo = PCIWADDR(bp->rp);
+		d->addrhi = 0;
+		ctlr->tb[x] = bp;
+		/* address must be in place before ownership passes to the chip */
+		coherence();
+		d->control |= Own|Fs|Ls|((BLEN(bp)<<TxflSHIFT) & TxflMASK);
+
+		x = NEXT(x, ctlr->ntd);
+		ctlr->ntq++;
+	}
+	if(x != ctlr->tdt){
+		ctlr->tdt = x;
+		csr8w(ctlr, Tppoll, Npq);
+	}
+	else if(ctlr->ntq >= (ctlr->ntd-1))
+		ctlr->txdu++;
+
+	iunlock(&ctlr->tlock);
+}
+
+/*
+ * Receive routine.  Walks the ring from rdh until it meets a
+ * descriptor the chip still owns.  A descriptor holding a complete
+ * frame (Fs and Ls set, Res clear) has its block detached, trimmed
+ * to the reported length less 4 bytes, flagged according to the
+ * checksum status and passed to etheriq; anything else leaves the
+ * block attached for rtl8169replenish to reuse.  The ring is
+ * replenished whenever fewer than half the descriptors are out.
+ */
+static void
+rtl8169receive(Ether* edev)
+{
+	D *d;
+	int rdh;
+	Block *bp;
+	Ctlr *ctlr;
+	u32int control;
+
+	ctlr = edev->ctlr;
+
+	rdh = ctlr->rdh;
+	for(;;){
+		d = &ctlr->rd[rdh];
+
+		if(d->control & Own)
+			break;
+
+		control = d->control;
+		if((control & (Fs|Ls|Res)) == (Fs|Ls)){
+			bp = ctlr->rb[rdh];
+			ctlr->rb[rdh] = nil;
+			bp->wp = bp->rp + ((control & RxflMASK)>>RxflSHIFT)-4;
+			bp->next = nil;
+
+			if(control & Fovf)
+				ctlr->fovf++;
+			if(control & Mar)
+				ctlr->mcast++;
+
+			/* count a checksum failure, or mark the block as checked */
+			switch(control & (Pid1|Pid0)){
+			default:
+				break;
+			case Pid0:
+				if(control & Tcpf){
+					ctlr->tcpf++;
+					break;
+				}
+				bp->flag |= Btcpck;
+				break;
+			case Pid1:
+				if(control & Udpf){
+					ctlr->udpf++;
+					break;
+				}
+				bp->flag |= Budpck;
+				break;
+			case Pid1|Pid0:
+				if(control & Ipf){
+					ctlr->ipf++;
+					break;
+				}
+				bp->flag |= Bipck;
+				break;
+			}
+			etheriq(edev, bp, 1);
+		}
+		else{
+			/*
+			 * Error stuff here.
+			print("control %#8.8ux\n", control);
+			 */
+		}
+		/* keep only the end-of-ring marker */
+		d->control &= Eor;
+		ctlr->nrdfree--;
+		rdh = NEXT(rdh, ctlr->nrd);
+
+		/* NOTE(review): ctlr->rdh is only updated after the loop, so replenish sees the value from entry */
+		if(ctlr->nrdfree < ctlr->nrd/2)
+			rtl8169replenish(ctlr);
+	}
+	ctlr->rdh = rdh;
+}
+
+/*
+ * Interrupt handler.  Repeatedly reads Isr, acknowledges what it
+ * read by writing it back, and dispatches: receive causes to
+ * rtl8169receive (updating the error counters), transmit causes
+ * to rtl8169transmit, and Punlc to rtl8169link.  Stops when Isr
+ * reads 0 or 0xFFFF, or when none of the pending causes is in the
+ * driver's interrupt mask.  Panics if a known cause is left
+ * unhandled (Serr and Timeout have no handler here).
+ */
+static void
+rtl8169interrupt(Ureg*, void* arg)
+{
+	Ctlr *ctlr;
+	Ether *edev;
+	u32int isr;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+
+	while((isr = csr16r(ctlr, Isr)) != 0 && isr != 0xFFFF){
+		csr16w(ctlr, Isr, isr);
+		if((isr & ctlr->imr) == 0)
+			break;
+		if(isr & (Fovw|Punlc|Rdu|Rer|Rok)){
+			rtl8169receive(edev);
+			if(!(isr & (Punlc|Rok)))
+				ctlr->ierrs++;
+			if(isr & Rer)
+				ctlr->rer++;
+			if(isr & Rdu)
+				ctlr->rdu++;
+			if(isr & Punlc)
+				ctlr->punlc++;
+			if(isr & Fovw)
+				ctlr->fovw++;
+			/* Punlc is left set so the link check below still runs */
+			isr &= ~(Fovw|Rdu|Rer|Rok);
+		}
+
+		if(isr & (Tdu|Ter|Tok)){
+			rtl8169transmit(edev);
+			isr &= ~(Tdu|Ter|Tok);
+		}
+
+		if(isr & Punlc){
+			rtl8169link(edev);
+			isr &= ~Punlc;
+		}
+
+		/*
+		 * Some of the reserved bits get set sometimes...
+		 */
+		if(isr & (Serr|Timeout|Tdu|Fovw|Punlc|Rdu|Ter|Tok|Rer|Rok))
+			panic("rtl8169interrupt: imr %#4.4ux isr %#4.4ux\n",
+				csr16r(ctlr, Imr), isr);
+	}
+}
+
+/*
+ * Scan the PCI bus for supported adapters and append a Ctlr for
+ * each usable one to the rtl8169ctlrhead list.  A device is
+ * skipped if it is not one of the known variants, its I/O range
+ * cannot be claimed, no memory is available for its Ctlr, the
+ * soft reset times out, or no PHY can be attached.
+ */
+static void
+rtl8169pci(void)
+{
+	Pcidev *p;
+	Ctlr *ctlr;
+	int i, port, pcie;
+
+	p = nil;
+	while(p = pcimatch(p, 0, 0)){
+		if(p->ccrb != 0x02 || p->ccru != 0)
+			continue;
+
+		pcie = 0;
+		switch(i = ((p->did<<16)|p->vid)){
+		default:
+			continue;
+		case Rtl8100e:			/* RTL810[01]E ? */
+		case Rtl8168b:			/* RTL8168B */
+			pcie = 1;
+			break;
+		case Rtl8169c:			/* RTL8169C */
+		case Rtl8169sc:			/* RTL8169SC */
+		case Rtl8169:			/* RTL8169 */
+			break;
+		case (0xC107<<16)|0x1259:	/* Corega CG-LAPCIGT */
+			i = Rtl8169;
+			break;
+		}
+
+		port = p->mem[0].bar & ~0x01;
+		if(ioalloc(port, p->mem[0].size, 0, "rtl8169") < 0){
+			print("rtl8169: port %#ux in use\n", port);
+			continue;
+		}
+
+		ctlr = malloc(sizeof(Ctlr));
+		if(ctlr == nil){
+			/* no memory for this adapter: give the ports back and move on */
+			print("rtl8169: no memory for Ctlr\n");
+			iofree(port);
+			continue;
+		}
+		ctlr->port = port;
+		ctlr->pcidev = p;
+		ctlr->pciv = i;
+		ctlr->pcie = pcie;
+
+		/*
+		 * If the device is in a low-power state, wake it and
+		 * rewrite the configuration registers from the saved copy.
+		 */
+		if(pcigetpms(p) > 0){
+			pcisetpms(p, 0);
+
+			for(i = 0; i < 6; i++)
+				pcicfgw32(p, PciBAR0+i*4, p->mem[i].bar);
+			pcicfgw8(p, PciINTL, p->intl);
+			pcicfgw8(p, PciLTR, p->ltr);
+			pcicfgw8(p, PciCLS, p->cls);
+			pcicfgw16(p, PciPCR, p->pcr);
+		}
+
+		if(rtl8169reset(ctlr)){
+			iofree(port);
+			free(ctlr);
+			continue;
+		}
+
+		/*
+		 * Extract the chip hardware version,
+		 * needed to configure each properly.
+		 */
+		ctlr->macv = csr32r(ctlr, Tcr) & HwveridMASK;
+		if((ctlr->mii = rtl8169mii(ctlr)) == nil){
+			iofree(port);
+			free(ctlr);
+			continue;
+		}
+
+		pcisetbme(p);
+
+		if(rtl8169ctlrhead != nil)
+			rtl8169ctlrtail->next = ctlr;
+		else
+			rtl8169ctlrhead = ctlr;
+		rtl8169ctlrtail = ctlr;
+	}
+}
+
+/*
+ * Probe routine registered with addethercard.  Scans the bus on
+ * first use, claims the first inactive controller whose port
+ * matches edev->port (any controller when edev->port is 0), fills
+ * in the Ether's hardware parameters and entry points, and reads
+ * the station address from the chip unless one was already
+ * supplied.  Returns 0 on success, -1 if no controller is free.
+ */
+static int
+rtl8169pnp(Ether* edev)
+{
+	int i;
+	u32int r;
+	Ctlr *ctlr;
+	uchar zero[Eaddrlen];
+
+	if(rtl8169ctlrhead == nil)
+		rtl8169pci();
+
+	/*
+	 * Any adapter matches if no edev->port is supplied,
+	 * otherwise the ports must match.
+	 */
+	ctlr = rtl8169ctlrhead;
+	while(ctlr != nil){
+		if(!ctlr->active
+		&& (edev->port == 0 || edev->port == ctlr->port)){
+			ctlr->active = 1;
+			break;
+		}
+		ctlr = ctlr->next;
+	}
+	if(ctlr == nil)
+		return -1;
+
+	edev->ctlr = ctlr;
+	edev->port = ctlr->port;
+	edev->irq = ctlr->pcidev->intl;
+	edev->tbdf = ctlr->pcidev->tbdf;
+	edev->mbps = 100;
+
+	/*
+	 * An all-zero edev->ea means no override was given:
+	 * take the station address from the Idr registers.
+	 */
+	memset(zero, 0, Eaddrlen);
+	if(memcmp(zero, edev->ea, Eaddrlen) == 0){
+		r = csr32r(ctlr, Idr0);
+		for(i = 0; i < 4; i++)
+			edev->ea[i] = r>>(8*i);
+		r = csr32r(ctlr, Idr0+4);
+		edev->ea[4] = r;
+		edev->ea[5] = r>>8;
+	}
+
+	edev->attach = rtl8169attach;
+	edev->transmit = rtl8169transmit;
+	edev->interrupt = rtl8169interrupt;
+	edev->ifstat = rtl8169ifstat;
+
+	edev->arg = edev;
+	edev->promiscuous = rtl8169promiscuous;
+	edev->multicast = rtl8169multicast;
+//	edev->shutdown = rtl8169shutdown;
+
+	rtl8169link(edev);
+
+	return 0;
+}
+
+/*
+ * Register the driver's probe routine under the name "rtl8169".
+ */
+void
+ether8169link(void)
+{
+	addethercard("rtl8169", rtl8169pnp);
+}

+ 1342 - 0
sys/src/9/386/ether82557.c

@@ -0,0 +1,1342 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Intel 82557 Fast Ethernet PCI Bus LAN Controller
+ * as found on the Intel EtherExpress PRO/100B. This chip is full
+ * of smarts, unfortunately they're not all in the right place.
+ * To do:
+ *	the PCI scanning code could be made common to other adapters;
+ *	auto-negotiation, full-duplex;
+ *	optionally use memory-mapped registers;
+ *	detach for PCI reset problems (also towards loadable drivers).
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "../port/netif.h"
+
+#include "etherif.h"
+#include "io.h"
+
+/*
+ * Ring sizes and the value the driver stores in link/rbd fields
+ * that point nowhere.
+ */
+enum {
+	Nrfd		= 64,		/* receive frame area */
+	Ncb		= 64,		/* maximum control blocks queued */
+
+	NullPointer	= 0xFFFFFFFF,	/* 82557 NULL pointer */
+};
+
+/*
+ * Register offsets, added to the controller's I/O port base
+ * by the csr*r/csr*w macros.
+ */
+enum {					/* CSR */
+	Status		= 0x00,		/* byte or word (word includes Ack) */
+	Ack		= 0x01,		/* byte */
+	CommandR	= 0x02,		/* byte or word (word includes Interrupt) */
+	Interrupt	= 0x03,		/* byte */
+	General		= 0x04,		/* dword */
+	Port		= 0x08,		/* dword */
+	Fcr		= 0x0C,		/* Flash control register */
+	Ecr		= 0x0E,		/* EEPROM control register */
+	Mcr		= 0x10,		/* MDI control register */
+	Gstatus		= 0x1D,		/* General status register */
+};
+
+/*
+ * Bits of the 16-bit Status word.  interrupt() reads the word and
+ * writes its high byte (the Stat* bits) back to Ack.
+ * Note that StatCX and StatTNO share the same bit value.
+ */
+enum {					/* Status */
+	RUidle		= 0x0000,
+	RUsuspended	= 0x0004,
+	RUnoresources	= 0x0008,
+	RUready		= 0x0010,
+	RUrbd		= 0x0020,	/* bit */
+	RUstatus	= 0x003F,	/* mask */
+
+	CUidle		= 0x0000,
+	CUsuspended	= 0x0040,
+	CUactive	= 0x0080,
+	CUstatus	= 0x00C0,	/* mask */
+
+	StatSWI		= 0x0400,	/* SoftWare generated Interrupt */
+	StatMDI		= 0x0800,	/* MDI r/w done */
+	StatRNR		= 0x1000,	/* Receive unit Not Ready */
+	StatCNA		= 0x2000,	/* Command unit Not Active (Active->Idle) */
+	StatFR		= 0x4000,	/* Finished Receiving */
+	StatCX		= 0x8000,	/* Command eXecuted */
+	StatTNO		= 0x8000,	/* Transmit NOT OK */
+};
+
+/*
+ * Values written to CommandR by command().  For CUstart, LoadDCA,
+ * LoadCUB, RUstart, LoadHDS and LoadRUB, command() first writes its
+ * argument to the General register; the others take no argument.
+ */
+enum {					/* Command (byte) */
+	CUnop		= 0x00,
+	CUstart		= 0x10,
+	CUresume	= 0x20,
+	LoadDCA		= 0x40,		/* Load Dump Counters Address */
+	DumpSC		= 0x50,		/* Dump Statistical Counters */
+	LoadCUB		= 0x60,		/* Load CU Base */
+	ResetSA		= 0x70,		/* Dump and Reset Statistical Counters */
+
+	RUstart		= 0x01,
+	RUresume	= 0x02,
+	RUabort		= 0x04,
+	LoadHDS		= 0x05,		/* Load Header Data Size */
+	LoadRUB		= 0x06,		/* Load RU Base */
+	RBDresume	= 0x07,		/* Resume frame reception */
+};
+
+/*
+ * Bits of the Interrupt byte.  reset() and shutdown() write
+ * InterruptM; attach() writes 0.
+ */
+enum {					/* Interrupt (byte) */
+	InterruptM	= 0x01,		/* interrupt Mask */
+	InterruptSI	= 0x02,		/* Software generated Interrupt */
+};
+
+/*
+ * EEsk, EEcs, EEdi and EEdo are bits of the Ecr register.
+ * EEstart and EEread are not register bits: hy93c46r() ORs them
+ * into the 3-bit opcode it clocks out to the EEPROM.
+ */
+enum {					/* Ecr */
+	EEsk		= 0x01,		/* serial clock */
+	EEcs		= 0x02,		/* chip select */
+	EEdi		= 0x04,		/* serial data in */
+	EEdo		= 0x08,		/* serial data out */
+
+	EEstart		= 0x04,		/* start bit */
+	EEread		= 0x02,		/* read opcode */
+};
+
+/*
+ * Bits of the Mcr register.  miir() and miiw() combine an opcode with
+ * the PHY address shifted left by 21, the register number shifted left
+ * by 16 and, for writes, 16 bits of data, then poll for MDIready.
+ */
+enum {					/* Mcr */
+	MDIread		= 0x08000000,	/* read opcode */
+	MDIwrite	= 0x04000000,	/* write opcode */
+	MDIready	= 0x10000000,	/* ready bit */
+	MDIie		= 0x20000000,	/* interrupt enable */
+};
+
+/*
+ * Receive frame descriptor, laid out at the start (rp) of each Block
+ * on the receive ring; see rfdalloc() and receive().
+ * The controller is given the physical address of this structure
+ * (PADDR(bp->rp)), so the field layout is presumably fixed by the
+ * hardware.
+ * NOTE(review): that assumes int/ulong are 32 bits and ushort 16 bits
+ * on the target - confirm.
+ */
+typedef struct Rfd {
+	int	field;			/* Rfd* status and control bits */
+	ulong	link;			/* physical address of the next Rfd */
+	ulong	rbd;			/* set to NullPointer by rfdalloc() */
+	ushort	count;			/* low 14 bits: received byte count; RfdF, RfdEOF */
+	ushort	size;			/* set to sizeof(Etherpkt) by rfdalloc() */
+
+	uchar	data[1700];		/* received frame */
+} Rfd;
+
+/*
+ * Bits of Rfd.field.  receive() tests RfdC and RfdOK; the ring code
+ * sets and clears RfdS on the tail descriptor.
+ */
+enum {					/* field */
+	RfdCollision	= 0x00000001,
+	RfdIA		= 0x00000002,	/* IA match */
+	RfdRxerr	= 0x00000010,	/* PHY character error */
+	RfdType		= 0x00000020,	/* Type frame */
+	RfdRunt		= 0x00000080,
+	RfdOverrun	= 0x00000100,
+	RfdBuffer	= 0x00000200,
+	RfdAlignment	= 0x00000400,
+	RfdCRC		= 0x00000800,
+
+	RfdOK		= 0x00002000,	/* frame received OK */
+	RfdC		= 0x00008000,	/* reception Complete */
+	RfdSF		= 0x00080000,	/* Simplified or Flexible (1) Rfd */
+	RfdH		= 0x00100000,	/* Header RFD */
+
+	RfdI		= 0x20000000,	/* Interrupt after completion */
+	RfdS		= 0x40000000,	/* Suspend after completion */
+	RfdEL		= 0x80000000,	/* End of List */
+};
+
+/*
+ * Flag bits in the top of Rfd.count; receive() masks them off
+ * with 0x3FFF to get the byte count.
+ */
+enum {					/* count */
+	RfdF		= 0x4000,
+	RfdEOF		= 0x8000,
+};
+
+/*
+ * Command block; ctlrinit() allocates Ctlr.ncb of them and links them
+ * into a ring.  The controller is given physical addresses into this
+ * structure (link, tbd), so the leading fields are presumably laid out
+ * as the hardware expects; bp and next are used only by the driver.
+ * NOTE(review): assumes ushort is 16 bits and ulong 32 bits on the
+ * target - confirm.
+ */
+typedef struct Cb Cb;
+typedef struct Cb {
+	ushort	status;			/* CbC, CbOK, CbU; checked by interrupt() */
+	ushort	command;		/* action plus CbS/CbSF/... flags */
+	ulong	link;			/* physical address of the next Cb */
+	union {
+		uchar	data[24];	/* CbIAS + CbConfigure */
+		struct {		/* CbTransmit, filled in by txstart() */
+			ulong	tbd;	/* physical address of tba below */
+			ushort	count;
+			uchar	threshold;
+			uchar	number;	/* txstart() always sets 1 */
+
+			ulong	tba;	/* physical address of the packet data */
+			ushort	tbasz;	/* BLEN of the packet */
+			ushort	pad;
+		};
+	};
+
+	Block*	bp;			/* packet being sent; freed by interrupt() */
+	Cb*	next;			/* next Cb in the ring */
+} Cb;
+
+/*
+ * Command block bits.  CbU, CbOK and CbC are tested against Cb.status
+ * (see interrupt() and ctlrinit()); the rest are used in Cb.command,
+ * which is why some values appear twice.
+ */
+enum {					/* action command */
+	CbU		= 0x1000,	/* transmit underrun */
+	CbOK		= 0x2000,	/* DMA completed OK */
+	CbC		= 0x8000,	/* execution Complete */
+
+	CbNOP		= 0x0000,
+	CbIAS		= 0x0001,	/* Individual Address Setup */
+	CbConfigure	= 0x0002,
+	CbMAS		= 0x0003,	/* Multicast Address Setup */
+	CbTransmit	= 0x0004,
+	CbDump		= 0x0006,
+	CbDiagnose	= 0x0007,
+	CbCommand	= 0x0007,	/* mask */
+
+	CbSF		= 0x0008,	/* Flexible-mode CbTransmit */
+
+	CbI		= 0x2000,	/* Interrupt after completion */
+	CbS		= 0x4000,	/* Suspend after completion */
+	CbEL		= 0x8000,	/* End of List */
+};
+
+enum {					/* CbTransmit count */
+	CbEOF		= 0x8000,
+};
+
+/*
+ * Per-controller state.  One is allocated by i82557pci() for each
+ * matching PCI device and chained on ctlrhead/ctlrtail.
+ */
+typedef struct Ctlr Ctlr;
+typedef struct Ctlr {
+	Lock	slock;			/* attach */
+	int	state;			/* set to 1 by attach() once the receiver is started */
+
+	int	port;			/* I/O port base of the registers */
+	Pcidev*	pcidev;
+	Ctlr*	next;			/* next controller found by i82557pci() */
+	int	active;			/* claimed by reset() */
+
+	int	eepromsz;		/* address size in bits */
+	ushort*	eeprom;			/* 1<<eepromsz words, allocated by hy93c46r() */
+
+	Lock	miilock;		/* held across an Mcr access in miir()/miiw() */
+
+	int	tick;			/* bumped by watchdog(), zeroed by interrupt() on receive activity */
+
+	Lock	rlock;			/* registers */
+	int	command;		/* last command issued */
+
+	Block*	rfdhead;		/* receive side */
+	Block*	rfdtail;
+	int	nrfd;
+
+	Lock	cblock;			/* transmit side */
+	int	action;			/* pending CbConfigure/CbIAS/CbMAS for txstart(), else 0 */
+	int	nop;			/* issue CUnop before CUresume in txstart() */
+	uchar	configdata[24];		/* working copy of the configure bytes */
+	int	threshold;		/* copied into each transmit Cb; raised on underrun */
+	int	ncb;			/* number of Cbs in the cbr ring */
+	Cb*	cbr;			/* the ring itself */
+	Cb*	cbhead;			/* last Cb handed to the controller */
+	Cb*	cbtail;			/* oldest Cb not yet reclaimed */
+	int	cbq;			/* Cbs currently queued */
+	int	cbqmax;			/* largest cbq since the last ifstat() */
+	int	cbqmaxhw;		/* largest cbqmax ever seen */
+
+	Lock	dlock;			/* dump statistical counters */
+	ulong	dump[17];		/* 16 counters, then the completion status word */
+} Ctlr;
+
+/* List of controllers found by i82557pci(), in discovery order. */
+static Ctlr* ctlrhead;
+static Ctlr* ctlrtail;
+
+/*
+ * Template for the configure command.  ctlrinit() copies it into each
+ * Ctlr; reset() and configure() then adjust individual bytes of that
+ * copy.  22 bytes are initialised here, matching the byte count in
+ * element 0; the remaining two array elements are zero.
+ */
+static uchar configdata[24] = {
+	0x16,				/* byte count */
+	0x08,				/* Rx/Tx FIFO limit */
+	0x00,				/* adaptive IFS */
+	0x00,
+	0x00,				/* Rx DMA maximum byte count */
+//	0x80,				/* Tx DMA maximum byte count */
+	0x00,				/* Tx DMA maximum byte count */
+	0x32,				/* !late SCB, CNA interrupts */
+	0x03,				/* discard short Rx frames */
+	0x00,				/* 503/MII */
+
+	0x00,	
+	0x2E,				/* normal operation, NSAI */
+	0x00,				/* linear priority */
+	0x60,				/* inter-frame spacing */
+	0x00,	
+	0xF2,	
+	0xC8,				/* 503, promiscuous mode off */
+	0x00,	
+	0x40,	
+	0xF3,				/* transmit padding enable */
+	0x80,				/* full duplex pin enable */
+	0x3F,				/* no Multi IA */
+	0x05,				/* no Multi Cast ALL */
+};
+
+/*
+ * Register access through I/O ports: c is the Ctlr, r a register
+ * offset from the CSR enum.
+ */
+#define csr8r(c, r)	(inb((c)->port+(r)))
+#define csr16r(c, r)	(ins((c)->port+(r)))
+#define csr32r(c, r)	(inl((c)->port+(r)))
+#define csr8w(c, r, b)	(outb((c)->port+(r), (int)(b)))
+#define csr16w(c, r, w)	(outs((c)->port+(r), (ushort)(w)))
+#define csr32w(c, r, l)	(outl((c)->port+(r), (ulong)(l)))
+
+/*
+ * Issue command c to the controller, with argument v for the commands
+ * that take one.  The register lock is held for the duration.
+ * If the command register does not read back as zero within 100
+ * polls, ctlr->command is set to -1, a message is printed and the
+ * command is not issued.
+ */
+static void
+command(Ctlr* ctlr, int c, int v)
+{
+	int tries;
+
+	ilock(&ctlr->rlock);
+
+	/*
+	 * Wait for any previous command to complete.
+	 * Back-to-back CUresume is the one case that could skip the
+	 * wait, and it should be the common case, but a chip erratum
+	 * means such CUresumes can be lost, so always wait.
+	 */
+	tries = 0;
+	while(tries < 100 && csr8r(ctlr, CommandR) != 0){
+		microdelay(1);
+		tries++;
+	}
+	if(tries >= 100){
+		ctlr->command = -1;
+		iunlock(&ctlr->rlock);
+		iprint("i82557: command %#ux %#ux timeout\n", c, v);
+		return;
+	}
+
+	/*
+	 * Only these commands take an argument in the General register;
+	 * CUnop, CUresume, DumpSC, ResetSA, RUresume, RUabort and
+	 * anything else are written to the command register as is.
+	 */
+	if(c == CUstart || c == LoadDCA || c == LoadCUB
+	|| c == RUstart || c == LoadHDS || c == LoadRUB)
+		csr32w(ctlr, General, v);
+
+	csr8w(ctlr, CommandR, c);
+	ctlr->command = c;
+
+	iunlock(&ctlr->rlock);
+}
+
+/*
+ * Allocate a Block large enough for an Rfd and initialise the
+ * descriptor at its start: no status, the given link, no receive
+ * buffer descriptor and room for one Etherpkt.
+ * Returns nil if iallocb() fails.
+ */
+static Block*
+rfdalloc(ulong link)
+{
+	Block *b;
+	Rfd *r;
+
+	b = iallocb(sizeof(Rfd));
+	if(b == nil)
+		return nil;
+
+	r = (Rfd*)b->rp;
+	r->field = 0;
+	r->link = link;
+	r->rbd = NullPointer;
+	r->count = 0;
+	r->size = sizeof(Etherpkt);
+
+	return b;
+}
+
+/*
+ * txstart() is defined further down.  ISO C does not allow a function
+ * declared at block scope to be `static', so the prototype is given
+ * here at file scope instead of inside watchdog().
+ */
+static void txstart(Ether*);
+
+/*
+ * Kernel process started by attach() for the receive lockup errata.
+ * After each tsleep() with a timeout of 4000 it bumps ctlr->tick; if
+ * tick was already non-zero, i.e. interrupt() has not cleared it since
+ * the previous pass, a multicast address setup command (CbMAS) is
+ * queued through txstart().
+ * arg is the Ether the process was started for.
+ */
+static void
+watchdog(void* arg)
+{
+	Ether *ether;
+	Ctlr *ctlr;
+
+	ether = arg;
+	for(;;){
+		tsleep(&up->sleep, return0, 0, 4000);
+
+		/*
+		 * Hmmm. This doesn't seem right. Currently
+		 * the device can't be disabled but it may be in
+		 * the future.
+		 */
+		ctlr = ether->ctlr;
+		if(ctlr == nil || ctlr->state == 0){
+			print("%s: exiting\n", up->text);
+			pexit("disabled", 0);
+		}
+
+		ilock(&ctlr->cblock);
+		if(ctlr->tick++){
+			ctlr->action = CbMAS;
+			txstart(ether);
+		}
+		iunlock(&ctlr->cblock);
+	}
+}
+
+/*
+ * Attach entry point.  The first call (state == 0) clears the
+ * Interrupt byte, starts the receive unit on the head of the receive
+ * ring and marks the controller as running; later calls do nothing.
+ */
+static void
+attach(Ether* ether)
+{
+	Ctlr *ctlr;
+	char name[KNAMELEN];
+
+	ctlr = ether->ctlr;
+	lock(&ctlr->slock);
+	if(ctlr->state != 0){
+		unlock(&ctlr->slock);
+		return;
+	}
+
+	ilock(&ctlr->rlock);
+	csr8w(ctlr, Interrupt, 0);
+	iunlock(&ctlr->rlock);
+	command(ctlr, RUstart, PADDR(ctlr->rfdhead->rp));
+	ctlr->state = 1;
+
+	/*
+	 * The watchdog for the receive lockup errata is needed
+	 * unless the EEPROM compatibility word (word 3) has both
+	 * of its low two bits set.
+	 */
+	if((ctlr->eeprom[0x03] & 0x0003) != 0x0003){
+		snprint(name, sizeof name, "#l%dwatchdog", ether->ctlrno);
+		kproc(name, watchdog, ether);
+	}
+	unlock(&ctlr->slock);
+}
+
+/*
+ * miir() is defined further down.  ISO C does not allow a function
+ * declared at block scope to be `static', so the prototype is given
+ * here at file scope instead of inside ifstat().
+ */
+static int miir(Ctlr*, int, int);
+
+/*
+ * Produce the interface statistics.
+ *
+ * A DumpSC command is issued and the resulting counters are folded
+ * into the error totals kept in ether.  If n is 0 only those totals
+ * are refreshed and 0 is returned.  Otherwise a text report is built
+ * (counters, driver bookkeeping, the EEPROM contents and, when the
+ * EEPROM names a PHY, its first six MII registers) and handed to
+ * readstr() together with offset, a and n; the result of readstr()
+ * is returned.
+ *
+ * Raises Enomem through error() if the report buffer cannot be
+ * allocated.
+ */
+static long
+ifstat(Ether* ether, void* a, long n, ulong offset)
+{
+	char *p;
+	int i, len, phyaddr;
+	Ctlr *ctlr;
+	ulong dump[17];
+
+	ctlr = ether->ctlr;
+	lock(&ctlr->dlock);
+
+	/*
+	 * Start the command then
+	 * wait for completion status,
+	 * should be 0xA005.
+	 * NOTE(review): the wait is unbounded and done with dlock held;
+	 * command() gives up silently on a timeout, in which case
+	 * nothing visible here ends the loop.
+	 */
+	ctlr->dump[16] = 0;
+	command(ctlr, DumpSC, 0);
+	while(ctlr->dump[16] == 0)
+		;
+
+	ether->oerrs = ctlr->dump[1]+ctlr->dump[2]+ctlr->dump[3];
+	ether->crcs = ctlr->dump[10];
+	ether->frames = ctlr->dump[11];
+	ether->buffs = ctlr->dump[12]+ctlr->dump[15];
+	ether->overflows = ctlr->dump[13];
+
+	if(n == 0){
+		unlock(&ctlr->dlock);
+		return 0;
+	}
+
+	/* work from a private copy so the lock need not be held while formatting */
+	memmove(dump, ctlr->dump, sizeof(dump));
+	unlock(&ctlr->dlock);
+
+	p = malloc(READSTR);
+	if(p == nil)
+		error(Enomem);
+	len = snprint(p, READSTR, "transmit good frames: %lud\n", dump[0]);
+	len += snprint(p+len, READSTR-len, "transmit maximum collisions errors: %lud\n", dump[1]);
+	len += snprint(p+len, READSTR-len, "transmit late collisions errors: %lud\n", dump[2]);
+	len += snprint(p+len, READSTR-len, "transmit underrun errors: %lud\n", dump[3]);
+	len += snprint(p+len, READSTR-len, "transmit lost carrier sense: %lud\n", dump[4]);
+	len += snprint(p+len, READSTR-len, "transmit deferred: %lud\n", dump[5]);
+	len += snprint(p+len, READSTR-len, "transmit single collisions: %lud\n", dump[6]);
+	len += snprint(p+len, READSTR-len, "transmit multiple collisions: %lud\n", dump[7]);
+	len += snprint(p+len, READSTR-len, "transmit total collisions: %lud\n", dump[8]);
+	len += snprint(p+len, READSTR-len, "receive good frames: %lud\n", dump[9]);
+	len += snprint(p+len, READSTR-len, "receive CRC errors: %lud\n", dump[10]);
+	len += snprint(p+len, READSTR-len, "receive alignment errors: %lud\n", dump[11]);
+	len += snprint(p+len, READSTR-len, "receive resource errors: %lud\n", dump[12]);
+	len += snprint(p+len, READSTR-len, "receive overrun errors: %lud\n", dump[13]);
+	len += snprint(p+len, READSTR-len, "receive collision detect errors: %lud\n", dump[14]);
+	len += snprint(p+len, READSTR-len, "receive short frame errors: %lud\n", dump[15]);
+	len += snprint(p+len, READSTR-len, "nop: %d\n", ctlr->nop);
+	/* cbqmax is a high-water mark since the previous report */
+	if(ctlr->cbqmax > ctlr->cbqmaxhw)
+		ctlr->cbqmaxhw = ctlr->cbqmax;
+	len += snprint(p+len, READSTR-len, "cbqmax: %d\n", ctlr->cbqmax);
+	ctlr->cbqmax = 0;
+	len += snprint(p+len, READSTR-len, "threshold: %d\n", ctlr->threshold);
+
+	/* EEPROM contents, eight words to a line */
+	len += snprint(p+len, READSTR-len, "eeprom:");
+	for(i = 0; i < (1<<ctlr->eepromsz); i++){
+		if(i && ((i & 0x07) == 0))
+			len += snprint(p+len, READSTR-len, "\n       ");
+		len += snprint(p+len, READSTR-len, " %4.4ux", ctlr->eeprom[i]);
+	}
+
+	/* same PHY-present test as in reset() */
+	if((ctlr->eeprom[6] & 0x1F00) && !(ctlr->eeprom[6] & 0x8000)){
+		phyaddr = ctlr->eeprom[6] & 0x00FF;
+		len += snprint(p+len, READSTR-len, "\nphy %2d:", phyaddr);
+		for(i = 0; i < 6; i++)
+			len += snprint(p+len, READSTR-len, " %4.4ux",
+				miir(ctlr, phyaddr, i));
+	}
+
+	snprint(p+len, READSTR-len, "\n");
+	n = readstr(offset, a, n, p);
+	free(p);
+
+	return n;
+}
+
+/*
+ * Fill free command blocks and kick the command unit.
+ * Every caller in this file holds ctlr->cblock around the call.
+ *
+ * While there is room in the ring (one Cb is always left unused), the
+ * Cb after cbhead is set up either for a pending action
+ * (CbConfigure, CbIAS or CbMAS, which is then cleared) or, if no
+ * action is pending, for the next packet on ether->oq.  The new Cb is
+ * created with the suspend bit set; only once it is complete is the
+ * suspend bit removed from the previous head.
+ * A CUresume is always issued at the end, even if nothing was queued.
+ */
+static void
+txstart(Ether* ether)
+{
+	Ctlr *ctlr;
+	Block *bp;
+	Cb *cb;
+
+	ctlr = ether->ctlr;
+	while(ctlr->cbq < (ctlr->ncb-1)){
+		cb = ctlr->cbhead->next;
+		if(ctlr->action == 0){
+			bp = qget(ether->oq);
+			if(bp == nil)
+				break;
+
+			/* single buffer: the descriptor is the tba/tbasz pair inside the Cb */
+			cb->command = CbS|CbSF|CbTransmit;
+			cb->tbd = PADDR(&cb->tba);
+			cb->count = 0;
+			cb->threshold = ctlr->threshold;
+			cb->number = 1;
+			cb->tba = PADDR(bp->rp);
+			cb->bp = bp;
+			cb->tbasz = BLEN(bp);
+		}
+		else if(ctlr->action == CbConfigure){
+			cb->command = CbS|CbConfigure;
+			memmove(cb->data, ctlr->configdata, sizeof(ctlr->configdata));
+			ctlr->action = 0;
+		}
+		else if(ctlr->action == CbIAS){
+			cb->command = CbS|CbIAS;
+			memmove(cb->data, ether->ea, Eaddrlen);
+			ctlr->action = 0;
+		}
+		else if(ctlr->action == CbMAS){
+			cb->command = CbS|CbMAS;
+			memset(cb->data, 0, sizeof(cb->data));
+			ctlr->action = 0;
+		}
+		else{
+			print("#l%d: action %#ux\n", ether->ctlrno, ctlr->action);
+			ctlr->action = 0;
+			break;
+		}
+		cb->status = 0;
+
+		/* the new Cb must be fully written before the old head stops suspending */
+		coherence();
+		ctlr->cbhead->command &= ~CbS;
+		ctlr->cbhead = cb;
+		ctlr->cbq++;
+	}
+
+	/*
+	 * Workaround for some broken HUB chips
+	 * when connected at 10Mb/s half-duplex.
+	 */
+	if(ctlr->nop){
+		command(ctlr, CUnop, 0);
+		microdelay(1);
+	}
+	command(ctlr, CUresume, 0);
+
+	/* high-water mark reported and reset by ifstat() */
+	if(ctlr->cbq > ctlr->cbqmax)
+		ctlr->cbqmax = ctlr->cbq;
+}
+
+/*
+ * Switch the controller's working configuration into or out of
+ * promiscuous mode and queue a configure command to load it.
+ * Bit 0x40 of byte 6 is left as it is; the code that would toggle it
+ * is disabled.
+ */
+static void
+configure(Ether* ether, int promiscuous)
+{
+	Ctlr *ctlr;
+	uchar *cfg;
+
+	ctlr = ether->ctlr;
+	cfg = ctlr->configdata;
+
+	ilock(&ctlr->cblock);
+	if(!promiscuous){
+		cfg[6] &= ~0x80;
+		cfg[7] |= 0x01;
+		cfg[15] &= ~0x01;
+		cfg[18] |= 0x01;		/* 0x03? */
+		cfg[21] &= ~0x08;
+	}
+	else{
+		cfg[6] |= 0x80;			/* Save Bad Frames */
+		cfg[7] &= ~0x01;		/* !Discard Short Rx Frames */
+		cfg[15] |= 0x01;		/* Promiscuous mode */
+		cfg[18] &= ~0x01;		/* (!Padding enable?), !stripping enable */
+		cfg[21] |= 0x08;		/* Multi Cast ALL */
+	}
+	ctlr->action = CbConfigure;
+	txstart(ether);
+	iunlock(&ctlr->cblock);
+}
+
+/*
+ * Promiscuous-mode entry point; arg is the Ether (reset() sets
+ * ether->arg to ether) and on is passed straight to configure().
+ */
+static void
+promiscuous(void* arg, int on)
+{
+	configure(arg, on);
+}
+
+/*
+ * Multicast entry point.  There is no per-address filtering: adding
+ * any address turns promiscuous mode on through configure(), and
+ * removing an address does nothing, so the mode is never turned back
+ * off from here.
+ */
+static void
+multicast(void* ether, uchar *addr, int add)
+{
+	USED(addr);
+	/*
+	 * TODO: if (add) add addr to list of mcast addrs in controller
+	 *	else remove addr from list of mcast addrs in controller
+	 * enable multicast input (see CbMAS) instead of promiscuous mode.
+	 */
+	if (add)
+		configure(ether, 1);
+}
+
+/*
+ * Transmit entry point: run txstart() with the transmit-side
+ * lock held.
+ */
+static void
+transmit(Ether* ether)
+{
+	Ctlr *ctlr;
+
+	ctlr = ether->ctlr;
+	ilock(&ctlr->cblock);
+	txstart(ether);
+	iunlock(&ctlr->cblock);
+}
+
+/*
+ * Drain completed receive frame descriptors, starting at the ring
+ * head, until one is found without RfdC set.  Good frames are passed
+ * to etheriq(); every descriptor visited is returned to the ring,
+ * either reused in place or replaced by a freshly allocated one.
+ * Called from interrupt() on StatFR.
+ */
+static void
+receive(Ether* ether)
+{
+	Rfd *rfd;
+	Ctlr *ctlr;
+	int count;
+	Block *bp, *pbp, *xbp;
+
+	ctlr = ether->ctlr;
+	bp = ctlr->rfdhead;
+	for(rfd = (Rfd*)bp->rp; rfd->field & RfdC; rfd = (Rfd*)bp->rp){
+		/*
+		 * If it's an OK receive frame
+		 * 1) save the count
+		 * 2) if it's small, try to allocate a block and copy
+		 *    the data, then adjust the necessary fields for reuse;
+		 * 3) if it's big, try to allocate a new Rfd and if
+		 *    successful
+		 *	adjust the received buffer pointers for the
+		 *	  actual data received;
+		 *	initialise the replacement buffer to point to
+		 *	  the next in the ring;
+		 *	initialise bp to point to the replacement;
+		 * 4) if neither allocation succeeds, drop the frame and
+		 *    reset the descriptor for reuse;
+		 * 5) if there's a good packet, pass it on for disposal.
+		 */
+		if(rfd->field & RfdOK){
+			pbp = nil;
+			count = rfd->count & 0x3FFF;
+			if((count < ETHERMAXTU/4) && (pbp = iallocb(count))){
+				memmove(pbp->rp, bp->rp+offsetof(Rfd, data[0]), count);
+				pbp->wp = pbp->rp + count;
+
+				rfd->count = 0;
+				rfd->field = 0;
+			}
+			else if(xbp = rfdalloc(rfd->link)){
+				bp->rp += offsetof(Rfd, data[0]);
+				bp->wp = bp->rp + count;
+
+				xbp->next = bp->next;
+				bp->next = 0;
+
+				pbp = bp;
+				bp = xbp;
+			}
+			else{
+				/*
+				 * Out of memory: the frame is lost.  The
+				 * descriptor stays on the ring, so it must
+				 * not keep RfdC set or it would be taken
+				 * for a new completion the next time round.
+				 */
+				rfd->count = 0;
+				rfd->field = 0;
+			}
+			if(pbp != nil)
+				etheriq(ether, pbp, 1);
+		}
+		else{
+			rfd->count = 0;
+			rfd->field = 0;
+		}
+
+		/*
+		 * The ring tail pointer follows the head with one
+		 * unused buffer in between to defeat hardware prefetch;
+		 * once the tail pointer has been bumped on to the next
+		 * and the new tail has the Suspend bit set, it can be
+		 * removed from the old tail buffer.
+		 * As a replacement for the current head buffer may have
+		 * been allocated above, ensure that the new tail points
+		 * to it (next and link).
+		 */
+		rfd = (Rfd*)ctlr->rfdtail->rp;
+		ctlr->rfdtail = ctlr->rfdtail->next;
+		ctlr->rfdtail->next = bp;
+		((Rfd*)ctlr->rfdtail->rp)->link = PADDR(bp->rp);
+		((Rfd*)ctlr->rfdtail->rp)->field |= RfdS;
+		coherence();
+		rfd->field &= ~RfdS;
+
+		/*
+		 * Finally done with the current (possibly replaced)
+		 * head, move on to the next and maintain the sentinel
+		 * between tail and head.
+		 */
+		ctlr->rfdhead = bp->next;
+		bp = ctlr->rfdhead;
+	}
+}
+
+/*
+ * Interrupt entry point; arg is the Ether.
+ * Repeatedly reads the Status word and acknowledges its high byte
+ * until none of the interrupt causes is set.  Finished receives are
+ * handed to receive(), a not-ready receive unit is resumed, and on
+ * command-unit-not-active the completed command blocks are reclaimed
+ * and txstart() is run.  Any cause left over after that (StatCX,
+ * StatMDI, StatSWI) is treated as fatal.
+ * The register argument is not used; it is named because a function
+ * definition with an unnamed parameter is not valid C before C23.
+ */
+static void
+interrupt(Ureg* ureg, void* arg)
+{
+	Cb* cb;
+	Ctlr *ctlr;
+	Ether *ether;
+	int status;
+
+	USED(ureg);
+	ether = arg;
+	ctlr = ether->ctlr;
+
+	for(;;){
+		ilock(&ctlr->rlock);
+		status = csr16r(ctlr, Status);
+		csr8w(ctlr, Ack, (status>>8) & 0xFF);
+		iunlock(&ctlr->rlock);
+
+		if(!(status & (StatCX|StatFR|StatCNA|StatRNR|StatMDI|StatSWI)))
+			break;
+
+		/*
+		 * If the watchdog timer for the receiver lockup errata is running,
+		 * let it know the receiver is active.
+		 */
+		if(status & (StatFR|StatRNR)){
+			ilock(&ctlr->cblock);
+			ctlr->tick = 0;
+			iunlock(&ctlr->cblock);
+		}
+
+		if(status & StatFR){
+			receive(ether);
+			status &= ~StatFR;
+		}
+
+		if(status & StatRNR){
+			command(ctlr, RUresume, 0);
+			status &= ~StatRNR;
+		}
+
+		if(status & StatCNA){
+			ilock(&ctlr->cblock);
+
+			/*
+			 * Walk from the oldest queued Cb, stopping at the
+			 * first one the controller has not completed.
+			 */
+			cb = ctlr->cbtail;
+			while(ctlr->cbq){
+				if(!(cb->status & CbC))
+					break;
+				if(cb->bp){
+					freeb(cb->bp);
+					cb->bp = nil;
+				}
+				/* on underrun raise the threshold used for later transmits */
+				if((cb->status & CbU) && ctlr->threshold < 0xE0)
+					ctlr->threshold++;
+
+				ctlr->cbq--;
+				cb = cb->next;
+			}
+			ctlr->cbtail = cb;
+
+			txstart(ether);
+			iunlock(&ctlr->cblock);
+
+			status &= ~StatCNA;
+		}
+
+		if(status & (StatCX|StatFR|StatCNA|StatRNR|StatMDI|StatSWI))
+			panic("#l%d: status %#ux\n", ether->ctlrno, status);
+	}
+}
+
+/*
+ * Build the receive frame ring and the transmit command block ring
+ * and load the default configuration, threshold and watchdog tick.
+ * ctlr->ncb must have been set by the caller.
+ * Panics if any of the buffers cannot be allocated: the driver cannot
+ * run without them and there is no way to report failure from here.
+ */
+static void
+ctlrinit(Ctlr* ctlr)
+{
+	int i;
+	Block *bp;
+	Cb *cbr;
+	Rfd *rfd;
+	ulong link;
+
+	/*
+	 * Create the Receive Frame Area (RFA) as a ring of allocated
+	 * buffers.
+	 * A sentinel buffer is maintained between the last buffer in
+	 * the ring (marked with RfdS) and the head buffer to defeat the
+	 * hardware prefetch of the next RFD and allow dynamic buffer
+	 * allocation.
+	 */
+	link = NullPointer;
+	for(i = 0; i < Nrfd; i++){
+		bp = rfdalloc(link);
+		if(bp == nil)
+			panic("i82557: no memory for receive frame area\n");
+		/* each new buffer is pushed on the front, so the first one ends up as the tail */
+		if(ctlr->rfdhead == nil)
+			ctlr->rfdtail = bp;
+		bp->next = ctlr->rfdhead;
+		ctlr->rfdhead = bp;
+		link = PADDR(bp->rp);
+	}
+	/* close the ring, mark the tail and step the head past the sentinel */
+	ctlr->rfdtail->next = ctlr->rfdhead;
+	rfd = (Rfd*)ctlr->rfdtail->rp;
+	rfd->link = PADDR(ctlr->rfdhead->rp);
+	rfd->field |= RfdS;
+	ctlr->rfdhead = ctlr->rfdhead->next;
+
+	/*
+	 * Create a ring of control blocks for the
+	 * transmit side.
+	 * The allocation is checked before the lock is taken so that
+	 * a failure does not panic with the lock held.
+	 */
+	cbr = malloc(ctlr->ncb*sizeof(Cb));
+	if(cbr == nil)
+		panic("i82557: no memory for command blocks\n");
+
+	ilock(&ctlr->cblock);
+	ctlr->cbr = cbr;
+	for(i = 0; i < ctlr->ncb; i++){
+		/* every block starts out as a completed, suspending no-op */
+		ctlr->cbr[i].status = CbC|CbOK;
+		ctlr->cbr[i].command = CbS|CbNOP;
+		ctlr->cbr[i].link = PADDR(&ctlr->cbr[NEXT(i, ctlr->ncb)].status);
+		ctlr->cbr[i].next = &ctlr->cbr[NEXT(i, ctlr->ncb)];
+	}
+	ctlr->cbhead = ctlr->cbr;
+	ctlr->cbtail = ctlr->cbr;
+	ctlr->cbq = 0;
+
+	memmove(ctlr->configdata, configdata, sizeof(configdata));
+	ctlr->threshold = 80;
+	ctlr->tick = 0;
+
+	iunlock(&ctlr->cblock);
+}
+
+/*
+ * Read register regadd of the PHY at address phyadd through the MDI
+ * control register.  The ready bit is polled up to 64 times.
+ * Returns the low 16 bits of the register on success, -1 if the
+ * ready bit never appeared.
+ */
+static int
+miir(Ctlr* ctlr, int phyadd, int regadd)
+{
+	int i, r;
+
+	lock(&ctlr->miilock);
+	csr32w(ctlr, Mcr, MDIread|(phyadd<<21)|(regadd<<16));
+	r = 0;
+	for(i = 0; i < 64; i++){
+		r = csr32r(ctlr, Mcr);
+		if(r & MDIready)
+			break;
+		microdelay(1);
+	}
+	unlock(&ctlr->miilock);
+
+	if(!(r & MDIready))
+		return -1;
+	return r & 0xFFFF;
+}
+
+/*
+ * Write the low 16 bits of data to register regadd of the PHY at
+ * address phyadd through the MDI control register.  The ready bit is
+ * polled up to 64 times.
+ * Returns 0 on success, -1 if the ready bit never appeared.
+ */
+static int
+miiw(Ctlr* ctlr, int phyadd, int regadd, int data)
+{
+	int i, r;
+
+	lock(&ctlr->miilock);
+	csr32w(ctlr, Mcr, MDIwrite|(phyadd<<21)|(regadd<<16)|(data & 0xFFFF));
+	r = 0;
+	for(i = 0; i < 64; i++){
+		r = csr32r(ctlr, Mcr);
+		if(r & MDIready)
+			break;
+		microdelay(1);
+	}
+	unlock(&ctlr->miilock);
+
+	if(!(r & MDIready))
+		return -1;
+	return 0;
+}
+
+/*
+ * Read 16-bit word r from the serial EEPROM by bit-banging the Ecr
+ * register.
+ * On the first call (ctlr->eepromsz == 0) the address width is
+ * discovered, ctlr->eeprom is allocated with 1<<eepromsz words and the
+ * read is then repeated with the correct width.
+ * Returns the word read.  Panics if the EEPROM image cannot be
+ * allocated, as the caller stores into it unconditionally.
+ */
+static int
+hy93c46r(Ctlr* ctlr, int r)
+{
+	int data, i, op, size;
+
+	/*
+	 * Hyundai HY93C46 or equivalent serial EEPROM.
+	 * This sequence for reading a 16-bit register 'r'
+	 * in the EEPROM is taken straight from Section
+	 * 3.3.4.2 of the Intel 82557 User's Guide.
+	 */
+reread:
+	csr16w(ctlr, Ecr, EEcs);
+	/* clock out the three opcode bits, most significant first */
+	op = EEstart|EEread;
+	for(i = 2; i >= 0; i--){
+		data = (((op>>i) & 0x01)<<2)|EEcs;
+		csr16w(ctlr, Ecr, data);
+		csr16w(ctlr, Ecr, data|EEsk);
+		microdelay(1);
+		csr16w(ctlr, Ecr, data);
+		microdelay(1);
+	}
+
+	/*
+	 * First time through must work out the EEPROM size.
+	 */
+	if((size = ctlr->eepromsz) == 0)
+		size = 8;
+
+	/*
+	 * Clock out the address bits, stopping as soon as the data
+	 * out line reads zero; size is then left at the index of the
+	 * last address bit sent.
+	 */
+	for(size = size-1; size >= 0; size--){
+		data = (((r>>size) & 0x01)<<2)|EEcs;
+		csr16w(ctlr, Ecr, data);
+		csr16w(ctlr, Ecr, data|EEsk);
+		delay(1);
+		csr16w(ctlr, Ecr, data);
+		microdelay(1);
+		if(!(csr16r(ctlr, Ecr) & EEdo))
+			break;
+	}
+
+	/* clock in the sixteen data bits, most significant first */
+	data = 0;
+	for(i = 15; i >= 0; i--){
+		csr16w(ctlr, Ecr, EEcs|EEsk);
+		microdelay(1);
+		if(csr16r(ctlr, Ecr) & EEdo)
+			data |= (1<<i);
+		csr16w(ctlr, Ecr, EEcs);
+		microdelay(1);
+	}
+
+	csr16w(ctlr, Ecr, 0);
+
+	if(ctlr->eepromsz == 0){
+		/* the probe started at 8 bits, so the width is 8 minus what was left */
+		ctlr->eepromsz = 8-size;
+		ctlr->eeprom = malloc((1<<ctlr->eepromsz)*sizeof(ushort));
+		if(ctlr->eeprom == nil)
+			panic("i82557: no memory for EEPROM image\n");
+		goto reread;
+	}
+
+	return data;
+}
+
+/*
+ * Scan the PCI bus for supported Intel (vendor 0x8086) devices and
+ * append a Ctlr for each to the ctlrhead/ctlrtail list.
+ * Devices whose I/O port range cannot be allocated, or for which no
+ * memory is available, are skipped with a message.
+ */
+static void
+i82557pci(void)
+{
+	Pcidev *p;
+	Ctlr *ctlr;
+	int i, nop, port;
+
+	p = nil;
+	while(p = pcimatch(p, 0x8086, 0)){
+		/*
+		 * The CUnop workaround applies per device; start each
+		 * one from 0 so it is not inherited from an earlier match.
+		 */
+		nop = 0;
+		switch(p->did){
+		default:
+			continue;
+		case 0x1031:		/* Intel 82562EM */
+		case 0x1050:		/* Intel 82562EZ */
+		case 0x1039:		/* Intel 82801BD PRO/100 VE */
+		case 0x103A:		/* Intel 82562 PRO/100 VE */
+		case 0x103D:		/* Intel 82562 PRO/100 VE */
+		case 0x1064:		/* Intel 82562 PRO/100 VE */
+		case 0x2449:		/* Intel 82562ET */
+		case 0x27DC:		/* Intel 82801G PRO/100 VE */
+			nop = 1;
+			/*FALLTHROUGH*/
+		case 0x1209:		/* Intel 82559ER */
+		case 0x1229:		/* Intel 8255[789] */
+		case 0x1030:		/* Intel 82559 InBusiness 10/100  */
+			break;
+		}
+
+		/*
+		 * If the device is not in power state 0, bring it there
+		 * and rewrite the configuration registers from the
+		 * values saved in the Pcidev.
+		 */
+		if(pcigetpms(p) > 0){
+			pcisetpms(p, 0);
+
+			for(i = 0; i < 6; i++)
+				pcicfgw32(p, PciBAR0+i*4, p->mem[i].bar);
+			pcicfgw8(p, PciINTL, p->intl);
+			pcicfgw8(p, PciLTR, p->ltr);
+			pcicfgw8(p, PciCLS, p->cls);
+			pcicfgw16(p, PciPCR, p->pcr);
+		}
+
+		/*
+		 * bar[0] is the memory-mapped register address (4KB),
+		 * bar[1] is the I/O port register address (32 bytes) and
+		 * bar[2] is for the flash ROM (1MB).
+		 */
+		port = p->mem[1].bar & ~0x01;
+
+		/*
+		 * Allocate the Ctlr before claiming the port so that
+		 * running out of memory leaves nothing to release.
+		 */
+		ctlr = malloc(sizeof(Ctlr));
+		if(ctlr == nil){
+			print("i82557: no memory for controller at port %#ux\n", port);
+			continue;
+		}
+		if(ioalloc(port, p->mem[1].size, 0, "i82557") < 0){
+			print("i82557: port %#ux in use\n", port);
+			free(ctlr);
+			continue;
+		}
+
+		ctlr->port = port;
+		ctlr->pcidev = p;
+		ctlr->nop = nop;
+
+		if(ctlrhead != nil)
+			ctlrtail->next = ctlr;
+		else
+			ctlrhead = ctlr;
+		ctlrtail = ctlr;
+
+		pcisetbme(p);
+	}
+}
+
+/*
+ * Media names accepted as options.  reset() compares each option
+ * against these (ignoring case) and switches on the index of the
+ * match, so the order here must agree with the case labels there.
+ */
+static char* mediatable[9] = {
+	"10BASE-T",				/* TP */
+	"10BASE-2",				/* BNC */
+	"10BASE-5",				/* AUI */
+	"100BASE-TX",
+	"10BASE-TFD",
+	"100BASE-TXFD",
+	"100BASE-T4",
+	"100BASE-FX",
+	"100BASE-FXFD",
+};
+
+/*
+ * Look for a PHY at each of the 32 MII addresses in turn.
+ * An address is taken to hold a PHY when its register 2 can be read
+ * and is neither 0 nor 0xFFFF.  For the first one found,
+ * ctlr->eeprom[6] is overwritten with the address and, for the two
+ * identifiers recognised here, a device code in bits 8 and up (the
+ * same field reset() switches on).
+ * Returns the PHY address, or -1 if none responded.
+ */
+static int
+scanphy(Ctlr* ctlr)
+{
+	int addr, id, r3;
+
+	for(addr = 0; addr < 32; addr++){
+		id = miir(ctlr, addr, 2);
+		if(id == -1 || id == 0 || id == 0xFFFF)
+			continue;
+
+		/* identifier: register 2 followed by the top bits of register 3 */
+		r3 = miir(ctlr, addr, 3);
+		id = (id<<6)|(r3>>10);
+
+		ctlr->eeprom[6] = addr;
+		switch(id){
+		case 0xAA00:
+			ctlr->eeprom[6] |= 0x07<<8;
+			break;
+		case 0x80017:
+			if(r3 & 0x01)
+				ctlr->eeprom[6] |= 0x0A<<8;
+			else
+				ctlr->eeprom[6] |= 0x04<<8;
+			break;
+		default:
+			break;
+		}
+		return addr;
+	}
+	return -1;
+}
+
+/*
+ * Shutdown entry point: the same Port write, delay and interrupt
+ * mask sequence that reset() uses for its software reset.
+ * Unlike reset(), the register lock is not taken here.
+ */
+static void
+shutdown(Ether* ether)
+{
+	Ctlr *ctlr = ether->ctlr;
+
+	csr32w(ctlr, Port, 0);
+	delay(1);
+	csr8w(ctlr, Interrupt, InterruptM);
+}
+
+
+/*
+ * Probe hook registered for the "i82557" card type.
+ * Claims the first inactive controller on the list built by
+ * i82557pci() (restricted to ether->port when that is non-zero),
+ * resets the chip, builds the receive and transmit rings, reads the
+ * EEPROM, works out the PHY and link settings, loads the
+ * configuration and station address and finally installs the driver
+ * entry points in ether.
+ * Returns 0 on success, -1 if no matching controller is available.
+ */
+static int
+reset(Ether* ether)
+{
+	int anar, anlpar, bmcr, bmsr, i, k, medium, phyaddr, x;
+	unsigned short sum;
+	uchar ea[Eaddrlen];
+	Ctlr *ctlr;
+
+	if(ctlrhead == nil)
+		i82557pci();
+
+	/*
+	 * Any adapter matches if no ether->port is supplied,
+	 * otherwise the ports must match.
+	 */
+	for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
+		if(ctlr->active)
+			continue;
+		if(ether->port == 0 || ether->port == ctlr->port){
+			ctlr->active = 1;
+			break;
+		}
+	}
+	if(ctlr == nil)
+		return -1;
+
+	/*
+	 * Initialise the Ctlr structure.
+	 * Perform a software reset after which should ensure busmastering
+	 * is still enabled. The EtherExpress PRO/100B appears to leave
+	 * the PCI configuration alone (see the 'To do' list above) so punt
+	 * for now.
+	 * Load the RUB and CUB registers for linear addressing (0).
+	 */
+	ether->ctlr = ctlr;
+	ether->port = ctlr->port;
+	ether->irq = ctlr->pcidev->intl;
+	ether->tbdf = ctlr->pcidev->tbdf;
+
+	ilock(&ctlr->rlock);
+	csr32w(ctlr, Port, 0);
+	delay(1);
+	csr8w(ctlr, Interrupt, InterruptM);
+	iunlock(&ctlr->rlock);
+
+	command(ctlr, LoadRUB, 0);
+	command(ctlr, LoadCUB, 0);
+	command(ctlr, LoadDCA, PADDR(ctlr->dump));
+
+	/*
+	 * Initialise the receive frame, transmit ring and configuration areas.
+	 */
+	ctlr->ncb = Ncb;
+	ctlrinit(ctlr);
+
+	/*
+	 * Read the EEPROM.
+	 * Do a dummy read first to get the size
+	 * and allocate ctlr->eeprom.
+	 */
+	hy93c46r(ctlr, 0);
+	sum = 0;
+	for(i = 0; i < (1<<ctlr->eepromsz); i++){
+		x = hy93c46r(ctlr, i);
+		ctlr->eeprom[i] = x;
+		sum += x;
+	}
+	/* a bad checksum is reported but otherwise ignored */
+	if(sum != 0xBABA)
+		print("#l%d: EEPROM checksum - %#4.4ux\n", ether->ctlrno, sum);
+
+	/*
+	 * Eeprom[6] indicates whether there is a PHY and whether
+	 * it's not 10Mb-only, in which case use the given PHY address
+	 * to set any PHY specific options and determine the speed.
+	 * Unfortunately, sometimes the EEPROM is blank except for
+	 * the ether address and checksum; in this case look at the
+	 * controller type and if it's an 82558 or 82559 it has an
+	 * embedded PHY so scan for that.
+	 * If no PHY, assume 82503 (serial) operation.
+	 */
+	if((ctlr->eeprom[6] & 0x1F00) && !(ctlr->eeprom[6] & 0x8000))
+		phyaddr = ctlr->eeprom[6] & 0x00FF;
+	else
+	switch(ctlr->pcidev->rid){
+	case 0x01:			/* 82557 A-step */
+	case 0x02:			/* 82557 B-step */
+	case 0x03:			/* 82557 C-step */
+	default:
+		phyaddr = -1;
+		break;
+	case 0x04:			/* 82558 A-step */
+	case 0x05:			/* 82558 B-step */
+	case 0x06:			/* 82559 A-step */
+	case 0x07:			/* 82559 B-step */
+	case 0x08:			/* 82559 C-step */
+	case 0x09:			/* 82559ER A-step */
+		phyaddr = scanphy(ctlr);
+		break;
+	}
+	if(phyaddr >= 0){
+		/*
+		 * Resolve the highest common ability of the two
+		 * link partners. In descending order:
+		 *	0x0100		100BASE-TX Full Duplex
+		 *	0x0200		100BASE-T4
+		 *	0x0080		100BASE-TX
+		 *	0x0040		10BASE-T Full Duplex
+		 *	0x0020		10BASE-T
+		 */
+		anar = miir(ctlr, phyaddr, 0x04);
+		anlpar = miir(ctlr, phyaddr, 0x05) & 0x03E0;
+		anar &= anlpar;
+		bmcr = 0;
+		if(anar & 0x380)
+			bmcr = 0x2000;
+		if(anar & 0x0140)
+			bmcr |= 0x0100;
+
+		switch((ctlr->eeprom[6]>>8) & 0x001F){
+
+		case 0x04:				/* DP83840 */
+		case 0x0A:				/* DP83840A */
+			/*
+			 * The DP83840[A] requires some tweaking for
+			 * reliable operation.
+			 * The manual says bit 10 should be unconditionally
+			 * set although it supposedly only affects full-duplex
+			 * operation (an & 0x0140).
+			 */
+			x = miir(ctlr, phyaddr, 0x17) & ~0x0520;
+			x |= 0x0420;
+			for(i = 0; i < ether->nopt; i++){
+				if(cistrcmp(ether->opt[i], "congestioncontrol"))
+					continue;
+				x |= 0x0100;
+				break;
+			}
+			miiw(ctlr, phyaddr, 0x17, x);
+
+			/*
+			 * If the link partner can't autonegotiate, determine
+			 * the speed from elsewhere.
+			 */
+			if(anlpar == 0){
+				miir(ctlr, phyaddr, 0x01);
+				bmsr = miir(ctlr, phyaddr, 0x01);
+				x = miir(ctlr, phyaddr, 0x19);
+				if((bmsr & 0x0004) && !(x & 0x0040))
+					bmcr = 0x2000;
+			}
+			break;
+
+		case 0x07:				/* Intel 82555 */
+			/*
+			 * Auto-negotiation may fail if the other end is
+			 * a DP83840A and the cable is short.
+			 */
+			miir(ctlr, phyaddr, 0x01);
+			bmsr = miir(ctlr, phyaddr, 0x01);
+			if((miir(ctlr, phyaddr, 0) & 0x1000) && !(bmsr & 0x0020)){
+				miiw(ctlr, phyaddr, 0x1A, 0x2010);
+				x = miir(ctlr, phyaddr, 0);
+				miiw(ctlr, phyaddr, 0, 0x0200|x);
+				for(i = 0; i < 3000; i++){
+					delay(1);
+					if(miir(ctlr, phyaddr, 0x01) & 0x0020)
+						break;
+				}
+				miiw(ctlr, phyaddr, 0x1A, 0x2000);
+					
+				anar = miir(ctlr, phyaddr, 0x04);
+				anlpar = miir(ctlr, phyaddr, 0x05) & 0x03E0;
+				anar &= anlpar;
+				bmcr = 0;
+				if(anar & 0x380)
+					bmcr = 0x2000;
+				if(anar & 0x0140)
+					bmcr |= 0x0100;
+			}
+			break;
+		}
+
+		/*
+		 * Force speed and duplex if no auto-negotiation.
+		 */
+		if(anlpar == 0){
+			medium = -1;
+			for(i = 0; i < ether->nopt; i++){
+				for(k = 0; k < nelem(mediatable); k++){
+					if(cistrcmp(mediatable[k], ether->opt[i]))
+						continue;
+					medium = k;
+					break;
+				}
+		
+				switch(medium){
+				default:
+					break;
+
+				case 0x00:			/* 10BASE-T */
+				case 0x01:			/* 10BASE-2 */
+				case 0x02:			/* 10BASE-5 */
+					bmcr &= ~(0x2000|0x0100);
+					ctlr->configdata[19] &= ~0x40;
+					break;
+
+				case 0x03:			/* 100BASE-TX */
+				case 0x06:			/* 100BASE-T4 */
+				case 0x07:			/* 100BASE-FX */
+					ctlr->configdata[19] &= ~0x40;
+					bmcr |= 0x2000;
+					break;
+
+				case 0x04:			/* 10BASE-TFD */
+					bmcr = (bmcr & ~0x2000)|0x0100;
+					ctlr->configdata[19] |= 0x40;
+					break;
+
+				case 0x05:			/* 100BASE-TXFD */
+				case 0x08:			/* 100BASE-FXFD */
+					bmcr |= 0x2000|0x0100;
+					ctlr->configdata[19] |= 0x40;
+					break;
+				}
+			}
+			if(medium != -1)
+				miiw(ctlr, phyaddr, 0x00, bmcr);
+		}
+
+		if(bmcr & 0x2000)
+			ether->mbps = 100;
+
+		ctlr->configdata[8] = 1;
+		ctlr->configdata[15] &= ~0x80;
+	}
+	else{
+		ctlr->configdata[8] = 0;
+		ctlr->configdata[15] |= 0x80;
+	}
+
+	/*
+	 * Workaround for some broken HUB chips when connected at 10Mb/s
+	 * half-duplex.
+	 * This is a band-aid, but as there's no dynamic auto-negotiation
+	 * code at the moment, only deactivate the workaround code in txstart
+	 * if the link is 100Mb/s.
+	 */
+	if(ether->mbps != 10)
+		ctlr->nop = 0;
+
+	/*
+	 * Load the chip configuration and start it off.
+	 */
+	if(ether->oq == 0)
+		ether->oq = qopen(256*1024, Qmsg, 0, 0);
+	configure(ether, 0);
+	command(ctlr, CUstart, PADDR(&ctlr->cbr->status));
+
+	/*
+	 * Check if the adapter's station address is to be overridden.
+	 * If not, read it from the EEPROM and set in ether->ea prior to loading
+	 * the station address with the Individual Address Setup command.
+	 */
+	memset(ea, 0, Eaddrlen);
+	if(memcmp(ea, ether->ea, Eaddrlen) == 0){
+		for(i = 0; i < Eaddrlen/2; i++){
+			x = ctlr->eeprom[i];
+			ether->ea[2*i] = x;
+			ether->ea[2*i+1] = x>>8;
+		}
+	}
+
+	ilock(&ctlr->cblock);
+	ctlr->action = CbIAS;
+	txstart(ether);
+	iunlock(&ctlr->cblock);
+
+	/*
+	 * Linkage to the generic ethernet driver.
+	 */
+	ether->attach = attach;
+	ether->transmit = transmit;
+	ether->interrupt = interrupt;
+	ether->ifstat = ifstat;
+	ether->shutdown = shutdown;
+
+	ether->promiscuous = promiscuous;
+	ether->multicast = multicast;
+	ether->arg = ether;
+
+	return 0;
+}
+
+/*
+ * Register the driver: cards of type "i82557" are probed with
+ * reset().
+ */
+void
+ether82557link(void)
+{
+	addethercard("i82557",  reset);
+}

+ 1745 - 0
sys/src/9/386/ether82563.c

@@ -0,0 +1,1745 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Intel Gigabit Ethernet PCI-Express Controllers.
+ *	8256[36], 8257[12], 82573[ev]
+ *	82575eb
+ * Pretty basic, does not use many of the chip smarts.
+ * The interrupt mitigation tuning for each chip variant
+ * is probably different. The reset/initialisation
+ * sequence needs straightened out. Doubt the PHY code
+ * for the 82575eb is right.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "../port/netif.h"
+
+#include "etherif.h"
+#include "io.h"
+
+/*
+ * these are in the order they appear in the manual, not numeric order.
+ * It was too hard to find them in the book. Ref 21489, rev 2.6
+ */
+
+/*
+ * Register offsets, in bytes from the start of the register window
+ * (csr32r/csr32w divide by 4 to index 32-bit words).
+ * The entries after Statistics (Gorcl etc.) are instead word indices
+ * relative to Statistics.
+ */
+enum {
+	/* General */
+
+	Ctrl		= 0x0000,	/* Device Control */
+	Status		= 0x0008,	/* Device Status */
+	Eec		= 0x0010,	/* EEPROM/Flash Control/Data */
+	Eerd		= 0x0014,	/* EEPROM Read */
+	Ctrlext		= 0x0018,	/* Extended Device Control */
+	Fla		= 0x001c,	/* Flash Access */
+	Mdic		= 0x0020,	/* MDI Control */
+	Seresctl	= 0x0024,	/* Serdes ana */
+	Fcal		= 0x0028,	/* Flow Control Address Low */
+	Fcah		= 0x002C,	/* Flow Control Address High */
+	Fct		= 0x0030,	/* Flow Control Type */
+	Kumctrlsta	= 0x0034,	/* MAC-PHY Interface */
+	Vet		= 0x0038,	/* VLAN EtherType */
+	Fcttv		= 0x0170,	/* Flow Control Transmit Timer Value */
+	Txcw		= 0x0178,	/* Transmit Configuration Word */
+	Rxcw		= 0x0180,	/* Receive Configuration Word */
+	Ledctl		= 0x0E00,	/* LED control */
+	Pba		= 0x1000,	/* Packet Buffer Allocation */
+	Pbs		= 0x1008,	/* Packet Buffer Size */
+
+	/* Interrupt */
+
+	Icr		= 0x00C0,	/* Interrupt Cause Read */
+	Itr		= 0x00c4,	/* Interrupt Throttling Rate */
+	Ics		= 0x00C8,	/* Interrupt Cause Set */
+	Ims		= 0x00D0,	/* Interrupt Mask Set/Read */
+	Imc		= 0x00D8,	/* Interrupt mask Clear */
+	Iam		= 0x00E0,	/* Interrupt acknowledge Auto Mask */
+
+	/* Receive */
+
+	Rctl		= 0x0100,	/* Control */
+	Ert		= 0x2008,	/* Early Receive Threshold (573[EVL] only) */
+	Fcrtl		= 0x2160,	/* Flow Control RX Threshold Low */
+	Fcrth		= 0x2168,	/* Flow Control Rx Threshold High */
+	Psrctl		= 0x2170,	/* Packet Split Receive Control */
+	Rdbal		= 0x2800,	/* Rdesc Base Address Low Queue 0 */
+	Rdbah		= 0x2804,	/* Rdesc Base Address High Queue 0 */
+	Rdlen		= 0x2808,	/* Descriptor Length Queue 0 */
+	Rdh		= 0x2810,	/* Descriptor Head Queue 0 */
+	Rdt		= 0x2818,	/* Descriptor Tail Queue 0 */
+	Rdtr		= 0x2820,	/* Descriptor Timer Ring */
+	Rxdctl		= 0x2828,	/* Descriptor Control */
+	Radv		= 0x282C,	/* Interrupt Absolute Delay Timer */
+	Rdbal1		= 0x2900,	/* Rdesc Base Address Low Queue 1 */
+	Rdbah1		= 0x2904,	/* Rdesc Base Address High Queue 1 (was 0x2804, aliasing Rdbah) */
+	Rdlen1		= 0x2908,	/* Descriptor Length Queue 1 */
+	Rdh1		= 0x2910,	/* Descriptor Head Queue 1 */
+	Rdt1		= 0x2918,	/* Descriptor Tail Queue 1 */
+	Rxdctl1		= 0x2928,	/* Descriptor Control Queue 1 */
+	Rsrpd		= 0x2c00,	/* Small Packet Detect */
+	Raid		= 0x2c08,	/* ACK interrupt delay */
+	Cpuvec		= 0x2c10,	/* CPU Vector */
+	Rxcsum		= 0x5000,	/* Checksum Control */
+	Rfctl		= 0x5008,	/* Filter Control */
+	Mta		= 0x5200,	/* Multicast Table Array */
+	Ral		= 0x5400,	/* Receive Address Low */
+	Rah		= 0x5404,	/* Receive Address High */
+	Vfta		= 0x5600,	/* VLAN Filter Table Array */
+	Mrqc		= 0x5818,	/* Multiple Receive Queues Command */
+	Rssim		= 0x5864,	/* RSS Interrupt Mask */
+	Rssir		= 0x5868,	/* RSS Interrupt Request */
+	Reta		= 0x5c00,	/* Redirection Table */
+	Rssrk		= 0x5c80,	/* RSS Random Key */
+
+	/* Transmit */
+
+	Tctl		= 0x0400,	/* Transmit Control */
+	Tipg		= 0x0410,	/* Transmit IPG */
+	Tkabgtxd	= 0x3004,	/* glci afe band gap transmit ref data, or something */
+	Tdbal		= 0x3800,	/* Tdesc Base Address Low */
+	Tdbah		= 0x3804,	/* Tdesc Base Address High */
+	Tdlen		= 0x3808,	/* Descriptor Length */
+	Tdh		= 0x3810,	/* Descriptor Head */
+	Tdt		= 0x3818,	/* Descriptor Tail */
+	Tidv		= 0x3820,	/* Interrupt Delay Value */
+	Txdctl		= 0x3828,	/* Descriptor Control */
+	Tadv		= 0x382C,	/* Interrupt Absolute Delay Timer */
+	Tarc0		= 0x3840,	/* Arbitration Counter Queue 0 */
+	Tdbal1		= 0x3900,	/* Descriptor Base Low Queue 1 */
+	Tdbah1		= 0x3904,	/* Descriptor Base High Queue 1 */
+	Tdlen1		= 0x3908,	/* Descriptor Length Queue 1 */
+	Tdh1		= 0x3910,	/* Descriptor Head Queue 1 */
+	Tdt1		= 0x3918,	/* Descriptor Tail Queue 1 */
+	Txdctl1		= 0x3928,	/* Descriptor Control 1 */
+	Tarc1		= 0x3940,	/* Arbitration Counter Queue 1 */
+
+	/* Statistics */
+
+	Statistics	= 0x4000,	/* Start of Statistics Area */
+	Gorcl		= 0x88/4,	/* Good Octets Received Count */
+	Gotcl		= 0x90/4,	/* Good Octets Transmitted Count */
+	Torl		= 0xC0/4,	/* Total Octets Received */
+	Totl		= 0xC8/4,	/* Total Octets Transmitted */
+	Nstatistics	= 0x124/4,
+};
+
+/* Bit and field definitions for the Ctrl (Device Control) register. */
+enum {					/* Ctrl */
+	GIOmd		= 1<<2,		/* GIO master disable */
+	Lrst		= 1<<3,		/* link reset */
+	Slu		= 1<<6,		/* Set Link Up */
+	SspeedMASK	= 3<<8,		/* Speed Selection */
+	SspeedSHIFT	= 8,
+	Sspeed10	= 0x00000000,	/* 10Mb/s */
+	Sspeed100	= 0x00000100,	/* 100Mb/s */
+	Sspeed1000	= 0x00000200,	/* 1000Mb/s */
+	Frcspd		= 1<<11,	/* Force Speed */
+	Frcdplx		= 1<<12,	/* Force Duplex */
+	SwdpinsloMASK	= 0x003C0000,	/* Software Defined Pins - lo nibble */
+	SwdpinsloSHIFT	= 18,
+	SwdpioloMASK	= 0x03C00000,	/* Software Defined Pins - I or O */
+	SwdpioloSHIFT	= 22,
+	Devrst		= 1<<26,	/* Device Reset */
+	Rfce		= 1<<27,	/* Receive Flow Control Enable */
+	Tfce		= 1<<28,	/* Transmit Flow Control Enable */
+	Vme		= 1<<30,	/* VLAN Mode Enable */
+	Phyrst		= 1<<31,	/* Phy Reset */
+};
+
+/* Bit and field definitions for the Status (Device Status) register. */
+enum {					/* Status */
+	Lu		= 1<<1,		/* Link Up */
+	Lanid		= 3<<2,		/* mask for Lan ID. */
+	Txoff		= 1<<4,		/* Transmission Paused */
+	Tbimode		= 1<<5,		/* TBI Mode Indication */
+	Phyra		= 1<<10,	/* PHY Reset Asserted */
+	GIOme		= 1<<19,	/* GIO Master Enable Status */
+};
+
+/* Bits of the Eerd (EEPROM Read) register. */
+enum {					/* Eerd */
+	EEstart		= 1<<0,		/* Start Read */
+	EEdone		= 1<<1,		/* Read done */
+};
+
+/* Bits of the Ctrlext (Extended Device Control) register. */
+enum {					/* Ctrlext */
+	Asdchk		= 1<<12,	/* ASD Check */
+	Eerst		= 1<<13,	/* EEPROM Reset */
+	Spdbyps		= 1<<15,	/* Speed Select Bypass */
+};
+
+/*
+ * Offsets of fields within the EEPROM image
+ * (presumably in 16-bit words, matching Ctlr.eeprom[] -- confirm).
+ */
+enum {					/* EEPROM content offsets */
+	Ea		= 0x00,		/* Ethernet Address */
+	Cf		= 0x03,		/* Compatibility Field */
+	Icw1		= 0x0A,		/* Initialization Control Word 1 */
+	Sid		= 0x0B,		/* Subsystem ID */
+	Svid		= 0x0C,		/* Subsystem Vendor ID */
+	Did		= 0x0D,		/* Device ID */
+	Vid		= 0x0E,		/* Vendor ID */
+	Icw2		= 0x0F,		/* Initialization Control Word 2 */
+};
+
+/*
+ * Fields of the Mdic (MDI Control) register, used by
+ * phyread and phywrite to talk to the PHY.
+ */
+enum {					/* Mdic */
+	MDIdMASK	= 0x0000FFFF,	/* Data */
+	MDIdSHIFT	= 0,
+	MDIrMASK	= 0x001F0000,	/* PHY Register Address */
+	MDIrSHIFT	= 16,
+	MDIpMASK	= 0x03E00000,	/* PHY Address */
+	MDIpSHIFT	= 21,
+	MDIwop		= 0x04000000,	/* Write Operation */
+	MDIrop		= 0x08000000,	/* Read Operation */
+	MDIready	= 0x10000000,	/* End of Transaction */
+	MDIie		= 0x20000000,	/* Interrupt Enable */
+	MDIe		= 0x40000000,	/* Error */
+};
+
+/*
+ * PHY register numbers (passed as reg to phyread/phywrite)
+ * followed by bit definitions for those registers.
+ */
+enum {					/* phy interface registers */
+	Phyctl		= 0,		/* phy ctl */
+	Physsr		= 17,		/* phy secondary status */
+	Phyier		= 18,		/* 82573 phy interrupt enable */
+	Phyisr		= 19,		/* 82563 phy interrupt status */
+	Phylhr		= 19,		/* 8257[12] link health */
+
+	/* Physsr bits, as tested by i82563lproc */
+	Rtlink		= 1<<10,	/* realtime link status */
+	Phyan		= 1<<11,	/* phy has auto-negotiated */
+
+	/* Phyctl bits */
+	Ran		= 1<<9,		/* restart auto-negotiation */
+	Ean		= 1<<12,	/* enable auto-negotiation */
+
+	/* 82573 Phyier bits */
+	Lscie		= 1<<10,	/* link status changed ie */
+	Ancie		= 1<<11,	/* auto-negotiation complete ie */
+	Spdie		= 1<<14,	/* speed changed ie */
+	Panie		= 1<<15,	/* phy auto-negotiation error ie */
+
+	/* Phylhr/Phyisr bits */
+	Anf		= 1<<6,		/* lhr: auto-negotiation fault */
+	Ane		= 1<<15,	/* isr: auto-negotiation error */
+};
+
+/*
+ * Interrupt cause bits; the same layout is used by the
+ * cause (Icr, Ics) and mask (Ims, Imc) registers.
+ */
+enum {					/* Icr, Ics, Ims, Imc */
+	Txdw		= 0x00000001,	/* Transmit Descriptor Written Back */
+	Txqe		= 0x00000002,	/* Transmit Queue Empty */
+	Lsc		= 0x00000004,	/* Link Status Change */
+	Rxseq		= 0x00000008,	/* Receive Sequence Error */
+	Rxdmt0		= 0x00000010,	/* Rdesc Minimum Threshold Reached */
+	Rxo		= 0x00000040,	/* Receiver Overrun */
+	Rxt0		= 0x00000080,	/* Receiver Timer Interrupt */
+	Mdac		= 0x00000200,	/* MDIO Access Completed */
+	Rxcfg		= 0x00000400,	/* Receiving /C/ ordered sets */
+	Gpi0		= 0x00000800,	/* General Purpose Interrupts */
+	Gpi1		= 0x00001000,
+	Gpi2		= 0x00002000,
+	Gpi3		= 0x00004000,
+	Ack		= 0x00020000,	/* Receive ACK frame */
+};
+
+/* Bit and field definitions for the Txcw (Transmit Configuration Word) register. */
+enum {					/* Txcw */
+	TxcwFd		= 0x00000020,	/* Full Duplex */
+	TxcwHd		= 0x00000040,	/* Half Duplex */
+	TxcwPauseMASK	= 0x00000180,	/* Pause */
+	TxcwPauseSHIFT	= 7,
+	TxcwPs		= 1<<TxcwPauseSHIFT,	/* Pause Supported */
+	TxcwAs		= 2<<TxcwPauseSHIFT,	/* Asymmetric FC desired */
+	TxcwRfiMASK	= 0x00003000,	/* Remote Fault Indication */
+	TxcwRfiSHIFT	= 12,
+	TxcwNpr		= 0x00008000,	/* Next Page Request */
+	TxcwConfig	= 0x40000000,	/* Transmit Config Control */
+	TxcwAne		= 0x80000000,	/* Auto-Negotiation Enable */
+};
+
+/*
+ * Bit and field definitions for the Rctl (receive control) register.
+ * The Bsize* values share encodings: the larger sizes are only
+ * meaningful together with Bsex, as i82563rxinit combines them.
+ */
+enum {					/* Rctl */
+	Rrst		= 0x00000001,	/* Receiver Software Reset */
+	Ren		= 0x00000002,	/* Receiver Enable */
+	Sbp		= 0x00000004,	/* Store Bad Packets */
+	Upe		= 0x00000008,	/* Unicast Promiscuous Enable */
+	Mpe		= 0x00000010,	/* Multicast Promiscuous Enable */
+	Lpe		= 0x00000020,	/* Long Packet Reception Enable */
+	LbmMASK		= 0x000000C0,	/* Loopback Mode */
+	LbmOFF		= 0x00000000,	/* No Loopback */
+	LbmTBI		= 0x00000040,	/* TBI Loopback */
+	LbmMII		= 0x00000080,	/* GMII/MII Loopback */
+	LbmXCVR		= 0x000000C0,	/* Transceiver Loopback */
+	RdtmsMASK	= 0x00000300,	/* Rdesc Minimum Threshold Size */
+	RdtmsHALF	= 0x00000000,	/* Threshold is 1/2 Rdlen */
+	RdtmsQUARTER	= 0x00000100,	/* Threshold is 1/4 Rdlen */
+	RdtmsEIGHTH	= 0x00000200,	/* Threshold is 1/8 Rdlen */
+	MoMASK		= 0x00003000,	/* Multicast Offset */
+	Bam		= 0x00008000,	/* Broadcast Accept Mode */
+	BsizeMASK	= 0x00030000,	/* Receive Buffer Size */
+	Bsize16384	= 0x00010000,	/* Bsex = 1 */
+	Bsize8192	= 0x00020000, 	/* Bsex = 1 */
+	Bsize2048	= 0x00000000,
+	Bsize1024	= 0x00010000,
+	Bsize512	= 0x00020000,
+	Bsize256	= 0x00030000,
+	BsizeFlex	= 0x08000000,	/* Flexible Bsize in 1KB increments */
+	Vfe		= 0x00040000,	/* VLAN Filter Enable */
+	Cfien		= 0x00080000,	/* Canonical Form Indicator Enable */
+	Cfi		= 0x00100000,	/* Canonical Form Indicator value */
+	Dpf		= 0x00400000,	/* Discard Pause Frames */
+	Pmcf		= 0x00800000,	/* Pass MAC Control Frames */
+	Bsex		= 0x02000000,	/* Buffer Size Extension */
+	Secrc		= 0x04000000,	/* Strip CRC from incoming packet */
+};
+
+/* Bit and field definitions for the Tctl (Transmit Control) register. */
+enum {					/* Tctl */
+	Trst		= 0x00000001,	/* Transmitter Software Reset */
+	Ten		= 0x00000002,	/* Transmit Enable */
+	Psp		= 0x00000008,	/* Pad Short Packets */
+	Mulr		= 0x10000000,	/* Allow multiple concurrent requests */
+	CtMASK		= 0x00000FF0,	/* Collision Threshold */
+	CtSHIFT		= 4,
+	ColdMASK	= 0x003FF000,	/* Collision Distance */
+	ColdSHIFT	= 12,
+	Swxoff		= 0x00400000,	/* Software XOFF Transmission */
+	Pbe		= 0x00800000,	/* Packet Burst Enable */
+	Rtlc		= 0x01000000,	/* Re-transmit on Late Collision */
+	Nrtu		= 0x02000000,	/* No Re-transmit on Underrun */
+};
+
+/*
+ * Fields of the descriptor control registers Rxdctl and Txdctl.
+ * Note the *SHIFT values are bit positions, not masks: clear a
+ * field with the matching *MASK.
+ */
+enum {					/* [RT]xdctl */
+	PthreshMASK	= 0x0000003F,	/* Prefetch Threshold */
+	PthreshSHIFT	= 0,
+	HthreshMASK	= 0x00003F00,	/* Host Threshold */
+	HthreshSHIFT	= 8,
+	WthreshMASK	= 0x003F0000,	/* Writeback Threshold */
+	WthreshSHIFT	= 16,
+	Gran		= 0x01000000,	/* Granularity */
+	Qenable		= 0x02000000,	/* Queue Enable (82575) */
+};
+
+/* Fields of the Rxcsum (receive Checksum Control) register. */
+enum {					/* Rxcsum */
+	PcssMASK	= 0x00FF,	/* Packet Checksum Start */
+	PcssSHIFT	= 0,
+	Ipofl		= 0x0100,	/* IP Checksum Off-load Enable */
+	Tuofl		= 0x0200,	/* TCP/UDP Checksum Off-load Enable */
+};
+
+/* Fields of the Rdtr (receive delay timer) register. */
+enum {					/* Receive Delay Timer Ring */
+	DelayMASK	= 0xFFFF,	/* delay timer in 1.024us increments */
+	DelaySHIFT	= 0,
+	Fpd		= 0x80000000,	/* Flush partial Descriptor Block */
+};
+
+/*
+ * Layout of one entry in the receive descriptor ring.
+ * The driver fills in addr[0] with the buffer address and clears
+ * status (i82563replenish); i82563rproc reads length, checksum,
+ * status and errors once Rdd is set in status.
+ */
+typedef struct Rd {			/* Receive Descriptor */
+	u32int	addr[2];
+	u16int	length;
+	u16int	checksum;
+	u8int	status;
+	u8int	errors;
+	u16int	special;
+} Rd;
+
+/* Bits of the status byte in a receive descriptor (Rd.status). */
+enum {					/* Rd status */
+	Rdd		= 0x01,		/* Descriptor Done */
+	Reop		= 0x02,		/* End of Packet */
+	Ixsm		= 0x04,		/* Ignore Checksum Indication */
+	Vp		= 0x08,		/* Packet is 802.1Q (matched VET) */
+	Tcpcs		= 0x20,		/* TCP Checksum Calculated on Packet */
+	Ipcs		= 0x40,		/* IP Checksum Calculated on Packet */
+	Pif		= 0x80,		/* Passed in-exact filter */
+};
+
+/* Bits of the errors byte in a receive descriptor (Rd.errors). */
+enum {					/* Rd errors */
+	Ce		= 0x01,		/* CRC Error or Alignment Error */
+	Se		= 0x02,		/* Symbol Error */
+	Seq		= 0x04,		/* Sequence Error */
+	Cxe		= 0x10,		/* Carrier Extension Error */
+	Tcpe		= 0x20,		/* TCP/UDP Checksum Error */
+	Ipe		= 0x40,		/* IP Checksum Error */
+	Rxe		= 0x80,		/* RX Data Error */
+};
+
+/*
+ * Layout of one entry in the transmit descriptor ring.
+ * i82563transmit sets addr[0] and control; i82563cleanup
+ * watches status for Tdd and then clears it.
+ */
+typedef struct {			/* Transmit Descriptor */
+	u32int	addr[2];		/* Data */
+	u32int	control;
+	u32int	status;
+} Td;
+
+/*
+ * Fields of Td.control.  Some bit values are shared between
+ * context descriptors (CD) and data descriptors (DD), as marked.
+ */
+enum {					/* Tdesc control */
+	LenMASK		= 0x000FFFFF,	/* Data/Packet Length Field */
+	LenSHIFT	= 0,
+	DtypeCD		= 0x00000000,	/* Data Type 'Context Descriptor' */
+	DtypeDD		= 0x00100000,	/* Data Type 'Data Descriptor' */
+	PtypeTCP	= 0x01000000,	/* TCP/UDP Packet Type (CD) */
+	Teop		= 0x01000000,	/* End of Packet (DD) */
+	PtypeIP		= 0x02000000,	/* IP Packet Type (CD) */
+	Ifcs		= 0x02000000,	/* Insert FCS (DD) */
+	Tse		= 0x04000000,	/* TCP Segmentation Enable */
+	Rs		= 0x08000000,	/* Report Status */
+	Rps		= 0x10000000,	/* Report Status Sent */
+	Dext		= 0x20000000,	/* Descriptor Extension */
+	Vle		= 0x40000000,	/* VLAN Packet Enable */
+	Ide		= 0x80000000,	/* Interrupt Delay Enable */
+};
+
+/* Fields of Td.status. */
+enum {					/* Tdesc status */
+	Tdd		= 0x0001,	/* Descriptor Done */
+	Ec		= 0x0002,	/* Excess Collisions */
+	Lc		= 0x0004,	/* Late Collision */
+	Tu		= 0x0008,	/* Transmit Underrun */
+	CssMASK		= 0xFF00,	/* Checksum Start Field */
+	CssSHIFT	= 8,
+};
+
+/*
+ * Handle on the flash register block, with 16-bit and 32-bit
+ * views to match the Fsts/Fctl (16-bit) and Bfpr/Faddr/Fdata
+ * (32-bit) register indices.  The meaning of sz is not visible
+ * in this part of the file -- confirm against its user.
+ */
+typedef struct {
+	u16int	*reg;
+	u32int	*reg32;
+	int	sz;
+} Flash;
+
+/*
+ * Flash register indices and bits.  The register values are array
+ * indices, not byte offsets: those written as n/2 index 16-bit
+ * words, those written as n/4 index 32-bit words.
+ */
+enum {
+	/* 16 and 32-bit flash registers for ich flash parts */
+	Bfpr	= 0x00/4,		/* flash base 0:12; lim 16:28 */
+	Fsts	= 0x04/2,		/* flash status;  Hsfsts */
+	Fctl	= 0x06/2,		/* flash control; Hsfctl */
+	Faddr	= 0x08/4,		/* flash address to r/w */
+	Fdata	= 0x10/4,		/* data @ address */
+
+	/* status register */
+	Fdone	= 1<<0,			/* flash cycle done */
+	Fcerr	= 1<<1,			/* cycle error; write 1 to clear */
+	Ael	= 1<<2,			/* direct access error log; 1 to clear */
+	Scip	= 1<<5,			/* spi cycle in progress */
+	Fvalid	= 1<<14,		/* flash descriptor valid */
+
+	/* control register */
+	Fgo	= 1<<0,			/* start cycle */
+	Flcycle	= 1<<1,			/* two bits: r=0; w=2 */
+	Fdbc	= 1<<8,			/* bytes to read; 5 bits */
+};
+
+/*
+ * Ring and pool sizes.  Nrd and Ntd must stay powers of two:
+ * ring indices are advanced with Next(), which masks with size-1.
+ */
+enum {
+	Nrd		= 256,		/* power of two */
+	Ntd		= 128,		/* power of two */
+	Nrb		= 512,		/* private receive buffers per Ctlr */
+};
+
+/*
+ * Controller variants, stored in Ctlr.type.
+ * The values index tname[] (and presumably rbtab[] -- its use is
+ * outside this part of the file), so keep all three in step.
+ */
+enum {
+	Iany,
+	i82563,
+	i82566,
+	i82571,
+	i82572,
+	i82573,
+	i82575,
+	i82576,
+};
+
+/*
+ * Receive buffer size for each controller variant, in the same
+ * order as the type enum (Iany .. i82576) and tname[].
+ * The table must have one entry per variant; it previously
+ * stopped at i82575, so a lookup for i82576 ran off the end.
+ */
+static int rbtab[] = {
+	0,				/* Iany */
+	9014,				/* i82563 */
+	1514,				/* i82566 */
+	9234,				/* i82571 */
+	9234,				/* i82572 */
+	8192,				/* i82573: terrible performance above 8k */
+	1514,				/* i82575 */
+	1514,				/* i82576: same as i82575 */
+};
+
+/*
+ * Printable name for each controller variant, indexed by Ctlr.type
+ * (i82563ifstat prints tname[ctlr->type]).
+ */
+static char *tname[] = {
+	"any",
+	"i82563",
+	"i82566",
+	"i82571",
+	"i82572",
+	"i82573",
+	"i82575",
+	"i82576",
+};
+
+/*
+ * Per-controller driver state: register window, interrupt mask,
+ * statistics, and the receive and transmit rings together with the
+ * rendezvous points their kprocs sleep on.
+ */
+typedef struct Ctlr Ctlr;
+struct Ctlr {
+	int	port;
+	Pcidev	*pcidev;
+	Ctlr	*next;
+	int	active;
+	int	type;			/* controller variant; indexes tname[] */
+	ushort	eeprom[0x40];
+
+	QLock	alock;			/* attach */
+	int	attached;
+	int	nrd;
+	int	ntd;
+	int	nrb;			/* how many this Ctlr has in the pool */
+	unsigned rbsz;			/* unsigned for % and / by 1024 */
+
+	int	*nic;			/* register window, indexed in 32-bit words by csr32r/csr32w */
+	Lock	imlock;			/* guards im and the Ims/Imc writes */
+	int	im;			/* interrupt mask */
+
+	Rendez	lrendez;		/* i82563lproc sleeps here */
+	int	lim;			/* set by the interrupt handler on Lsc */
+
+	QLock	slock;			/* held by i82563ifstat */
+	uint	statistics[Nstatistics];	/* running totals accumulated by i82563ifstat */
+	uint	lsleep;
+	uint	lintr;
+	uint	rsleep;
+	uint	rintr;
+	uint	txdw;
+	uint	tintr;
+	uint	ixsm;
+	uint	ipcs;
+	uint	tcpcs;
+	uint	speeds[4];		/* link-up counts per speed index, see i82563lproc */
+
+	uchar	ra[Eaddrlen];		/* receive address */
+	ulong	mta[128];		/* multicast table array */
+
+	Rendez	rrendez;		/* i82563rproc sleeps here */
+	int	rim;			/* receive interrupt causes posted by the handler */
+	int	rdfree;			/* incremented per descriptor filled, decremented per descriptor consumed */
+	Rd	*rdba;			/* receive descriptor base address */
+	Block	**rb;			/* receive buffers */
+	int	rdh;			/* receive descriptor head */
+	int	rdt;			/* receive descriptor tail */
+	int	rdtr;			/* receive delay timer ring value */
+	int	radv;			/* receive interrupt absolute delay timer */
+
+	Rendez	trendez;		/* i82563tproc sleeps here */
+	QLock	tlock;			/* held across i82563transmit */
+	int	tbusy;
+	Td	*tdba;			/* transmit descriptor base address */
+	Block	**tb;			/* transmit buffers */
+	int	tdh;			/* transmit descriptor head */
+	int	tdt;			/* transmit descriptor tail */
+
+	int	fcrtl;
+	int	fcrth;
+
+	uint	pba;			/* packet buffer allocation */
+};
+
+/*
+ * Register access: r is a byte offset (see the register enum),
+ * converted here to an index into the int array at c->nic.
+ */
+#define csr32r(c, r)	(*((c)->nic+((r)/4)))
+#define csr32w(c, r, v)	(*((c)->nic+((r)/4)) = (v))
+
+/* list of controllers, head and tail */
+static Ctlr* i82563ctlrhead;
+static Ctlr* i82563ctlrtail;
+
+/* free receive buffers, a single list shared by all controllers */
+static Lock i82563rblock;		/* free receive Blocks */
+static Block* i82563rbpool;
+
+/*
+ * Names of the statistics registers, one entry per 32-bit word
+ * starting at Statistics.  i82563ifstat skips nil entries; for the
+ * 64-bit counters (Gorcl, Gotcl, Torl, Totl) the word after the
+ * named one is read as the high half.
+ */
+static char* statistics[] = {
+	"CRC Error",
+	"Alignment Error",
+	"Symbol Error",
+	"RX Error",
+	"Missed Packets",
+	"Single Collision",
+	"Excessive Collisions",
+	"Multiple Collision",
+	"Late Collisions",
+	nil,
+	"Collision",
+	"Transmit Underrun",
+	"Defer",
+	"Transmit - No CRS",
+	"Sequence Error",
+	"Carrier Extension Error",
+	"Receive Error Length",
+	nil,
+	"XON Received",
+	"XON Transmitted",
+	"XOFF Received",
+	"XOFF Transmitted",
+	"FC Received Unsupported",
+	"Packets Received (64 Bytes)",
+	"Packets Received (65-127 Bytes)",
+	"Packets Received (128-255 Bytes)",
+	"Packets Received (256-511 Bytes)",
+	"Packets Received (512-1023 Bytes)",
+	"Packets Received (1024-mtu Bytes)",
+	"Good Packets Received",
+	"Broadcast Packets Received",
+	"Multicast Packets Received",
+	"Good Packets Transmitted",
+	nil,
+	"Good Octets Received",
+	nil,
+	"Good Octets Transmitted",
+	nil,
+	nil,
+	nil,
+	"Receive No Buffers",
+	"Receive Undersize",
+	"Receive Fragment",
+	"Receive Oversize",
+	"Receive Jabber",
+	"Management Packets Rx",
+	"Management Packets Drop",
+	"Management Packets Tx",
+	"Total Octets Received",
+	nil,
+	"Total Octets Transmitted",
+	nil,
+	"Total Packets Received",
+	"Total Packets Transmitted",
+	"Packets Transmitted (64 Bytes)",
+	"Packets Transmitted (65-127 Bytes)",
+	"Packets Transmitted (128-255 Bytes)",
+	"Packets Transmitted (256-511 Bytes)",
+	"Packets Transmitted (512-1023 Bytes)",
+	"Packets Transmitted (1024-mtu Bytes)",
+	"Multicast Packets Transmitted",
+	"Broadcast Packets Transmitted",
+	"TCP Segmentation Context Transmitted",
+	"TCP Segmentation Context Fail",
+	"Interrupt Assertion",
+	"Interrupt Rx Pkt Timer",
+	"Interrupt Rx Abs Timer",
+	"Interrupt Tx Pkt Timer",
+	"Interrupt Tx Abs Timer",
+	"Interrupt Tx Queue Empty",
+	"Interrupt Tx Desc Low",
+	"Interrupt Rx Min",
+	"Interrupt Rx Overrun",
+};
+
+/*
+ * Format the controller statistics as text and copy the part
+ * selected by offset and n into a.  Returns the count from readstr.
+ *
+ * Each statistics register is read and added to the running total
+ * kept in ctlr->statistics; lines print "name: total last-read" and
+ * counters whose total is still zero are omitted.  Gorcl, Gotcl,
+ * Torl and Totl are 64-bit counters spread over two consecutive
+ * words.  Raises Enomem if the scratch buffer cannot be allocated.
+ */
+static long
+i82563ifstat(Ether* edev, void* a, long n, ulong offset)
+{
+	Ctlr *ctlr;
+	char *s, *p, *e, *stat;
+	int i;
+	uint r;
+	uvlong tuvl, ruvl;
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->slock);
+	p = s = malloc(2*READSTR);
+	if(s == nil){
+		/* don't leave slock held when bailing out */
+		qunlock(&ctlr->slock);
+		error(Enomem);
+	}
+	e = p + 2*READSTR;
+
+	for(i = 0; i < Nstatistics; i++){
+		/* r is unsigned so the low word is never sign-extended below */
+		r = csr32r(ctlr, Statistics + i*4);
+		if((stat = statistics[i]) == nil)
+			continue;
+		switch(i){
+		case Gorcl:
+		case Gotcl:
+		case Torl:
+		case Totl:
+			ruvl = r;
+			ruvl += (uvlong)csr32r(ctlr, Statistics+(i+1)*4) << 32;
+			tuvl = ruvl;
+			tuvl += ctlr->statistics[i];
+			tuvl += (uvlong)ctlr->statistics[i+1] << 32;
+			if(tuvl == 0)
+				continue;
+			ctlr->statistics[i] = tuvl;
+			ctlr->statistics[i+1] = tuvl >> 32;
+			p = seprint(p, e, "%s: %llud %llud\n", stat, tuvl, ruvl);
+			i++;		/* the high word has been consumed */
+			break;
+
+		default:
+			ctlr->statistics[i] += r;
+			if(ctlr->statistics[i] == 0)
+				continue;
+			p = seprint(p, e, "%s: %ud %ud\n", stat,
+				ctlr->statistics[i], r);
+			break;
+		}
+	}
+
+	p = seprint(p, e, "lintr: %ud %ud\n", ctlr->lintr, ctlr->lsleep);
+	p = seprint(p, e, "rintr: %ud %ud\n", ctlr->rintr, ctlr->rsleep);
+	p = seprint(p, e, "tintr: %ud %ud\n", ctlr->tintr, ctlr->txdw);
+	p = seprint(p, e, "ixcs: %ud %ud %ud\n", ctlr->ixsm, ctlr->ipcs, ctlr->tcpcs);
+	p = seprint(p, e, "rdtr: %ud\n", ctlr->rdtr);
+	p = seprint(p, e, "radv: %ud\n", ctlr->radv);
+	p = seprint(p, e, "ctrl: %.8ux\n", csr32r(ctlr, Ctrl));
+	p = seprint(p, e, "ctrlext: %.8ux\n", csr32r(ctlr, Ctrlext));
+	p = seprint(p, e, "status: %.8ux\n", csr32r(ctlr, Status));
+	p = seprint(p, e, "txcw: %.8ux\n", csr32r(ctlr, Txcw));
+	p = seprint(p, e, "txdctl: %.8ux\n", csr32r(ctlr, Txdctl));
+	p = seprint(p, e, "pba: %.8ux\n", ctlr->pba);
+
+	p = seprint(p, e, "speeds: 10:%ud 100:%ud 1000:%ud ?:%ud\n",
+		ctlr->speeds[0], ctlr->speeds[1], ctlr->speeds[2], ctlr->speeds[3]);
+	p = seprint(p, e, "type: %s\n", tname[ctlr->type]);
+
+//	p = seprint(p, e, "eeprom:");
+//	for(i = 0; i < 0x40; i++){
+//		if(i && ((i & 7) == 0))
+//			p = seprint(p, e, "\n       ");
+//		p = seprint(p, e, " %4.4ux", ctlr->eeprom[i]);
+//	}
+//	p = seprint(p, e, "\n");
+
+	USED(p);
+	n = readstr(offset, a, n, s);
+	free(s);
+	qunlock(&ctlr->slock);
+
+	return n;
+}
+
+/* Indices of the control commands listed in i82563ctlmsg. */
+enum {
+	CMrdtr,
+	CMradv,
+};
+
+/*
+ * Control commands accepted by i82563ctl: index, name, and field
+ * count (command word plus one argument).
+ */
+static Cmdtab i82563ctlmsg[] = {
+	CMrdtr,	"rdtr",	2,
+	CMradv,	"radv",	2,
+};
+
+/*
+ * Handle a write to the ctl file.  Accepts "rdtr n" and "radv n",
+ * where n is a number no larger than 0xFFFF; the value is recorded
+ * in the Ctlr and written to the matching register.
+ * Raises Enonexist if there is no controller and Ebadarg for a
+ * malformed or out-of-range number.  Returns n.
+ */
+static long
+i82563ctl(Ether* edev, void* buf, long n)
+{
+	Ctlr *ctlr;
+	Cmdbuf *cb;
+	Cmdtab *ct;
+	char *end;
+	ulong val;
+
+	ctlr = edev->ctlr;
+	if(ctlr == nil)
+		error(Enonexist);
+
+	cb = parsecmd(buf, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+
+	ct = lookupcmd(cb, i82563ctlmsg, nelem(i82563ctlmsg));
+	switch(ct->index){
+	case CMrdtr:
+	case CMradv:
+		/* both commands take one 16-bit numeric argument */
+		val = strtoul(cb->f[1], &end, 0);
+		if(end == cb->f[1] || val > 0xFFFF)
+			error(Ebadarg);
+		if(ct->index == CMrdtr){
+			ctlr->rdtr = val;
+			csr32w(ctlr, Rdtr, val);
+		}else{
+			ctlr->radv = val;
+			csr32w(ctlr, Radv, val);
+		}
+		break;
+	}
+	free(cb);
+	poperror();
+
+	return n;
+}
+
+/*
+ * Turn promiscuous reception on or off by setting or clearing both
+ * Upe and Mpe in Rctl.  The multicast offset field (MoMASK) is
+ * cleared in either case.  arg is the Ether.
+ */
+static void
+i82563promiscuous(void* arg, int on)
+{
+	Ctlr *c;
+	int v;
+
+	c = ((Ether*)arg)->ctlr;
+
+	/* start with the offset field and both promiscuous bits clear */
+	v = csr32r(c, Rctl) & ~(MoMASK|Upe|Mpe);
+	if(on)
+		v |= Upe|Mpe;
+	csr32w(c, Rctl, v);
+}
+
+/*
+ * Add a multicast address to the hardware filter.  The table word
+ * is chosen from addr[5] and the bit within it from addr[5] and
+ * addr[4]; the i82566 only uses 32 words.  The affected word is
+ * written back to the Mta register whether or not it changed.
+ */
+static void
+i82563multicast(void* arg, uchar* addr, int on)
+{
+	Ctlr *c;
+	int word, bit;
+
+	c = ((Ether*)arg)->ctlr;
+
+	word = addr[5]>>1;
+	if(c->type == i82566)
+		word &= 31;
+	bit = ((addr[5] & 1)<<4)|(addr[4]>>4);
+
+	/*
+	 * Removal (on == 0) deliberately leaves the bit set:
+	 * several ether addresses can hash to the same filter bit,
+	 * so clearing one is never safe.  To clear filter bits we
+	 * would have to track every multicast address in use, clear
+	 * the whole table, then set the bits for the addresses that
+	 * remain.
+	 */
+	if(on)
+		c->mta[word] |= 1<<bit;
+
+	csr32w(c, Mta+word*4, c->mta[word]);
+}
+
+/*
+ * Take one receive buffer off the shared free list.
+ * Returns nil if the list is empty.
+ */
+static Block*
+i82563rballoc(void)
+{
+	Block *b;
+
+	ilock(&i82563rblock);
+	b = i82563rbpool;
+	if(b != nil){
+		/* unlink the head of the list */
+		i82563rbpool = b->next;
+		b->next = nil;
+	}
+	iunlock(&i82563rblock);
+
+	return b;
+}
+
+/*
+ * Return a receive buffer to the shared free list.  Installed as
+ * the Block free routine by i82563attach.  The read and write
+ * pointers are reset to the first 4KiB boundary at or after base
+ * and the checksum flags are cleared.
+ */
+static void
+i82563rbfree(Block* b)
+{
+	uchar *start;
+
+	start = (uchar*)ROUNDUP((uintptr)b->base, 4*KiB);
+	b->wp = start;
+	b->rp = start;
+	b->flag &= ~(Bpktck|Btcpck|Budpck|Bipck);
+
+	/* push onto the head of the pool */
+	ilock(&i82563rblock);
+	b->next = i82563rbpool;
+	i82563rbpool = b;
+	iunlock(&i82563rblock);
+}
+
+/*
+ * Enable the interrupt causes in im: they are added to the
+ * software copy of the mask and the whole mask is written to Ims,
+ * all under imlock.
+ */
+static void
+i82563im(Ctlr* ctlr, int im)
+{
+	ilock(&ctlr->imlock);
+	ctlr->im |= im;
+	csr32w(ctlr, Ims, ctlr->im);
+	iunlock(&ctlr->imlock);
+}
+
+/*
+ * Set up the transmitter: program Tctl and Tipg, point the chip
+ * at the descriptor ring, reset the ring indices, free any blocks
+ * still attached to the ring, set the interrupt delay and
+ * descriptor thresholds, then enable transmission (Ten).
+ */
+static void
+i82563txinit(Ctlr* ctlr)
+{
+	int i, r;
+	Block *bp;
+
+	csr32w(ctlr, Tctl, 0x0F<<CtSHIFT | Psp | 66<<ColdSHIFT | Mulr);
+	csr32w(ctlr, Tipg, 6<<20 | 8<<10 | 8);		/* yb sez: 0x702008 */
+	csr32w(ctlr, Tdbal, PCIWADDR(ctlr->tdba));
+	csr32w(ctlr, Tdbah, 0);
+	csr32w(ctlr, Tdlen, ctlr->ntd * sizeof(Td));
+	/* software head trails the hardware head by one; see i82563cleanup */
+	ctlr->tdh = PREV(0, ctlr->ntd);
+	csr32w(ctlr, Tdh, 0);
+	ctlr->tdt = 0;
+	csr32w(ctlr, Tdt, 0);
+	for(i = 0; i < ctlr->ntd; i++){
+		if((bp = ctlr->tb[i]) != nil){
+			ctlr->tb[i] = nil;
+			freeb(bp);
+		}
+		memset(&ctlr->tdba[i], 0, sizeof(Td));
+	}
+	csr32w(ctlr, Tidv, 128);
+	r = csr32r(ctlr, Txdctl);
+	/*
+	 * Clear both threshold fields before setting them.
+	 * This used PthreshSHIFT (which is 0) where the mask was
+	 * meant, leaving the old prefetch threshold to be OR-ed
+	 * with the new one.
+	 */
+	r &= ~(WthreshMASK|PthreshMASK);
+	r |= 4<<WthreshSHIFT | 4<<PthreshSHIFT;
+	if(ctlr->type == i82575 || ctlr->type == i82576)
+		r |= Qenable;
+	csr32w(ctlr, Tadv, 64);
+	csr32w(ctlr, Txdctl, r);
+	r = csr32r(ctlr, Tctl);
+	r |= Ten;
+	csr32w(ctlr, Tctl, r);
+//	if(ctlr->type == i82671)
+//		csr32w(ctlr, Tarc0, csr32r(ctlr, Tarc0) | 7<<24); /* yb sez? */
+}
+
+/* advance ring index x by one; m is the ring size minus 1 (size is a power of two) */
+#define Next(x, m)	(((x)+1) & (m))
+
+/*
+ * Reclaim transmit descriptors the chip has finished with.
+ * Starting after c->tdh, each descriptor with Tdd set has its
+ * block freed and its status cleared.  Stores and returns the
+ * index of the last descriptor reclaimed as the new c->tdh.
+ */
+static int
+i82563cleanup(Ctlr *c)
+{
+	Block *b;
+	int head, mask, nxt;
+
+	mask = c->ntd-1;
+	head = c->tdh;
+	for(;;){
+		nxt = Next(head, mask);
+		if(!(c->tdba[nxt].status & Tdd))
+			break;
+		head = nxt;
+		b = c->tb[head];
+		if(b == nil)
+			iprint("82563 tx underrun!\n");
+		else{
+			c->tb[head] = nil;
+			freeb(b);
+		}
+		c->tdba[head].status = 0;
+	}
+
+	c->tdh = head;
+	return head;
+}
+
+/*
+ * Move packets from the output queue onto the transmit ring.
+ * Completed descriptors are reclaimed first.  If the ring fills,
+ * the Txdw interrupt is enabled so that i82563tproc calls back in
+ * once descriptors have been written back.  The tail register is
+ * only written if something was queued.  Serialised by tlock.
+ */
+static void
+i82563transmit(Ether* edev)
+{
+	Td *td;
+	Block *bp;
+	Ctlr *ctlr;
+	int tdh, tdt, m;
+
+	ctlr = edev->ctlr;
+
+	qlock(&ctlr->tlock);
+
+	/* free whatever the chip has finished sending */
+	tdh = i82563cleanup(ctlr);
+
+	/* then top the ring up from the output queue */
+	m = ctlr->ntd-1;
+	for(tdt = ctlr->tdt; ; tdt = Next(tdt, m)){
+		if(Next(tdt, m) == tdh){
+			/* ring full: ask for a write-back interrupt */
+			ctlr->txdw++;
+			i82563im(ctlr, Txdw);
+			break;
+		}
+		bp = qget(edev->oq);
+		if(bp == nil)
+			break;
+		td = &ctlr->tdba[tdt];
+		td->addr[0] = PCIWADDR(bp->rp);
+		td->control = Ide|Rs|Ifcs|Teop|BLEN(bp);
+		ctlr->tb[tdt] = bp;
+	}
+	if(tdt != ctlr->tdt){
+		ctlr->tdt = tdt;
+		csr32w(ctlr, Tdt, tdt);
+	}
+	qunlock(&ctlr->tlock);
+}
+
+/*
+ * Hand fresh buffers to the receive ring.  Descriptors from the
+ * software tail up to (but not including) the one before the head
+ * are given a buffer from the shared pool; the loop stops early if
+ * a slot unexpectedly still holds a buffer or the pool is empty.
+ * The tail register is always rewritten.
+ */
+static void
+i82563replenish(Ctlr* ctlr)
+{
+	Rd *rd;
+	int rdt, m;
+	Block *bp;
+
+	rdt = ctlr->rdt;
+	m = ctlr->nrd-1;
+	while(Next(rdt, m) != ctlr->rdh){
+		rd = &ctlr->rdba[rdt];
+		if(ctlr->rb[rdt] != nil){
+			/* this is the receive ring; the message used to say "tx" */
+			iprint("82563: rx overrun\n");
+			break;
+		}
+		bp = i82563rballoc();
+		if(bp == nil){
+			iprint("82563: no available buffers\n");
+			break;
+		}
+		ctlr->rb[rdt] = bp;
+		rd->addr[0] = PCIWADDR(bp->rp);
+//		rd->addr[1] = 0;
+		rd->status = 0;
+		ctlr->rdfree++;
+		rdt = Next(rdt, m);
+	}
+	ctlr->rdt = rdt;
+	csr32w(ctlr, Rdt, rdt);
+}
+
+/*
+ * Set up the receiver: choose the Rctl buffer-size encoding from
+ * ctlr->rbsz, program the descriptor ring and delay timers, free
+ * any blocks left on the ring and refill it, set the descriptor
+ * thresholds on the 82575/82576, and enable checksum offload.
+ * Rctl is left with Ren clear; the caller enables reception.
+ */
+static void
+i82563rxinit(Ctlr* ctlr)
+{
+	Block *bp;
+	int i, r, rctl;
+
+	if(ctlr->rbsz <= 2048)
+		rctl = Dpf|Bsize2048|Bam|RdtmsHALF;
+	else if(ctlr->rbsz <= 8192)
+		rctl = Lpe|Dpf|Bsize8192|Bsex|Bam|RdtmsHALF|Secrc;
+	else if(ctlr->rbsz <= 12*1024){
+		/* flexible size is given in whole KB, rounded up */
+		i = ctlr->rbsz / 1024;
+		if(ctlr->rbsz % 1024)
+			i++;
+		rctl = Lpe|Dpf|BsizeFlex*i|Bam|RdtmsHALF|Secrc;
+	}
+	else
+		rctl = Lpe|Dpf|Bsize16384|Bsex|Bam|RdtmsHALF|Secrc;
+
+	if(ctlr->type == i82575 || ctlr->type == i82576){
+		/*
+		 * Setting Qenable in Rxdctl does not
+		 * appear to stick unless Ren is on.
+		 */
+		csr32w(ctlr, Rctl, Ren|rctl);
+		r = csr32r(ctlr, Rxdctl);
+		r |= Qenable;
+		csr32w(ctlr, Rxdctl, r);
+	}
+	csr32w(ctlr, Rctl, rctl);
+
+	if(ctlr->type == i82573)
+		csr32w(ctlr, Ert, 1024/8);
+
+	if(ctlr->type == i82566)
+		csr32w(ctlr, Pbs, 16);
+
+	csr32w(ctlr, Rdbal, PCIWADDR(ctlr->rdba));
+	csr32w(ctlr, Rdbah, 0);
+	csr32w(ctlr, Rdlen, ctlr->nrd * sizeof(Rd));
+	ctlr->rdh = 0;
+	csr32w(ctlr, Rdh, 0);
+	ctlr->rdt = 0;
+	csr32w(ctlr, Rdt, 0);
+	ctlr->rdtr = 25;
+	ctlr->radv = 500;
+	csr32w(ctlr, Rdtr, ctlr->rdtr);
+	csr32w(ctlr, Radv, ctlr->radv);
+
+	for(i = 0; i < ctlr->nrd; i++){
+		if((bp = ctlr->rb[i]) != nil){
+			ctlr->rb[i] = nil;
+			freeb(bp);
+		}
+	}
+	i82563replenish(ctlr);
+
+	if(ctlr->type == i82575 || ctlr->type == i82576){
+		/*
+		 * See comment above for Qenable.
+		 * Could shuffle the code?
+		 *
+		 * Clear the threshold fields with their masks.  This
+		 * used the SHIFT constants (16 and 0), which cleared
+		 * one unrelated bit and left the old thresholds to be
+		 * OR-ed with the new ones.
+		 */
+		r = csr32r(ctlr, Rxdctl);
+		r &= ~(WthreshMASK|PthreshMASK);
+		r |= (2<<WthreshSHIFT)|(2<<PthreshSHIFT);
+		csr32w(ctlr, Rxdctl, r);
+	}
+
+	/*
+	 * Enable checksum offload.
+	 */
+	csr32w(ctlr, Rxcsum, Tuofl | Ipofl | ETHERHDRSIZE<<PcssSHIFT);
+}
+
+/* sleep condition for i82563rproc: true once a receive cause has been posted in rim */
+static int
+i82563rim(void* ctlr)
+{
+	Ctlr *c;
+
+	c = ctlr;
+	return c->rim != 0;
+}
+
+/*
+ * Receive kproc.  Initialises and enables the receiver, then loops:
+ * enable the receive interrupt causes, sleep until the interrupt
+ * handler posts some in ctlr->rim, and drain every descriptor the
+ * chip has marked done, passing good packets to etheriq and
+ * freeing the rest.  The ring is refilled as descriptors are
+ * consumed.  Never returns.
+ */
+static void
+i82563rproc(void* arg)
+{
+	Rd *rd;
+	Block *bp;
+	Ctlr *ctlr;
+	int r, m, rdh, rim;
+	Ether *edev;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+
+	i82563rxinit(ctlr);
+	r = csr32r(ctlr, Rctl);
+	r |= Ren;
+	csr32w(ctlr, Rctl, r);
+	m = ctlr->nrd-1;
+
+	for(;;){
+		i82563im(ctlr, Rxt0|Rxo|Rxdmt0|Rxseq|Ack);
+		ctlr->rsleep++;
+//		coherence();
+		sleep(&ctlr->rrendez, i82563rim, ctlr);
+
+		rdh = ctlr->rdh;
+		for(;;){
+			rd = &ctlr->rdba[rdh];
+			/* take and clear the posted causes before testing the descriptor */
+			rim = ctlr->rim;
+			ctlr->rim = 0;
+			if(!(rd->status & Rdd))
+				break;
+
+			/*
+			 * Accept eop packets with no errors.
+			 * With no errors and the Ixsm bit clear,
+			 * the descriptor status Tcpcs and Ipcs bits give
+			 * an indication of whether the checksums were
+			 * calculated and valid.
+			 */
+			bp = ctlr->rb[rdh];
+			if((rd->status & Reop) && rd->errors == 0){
+				bp->wp += rd->length;
+				bp->lim = bp->wp;	/* lie like a dog. */
+				if(!(rd->status & Ixsm)){
+					ctlr->ixsm++;
+					if(rd->status & Ipcs){
+						/*
+						 * IP checksum calculated
+						 * (and valid as errors == 0).
+						 */
+						ctlr->ipcs++;
+						bp->flag |= Bipck;
+					}
+					if(rd->status & Tcpcs){
+						/*
+						 * TCP/UDP checksum calculated
+						 * (and valid as errors == 0).
+						 */
+						ctlr->tcpcs++;
+						bp->flag |= Btcpck|Budpck;
+					}
+					bp->checksum = rd->checksum;
+					bp->flag |= Bpktck;
+				}
+				etheriq(edev, bp, 1);
+			} else
+				freeb(bp);
+			ctlr->rb[rdh] = nil;
+
+			rd->status = 0;
+			ctlr->rdfree--;
+			ctlr->rdh = rdh = Next(rdh, m);
+			/* refill once 32 or more slots are empty, or the chip signalled Rxdmt0 */
+			if(ctlr->nrd-ctlr->rdfree >= 32 || (rim & Rxdmt0))
+				i82563replenish(ctlr);
+		}
+	}
+}
+
+/* sleep condition for i82563lproc: true once a link status change has been posted in lim */
+static int
+i82563lim(void* c)
+{
+	Ctlr *ctlr;
+
+	ctlr = c;
+	return ctlr->lim != 0;
+}
+
+/*
+ * Link speed in Mb/s for each 2-bit speed index computed in
+ * i82563lproc; the last entry (0) covers the remaining encoding.
+ */
+static int speedtab[] = {
+	10, 100, 1000, 0
+};
+
+/*
+ * Read PHY register reg through the Mdic register (PHY address 1).
+ * Polls up to 64 times, a microsecond apart, for the transaction
+ * to finish.  Returns the 16-bit register value, or ~0 if the chip
+ * flagged an error or never signalled ready.
+ */
+static uint
+phyread(Ctlr *c, int reg)
+{
+	uint v, tries;
+
+	csr32w(c, Mdic, MDIrop | 1<<MDIpSHIFT | reg<<MDIrSHIFT);
+	v = 0;
+	for(tries = 0; tries < 64; tries++){
+		v = csr32r(c, Mdic);
+		if(v & (MDIe|MDIready))
+			break;
+		microdelay(1);
+	}
+	/* success is ready without error */
+	if((v & (MDIe|MDIready)) == MDIready)
+		return v & 0xffff;
+	return ~0;
+}
+
+/*
+ * Write val to PHY register reg through the Mdic register
+ * (PHY address 1).  Polls up to 64 times, a microsecond apart,
+ * for the transaction to finish.  Returns 0 on success, or ~0 if
+ * the chip flagged an error or never signalled ready.
+ */
+static uint
+phywrite(Ctlr *c, int reg, ushort val)
+{
+	uint v, tries;
+
+	csr32w(c, Mdic, MDIwop | 1<<MDIpSHIFT | reg<<MDIrSHIFT | val);
+	v = 0;
+	for(tries = 0; tries < 64; tries++){
+		v = csr32r(c, Mdic);
+		if(v & (MDIe|MDIready))
+			break;
+		microdelay(1);
+	}
+	/* success is ready without error */
+	if((v & (MDIe|MDIready)) == MDIready)
+		return 0;
+	return ~0;
+}
+
+/*
+ * watch for changes of link state
+ *
+ * Link kproc.  On the 82573 the PHY link interrupts are enabled
+ * first.  Then, forever: read the PHY status, restart
+ * auto-negotiation if the PHY reports a negotiation fault, record
+ * link state and speed in the Ether, re-enable the Lsc interrupt
+ * and sleep until the interrupt handler posts the next change.
+ * A failed PHY read skips straight to the sleep.
+ */
+static void
+i82563lproc(void *v)
+{
+	uint phy, i, a;
+	Ctlr *c;
+	Ether *e;
+
+	e = v;
+	c = e->ctlr;
+
+	if(c->type == i82573 && (phy = phyread(c, Phyier)) != ~0)
+		phywrite(c, Phyier, phy | Lscie | Ancie | Spdie | Panie);
+	for(;;){
+		phy = phyread(c, Physsr);
+		if(phy == ~0)
+			goto next;
+		/* speed index: top two bits of the status word */
+		i = (phy>>14) & 3;
+
+		switch(c->type){
+		case i82563:
+			a = phyread(c, Phyisr) & Ane;
+			break;
+		case i82571:
+		case i82572:
+			a = phyread(c, Phylhr) & Anf;
+			/* these parts encode the speed one higher; wraps to 3 for 0 */
+			i = (i-1) & 3;
+			break;
+		default:
+			a = 0;
+			break;
+		}
+		if(a)
+			phywrite(c, Phyctl, phyread(c, Phyctl) | Ran | Ean);
+		e->link = (phy & Rtlink) != 0;
+		if(e->link){
+			c->speeds[i]++;
+			e->mbps = speedtab[i];
+		}
+next:
+		/* clear the condition before re-enabling the interrupt */
+		c->lim = 0;
+		i82563im(c, Lsc);
+		c->lsleep++;
+		sleep(&c->lrendez, i82563lim, c);
+	}
+}
+
+/*
+ * Transmit kproc: sleeps on trendez and calls i82563transmit each
+ * time it is woken (the interrupt handler wakes it on Txdw).
+ * The sleep condition is return0, so it only proceeds on a wakeup.
+ * Never returns.
+ */
+static void
+i82563tproc(void *v)
+{
+	Ether *e;
+	Ctlr *c;
+
+	e = v;
+	c = e->ctlr;
+	for(;;){
+		sleep(&c->trendez, return0, 0);
+		i82563transmit(e);
+	}
+}
+
+/*
+ * Attach the interface, once per controller (guarded by alock and
+ * the attached flag).  Allocates the descriptor rings and the
+ * block-pointer arrays, adds up to Nrb receive buffers to the
+ * shared pool, starts the link, receive and transmit kprocs, and
+ * initialises the transmitter.  On error, the buffers and arrays
+ * allocated so far are released and the error is passed on.
+ */
+static void
+i82563attach(Ether* edev)
+{
+	Block *bp;
+	Ctlr *ctlr;
+	char name[KNAMELEN];
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->alock);
+	if(ctlr->attached){
+		qunlock(&ctlr->alock);
+		return;
+	}
+
+	ctlr->nrd = Nrd;
+	ctlr->ntd = Ntd;
+
+	if(waserror()){
+		/*
+		 * Take this controller's share of buffers back out of
+		 * the pool; clearing free makes freeb release the
+		 * block rather than return it to the pool.
+		 * NOTE(review): i82563rballoc can return nil if the
+		 * shared pool holds fewer than nrb blocks -- confirm
+		 * that cannot happen on this path.
+		 */
+		while(ctlr->nrb > 0){
+			bp = i82563rballoc();
+			bp->free = nil;
+			freeb(bp);
+			ctlr->nrb--;
+		}
+		free(ctlr->tb);
+		ctlr->tb = nil;
+		free(ctlr->rb);
+		ctlr->rb = nil;
+		free(ctlr->tdba);
+		ctlr->tdba = nil;
+		free(ctlr->rdba);
+		ctlr->rdba = nil;
+		qunlock(&ctlr->alock);
+		nexterror();
+	}
+
+	if((ctlr->rdba = mallocalign(ctlr->nrd*sizeof(Rd), 128, 0, 0)) == nil)
+		error(Enomem);
+	if((ctlr->tdba = mallocalign(ctlr->ntd*sizeof(Td), 128, 0, 0)) == nil)
+		error(Enomem);
+	if((ctlr->rb = malloc(ctlr->nrd*sizeof(Block*))) == nil)
+		error(Enomem);
+	if((ctlr->tb = malloc(ctlr->ntd*sizeof(Block*))) == nil)
+		error(Enomem);
+
+	/*
+	 * Each buffer is allocated 4KiB larger than rbsz (i82563rbfree
+	 * rounds the data pointers up to a 4KiB boundary) and is put
+	 * in the pool by freeing it through i82563rbfree.
+	 */
+	for(ctlr->nrb = 0; ctlr->nrb < Nrb; ctlr->nrb++){
+		if((bp = allocb(ctlr->rbsz + 4*KiB)) == nil)
+			break;
+		bp->free = i82563rbfree;
+		freeb(bp);
+	}
+
+	ctlr->attached = 1;
+
+	snprint(name, sizeof name, "#l%dl", edev->ctlrno);
+	kproc(name, i82563lproc, edev);
+
+	snprint(name, sizeof name, "#l%dr", edev->ctlrno);
+	kproc(name, i82563rproc, edev);
+
+	snprint(name, sizeof name, "#l%dt", edev->ctlrno);
+	kproc(name, i82563tproc, edev);
+
+	i82563txinit(ctlr);
+
+	qunlock(&ctlr->alock);
+	poperror();
+}
+
+/*
+ * Interrupt handler.  With imlock held and all interrupts masked,
+ * read Icr repeatedly while it reports a cause that is enabled in
+ * ctlr->im.  Each serviced cause (link change, receive, transmit
+ * write-back) wakes the matching kernel process and is removed from
+ * the mask that is written back to Ims on the way out.
+ */
+static void
+i82563interrupt(Ureg*, void* arg)
+{
+	Ether *edev;
+	Ctlr *ctlr;
+	int cause, mask, rxbits;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+	rxbits = Rxt0|Rxo|Rxdmt0|Rxseq|Ack;
+
+	ilock(&ctlr->imlock);
+	csr32w(ctlr, Imc, ~0);
+	mask = ctlr->im;
+
+	while((cause = csr32r(ctlr, Icr)) & ctlr->im){
+		if(cause & Lsc){
+			mask &= ~Lsc;
+			ctlr->lim = cause & Lsc;
+			wakeup(&ctlr->lrendez);
+			ctlr->lintr++;
+		}
+		if(cause & rxbits){
+			ctlr->rim = cause & rxbits;
+			mask &= ~rxbits;
+			wakeup(&ctlr->rrendez);
+			ctlr->rintr++;
+		}
+		if(cause & Txdw){
+			mask &= ~Txdw;
+			ctlr->tintr++;
+			wakeup(&ctlr->trendez);
+		}
+	}
+
+	ctlr->im = mask;
+	csr32w(ctlr, Ims, mask);
+	iunlock(&ctlr->imlock);
+}
+
+/*
+ * Stop the controller and reset it: mask interrupts, disable
+ * receiver and transmitter, issue a device reset followed by an
+ * EEPROM reset, wait for pending interrupt causes to drain, then
+ * rebalance the packet buffer if needed and set link up.
+ * Each wait polls up to 1000 times with delay(1) between polls.
+ * Returns 0 on success, -1 if any of the waits runs out.
+ */
+static int
+i82563detach(Ctlr* ctlr)
+{
+	int v, n;
+
+	/*
+	 * Perform a device reset to get the chip back to the
+	 * power-on state, followed by an EEPROM reset to read
+	 * the defaults for some internal registers.
+	 */
+	csr32w(ctlr, Imc, ~0);
+	csr32w(ctlr, Rctl, 0);
+	csr32w(ctlr, Tctl, 0);
+
+	delay(10);
+
+	v = csr32r(ctlr, Ctrl);
+	if(ctlr->type == i82566)
+		v |= Phyrst;
+	csr32w(ctlr, Ctrl, Devrst | v);
+	delay(1);
+	for(n = 1000; n > 0 && (csr32r(ctlr, Ctrl) & Devrst) != 0; n--)
+		delay(1);
+	if(csr32r(ctlr, Ctrl) & Devrst)
+		return -1;
+
+	v = csr32r(ctlr, Ctrlext);
+	csr32w(ctlr, Ctrlext, v|Eerst);
+	delay(1);
+	for(n = 1000; n > 0 && (csr32r(ctlr, Ctrlext) & Eerst) != 0; n--)
+		delay(1);
+	if(csr32r(ctlr, Ctrlext) & Eerst)
+		return -1;
+
+	csr32w(ctlr, Imc, ~0);
+	delay(1);
+	for(n = 1000; n > 0 && csr32r(ctlr, Icr) != 0; n--)
+		delay(1);
+	if(csr32r(ctlr, Icr))
+		return -1;
+
+	/*
+	 * Balance Rx/Tx packet buffer.
+	 * No need to set PBA register unless using jumbo, defaults to 32KB
+	 * for receive. If it is changed, then have to do a MAC reset,
+	 * and need to do that at the right time as it will wipe stuff.
+	 */
+	if(ctlr->rbsz > 8192 && (ctlr->type == i82563 || ctlr->type == i82571 ||
+	    ctlr->type == i82572)){
+		/* new value is the mean of the two 16-bit halves of Pba */
+		ctlr->pba = csr32r(ctlr, Pba);
+		v = ctlr->pba >> 16;
+		v += ctlr->pba & 0xffff;
+		v >>= 1;
+		csr32w(ctlr, Pba, v);
+	} else if(ctlr->type == i82573 && ctlr->rbsz > 1514)
+		csr32w(ctlr, Pba, 14);
+	ctlr->pba = csr32r(ctlr, Pba);
+
+	v = csr32r(ctlr, Ctrl);
+	csr32w(ctlr, Ctrl, Slu|v);
+
+	return 0;
+}
+
+/*
+ * Ether shutdown hook: reset the controller via i82563detach.
+ * The result of the reset is not examined.
+ */
+static void
+i82563shutdown(Ether* ether)
+{
+	Ctlr *ctlr;
+
+	ctlr = ether->ctlr;
+	i82563detach(ctlr);
+}
+
+/*
+ * Read one 16-bit word at EEPROM address adr through the Eerd
+ * register.  The wait for EEdone is bounded (about 1000 polls,
+ * microdelay(5) apart) so that an unresponsive device cannot hang
+ * the kernel; on timeout a message is printed and 0xffff is returned.
+ */
+static ushort
+eeread(Ctlr *ctlr, int adr)
+{
+	int tries;
+
+	csr32w(ctlr, Eerd, EEstart | adr << 2);
+	for(tries = 1000; (csr32r(ctlr, Eerd) & EEdone) == 0; tries--){
+		if(tries <= 0){
+			print("%s: eeread timeout, word %#ux\n",
+				tname[ctlr->type], adr);
+			return 0xffff;
+		}
+		microdelay(5);
+	}
+	return csr32r(ctlr, Eerd) >> 16;
+}
+
+/*
+ * Copy the first 0x40 EEPROM words into ctlr->eeprom and return
+ * their sum truncated to 16 bits.
+ */
+static int
+eeload(Ctlr *ctlr)
+{
+	ushort cksum, word;
+	int i;
+
+	cksum = 0;
+	i = 0;
+	while(i < 0x40){
+		word = eeread(ctlr, i);
+		ctlr->eeprom[i] = word;
+		cksum += word;
+		i++;
+	}
+	return cksum;
+}
+
+/*
+ * Prepare the flash interface for a new cycle.
+ * Fails (-1) if the status register lacks Fvalid.  Otherwise
+ * Fcerr|Ael are written back to the status register (presumably
+ * to clear stale error bits -- confirm against the datasheet) and
+ * the status is polled up to 10 times, delay(1) apart, until Scip
+ * is clear.  Returns 0 once Scip is clear, -1 if it never clears.
+ */
+static int
+fcycle(Ctlr *, Flash *f)
+{
+	ushort sts, n;
+
+	sts = f->reg[Fsts];
+	if(!(sts & Fvalid))
+		return -1;
+	f->reg[Fsts] |= Fcerr | Ael;
+	n = 0;
+	while(n < 10){
+		if(!(sts & Scip))
+			return 0;
+		delay(1);
+		sts = f->reg[Fsts];
+		n++;
+	}
+	return -1;
+}
+
+/*
+ * Read the 16-bit word at flash linear address ladr.
+ * Returns the word (0..0xffff), or -1 if the flash interface is
+ * not ready, reports an error, or does not complete in time.
+ * The wait for Fdone is bounded (about 1000 polls, microdelay(5)
+ * apart) so that a wedged flash interface cannot hang the kernel.
+ */
+static int
+fread(Ctlr *c, Flash *f, int ladr)
+{
+	ushort s;
+	int tries;
+
+	delay(1);
+	if(fcycle(c, f) == -1)
+		return -1;
+	f->reg[Fsts] |= Fdone;
+	f->reg32[Faddr] = ladr;
+
+	/* setup flash control register */
+	s = f->reg[Fctl];
+	s &= ~(0x1f << 8);
+	s |= (2-1) << 8;		/* 2 bytes */
+	s &= ~(2*Flcycle);		/* read */
+	f->reg[Fctl] = s | Fgo;
+
+	for(tries = 1000; (f->reg[Fsts] & Fdone) == 0; tries--){
+		if(tries <= 0){
+			print("%s: flash read timeout, address %#ux\n",
+				tname[c->type], ladr);
+			return -1;
+		}
+		microdelay(5);
+	}
+	if(f->reg[Fsts] & (Fcerr|Ael))
+		return -1;
+	return f->reg32[Fdata] & 0xffff;
+}
+
+/*
+ * Load the 0x40-word EEPROM image from flash (used for the 82566)
+ * into c->eeprom.  The flash registers are mapped from PCI memory
+ * region 1 for the duration of the call.
+ * Returns the 16-bit sum of the words read, or -1 if the registers
+ * cannot be mapped or any flash read fails.  A failed read must not
+ * return the partial sum: a failure on the first word would yield 0,
+ * which the caller accepts as a valid checksum.
+ */
+static int
+fload(Ctlr *c)
+{
+	ulong io, r, adr;
+	int data, failed;
+	ushort sum;
+	Flash f;
+
+	io = c->pcidev->mem[1].bar & ~0x0f;
+	f.reg = vmap(io, c->pcidev->mem[1].size);
+	if(f.reg == nil)
+		return -1;
+	f.reg32 = (void*)f.reg;
+	f.sz = f.reg32[Bfpr];
+	/* base of the image, in 4KiB units; bit 22 of Eec selects the next one */
+	r = f.sz & 0x1fff;
+	if(csr32r(c, Eec) & (1<<22))
+		++r;
+	r <<= 12;
+
+	sum = 0;
+	failed = 0;
+	for (adr = 0; adr < 0x40; adr++) {
+		data = fread(c, &f, r + adr*2);
+		if(data == -1){
+			failed = 1;
+			break;
+		}
+		c->eeprom[adr] = data;
+		sum += data;
+	}
+	vunmap(f.reg, c->pcidev->mem[1].size);
+	if(failed)
+		return -1;
+	return sum;
+}
+
+/*
+ * Reset the controller and load its configuration: reset the chip,
+ * read the EEPROM image (from flash on the 82566) and verify its
+ * checksum, derive the station address and program it into receive
+ * address slot 0, clear the other receive address slots and the
+ * multicast table, and set the flow control registers.
+ * Returns 0 on success, -1 if the chip reset or the checksum fails.
+ */
+static int
+i82563reset(Ctlr *ctlr)
+{
+	int n, v;
+
+	if(i82563detach(ctlr) != 0)
+		return -1;
+	v = ctlr->type == i82566 ? fload(ctlr) : eeload(ctlr);
+	if(v != 0 && v != 0xBABA){
+		print("%s: bad EEPROM checksum - %#.4ux\n",
+			tname[ctlr->type], v);
+		return -1;
+	}
+
+	/* each EEPROM word holds two address bytes, low byte first */
+	for(n = 0; n < Eaddrlen/2; n++){
+		ctlr->ra[2*n]   = ctlr->eeprom[Ea+n];
+		ctlr->ra[2*n+1] = ctlr->eeprom[Ea+n] >> 8;
+	}
+	v = (csr32r(ctlr, Status) & Lanid) >> 2;
+	ctlr->ra[5] += v;		/* ea ctlr[1] = ea ctlr[0]+1 */
+
+	v = ctlr->ra[3]<<24 | ctlr->ra[2]<<16 | ctlr->ra[1]<<8 | ctlr->ra[0];
+	csr32w(ctlr, Ral, v);
+	v = 0x80000000 | ctlr->ra[5]<<8 | ctlr->ra[4];
+	csr32w(ctlr, Rah, v);
+	n = 1;
+	while(n < 16){
+		csr32w(ctlr, Ral+n*8, 0);
+		csr32w(ctlr, Rah+n*8, 0);
+		n++;
+	}
+	memset(ctlr->mta, 0, sizeof(ctlr->mta));
+	for(n = 0; n < 128; n++)
+		csr32w(ctlr, Mta + n*4, 0);
+
+	/*
+	 * Does autonegotiation affect this manual setting?
+	 * The correct values here should depend on the PBA value
+	 * and maximum frame length, no?
+	 * ctlr->fcrt[lh] are never set so default to 0.
+	 */
+	csr32w(ctlr, Fcal, 0x00C28001);
+	csr32w(ctlr, Fcah, 0x0100);
+	csr32w(ctlr, Fct, 0x8808);
+	csr32w(ctlr, Fcttv, 0x0100);
+
+	csr32w(ctlr, Fcrtl, ctlr->fcrtl);
+	csr32w(ctlr, Fcrth, ctlr->fcrth);
+
+	return 0;
+}
+
+/*
+ * Scan the PCI bus for supported Intel (vendor 0x8086) devices.
+ * For each recognised device id: map memory region 0, allocate and
+ * fill in a Ctlr, reset the chip, enable bus mastering and append
+ * the Ctlr to the i82563ctlrhead/i82563ctlrtail list.
+ * A device that cannot be mapped, allocated or reset is skipped.
+ */
+static void
+i82563pci(void)
+{
+	int type;
+	ulong io;
+	void *mem;
+	Pcidev *p;
+	Ctlr *ctlr;
+
+	p = nil;
+	while(p = pcimatch(p, 0x8086, 0)){
+		switch(p->did){
+		default:
+			continue;
+		case 0x1096:
+		case 0x10ba:
+			type = i82563;
+			break;
+		case 0x1049:		/* mm */
+		case 0x104a:		/* dm */
+		case 0x104d:		/* v */
+		case 0x10bd:		/* dm */
+			type = i82566;
+			break;
+		case 0x10a4:
+		case 0x105e:
+			type = i82571;
+			break;
+		case 0x10b9:		/* sic, 82572 */
+			type = i82572;
+			break;
+		case 0x108b:		/*  e */
+		case 0x108c:		/*  e (iamt) */
+		case 0x109a:		/*  l */
+			type = i82573;
+			break;
+		case 0x10a7:		/* 82575eb */
+			type = i82575;
+			break;
+		case 0x10c9:		/* 82576 copper */
+		case 0x10e6:		/* 82576 fiber */
+		case 0x10e7:		/* 82576 serdes */
+			type = i82576;
+			break;
+		}
+
+		io = p->mem[0].bar & ~0x0F;
+		mem = vmap(io, p->mem[0].size);
+		if(mem == nil){
+			print("%s: can't map %.8lux\n", tname[type], io);
+			continue;
+		}
+		ctlr = malloc(sizeof(Ctlr));
+		if(ctlr == nil){
+			/* out of memory: undo the mapping and skip this device */
+			print("%s: can't allocate controller\n", tname[type]);
+			vunmap(mem, p->mem[0].size);
+			continue;
+		}
+		ctlr->port = io;
+		ctlr->pcidev = p;
+		ctlr->type = type;
+		ctlr->rbsz = rbtab[type];
+		ctlr->nic = mem;
+
+		if(i82563reset(ctlr)){
+			vunmap(mem, p->mem[0].size);
+			free(ctlr);
+			continue;
+		}
+		pcisetbme(p);
+
+		if(i82563ctlrhead != nil)
+			i82563ctlrtail->next = ctlr;
+		else
+			i82563ctlrhead = ctlr;
+		i82563ctlrtail = ctlr;
+	}
+}
+
+/*
+ * Bind edev to the first inactive controller of the requested type
+ * (Iany matches every type).  The PCI scan is run once, on the
+ * first call.  On success the controller is marked active, the
+ * Ether is filled in from it and 0 is returned; -1 means no
+ * suitable controller was found.
+ */
+static int
+pnp(Ether* edev, int type)
+{
+	Ctlr *ctlr;
+	static int done;
+
+	if(!done) {
+		i82563pci();
+		done = 1;
+	}
+
+	/*
+	 * Any adapter matches if no edev->port is supplied,
+	 * otherwise the ports must match.
+	 */
+	for(ctlr = i82563ctlrhead; ctlr != nil; ctlr = ctlr->next){
+		if(ctlr->active || (type != Iany && ctlr->type != type))
+			continue;
+		if(edev->port != 0 && edev->port != ctlr->port)
+			continue;
+		ctlr->active = 1;
+		break;
+	}
+	if(ctlr == nil)
+		return -1;
+
+	edev->ctlr = ctlr;
+	edev->port = ctlr->port;
+	edev->irq = ctlr->pcidev->intl;
+	edev->tbdf = ctlr->pcidev->tbdf;
+	edev->mbps = 1000;
+	edev->maxmtu = ctlr->rbsz;
+	memmove(edev->ea, ctlr->ra, Eaddrlen);
+
+	/*
+	 * Linkage to the generic ethernet driver.
+	 */
+	edev->attach = i82563attach;
+	edev->transmit = i82563transmit;
+	edev->interrupt = i82563interrupt;
+	edev->ifstat = i82563ifstat;
+	edev->ctl = i82563ctl;
+
+	edev->arg = edev;
+	edev->promiscuous = i82563promiscuous;
+	edev->shutdown = i82563shutdown;
+	edev->multicast = i82563multicast;
+
+	return 0;
+}
+
+/*
+ * Per-model entry points handed to addethercard.
+ * Each one calls pnp with the controller type it accepts;
+ * anypnp accepts every supported type.
+ */
+static int
+anypnp(Ether *edev)
+{
+	return pnp(edev, Iany);
+}
+
+static int
+i82563pnp(Ether *edev)
+{
+	return pnp(edev, i82563);
+}
+
+static int
+i82566pnp(Ether *edev)
+{
+	return pnp(edev, i82566);
+}
+
+static int
+i82571pnp(Ether *edev)
+{
+	return pnp(edev, i82571);
+}
+
+static int
+i82572pnp(Ether *edev)
+{
+	return pnp(edev, i82572);
+}
+
+static int
+i82573pnp(Ether *edev)
+{
+	return pnp(edev, i82573);
+}
+
+static int
+i82575pnp(Ether *edev)
+{
+	return pnp(edev, i82575);
+}
+
+static int
+i82576pnp(Ether *edev)
+{
+	return pnp(edev, i82576);
+}
+
+/*
+ * Register the driver with the generic ethernet code, once per
+ * model name and once under the catch-all name "igbepcie".
+ */
+void
+ether82563link(void)
+{
+	/* recognise lots of model numbers for debugging assistance */
+	static struct {
+		char	*name;
+		int	(*pnp)(Ether*);
+	} cards[] = {
+		{ "i82563",	i82563pnp },
+		{ "i82566",	i82566pnp },
+		{ "i82571",	i82571pnp },
+		{ "i82572",	i82572pnp },
+		{ "i82573",	i82573pnp },
+		{ "i82575",	i82575pnp },
+		{ "i82576",	i82576pnp },
+		{ "igbepcie",	anypnp },
+	};
+	int i;
+
+	for(i = 0; i < nelem(cards); i++)
+		addethercard(cards[i].name, cards[i].pnp);
+}

+ 2003 - 0
sys/src/9/386/etherigbe.c

@@ -0,0 +1,2003 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Intel 8254[340]NN Gigabit Ethernet Controller
+ * as found on the Intel PRO/1000 series of adapters:
+ *	82543GC	Intel PRO/1000 T
+ *	82544EI Intel PRO/1000 XT
+ *	82540EM Intel PRO/1000 MT
+ *	82541[GP]I
+ *	82547GI
+ *	82546GB
+ *	82546EB
+ * To Do:
+ *	finish autonegotiation code;
+ *	integrate fiber stuff back in (this ONLY handles
+ *	the CAT5 cards at the moment);
+ *	add checksum-offload;
+ *	add tuning control via ctl file;
+ *	this driver is little-endian specific.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "../port/ethermii.h"
+#include "../port/netif.h"
+
+#include "etherif.h"
+#include "io.h"
+
+enum {					/* values look like (PCI device id<<16)|vendor id 0x8086 -- confirm against the PCI scan code */
+	i82542		= (0x1000<<16)|0x8086,
+	i82543gc	= (0x1004<<16)|0x8086,
+	i82544ei	= (0x1008<<16)|0x8086,
+	i82547ei	= (0x1019<<16)|0x8086,
+	i82540em	= (0x100E<<16)|0x8086,
+	i82540eplp	= (0x101E<<16)|0x8086,
+	i82545gmc	= (0x1026<<16)|0x8086,
+	i82547gi	= (0x1075<<16)|0x8086,
+	i82541gi	= (0x1076<<16)|0x8086,
+	i82541gi2	= (0x1077<<16)|0x8086,
+	i82546gb	= (0x1079<<16)|0x8086,
+	i82541pi	= (0x107c<<16)|0x8086,
+	i82546eb	= (0x1010<<16)|0x8086,
+};
+
+enum {					/* register byte offsets, as used with csr32r/csr32w */
+	Ctrl		= 0x00000000,	/* Device Control */
+	Ctrldup		= 0x00000004,	/* Device Control Duplicate */
+	Status		= 0x00000008,	/* Device Status */
+	Eecd		= 0x00000010,	/* EEPROM/Flash Control/Data */
+	Ctrlext		= 0x00000018,	/* Extended Device Control */
+	Mdic		= 0x00000020,	/* MDI Control */
+	Fcal		= 0x00000028,	/* Flow Control Address Low */
+	Fcah		= 0x0000002C,	/* Flow Control Address High */
+	Fct		= 0x00000030,	/* Flow Control Type */
+	Icr		= 0x000000C0,	/* Interrupt Cause Read */
+	Ics		= 0x000000C8,	/* Interrupt Cause Set */
+	Ims		= 0x000000D0,	/* Interrupt Mask Set/Read */
+	Imc		= 0x000000D8,	/* Interrupt Mask Clear */
+	Rctl		= 0x00000100,	/* Receive Control */
+	Fcttv		= 0x00000170,	/* Flow Control Transmit Timer Value */
+	Txcw		= 0x00000178,	/* Transmit Configuration Word */
+	Rxcw		= 0x00000180,	/* Receive Configuration Word */
+	Tctl		= 0x00000400,	/* Transmit Control */
+	Tipg		= 0x00000410,	/* Transmit IPG */
+	Tbt		= 0x00000448,	/* Transmit Burst Timer */
+	Ait		= 0x00000458,	/* Adaptive IFS Throttle */
+	Fcrtl		= 0x00002160,	/* Flow Control RX Threshold Low */
+	Fcrth		= 0x00002168,	/* Flow Control Rx Threshold High */
+	Rdfh		= 0x00002410,	/* Receive data fifo head */
+	Rdft		= 0x00002418,	/* Receive data fifo tail */
+	Rdfhs		= 0x00002420,	/* Receive data fifo head saved */
+	Rdfts		= 0x00002428,	/* Receive data fifo tail saved */
+	Rdfpc		= 0x00002430,	/* Receive data fifo packet count */
+	Rdbal		= 0x00002800,	/* Rd Base Address Low */
+	Rdbah		= 0x00002804,	/* Rd Base Address High */
+	Rdlen		= 0x00002808,	/* Receive Descriptor Length */
+	Rdh		= 0x00002810,	/* Receive Descriptor Head */
+	Rdt		= 0x00002818,	/* Receive Descriptor Tail */
+	Rdtr		= 0x00002820,	/* Receive Descriptor Timer Ring */
+	Rxdctl		= 0x00002828,	/* Receive Descriptor Control */
+	Radv		= 0x0000282C,	/* Receive Interrupt Absolute Delay Timer */
+	Txdmac		= 0x00003000,	/* Transfer DMA Control */
+	Ett		= 0x00003008,	/* Early Transmit Control */
+	Tdfh		= 0x00003410,	/* Transmit data fifo head */
+	Tdft		= 0x00003418,	/* Transmit data fifo tail */
+	Tdfhs		= 0x00003420,	/* Transmit data fifo head saved */
+	Tdfts		= 0x00003428,	/* Transmit data fifo tail saved */
+	Tdfpc		= 0x00003430,	/* Transmit data fifo packet count */
+	Tdbal		= 0x00003800,	/* Td Base Address Low */
+	Tdbah		= 0x00003804,	/* Td Base Address High */
+	Tdlen		= 0x00003808,	/* Transmit Descriptor Length */
+	Tdh		= 0x00003810,	/* Transmit Descriptor Head */
+	Tdt		= 0x00003818,	/* Transmit Descriptor Tail */
+	Tidv		= 0x00003820,	/* Transmit Interrupt Delay Value */
+	Txdctl		= 0x00003828,	/* Transmit Descriptor Control */
+	Tadv		= 0x0000382C,	/* Transmit Interrupt Absolute Delay Timer */
+
+	Statistics	= 0x00004000,	/* Start of Statistics Area */
+	Gorcl		= 0x88/4,	/* Good Octets Received Count (statistics index, low word of a 64-bit counter) */
+	Gotcl		= 0x90/4,	/* Good Octets Transmitted Count (statistics index, low word) */
+	Torl		= 0xC0/4,	/* Total Octets Received (statistics index, low word) */
+	Totl		= 0xC8/4,	/* Total Octets Transmitted (statistics index, low word) */
+	Nstatistics	= 64,		/* number of 32-bit statistics registers read by igbeifstat */
+
+	Rxcsum		= 0x00005000,	/* Receive Checksum Control */
+	Mta		= 0x00005200,	/* Multicast Table Array */
+	Ral		= 0x00005400,	/* Receive Address Low */
+	Rah		= 0x00005404,	/* Receive Address High */
+	Manc		= 0x00005820,	/* Management Control */
+};
+
+enum {					/* Ctrl */
+	Bem		= 0x00000002,	/* Big Endian Mode */
+	Prior		= 0x00000004,	/* Priority on the PCI bus */
+	Lrst		= 0x00000008,	/* Link Reset */
+	Asde		= 0x00000020,	/* Auto-Speed Detection Enable */
+	Slu		= 0x00000040,	/* Set Link Up */
+	Ilos		= 0x00000080,	/* Invert Loss of Signal (LOS) */
+	SspeedMASK	= 0x00000300,	/* Speed Selection */
+	SspeedSHIFT	= 8,
+	Sspeed10	= 0x00000000,	/* 10Mb/s */
+	Sspeed100	= 0x00000100,	/* 100Mb/s */
+	Sspeed1000	= 0x00000200,	/* 1000Mb/s */
+	Frcspd		= 0x00000800,	/* Force Speed */
+	Frcdplx		= 0x00001000,	/* Force Duplex */
+	SwdpinsloMASK	= 0x003C0000,	/* Software Defined Pins - lo nibble */
+	SwdpinsloSHIFT	= 18,
+	SwdpioloMASK	= 0x03C00000,	/* Software Defined Pins - I or O */
+	SwdpioloSHIFT	= 22,
+	Devrst		= 0x04000000,	/* Device Reset */
+	Rfce		= 0x08000000,	/* Receive Flow Control Enable */
+	Tfce		= 0x10000000,	/* Transmit Flow Control Enable */
+	Vme		= 0x40000000,	/* VLAN Mode Enable */
+};
+
+enum {					/* Status */
+	Lu		= 0x00000002,	/* Link Up */
+	Tckok		= 0x00000004,	/* Transmit clock is running */
+	Rbcok		= 0x00000008,	/* Receive clock is running */
+	Txoff		= 0x00000010,	/* Transmission Paused */
+	Tbimode		= 0x00000020,	/* TBI Mode Indication */
+	LspeedMASK	= 0x000000C0,	/* Link Speed Setting */
+	LspeedSHIFT	= 6,
+	Lspeed10	= 0x00000000,	/* 10Mb/s */
+	Lspeed100	= 0x00000040,	/* 100Mb/s */
+	Lspeed1000	= 0x00000080,	/* 1000Mb/s */
+	Mtxckok		= 0x00000400,	/* MTX clock is running */
+	Pci66		= 0x00000800,	/* PCI Bus speed indication */
+	Bus64		= 0x00001000,	/* PCI Bus width indication */
+	Pcixmode	= 0x00002000,	/* PCI-X mode */
+	PcixspeedMASK	= 0x0000C000,	/* PCI-X bus speed */
+	PcixspeedSHIFT	= 14,
+	Pcix66		= 0x00000000,	/* 50-66MHz */
+	Pcix100		= 0x00004000,	/* 66-100MHz */
+	Pcix133		= 0x00008000,	/* 100-133MHz */
+};
+
+enum {					/* Ctrl and Status */
+	Fd		= 0x00000001,	/* Full-Duplex */
+	AsdvMASK	= 0x00000300,
+	AsdvSHIFT	= 8,
+	Asdv10		= 0x00000000,	/* 10Mb/s */
+	Asdv100		= 0x00000100,	/* 100Mb/s */
+	Asdv1000	= 0x00000200,	/* 1000Mb/s */
+};
+
+enum {					/* Eecd */
+	Sk		= 0x00000001,	/* Clock input to the EEPROM */
+	Cs		= 0x00000002,	/* Chip Select */
+	Di		= 0x00000004,	/* Data Input to the EEPROM */
+	Do		= 0x00000008,	/* Data Output from the EEPROM */
+	Areq		= 0x00000040,	/* EEPROM Access Request */
+	Agnt		= 0x00000080,	/* EEPROM Access Grant */
+	Eepresent	= 0x00000100,	/* EEPROM Present */
+	Eesz256		= 0x00000200,	/* EEPROM is 256 words not 64 */
+	Eeszaddr	= 0x00000400,	/* EEPROM size for 8254[17] */
+	Spi		= 0x00002000,	/* EEPROM is SPI not Microwire */
+};
+
+enum {					/* Ctrlext */
+	Gpien		= 0x0000000F,	/* General Purpose Interrupt Enables */
+	SwdpinshiMASK	= 0x000000F0,	/* Software Defined Pins - hi nibble */
+	SwdpinshiSHIFT	= 4,
+	SwdpiohiMASK	= 0x00000F00,	/* Software Defined Pins - I or O */
+	SwdpiohiSHIFT	= 8,
+	Asdchk		= 0x00001000,	/* ASD Check */
+	Eerst		= 0x00002000,	/* EEPROM Reset */
+	Ips		= 0x00004000,	/* Invert Power State */
+	Spdbyps		= 0x00008000,	/* Speed Select Bypass */
+};
+
+enum {					/* EEPROM content offsets */
+	Ea		= 0x00,		/* Ethernet Address */
+	Cf		= 0x03,		/* Compatibility Field */
+	Pba		= 0x08,		/* Printed Board Assembly number */
+	Icw1		= 0x0A,		/* Initialization Control Word 1 */
+	Sid		= 0x0B,		/* Subsystem ID */
+	Svid		= 0x0C,		/* Subsystem Vendor ID */
+	Did		= 0x0D,		/* Device ID */
+	Vid		= 0x0E,		/* Vendor ID */
+	Icw2		= 0x0F,		/* Initialization Control Word 2 */
+};
+
+enum {					/* Mdic */
+	MDIdMASK	= 0x0000FFFF,	/* Data */
+	MDIdSHIFT	= 0,
+	MDIrMASK	= 0x001F0000,	/* PHY Register Address */
+	MDIrSHIFT	= 16,
+	MDIpMASK	= 0x03E00000,	/* PHY Address */
+	MDIpSHIFT	= 21,
+	MDIwop		= 0x04000000,	/* Write Operation */
+	MDIrop		= 0x08000000,	/* Read Operation */
+	MDIready	= 0x10000000,	/* End of Transaction */
+	MDIie		= 0x20000000,	/* Interrupt Enable */
+	MDIe		= 0x40000000,	/* Error */
+};
+
+enum {					/* Icr, Ics, Ims, Imc */
+	Txdw		= 0x00000001,	/* Transmit Descriptor Written Back */
+	Txqe		= 0x00000002,	/* Transmit Queue Empty */
+	Lsc		= 0x00000004,	/* Link Status Change */
+	Rxseq		= 0x00000008,	/* Receive Sequence Error */
+	Rxdmt0		= 0x00000010,	/* Rd Minimum Threshold Reached */
+	Rxo		= 0x00000040,	/* Receiver Overrun */
+	Rxt0		= 0x00000080,	/* Receiver Timer Interrupt */
+	Mdac		= 0x00000200,	/* MDIO Access Completed */
+	Rxcfg		= 0x00000400,	/* Receiving /C/ ordered sets */
+	Gpi0		= 0x00000800,	/* General Purpose Interrupts */
+	Gpi1		= 0x00001000,
+	Gpi2		= 0x00002000,
+	Gpi3		= 0x00004000,
+};
+
+/*
+ * The Mdic register isn't implemented on the 82543GC,
+ * the software defined pins are used instead.
+ * These definitions work for the Intel PRO/1000 T Server Adapter.
+ * The direction pin bits are read from the EEPROM.
+ */
+enum {
+	Mdd		= ((1<<2)<<SwdpinsloSHIFT),	/* data */
+	Mddo		= ((1<<2)<<SwdpioloSHIFT),	/* pin direction */
+	Mdc		= ((1<<3)<<SwdpinsloSHIFT),	/* clock */
+	Mdco		= ((1<<3)<<SwdpioloSHIFT),	/* pin direction */
+	Mdr		= ((1<<0)<<SwdpinshiSHIFT),	/* reset */
+	Mdro		= ((1<<0)<<SwdpiohiSHIFT),	/* pin direction */
+};
+
+enum {					/* Txcw */
+	TxcwFd		= 0x00000020,	/* Full Duplex */
+	TxcwHd		= 0x00000040,	/* Half Duplex */
+	TxcwPauseMASK	= 0x00000180,	/* Pause */
+	TxcwPauseSHIFT	= 7,
+	TxcwPs		= (1<<TxcwPauseSHIFT),	/* Pause Supported */
+	TxcwAs		= (2<<TxcwPauseSHIFT),	/* Asymmetric FC desired */
+	TxcwRfiMASK	= 0x00003000,	/* Remote Fault Indication */
+	TxcwRfiSHIFT	= 12,
+	TxcwNpr		= 0x00008000,	/* Next Page Request */
+	TxcwConfig	= 0x40000000,	/* Transmit Config Control */
+	TxcwAne		= 0x80000000,	/* Auto-Negotiation Enable */
+};
+
+enum {					/* Rxcw */
+	Rxword		= 0x0000FFFF,	/* Data from auto-negotiation process */
+	Rxnocarrier	= 0x04000000,	/* Carrier Sense indication */
+	Rxinvalid	= 0x08000000,	/* Invalid Symbol during configuration */
+	Rxchange	= 0x10000000,	/* Change to the Rxword indication */
+	Rxconfig	= 0x20000000,	/* /C/ order set reception indication */
+	Rxsync		= 0x40000000,	/* Lost bit synchronization indication */
+	Anc		= 0x80000000,	/* Auto Negotiation Complete */
+};
+
+enum {					/* Rctl */
+	Rrst		= 0x00000001,	/* Receiver Software Reset */
+	Ren		= 0x00000002,	/* Receiver Enable */
+	Sbp		= 0x00000004,	/* Store Bad Packets */
+	Upe		= 0x00000008,	/* Unicast Promiscuous Enable */
+	Mpe		= 0x00000010,	/* Multicast Promiscuous Enable */
+	Lpe		= 0x00000020,	/* Long Packet Reception Enable */
+	LbmMASK		= 0x000000C0,	/* Loopback Mode */
+	LbmOFF		= 0x00000000,	/* No Loopback */
+	LbmTBI		= 0x00000040,	/* TBI Loopback */
+	LbmMII		= 0x00000080,	/* GMII/MII Loopback */
+	LbmXCVR		= 0x000000C0,	/* Transceiver Loopback */
+	RdtmsMASK	= 0x00000300,	/* Rd Minimum Threshold Size */
+	RdtmsHALF	= 0x00000000,	/* Threshold is 1/2 Rdlen */
+	RdtmsQUARTER	= 0x00000100,	/* Threshold is 1/4 Rdlen */
+	RdtmsEIGHTH	= 0x00000200,	/* Threshold is 1/8 Rdlen */
+	MoMASK		= 0x00003000,	/* Multicast Offset */
+	Mo47b36		= 0x00000000,	/* bits [47:36] of received address */
+	Mo46b35		= 0x00001000,	/* bits [46:35] of received address */
+	Mo45b34		= 0x00002000,	/* bits [45:34] of received address */
+	Mo43b32		= 0x00003000,	/* bits [43:32] of received address */
+	Bam		= 0x00008000,	/* Broadcast Accept Mode */
+	BsizeMASK	= 0x00030000,	/* Receive Buffer Size */
+	Bsize2048	= 0x00000000,	/* Bsex = 0 */
+	Bsize1024	= 0x00010000,	/* Bsex = 0 */
+	Bsize512	= 0x00020000,	/* Bsex = 0 */
+	Bsize256	= 0x00030000,	/* Bsex = 0 */
+	Bsize16384	= 0x00010000,	/* Bsex = 1 */
+	Vfe		= 0x00040000,	/* VLAN Filter Enable */
+	Cfien		= 0x00080000,	/* Canonical Form Indicator Enable */
+	Cfi		= 0x00100000,	/* Canonical Form Indicator value */
+	Dpf		= 0x00400000,	/* Discard Pause Frames */
+	Pmcf		= 0x00800000,	/* Pass MAC Control Frames */
+	Bsex		= 0x02000000,	/* Buffer Size Extension */
+	Secrc		= 0x04000000,	/* Strip CRC from incoming packet */
+};
+
+enum {					/* Tctl */
+	Trst		= 0x00000001,	/* Transmitter Software Reset */
+	Ten		= 0x00000002,	/* Transmit Enable */
+	Psp		= 0x00000008,	/* Pad Short Packets */
+	CtMASK		= 0x00000FF0,	/* Collision Threshold */
+	CtSHIFT		= 4,
+	ColdMASK	= 0x003FF000,	/* Collision Distance */
+	ColdSHIFT	= 12,
+	Swxoff		= 0x00400000,	/* Software XOFF Transmission */
+	Pbe		= 0x00800000,	/* Packet Burst Enable */
+	Rtlc		= 0x01000000,	/* Re-transmit on Late Collision */
+	Nrtu		= 0x02000000,	/* No Re-transmit on Underrun */
+};
+
+enum {					/* [RT]xdctl */
+	PthreshMASK	= 0x0000003F,	/* Prefetch Threshold */
+	PthreshSHIFT	= 0,
+	HthreshMASK	= 0x00003F00,	/* Host Threshold */
+	HthreshSHIFT	= 8,
+	WthreshMASK	= 0x003F0000,	/* Writeback Threshold */
+	WthreshSHIFT	= 16,
+	Gran		= 0x01000000,	/* Granularity */
+	LthreshMASK	= 0xFE000000,	/* Low Threshold */
+	LthreshSHIFT	= 25,
+};
+
+enum {					/* Rxcsum */
+	PcssMASK	= 0x000000FF,	/* Packet Checksum Start */
+	PcssSHIFT	= 0,
+	Ipofl		= 0x00000100,	/* IP Checksum Off-load Enable */
+	Tuofl		= 0x00000200,	/* TCP/UDP Checksum Off-load Enable */
+};
+
+enum {					/* Manc */
+	Arpen		= 0x00002000,	/* Enable ARP Request Filtering */
+};
+
+enum {					/* Receive Delay Timer Ring */
+	DelayMASK	= 0x0000FFFF,	/* delay timer in 1.024us increments */
+	DelaySHIFT	= 0,
+	Fpd		= 0x80000000,	/* Flush partial Descriptor Block */
+};
+
+/*
+ * Receive Descriptor, as laid out in the receive ring (rdba).
+ */
+typedef struct Rd Rd;
+struct Rd {
+	uint	addr[2];		/* buffer address, low word first */
+	ushort	length;
+	ushort	checksum;
+	uchar	status;			/* see "Rd status" */
+	uchar	errors;			/* see "Rd errors" */
+	ushort	special;
+};
+
+enum {					/* Rd status */
+	Rdd		= 0x01,		/* Descriptor Done */
+	Reop		= 0x02,		/* End of Packet */
+	Ixsm		= 0x04,		/* Ignore Checksum Indication */
+	Vp		= 0x08,		/* Packet is 802.1Q (matched VET) */
+	Tcpcs		= 0x20,		/* TCP Checksum Calculated on Packet */
+	Ipcs		= 0x40,		/* IP Checksum Calculated on Packet */
+	Pif		= 0x80,		/* Passed inexact filter */
+};
+
+enum {					/* Rd errors */
+	Ce		= 0x01,		/* CRC Error or Alignment Error */
+	Se		= 0x02,		/* Symbol Error */
+	Seq		= 0x04,		/* Sequence Error */
+	Cxe		= 0x10,		/* Carrier Extension Error */
+	Tcpe		= 0x20,		/* TCP/UDP Checksum Error */
+	Ipe		= 0x40,		/* IP Checksum Error */
+	Rxe		= 0x80,		/* RX Data Error */
+};
+
+typedef struct Td Td;
+struct Td {				/* Transmit Descriptor */
+	union {				/* first 8 bytes: buffer address or checksum context */
+		uint	addr[2];	/* Data */
+		struct {		/* Context */
+			uchar	ipcss;
+			uchar	ipcso;
+			ushort	ipcse;
+			uchar	tucss;
+			uchar	tucso;
+			ushort	tucse;
+		};
+	};
+	uint	control;		/* see "Td control" */
+	uint	status;			/* see "Td status" */
+};
+
+enum {					/* Td control */
+	LenMASK		= 0x000FFFFF,	/* Data/Packet Length Field */
+	LenSHIFT	= 0,
+	DtypeCD		= 0x00000000,	/* Data Type 'Context Descriptor' */
+	DtypeDD		= 0x00100000,	/* Data Type 'Data Descriptor' */
+	PtypeTCP	= 0x01000000,	/* TCP/UDP Packet Type (CD) */
+	Teop		= 0x01000000,	/* End of Packet (DD) */
+	PtypeIP		= 0x02000000,	/* IP Packet Type (CD) */
+	Ifcs		= 0x02000000,	/* Insert FCS (DD) */
+	Tse		= 0x04000000,	/* TCP Segmentation Enable */
+	Rs		= 0x08000000,	/* Report Status */
+	Rps		= 0x10000000,	/* Report Status Sent */
+	Dext		= 0x20000000,	/* Descriptor Extension */
+	Vle		= 0x40000000,	/* VLAN Packet Enable */
+	Ide		= 0x80000000,	/* Interrupt Delay Enable */
+};
+
+enum {					/* Td status */
+	Tdd		= 0x00000001,	/* Descriptor Done */
+	Ec		= 0x00000002,	/* Excess Collisions */
+	Lc		= 0x00000004,	/* Late Collision */
+	Tu		= 0x00000008,	/* Transmit Underrun */
+	Iixsm		= 0x00000100,	/* Insert IP Checksum */
+	Itxsm		= 0x00000200,	/* Insert TCP/UDP Checksum */
+	HdrlenMASK	= 0x0000FF00,	/* Header Length (Tse) */
+	HdrlenSHIFT	= 8,
+	VlanMASK	= 0x0FFF0000,	/* VLAN Identifier */
+	VlanSHIFT	= 16,
+	Tcfi		= 0x10000000,	/* Canonical Form Indicator */
+	PriMASK		= 0xE0000000,	/* User Priority */
+	PriSHIFT	= 29,
+	MssMASK		= 0xFFFF0000,	/* Maximum Segment Size (Tse) */
+	MssSHIFT	= 16,
+};
+
+enum {
+	Nrd		= 256,		/* multiple of 8 */
+	Ntd		= 64,		/* multiple of 8 */
+	Nrb		= 1024,		/* private receive buffers per Ctlr */
+	Rbsz		= 2048,		/* bytes reserved at the end of each receive Block (see igberbfree) */
+};
+
+/*
+ * Per-controller driver state; controllers are chained through next.
+ */
+typedef struct Ctlr Ctlr;
+struct Ctlr {
+	int	port;
+	Pcidev*	pcidev;
+	Ctlr*	next;
+	int	active;
+	int	started;
+	int	id;
+	int	cls;
+	ushort	eeprom[0x40];
+
+	QLock	alock;			/* attach */
+	void*	alloc;			/* receive/transmit descriptors */
+	int	nrd;
+	int	ntd;
+	int	nrb;			/* how many this Ctlr has in the pool */
+
+	int*	nic;			/* register base used by csr32r/csr32w */
+	Lock	imlock;			/* guards im and the Ims register */
+	int	im;			/* interrupt mask */
+
+	Mii*	mii;
+	Rendez	lrendez;
+	int	lim;
+
+	int	link;
+
+	QLock	slock;			/* held while igbeifstat runs */
+	uint	statistics[Nstatistics];	/* running totals of the statistics registers */
+	uint	lsleep;
+	uint	lintr;
+	uint	rsleep;
+	uint	rintr;
+	uint	txdw;
+	uint	tintr;
+	uint	ixsm;
+	uint	ipcs;
+	uint	tcpcs;
+
+	uchar	ra[Eaddrlen];		/* receive address */
+	ulong	mta[128];		/* multicast table array */
+
+	Rendez	rrendez;
+	int	rim;
+	int	rdfree;
+	Rd*	rdba;			/* receive descriptor base address */
+	Block**	rb;			/* receive buffers */
+	int	rdh;			/* receive descriptor head */
+	int	rdt;			/* receive descriptor tail */
+	int	rdtr;			/* receive delay timer ring value */
+
+	Lock	tlock;
+	int	tbusy;
+	int	tdfree;
+	Td*	tdba;			/* transmit descriptor base address */
+	Block**	tb;			/* transmit buffers */
+	int	tdh;			/* transmit descriptor head */
+	int	tdt;			/* transmit descriptor tail */
+
+	int	txcw;
+	int	fcrtl;
+	int	fcrth;
+};
+
+#define csr32r(c, r)	(*((c)->nic+((r)/4)))	/* read the 32-bit register at byte offset r */
+#define csr32w(c, r, v)	(*((c)->nic+((r)/4)) = (v))	/* write v to the 32-bit register at byte offset r */
+
+static Ctlr* igbectlrhead;
+static Ctlr* igbectlrtail;
+
+static Lock igberblock;		/* free receive Blocks */
+static Block* igberbpool;
+
+static char* statistics[Nstatistics] = {	/* names by statistics register index; igbeifstat skips nil entries */
+	"CRC Error",
+	"Alignment Error",
+	"Symbol Error",
+	"RX Error",
+	"Missed Packets",
+	"Single Collision",
+	"Excessive Collisions",
+	"Multiple Collision",
+	"Late Collisions",
+	nil,
+	"Collision",
+	"Transmit Underrun",
+	"Defer",
+	"Transmit - No CRS",
+	"Sequence Error",
+	"Carrier Extension Error",
+	"Receive Error Length",
+	nil,
+	"XON Received",
+	"XON Transmitted",
+	"XOFF Received",
+	"XOFF Transmitted",
+	"FC Received Unsupported",
+	"Packets Received (64 Bytes)",
+	"Packets Received (65-127 Bytes)",
+	"Packets Received (128-255 Bytes)",
+	"Packets Received (256-511 Bytes)",
+	"Packets Received (512-1023 Bytes)",
+	"Packets Received (1024-1522 Bytes)",
+	"Good Packets Received",
+	"Broadcast Packets Received",
+	"Multicast Packets Received",
+	"Good Packets Transmitted",
+	nil,
+	"Good Octets Received",
+	nil,
+	"Good Octets Transmitted",
+	nil,
+	nil,
+	nil,
+	"Receive No Buffers",
+	"Receive Undersize",
+	"Receive Fragment",
+	"Receive Oversize",
+	"Receive Jabber",
+	nil,
+	nil,
+	nil,
+	"Total Octets Received",
+	nil,
+	"Total Octets Transmitted",
+	nil,
+	"Total Packets Received",
+	"Total Packets Transmitted",
+	"Packets Transmitted (64 Bytes)",
+	"Packets Transmitted (65-127 Bytes)",
+	"Packets Transmitted (128-255 Bytes)",
+	"Packets Transmitted (256-511 Bytes)",
+	"Packets Transmitted (512-1023 Bytes)",
+	"Packets Transmitted (1024-1522 Bytes)",
+	"Multicast Packets Transmitted",
+	"Broadcast Packets Transmitted",
+	"TCP Segmentation Context Transmitted",
+	"TCP Segmentation Context Fail",
+};
+
+/*
+ * Produce the text of the interface's ifstats file.
+ * Reads every statistics register, adds it to the running totals in
+ * ctlr->statistics and prints "name: total delta" for each named,
+ * non-zero counter, followed by interrupt counters, rdtr, Ctrlext,
+ * the EEPROM image and, if a PHY is present, its registers.
+ * Returns the result of readstr for (offset, n); raises Enomem
+ * if the scratch buffer cannot be allocated.
+ */
+static long
+igbeifstat(Ether* edev, void* a, long n, ulong offset)
+{
+	Ctlr *ctlr;
+	char *p, *s;
+	int i, l, r;
+	uvlong tuvl, ruvl;
+
+	ctlr = edev->ctlr;
+	/* allocate before taking slock so that failure leaves nothing to undo */
+	p = malloc(2*READSTR);
+	if(p == nil)
+		error(Enomem);
+	qlock(&ctlr->slock);
+	l = 0;
+	for(i = 0; i < Nstatistics; i++){
+		r = csr32r(ctlr, Statistics+i*4);
+		if((s = statistics[i]) == nil)
+			continue;
+		switch(i){
+		case Gorcl:
+		case Gotcl:
+		case Torl:
+		case Totl:
+			/* 64-bit counter: this register is the low word, the next the high */
+			ruvl = (uint)r;	/* no sign extension of the low word */
+			ruvl += ((uvlong)csr32r(ctlr, Statistics+(i+1)*4))<<32;
+			tuvl = ruvl;
+			tuvl += ctlr->statistics[i];
+			tuvl += ((uvlong)ctlr->statistics[i+1])<<32;
+			if(tuvl == 0)
+				continue;
+			ctlr->statistics[i] = tuvl;
+			ctlr->statistics[i+1] = tuvl>>32;
+			l += snprint(p+l, 2*READSTR-l, "%s: %llud %llud\n",
+				s, tuvl, ruvl);
+			i++;	/* high word already consumed */
+			break;
+
+		default:
+			ctlr->statistics[i] += r;
+			if(ctlr->statistics[i] == 0)
+				continue;
+			l += snprint(p+l, 2*READSTR-l, "%s: %ud %ud\n",
+				s, ctlr->statistics[i], r);
+			break;
+		}
+	}
+
+	l += snprint(p+l, 2*READSTR-l, "lintr: %ud %ud\n",
+		ctlr->lintr, ctlr->lsleep);
+	l += snprint(p+l, 2*READSTR-l, "rintr: %ud %ud\n",
+		ctlr->rintr, ctlr->rsleep);
+	l += snprint(p+l, 2*READSTR-l, "tintr: %ud %ud\n",
+		ctlr->tintr, ctlr->txdw);
+	l += snprint(p+l, 2*READSTR-l, "ixcs: %ud %ud %ud\n",
+		ctlr->ixsm, ctlr->ipcs, ctlr->tcpcs);
+	l += snprint(p+l, 2*READSTR-l, "rdtr: %ud\n", ctlr->rdtr);
+	l += snprint(p+l, 2*READSTR-l, "Ctrlext: %08x\n", csr32r(ctlr, Ctrlext));
+
+	l += snprint(p+l, 2*READSTR-l, "eeprom:");
+	for(i = 0; i < 0x40; i++){
+		if(i && ((i & 0x07) == 0))
+			l += snprint(p+l, 2*READSTR-l, "\n       ");
+		l += snprint(p+l, 2*READSTR-l, " %4.4uX", ctlr->eeprom[i]);
+	}
+	l += snprint(p+l, 2*READSTR-l, "\n");
+
+	if(ctlr->mii != nil && ctlr->mii->curphy != nil){
+		/* the limit must be the space remaining, as everywhere else */
+		l += snprint(p+l, 2*READSTR-l, "phy:   ");
+		for(i = 0; i < NMiiPhyr; i++){
+			if(i && ((i & 0x07) == 0))
+				l += snprint(p+l, 2*READSTR-l, "\n       ");
+			r = miimir(ctlr->mii, i);
+			l += snprint(p+l, 2*READSTR-l, " %4.4uX", r);
+		}
+		snprint(p+l, 2*READSTR-l, "\n");
+	}
+	n = readstr(offset, a, n, p);
+	free(p);
+	qunlock(&ctlr->slock);
+
+	return n;
+}
+
+enum {
+	CMrdtr,
+};
+
+static Cmdtab igbectlmsg[] = {
+	CMrdtr,	"rdtr",	2,
+};
+
+/*
+ * Handle a write to the interface's ctl file.
+ * The only command is "rdtr n" (0 <= n <= 0xFFFF): remember n in
+ * ctlr->rdtr and write Fpd|n to the Rdtr register.
+ * Raises Enonexist if there is no controller and Ebadarg for a
+ * malformed or out-of-range value; returns n on success.
+ */
+static long
+igbectl(Ether* edev, void* buf, long n)
+{
+	Ctlr *ctlr;
+	Cmdbuf *cb;
+	Cmdtab *ct;
+	char *end;
+	int val;
+
+	ctlr = edev->ctlr;
+	if(ctlr == nil)
+		error(Enonexist);
+
+	cb = parsecmd(buf, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+
+	ct = lookupcmd(cb, igbectlmsg, nelem(igbectlmsg));
+	if(ct->index == CMrdtr){
+		val = strtol(cb->f[1], &end, 0);
+		if(end == cb->f[1] || val < 0 || val > 0xFFFF)
+			error(Ebadarg);
+		ctlr->rdtr = val;
+		csr32w(ctlr, Rdtr, Fpd|val);
+	}
+	free(cb);
+	poperror();
+
+	return n;
+}
+
+/*
+ * Turn promiscuous reception on or off.  The multicast offset
+ * field of Rctl is reset to Mo47b36 on every call; Upe and Mpe
+ * are set together when on is non-zero and cleared otherwise.
+ */
+static void
+igbepromiscuous(void* arg, int on)
+{
+	Ctlr *ctlr;
+	int r;
+
+	ctlr = ((Ether*)arg)->ctlr;
+
+	r = (csr32r(ctlr, Rctl) & ~MoMASK) | Mo47b36;
+	if(!on)
+		r &= ~(Upe|Mpe);
+	else
+		r |= Upe|Mpe;
+	csr32w(ctlr, Rctl, r);
+}
+
+/*
+ * Set or clear the multicast table bit selected by the last two
+ * bytes of addr: the upper 7 bits of addr[5] pick the 32-bit word,
+ * the low bit of addr[5] and the high nibble of addr[4] pick the
+ * bit within it.  The shadow copy in ctlr->mta is updated and the
+ * word is written to the corresponding Mta register.
+ */
+static void
+igbemulticast(void* arg, uchar* addr, int on)
+{
+	Ctlr *ctlr;
+	int mask, word;
+
+	ctlr = ((Ether*)arg)->ctlr;
+
+	word = addr[5]>>1;
+	mask = 1<<(((addr[5] & 1)<<4)|(addr[4]>>4));
+	if(on)
+		ctlr->mta[word] |= mask;
+	else
+		ctlr->mta[word] &= ~mask;
+
+	csr32w(ctlr, Mta+word*4, ctlr->mta[word]);
+}
+
+/*
+ * Take the first Block off the igberbpool free list, under
+ * igberblock.  Returns nil when the list is empty.
+ */
+static Block*
+igberballoc(void)
+{
+	Block *b;
+
+	ilock(&igberblock);
+	b = igberbpool;
+	if(b != nil){
+		igberbpool = b->next;
+		b->next = nil;
+	}
+	iunlock(&igberblock);
+
+	return b;
+}
+
+/*
+ * Block free routine for receive buffers (installed as bp->free
+ * by igbeattach): reset the Block to empty, clear its checksum
+ * flags and push it back on the igberbpool free list.
+ */
+static void
+igberbfree(Block* bp)
+{
+	/* empty buffer positioned Rbsz bytes before the limit */
+	bp->wp = bp->rp = bp->lim - Rbsz;
+	bp->flag &= ~(Bpktck|Btcpck|Budpck|Bipck);
+
+	ilock(&igberblock);
+	bp->next = igberbpool;
+	igberbpool = bp;
+	iunlock(&igberblock);
+}
+
+/*
+ * Add the bits in im to the software interrupt mask ctlr->im and
+ * write the combined mask to the Ims register, under imlock.
+ */
+static void
+igbeim(Ctlr* ctlr, int im)
+{
+	int mask;
+
+	ilock(&ctlr->imlock);
+	mask = ctlr->im | im;
+	ctlr->im = mask;
+	csr32w(ctlr, Ims, mask);
+	iunlock(&ctlr->imlock);
+}
+
+/*
+ * sleep() condition for igbelproc: true once igbeinterrupt has
+ * recorded a link-status cause in ctlr->lim.
+ */
+static int
+igbelim(void* ctlr)
+{
+	Ctlr *c;
+
+	c = ctlr;
+	return c->lim != 0;
+}
+
+/*
+ * Link-status kproc.  Each pass refreshes the PHY state with
+ * miistatus, programs Ctrl (forced speed/duplex unless Asde is
+ * set, plus flow control) and the Tctl collision distance from
+ * it, then re-enables the Lsc interrupt and sleeps until igbelim
+ * sees a link-status cause recorded in ctlr->lim.
+ */
+static void
+igbelproc(void* arg)
+{
+	Ctlr *ctlr;
+	Ether *edev;
+	MiiPhy *phy;
+	int ctrl, r;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+	for(;;){
+		if(ctlr->mii == nil || ctlr->mii->curphy == nil){
+			/*
+			 * nothing to manage; give up the processor
+			 * instead of spinning on it until a PHY appears.
+			 */
+			sched();
+			continue;
+		}
+
+		/*
+		 * To do:
+		 *	logic to manage status change,
+		 *	this is incomplete but should work
+		 *	one time to set up the hardware.
+		 *
+		 *	MiiPhy.speed, etc. should be in Mii.
+		 */
+		if(miistatus(ctlr->mii) < 0)
+			goto enable;	/* no status: just re-arm and wait */
+
+		phy = ctlr->mii->curphy;
+		ctrl = csr32r(ctlr, Ctrl);
+
+		switch(ctlr->id){
+		case i82543gc:
+		case i82544ei:
+		default:
+			/* without Asde, force speed and duplex to match the PHY */
+			if(!(ctrl & Asde)){
+				ctrl &= ~(SspeedMASK|Ilos|Fd);
+				ctrl |= Frcdplx|Frcspd;
+				if(phy->speed == 1000)
+					ctrl |= Sspeed1000;
+				else if(phy->speed == 100)
+					ctrl |= Sspeed100;
+				if(phy->fd)
+					ctrl |= Fd;
+			}
+			break;
+
+		case i82540em:
+		case i82540eplp:
+		case i82547gi:
+		case i82541gi:
+		case i82541gi2:
+		case i82541pi:
+			break;
+		}
+
+		/*
+		 * Collision Distance.
+		 */
+		r = csr32r(ctlr, Tctl);
+		r &= ~ColdMASK;
+		if(phy->fd)
+			r |= 64<<ColdSHIFT;
+		else
+			r |= 512<<ColdSHIFT;
+		csr32w(ctlr, Tctl, r);
+
+		/*
+		 * Flow control.
+		 */
+		if(phy->rfc)
+			ctrl |= Rfce;
+		if(phy->tfc)
+			ctrl |= Tfce;
+		csr32w(ctlr, Ctrl, ctrl);
+
+enable:
+		ctlr->lim = 0;
+		igbeim(ctlr, Lsc);
+
+		ctlr->lsleep++;
+		sleep(&ctlr->lrendez, igbelim, ctlr);
+	}
+}
+
+/*
+ * Initialise the transmitter: program Tctl, the inter-packet gap,
+ * the descriptor ring registers and the delay/threshold registers,
+ * release any Blocks still attached to the ring, then set Ten.
+ */
+static void
+igbetxinit(Ctlr* ctlr)
+{
+	Block *b;
+	int i, ipgt, txdctl;
+
+	csr32w(ctlr, Tctl, (0x0F<<CtSHIFT)|Psp|(66<<ColdSHIFT));
+
+	/* low field of Tipg: 8 for the listed controllers, 6 otherwise */
+	switch(ctlr->id){
+	case i82543gc:
+	case i82544ei:
+	case i82547ei:
+	case i82540em:
+	case i82540eplp:
+	case i82541gi:
+	case i82541gi2:
+	case i82541pi:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+	case i82547gi:
+		ipgt = 8;
+		break;
+	default:
+		ipgt = 6;
+		break;
+	}
+	csr32w(ctlr, Tipg, (6<<20)|(8<<10)|ipgt);
+	csr32w(ctlr, Ait, 0);
+	csr32w(ctlr, Txdmac, 0);
+
+	/* descriptor ring base, length, head and tail */
+	csr32w(ctlr, Tdbal, PCIWADDR(ctlr->tdba));
+	csr32w(ctlr, Tdbah, 0);
+	csr32w(ctlr, Tdlen, ctlr->ntd*sizeof(Td));
+	ctlr->tdh = PREV(0, ctlr->ntd);
+	csr32w(ctlr, Tdh, 0);
+	ctlr->tdt = 0;
+	csr32w(ctlr, Tdt, 0);
+
+	/* drop anything left over from a previous run */
+	for(i = 0; i < ctlr->ntd; i++){
+		b = ctlr->tb[i];
+		if(b != nil){
+			ctlr->tb[i] = nil;
+			freeb(b);
+		}
+		memset(&ctlr->tdba[i], 0, sizeof(Td));
+	}
+	ctlr->tdfree = ctlr->ntd;
+
+	csr32w(ctlr, Tidv, 128);
+
+	switch(ctlr->id){
+	default:
+		txdctl = (4<<WthreshSHIFT)|(4<<HthreshSHIFT)|(8<<PthreshSHIFT);
+		break;
+	case i82540em:
+	case i82540eplp:
+	case i82547gi:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+	case i82541gi:
+	case i82541gi2:
+	case i82541pi:
+		/* keep the chip's Txdctl, replacing only Wthresh and adding Gran */
+		txdctl = csr32r(ctlr, Txdctl);
+		txdctl &= ~WthreshMASK;
+		txdctl |= Gran|(4<<WthreshSHIFT);
+
+		csr32w(ctlr, Tadv, 64);
+		break;
+	}
+	csr32w(ctlr, Txdctl, txdctl);
+
+	csr32w(ctlr, Tctl, csr32r(ctlr, Tctl)|Ten);
+}
+
+/*
+ * Transmit routine, called by the generic ethernet code and by
+ * igbeinterrupt after a Txdw interrupt.  Under tlock it frees the
+ * Blocks of descriptors the chip has finished with, then moves
+ * packets from edev->oq into the ring until the queue is empty or
+ * the ring is full.  When the ring fills, the last descriptor gets
+ * Rs and the Txdw interrupt is enabled.
+ */
+static void
+igbetransmit(Ether* edev)
+{
+	Td *td;
+	Block *bp;
+	Ctlr *ctlr;
+	int full, n, tdh, tdt;
+
+	ctlr = edev->ctlr;
+	n = ctlr->ntd;
+
+	ilock(&ctlr->tlock);
+
+	/* reclaim descriptors up to the hardware head */
+	for(tdh = ctlr->tdh; NEXT(tdh, n) != csr32r(ctlr, Tdh); tdh = NEXT(tdh, n)){
+		bp = ctlr->tb[tdh];
+		if(bp != nil){
+			ctlr->tb[tdh] = nil;
+			freeb(bp);
+		}
+		memset(&ctlr->tdba[tdh], 0, sizeof(Td));
+	}
+	ctlr->tdh = tdh;
+
+	/* refill the ring from the output queue */
+	for(tdt = ctlr->tdt; NEXT(tdt, n) != tdh; ){
+		bp = qget(edev->oq);
+		if(bp == nil)
+			break;
+		td = &ctlr->tdba[tdt];
+		td->addr[0] = PCIWADDR(bp->rp);
+		td->control = ((BLEN(bp) & LenMASK)<<LenSHIFT);
+		td->control |= Dext|Ifcs|Teop|DtypeDD;
+		ctlr->tb[tdt] = bp;
+		tdt = NEXT(tdt, n);
+
+		/* last free slot: ask for a completion report */
+		full = NEXT(tdt, n) == tdh;
+		if(full){
+			td->control |= Rs;
+			ctlr->txdw++;
+		}
+		ctlr->tdt = tdt;
+		csr32w(ctlr, Tdt, tdt);
+		if(full){
+			igbeim(ctlr, Txdw);
+			break;
+		}
+	}
+
+	iunlock(&ctlr->tlock);
+}
+
+/*
+ * Hand receive descriptors back to the chip: starting at the
+ * software tail, give every slot up to (but not including) the
+ * one before rdh a buffer if it lacks one, clear its status and
+ * count it in rdfree.  Stops early if the buffer pool is empty.
+ * The new tail is written to Rdt.
+ */
+static void
+igbereplenish(Ctlr* ctlr)
+{
+	Rd *rd;
+	Block *b;
+	int rdt;
+
+	for(rdt = ctlr->rdt; NEXT(rdt, ctlr->nrd) != ctlr->rdh; rdt = NEXT(rdt, ctlr->nrd)){
+		rd = &ctlr->rdba[rdt];
+		if(ctlr->rb[rdt] == nil){
+			b = igberballoc();
+			if(b == nil){
+				iprint("no available buffers\n");
+				break;
+			}
+			ctlr->rb[rdt] = b;
+			rd->addr[0] = PCIWADDR(b->rp);
+			rd->addr[1] = 0;
+		}
+		/* address must be visible before the status is cleared */
+		coherence();
+		rd->status = 0;
+		ctlr->rdfree++;
+	}
+	ctlr->rdt = rdt;
+	csr32w(ctlr, Rdt, rdt);
+}
+
+/*
+ * Initialise the receiver: program Rctl (without Ren; igberproc
+ * sets that), the descriptor ring registers and the delay timer,
+ * free any Blocks left in the ring, replenish it, then set the
+ * descriptor thresholds and enable receive checksum offload.
+ */
+static void
+igberxinit(Ctlr* ctlr)
+{
+	Block *b;
+	int i;
+
+	csr32w(ctlr, Rctl, Dpf|Bsize2048|Bam|RdtmsHALF);
+
+	csr32w(ctlr, Rdbal, PCIWADDR(ctlr->rdba));
+	csr32w(ctlr, Rdbah, 0);
+	csr32w(ctlr, Rdlen, ctlr->nrd*sizeof(Rd));
+	ctlr->rdh = 0;
+	csr32w(ctlr, Rdh, 0);
+	ctlr->rdt = 0;
+	csr32w(ctlr, Rdt, 0);
+	ctlr->rdtr = 0;
+	csr32w(ctlr, Rdtr, Fpd|0);
+
+	for(i = 0; i < ctlr->nrd; i++){
+		b = ctlr->rb[i];
+		if(b == nil)
+			continue;
+		ctlr->rb[i] = nil;
+		freeb(b);
+	}
+	igbereplenish(ctlr);
+
+	/* only these controllers get Radv programmed */
+	switch(ctlr->id){
+	case i82540em:
+	case i82540eplp:
+	case i82541gi:
+	case i82541gi2:
+	case i82541pi:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+	case i82547gi:
+		csr32w(ctlr, Radv, 64);
+		break;
+	}
+	csr32w(ctlr, Rxdctl, (8<<WthreshSHIFT)|(8<<HthreshSHIFT)|4);
+
+	/*
+	 * Enable checksum offload.
+	 */
+	csr32w(ctlr, Rxcsum, Tuofl|Ipofl|(ETHERHDRSIZE<<PcssSHIFT));
+}
+
+/*
+ * sleep() condition for igberproc: true once igbeinterrupt has
+ * recorded a receive cause in ctlr->rim.
+ */
+static int
+igberim(void* ctlr)
+{
+	Ctlr *c;
+
+	c = ctlr;
+	return c->rim != 0;
+}
+
+/*
+ * Receive kproc.  After initialising and enabling the receiver it
+ * loops forever: enable the receive interrupts, sleep until
+ * igbeinterrupt records a cause in ctlr->rim, then walk the ring
+ * from rdh while descriptors have Rdd set, passing good packets
+ * to etheriq and freeing the buffers of bad ones.  The ring is
+ * replenished when fewer than half the descriptors are free or
+ * the Rxdmt0 cause was seen.
+ */
+static void
+igberproc(void* arg)
+{
+	Rd *rd;
+	Block *bp;
+	Ctlr *ctlr;
+	Ether *edev;
+	int rdh;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+
+	igberxinit(ctlr);
+	csr32w(ctlr, Rctl, csr32r(ctlr, Rctl)|Ren);
+
+	for(;;){
+		ctlr->rim = 0;
+		igbeim(ctlr, Rxt0|Rxo|Rxdmt0|Rxseq);
+		ctlr->rsleep++;
+		sleep(&ctlr->rrendez, igberim, ctlr);
+
+		for(rdh = ctlr->rdh; ; rdh = NEXT(rdh, ctlr->nrd)){
+			rd = &ctlr->rdba[rdh];
+			if(!(rd->status & Rdd))
+				break;
+
+			if(!(rd->status & Reop) || rd->errors != 0){
+				/* incomplete or damaged: discard the buffer */
+				if(ctlr->rb[rdh] != nil){
+					freeb(ctlr->rb[rdh]);
+					ctlr->rb[rdh] = nil;
+				}
+			}else{
+				/*
+				 * An eop packet with no errors.
+				 * When the Ixsm status bit is clear, the
+				 * Ipcs and Tcpcs bits say which checksums
+				 * the chip calculated (and, with errors == 0,
+				 * found valid).
+				 */
+				bp = ctlr->rb[rdh];
+				ctlr->rb[rdh] = nil;
+				bp->wp += rd->length;
+				bp->next = nil;
+				if(!(rd->status & Ixsm)){
+					ctlr->ixsm++;
+					if(rd->status & Ipcs){
+						ctlr->ipcs++;
+						bp->flag |= Bipck;
+					}
+					if(rd->status & Tcpcs){
+						ctlr->tcpcs++;
+						bp->flag |= Btcpck|Budpck;
+					}
+					bp->checksum = rd->checksum;
+					bp->flag |= Bpktck;
+				}
+				etheriq(edev, bp, 1);
+			}
+
+			memset(rd, 0, sizeof(Rd));
+			coherence();
+			ctlr->rdfree--;
+		}
+		ctlr->rdh = rdh;
+
+		if(ctlr->rdfree < ctlr->nrd/2 || (ctlr->rim & Rxdmt0))
+			igbereplenish(ctlr);
+	}
+}
+
+/*
+ * Attach routine: on the first call allocate the descriptor rings
+ * and the Block pointer arrays, fill the receive buffer pool,
+ * start the link and receive kprocs and initialise the
+ * transmitter.  Later calls return at once because ctlr->alloc is
+ * already set.  If any allocation fails the partial state is
+ * released and the routine returns without attaching, so a later
+ * call can try again.
+ */
+static void
+igbeattach(Ether* edev)
+{
+	Block *bp;
+	Ctlr *ctlr;
+	char name[KNAMELEN];
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->alock);
+	if(ctlr->alloc != nil){
+		qunlock(&ctlr->alock);
+		return;
+	}
+
+	ctlr->nrd = ROUNDUP(Nrd, 8);
+	ctlr->ntd = ROUNDUP(Ntd, 8);
+	/* 127 spare bytes so the rings can be aligned to 128 below */
+	ctlr->alloc = malloc(ctlr->nrd*sizeof(Rd)+ctlr->ntd*sizeof(Td) + 127);
+	ctlr->rb = malloc(ctlr->nrd*sizeof(Block*));
+	ctlr->tb = malloc(ctlr->ntd*sizeof(Block*));
+	if(ctlr->alloc == nil || ctlr->rb == nil || ctlr->tb == nil){
+		print("igbe: can't allocate descriptor rings\n");
+		free(ctlr->tb);
+		ctlr->tb = nil;
+		free(ctlr->rb);
+		ctlr->rb = nil;
+		free(ctlr->alloc);
+		ctlr->alloc = nil;
+		qunlock(&ctlr->alock);
+		return;
+	}
+	ctlr->rdba = (Rd*)ROUNDUP((uintptr)ctlr->alloc, 128);
+	ctlr->tdba = (Td*)(ctlr->rdba+ctlr->nrd);
+
+	if(waserror()){
+		/*
+		 * give back the receive buffers this controller added;
+		 * clearing bp->free makes freeb really free them.
+		 */
+		while(ctlr->nrb > 0){
+			bp = igberballoc();
+			if(bp != nil){
+				bp->free = nil;
+				freeb(bp);
+			}
+			ctlr->nrb--;
+		}
+		free(ctlr->tb);
+		ctlr->tb = nil;
+		free(ctlr->rb);
+		ctlr->rb = nil;
+		free(ctlr->alloc);
+		ctlr->alloc = nil;
+		qunlock(&ctlr->alock);
+		nexterror();
+	}
+
+	/* freeb calls igberbfree, which puts each Block in the pool */
+	for(ctlr->nrb = 0; ctlr->nrb < Nrb; ctlr->nrb++){
+		if((bp = allocb(Rbsz)) == nil)
+			break;
+		bp->free = igberbfree;
+		freeb(bp);
+	}
+
+	snprint(name, KNAMELEN, "#l%dlproc", edev->ctlrno);
+	kproc(name, igbelproc, edev);
+
+	snprint(name, KNAMELEN, "#l%drproc", edev->ctlrno);
+	kproc(name, igberproc, edev);
+
+	igbetxinit(ctlr);
+
+	qunlock(&ctlr->alock);
+	poperror();
+}
+
+/*
+ * Interrupt handler.  With all interrupts masked (Imc), read Icr
+ * repeatedly until no enabled cause remains.  Link and receive
+ * causes are recorded for their kprocs, which are woken; each
+ * handled cause is removed from the mask until its consumer
+ * re-enables it through igbeim.  Transmit completions are handled
+ * by calling igbetransmit after the lock is dropped.
+ */
+static void
+igbeinterrupt(Ureg*, void* arg)
+{
+	Ctlr *ctlr;
+	Ether *edev;
+	int cause, mask, rx, sent;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+
+	ilock(&ctlr->imlock);
+	csr32w(ctlr, Imc, ~0);
+	mask = ctlr->im;
+	sent = 0;
+
+	for(;;){
+		cause = csr32r(ctlr, Icr) & ctlr->im;
+		if(cause == 0)
+			break;
+
+		if(cause & Lsc){
+			mask &= ~Lsc;
+			ctlr->lim = cause & Lsc;
+			wakeup(&ctlr->lrendez);
+			ctlr->lintr++;
+		}
+
+		rx = cause & (Rxt0|Rxo|Rxdmt0|Rxseq);
+		if(rx){
+			mask &= ~(Rxt0|Rxo|Rxdmt0|Rxseq);
+			ctlr->rim = rx;
+			wakeup(&ctlr->rrendez);
+			ctlr->rintr++;
+		}
+
+		if(cause & Txdw){
+			mask &= ~Txdw;
+			sent = 1;
+			ctlr->tintr++;
+		}
+	}
+
+	ctlr->im = mask;
+	csr32w(ctlr, Ims, mask);
+	iunlock(&ctlr->imlock);
+
+	if(sent)
+		igbetransmit(edev);
+}
+
+/*
+ * Read n bits from the Management Data I/O Interface by toggling
+ * Mdc in Ctrl, most significant bit first.  Ctrl is restored to
+ * its original value afterwards.  Returns the bits read.
+ */
+static int
+i82543mdior(Ctlr* ctlr, int n)
+{
+	int clk, data, i, saved;
+
+	saved = csr32r(ctlr, Ctrl);
+	clk = (saved & ~Mddo)|Mdco;
+	data = 0;
+	for(i = n; i-- > 0; ){
+		/* sample Mdd, then pulse the clock */
+		if(csr32r(ctlr, Ctrl) & Mdd)
+			data |= (1<<i);
+		csr32w(ctlr, Ctrl, Mdc|clk);
+		csr32w(ctlr, Ctrl, clk);
+	}
+	csr32w(ctlr, Ctrl, saved);
+
+	return data;
+}
+
+/*
+ * Write the low n bits of 'bits' to the Management Data I/O
+ * Interface, most significant bit first, by driving Mdd and
+ * pulsing Mdc in Ctrl.  Ctrl is restored afterwards.
+ * Always returns 0.
+ */
+static int
+i82543mdiow(Ctlr* ctlr, int bits, int n)
+{
+	int i, out, saved;
+
+	saved = csr32r(ctlr, Ctrl);
+	out = Mdco|Mddo|saved;
+	for(i = n; i-- > 0; ){
+		if(bits & (1<<i))
+			out |= Mdd;
+		else
+			out &= ~Mdd;
+		csr32w(ctlr, Ctrl, Mdc|out);
+		csr32w(ctlr, Ctrl, out);
+	}
+	csr32w(ctlr, Ctrl, saved);
+
+	return 0;
+}
+
+/*
+ * MII Management Interface Read, bit-banged:
+ *	32-bit preamble;
+ *	ST+OP+PHYAD+REGAD (14 bits);
+ *	TA + 16 data bits (18 bits read back).
+ * Returns the 16 data bits, or -1 if bit 16 of what was read
+ * back is set.
+ */
+static int
+i82543miimir(Mii* mii, int pa, int ra)
+{
+	Ctlr *ctlr;
+	int r;
+
+	ctlr = mii->ctlr;
+
+	i82543mdiow(ctlr, 0xFFFFFFFF, 32);
+	i82543mdiow(ctlr, 0x1800|(pa<<5)|ra, 14);
+	r = i82543mdior(ctlr, 18);
+
+	return (r & 0x10000) ? -1 : (r & 0xFFFF);
+}
+
+/*
+ * MII Management Interface Write, bit-banged:
+ *	32-bit preamble;
+ *	ST+OP+PHYAD+REGAD+TA + 16 data bits, sent as one 32-bit frame.
+ * Always returns 0.
+ */
+static int
+i82543miimiw(Mii* mii, int pa, int ra, int data)
+{
+	Ctlr *ctlr;
+	int frame;
+
+	ctlr = mii->ctlr;
+
+	frame = (0x05<<(5+5+2+16))|(pa<<(5+2+16))|(ra<<(2+16))|(0x02<<16);
+	frame |= data & 0xFFFF;
+
+	i82543mdiow(ctlr, 0xFFFFFFFF, 32);
+	i82543mdiow(ctlr, frame, 32);
+
+	return 0;
+}
+
+/*
+ * Mii read/write hook for the bit-banged interface: dispatch to
+ * the write routine when 'write' is non-zero, else to the read.
+ */
+static int
+i82543miirw(Mii* mii, int write, int pa, int ra, int data)
+{
+	if(!write)
+		return i82543miimir(mii, pa, ra);
+
+	return i82543miimiw(mii, pa, ra, data);
+}
+
+/*
+ * Read PHY register ra of PHY pa through the Mdic register.
+ * Polls up to 64 times, a microsecond apart, for MDIready or
+ * MDIe.  Returns the low 16 bits of Mdic on a clean completion,
+ * -1 on error or timeout.
+ */
+static int
+igbemiimir(Mii* mii, int pa, int ra)
+{
+	Ctlr *ctlr;
+	int mdic, tries;
+
+	ctlr = mii->ctlr;
+
+	csr32w(ctlr, Mdic, MDIrop|(pa<<MDIpSHIFT)|(ra<<MDIrSHIFT));
+	mdic = 0;
+	for(tries = 0; tries < 64; tries++){
+		mdic = csr32r(ctlr, Mdic);
+		if(mdic & (MDIe|MDIready))
+			break;
+		microdelay(1);
+	}
+
+	if((mdic & (MDIe|MDIready)) != MDIready)
+		return -1;
+	return mdic & 0xFFFF;
+}
+
+/*
+ * Write data (masked with MDIdMASK) to PHY register ra of PHY pa
+ * through the Mdic register.  Polls up to 64 times, a microsecond
+ * apart, for MDIready or MDIe.  Returns 0 on a clean completion,
+ * -1 on error or timeout.
+ */
+static int
+igbemiimiw(Mii* mii, int pa, int ra, int data)
+{
+	Ctlr *ctlr;
+	int mdic, tries;
+
+	ctlr = mii->ctlr;
+
+	csr32w(ctlr, Mdic, MDIwop|(pa<<MDIpSHIFT)|(ra<<MDIrSHIFT)|(data & MDIdMASK));
+	mdic = 0;
+	for(tries = 0; tries < 64; tries++){
+		mdic = csr32r(ctlr, Mdic);
+		if(mdic & (MDIe|MDIready))
+			break;
+		microdelay(1);
+	}
+
+	if((mdic & (MDIe|MDIready)) != MDIready)
+		return -1;
+	return 0;
+}
+
+/*
+ * Mii read/write hook for controllers with an Mdic register:
+ * dispatch to the write routine when 'write' is non-zero, else
+ * to the read.
+ */
+static int
+igbemiirw(Mii* mii, int write, int pa, int ra, int data)
+{
+	if(!write)
+		return igbemiimir(mii, pa, ra);
+
+	return igbemiimiw(mii, pa, ra, data);
+}
+
+static Mii*	/* set up and attach the PHY; nil if Tbimode is set, the id is unknown, or miiattach fails */
+igbemii(Ctlr* ctlr)
+{
+	Mii *mii;
+	int ctrl, p, r;
+	int (*rw)(Mii*, int, int, int, int);	/* register access routine handed to miiattach */
+
+	r = csr32r(ctlr, Status);
+	if(r & Tbimode)
+		return nil;
+
+	ctrl = csr32r(ctlr, Ctrl);
+	ctrl |= Slu;
+
+	switch(ctlr->id){
+	case i82543gc:
+		ctrl |= Frcdplx|Frcspd;
+		csr32w(ctlr, Ctrl, ctrl);
+
+		/*
+		 * The reset pin direction (Mdro) should already
+		 * be set from the EEPROM load.
+		 * If it's not set this configuration is unexpected
+		 * so bail.
+		 */
+		r = csr32r(ctlr, Ctrlext);
+		if(!(r & Mdro))
+			return nil;
+		csr32w(ctlr, Ctrlext, r);
+		delay(20);
+		r = csr32r(ctlr, Ctrlext);
+		r &= ~Mdr;			/* drop Mdr ... */
+		csr32w(ctlr, Ctrlext, r);
+		delay(20);
+		r = csr32r(ctlr, Ctrlext);
+		r |= Mdr;			/* ... then raise it again: presumably pulses the PHY reset pin */
+		csr32w(ctlr, Ctrlext, r);
+		delay(20);
+
+		rw = i82543miirw;		/* bit-banged through Ctrl */
+		break;
+	case i82544ei:
+	case i82547ei:
+	case i82540em:
+	case i82540eplp:
+	case i82547gi:
+	case i82541gi:
+	case i82541gi2:
+	case i82541pi:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+		ctrl &= ~(Frcdplx|Frcspd);
+		csr32w(ctlr, Ctrl, ctrl);
+		rw = igbemiirw;			/* through the Mdic register */
+		break;
+	default:
+		return nil;
+	}
+
+	if((mii = miiattach(ctlr, ~0, rw)) == nil)
+		return nil;
+
+	/*
+	 * 8254X-specific PHY registers not in 802.3:
+	 *	0x10	PHY specific control
+	 *	0x14	extended PHY specific control
+	 * Set appropriate values then reset the PHY to have
+	 * changes noted.
+	 */
+	switch(ctlr->id){
+	case i82547gi:
+	case i82541gi:
+	case i82541gi2:
+	case i82541pi:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+		break;				/* these are left as they are */
+	default:
+		r = miimir(mii, 16);
+		r |= 0x0800;			/* assert CRS on Tx */
+		r |= 0x0060;			/* auto-crossover all speeds */
+		r |= 0x0002;			/* polarity reversal enabled */
+		miimiw(mii, 16, r);
+
+		r = miimir(mii, 20);
+		r |= 0x0070;			/* +25MHz clock */
+		r &= ~0x0F00;
+		r |= 0x0100;			/* 1x downshift */
+		miimiw(mii, 20, r);
+
+		miireset(mii);
+		p = 0;				/* pause bits for miiane, taken from ctlr->txcw */
+		if(ctlr->txcw & TxcwPs)
+			p |= AnaP;
+		if(ctlr->txcw & TxcwAs)
+			p |= AnaAP;
+		miiane(mii, ~0, p, ~0);
+		break;
+	}
+
+	return mii;
+}
+
+static int	/* run the op string against Eecd; returns the collected bits, or -1 on a malformed op string */
+at93c46io(Ctlr* ctlr, char* op, int data)
+{
+	char *lp, *p;		/* lp: where the current loop body restarts from; nil outside a loop */
+	int i, loop, eecd, r;	/* loop: bit index counting down to 0; -1 outside a loop */
+
+	eecd = csr32r(ctlr, Eecd);
+
+	r = 0;
+	loop = -1;
+	lp = nil;
+	for(p = op; *p != '\0'; p++){
+		switch(*p){
+		default:
+			return -1;
+		case ' ':
+			continue;
+		case ':':			/* start of loop */
+			loop = strtol(p+1, &lp, 0)-1;
+			lp--;			/* back up so the p++ of the for lands on the first op of the body */
+			if(p == lp)		/* no count given: 8 iterations */
+				loop = 7;
+			p = lp;
+			continue;
+		case ';':			/* end of loop */
+			if(lp == nil)
+				return -1;
+			loop--;
+			if(loop >= 0)
+				p = lp;		/* go round again */
+			else
+				lp = nil;	/* finished; loop is now -1 */
+			continue;
+		case 'C':			/* assert clock */
+			eecd |= Sk;
+			break;
+		case 'c':			/* deassert clock */
+			eecd &= ~Sk;
+			break;
+		case 'D':			/* next bit in 'data' byte */
+			if(loop < 0)
+				return -1;
+			if(data & (1<<loop))
+				eecd |= Di;
+			else
+				eecd &= ~Di;
+			break;
+		case 'O':			/* collect data output */
+			i = (csr32r(ctlr, Eecd) & Do) != 0;
+			if(loop >= 0)
+				r |= (i<<loop);
+			else
+				r = i;
+			continue;
+		case 'I':			/* assert data input */
+			eecd |= Di;
+			break;
+		case 'i':			/* deassert data input */
+			eecd &= ~Di;
+			break;
+		case 'S':			/* enable chip select */
+			eecd |= Cs;
+			break;
+		case 's':			/* disable chip select */
+			eecd &= ~Cs;
+			break;
+		}
+		csr32w(ctlr, Eecd, eecd);	/* ops that 'break' change a line: write it out and let it settle */
+		microdelay(50);
+	}
+	if(loop >= 0)				/* op string ended inside a loop */
+		return -1;
+	return r;
+}
+
+/*
+ * Read the 0x40 words of the serial EEPROM into ctlr->eeprom and
+ * return their 16-bit sum.  Returns 0 without reading anything if
+ * Eecd reports an SPI part.  On the controllers that need it,
+ * EEPROM access is requested first (Areq) and released at the
+ * end; if the grant never arrives or an address can't be set, the
+ * sum of the words read so far is returned.
+ */
+static int
+at93c46r(Ctlr* ctlr)
+{
+	ushort sum;
+	char rop[20];
+	int addr, areq, bits, data, eecd, i;
+
+	eecd = csr32r(ctlr, Eecd);
+	if(eecd & Spi){
+		print("igbe: SPI EEPROM access not implemented\n");
+		return 0;
+	}
+	/* address width in bits */
+	bits = (eecd & (Eeszaddr|Eesz256)) ? 8 : 6;
+
+	sum = 0;
+	areq = 0;
+
+	switch(ctlr->id){
+	default:
+		break;
+	case i82541gi:
+	case i82547gi:
+	case i82540em:
+	case i82540eplp:
+	case i82541pi:
+	case i82541gi2:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+		areq = 1;
+		csr32w(ctlr, Eecd, eecd|Areq);
+		for(i = 0; i < 1000; i++){
+			eecd = csr32r(ctlr, Eecd);
+			if(eecd & Agnt)
+				break;
+			microdelay(5);
+		}
+		if(!(eecd & Agnt)){
+			print("igbe: not granted EEPROM access\n");
+			goto release;
+		}
+		break;
+	}
+	snprint(rop, sizeof(rop), "S :%dDCc;", bits+3);
+
+	/*
+	 * Read each word from the Atmel AT93C46 3-Wire Serial EEPROM
+	 * or compatible.  The EEPROM access is controlled by 4 bits
+	 * in Eecd.  See the AT93C46 datasheet for protocol details.
+	 */
+	for(addr = 0; addr < 0x40; addr++){
+		if(at93c46io(ctlr, rop, (0x06<<bits)|addr) != 0){
+			print("igbe: can't set EEPROM address 0x%2.2X\n", addr);
+			goto release;
+		}
+		data = at93c46io(ctlr, ":16COc;", 0);
+		at93c46io(ctlr, "sic", 0);
+		ctlr->eeprom[addr] = data;
+		sum += data;
+	}
+
+release:
+	if(areq)
+		csr32w(ctlr, Eecd, eecd & ~Areq);
+	return sum;
+}
+
+static int	/* reset the controller; 0 on success, -1 if Devrst, Eerst or Icr fails to clear */
+igbedetach(Ctlr* ctlr)
+{
+	int r, timeo;
+
+	/*
+	 * Perform a device reset to get the chip back to the
+	 * power-on state, followed by an EEPROM reset to read
+	 * the defaults for some internal registers.
+	 */
+	csr32w(ctlr, Imc, ~0);		/* all ones to Imc, as igbeinterrupt does before handling causes */
+	csr32w(ctlr, Rctl, 0);		/* zero the receive and transmit control registers */
+	csr32w(ctlr, Tctl, 0);
+
+	delay(10);
+
+	csr32w(ctlr, Ctrl, Devrst);
+	delay(1);
+	for(timeo = 0; timeo < 1000; timeo++){	/* up to 1000 polls, delay(1) apart */
+		if(!(csr32r(ctlr, Ctrl) & Devrst))
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Ctrl) & Devrst)
+		return -1;
+	r = csr32r(ctlr, Ctrlext);
+	csr32w(ctlr, Ctrlext, r|Eerst);
+	delay(1);
+	for(timeo = 0; timeo < 1000; timeo++){
+		if(!(csr32r(ctlr, Ctrlext) & Eerst))
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Ctrlext) & Eerst)
+		return -1;
+
+	switch(ctlr->id){
+	default:
+		break;
+	case i82540em:
+	case i82540eplp:
+	case i82541gi:
+	case i82541pi:
+	case i82547gi:
+	case i82541gi2:
+	case i82545gmc:
+	case i82546gb:
+	case i82546eb:
+		r = csr32r(ctlr, Manc);
+		r &= ~Arpen;		/* clear Arpen in Manc on these controllers */
+		csr32w(ctlr, Manc, r);
+		break;
+	}
+
+	csr32w(ctlr, Imc, ~0);
+	delay(1);
+	for(timeo = 0; timeo < 1000; timeo++){	/* wait for Icr to read as zero */
+		if(!csr32r(ctlr, Icr))
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Icr))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Shutdown hook: reset the controller, ignoring the result.
+ */
+static void
+igbeshutdown(Ether* ether)
+{
+	Ctlr *ctlr;
+
+	ctlr = ether->ctlr;
+	igbedetach(ctlr);
+}
+
+static int	/* reset the chip, load MAC address and defaults from the EEPROM, attach the PHY; 0 on success, -1 on failure */
+igbereset(Ctlr* ctlr)
+{
+	int ctrl, i, pause, r, swdpio, txcw;
+
+	if(igbedetach(ctlr))
+		return -1;
+
+	/*
+	 * Read the EEPROM, validate the checksum
+	 * then get the device back to a power-on state.
+	 */
+	if((r = at93c46r(ctlr)) != 0xBABA){
+		print("igbe: bad EEPROM checksum - 0x%4.4uX\n", r);
+		return -1;
+	}
+
+	/*
+	 * Snarf and set up the receive addresses.
+	 * There are 16 addresses. The first should be the MAC address.
+	 * The others are cleared and not marked valid (MS bit of Rah).
+	 */
+	if ((ctlr->id == i82546gb || ctlr->id == i82546eb) && BUSFNO(ctlr->pcidev->tbdf) == 1)
+		ctlr->eeprom[Ea+2] += 0x100;	// second interface
+	for(i = Ea; i < Eaddrlen/2; i++){	/* each EEPROM word holds two address bytes, low byte first */
+		ctlr->ra[2*i] = ctlr->eeprom[i];
+		ctlr->ra[2*i+1] = ctlr->eeprom[i]>>8;
+	}
+	r = (ctlr->ra[3]<<24)|(ctlr->ra[2]<<16)|(ctlr->ra[1]<<8)|ctlr->ra[0];
+	csr32w(ctlr, Ral, r);
+	r = 0x80000000|(ctlr->ra[5]<<8)|ctlr->ra[4];	/* MS bit marks the address valid */
+	csr32w(ctlr, Rah, r);
+	for(i = 1; i < 16; i++){
+		csr32w(ctlr, Ral+i*8, 0);
+		csr32w(ctlr, Rah+i*8, 0);
+	}
+
+	/*
+	 * Clear the Multicast Table Array.
+	 * It's a 4096 bit vector accessed as 128 32-bit registers.
+	 */
+	memset(ctlr->mta, 0, sizeof(ctlr->mta));
+	for(i = 0; i < 128; i++)
+		csr32w(ctlr, Mta+i*4, 0);
+
+	/*
+	 * Just in case the Eerst didn't load the defaults
+	 * (doesn't appear to fully on the 82543GC), do it manually.
+	 */
+	if (ctlr->id == i82543gc) {	// 82543
+		txcw = csr32r(ctlr, Txcw);
+		txcw &= ~(TxcwAne|TxcwPauseMASK|TxcwFd);
+		ctrl = csr32r(ctlr, Ctrl);
+		ctrl &= ~(SwdpioloMASK|Frcspd|Ilos|Lrst|Fd);
+
+		if(ctlr->eeprom[Icw1] & 0x0400){
+			ctrl |= Fd;
+			txcw |= TxcwFd;
+		}
+		if(ctlr->eeprom[Icw1] & 0x0200)
+			ctrl |= Lrst;
+		if(ctlr->eeprom[Icw1] & 0x0010)
+			ctrl |= Ilos;
+		if(ctlr->eeprom[Icw1] & 0x0800)
+			ctrl |= Frcspd;
+		swdpio = (ctlr->eeprom[Icw1] & 0x01E0)>>5;
+		ctrl |= swdpio<<SwdpioloSHIFT;
+		csr32w(ctlr, Ctrl, ctrl);
+
+		ctrl = csr32r(ctlr, Ctrlext);	/* ctrl is reused here for Ctrlext */
+		ctrl &= ~(Ips|SwdpiohiMASK);
+		swdpio = (ctlr->eeprom[Icw2] & 0x00F0)>>4;
+		if(ctlr->eeprom[Icw1] & 0x1000)
+			ctrl |= Ips;
+		ctrl |= swdpio<<SwdpiohiSHIFT;
+		csr32w(ctlr, Ctrlext, ctrl);
+
+		if(ctlr->eeprom[Icw2] & 0x0800)
+			txcw |= TxcwAne;
+		pause = (ctlr->eeprom[Icw2] & 0x3000)>>12;
+		txcw |= pause<<TxcwPauseSHIFT;
+		switch(pause){		/* pick flow-control thresholds from the EEPROM pause field */
+		default:
+			ctlr->fcrtl = 0x00002000;
+			ctlr->fcrth = 0x00004000;
+			txcw |= TxcwAs|TxcwPs;
+			break;
+		case 0:
+			ctlr->fcrtl = 0x00002000;
+			ctlr->fcrth = 0x00004000;
+			break;
+		case 2:
+			ctlr->fcrtl = 0;
+			ctlr->fcrth = 0;
+			txcw |= TxcwAs;
+			break;
+		}
+		ctlr->txcw = txcw;
+		csr32w(ctlr, Txcw, txcw);
+	}
+
+
+	/*
+	 * Flow control - values from the datasheet.
+	 */
+	csr32w(ctlr, Fcal, 0x00C28001);
+	csr32w(ctlr, Fcah, 0x00000100);
+	csr32w(ctlr, Fct, 0x00008808);
+	csr32w(ctlr, Fcttv, 0x00000100);
+
+	csr32w(ctlr, Fcrtl, ctlr->fcrtl);	/* only assigned above for the 82543gc */
+	csr32w(ctlr, Fcrth, ctlr->fcrth);
+
+	if((ctlr->mii = igbemii(ctlr)) == nil)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Scan the PCI bus for supported ethernet controllers.  Each one
+ * found has its registers mapped, its cache line size checked and
+ * is reset; those that reset successfully get bus mastering
+ * enabled and are appended to the igbectlrhead/igbectlrtail list.
+ * A device that can't be used is skipped with its register
+ * mapping released.
+ */
+static void
+igbepci(void)
+{
+	int cls;
+	Pcidev *p;
+	Ctlr *ctlr;
+	void *mem;
+
+	p = nil;
+	while(p = pcimatch(p, 0, 0)){
+		/* network controller, ethernet */
+		if(p->ccrb != 0x02 || p->ccru != 0)
+			continue;
+
+		switch((p->did<<16)|p->vid){
+		default:
+			continue;
+		case i82543gc:
+		case i82544ei:
+		case i82547ei:
+		case i82540em:
+		case i82540eplp:
+		case i82541gi:
+		case i82547gi:
+		case i82541gi2:
+		case i82541pi:
+		case i82545gmc:
+		case i82546gb:
+		case i82546eb:
+			break;
+		}
+
+		mem = vmap(p->mem[0].bar & ~0x0F, p->mem[0].size);
+		if(mem == nil){
+			print("igbe: can't map %#8.8lux\n", p->mem[0].bar);
+			continue;
+		}
+		cls = pcicfgr8(p, PciCLS);
+		switch(cls){
+			default:
+				print("igbe: unexpected CLS - %d\n", cls*4);
+				break;
+			case 0x00:
+			case 0xFF:
+				print("igbe: unusable CLS\n");
+				/* skipping the device: don't leak the mapping */
+				vunmap(mem, p->mem[0].size);
+				continue;
+			case 0x08:
+			case 0x10:
+				break;
+		}
+		ctlr = malloc(sizeof(Ctlr));
+		if(ctlr == nil){
+			print("igbe: can't allocate controller\n");
+			vunmap(mem, p->mem[0].size);
+			continue;
+		}
+		ctlr->port = p->mem[0].bar & ~0x0F;
+		ctlr->pcidev = p;
+		ctlr->id = (p->did<<16)|p->vid;
+		ctlr->cls = cls*4;
+		ctlr->nic = mem;
+
+		if(igbereset(ctlr)){
+			free(ctlr);
+			vunmap(mem, p->mem[0].size);
+			continue;
+		}
+		pcisetbme(p);
+
+		if(igbectlrhead != nil)
+			igbectlrtail->next = ctlr;
+		else
+			igbectlrhead = ctlr;
+		igbectlrtail = ctlr;
+	}
+}
+
+/*
+ * Probe routine registered with addethercard.  Scans PCI on first
+ * use, then claims the first controller that is not yet active
+ * and whose port matches edev->port (any port if edev->port is 0).
+ * Fills in edev from the controller and installs the driver's
+ * entry points.  Returns 0 on success, -1 if nothing matches.
+ */
+static int
+igbepnp(Ether* edev)
+{
+	Ctlr *c;
+
+	if(igbectlrhead == nil)
+		igbepci();
+
+	for(c = igbectlrhead; c != nil; c = c->next){
+		if(!c->active && (edev->port == 0 || edev->port == c->port)){
+			c->active = 1;
+			break;
+		}
+	}
+	if(c == nil)
+		return -1;
+
+	edev->ctlr = c;
+	edev->port = c->port;
+	edev->irq = c->pcidev->intl;
+	edev->tbdf = c->pcidev->tbdf;
+	edev->mbps = 1000;
+	memmove(edev->ea, c->ra, Eaddrlen);
+
+	/* entry points used by the generic ethernet driver */
+	edev->attach = igbeattach;
+	edev->transmit = igbetransmit;
+	edev->interrupt = igbeinterrupt;
+	edev->ifstat = igbeifstat;
+	edev->ctl = igbectl;
+
+	edev->arg = edev;
+	edev->promiscuous = igbepromiscuous;
+	edev->shutdown = igbeshutdown;
+	edev->multicast = igbemulticast;
+
+	return 0;
+}
+
+void
+etherigbelink(void)	/* register igbepnp as the probe routine under both card names */
+{
+	addethercard("i82543", igbepnp);
+	addethercard("igbe", igbepnp);
+}
+

+ 1646 - 0
sys/src/9/386/etherm10g.c

@@ -0,0 +1,1646 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * myricom 10 Gb ethernet driver
+ * © 2007 erik quanstrom, coraid
+ *
+ * the card is big endian.
+ * we use u64int rather than uintptr to hold addresses so that
+ * we don't get "warning: stupid shift" on 32-bit architectures.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "../port/netif.h"
+
+#include "etherif.h"
+#include "io.h"
+
+#ifndef KiB
+#define KiB		1024u			/* Kibi 0x0000000000000400 */
+#define MiB		1048576u		/* Mebi 0x0000000000100000 */
+#endif /* KiB */
+
+#define	dprint(...)	if(debug) print(__VA_ARGS__)
+#define	pcicapdbg(...)
+#define malign(n)	mallocalign((n), 4*KiB, 0, 0)
+
+#include "etherm10g2k.i"
+#include "etherm10g4k.i"
+
+static int 	debug		= 0;
+static char	Etimeout[]	= "timeout";
+
+enum {
+	Epromsz	= 256,
+	Maxslots= 1024,
+	Align	= 4096,
+	Maxmtu	= 9000,
+	Noconf	= 0xffffffff,
+
+	Fwoffset= 1*MiB,
+	Cmdoff	= 0xf80000,	/* command port offset */
+	Fwsubmt	= 0xfc0000,	/* firmware submission command port offset */
+	Rdmaoff	= 0xfc01c0,	/* rdma command port offset */
+};
+
+enum {
+	CZero,
+	Creset,
+	Cversion,
+
+	CSintrqdma,	/* issue these before Cetherup */
+	CSbigsz,	/* in bytes bigsize = 2^n */
+	CSsmallsz,
+
+	CGsendoff,
+	CGsmallrxoff,
+	CGbigrxoff,
+	CGirqackoff,
+	CGirqdeassoff,
+	CGsendrgsz,
+	CGrxrgsz,
+
+	CSintrqsz,	/* 2^n */
+	Cetherup,	/* above parameters + mtu/mac addr must be set first. */
+	Cetherdn,
+
+	CSmtu,		/* below may be issued live */
+	CGcoaloff,	/* in µs */
+	CSstatsrate,	/* in µs */
+	CSstatsdma,
+
+	Cpromisc,
+	Cnopromisc,
+	CSmac,
+
+	Cenablefc,
+	Cdisablefc,
+
+	Cdmatest,	/* address in d[0-1], d[2]=length */
+
+	Cenableallmc,
+	Cdisableallmc,
+
+	CSjoinmc,
+	CSleavemc,
+	Cleaveallmc,
+
+	CSstatsdma2,	/* adds (unused) multicast stats */
+};
+
+typedef union {
+	uint	i[2];		/* as two uints; cmd() presets i[1] to Noconf before issuing a command */
+	uchar	c[8];		/* the same storage viewed as bytes */
+} Cmd;
+
+typedef ulong Slot;
+typedef struct {
+	u16int	cksum;
+	u16int	len;
+} Slotparts;
+
+enum {
+	SFsmall	= 1,
+	SFfirst	= 2,
+	SFalign	= 4,
+	SFnotso	= 16,
+};
+
+typedef struct {
+	u32int	high;
+	u32int	low;
+	u16int	hdroff;
+	u16int	len;
+	uchar	pad;
+	uchar	nrdma;
+	uchar	chkoff;
+	uchar	flags;
+} Send;
+
+typedef struct {
+	QLock;
+	Send	*lanai;		/* tx ring (cksum+len in lanai memory) */
+	Send	*host;		/* tx ring (data in our memory) */
+	Block	**bring;
+//	uchar	*wcfifo;	/* what the heck is a w/c fifo? */
+	int	size;		/* of buffers in the z8's memory */
+	u32int	segsz;
+	uint	n;		/* rxslots */
+	uint	m;		/* mask; rxslots must be a power of two */
+	uint	i;		/* number of segments (not frames) queued */
+	uint	cnt;		/* number of segments sent by the card */
+
+	ulong	npkt;
+	vlong	nbytes;
+} Tx;
+
+typedef struct {
+	Lock;
+	Block	*head;
+	uint	size;		/* buffer size of each block */
+	uint	n;		/* n free buffers */
+	uint	cnt;
+} Bpool;
+
+static Bpool	smpool 	= { .size = 128, };
+static Bpool	bgpool	= { .size = Maxmtu, };
+
+typedef struct {
+	Bpool	*pool;		/* free buffers */
+	u32int	*lanai;		/* rx ring; we have no permanent host shadow */
+	Block	**host;		/* called "info" in myricom driver */
+//	uchar	*wcfifo;	/* cmd submission fifo */
+	uint	m;
+	uint	n;		/* rxslots */
+	uint	i;
+	uint	cnt;		/* number of buffers allocated (lifetime) */
+	uint	allocfail;
+} Rx;
+
+/* dma mapped.  unix network byte order. */
+typedef struct {
+	uchar	txcnt[4];
+	uchar	linkstat[4];
+	uchar	dlink[4];
+	uchar	derror[4];
+	uchar	drunt[4];
+	uchar	doverrun[4];
+	uchar	dnosm[4];
+	uchar	dnobg[4];
+	uchar	nrdma[4];
+	uchar	txstopped;
+	uchar	down;
+	uchar	updated;
+	uchar	valid;
+} Stats;
+
+enum {
+	Detached,
+	Attached,
+	Runed,
+};
+
+typedef struct {
+	Slot 	*entry;
+	u64int	busaddr;
+	uint	m;
+	uint	n;
+	uint	i;
+} Done;
+
+typedef struct Ctlr Ctlr;
+typedef struct Ctlr {
+	QLock;
+	int	state;
+	int	kprocs;
+	u64int	port;
+	Pcidev*	pcidev;
+	Ctlr*	next;
+	int	active;
+	int	id;		/* do we need this? */
+
+	uchar	ra[Eaddrlen];
+
+	int	ramsz;
+	uchar	*ram;
+
+	u32int	*irqack;
+	u32int	*irqdeass;
+	u32int	*coal;
+
+	char	eprom[Epromsz];
+	ulong	serial;		/* unit serial number */
+
+	QLock	cmdl;
+	Cmd	*cmd;		/* address of command return */
+	u64int	cprt;		/* bus address of command */
+
+	u64int	boot;		/* boot address */
+
+	Done	done;
+	Tx	tx;
+	Rx	sm;
+	Rx	bg;
+	Stats	*stats;
+	u64int	statsprt;
+
+	Rendez	rxrendez;
+	Rendez	txrendez;
+
+	int	msi;
+	u32int	linkstat;
+	u32int	nrdma;
+} Ctlr;
+
+static Ctlr 	*ctlrs;
+
+enum {
+	PciCapPMG	 = 0x01,	/* power management */
+	PciCapAGP	 = 0x02,
+	PciCapVPD	 = 0x03,	/* vital product data */
+	PciCapSID	 = 0x04,	/* slot id */
+	PciCapMSI	 = 0x05,
+	PciCapCHS	 = 0x06,	/* compact pci hot swap */
+	PciCapPCIX	 = 0x07,
+	PciCapHTC	 = 0x08,	/* hypertransport irq conf */
+	PciCapVND	 = 0x09,	/* vendor specific information */
+	PciCapHSW	 = 0x0C,	/* hot swap */
+	PciCapPCIe	 = 0x10,
+	PciCapMSIX	 = 0x11,
+};
+
+enum {
+	PcieAERC = 1,
+	PcieVC,
+	PcieSNC,
+	PciePBC,
+};
+
+enum {
+	AercCCR	= 0x18,		/* control register */
+};
+
+enum {
+	PcieCTL	= 8,
+	PcieLCR	= 12,
+	PcieMRD	= 0x7000,	/* maximum read size */
+};
+
+/*
+ * Walk the PCI capability list of p looking for capability id
+ * cap.  Returns the configuration-space offset of the capability,
+ * or 0 if it is not found within 48 entries or the list ends
+ * (next pointer below 0x40, or an id of 0xff).
+ */
+static int
+pcicap(Pcidev *p, int cap)
+{
+	int id, n, off;
+
+	pcicapdbg("pcicap: %x:%d\n", p->vid, p->did);
+	off = 0x34;			/* 0x14 for cardbus */
+	for(n = 0; n < 48; n++){
+		pcicapdbg("\t" "loop %x\n", off);
+		off = pcicfgr8(p, off);
+		pcicapdbg("\t" "pcicfgr8 %x\n", off);
+		if(off < 0x40)
+			return 0;
+		off &= ~3;
+		id = pcicfgr8(p, off);
+		pcicapdbg("\t" "pcicfgr8 %x\n", id);
+		if(id == 0xff)
+			return 0;
+		if(id == cap)
+			return off;
+		off++;			/* the next pointer follows the id */
+	}
+	return 0;
+}
+
+/*
+ * look for extended capability cap, following the chain that
+ * starts at offset 0x100.  returns its offset, or 0 if the chain
+ * leaves the range 0x100 to 4*KiB-2.
+ *
+ * this function doesn't work because pcicfgr32 doesn't have access
+ * to the pcie extended configuration space.
+ */
+static int
+pciecap(Pcidev *p, int cap)
+{
+	uint hdr, off;
+
+	off = 0x100;
+	for(;;){
+		hdr = pcicfgr32(p, off);
+		if((hdr & 0xffff) == cap)
+			break;
+		off = hdr >> 20;	/* offset of the next capability */
+		print("pciecap offset = %ud\n",  off);
+		if(off < 0x100 || off >= 4*KiB - 1)
+			return 0;
+	}
+	print("pciecap found = %ud\n",  off);
+	return off;
+}
+
+/*
+ * Replace the PcieMRD ("maximum read size") field of the PCIe
+ * control register with 5<<12 (presumably the 4KiB setting -
+ * confirm against the PCIe specification).
+ * Returns 0, or -1 if the device has no PCIe capability.
+ */
+static int
+setpcie(Pcidev *p)
+{
+	int ctl, off;
+
+	off = pcicap(p, PciCapPCIe);
+	if(off < 64)
+		return -1;
+	off += PcieCTL;
+
+	ctl = pcicfgr16(p, off);
+	ctl = (ctl & ~PcieMRD) | 5<<12;
+	pcicfgw16(p, off, ctl);
+	return 0;
+}
+
+/*
+ * Decide which firmware variant to use, returning 2*KiB or 4*KiB,
+ * or -1 if the device has no PCIe capability.  The configuration
+ * variable myriforce overrides the choice: 4096 and 2048 are taken
+ * as given, anything else is treated as 2048.  Without it, every
+ * path currently selects 4*KiB; only the reported reason differs.
+ */
+static int
+whichfw(Pcidev *p)
+{
+	char *s;
+	int i, off, lanes, ecrc;
+	u32int cap;
+
+	/* check the number of configured lanes. */
+	off = pcicap(p, PciCapPCIe);
+	if(off < 64)
+		return -1;
+	off += PcieLCR;
+	cap = pcicfgr16(p, off);
+	lanes = (cap>>4) & 0x3f;
+
+	/* check AERC register.  we need it on.  */
+	off = pciecap(p, PcieAERC);
+	print("%d offset\n", off);
+	cap = 0;
+	if(off != 0){
+		off += AercCCR;
+		cap = pcicfgr32(p, off);
+		print("%ud cap\n", cap);
+	}
+	ecrc = (cap>>4) & 0xf;
+	/* if we don't like the aerc, kick it here. */
+
+	print("m10g %d lanes; ecrc=%d; ", lanes, ecrc);
+	if(s = getconf("myriforce")){
+		i = atoi(s);
+		/* only the two known sizes are accepted; fall back to 2KiB */
+		if(i != 4*KiB && i != 2*KiB)
+			i = 2*KiB;
+		print("fw=%d [forced]\n", i);
+		return i;
+	}
+	if(lanes <= 4){
+		print("fw = 4096 [lanes]\n");
+		return 4*KiB;
+	}
+	/* NOTE(review): 10 is decimal (binary 1010) - confirm the intended mask */
+	if(ecrc & 10){
+		print("fw = 4096 [ecrc set]\n");
+		return 4*KiB;
+	}
+	print("fw = 4096 [default]\n");
+	return 4*KiB;
+}
+
+/*
+ * pick the mac address and serial number out of c->eprom, which
+ * is scanned as a run of NUL-terminated strings ending at an
+ * empty string or at Epromsz bytes.
+ *	MAC=xx:xx:xx:xx:xx:xx	fills c->ra (6 hex bytes, 3 chars apart)
+ *	SN=n			fills c->serial
+ * returns 0 if both were found, -1 otherwise.
+ */
+static int
+parseeprom(Ctlr *c)
+{
+	int i, j, k, l, bits;
+	char *s;
+
+	dprint("m10g eprom:\n");
+	s = c->eprom;
+	bits = 3;		/* bit 0: MAC still missing; bit 1: SN still missing */
+	/* test the bound before touching s[i] */
+	for(i = 0; i < Epromsz && s[i]; i++){
+		l = strlen(s+i);
+		dprint("\t%s\n", s+i);
+		if(strncmp(s+i, "MAC=", 4) == 0 && l == 4+12+5){
+			/* clear rather than toggle, so a repeated entry can't un-find it */
+			bits &= ~1;
+			j = i + 4;
+			for(k = 0; k < 6; k++)
+				c->ra[k] = strtoul(s+j+3*k, 0, 16);
+		}else if(strncmp(s+i, "SN=", 3) == 0){
+			bits &= ~2;
+			c->serial = atoi(s+i+3);
+		}
+		i += l;		/* the loop's i++ then steps over the NUL */
+	}
+	if(bits)
+		return -1;
+	return 0;
+}
+
+/* return a u16int whose bytes in memory are i, high byte first */
+static u16int
+pbit16(u16int i)
+{
+	uchar b[2];
+	u16int v;
+
+	b[0] = i>>8;
+	b[1] = i;
+	memmove(&v, b, sizeof v);
+	return v;
+}
+
+/* assemble a 16-bit value from two bytes, high byte first */
+static u16int
+gbit16(uchar i[2])
+{
+	return i[0]<<8 | i[1];
+}
+
+/* return a u32int whose bytes in memory are i, high byte first */
+static u32int
+pbit32(u32int i)
+{
+	uchar b[4];
+	u32int v;
+
+	b[0] = i>>24;
+	b[1] = i>>16;
+	b[2] = i>>8;
+	b[3] = i;
+	memmove(&v, b, sizeof v);
+	return v;
+}
+
+/* assemble a 32-bit value from four bytes, high byte first */
+static u32int
+gbit32(uchar i[4])
+{
+	return i[0]<<24 | i[1]<<16 | i[2]<<8 | i[3];
+}
+
+/* convert the first i words of cmd in place with pbit32 */
+static void
+prepcmd(uint *cmd, int i)
+{
+	int k;
+
+	for(k = i - 1; k >= 0; k--)
+		cmd[k] = pbit32(cmd[k]);
+}
+
+/*
+ * the command looks like this (in 32-bit integers)
+ * cmd type
+ * addr (low)
+ * addr (high)
+ * pad (used for dma testing)
+ * response (high)
+ * response (low)
+ * 40 byte = 5 int pad.
+ */
+
+/*
+ * send command type with argument data to the card and wait for
+ * the reply to be written to c->cmd.  c->cmdl serialises commands.
+ * polls up to 15 times, sleeping 1ms between polls.
+ * returns the 32-bit reply value; raises Etimeout if no reply
+ * arrives.  the lock is released on every exit, including an
+ * error raised out of tsleep.
+ */
+u32int
+cmd(Ctlr *c, int type, u64int data)
+{
+	u32int buf[16], i, r;
+	Cmd *cmd;
+
+	qlock(&c->cmdl);
+	if(waserror()){
+		/* previously a bare nexterror(), which left cmdl held */
+		qunlock(&c->cmdl);
+		nexterror();
+	}
+	cmd = c->cmd;
+	cmd->i[1] = Noconf;	/* the card overwrites this when it replies */
+	memset(buf, 0, sizeof buf);
+	buf[0] = type;
+	buf[1] = data;
+	buf[2] = data >> 32;
+	buf[4] = c->cprt >> 32;
+	buf[5] = c->cprt;
+	prepcmd(buf, 6);
+	coherence();
+	memmove(c->ram + Cmdoff, buf, sizeof buf);
+
+	for(i = 0; i < 15; i++){
+		if(cmd->i[1] != Noconf){
+			/* read the reply while we still own the buffer */
+			r = gbit32(cmd->c);
+			if(cmd->i[1] != 0)
+				dprint("[%ux]", r);
+			poperror();
+			qunlock(&c->cmdl);
+			return r;
+		}
+		tsleep(&up->sleep, return0, 0, 1);
+	}
+	iprint("m10g: cmd timeout [%ux %ux] cmd=%d\n",
+		cmd->i[0], cmd->i[1], type);
+	error(Etimeout);	/* the handler above unlocks */
+	return ~0;			/* silence! */
+}
+
+/*
+ * like cmd(), but the argument is the 6-byte ethernet address m:
+ * bytes 0-3 go in the first argument word and bytes 4-5 in the
+ * second.  returns the 32-bit reply value; raises Etimeout if
+ * the card does not reply within 15 polls 1ms apart.  c->cmdl is
+ * released on every exit, including an error raised out of tsleep.
+ */
+u32int
+maccmd(Ctlr *c, int type, uchar *m)
+{
+	u32int buf[16], i, r;
+	Cmd *cmd;
+
+	qlock(&c->cmdl);
+	if(waserror()){
+		/* previously a bare nexterror(), which left cmdl held */
+		qunlock(&c->cmdl);
+		nexterror();
+	}
+	cmd = c->cmd;
+	cmd->i[1] = Noconf;	/* the card overwrites this when it replies */
+	memset(buf, 0, sizeof buf);
+	buf[0] = type;
+	buf[1] = m[0]<<24 | m[1]<<16 | m[2]<<8 | m[3];
+	buf[2] = m[4]<< 8 | m[5];
+	buf[4] = c->cprt >> 32;
+	buf[5] = c->cprt;
+	prepcmd(buf, 6);
+	coherence();
+	memmove(c->ram + Cmdoff, buf, sizeof buf);
+
+	for(i = 0; i < 15; i++){
+		if(cmd->i[1] != Noconf){
+			/* read the reply while we still own the buffer */
+			r = gbit32(cmd->c);
+			if(cmd->i[1] != 0)
+				dprint("[%ux]", r);
+			poperror();
+			qunlock(&c->cmdl);
+			return r;
+		}
+		tsleep(&up->sleep, return0, 0, 1);
+	}
+	iprint("m10g: maccmd timeout [%ux %ux] cmd=%d\n",
+		cmd->i[0], cmd->i[1], type);
+	error(Etimeout);	/* the handler above unlocks */
+	return ~0;			/* silence! */
+}
+
+/* remove this garbage after testing */
+enum {
+	DMAread	= 0x10000,
+	DMAwrite= 0x1,
+};
+
+/*
+ * issue a Cdmatest command for len bytes at bus address addr;
+ * type (DMAread, DMAwrite or both) is multiplied into the length
+ * word.  unlike cmd(), no lock is taken here.
+ * returns the non-zero reply value; raises Eio if the reply is 0
+ * and Etimeout after 15 polls 5ms apart.
+ */
+u32int
+dmatestcmd(Ctlr *c, int type, u64int addr, int len)
+{
+	u32int req[16], r;
+	int tries;
+
+	memset(req, 0, sizeof req);
+	memset(c->cmd, Noconf, sizeof *c->cmd);
+	req[0] = Cdmatest;
+	req[1] = addr;
+	req[2] = addr >> 32;
+	req[3] = len * type;
+	req[4] = c->cprt >> 32;
+	req[5] = c->cprt;
+	prepcmd(req, 6);
+	coherence();
+	memmove(c->ram + Cmdoff, req, sizeof req);
+
+	if(waserror())
+		nexterror();
+	for(tries = 0; tries < 15; tries++){
+		if(c->cmd->i[1] == Noconf){
+			/* no answer yet */
+			tsleep(&up->sleep, return0, 0, 5);
+			continue;
+		}
+		r = gbit32(c->cmd->c);
+		if(r == 0)
+			error(Eio);
+		poperror();
+		return r;
+	}
+	error(Etimeout);
+	return ~0;			/* silence! */
+}
+
+/*
+ * write a 6-word request carrying the flag `on' to the card at
+ * Rdmaoff and wait for the card to write Noconf into
+ * c->cmd->i[0].  returns the 32-bit value then found in
+ * c->cmd->c; reports and raises Etimeout after 20 polls 1ms
+ * apart.  no lock is taken here.
+ */
+u32int
+rdmacmd(Ctlr *c, int on)
+{
+	u32int buf[16], i;
+
+	memset(buf, 0, sizeof buf);
+	c->cmd->i[0] = 0;
+	coherence();
+	buf[0] = c->cprt >> 32;
+	buf[1] = c->cprt;
+	buf[2] = Noconf;
+	buf[3] = c->cprt >> 32;
+	buf[4] = c->cprt;
+	buf[5] = on;
+	prepcmd(buf, 6);
+	memmove(c->ram + Rdmaoff, buf, sizeof buf);
+
+	if(waserror())
+		nexterror();
+	for(i = 0; i < 20; i++){
+		if(c->cmd->i[0] == Noconf){
+			poperror();
+			return gbit32(c->cmd->c);
+		}
+		tsleep(&up->sleep, return0, 0, 1);
+	}
+	/* report first: error() does not return, so this used to be dead */
+	iprint("m10g: rdmacmd timeout\n");
+	error(Etimeout);
+	return ~0;			/* silence! */
+}
+
+/*
+ * copy the firmware image chosen by whichfw() into card memory
+ * at Fwoffset, a 32-bit word at a time.  *align is set to
+ * whichfw's answer; fw4k is used when that is 4KiB, fw2k
+ * otherwise.  returns the image size rounded down to a multiple
+ * of 4.
+ */
+static int
+loadfw(Ctlr *c, int *align)
+{
+	u32int *src, *dst, *end;
+	uint nbytes;
+
+	*align = whichfw(c->pcidev);
+	if(*align == 4*KiB){
+		src = (u32int*)fw4k;
+		nbytes = sizeof fw4k;
+	}else{
+		src = (u32int*)fw2k;
+		nbytes = sizeof fw2k;
+	}
+
+	dst = (u32int*)(c->ram + Fwoffset);
+	for(end = src + nbytes/4; src < end; )
+		*dst++ = *src++;
+	return nbytes & ~3;
+}
+
+/*
+ * load the firmware with loadfw() and submit the boot request
+ * at Fwsubmt, then wait up to 20ms for the card to write Noconf
+ * into c->cmd->i[0].  on success tx.segsz is set to the
+ * alignment loadfw reported.
+ * returns 0 on success (or if loadfw returned a zero size),
+ * -1 if the card never answers.
+ */
+static int
+bootfw(Ctlr *c)
+{
+	int i, sz, align;
+	uint buf[16];
+	Cmd* cmd;
+
+	if((sz = loadfw(c, &align)) == 0)
+		return 0;
+	dprint("bootfw %d bytes ... ", sz);
+	cmd = c->cmd;
+
+	memset(buf, 0, sizeof buf);
+	cmd->i[0] = 0;
+	coherence();
+	buf[0] = c->cprt >> 32;	/* upper dma target address */
+	buf[1] = c->cprt;	/* lower */
+	buf[2] = Noconf;	/* writeback */
+	buf[3] = Fwoffset + 8;	/* was terminated by a comma, not a semicolon */
+	buf[4] = sz - 8;
+	buf[5] = 8;
+	buf[6] = 0;
+	prepcmd(buf, 7);
+	coherence();
+	memmove(c->ram + Fwsubmt, buf, sizeof buf);
+
+	for(i = 0; i < 20; i++){
+		if(cmd->i[0] == Noconf)
+			break;
+		delay(1);
+	}
+	dprint("[%ux %ux]", gbit32(cmd->c), gbit32(cmd->c+4));
+	if(i == 20){
+		print("m10g: cannot load fw\n");
+		return -1;
+	}
+	dprint("\n");
+	c->tx.segsz = align;
+	return 0;
+}
+
+/*
+ * poke the registers at offsets 0x10 and 0x18 from the
+ * capability at c->boot and read back the one at 0x14.
+ * returns 0 if the value read is 0xfffffff0, -1 otherwise.
+ */
+static int
+kickthebaby(Pcidev *p, Ctlr *c)
+{
+	/* don't kick the baby! */
+	u32int status;
+
+	pcicfgw8(p,  0x10 + c->boot, 0x3);
+	pcicfgw32(p, 0x18 + c->boot, 0xfffffff0);
+	status = pcicfgr32(p, 0x14 + c->boot);
+
+	dprint("reboot status = %ux\n", status);
+	if(status == 0xfffffff0)
+		return 0;
+	return -1;
+}
+
+typedef struct {
+	uchar	len[4];
+	uchar	type[4];	/* decoded with gbit32 in chkfw; must equal Teth */
+	char	version[128];	/* printed with %s by chkfw */
+	uchar	globals[4];
+	uchar	ramsz[4];	/* decoded with gbit32 in chkfw for a debug print */
+	uchar	specs[4];
+	uchar	specssz[4];
+} Fwhdr;
+
+enum {
+	Tmx	= 0x4d582020,	/* the bytes 'M' 'X' ' ' ' ' */
+	Tpcie	= 0x70636965,	/* the bytes 'p' 'c' 'i' 'e' */
+	Teth	= 0x45544820,	/* the bytes 'E' 'T' 'H' ' ' */
+	Tmcp0	= 0x4d435030,	/* the bytes 'M' 'C' 'P' '0' */
+};
+
+/* printable name for a firmware header type word; "*GOK*" if unknown */
+static char *
+fwtype(u32int type)
+{
+	if(type == Tmx)
+		return "mx";
+	if(type == Tpcie)
+		return "PCIe";
+	if(type == Teth)
+		return "eth";
+	if(type == Tmcp0)
+		return "mcp0";
+	return "*GOK*";
+}
+
+/*
+ * validate the firmware header the card advertises at ram
+ * offset 0x3c, then boot our firmware and issue rdmacmd(c, 0).
+ * returns -1 if the header offset is misaligned or out of range
+ * or the type is not Teth; otherwise 1 if bootfw or rdmacmd
+ * returned non-zero, else 0.
+ */
+static int
+chkfw(Ctlr *c)
+{
+	uintptr hdroff;
+	Fwhdr *hdr;
+	u32int kind;
+
+	hdroff = gbit32(c->ram+0x3c);
+	dprint("firmware %llux\n", (u64int)hdroff);
+	if((hdroff&3) != 0 || hdroff + sizeof *hdr > c->ramsz){
+		print("!m10g: bad firmware %llux\n", (u64int)hdroff);
+		return -1;
+	}
+	hdr = (Fwhdr*)(c->ram + hdroff);
+	kind = gbit32(hdr->type);
+	dprint("\t" "type	%s\n", fwtype(kind));
+	dprint("\t" "vers	%s\n", hdr->version);
+	dprint("\t" "ramsz	%ux\n", gbit32(hdr->ramsz));
+	if(kind != Teth){
+		print("!m10g: bad card type %s\n", fwtype(kind));
+		return -1;
+	}
+
+	/* rdmacmd is only issued when bootfw returned 0 */
+	if(bootfw(c))
+		return 1;
+	return rdmacmd(c, 0) != 0;
+}
+
+/*
+ * figure printed as MB/s for a dmatestcmd result r over sz-byte
+ * transfers; dirs is 1 for a one-way test and 2 for read+write.
+ * the low 16 bits of r are the divisor (presumably elapsed time,
+ * with the transfer count in the high bits -- confirm against
+ * the firmware interface); 0 is returned instead of dividing by
+ * zero.
+ */
+static u32int
+dmarate(u32int r, u32int sz, u32int dirs)
+{
+	u32int t;
+
+	t = r & 0xffff;
+	if(t == 0)
+		return 0;
+	return ((r>>16)*sz*2*dirs)/t;
+}
+
+/*
+ * bring the card to a known state: check and boot the firmware,
+ * reset it, describe the interrupt queue, fetch the irq ack,
+ * irq deassert and coalescing register offsets, run and print
+ * the three dma tests, then set the mac address, flow control
+ * and mtu.  returns 0; failures are raised as errors (after
+ * printing "m10g: reset error").
+ */
+static int
+reset(Ether *e, Ctlr *c)
+{
+	u32int i, sz;
+
+	if(waserror()){
+		print("m10g: reset error\n");
+		nexterror();
+	}
+
+	chkfw(c);
+	cmd(c, Creset, 0);
+
+	cmd(c, CSintrqsz, c->done.n * sizeof *c->done.entry);
+	cmd(c, CSintrqdma, c->done.busaddr);
+	c->irqack =   (u32int*)(c->ram + cmd(c, CGirqackoff, 0));
+	/* required only if we're not doing msi? */
+	c->irqdeass = (u32int*)(c->ram + cmd(c, CGirqdeassoff, 0));
+	/* this is the driver default, why fiddle with this? */
+	c->coal = (u32int*)(c->ram + cmd(c, CGcoaloff, 0));
+	*c->coal = pbit32(25);
+
+	dprint("dma stats:\n");
+	rdmacmd(c, 1);
+	sz = c->tx.segsz;
+	i = dmatestcmd(c, DMAread, c->done.busaddr, sz);
+	print("\t" "read: %ud MB/s\n", dmarate(i, sz, 1));
+	i = dmatestcmd(c, DMAwrite, c->done.busaddr, sz);
+	print("\t" "write: %ud MB/s\n", dmarate(i, sz, 1));
+	i = dmatestcmd(c, DMAwrite|DMAread, c->done.busaddr, sz);
+	print("\t" "r/w: %ud MB/s\n", dmarate(i, sz, 2));
+	/* the tests used the done ring as scratch; clear it again */
+	memset(c->done.entry, 0, c->done.n * sizeof *c->done.entry);
+
+	maccmd(c, CSmac, c->ra);
+//	cmd(c, Cnopromisc, 0);
+	cmd(c, Cenablefc, 0);
+	e->maxmtu = Maxmtu;
+	cmd(c, CSmtu, e->maxmtu);
+	dprint("CSmtu %d...\n", e->maxmtu);
+
+	poperror();
+	return 0;
+}
+
+/*
+ * release the host memory hanging off c, then c itself.
+ * the Blocks referenced from the rings are not released.
+ */
+static void
+ctlrfree(Ctlr *c)
+{
+	/* free up all the Block*s, too */
+	free(c->tx.host);
+	free(c->tx.bring);	/* allocated beside tx.host in open0; was leaked */
+	free(c->sm.host);
+	free(c->bg.host);
+	free(c->cmd);
+	free(c->done.entry);
+	free(c->stats);
+	free(c);
+}
+
+/*
+ * map the card's memory (bar 0), allocate the command reply
+ * buffer, the done ring and the stats block, copy the eprom
+ * strings out of the top of card ram, then set up pcie and
+ * parse the eprom.
+ * returns 0 on success and -1 on any failure; on failure the
+ * mapping and the buffers allocated here are released again.
+ * c itself is left for the caller to free.
+ */
+static int
+setmem(Pcidev *p, Ctlr *c)
+{
+	u32int i;
+	u64int raddr;
+	Done *d;
+	void *mem;
+
+	c->tx.segsz = 2048;
+	c->ramsz = 2*MiB - (2*48*KiB + 32*KiB) - 0x100;
+	if(c->ramsz > p->mem[0].size)
+		return -1;
+
+	raddr = p->mem[0].bar & ~0x0F;
+	mem = vmap(raddr, p->mem[0].size);
+	if(mem == nil){
+		print("m10g: can't map %8.8lux\n", p->mem[0].bar);
+		return -1;
+	}
+	dprint("%llux <- vmap(mem[0].size = %ux)\n", raddr, p->mem[0].size);
+	c->port = raddr;
+	c->ram = mem;
+
+	d = &c->done;
+	d->n = Maxslots;
+	d->m = d->n - 1;
+	i = d->n * sizeof *d->entry;
+
+	/* allocate everything first so a failure is caught before use */
+	c->cmd = malign(sizeof *c->cmd);
+	d->entry = malign(i);
+	c->stats = malign(sizeof *c->stats);
+	if(c->cmd == nil || d->entry == nil || c->stats == nil){
+		print("m10g: no memory for dma buffers\n");
+		goto Bad;
+	}
+	c->cprt = PCIWADDR(c->cmd);
+	memset(d->entry, 0, i);
+	d->busaddr = PCIWADDR(d->entry);
+	memset(c->stats, 0, sizeof *c->stats);
+	c->statsprt = PCIWADDR(c->stats);
+
+	memmove(c->eprom, c->ram + c->ramsz - Epromsz, Epromsz-2);
+	/*
+	 * this used to return setpcie(p) || parseeprom(c), i.e. 1
+	 * on failure, which a caller testing for -1 does not see.
+	 */
+	if(setpcie(p) != 0 || parseeprom(c) != 0)
+		goto Bad;
+	return 0;
+
+Bad:
+	free(c->cmd);
+	free(d->entry);
+	free(c->stats);
+	c->cmd = nil;
+	d->entry = nil;
+	c->stats = nil;
+	vunmap(mem, p->mem[0].size);
+	c->ram = nil;
+	return -1;
+}
+
+/* receive ring for a packet of sz bytes: small if it fits smpool.size, else big */
+static Rx*
+whichrx(Ctlr *c, int sz)
+{
+	return sz <= smpool.size? &c->sm: &c->bg;
+}
+
+/*
+ * pop one Block off the free list of rx's pool, under the pool
+ * lock.  returns nil if the list is empty.
+ */
+static Block*
+balloc(Rx* rx)
+{
+	Bpool *pool;
+	Block *b;
+
+	pool = rx->pool;
+	ilock(pool);
+	b = pool->head;
+	if(b != nil){
+		pool->head = b->next;
+		b->next = nil;
+		pool->n--;
+	}
+	iunlock(pool);
+	return b;
+}
+
+/*
+ * free routine installed on small receive Blocks: reset rp/wp
+ * to the first 4KiB boundary at or after base, clear the
+ * checksum flags and push the Block back onto smpool.
+ */
+static void
+smbfree(Block *b)
+{
+	uchar *start;
+
+	start = (uchar*)ROUNDUP((uintptr)b->base, 4*KiB);
+	b->rp = start;
+	b->wp = start;
+	b->flag &= ~(Bpktck|Btcpck|Budpck|Bipck);
+
+	ilock(&smpool);
+	b->next = smpool.head;
+	smpool.head = b;
+	smpool.n++;
+	smpool.cnt++;
+	iunlock(&smpool);
+}
+
+/*
+ * free routine installed on big receive Blocks: reset rp/wp to
+ * the first 4KiB boundary at or after base, clear the checksum
+ * flags and push the Block back onto bgpool.
+ */
+static void
+bgbfree(Block *b)
+{
+	uchar *start;
+
+	start = (uchar*)ROUNDUP((uintptr)b->base, 4*KiB);
+	b->rp = start;
+	b->wp = start;
+	b->flag &= ~(Bpktck|Btcpck|Budpck|Bipck);
+
+	ilock(&bgpool);
+	b->next = bgpool.head;
+	bgpool.head = b;
+	bgpool.n++;
+	bgpool.cnt++;
+	iunlock(&bgpool);
+}
+
+/*
+ * hand free Blocks from rx's pool to the card, eight at a time.
+ * each batch writes eight (high, low) bus-address pairs into
+ * the card's ring at rx->lanai and records the Blocks in
+ * rx->host.  nothing is done while the pool holds fewer than
+ * eight Blocks.
+ */
+static void
+replenish(Rx *rx)
+{
+	u32int buf[16], i, idx, e;
+	Bpool *p;
+	Block *b;
+
+	p = rx->pool;
+	if(p->n < 8)
+		return;
+	memset(buf, 0, sizeof buf);
+	/* e: ring slots still to fill, in multiples of eight */
+	e = (rx->i - rx->cnt) & ~7;
+	e += rx->n;
+	while(p->n >= 8 && e){
+		idx = rx->cnt & rx->m;
+		for(i = 0; i < 8; i++){
+			b = balloc(rx);
+			/* check before b is dereferenced, not after */
+			assert(b);
+			buf[i*2]   = pbit32((u64int)PCIWADDR(b->wp) >> 32);
+			buf[i*2+1] = pbit32(PCIWADDR(b->wp));
+			rx->host[idx+i] = b;
+		}
+		memmove(rx->lanai + 2*idx, buf, sizeof buf);
+		coherence();
+		rx->cnt += 8;
+		e -= 8;
+	}
+	if(e && p->n > 7+1)
+		print("should panic? pool->n = %d\n", p->n);
+}
+
+/*
+ * future:
+ * if (c->mtrr >= 0) {
+ *	c->tx.wcfifo = c->ram+0x200000;
+ *	c->sm.wcfifo = c->ram+0x300000;
+ *	c->bg.wcfifo = c->ram+0x340000;
+ * }
+ */
+
+/* smallest power of two that is >= j (1 for j <= 1) */
+static int
+nextpow(int j)
+{
+	int p;
+
+	p = 1;
+	while(p < j)
+		p <<= 1;
+	return p;
+}
+
+/* malign sz bytes and zero them; raises Enomem instead of returning nil */
+static void*
+emalign(int sz)
+{
+	void *mem;
+
+	if((mem = malign(sz)) == nil)
+		error(Enomem);
+	memset(mem, 0, sz);
+	return mem;
+}
+
+/*
+ * ask the firmware for the geometry of the send and receive
+ * rings, allocate the host-side shadow arrays, stock the small
+ * and big buffer pools, tell the card where to dma statistics
+ * and finally bring the interface up with Cetherup.
+ * cmd() and emalign() raise errors on failure, so this does too.
+ */
+static void
+open0(Ether *e, Ctlr *c)
+{
+	Block *b;
+	int i, sz, entries;
+
+	/* send ring: size in bytes from the card, one Send per entry */
+	entries = cmd(c, CGsendrgsz, 0) / sizeof *c->tx.lanai;
+	c->tx.lanai = (Send*)(c->ram + cmd(c, CGsendoff, 0));
+	c->tx.host  = emalign(entries * sizeof *c->tx.host);
+	c->tx.bring = emalign(entries * sizeof *c->tx.bring);
+	c->tx.n = entries;
+	c->tx.m = entries-1;
+
+	/* receive rings: 8 bytes per entry; small and big share the count */
+	entries = cmd(c, CGrxrgsz, 0)/8;
+	c->sm.pool = &smpool;
+	cmd(c, CSsmallsz, c->sm.pool->size);
+	c->sm.lanai = (u32int*)(c->ram + cmd(c, CGsmallrxoff, 0));
+	c->sm.n = entries;
+	c->sm.m = entries-1;
+	c->sm.host = emalign(entries * sizeof *c->sm.host);
+
+	c->bg.pool = &bgpool;
+	c->bg.pool->size = nextpow(2 + e->maxmtu);  /* 2-byte alignment pad */
+	cmd(c, CSbigsz, c->bg.pool->size);
+	c->bg.lanai = (u32int*)(c->ram + cmd(c, CGbigrxoff, 0));
+	c->bg.n = entries;
+	c->bg.m = entries-1;
+	c->bg.host = emalign(entries * sizeof *c->bg.host);
+
+	/*
+	 * stock the pools.  each Block gets 4KiB of slack because
+	 * smbfree/bgbfree round rp/wp up to a 4KiB boundary; the
+	 * freeb presumably ends up in b->free, which is what puts
+	 * the Block on the pool list -- confirm in freeb.
+	 */
+	sz = c->sm.pool->size + 4*KiB;
+	for(i = 0; i < c->sm.n; i++){
+		if((b = allocb(sz)) == 0)
+			break;
+		b->free = smbfree;
+		freeb(b);
+	}
+	sz = c->bg.pool->size + 4*KiB;
+	for(i = 0; i < c->bg.n; i++){
+		if((b = allocb(sz)) == 0)
+			break;
+		b->free = bgbfree;
+		freeb(b);
+	}
+
+	cmd(c, CSstatsdma, c->statsprt);
+	c->linkstat = ~0;
+	c->nrdma = 15;
+
+	cmd(c, Cetherup, 0);
+}
+
+/*
+ * take the next completed entry off the done ring and return
+ * the receive Block it describes, with rp/wp set around the
+ * packet.  returns 0 when the next entry is empty, and also
+ * (after a message) when the chosen receive ring has no
+ * outstanding buffer or no Block recorded for the slot.
+ */
+static Block*
+nextblock(Ctlr *c)
+{
+	uint i;
+	u16int l, k;
+	Block *b;
+	Done *d;
+	Rx *rx;
+	Slot *s;
+	Slotparts *sp;
+
+	d = &c->done;
+	s = d->entry;
+	i = d->i & d->m;
+	sp = (Slotparts *)(s + i);
+	l = sp->len;
+	if(l == 0)		/* a zero length marks an entry not yet filled in */
+		return 0;
+	k = sp->cksum;
+	s[i] = 0;		/* clear the entry so it reads as empty next time round */
+	d->i++;
+	l = gbit16((uchar*)&l);	/* the length is stored high byte first */
+//dprint("nextb: i=%d l=%d\n", d->i, l);
+	rx = whichrx(c, l);
+	if(rx->i >= rx->cnt){
+		iprint("m10g: overrun\n");
+		return 0;
+	}
+	i = rx->i & rx->m;
+	b = rx->host[i];
+	rx->host[i] = 0;
+	if(b == 0){
+		iprint("m10g: error rx to no block.  memory is hosed.\n");
+		return 0;
+	}
+	rx->i++;
+
+	b->flag |= Bipck|Btcpck|Budpck;
+	b->checksum = k;
+	b->rp += 2;		/* step over two bytes ahead of the packet (open0 sizes buffers with a "2-byte alignment pad") */
+	b->wp += 2+l;
+	b->lim = b->wp;			/* lie like a dog. */
+	return b;
+}
+
+/*
+ * sleep condition for the receive kproc.  returns -1 if the
+ * next done-ring entry has a non-zero length; otherwise writes
+ * pbit32(3) to irqack[0] and returns 0.
+ */
+static int
+rxcansleep(void *v)
+{
+	Ctlr *c;
+	Done *d;
+	Slotparts *next;
+
+	c = v;
+	d = &c->done;
+	next = (Slotparts *)(d->entry + (d->i & d->m));
+	if(next->len == 0){
+		c->irqack[0] = pbit32(3);
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * receive kproc: top up both receive rings, sleep on rxrendez
+ * with rxcansleep as the condition, then pass every completed
+ * Block to etheriq.  never returns.
+ */
+static void
+m10rx(void *v)
+{
+	Ether *e;
+	Ctlr *c;
+	Block *b;
+
+	e = v;
+	c = e->ctlr;
+	for(;;){
+		replenish(&c->sm);
+		replenish(&c->bg);
+		sleep(&c->rxrendez, rxcansleep, c);
+		for(b = nextblock(c); b != nil; b = nextblock(c))
+			etheriq(e, b, 1);
+	}
+}
+
+/*
+ * free transmitted Blocks.  n is the packet count taken from
+ * the card's statistics; ring slots are walked from tx->cnt,
+ * freeing each recorded Block and counting it in tx->npkt and
+ * tx->nbytes, until npkt reaches n, cnt catches up with tx->i,
+ * or a whole ring's worth of slots has been examined.
+ */
+static void
+txcleanup(Tx *tx, u32int n)
+{
+	Block *b;
+	uint j, l, m;
+
+	if(tx->npkt == n)
+		return;
+	l = 0;
+	m = tx->m;
+	/*
+	 * if tx->cnt == tx->i, yet tx->npkt == n-1, we just
+	 * caught ourselves and myricom card updating.
+	 */
+	for(;; tx->cnt++){
+		j = tx->cnt & tx->m;
+		/* only the last descriptor of a packet has a Block recorded (see m10gtransmit) */
+		if(b = tx->bring[j]){
+			tx->bring[j] = 0;
+			tx->nbytes += BLEN(b);
+			freeb(b);
+			if(++tx->npkt == n)
+				return;
+		}
+		if(tx->cnt == tx->i)
+			return;
+		/* l counts slots examined; m is the ring mask, so this caps the walk at one lap */
+		if(l++ == m){
+			iprint("tx ovrun: %ud %uld\n", n, tx->npkt);
+			return;
+		}
+	}
+}
+
+/*
+ * sleep condition for the transmit kproc.  returns -1 when
+ * tx.cnt differs from tx.i and the card's txcnt differs from
+ * tx.npkt; 0 otherwise.
+ */
+static int
+txcansleep(void *v)
+{
+	Ctlr *c;
+
+	c = v;
+	if(c->tx.cnt == c->tx.i)
+		return 0;
+	if(c->tx.npkt == gbit32(c->stats->txcnt))
+		return 0;
+	return -1;
+}
+
+/*
+ * transmit kproc: sleep on txrendez with txcansleep as the
+ * condition, then run txcleanup against the card's txcnt.
+ * never returns.
+ */
+static void
+txproc(void *v)
+{
+	Ether *e;
+	Ctlr *c;
+
+	e = v;
+	c = e->ctlr;
+	for(;;){
+		sleep(&c->txrendez, txcansleep, c);
+		txcleanup(&c->tx, gbit32(c->stats->txcnt));
+	}
+}
+
+/*
+ * copy n descriptors, starting at ring position tx->i, from the
+ * host ring to the card's ring, last descriptor first, then
+ * advance tx->i by n.
+ */
+static void
+submittx(Tx *tx, int n)
+{
+	Send *lanai, *host;
+	int base, mask, k, slot;
+
+	mask = tx->m;
+	base = tx->i & mask;
+	lanai = tx->lanai;
+	host = tx->host;
+	k = n;
+	while(k-- > 0){
+		slot = (base + k) & mask;
+		memmove(lanai+slot, host+slot, sizeof *host);
+	}
+	tx->i += n;
+//	coherence();
+}
+
+/*
+ * number of pieces b's data falls into when split at every
+ * multiple of segsz of its bus address (the mask arithmetic
+ * relies on segsz being a power of two).
+ */
+static int
+nsegments(Block *b, int segsz)
+{
+	uintptr addr, boundary, chunk, left;
+	int nseg;
+
+	addr = PCIWADDR(b->rp);
+	nseg = 0;
+	left = BLEN(b);
+	while(left != 0){
+		/* next segsz boundary strictly above addr */
+		boundary = (addr + segsz) & ~(segsz-1);
+		chunk = boundary - addr;
+		if(chunk > left)
+			chunk = left;
+		addr += chunk;
+		left -= chunk;
+		nseg++;
+	}
+	return nseg;
+}
+
+/*
+ * transmit entry point: under the tx qlock, take Blocks off
+ * e->oq, build one Send descriptor per segsz-aligned piece of
+ * each Block in the host ring, remember the Block against its
+ * last descriptor for txcleanup, and copy the descriptors to
+ * the card with submittx.
+ */
+static void
+m10gtransmit(Ether *e)
+{
+	u16int slen;
+	u32int i, cnt, rdma, nseg, count, end, bus, len, segsz;
+	uchar flags;
+	Block *b;
+	Ctlr *c;
+	Send *s, *s0, *s0m8;
+	Tx *tx;
+
+	c = e->ctlr;
+	tx = &c->tx;
+	segsz = tx->segsz;
+
+	qlock(tx);
+	count = 0;
+	s = tx->host + (tx->i & tx->m);
+	cnt = tx->cnt;
+	s0 =   tx->host + (cnt & tx->m);
+	s0m8 = tx->host + ((cnt - 8) & tx->m);
+	i = tx->i;
+	/*
+	 * s0 is the slot at tx->cnt and s0m8 the one eight before
+	 * it; the test stops queueing once s is in [s0m8, s0)
+	 * (presumably to keep the producer from running into
+	 * descriptors not yet cleaned up -- confirm).
+	 */
+	for(; s >= s0 || s < s0m8; i += nseg){
+		if((b = qget(e->oq)) == nil)
+			break;
+		flags = SFfirst|SFnotso;
+		if((len = BLEN(b)) < 1520)
+			flags |= SFsmall;
+		/* the first descriptor carries the segment count; later ones carry 1 */
+		rdma = nseg = nsegments(b, segsz);
+		bus = PCIWADDR(b->rp);
+		for(; len; len -= slen){
+			/* same splitting rule as nsegments */
+			end = bus + segsz & ~(segsz-1);
+			slen = end - bus;
+			if(slen > len)
+				slen = len;
+			s->low = pbit32(bus);
+			s->len = pbit16(slen);
+			s->nrdma = rdma;
+			s->flags = flags;
+
+			bus += slen;
+			if(++s ==  tx->host + tx->n)
+				s = tx->host;	/* wrap to the start of the ring */
+			count++;
+			flags &= ~SFfirst;
+			rdma = 1;
+		}
+		/* record b against its last descriptor; txcleanup frees it from there */
+		tx->bring[i + nseg - 1 & tx->m] = b;
+		if(1 || count > 0){
+			submittx(tx, count);
+			count = 0;
+			cnt = tx->cnt;
+			s0 =   tx->host + (cnt & tx->m);
+			s0m8 = tx->host + ((cnt - 8) & tx->m);
+		}
+	}
+	qunlock(tx);
+}
+
+/*
+ * fold the card's statistics block into driver state: record a
+ * change of link status in e->link and c->linkstat, and a change
+ * of the rdma counter in c->nrdma.  does nothing unless
+ * s->updated is set.
+ */
+static void
+checkstats(Ether *e, Ctlr *c, Stats *s)
+{
+	u32int link, nrdma;
+
+	if(s->updated == 0)
+		return;
+
+	link = gbit32(s->linkstat);
+	if(link != c->linkstat){
+		e->link = link;
+		c->linkstat = link;
+		if(c->linkstat)
+			dprint("m10g: link up\n");
+		else
+			dprint("m10g: link down\n");
+	}
+	nrdma = gbit32(s->nrdma);
+	if(nrdma != c->nrdma){
+		dprint("m10g: rdma timeout %d\n", nrdma);
+		c->nrdma = nrdma;
+	}
+}
+
+/* spin, up to 1024*1024 times, until c->stats->valid reads as zero */
+static void
+waitintx(Ctlr *c)
+{
+	int spins;
+
+	spins = 0;
+	while(spins < 1024*1024 && c->stats->valid != 0){
+		coherence();
+		spins++;
+	}
+}
+
+/*
+ * interrupt handler.  ignored unless the controller is in state
+ * Runed and the statistics block is marked valid.  wakes the
+ * receive kproc if bit 0 of valid is set and the transmit kproc
+ * if the card's txcnt has moved past tx.npkt, then acknowledges
+ * the interrupt and folds in the statistics.
+ */
+static void
+m10ginterrupt(Ureg *, void *v)
+{
+	Ether *e;
+	Ctlr *c;
+
+	e = v;
+	c = e->ctlr;
+
+	if(c->state != Runed || c->stats->valid == 0)	/* not ready for us? */
+		return;
+
+	if(c->stats->valid & 1)
+		wakeup(&c->rxrendez);
+	if(gbit32(c->stats->txcnt) != c->tx.npkt)
+		wakeup(&c->txrendez);
+	/* without msi, write the deassert register; with msi just clear valid ourselves */
+	if(c->msi == 0)
+		*c->irqdeass = 0;
+	else
+		c->stats->valid = 0;
+	waitintx(c);		/* bounded wait for valid to read as zero */
+	checkstats(e, c, c->stats);
+	c->irqack[1] = pbit32(3);
+}
+
+/*
+ * attach: under the controller's qlock, and only from state
+ * Detached, reset the card, set up the rings (open0) and start
+ * the receive and transmit kprocs the first time through.
+ * the state goes Detached -> Attached -> Runed; an error from
+ * reset or open0 puts it back to Detached, drops the lock and
+ * is passed on.
+ */
+static void
+m10gattach(Ether *e)
+{
+	Ctlr *c;
+	char name[12];
+
+	dprint("m10gattach\n");
+
+	qlock(e->ctlr);
+	c = e->ctlr;
+	if(c->state != Detached){
+		qunlock(c);
+		return;
+	}
+	if(waserror()){
+		c->state = Detached;
+		qunlock(c);
+		nexterror();
+	}
+	reset(e, c);
+	c->state = Attached;
+	open0(e, c);
+	/* kprocs is a once-only latch: the procs outlive a failed attach */
+	if(c->kprocs == 0){
+		c->kprocs++;
+		snprint(name, sizeof name, "#l%drxproc", e->ctlrno);
+		kproc(name, m10rx, e);
+		snprint(name, sizeof name, "#l%dtxproc", e->ctlrno);
+		kproc(name, txproc, e);
+	}
+	c->state = Runed;	/* m10ginterrupt ignores interrupts until this is set */
+	qunlock(c);
+	poperror();
+}
+
+/*
+ * unmap the card's memory and free the controller.
+ * always returns -1.
+ */
+static int
+m10gdetach(Ctlr *c)
+{
+	Pcidev *p;
+
+	dprint("m10gdetach\n");
+	p = c->pcidev;
+	vunmap(c->ram, p->mem[0].size);
+	ctlrfree(c);
+	return -1;
+}
+
+/* length of the list of Blocks linked through next, starting at b */
+static int
+lstcount(Block *b)
+{
+	int n;
+
+	n = 0;
+	while(b != nil){
+		n++;
+		b = b->next;
+	}
+	return n;
+}
+
+/*
+ * ifstat read: format a snapshot of the card's statistics block
+ * and the driver's ring counters and return the part of it
+ * selected by off and n through readstr.
+ * raises Enomem if the scratch buffer cannot be allocated.
+ */
+static long
+m10gifstat(Ether *e, void *v, long n, ulong off)
+{
+	int l, lim;
+	char *p;
+	Ctlr *c;
+	Stats s;
+
+	c = e->ctlr;
+	lim = 2*READSTR-1;
+	p = malloc(lim+1);
+	if(p == nil)		/* was used unchecked */
+		error(Enomem);
+	l = 0;
+	/* no point in locking this because this is done via dma. */
+	memmove(&s, c->stats, sizeof s);
+
+	// l +=
+	snprint(p+l, lim,
+		"txcnt = %ud\n"	  "linkstat = %ud\n" 	"dlink = %ud\n"
+		"derror = %ud\n"  "drunt = %ud\n" 	"doverrun = %ud\n"
+		"dnosm = %ud\n"	  "dnobg = %ud\n"	"nrdma = %ud\n"
+		"txstopped = %ud\n" "down = %ud\n" 	"updated = %ud\n"
+		"valid = %ud\n\n"
+		"tx pkt = %uld\n" "tx bytes = %lld\n"
+		"tx cnt = %ud\n"  "tx n = %ud\n"	"tx i = %ud\n"
+		"sm cnt = %ud\n"  "sm i = %ud\n"	"sm n = %ud\n"
+		"sm lst = %ud\n"
+		"bg cnt = %ud\n"  "bg i = %ud\n"	"bg n = %ud\n"
+		"bg lst = %ud\n"
+		"segsz = %ud\n"   "coal = %d\n",
+		gbit32(s.txcnt),  gbit32(s.linkstat),	gbit32(s.dlink),
+		gbit32(s.derror), gbit32(s.drunt),	gbit32(s.doverrun),
+		gbit32(s.dnosm),  gbit32(s.dnobg),	gbit32(s.nrdma),
+		s.txstopped,  s.down, s.updated, s.valid,
+		c->tx.npkt, c->tx.nbytes,
+		c->tx.cnt, c->tx.n, c->tx.i,
+		c->sm.cnt, c->sm.i, c->sm.pool->n, lstcount(c->sm.pool->head),
+		c->bg.cnt, c->bg.i, c->bg.pool->n, lstcount(c->bg.pool->head),
+		c->tx.segsz, gbit32((uchar*)c->coal));
+
+	n = readstr(off, v, n, p);
+	free(p);
+	return n;
+}
+
+//static void
+//summary(Ether *e)
+//{
+//	char *buf;
+//	int n, i, j;
+//
+//	if(e == 0)
+//		return;
+//	buf = malloc(n=250);
+//	if(buf == 0)
+//		return;
+//
+//	snprint(buf, n, "oq\n");
+//	qsummary(e->oq, buf+3, n-3-1);
+//	iprint("%s", buf);
+//
+//	if(e->f) for(i = 0; e->f[i]; i++){
+//		j = snprint(buf, n, "f%d %d\n", i, e->f[i]->type);
+//		qsummary(e->f[i]->in, buf+j, n-j-1);
+//		print("%s", buf);
+//	}
+//
+//	free(buf);
+//}
+
+/* debugging aid: print every done-ring entry whose length field is non-zero */
+static void
+rxring(Ctlr *c)
+{
+	Done *d;
+	Slotparts *sp;
+	int idx;
+
+	d = &c->done;
+	for(idx = 0; idx < d->n; idx++){
+		sp = (Slotparts *)(d->entry + idx);
+		if(sp->len == 0)
+			continue;
+		iprint("s[%d] = %d\n", idx, sp->len);
+	}
+}
+
+enum {
+	CMdebug,	/* "debug on" sets the debug flag; anything else clears it */
+	CMcoal,		/* "coal n", 0 <= n <= 1000, is written to the coalescing register */
+	CMwakeup,	/* wake the receive kproc */
+	CMtxwakeup,	/* wake the transmit kproc */
+	CMqsummary,	/* unused: its table entry and case are commented out */
+	CMrxring,	/* print the non-empty done-ring entries */
+};
+
+static Cmdtab ctab[] = {	/* looked up by m10gctl; last column is presumably the word count, command included -- confirm against Cmdtab */
+	CMdebug,	"debug",	2,
+	CMcoal,		"coal",		2,
+	CMwakeup,	"wakeup",	1,
+	CMtxwakeup,	"txwakeup",	1,
+//	CMqsummary,	"q",		1,
+	CMrxring,	"rxring",	1,
+};
+
+/*
+ * ctl write: parse the n bytes at v as one of the commands in
+ * ctab and carry it out.  returns n.  raises Enonexist if there
+ * is no controller and Ebadarg for a coal value outside 0..1000;
+ * the parsed command buffer is freed on every path.
+ */
+static long
+m10gctl(Ether *e, void *v, long n)
+{
+	int val;
+	Cmdbuf *cb;
+	Cmdtab *ct;
+	Ctlr *ctlr;
+
+	dprint("m10gctl\n");
+	if(e->ctlr == nil)
+		error(Enonexist);
+	ctlr = e->ctlr;
+
+	cb = parsecmd(v, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+	ct = lookupcmd(cb, ctab, nelem(ctab));
+	switch(ct->index){
+	case CMdebug:
+		debug = (strcmp(cb->f[1], "on") == 0);
+		break;
+	case CMcoal:
+		val = atoi(cb->f[1]);
+		if(val < 0 || val > 1000)
+			error(Ebadarg);
+		*ctlr->coal = pbit32(val);
+		break;
+	case CMwakeup:
+		wakeup(&ctlr->rxrendez); /* you're kidding, right? */
+		break;
+	case CMtxwakeup:
+		wakeup(&ctlr->txrendez); /* you're kidding, right? */
+		break;
+	case CMrxring:
+		rxring(ctlr);
+		break;
+	default:
+		error(Ebadarg);
+	}
+	free(cb);
+	poperror();
+	return n;
+}
+
+/*
+ * shutdown hook (m10gpnp also installs it as the detach hook):
+ * hands the controller to m10gdetach, which unmaps and frees it.
+ */
+static void
+m10gshutdown(Ether *e)
+{
+	dprint("m10gshutdown\n");
+	m10gdetach(e->ctlr);
+}
+
+/* send Cpromisc if on is non-zero, Cnopromisc otherwise; v is the Ether */
+static void
+m10gpromiscuous(void *v, int on)
+{
+	Ether *e;
+
+	dprint("m10gpromiscuous\n");
+	e = v;
+	cmd(e->ctlr, on? Cpromisc: Cnopromisc, 0);
+}
+
+/* indexed by 0 (leave) or 1 (join) */
+static int	mcctab[]  = { CSleavemc, CSjoinmc };
+static char	*mcntab[] = { "leave", "join" };
+
+/*
+ * join (on non-zero) or leave (on zero) the multicast group
+ * with ethernet address ea; v is the Ether.  a non-zero reply
+ * from the card is reported but otherwise ignored.
+ */
+static void
+m10gmulticast(void *v, uchar *ea, int on)
+{
+	Ether *e;
+	int i;
+
+	dprint("m10gmulticast\n");
+	e = v;
+	/* the tables have two entries; don't index them with an arbitrary truth value */
+	on = on != 0;
+	if((i = maccmd(e->ctlr, mcctab[on], ea)) != 0)
+		print("m10g: can't %s %E: %d\n", mcntab[on], ea, i);
+}
+
+/*
+ * find every pci device with vendor 0x14c1, device 0x0008,
+ * set each one up with setmem and append the ones that succeed
+ * to the ctlrs list.
+ */
+static void
+m10gpci(void)
+{
+	Pcidev *p;
+	Ctlr *t, *c;
+
+	t = 0;
+	for(p = 0; p = pcimatch(p, 0x14c1, 0x0008); ){
+		c = malloc(sizeof *c);
+		if(c == nil)
+			continue;
+		memset(c, 0, sizeof *c);
+		c->pcidev = p;
+		c->id = p->did<<16 | p->vid;
+		c->boot = pcicap(p, PciCapVND);
+//		kickthebaby(p, c);
+		pcisetbme(p);
+		/*
+		 * any non-zero return is a failure: setmem can return 1
+		 * (from setpcie(p) || parseeprom(c)), which the old
+		 * test against -1 let through.
+		 */
+		if(setmem(p, c) != 0){
+			print("m10g failed\n");
+			free(c);
+			/* cleanup */
+			continue;
+		}
+		if(t)
+			t->next = c;
+		else
+			ctlrs = c;
+		t = c;
+	}
+}
+
+/*
+ * probe routine registered with addethercard.  scans for cards
+ * on first use, claims the first inactive controller that
+ * matches e->port (0 matches any) and fills in e's parameters
+ * and method pointers.  returns 0 on success, -1 if no
+ * controller is available.
+ */
+static int
+m10gpnp(Ether *e)
+{
+	Ctlr *c;
+
+	if(ctlrs == nil)
+		m10gpci();
+
+	for(c = ctlrs; c != nil; c = c->next){
+		if(c->active)
+			continue;
+		if(e->port == 0 || e->port == c->port)
+			break;
+	}
+	if(c == nil)
+		return -1;
+	c->active = 1;
+
+	/* identity and resources */
+	e->ctlr = c;
+	e->port = c->port;
+	e->irq = c->pcidev->intl;
+	e->tbdf = c->pcidev->tbdf;
+	e->mbps = 10000;
+	memmove(e->ea, c->ra, Eaddrlen);
+
+	/* methods */
+	e->attach = m10gattach;
+	e->detach = m10gshutdown;
+	e->transmit = m10gtransmit;
+	e->interrupt = m10ginterrupt;
+	e->ifstat = m10gifstat;
+	e->ctl = m10gctl;
+//	e->power = m10gpower;
+	e->shutdown = m10gshutdown;
+
+	e->arg = e;
+	e->promiscuous = m10gpromiscuous;
+	e->multicast = m10gmulticast;
+
+	return 0;
+}
+
+/* register the driver: ether type "m10g" is probed with m10gpnp */
+void
+etherm10glink(void)
+{
+	addethercard("m10g", m10gpnp);
+}

+ 648 - 0
sys/src/9/386/kbd.c

@@ -0,0 +1,648 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"io.h"
+
+enum {
+	Data=		0x60,		/* data port */
+
+	Status=		0x64,		/* status port */
+	 Inready=	0x01,		/*  input character ready */
+	 Outbusy=	0x02,		/*  output busy */
+	 Sysflag=	0x04,		/*  system flag */
+	 Cmddata=	0x08,		/*  cmd==0, data==1 */
+	 Inhibit=	0x10,		/*  keyboard/mouse inhibited */
+	 Minready=	0x20,		/*  mouse character ready */
+	 Rtimeout=	0x40,		/*  general timeout */
+	 Parity=	0x80,
+
+	Cmd=		0x64,		/* command port (write only) */
+
+	Spec=		0xF800,		/* Unicode private space */
+	PF=		Spec|0x20,	/* num pad function key */
+	View=		Spec|0x00,	/* view (shift window up) */
+	KF=		0xF000,		/* function key (begin Unicode private space) */
+	Shift=		Spec|0x60,
+	Break=		Spec|0x61,
+	Ctrl=		Spec|0x62,
+	Latin=		Spec|0x63,
+	Caps=		Spec|0x64,
+	Num=		Spec|0x65,
+	Middle=		Spec|0x66,
+	Altgr=		Spec|0x67,
+	Kmouse=		Spec|0x100,
+	No=		0x00,		/* peter */
+
+	Home=		KF|13,
+	Up=		KF|14,
+	Pgup=		KF|15,
+	Print=		KF|16,
+	Left=		KF|17,
+	Right=		KF|18,
+	End=		KF|24,
+	Down=		View,
+	Pgdown=		KF|19,
+	Ins=		KF|20,
+	Del=		0x7F,
+	Scroll=		KF|21,
+
+	Nscan=	128,
+};
+
+/*
+ * The codes at 0x79 and 0x81 are produced by the PFU Happy Hacking keyboard.
+ * A 'standard' keyboard doesn't produce anything above 0x58.
+ */
+Rune kbtab[Nscan] =
+{
+[0x00]	No,	0x1b,	'1',	'2',	'3',	'4',	'5',	'6',
+[0x08]	'7',	'8',	'9',	'0',	'-',	'=',	'\b',	'\t',
+[0x10]	'q',	'w',	'e',	'r',	't',	'y',	'u',	'i',
+[0x18]	'o',	'p',	'[',	']',	'\n',	Ctrl,	'a',	's',
+[0x20]	'd',	'f',	'g',	'h',	'j',	'k',	'l',	';',
+[0x28]	'\'',	'`',	Shift,	'\\',	'z',	'x',	'c',	'v',
+[0x30]	'b',	'n',	'm',	',',	'.',	'/',	Shift,	'*',
+[0x38]	Latin,	' ',	Ctrl,	KF|1,	KF|2,	KF|3,	KF|4,	KF|5,
+[0x40]	KF|6,	KF|7,	KF|8,	KF|9,	KF|10,	Num,	Scroll,	'7',
+[0x48]	'8',	'9',	'-',	'4',	'5',	'6',	'+',	'1',
+[0x50]	'2',	'3',	'0',	'.',	No,	No,	No,	KF|11,
+[0x58]	KF|12,	No,	No,	No,	No,	No,	No,	No,
+[0x60]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x68]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x70]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x78]	No,	View,	No,	Up,	No,	No,	No,	No,
+};
+
+Rune kbtabshift[Nscan] =
+{
+[0x00]	No,	0x1b,	'!',	'@',	'#',	'$',	'%',	'^',
+[0x08]	'&',	'*',	'(',	')',	'_',	'+',	'\b',	'\t',
+[0x10]	'Q',	'W',	'E',	'R',	'T',	'Y',	'U',	'I',
+[0x18]	'O',	'P',	'{',	'}',	'\n',	Ctrl,	'A',	'S',
+[0x20]	'D',	'F',	'G',	'H',	'J',	'K',	'L',	':',
+[0x28]	'"',	'~',	Shift,	'|',	'Z',	'X',	'C',	'V',
+[0x30]	'B',	'N',	'M',	'<',	'>',	'?',	Shift,	'*',
+[0x38]	Latin,	' ',	Ctrl,	KF|1,	KF|2,	KF|3,	KF|4,	KF|5,
+[0x40]	KF|6,	KF|7,	KF|8,	KF|9,	KF|10,	Num,	Scroll,	'7',
+[0x48]	'8',	'9',	'-',	'4',	'5',	'6',	'+',	'1',
+[0x50]	'2',	'3',	'0',	'.',	No,	No,	No,	KF|11,
+[0x58]	KF|12,	No,	No,	No,	No,	No,	No,	No,
+[0x60]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x68]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x70]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x78]	No,	Up,	No,	Up,	No,	No,	No,	No,
+};
+
+Rune kbtabesc1[Nscan] =
+{
+[0x00]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x08]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x10]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x18]	No,	No,	No,	No,	'\n',	Ctrl,	No,	No,
+[0x20]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x28]	No,	No,	Shift,	No,	No,	No,	No,	No,
+[0x30]	No,	No,	No,	No,	No,	'/',	No,	Print,
+[0x38]	Altgr,	No,	No,	No,	No,	No,	No,	No,
+[0x40]	No,	No,	No,	No,	No,	No,	Break,	Home,
+[0x48]	Up,	Pgup,	No,	Left,	No,	Right,	No,	End,
+[0x50]	Down,	Pgdown,	Ins,	Del,	No,	No,	No,	No,
+[0x58]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x60]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x68]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x70]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x78]	No,	Up,	No,	No,	No,	No,	No,	No,
+};
+
+Rune kbtabaltgr[Nscan] =
+{
+[0x00]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x08]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x10]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x18]	No,	No,	No,	No,	'\n',	Ctrl,	No,	No,
+[0x20]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x28]	No,	No,	Shift,	No,	No,	No,	No,	No,
+[0x30]	No,	No,	No,	No,	No,	'/',	No,	Print,
+[0x38]	Altgr,	No,	No,	No,	No,	No,	No,	No,
+[0x40]	No,	No,	No,	No,	No,	No,	Break,	Home,
+[0x48]	Up,	Pgup,	No,	Left,	No,	Right,	No,	End,
+[0x50]	Down,	Pgdown,	Ins,	Del,	No,	No,	No,	No,
+[0x58]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x60]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x68]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x70]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x78]	No,	Up,	No,	No,	No,	No,	No,	No,
+};
+
+Rune kbtabctrl[] =
+{
+[0x00]	No,	'', 	'', 	'', 	'', 	'', 	'', 	'', 
+[0x08]	'', 	'', 	'', 	'', 	'
', 	'', 	'\b',	'\t',
+[0x10]	'', 	'', 	'', 	'', 	'', 	'', 	'', 	'\t',
+[0x18]	'', 	'', 	'', 	'', 	'\n',	Ctrl,	'', 	'', 
+[0x20]	'', 	'', 	'', 	'\b',	'\n',	'', 	'', 	'', 
+[0x28]	'', 	No, 	Shift,	'', 	'', 	'', 	'', 	'', 
+[0x30]	'', 	'', 	'
', 	'', 	'', 	'', 	Shift,	'\n',
+[0x38]	Latin,	No, 	Ctrl,	'', 	'', 	'', 	'', 	'', 
+[0x40]	'', 	'', 	'', 	'
', 	'', 	'', 	'', 	'', 
+[0x48]	'', 	'', 	'
', 	'', 	'', 	'', 	'', 	'', 
+[0x50]	'', 	'', 	'', 	'', 	No,	No,	No,	'', 
+[0x58]	'', 	No,	No,	No,	No,	No,	No,	No,
+[0x60]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x68]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x70]	No,	No,	No,	No,	No,	No,	No,	No,
+[0x78]	No,	'', 	No,	'\b',	No,	No,	No,	No,
+};
+
+enum
+{
+	/* controller command byte */
+	Cscs1=		(1<<6),		/* scan code set 1 */
+	Cauxdis=	(1<<5),		/* mouse disable */
+	Ckbddis=	(1<<4),		/* kbd disable */
+	Csf=		(1<<2),		/* system flag */
+	Cauxint=	(1<<1),		/* mouse interrupt enable */
+	Ckbdint=	(1<<0),		/* kbd interrupt enable */
+};
+
+static Queue *kbdq;
+
+int mouseshifted;
+void (*kbdmouse)(int);
+static int nokbd = 1;
+
+static Lock i8042lock;
+static uchar ccc;
+static void (*auxputc)(int, int);
+
+/*
+ *  wait until the controller's Outbusy status bit clears,
+ *  polling every 2ms.  returns 0 once it is clear, -1 after
+ *  more than 500 polls.
+ */
+static int
+outready(void)
+{
+	int tries;
+
+	tries = 0;
+	while(inb(Status) & Outbusy){
+		if(tries > 500)
+			return -1;
+		delay(2);
+		tries++;
+	}
+	return 0;
+}
+
+/*
+ *  wait until the controller's Inready status bit is set,
+ *  polling every 2ms.  returns 0 once it is set, -1 after
+ *  more than 500 polls.
+ */
+static int
+inready(void)
+{
+	int tries;
+
+	tries = 0;
+	while((inb(Status) & Inready) == 0){
+		if(tries > 500)
+			return -1;
+		delay(2);
+		tries++;
+	}
+	return 0;
+}
+
+/*
+ *  ask 8042 to reset the machine.
+ *  does nothing if no keyboard controller was found (nokbd).
+ *  first sets the warm-boot flag, then tries the 0xFE command,
+ *  then falls back to toggling the low bit of the value written
+ *  after command 0xD1, five times with 100ms between tries.
+ */
+void
+i8042reset(void)
+{
+	ushort *s = KADDR(0x472);
+	int i, x;
+
+	if(nokbd)
+		return;
+
+	*s = 0x1234;		/* BIOS warm-boot flag */
+
+	/*
+	 *  newer reset the machine command
+	 */
+	outready();
+	outb(Cmd, 0xFE);
+	outready();
+
+	/*
+	 *  Pulse it by hand (old somewhat reliable)
+	 */
+	x = 0xDF;
+	for(i = 0; i < 5; i++){
+		x ^= 1;		/* alternates 0xDE, 0xDF, ... */
+		outready();
+		outb(Cmd, 0xD1);
+		outready();
+		outb(Data, x);	/* toggle reset */
+		delay(100);
+	}
+}
+
+/*
+ *  send one command byte to the aux device (prefix 0xD4) and
+ *  read its reply, under i8042lock.  the exchange is repeated
+ *  while the reply is 0xFE or 0, at most three times, and is
+ *  abandoned if the controller stops responding.
+ *  returns 0 if the final reply is 0xFA; otherwise prints a
+ *  message and returns -1.
+ */
+int
+i8042auxcmd(int cmd)
+{
+	unsigned int reply;
+	int attempt;
+
+	reply = 0;
+	attempt = 0;
+
+	ilock(&i8042lock);
+	for(;;){
+		if(attempt++ > 2)
+			break;
+		if(outready() < 0)
+			break;
+		outb(Cmd, 0xD4);
+		if(outready() < 0)
+			break;
+		outb(Data, cmd);
+		if(outready() < 0)
+			break;
+		if(inready() < 0)
+			break;
+		reply = inb(Data);
+		if(reply != 0xFE && reply != 0)
+			break;
+	}
+	iunlock(&i8042lock);
+
+	if(reply == 0xFA)
+		return 0;
+	print("i8042: %2.2ux returned to the %2.2ux command\n", reply, cmd);
+	return -1;
+}
+
+/*
+ *  send ncmd command bytes to the aux device, each prefixed
+ *  with 0xD4, under i8042lock and without reading replies.
+ *  returns the number of bytes sent before the controller
+ *  stopped accepting output (ncmd if all went out).
+ */
+int
+i8042auxcmds(uchar *cmd, int ncmd)
+{
+	int sent;
+
+	sent = 0;
+	ilock(&i8042lock);
+	while(sent < ncmd){
+		if(outready() < 0)
+			break;
+		outb(Cmd, 0xD4);
+		if(outready() < 0)
+			break;
+		outb(Data, cmd[sent]);
+		sent++;
+	}
+	iunlock(&i8042lock);
+	return sent;
+}
+
+struct {
+	int esc1;	/* set when 0xe0 arrives; the next code is looked up in kbtabesc1 */
+	int esc2;	/* set to 2 when 0xe1 arrives; counts down the codes to discard */
+	int alt;	/* Latin key is down */
+	int altgr;	/* Altgr key is down; selects kbtabaltgr */
+	int caps;	/* toggled by the Caps key; upper-cases 'a'..'z' */
+	int ctl;	/* Ctrl key is down; selects kbtabctrl */
+	int num;	/* toggled by the Num key */
+	int shift;	/* Shift key is down; selects kbtabshift */
+	int collecting;	/* while set, characters accumulate in kc for latin1() */
+	int nk;		/* number of characters held in kc */
+	Rune kc[5];
+	int buttons;	/* Kmouse button bits, passed to kbdmouse */
+} kbscan;
+
+/*
+ *  keyboard interrupt.
+ *
+ *  Reads one byte from the controller.  A byte from the aux port is
+ *  handed to auxputc (if set).  A keyboard scancode is translated
+ *  through the kbtab* tables according to the current prefix and
+ *  modifier state in kbscan, then either updates that state
+ *  (modifier keys, mouse-button keys) or is queued on kbdq, possibly
+ *  after compose-sequence processing by latin1.
+ */
+static void
+i8042intr(Ureg*, void*)
+{
+	int s, c, i;
+	int keyup;
+
+	/*
+	 *  get status
+	 */
+	ilock(&i8042lock);
+	s = inb(Status);
+	if(!(s&Inready)){
+		iunlock(&i8042lock);
+		return;
+	}
+
+	/*
+	 *  get the character
+	 */
+	c = inb(Data);
+	iunlock(&i8042lock);
+
+	/*
+	 *  if it's the aux port...
+	 */
+	if(s & Minready){
+		if(auxputc != nil)
+			auxputc(c, kbscan.shift);
+		return;
+	}
+
+	/*
+	 *  e0's is the first of a 2 character sequence, e1 the first
+	 *  of a 3 character sequence (on the safari)
+	 */
+	if(c == 0xe0){
+		kbscan.esc1 = 1;
+		return;
+	} else if(c == 0xe1){
+		kbscan.esc2 = 2;
+		return;
+	}
+
+	keyup = c&0x80;
+	c &= 0x7f;
+	/*
+	 *  the translation tables are indexed by scancode and hold
+	 *  Nscan entries (see kbdputmap), so the limit is the element
+	 *  count, not the byte size of the table.
+	 */
+	if(c >= Nscan){
+		c |= keyup;
+		if(c != 0xFF)	/* these come fairly often: CAPSLOCK U Y */
+			print("unknown key %ux\n", c);
+		return;
+	}
+
+	if(kbscan.esc1){
+		c = kbtabesc1[c];
+		kbscan.esc1 = 0;
+	} else if(kbscan.esc2){
+		/* swallow the codes that follow an e1 prefix */
+		kbscan.esc2--;
+		return;
+	} else if(kbscan.shift)
+		c = kbtabshift[c];
+	else if(kbscan.altgr)
+		c = kbtabaltgr[c];
+	else if(kbscan.ctl)
+		c = kbtabctrl[c];
+	else
+		c = kbtab[c];
+
+	if(kbscan.caps && c<='z' && c>='a')
+		c += 'A' - 'a';
+
+	/*
+	 *  keyup only important for shifts
+	 */
+	if(keyup){
+		switch(c){
+		case Latin:
+			kbscan.alt = 0;
+			break;
+		case Shift:
+			kbscan.shift = 0;
+			mouseshifted = 0;
+			break;
+		case Ctrl:
+			kbscan.ctl = 0;
+			break;
+		case Altgr:
+			kbscan.altgr = 0;
+			break;
+		case Kmouse|1:
+		case Kmouse|2:
+		case Kmouse|3:
+		case Kmouse|4:
+		case Kmouse|5:
+			kbscan.buttons &= ~(1<<(c-Kmouse-1));
+			if(kbdmouse)
+				kbdmouse(kbscan.buttons);
+			break;
+		}
+		return;
+	}
+
+	/*
+	 *  normal character
+	 */
+	if(!(c & (Spec|KF))){
+		if(kbscan.ctl)
+			if(kbscan.alt && c == Del)
+				exit(0);
+		if(!kbscan.collecting){
+			kbdputc(kbdq, c);
+			return;
+		}
+		kbscan.kc[kbscan.nk++] = c;
+		c = latin1(kbscan.kc, kbscan.nk);
+		if(c < -1)	/* need more keystrokes */
+			return;
+		if(c != -1)	/* valid sequence */
+			kbdputc(kbdq, c);
+		else	/* dump characters */
+			for(i=0; i<kbscan.nk; i++)
+				kbdputc(kbdq, kbscan.kc[i]);
+		kbscan.nk = 0;
+		kbscan.collecting = 0;
+		return;
+	} else {
+		switch(c){
+		case Caps:
+			kbscan.caps ^= 1;
+			return;
+		case Num:
+			kbscan.num ^= 1;
+			return;
+		case Shift:
+			kbscan.shift = 1;
+			mouseshifted = 1;
+			return;
+		case Latin:
+			kbscan.alt = 1;
+			/*
+			 * VMware and Qemu use Ctl-Alt as the key combination
+			 * to make the VM give up keyboard and mouse focus.
+			 * This has the unfortunate side effect that when you
+			 * come back into focus, Plan 9 thinks you want to type
+			 * a compose sequence (you just typed alt).
+			 *
+			 * As a clumsy hack around this, we look for ctl-alt
+			 * and don't treat it as the start of a compose sequence.
+			 */
+			if(!kbscan.ctl){
+				kbscan.collecting = 1;
+				kbscan.nk = 0;
+			}
+			return;
+		case Ctrl:
+			kbscan.ctl = 1;
+			return;
+		case Altgr:
+			kbscan.altgr = 1;
+			return;
+		case Kmouse|1:
+		case Kmouse|2:
+		case Kmouse|3:
+		case Kmouse|4:
+		case Kmouse|5:
+			kbscan.buttons |= 1<<(c-Kmouse-1);
+			if(kbdmouse)
+				kbdmouse(kbscan.buttons);
+			return;
+		}
+	}
+	/* any other special key is queued unchanged */
+	kbdputc(kbdq, c);
+}
+
+/*
+ *  enable the aux port: clear Cauxdis and set Cauxint in the cached
+ *  command byte ccc, write it to the controller, send the aux-enable
+ *  command, then record putc as the aux byte handler and hook IrqAUX
+ *  to i8042intr.  A timeout on the first three waits is only reported;
+ *  a timeout after the 0xA8 command abandons the setup.
+ */
+void
+i8042auxenable(void (*putc)(int, int))
+{
+	char *err = "i8042: aux init failed\n";
+
+	/* enable kbd/aux xfers and interrupts */
+	ccc &= ~Cauxdis;
+	ccc |= Cauxint;
+
+	ilock(&i8042lock);
+	if(outready() < 0)
+		print(err);
+	outb(Cmd, 0x60);			/* write control register */
+	if(outready() < 0)
+		print(err);
+	outb(Data, ccc);
+	if(outready() < 0)
+		print(err);
+	outb(Cmd, 0xA8);			/* auxiliary device enable */
+	if(outready() < 0){
+		iunlock(&i8042lock);
+		return;
+	}
+	auxputc = putc;
+	intrenable(IrqAUX, i8042intr, 0, BUSUNKNOWN, "kbdaux");
+	iunlock(&i8042lock);
+}
+
+static char *initfailed = "i8042: kbdinit failed\n";
+
+/*
+ *  write c to port, then wait for the controller to accept it.
+ *  Returns 0 on success; prints initfailed and returns -1 on timeout.
+ */
+static int
+outbyte(int port, int c)
+{
+	outb(port, c);
+	if(outready() >= 0)
+		return 0;
+	print(initfailed);
+	return -1;
+}
+
+/*
+ *  bring the 8042 to a known state: drain pending input, read the
+ *  controller command byte into ccc, clear Ckbddis and set Csf,
+ *  Ckbdint and Cscs1 in it, and write it back.  nokbd is cleared once
+ *  the controller has shown itself ready to accept output.
+ */
+void
+kbdinit(void)
+{
+	int c, try;
+
+	/* wait for a quiescent controller */
+	try = 1000;
+	while(try-- > 0 && (c = inb(Status)) & (Outbusy | Inready)) {
+		if(c & Inready)
+			inb(Data);
+		delay(1);
+	}
+	/*
+	 * try is -1 only when every one of the 1000 polls found the
+	 * controller busy; it is 0 when the last poll succeeded, which
+	 * must not be reported as a failure.
+	 */
+	if (try < 0) {
+		print(initfailed);
+		return;
+	}
+
+	/* get current controller command byte */
+	outb(Cmd, 0x20);
+	if(inready() < 0){
+		print("i8042: kbdinit can't read ccc\n");
+		ccc = 0;
+	} else
+		ccc = inb(Data);
+
+	/* enable kbd xfers and interrupts */
+	ccc &= ~Ckbddis;
+	ccc |= Csf | Ckbdint | Cscs1;
+	if(outready() < 0) {
+		print(initfailed);
+		return;
+	}
+
+	nokbd = 0;
+
+	/* write the updated command byte back (0x60: write control register) */
+	if (outbyte(Cmd, 0x60) < 0 || outbyte(Data, ccc) < 0)
+		print("i8042: kbdinit mouse disable failed\n");
+}
+
+/*
+ *  create the keyboard input queue, register it with addkbdq,
+ *  claim the Data and Cmd I/O ports and hook IrqKBD to i8042intr.
+ *  Panics if the queue cannot be allocated.
+ */
+void
+kbdenable(void)
+{
+	kbdq = qopen(4*1024, 0, 0, 0);
+	if(kbdq == nil)
+		panic("kbdinit");
+	qnoblock(kbdq, 1);
+	addkbdq(kbdq, -1);
+
+	ioalloc(Data, 1, 0, "kbd");
+	ioalloc(Cmd, 1, 0, "kbd");
+
+	intrenable(IrqKBD, i8042intr, 0, BUSUNKNOWN, "kbd");
+}
+
+/*
+ *  set one entry of a scancode translation table.
+ *  m selects the table: 0 kbtab, 1 kbtabshift, 2 kbtabesc1,
+ *  3 kbtabaltgr, 4 kbtabctrl.  Raises Ebadarg if scanc is not
+ *  below Nscan or m names no table.
+ */
+void
+kbdputmap(ushort m, ushort scanc, Rune r)
+{
+	if(scanc >= Nscan)
+		error(Ebadarg);
+
+	if(m == 0)
+		kbtab[scanc] = r;
+	else if(m == 1)
+		kbtabshift[scanc] = r;
+	else if(m == 2)
+		kbtabesc1[scanc] = r;
+	else if(m == 3)
+		kbtabaltgr[scanc] = r;
+	else if(m == 4)
+		kbtabctrl[scanc] = r;
+	else
+		error(Ebadarg);
+}
+
+/*
+ *  look up the translation table entry numbered offset, counting
+ *  through the tables in kbdputmap order, Nscan entries per table.
+ *  Stores the table number in *t and the scancode in *sc; raises
+ *  Ebadarg if either is negative.  Returns 1 with the rune in *r,
+ *  or 0 if offset lies beyond the last table.
+ */
+int
+kbdgetmap(int offset, int *t, int *sc, Rune *r)
+{
+	int tab, code;
+
+	tab = offset/Nscan;
+	code = offset%Nscan;
+	*t = tab;
+	*sc = code;
+	if(tab < 0 || code < 0)
+		error(Ebadarg);
+
+	if(tab == 0)
+		*r = kbtab[code];
+	else if(tab == 1)
+		*r = kbtabshift[code];
+	else if(tab == 2)
+		*r = kbtabesc1[code];
+	else if(tab == 3)
+		*r = kbtabaltgr[code];
+	else if(tab == 4)
+		*r = kbtabctrl[code];
+	else
+		return 0;
+	return 1;
+}

+ 767 - 0
sys/src/9/386/pci.c

@@ -0,0 +1,767 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * PCI support code.
+ * Needs a massive rewrite.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "io.h"
+
+enum
+{
+	PciADDR		= 0xCF8,	/* CONFIG_ADDRESS */
+	PciDATA		= 0xCFC,	/* CONFIG_DATA */
+
+	/* largest function, device and bus numbers scanned */
+	Maxfn			= 7,
+	Maxdev			= 31,
+	Maxbus			= 255,
+
+	/* command register */
+	IOen		= (1<<0),
+	MEMen		= (1<<1),
+	MASen		= (1<<2),
+	MemWrInv	= (1<<4),
+	PErrEn		= (1<<6),
+	SErrEn		= (1<<8),
+
+	/*
+	 * direction argument of pcicfgrw; having no initialiser,
+	 * these continue from SErrEn (257 and 258)
+	 */
+	Write,
+	Read,
+};
+
+static Lock pcicfglock;
+static Lock pcicfginitlock;
+static int pcicfgmode = -1;
+static Pcidev* pciroot;
+static Pcidev* pcilist;
+static Pcidev* pcitail;
+
+static char* bustypes[] = {
+	"CBUSI",
+	"CBUSII",
+	"EISA",
+	"FUTURE",
+	"INTERN",
+	"ISA",
+	"MBI",
+	"MBII",
+	"MCA",
+	"MPI",
+	"MPSA",
+	"NUBUS",
+	"PCI",
+	"PCMCIA",
+	"TC",
+	"VL",
+	"VME",
+	"XPRESS",
+};
+
+static	int	pcicfgrw(int, int, int, int, int);
+
+/*
+ * Format routine installed for the 'T' verb by pcicfginit.
+ * Prints a tbdf as "type.bus.dev.fn", where type is the name from
+ * bustypes[] or, if out of range, the number.  Emits "(tbdfconv)"
+ * when the scratch buffer cannot be allocated or the verb is not 'T'.
+ */
+static int
+tbdffmt(Fmt* fmt)
+{
+	char *buf;
+	int n, ret;
+	uint tbdf, type;
+
+	buf = malloc(READSTR);
+	if(buf == nil)
+		return fmtstrcpy(fmt, "(tbdfconv)");
+
+	if(fmt->r != 'T')
+		snprint(buf, READSTR, "(tbdfconv)");
+	else{
+		tbdf = va_arg(fmt->args, uint);
+		type = BUSTYPE(tbdf);
+		if(type >= nelem(bustypes))
+			n = snprint(buf, READSTR, "%d", type);
+		else
+			n = snprint(buf, READSTR, bustypes[type]);
+		snprint(buf+n, READSTR-n, ".%d.%d.%d",
+			BUSBNO(tbdf), BUSDNO(tbdf), BUSFNO(tbdf));
+	}
+
+	ret = fmtstrcpy(fmt, buf);
+	free(buf);
+	return ret;
+}
+
+/*
+ * Size the base address register at config offset rno: save it,
+ * write 0xFFFFFFF0, read back which bits stuck, restore it, and
+ * return the negated read-back with the low four bits cleared.
+ * When bit 0 of the saved value is set the top 16 bits of the
+ * read-back are forced to ones first.
+ */
+static ulong
+pcibarsize(Pcidev *p, int rno)
+{
+	ulong saved, probe;
+
+	saved = pcicfgr32(p, rno);
+	pcicfgw32(p, rno, 0xFFFFFFF0);
+	probe = pcicfgr32(p, rno);
+	pcicfgw32(p, rno, saved);
+
+	if(saved & 1)
+		probe |= 0xFFFF0000;
+	return -(probe & ~0x0F);
+}
+
+/*
+ * Scan PCI bus bno, allocating a Pcidev for each function that
+ * answers with a vendor ID.  The devices found on this bus are
+ * chained through ->link and returned in *list; each is also
+ * appended to the global pcilist through ->list.  PCI-PCI bridges
+ * are then descended recursively, with their children stored in
+ * ->bridge.  Returns the highest subordinate bus number seen.
+ *
+ * NOTE(review): link, list and bridge are never cleared here, so
+ * this presumably relies on malloc returning zeroed memory — confirm.
+ */
+static int
+pcilscan(int bno, Pcidev** list)
+{
+	Pcidev *p, *head, *tail;
+	int dno, fno, i, hdt, l, maxfno, maxubn, sbn, tbdf, ubn;
+
+	maxubn = bno;
+	head = nil;
+	tail = nil;
+	for(dno = 0; dno <= Maxdev; dno++){
+		maxfno = 0;
+		for(fno = 0; fno <= maxfno; fno++){
+			/*
+			 * For this possible device, form the
+			 * bus+device+function triplet needed to address it
+			 * and try to read the vendor and device ID.
+			 * If successful, allocate a device struct and
+			 * start to fill it in with some useful information
+			 * from the device's configuration space.
+			 */
+			tbdf = MKBUS(BusPCI, bno, dno, fno);
+			l = pcicfgrw(tbdf, PciVID, 0, Read, 4);
+			if(l == 0xFFFFFFFF || l == 0)
+				continue;
+			p = malloc(sizeof(*p));
+			/* the scan cannot continue without a device struct */
+			if(p == nil)
+				panic("pcilscan: no memory");
+			p->tbdf = tbdf;
+			p->vid = l;
+			p->did = l>>16;
+
+			if(pcilist != nil)
+				pcitail->list = p;
+			else
+				pcilist = p;
+			pcitail = p;
+
+			p->pcr = pcicfgr16(p, PciPCR);
+			p->rid = pcicfgr8(p, PciRID);
+			p->ccrp = pcicfgr8(p, PciCCRp);
+			p->ccru = pcicfgr8(p, PciCCRu);
+			p->ccrb = pcicfgr8(p, PciCCRb);
+			p->cls = pcicfgr8(p, PciCLS);
+			p->ltr = pcicfgr8(p, PciLTR);
+
+			p->intl = pcicfgr8(p, PciINTL);
+
+			/*
+			 * If the device is a multi-function device adjust the
+			 * loop count so all possible functions are checked.
+			 */
+			hdt = pcicfgr8(p, PciHDT);
+			if(hdt & 0x80)
+				maxfno = Maxfn;
+
+			/*
+			 * If appropriate, read the base address registers
+			 * and work out the sizes.
+			 */
+			switch(p->ccrb) {
+			default:
+				if((hdt & 0x7F) != 0)
+					break;
+				for(i = 0; i < nelem(p->mem); i++) {
+					p->mem[i].bar = pcicfgr32(p, PciBAR0+4*i);
+					p->mem[i].size = pcibarsize(p, PciBAR0+4*i);
+				}
+				break;
+
+			case 0x00:
+			case 0x05:		/* memory controller */
+			case 0x06:		/* bridge device */
+				break;
+			}
+
+			if(head != nil)
+				tail->link = p;
+			else
+				head = p;
+			tail = p;
+		}
+	}
+
+	*list = head;
+	for(p = head; p != nil; p = p->link){
+		/*
+		 * Find PCI-PCI bridges and recursively descend the tree.
+		 */
+		if(p->ccrb != 0x06 || p->ccru != 0x04)
+			continue;
+
+		/*
+		 * If the secondary or subordinate bus number is not
+		 * initialised try to do what the PCI BIOS should have
+		 * done and fill in the numbers as the tree is descended.
+		 * On the way down the subordinate bus number is set to
+		 * the maximum as it's not known how many buses are behind
+		 * this one; the final value is set on the way back up.
+		 */
+		sbn = pcicfgr8(p, PciSBN);
+		ubn = pcicfgr8(p, PciUBN);
+
+		if(sbn == 0 || ubn == 0) {
+			print("%T: unconfigured bridge\n", p->tbdf);
+
+			sbn = maxubn+1;
+			/*
+			 * Make sure memory, I/O and master enables are
+			 * off, set the primary, secondary and subordinate
+			 * bus numbers and clear the secondary status before
+			 * attempting to scan the secondary bus.
+			 *
+			 * Initialisation of the bridge should be done here.
+			 */
+			pcicfgw32(p, PciPCR, 0xFFFF0000);
+			pcicfgw32(p, PciPBN, Maxbus<<16 | sbn<<8 | bno);
+			pcicfgw16(p, PciSPSR, 0xFFFF);
+			maxubn = pcilscan(sbn, &p->bridge);
+			pcicfgw32(p, PciPBN, maxubn<<16 | sbn<<8 | bno);
+		}
+		else {
+			/*
+			 * You can't go back.
+			 * This shouldn't be possible, but the
+			 * Iwill DK8-HTX seems to have subordinate
+			 * bus numbers which get smaller on the
+			 * way down. Need to look more closely at
+			 * this.
+			 */
+			if(ubn > maxubn)
+				maxubn = ubn;
+			pcilscan(sbn, &p->bridge);
+		}
+	}
+
+	return maxubn;
+}
+
+/*
+ * Read the IRQ routed to link from the router's config space.
+ * Values of 16 and above are reported as 0.
+ */
+static uchar
+pIIxget(Pcidev *router, uchar link)
+{
+	uchar pirq;
+
+	/* link should be 0x60, 0x61, 0x62, 0x63 */
+	pirq = pcicfgr8(router, link);
+	if(pirq >= 16)
+		return 0;
+	return pirq;
+}
+
+/*
+ * Route link to irq by writing irq to config register link of router.
+ */
+static void 
+pIIxset(Pcidev *router, uchar link, uchar irq)
+{
+	pcicfgw8(router, link, irq);
+}
+
+/*
+ * Read the IRQ routed to link.  Two links share each config byte
+ * starting at 0x55: odd links use the high nibble, even links the
+ * low nibble.  Links of 6 and above read as 0.
+ */
+static uchar
+viaget(Pcidev *router, uchar link)
+{
+	uchar pirq;
+
+	/* link should be 1, 2, 3, 5 */
+	pirq = 0;
+	if(link < 6)
+		pirq = pcicfgr8(router, 0x55 + (link>>1));
+
+	if(link & 1)
+		return pirq >> 4;
+	return pirq & 15;
+}
+
+/*
+ * Route link to irq: replace the nibble belonging to link (high for
+ * odd links, low for even) in config byte 0x55 + link/2 and leave
+ * the other nibble as it was.
+ */
+static void
+viaset(Pcidev *router, uchar link, uchar irq)
+{
+	int reg;
+	uchar pirq;
+
+	reg = 0x55 + (link>>1);
+	pirq = pcicfgr8(router, reg);
+	if(link & 1){
+		pirq &= 0x0f;
+		pirq |= irq << 4;
+	}else{
+		pirq &= 0xf0;
+		pirq |= irq & 15;
+	}
+	pcicfgw8(router, reg, pirq);
+}
+
+typedef struct Bridge Bridge;
+struct Bridge
+{
+	ushort	vid;
+	ushort	did;
+	uchar	(*get)(Pcidev *, uchar);
+	void	(*set)(Pcidev *, uchar, uchar);	
+};
+
+static Bridge southbridges[] = {
+	{ 0x8086, 0xffff, pIIxget, pIIxset },	// Intel *
+	{ 0x1106, 0x3227, viaget, viaset },	// Viatech VT8237
+
+	{ 0x1022, 0x746B, nil, nil },		// AMD 8111
+	{ 0x10DE, 0x00D1, nil, nil },		// NVIDIA nForce 3
+	{ 0x1166, 0x0200, nil, nil },		// ServerWorks ServerSet III LE
+	{ 0x1002, 0x4377, nil, nil },		// ATI Radeon Xpress 200M
+};
+
+/*
+ * One slot entry of the BIOS interrupt routing table, as walked by
+ * pcirouting.  maps holds four 3-byte groups, one per interrupt pin
+ * (pin-1 selects the group): a link byte followed by a 16-bit mask,
+ * low byte first.
+ */
+typedef struct Slot Slot;
+struct Slot {
+	uchar	bus;			// Pci bus number
+	uchar	dev;			// Pci device number
+	uchar	maps[12];		// Avoid structs!  Link and mask.
+	uchar	slot;			// Add-in/built-in slot
+	uchar	reserved;
+};
+
+typedef struct Router Router;
+struct Router {
+	uchar	signature[4];		// Routing table signature
+	uchar	version[2];		// Version number
+	uchar	size[2];		// Total table size
+	uchar	bus;			// Interrupt router bus number
+	uchar	devfn;			// Router's devfunc
+	uchar	pciirqs[2];		// Exclusive PCI irqs
+	uchar	compat[4];		// Compatible PCI interrupt router
+	uchar	miniport[4];		// Miniport data
+	uchar	reserved[11];
+	uchar	checksum;
+};
+
+
+/*
+ * Use the BIOS "$PIR" interrupt routing table to fill in missing
+ * interrupt lines.  The table is searched for on 16-byte boundaries
+ * between 0xf0000 and 0xfffff.  For every function of every slot
+ * listed, the IRQ the south bridge routes to the function's pin is
+ * compared with the function's interrupt line register: if the line
+ * is unset it is written from the router, otherwise the router is
+ * reprogrammed to match the line.  Does nothing if the table or a
+ * usable south bridge is not found.
+ */
+static void
+pcirouting(void)
+{
+	uchar *p, pin, irq, link, *map;
+	int size, i, fn, tbdf;
+	Bridge *southbridge;
+	Pcidev *sbpci, *pci;
+	Router *r;
+	Slot *e;
+
+	// Search for PCI interrupt routing table in BIOS
+	for(p = (uchar *)KADDR(0xf0000); p < (uchar *)KADDR(0xfffff); p += 16)
+		if(p[0] == '$' && p[1] == 'P' && p[2] == 'I' && p[3] == 'R')
+			break;
+
+	if(p >= (uchar *)KADDR(0xfffff))
+		return;
+
+	r = (Router *)p;
+
+	if(0)
+		print("PCI interrupt routing table version %d.%d at %.6llux\n",
+			r->version[0], r->version[1], (uintptr)r & 0xfffff);
+
+	tbdf = (BusPCI << 24)|(r->bus << 16)|(r->devfn << 8);
+	sbpci = pcimatchtbdf(tbdf);
+	if(sbpci == nil) {
+		print("pcirouting: Cannot find south bridge %T\n", tbdf);
+		return;
+	}
+
+	for(i = 0; i != nelem(southbridges); i++)
+		if(sbpci->vid == southbridges[i].vid
+		&& (sbpci->did == southbridges[i].did || southbridges[i].did == 0xffff))
+			break;
+
+	if(i == nelem(southbridges)) {
+		print("pcirouting: ignoring south bridge %T %.4ux/%.4ux\n", tbdf, sbpci->vid, sbpci->did);
+		return;
+	}
+	southbridge = &southbridges[i];
+	if(southbridge->get == nil || southbridge->set == nil)
+		return;
+
+	size = (r->size[1] << 8)|r->size[0];
+	for(e = (Slot *)&r[1]; (uchar *)e < p + size; e++) {
+		if(0){
+			print("%.2ux/%.2ux %.2ux: ", e->bus, e->dev, e->slot);
+			for (i = 0; i != 4; i++) {
+				uchar *m = &e->maps[i * 3];
+				print("[%d] %.2ux %.4ux ",
+					i, m[0], (m[2] << 8)|m[1]);
+			}
+			print("\n");
+		}
+
+		for(fn = 0; fn <= Maxfn; fn++) {
+			tbdf = MKBUS(BusPCI, e->bus, e->dev, fn);
+			pci = pcimatchtbdf(tbdf);
+			if(pci == nil)
+				continue;
+			/*
+			 * maps[] has room for four pins only, so any pin
+			 * value outside 1..4 (including 0 and 0xff) is
+			 * skipped rather than used as an index.
+			 */
+			pin = pcicfgr8(pci, PciINTP);
+			if(pin == 0 || pin > 4)
+				continue;
+
+			map = &e->maps[(pin - 1) * 3];
+			link = map[0];
+			irq = southbridge->get(sbpci, link);
+			if(irq == 0 || irq == pci->intl)
+				continue;
+			if(pci->intl != 0 && pci->intl != 0xFF) {
+				print("pcirouting: BIOS workaround: %T at pin %d link %d irq %d -> %d\n",
+					  tbdf, pin, link, irq, pci->intl);
+				southbridge->set(sbpci, link, pci->intl);
+				continue;
+			}
+			print("pcirouting: %T at pin %d link %d irq %d\n", tbdf, pin, link, irq);
+			pcicfgw8(pci, PciINTL, irq);
+			pci->intl = irq;
+		}
+	}
+}
+
+/*
+ * Pass every non-zero BAR whose bit 0 is clear, on every known
+ * device, to asmmapinit together with its size.
+ */
+static void
+pcireservemem(void)
+{
+	int i;
+	Pcidev *dev;
+
+	dev = nil;
+	while((dev = pcimatch(dev, 0, 0)) != nil){
+		for(i = 0; i < nelem(dev->mem); i++){
+			if(dev->mem[i].bar == 0)
+				continue;
+			if((dev->mem[i].bar & 1) != 0)
+				continue;
+			asmmapinit(dev->mem[i].bar&~0x0F, dev->mem[i].size, 5);
+		}
+	}
+}
+
+/*
+ * One-time PCI initialisation: detect configuration mode 1, scan
+ * all buses into pciroot/pcilist, fix up interrupt routing and
+ * reserve the memory BARs.  Later calls return at once because
+ * pcicfgmode is no longer -1.
+ *
+ * pcicfgmode is set to 1 before the scan starts.  The scan goes
+ * through pcicfgrw and pcimatch, which both call back into this
+ * function; those nested calls see pcicfgmode != -1 and return
+ * without touching pcicfginitlock.
+ */
+static void
+pcicfginit(void)
+{
+	int sbno, bno, n;
+	Pcidev **list, *p;
+
+	if(pcicfgmode != -1)
+		return;
+	lock(&pcicfginitlock);
+	/* check again now that the lock is held */
+	if(pcicfgmode != -1){
+		unlock(&pcicfginitlock);
+		return;
+	}
+
+	fmtinstall('T', tbdffmt);
+
+	/*
+	 * Try to determine if PCI Mode1 configuration implemented.
+	 * (Bits [30:24] of PciADDR must be 0, according to the spec.)
+	 * Mode2 won't appear in 64-bit machines.
+	 */
+	n = inl(PciADDR);
+	if(!(n & 0x7F000000)){
+		outl(PciADDR, 0x80000000);
+		outb(PciADDR+3, 0);
+		if(inl(PciADDR) & 0x80000000)
+			pcicfgmode = 1;
+	}
+	/* restore the original address register contents */
+	outl(PciADDR, n);
+	
+	if(pcicfgmode < 0){
+		unlock(&pcicfginitlock);
+		return;
+	}
+
+	list = &pciroot;
+	for(bno = 0; bno <= Maxbus; bno++) {
+		sbno = bno;
+		/* resume after the highest bus number the scan reached */
+		bno = pcilscan(bno, list);
+
+		/* advance list to the end of the chain built so far */
+		while(*list)
+			list = &(*list)->link;
+		if(sbno != 0)
+			continue;
+		/*
+		 * If we have found a PCI-to-Cardbus bridge, make sure
+		 * it has no valid mappings anymore.  
+		 */
+		for(p = pciroot; p != nil; p = p->link){
+			if (p->ccrb == 6 && p->ccru == 7) {
+				/* reset the cardbus */
+				pcicfgw16(p, PciBCR, 0x40 | pcicfgr16(p, PciBCR));
+				delay(50);
+			}
+		}
+	}
+
+	if(pciroot != nil && getconf("*nopcirouting") == nil)
+		pcirouting();
+	pcireservemem();
+	unlock(&pcicfginitlock);
+
+	/* pcihinv takes pcicfginitlock itself, so call it after unlocking */
+	if(getconf("*pcihinv"))
+		pcihinv(nil);
+}
+
+/*
+ * Read or write w bytes (1, 2 or 4) of configuration register r of
+ * the device addressed by tbdf, using the PciADDR/PciDATA port pair.
+ * rw is Read or Write; data is the value for a write.
+ * Returns the value read, 0 after a write, or -1 if mode 1 access is
+ * unavailable, the device number exceeds Maxdev, or w is not a
+ * supported width on a read.
+ */
+static int
+pcicfgrw(int tbdf, int r, int data, int rw, int w)
+{
+	int off, reg, val;
+
+	pcicfginit();
+	if(pcicfgmode != 1 || BUSDNO(tbdf) > Maxdev)
+		return -1;
+
+	/* byte offset inside the 32-bit data port for this width */
+	off = r & (4-w);
+	/* register bits 2-7 stay in place, bits 8-11 move up to 24-27 */
+	reg = (r & 0xfc) | ((r & 0xf00) << 16);
+
+	lock(&pcicfglock);
+	outl(PciADDR, 0x80000000|BUSBDF(tbdf)|reg);
+	val = (rw == Read)? -1: 0;
+	switch(w){
+	case 1:
+		if(rw == Read)
+			val = inb(PciDATA+off);
+		else
+			outb(PciDATA+off, data);
+		break;
+	case 2:
+		if(rw == Read)
+			val = ins(PciDATA+off);
+		else
+			outs(PciDATA+off, data);
+		break;
+	case 4:
+		if(rw == Read)
+			val = inl(PciDATA+off);
+		else
+			outl(PciDATA+off, data);
+		break;
+	}
+	unlock(&pcicfglock);
+
+	return val;
+}
+
+/*
+ * Read one byte of p's configuration space at offset rno.
+ * Returns whatever pcicfgrw returns, including -1 when it refuses.
+ */
+int
+pcicfgr8(Pcidev *p, int rno)
+{
+	return pcicfgrw(p->tbdf, rno, 0, Read, 1);
+}
+
+/*
+ * Write one byte of p's configuration space at offset rno.
+ * The result of pcicfgrw is discarded.
+ */
+void
+pcicfgw8(Pcidev *p, int rno, int data)
+{
+	pcicfgrw(p->tbdf, rno, data, Write, 1);
+}
+
+/*
+ * Read two bytes of p's configuration space at offset rno.
+ * Returns whatever pcicfgrw returns, including -1 when it refuses.
+ */
+int
+pcicfgr16(Pcidev *p, int rno)
+{
+	return pcicfgrw(p->tbdf, rno, 0, Read, 2);
+}
+
+/*
+ * Write two bytes of p's configuration space at offset rno.
+ * The result of pcicfgrw is discarded.
+ */
+void
+pcicfgw16(Pcidev *p, int rno, int data)
+{
+	pcicfgrw(p->tbdf, rno, data, Write, 2);
+}
+
+/*
+ * Read four bytes of p's configuration space at offset rno.
+ * Returns whatever pcicfgrw returns; -1 is therefore ambiguous
+ * between a refused access and a register holding all ones.
+ */
+int
+pcicfgr32(Pcidev *p, int rno)
+{
+	return pcicfgrw(p->tbdf, rno, 0, Read, 4);
+}
+
+/*
+ * Write four bytes of p's configuration space at offset rno.
+ * The result of pcicfgrw is discarded.
+ */
+void
+pcicfgw32(Pcidev *p, int rno, int data)
+{
+	pcicfgrw(p->tbdf, rno, data, Write, 4);
+}
+
+/*
+ * Iterate over the global device list.  Returns the first device
+ * after prev (or from the start of the list when prev is nil) whose
+ * vendor and device IDs match; a vid or did of 0 matches anything.
+ * Returns nil when the list is exhausted.
+ */
+Pcidev*
+pcimatch(Pcidev* prev, int vid, int did)
+{
+	Pcidev *p;
+
+	pcicfginit();
+	p = (prev != nil)? prev->list: pcilist;
+	while(p != nil){
+		if((vid == 0 || p->vid == vid)
+		&& (did == 0 || p->did == did))
+			return p;
+		p = p->list;
+	}
+	return nil;
+}
+
+/*
+ * Return the device whose tbdf equals the argument, or nil.
+ */
+Pcidev*
+pcimatchtbdf(int tbdf)
+{
+	Pcidev *p;
+
+	p = pcimatch(nil, 0, 0);
+	while(p != nil && p->tbdf != tbdf)
+		p = pcimatch(p, 0, 0);
+	return p;
+}
+
+/*
+ * Print one inventory line for each device on the ->link chain
+ * starting at p, then recurse into the ->bridge chain of every
+ * device on it that has one.
+ */
+static void
+pcilhinv(Pcidev* p)
+{
+	int i;
+	Pcidev *t;
+
+	for(t = p; t != nil; t = t->link) {
+		print("%d  %2d/%d %.2ux %.2ux %.2ux %.4ux %.4ux %3d  ",
+			BUSBNO(t->tbdf), BUSDNO(t->tbdf), BUSFNO(t->tbdf),
+			t->ccrb, t->ccru, t->ccrp, t->vid, t->did, t->intl);
+
+		/* only BARs with a non-zero size are listed */
+		for(i = 0; i < nelem(p->mem); i++) {
+			if(t->mem[i].size == 0)
+				continue;
+			print("%d:%.8lux %d ", i, t->mem[i].bar, t->mem[i].size);
+		}
+		if(t->ioa.bar || t->ioa.size)
+			print("ioa:%.8lux %d ", t->ioa.bar, t->ioa.size);
+		if(t->mema.bar || t->mema.size)
+			print("mema:%.8lux %d ", t->mema.bar, t->mema.size);
+		/* show the bus number found behind a bridge */
+		if(t->bridge)
+			print("->%d", BUSBNO(t->bridge->tbdf));
+		print("\n");
+	}
+	for(; p != nil; p = p->link)
+		if(p->bridge != nil)
+			pcilhinv(p->bridge);
+}
+
+/*
+ * Print the hardware inventory below p while holding
+ * pcicfginitlock.  With p nil, a column header is printed and the
+ * listing starts at pciroot.
+ */
+void
+pcihinv(Pcidev* p)
+{
+	pcicfginit();
+	lock(&pcicfginitlock);
+	if(p == nil){
+		p = pciroot;
+		print("bus dev type vid  did intl memory\n");
+	}
+	pcilhinv(p);
+	unlock(&pcicfginitlock);
+}
+
+/*
+ * Clear the bus master enable of every known device except those
+ * whose base class is 0x06.
+ */
+void
+pcireset(void)
+{
+	Pcidev *dev;
+
+	dev = nil;
+	while((dev = pcimatch(dev, 0, 0)) != nil){
+		/* don't mess with the bridges */
+		if(dev->ccrb == 0x06)
+			continue;
+		pciclrbme(dev);
+	}
+}
+
+/*
+ * Set MASen in the cached command register value and write the
+ * result to the device.
+ */
+void
+pcisetbme(Pcidev* p)
+{
+	p->pcr |= MASen;
+	pcicfgw16(p, PciPCR, p->pcr);
+}
+
+/*
+ * Clear MASen in the cached command register value and write the
+ * result to the device.
+ */
+void
+pciclrbme(Pcidev* p)
+{
+	p->pcr &= ~MASen;
+	pcicfgw16(p, PciPCR, p->pcr);
+}
+
+/*
+ * Set MemWrInv in the cached command register value and write the
+ * result to the device.
+ */
+void
+pcisetmwi(Pcidev* p)
+{
+	p->pcr |= MemWrInv;
+	pcicfgw16(p, PciPCR, p->pcr);
+}
+
+/*
+ * Clear MemWrInv in the cached command register value and write the
+ * result to the device.
+ */
+void
+pciclrmwi(Pcidev* p)
+{
+	p->pcr &= ~MemWrInv;
+	pcicfgw16(p, PciPCR, p->pcr);
+}
+
+/*
+ * Search p's capability list for capability id cap.
+ * Returns the config-space offset of the matching entry, or -1 if
+ * the device has no capability list, the header type is unknown, or
+ * the id is not found.  At most 48 entries are followed, and the
+ * walk stops at any pointer below 0x40 or not 4-byte aligned.
+ */
+int
+pcicap(Pcidev *p, int cap)
+{
+	int left, id, hdr, off;
+
+	/* status register bit 4 has capabilities */
+	if((pcicfgr16(p, PciPSR) & (1<<4)) == 0)
+		return -1;
+
+	hdr = pcicfgr8(p, PciHDT) & 0x7f;
+	if(hdr == 0 || hdr == 1)	/* etc, pci to pci bridge */
+		off = 0x34;
+	else if(hdr == 2)		/* cardbus bridge */
+		off = 0x14;
+	else
+		return -1;
+
+	for(left = 48; left > 0; left--){
+		off = pcicfgr8(p, off);
+		if(off < 0x40 || (off & 3))
+			break;
+		off &= ~3;
+		id = pcicfgr8(p, off);
+		if(id == 0xff)
+			break;
+		if(id == cap)
+			return off;
+		/* the byte after the id holds the next pointer */
+		off++;
+	}
+	return -1;
+}
+
+enum {
+	Pmgcap	= 2,		/* capabilities; 2 bytes*/
+	Pmgctl	= 4,		/* ctl/status; 2 bytes */
+	Pmgbrg	= 6,		/* bridge support */
+	Pmgdata	= 7,
+};
+
+/*
+ * Return the low two bits of p's power management control/status
+ * register, or -1 if p has no power management capability.
+ */
+int
+pcigetpms(Pcidev* p)
+{
+	int cap;
+
+	cap = pcicap(p, PciCapPMG);
+	if(cap == -1)
+		return -1;
+	return pcicfgr16(p, cap+Pmgctl) & 0x0003;
+}
+
+/*
+ * Write state (0-3) into the low two bits of p's power management
+ * control/status register.  States 1 and 2 are refused unless the
+ * capabilities register has bit 0x0200 or 0x0400 set respectively.
+ * Returns the previous low two bits, or -1 if p has no power
+ * management capability or the state is refused.
+ */
+int
+pcisetpms(Pcidev* p, int state)
+{
+	int caps, ctl, cap;
+
+	cap = pcicap(p, PciCapPMG);
+	if(cap == -1)
+		return -1;
+
+	caps = pcicfgr16(p, cap+Pmgcap);
+	ctl = pcicfgr16(p, cap+Pmgctl);
+
+	if(state < 0 || state > 3)
+		return -1;
+	if(state == 1 && !(caps & 0x0200))
+		return -1;
+	if(state == 2 && !(caps & 0x0400))
+		return -1;
+
+	pcicfgw16(p, cap+Pmgctl, (ctl & ~3) | state);
+	return ctl & 3;
+}

+ 146 - 0
sys/src/9/386/random.c

@@ -0,0 +1,146 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+
+/*
+ * State of the random byte generator: a circular buffer filled by
+ * randomclock and drained by randomread.
+ */
+struct Rb
+{
+	QLock;
+	Rendez	producer;	/* genrandom sleeps here while the buffer is full */
+	Rendez	consumer;	/* randomread sleeps here while the buffer is empty */
+	ulong	randomcount;	/* spun up by genrandom, sampled and zeroed by randomclock */
+	uchar	buf[1024];
+	uchar	*ep;		/* one past the end of buf */
+	uchar	*rp;		/* next byte to read */
+	uchar	*wp;		/* next byte to write */
+	uchar	next;		/* samples folded into bits since the last store */
+	uchar	wakeme;		/* set while a reader is waiting for data */
+	ushort	bits;		/* accumulator; shifted 2 bits per sample */
+	ulong	randn;		/* running state used to scramble bytes in randomread */
+} rb;
+
+/*
+ * Sleep condition: true unless the write pointer is exactly one
+ * slot behind the read pointer, counting the wrap at the buffer end.
+ */
+static int
+rbnotfull(void*)
+{
+	int gap;
+
+	gap = rb.rp - rb.wp;
+	if(gap == 1)
+		return 0;
+	if(gap == (1 - sizeof(rb.buf)))
+		return 0;
+	return 1;
+}
+
+/*
+ * Sleep condition: true when the read and write pointers differ,
+ * i.e. there is at least one byte to consume.
+ */
+static int
+rbnotempty(void*)
+{
+	return rb.wp != rb.rp;
+}
+
+/*
+ * Kernel process body: spin incrementing rb.randomcount, which
+ * randomclock samples and zeroes.  Each time the count passes
+ * 100000 it yields if a higher priority process is waiting and
+ * sleeps while the buffer is full.
+ */
+static void
+genrandom(void*)
+{
+	up->basepri = PriNormal;
+	up->priority = up->basepri;
+
+	for(;;){
+		while(++rb.randomcount <= 100000)
+			;
+		if(anyhigher())
+			sched();
+		if(!rbnotfull(0))
+			sleep(&rb.producer, rbnotfull, 0);
+	}
+}
+
+/*
+ *  produce random bits in a circular buffer.
+ *  Each call folds the current spin count into rb.bits, shifting
+ *  by two bits; every fourth call (8/2) the accumulator is xored
+ *  into the byte at the write pointer, which then advances.
+ */
+static void
+randomclock(void)
+{
+	if(rb.randomcount == 0)
+		return;
+	if(!rbnotfull(0))
+		return;
+
+	rb.bits = (rb.bits<<2) ^ rb.randomcount;
+	rb.randomcount = 0;
+
+	if(++rb.next != 8/2)
+		return;
+	rb.next = 0;
+
+	*rb.wp ^= rb.bits;
+	rb.wp = (rb.wp+1 == rb.ep)? rb.buf: rb.wp+1;
+
+	if(rb.wakeme)
+		wakeup(&rb.consumer);
+}
+
+/*
+ * Hook randomclock to the clock, set up the empty circular buffer
+ * and start the genrandom kernel process.
+ */
+void
+randominit(void)
+{
+	/* Frequency close but not equal to HZ */
+	addclock0link(randomclock, 13);
+	rb.ep = rb.buf + sizeof(rb.buf);
+	rb.rp = rb.wp = rb.buf;
+	kproc("genrandom", genrandom, 0);
+}
+
+/*
+ *  consume random bytes from a circular buffer.
+ *  Fills xp with n bytes, sleeping whenever the buffer is empty,
+ *  and returns n.  The QLock in rb serialises readers; the error
+ *  handler releases it if an error is raised while it is held.
+ */
+ulong
+randomread(void *xp, ulong n)
+{
+	uchar *e, *p;
+	ulong x;
+
+	p = xp;
+
+	if(waserror()){
+		qunlock(&rb);
+		nexterror();
+	}
+
+	qlock(&rb);
+	for(e = p + n; p < e; ){
+		if(rb.wp == rb.rp){
+			/* empty: prod the producer and wait for randomclock */
+			rb.wakeme = 1;
+			wakeup(&rb.producer);
+			sleep(&rb.consumer, rbnotempty, 0);
+			rb.wakeme = 0;
+			continue;
+		}
+
+		/*
+		 *  beating clocks will be predictable if
+		 *  they are synchronized.  Use a cheap pseudo
+		 *  random number generator to obscure any cycles.
+		 */
+		x = rb.randn*1103515245 ^ *rb.rp;
+		*p++ = rb.randn = x;
+
+		if(rb.rp+1 == rb.ep)
+			rb.rp = rb.buf;
+		else
+			rb.rp = rb.rp+1;
+	}
+	qunlock(&rb);
+	poperror();
+
+	/* space was freed, so let the producer run again */
+	wakeup(&rb.producer);
+
+	return n;
+}

+ 794 - 0
sys/src/9/386/uarti8250.c

@@ -0,0 +1,794 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * 8250 UART and compatibles.
+ */
+enum {
+	Uart0		= 0x3F8,	/* COM1 */
+	Uart0IRQ	= 4,
+	Uart1		= 0x2F8,	/* COM2 */
+	Uart1IRQ	= 3,
+
+	UartFREQ	= 1843200,
+};
+
+enum {					/* registers */
+	Rbr		= 0,		/* Receiver Buffer (RO) */
+	Thr		= 0,		/* Transmitter Holding (WO) */
+	Ier		= 1,		/* Interrupt Enable */
+	Iir		= 2,		/* Interrupt Identification (RO) */
+	Fcr		= 2,		/* FIFO Control (WO) */
+	Lcr		= 3,		/* Line Control */
+	Mcr		= 4,		/* Modem Control */
+	Lsr		= 5,		/* Line Status */
+	Msr		= 6,		/* Modem Status */
+	Scr		= 7,		/* Scratch Pad */
+	Dll		= 0,		/* Divisor Latch LSB */
+	Dlm		= 1,		/* Divisor Latch MSB */
+};
+
+enum {					/* Ier */
+	Erda		= 0x01,		/* Enable Received Data Available */
+	Ethre		= 0x02,		/* Enable Thr Empty */
+	Erls		= 0x04,		/* Enable Receiver Line Status */
+	Ems		= 0x08,		/* Enable Modem Status */
+};
+
+enum {					/* Iir */
+	Ims		= 0x00,		/* Ms interrupt */
+	Ip		= 0x01,		/* Interrupt Pending (not) */
+	Ithre		= 0x02,		/* Thr Empty */
+	Irda		= 0x04,		/* Received Data Available */
+	Irls		= 0x06,		/* Receiver Line Status */
+	Ictoi		= 0x0C,		/* Character Time-out Indication */
+	IirMASK		= 0x3F,
+	Ifena		= 0xC0,		/* FIFOs enabled */
+};
+
+enum {					/* Fcr */
+	FIFOena		= 0x01,		/* FIFO enable */
+	FIFOrclr	= 0x02,		/* clear Rx FIFO */
+	FIFOtclr	= 0x04,		/* clear Tx FIFO */
+	FIFO1		= 0x00,		/* Rx FIFO trigger level 1 byte */
+	FIFO4		= 0x40,		/*	4 bytes */
+	FIFO8		= 0x80,		/*	8 bytes */
+	FIFO14		= 0xC0,		/*	14 bytes */
+};
+
+enum {					/* Lcr */
+	Wls5		= 0x00,		/* Word Length Select 5 bits/byte */
+	Wls6		= 0x01,		/*	6 bits/byte */
+	Wls7		= 0x02,		/*	7 bits/byte */
+	Wls8		= 0x03,		/*	8 bits/byte */
+	WlsMASK		= 0x03,
+	Stb		= 0x04,		/* 2 stop bits */
+	Pen		= 0x08,		/* Parity Enable */
+	Eps		= 0x10,		/* Even Parity Select */
+	Stp		= 0x20,		/* Stick Parity */
+	Brk		= 0x40,		/* Break */
+	Dlab		= 0x80,		/* Divisor Latch Access Bit */
+};
+
+enum {					/* Mcr */
+	Dtr		= 0x01,		/* Data Terminal Ready */
+	Rts		= 0x02,		/* Ready To Send */
+	Out1		= 0x04,		/* no longer in use */
+	Ie		= 0x08,		/* IRQ Enable */
+	Dm		= 0x10,		/* Diagnostic Mode loopback */
+};
+
+enum {					/* Lsr */
+	Dr		= 0x01,		/* Data Ready */
+	Oe		= 0x02,		/* Overrun Error */
+	Pe		= 0x04,		/* Parity Error */
+	Fe		= 0x08,		/* Framing Error */
+	Bi		= 0x10,		/* Break Interrupt */
+	Thre		= 0x20,		/* Thr Empty */
+	Temt		= 0x40,		/* Transmitter Empty */
+	FIFOerr		= 0x80,		/* error in receiver FIFO */
+};
+
+enum {					/* Msr */
+	Dcts		= 0x01,		/* Delta Cts */
+	Ddsr		= 0x02,		/* Delta Dsr */
+	Teri		= 0x04,		/* Trailing Edge of Ri */
+	Ddcd		= 0x08,		/* Delta Dcd */
+	Cts		= 0x10,		/* Clear To Send */
+	Dsr		= 0x20,		/* Data Set Ready */
+	Ri		= 0x40,		/* Ring Indicator */
+	Dcd		= 0x80,		/* Data Carrier Detect */
+};
+
+/*
+ * Per-port controller state, reached through Uart.regs.
+ */
+typedef struct Ctlr {
+	int	io;		/* base I/O port; registers are at io+offset */
+	int	irq;
+	int	tbdf;
+	int	iena;
+	void*	vector;
+	int	poll;
+
+	uchar	sticky[8];	/* per-register bits ORed into every csr8w write */
+
+	Lock;
+	int	hasfifo;	/* i8250fifo does nothing while this is 0 */
+	int	checkfifo;
+	int	fena;		/* FIFO level last requested through i8250fifo */
+} Ctlr;
+
+extern PhysUart i8250physuart;
+
+static Ctlr i8250ctlr[2] = {
+{	.io	= Uart0,
+	.irq	= Uart0IRQ,
+	.tbdf	= -1,
+	.poll	= 0, },
+
+{	.io	= Uart1,
+	.irq	= Uart1IRQ,
+	.tbdf	= -1,
+	.poll	= 0, },
+};
+
+static Uart i8250uart[2] = {
+{	.regs	= &i8250ctlr[0],
+	.name	= "COM1",
+	.freq	= UartFREQ,
+	.phys	= &i8250physuart,
+	.special= 0,
+	.next	= &i8250uart[1], },
+
+{	.regs	= &i8250ctlr[1],
+	.name	= "COM2",
+	.freq	= UartFREQ,
+	.phys	= &i8250physuart,
+	.special= 0,
+	.next	= nil, },
+};
+
+/*
+ * Register access for controller c, register offset r:
+ *	csr8r reads the register;
+ *	csr8w writes v ORed with the register's sticky bits;
+ *	csr8o writes v alone, ignoring the sticky bits.
+ */
+#define csr8r(c, r)	inb((c)->io+(r))
+#define csr8w(c, r, v)	outb((c)->io+(r), (c)->sticky[(r)]|(v))
+#define csr8o(c, r, v)	outb((c)->io+(r), (v))
+
+/*
+ * Format the port's line settings, error counters and modem status
+ * lines as text and copy the part selected by offset and n into buf.
+ * Returns the byte count reported by readstr.
+ */
+static long
+i8250status(Uart* uart, void* buf, long n, long offset)
+{
+	char *p;
+	Ctlr *ctlr;
+	uchar ier, lcr, mcr, msr;
+
+	ctlr = uart->regs;
+	/*
+	 * smalloc waits for memory rather than returning nil, so
+	 * the buffer can be used without a further check.
+	 */
+	p = smalloc(READSTR);
+	/* settings come from the sticky copies; only Msr is read from the chip */
+	mcr = ctlr->sticky[Mcr];
+	msr = csr8r(ctlr, Msr);
+	ier = ctlr->sticky[Ier];
+	lcr = ctlr->sticky[Lcr];
+	snprint(p, READSTR,
+		"b%d c%d d%d e%d l%d m%d p%c r%d s%d i%d\n"
+		"dev(%d) type(%d) framing(%d) overruns(%d) "
+		"berr(%d) serr(%d)%s%s%s%s\n",
+
+		uart->baud,
+		uart->hup_dcd,
+		(msr & Dsr) != 0,
+		uart->hup_dsr,
+		(lcr & WlsMASK) + 5,
+		(ier & Ems) != 0,
+		(lcr & Pen) ? ((lcr & Eps) ? 'e': 'o'): 'n',
+		(mcr & Rts) != 0,
+		(lcr & Stb) ? 2: 1,
+		ctlr->fena,
+
+		uart->dev,
+		uart->type,
+		uart->ferr,
+		uart->oerr,
+		uart->berr,
+		uart->serr,
+		(msr & Cts) ? " cts": "",
+		(msr & Dsr) ? " dsr": "",
+		(msr & Dcd) ? " dcd": "",
+		(msr & Ri) ? " ring": ""
+	);
+	n = readstr(offset, buf, n, p);
+	free(p);
+
+	return n;
+}
+
+/*
+ * Set the receive FIFO trigger level.  level 0 writes no enable
+ * bits; 1, 4 and 8 select that trigger level; anything else selects
+ * 14.  The requested level is recorded in ctlr->fena.  Does nothing
+ * if the controller has no FIFO.
+ */
+static void
+i8250fifo(Uart* uart, int level)
+{
+	Ctlr *ctlr;
+
+	ctlr = uart->regs;
+	if(ctlr->hasfifo == 0)
+		return;
+
+	/*
+	 * Changing the FIFOena bit in Fcr flushes data
+	 * from both receive and transmit FIFOs; there's
+	 * no easy way to guarantee not losing data on
+	 * the receive side, but it's possible to wait until
+	 * the transmitter is really empty.
+	 */
+	ilock(ctlr);
+	while(!(csr8r(ctlr, Lsr) & Temt))
+		;
+
+	/*
+	 * Set the trigger level, default is the max.
+	 * value.
+	 * Some UARTs require FIFOena to be set before
+	 * other bits can take effect, so set it twice.
+	 */
+	ctlr->fena = level;
+	switch(level){
+	case 0:
+		break;
+	case 1:
+		level = FIFO1|FIFOena;
+		break;
+	case 4:
+		level = FIFO4|FIFOena;
+		break;
+	case 8:
+		level = FIFO8|FIFOena;
+		break;
+	default:
+		level = FIFO14|FIFOena;
+		break;
+	}
+	csr8w(ctlr, Fcr, level);
+	csr8w(ctlr, Fcr, level);
+	iunlock(ctlr);
+}
+
+/*
+ * Set (on != 0) or clear DTR: update the sticky Mcr bits, then
+ * write them to the chip.
+ */
+static void
+i8250dtr(Uart* uart, int on)
+{
+	Ctlr *ctlr;
+	uchar mcr;
+
+	ctlr = uart->regs;
+	mcr = ctlr->sticky[Mcr];
+	mcr = on? (mcr | Dtr): (mcr & ~Dtr);
+	ctlr->sticky[Mcr] = mcr;
+	csr8w(ctlr, Mcr, 0);
+}
+
+/*
+ * Set (on != 0) or clear RTS: update the sticky Mcr bits, then
+ * write them to the chip.
+ */
+static void
+i8250rts(Uart* uart, int on)
+{
+	Ctlr *ctlr;
+	uchar mcr;
+
+	ctlr = uart->regs;
+	mcr = ctlr->sticky[Mcr];
+	mcr = on? (mcr | Rts): (mcr & ~Rts);
+	ctlr->sticky[Mcr] = mcr;
+	csr8w(ctlr, Mcr, 0);
+}
+
+/*
+ * Turn modem status handling on or off.  When on, the modem status
+ * interrupt is enabled and uart->cts follows the Cts line; when off,
+ * the interrupt is disabled and cts is forced to 1.  The FIFO is
+ * then reconfigured with on as its level argument.
+ */
+static void
+i8250modemctl(Uart* uart, int on)
+{
+	Ctlr *ctlr;
+
+	ctlr = uart->regs;
+	ilock(&uart->tlock);
+	if(on){
+		ctlr->sticky[Ier] |= Ems;
+		csr8w(ctlr, Ier, ctlr->sticky[Ier]);
+		uart->modem = 1;
+		uart->cts = csr8r(ctlr, Msr) & Cts;
+	}
+	else{
+		ctlr->sticky[Ier] &= ~Ems;
+		csr8w(ctlr, Ier, ctlr->sticky[Ier]);
+		uart->modem = 0;
+		uart->cts = 1;
+	}
+	iunlock(&uart->tlock);
+
+	/* modem needs fifo */
+	(*uart->phys->fifo)(uart, on);
+}
+
+/*
+ * Select parity: 'e' even, 'o' odd, 'n' none.
+ * Returns 0 on success, or -1 (changing nothing) for any other value.
+ */
+static int
+i8250parity(Uart* uart, int parity)
+{
+	int lcr;
+	Ctlr *ctlr;
+
+	ctlr = uart->regs;
+	lcr = ctlr->sticky[Lcr] & ~(Eps|Pen);
+
+	if(parity == 'e')
+		lcr |= Eps|Pen;
+	else if(parity == 'o')
+		lcr |= Pen;
+	else if(parity != 'n')
+		return -1;
+
+	ctlr->sticky[Lcr] = lcr;
+	csr8w(ctlr, Lcr, 0);
+	uart->parity = parity;
+
+	return 0;
+}
+
+/*
+ * Select 1 or 2 stop bits.
+ * Returns 0 on success, or -1 (changing nothing) for any other value.
+ */
+static int
+i8250stop(Uart* uart, int stop)
+{
+	int lcr;
+	Ctlr *ctlr;
+
+	if(stop != 1 && stop != 2)
+		return -1;
+
+	ctlr = uart->regs;
+	lcr = ctlr->sticky[Lcr] & ~Stb;
+	if(stop == 2)
+		lcr |= Stb;
+
+	ctlr->sticky[Lcr] = lcr;
+	csr8w(ctlr, Lcr, 0);
+	uart->stop = stop;
+
+	return 0;
+}
+
+/*
+ * Select the word length, 5 to 8 bits.
+ * Returns 0 on success, or -1 (changing nothing) for any other value.
+ */
+static int
+i8250bits(Uart* uart, int bits)
+{
+	static uchar wls[] = { Wls5, Wls6, Wls7, Wls8 };
+	int lcr;
+	Ctlr *ctlr;
+
+	if(bits < 5 || bits > 8)
+		return -1;
+
+	ctlr = uart->regs;
+	lcr = ctlr->sticky[Lcr] & ~WlsMASK;
+	lcr |= wls[bits-5];
+
+	ctlr->sticky[Lcr] = lcr;
+	csr8w(ctlr, Lcr, 0);
+	uart->bits = bits;
+
+	return 0;
+}
+
+/*
+ * Set the line speed.
+ * Returns 0 on success.  Returns -1, leaving the hardware alone,
+ * if the clock frequency is unknown, baud is not positive, or the
+ * resulting divisor cannot be programmed.
+ */
+static int
+i8250baud(Uart* uart, int baud)
+{
+	ulong bgc;
+	Ctlr *ctlr;
+
+	/*
+	 * Set the Baud rate by calculating and setting the Baud rate
+	 * Generator Constant. This will work with fairly non-standard
+	 * Baud rates.
+	 */
+	if(uart->freq == 0 || baud <= 0)
+		return -1;
+	bgc = (uart->freq+8*baud-1)/(16*baud);
+
+	/*
+	 * The constant is loaded as two bytes (Dlm, Dll), so it must
+	 * fit in 16 bits; zero means baud is too high for this clock.
+	 * Refuse both rather than load a truncated or zero divisor.
+	 */
+	if(bgc == 0 || bgc > 0xFFFF)
+		return -1;
+
+	ctlr = uart->regs;
+	csr8w(ctlr, Lcr, Dlab);
+	csr8o(ctlr, Dlm, bgc>>8);
+	csr8o(ctlr, Dll, bgc);
+	csr8w(ctlr, Lcr, 0);
+
+	uart->baud = baud;
+
+	return 0;
+}
+
+/*
+ * Send a break lasting ms milliseconds; a non-positive ms
+ * selects the default of 200.  Sleeps for the duration.
+ */
+static void
+i8250break(Uart* uart, int ms)
+{
+	Ctlr *ctl;
+
+	if(ms <= 0)
+		ms = 200;
+	ctl = uart->regs;
+
+	csr8w(ctl, Lcr, Brk);
+	tsleep(&up->sleep, return0, 0, ms);
+	csr8w(ctl, Lcr, 0);
+}
+
+/*
+ * Push pending output to the transmitter.
+ * Does nothing while cts is 0 or output is blocked.  Otherwise
+ * bytes are written to Thr for as long as Lsr reports Thre and
+ * there is staged output, up to a fixed number of iterations.
+ */
+static void
+i8250kick(Uart* uart)
+{
+	int left;
+	Ctlr *ctl;
+
+	if(uart->cts == 0 || uart->blocked)
+		return;
+
+	/*
+	 *  128 here is an arbitrary limit to make sure
+	 *  we don't stay in this loop too long.  If the
+	 *  chip's output queue is longer than 128, too
+	 *  bad -- presotto
+	 */
+	ctl = uart->regs;
+	left = 128;
+	while(left-- > 0){
+		if((csr8r(ctl, Lsr) & Thre) == 0)
+			break;
+		/* refill the staging area only once it has run dry */
+		if(uart->op >= uart->oe){
+			if(uartstageoutput(uart) == 0)
+				break;
+		}
+		csr8o(ctl, Thr, *uart->op);
+		uart->op++;
+	}
+}
+
+/*
+ * Service routine for one uart; arg is the Uart.
+ * Besides being registered via intrenable() in i8250enable(), it
+ * is also called directly (with a nil Ureg) from i8250enable()
+ * and i8250poll().
+ * Iir is read repeatedly and each reported cause is handled in
+ * turn until the Ip bit is seen set.
+ */
+static void
+i8250interrupt(Ureg*, void* arg)
+{
+	Ctlr *ctlr;
+	Uart *uart;
+	int iir, lsr, old, r;
+
+	uart = arg;
+
+	ctlr = uart->regs;
+	for(iir = csr8r(ctlr, Iir); !(iir & Ip); iir = csr8r(ctlr, Iir)){
+		switch(iir & IirMASK){
+		case Ims:		/* Ms interrupt */
+			r = csr8r(ctlr, Msr);
+			if(r & Dcts){
+				/* cts is shared with the output side, hence tlock */
+				ilock(&uart->tlock);
+				old = uart->cts;
+				uart->cts = r & Cts;
+				/* cts went from off to on: arm ctsbackoff */
+				if(old == 0 && uart->cts)
+					uart->ctsbackoff = 2;
+				iunlock(&uart->tlock);
+			}
+		 	if(r & Ddsr){
+				/* flag a hangup if DSR dropped and hup_dsr is set */
+				old = r & Dsr;
+				if(uart->hup_dsr && uart->dsr && !old)
+					uart->dohup = 1;
+				uart->dsr = old;
+			}
+		 	if(r & Ddcd){
+				/* flag a hangup if DCD dropped and hup_dcd is set */
+				old = r & Dcd;
+				if(uart->hup_dcd && uart->dcd && !old)
+					uart->dohup = 1;
+				uart->dcd = old;
+			}
+			break;
+		case Ithre:		/* Thr Empty */
+			uartkick(uart);
+			break;
+		case Irda:		/* Received Data Available */
+		case Irls:		/* Receiver Line Status */
+		case Ictoi:		/* Character Time-out Indication */
+			/*
+			 * Consume any received data.
+			 * If the received byte came in with a break,
+			 * parity or framing error, throw it away;
+			 * overrun is an indication that something has
+			 * already been tossed.
+			 */
+			while((lsr = csr8r(ctlr, Lsr)) & Dr){
+				if(lsr & (FIFOerr|Oe))
+					uart->oerr++;
+				if(lsr & Pe)
+					uart->perr++;
+				if(lsr & Fe)
+					uart->ferr++;
+				/* Rbr is read even for a bad byte, so it is consumed */
+				r = csr8r(ctlr, Rbr);
+				if(!(lsr & (Bi|Fe|Pe)))
+					uartrecv(uart, r);
+			}
+			break;
+
+		default:
+			iprint("weird uart interrupt 0x%2.2uX\n", iir);
+			break;
+		}
+	}
+}
+
+/*
+ * Shut a uart down: drop DTR and RTS, turn the fifo off, mask
+ * all uart interrupt sources and, if an interrupt handler was
+ * installed, try to remove it.
+ */
+static void
+i8250disable(Uart* uart)
+{
+	Ctlr *ctl;
+
+	(*uart->phys->dtr)(uart, 0);
+	(*uart->phys->rts)(uart, 0);
+	(*uart->phys->fifo)(uart, 0);
+
+	ctl = uart->regs;
+	ctl->sticky[Ier] = 0;
+	csr8w(ctl, Ier, ctl->sticky[Ier]);
+
+	/* iena is cleared only when intrdisable reports success (0) */
+	if(ctl->iena != 0 && intrdisable(ctl->vector) == 0)
+		ctl->iena = 0;
+}
+
+/*
+ * Bring a uart into service.
+ * On the first call for a controller, probe once for a fifo.
+ * If ie is non-zero, install the interrupt handler (unless the
+ * controller is polled or a handler is already installed) and
+ * enable transmit/receive interrupts; if ie is zero, clear the
+ * interrupt enables and the modem control bits.
+ * DTR and RTS are raised in both cases.
+ */
+static void
+i8250enable(Uart* uart, int ie)
+{
+	Ctlr *ctlr;
+
+	ctlr = uart->regs;
+
+	/*
+	 * Check if there is a FIFO.
+	 * Changing the FIFOena bit in Fcr flushes data
+	 * from both receive and transmit FIFOs; there's
+	 * no easy way to guarantee not losing data on
+	 * the receive side, but it's possible to wait until
+	 * the transmitter is really empty.
+	 * Also, reading the Iir outwith i8250interrupt()
+	 * can be dangerous, but this should only happen
+	 * once, before interrupts are enabled.
+	 */
+	ilock(ctlr);
+	if(!ctlr->checkfifo){
+		/*
+		 * Wait until the transmitter is really empty.
+		 */
+		while(!(csr8r(ctlr, Lsr) & Temt))
+			;
+		/* enable the fifo, see whether Iir reflects it, disable again */
+		csr8w(ctlr, Fcr, FIFOena);
+		if(csr8r(ctlr, Iir) & Ifena)
+			ctlr->hasfifo = 1;
+		csr8w(ctlr, Fcr, 0);
+		ctlr->checkfifo = 1;
+	}
+	iunlock(ctlr);
+
+	/*
+ 	 * Enable interrupts and turn on DTR and RTS.
+	 * Be careful if this is called to set up a polled serial line
+	 * early on not to try to enable interrupts as interrupt-
+	 * -enabling mechanisms might not be set up yet.
+	 */
+	if(ie){
+		if(ctlr->iena == 0 && !ctlr->poll){
+			/* NOTE(review): the value returned by intrenable is stored unchecked */
+			ctlr->vector = intrenable(ctlr->irq, i8250interrupt, uart, ctlr->tbdf, uart->name);
+			ctlr->iena = 1;
+		}
+		ctlr->sticky[Ier] = Ethre|Erda;
+		ctlr->sticky[Mcr] |= Ie;
+	}
+	else{
+		ctlr->sticky[Ier] = 0;
+		ctlr->sticky[Mcr] = 0;
+	}
+	csr8w(ctlr, Ier, ctlr->sticky[Ier]);
+	csr8w(ctlr, Mcr, ctlr->sticky[Mcr]);
+
+	(*uart->phys->dtr)(uart, 1);
+	(*uart->phys->rts)(uart, 1);
+
+	/*
+	 * During startup, the i8259 interrupt controller is reset.
+	 * This may result in a lost interrupt from the i8250 uart.
+	 * The i8250 thinks the interrupt is still outstanding and does not
+	 * generate any further interrupts. The workaround is to call the
+	 * interrupt handler to clear any pending interrupt events.
+	 * Note: this must be done after setting Ier.
+	 */
+	if(ie)
+		i8250interrupt(nil, uart);
+}
+
+/*
+ * Allocate a controller structure for a uart at I/O port io
+ * using interrupt irq on the device identified by tbdf.
+ * Returns the new Ctlr as an opaque pointer, or nil if the
+ * allocation fails.
+ */
+void*
+i8250alloc(int io, int irq, int tbdf)
+{
+	Ctlr *ctl;
+
+	ctl = malloc(sizeof(Ctlr));
+	if(ctl == nil)
+		return nil;
+
+	ctl->io = io;
+	ctl->irq = irq;
+	ctl->tbdf = tbdf;
+
+	return ctl;
+}
+
+/*
+ * Probe the statically configured uarts in i8250uart[] and return
+ * the head of the list of those that can be used (nil if none).
+ * A uart is kept only if its Scratch Pad register reads back the
+ * value written to it and its 8 I/O ports can be reserved; any
+ * other entry is unlinked from the list.
+ * Assumes, as the code always has, that the entries of i8250uart[]
+ * are linked through ->next in array order.
+ */
+static Uart*
+i8250pnp(void)
+{
+	int i;
+	Ctlr *ctlr;
+	Uart *head, *prev, *uart;
+
+	head = i8250uart;
+	prev = nil;		/* last entry kept so far */
+	for(i = 0; i < nelem(i8250uart); i++){
+		/*
+		 * Does it exist?
+		 * Should be able to write/read the Scratch Pad
+		 * and reserve the I/O space.
+		 */
+		uart = &i8250uart[i];
+		ctlr = uart->regs;
+		csr8o(ctlr, Scr, 0x55);
+		if(csr8r(ctlr, Scr) == 0x55
+		&& ioalloc(ctlr->io, 8, 0, uart->name) >= 0){
+			prev = uart;
+			continue;
+		}
+
+		/*
+		 * Absent or unavailable: unlink it.  Go through the
+		 * last entry that was kept, not through (uart-1),
+		 * which may itself have been unlinked already.
+		 */
+		if(prev == nil)
+			head = uart->next;
+		else
+			prev->next = uart->next;
+	}
+
+	return head;
+}
+
+/*
+ * Polled input: wait, checking Lsr once per delay(1), until a
+ * character is available, then return it.  Waits indefinitely.
+ */
+static int
+i8250getc(Uart* uart)
+{
+	Ctlr *ctl;
+
+	ctl = uart->regs;
+	for(;;){
+		if(csr8r(ctl, Lsr) & Dr)
+			break;
+		delay(1);
+	}
+	return csr8r(ctl, Rbr);
+}
+
+/*
+ * Polled output of one character.
+ * Waits (at most 128 delay(1) steps) for Thre before writing c,
+ * and again afterwards; c is written even if the first wait
+ * runs out.
+ */
+static void
+i8250putc(Uart* uart, int c)
+{
+	int n;
+	Ctlr *ctl;
+
+	ctl = uart->regs;
+
+	n = 0;
+	while(!(csr8r(ctl, Lsr) & Thre) && n < 128){
+		delay(1);
+		n++;
+	}
+	csr8o(ctl, Thr, c);
+
+	n = 0;
+	while(!(csr8r(ctl, Lsr) & Thre) && n < 128){
+		delay(1);
+		n++;
+	}
+}
+
+/*
+ * Polling hook: run the interrupt routine by hand, but only for
+ * a controller that is marked for polling and has no interrupt
+ * handler installed.
+ */
+static void
+i8250poll(Uart* uart)
+{
+	Ctlr *ctl;
+
+	/*
+	 * If PhysUart has a non-nil .poll member, this
+	 * routine will be called from the uartclock timer.
+	 * If the Ctlr .poll member is non-zero, when the
+	 * Uart is enabled interrupts will not be enabled
+	 * and the result is polled input and output.
+	 * Not very useful here, but ports to new hardware
+	 * or simulators can use this to get serial I/O
+	 * without setting up the interrupt mechanism.
+	 */
+	ctl = uart->regs;
+	if(!ctl->iena && ctl->poll)
+		i8250interrupt(nil, uart);
+}
+
+/*
+ * Operations table for the i8250 driver.  Uarts refer to it
+ * through uart->phys; uartpci.c also points the uarts it
+ * configures at this table.
+ */
+PhysUart i8250physuart = {
+	.name		= "i8250",
+	.pnp		= i8250pnp,
+	.enable		= i8250enable,
+	.disable	= i8250disable,
+	.kick		= i8250kick,
+	.dobreak	= i8250break,
+	.baud		= i8250baud,
+	.bits		= i8250bits,
+	.stop		= i8250stop,
+	.parity		= i8250parity,
+	.modemctl	= i8250modemctl,
+	.rts		= i8250rts,
+	.dtr		= i8250dtr,
+	.status		= i8250status,
+	.fifo		= i8250fifo,
+	.getc		= i8250getc,
+	.putc		= i8250putc,
+	.poll		= i8250poll,
+};
+
+/*
+ * Set up the serial console.
+ * The configuration string comes from the "console" configuration
+ * variable or, failing that, from cfg; it must start with a uart
+ * number, and anything after the number is passed on as uart
+ * control commands.
+ * Returns the console Uart, or nil if there is no configuration,
+ * it does not start with a number, the number is not 0 or 1, or
+ * the uart does not respond.
+ */
+Uart*
+i8250console(char* cfg)
+{
+	int n;
+	Uart *uart;
+	Ctlr *ctl;
+	char *rest, *s;
+
+	/*
+	 * Before i8250pnp() is run can only set the console
+	 * to 0 or 1 because those are the only uart structs which
+	 * will be the same before and after that.
+	 */
+	s = getconf("console");
+	if(s == nil)
+		s = cfg;
+	if(s == nil)
+		return nil;
+
+	n = strtoul(s, &rest, 0);
+	if(rest == s)
+		return nil;
+
+	/* an already-registered uart takes precedence */
+	uart = uartconsole(n, rest);
+	if(uart != nil){
+		consuart = uart;
+		return uart;
+	}
+
+	if(n != 0 && n != 1)
+		return nil;
+	uart = &i8250uart[n];
+
+	/*
+	 * Does it exist?
+	 * Should be able to write/read
+	 * the Scratch Pad.
+	 */
+	ctl = uart->regs;
+	csr8o(ctl, Scr, 0x55);
+	if(csr8r(ctl, Scr) != 0x55)
+		return nil;
+
+	/* enable without interrupts, apply defaults, then the user's settings */
+	(*uart->phys->enable)(uart, 0);
+	uartctl(uart, "b9600 l8 pn s1 i1");
+	if(*rest != '\0')
+		uartctl(uart, rest);
+
+	consuart = uart;
+	uart->console = 1;
+
+	return uart;
+}

+ 189 - 0
sys/src/9/386/uartpci.c

@@ -0,0 +1,189 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "io.h"
+
+extern PhysUart i8250physuart;
+extern PhysUart pciphysuart;
+extern void* i8250alloc(int, int, int);
+
+/*
+ * Configure the n 8250-style ports of PCI device p whose
+ * registers live in I/O BAR barno, 8 I/O ports per uart.
+ * ctlrno only distinguishes the name under which the I/O space
+ * is reserved; freq is the uart clock; name is the prefix of
+ * the uart names.
+ * Returns the head of a ->next-linked list of Uarts using the
+ * i8250 driver, or nil if memory or the I/O space cannot be
+ * obtained or no port could be set up.
+ * NOTE(review): all n ports receive the same name (name.tbdf).
+ */
+static Uart*
+uartpci(int ctlrno, Pcidev* p, int barno, int n, int freq, char* name)
+{
+	int i, io;
+	void *ctlr;
+	char buf[64];
+	Uart *head, *uart;
+
+	/*
+	 * Get the memory before reserving the I/O space so that
+	 * a failure of either step can be undone with free().
+	 * As before, the loop relies on malloc returning zeroed
+	 * memory (e.g. for the last ->next) -- TODO confirm.
+	 */
+	head = uart = malloc(sizeof(Uart)*n);
+	if(head == nil){
+		print("uartpci: no memory for %d uarts\n", n);
+		return nil;
+	}
+
+	io = p->mem[barno].bar & ~0x01;
+	snprint(buf, sizeof(buf), "%s%d", pciphysuart.name, ctlrno);
+	if(ioalloc(io, p->mem[barno].size, 0, buf) < 0){
+		print("uartpci: I/O 0x%uX in use\n", io);
+		free(head);
+		return nil;
+	}
+
+	for(i = 0; i < n; i++){
+		ctlr = i8250alloc(io, p->intl, p->tbdf);
+		io += 8;
+		if(ctlr == nil)
+			continue;
+
+		uart->regs = ctlr;
+		snprint(buf, sizeof(buf), "%s.%8.8uX", name, p->tbdf);
+		kstrdup(&uart->name, buf);
+		uart->freq = freq;
+		uart->phys = &i8250physuart;
+		if(uart != head)
+			(uart-1)->next = uart;
+		uart++;
+	}
+
+	/*
+	 * No port could be set up: don't hand the caller an empty
+	 * Uart with nil regs and phys.  The I/O space stays reserved.
+	 */
+	if(uart == head){
+		free(head);
+		return nil;
+	}
+
+	return head;
+}
+
+/*
+ * Find the supported PCI serial cards and return a single
+ * ->next-linked list of their Uarts (nil if none are found).
+ * Note that every `continue' below, including those inside the
+ * nested switches, moves on to the next PCI device.
+ */
+static Uart*
+uartpcipnp(void)
+{
+	Pcidev *p;
+	char *name;
+	int ctlrno, n, subid;
+	Uart *head, *tail, *uart;
+
+	/*
+	 * Loop through all PCI devices looking for simple serial
+	 * controllers (ccrb == 0x07) and configure the ones which
+	 * are familiar. All suitable devices are configured to
+	 * simply point to the generic i8250 driver.
+	 */
+	head = tail = nil;
+	ctlrno = 0;
+	for(p = pcimatch(nil, 0, 0); p != nil; p = pcimatch(p, 0, 0)){
+		if(p->ccrb != 0x07 || p->ccru > 2)
+			continue;
+
+		/* the key is the device id in the high half, vendor id in the low */
+		switch((p->did<<16)|p->vid){
+		default:
+			continue;
+		case (0x9835<<16)|0x9710:	/* StarTech PCI2S550 */
+			/* two ports, one per BAR; the second may be nil */
+			uart = uartpci(ctlrno, p, 0, 1, 1843200, "PCI2S550-0");
+			if(uart == nil)
+				continue;
+			uart->next = uartpci(ctlrno, p, 1, 1, 1843200, "PCI2S550-1");
+			break;
+		case (0x950A<<16)|0x1415:	/* Oxford Semi OX16PCI954 */
+			/*
+			 * These are common devices used by 3rd-party
+			 * manufacturers.
+			 * Must check the subsystem VID and DID for correct
+			 * match, mostly to get the clock frequency right.
+			 */
+			subid = pcicfgr16(p, PciSVID);
+			subid |= pcicfgr16(p, PciSID)<<16;
+			switch(subid){
+			default:
+				continue;
+			case (0x2000<<16)|0x131F:/* SIIG CyberSerial PCIe */
+				uart = uartpci(ctlrno, p, 0, 1, 18432000, "CyberSerial-1S");
+				if(uart == nil)
+					continue;
+				break;
+			}
+			break;
+		case (0x9501<<16)|0x1415:	/* Oxford Semi OX16PCI954 */
+			/*
+			 * These are common devices used by 3rd-party
+			 * manufacturers.
+			 * Should check the subsystem VID and DID for correct
+			 * match, mostly to get the clock frequency right.
+			 */
+			subid = pcicfgr16(p, PciSVID);
+			subid |= pcicfgr16(p, PciSID)<<16;
+			switch(subid){
+			default:
+				continue;
+			case (0<<16)|0x1415:	/* StarTech PCI4S550 */
+				/* NOTE(review): only one port is configured here */
+				uart = uartpci(ctlrno, p, 0, 1, 18432000, "PCI4S550-0");
+				if(uart == nil)
+					continue;
+				break;
+			}
+			break;
+		case (0x9050<<16)|0x10B5:	/* Perle PCI-Fast4 series */
+		case (0x9030<<16)|0x10B5:	/* Perle Ultraport series */
+			/*
+			 * These devices consists of a PLX bridge (the above
+			 * PCI VID+DID) behind which are some 16C654 UARTs.
+			 * Must check the subsystem VID and DID for correct
+			 * match.
+			 */
+			subid = pcicfgr16(p, PciSVID);
+			subid |= pcicfgr16(p, PciSID)<<16;
+			switch(subid){
+			default:
+				continue;
+			case (0x0011<<16)|0x12E0:	/* Perle PCI-Fast16 */
+				n = 16;
+				name = "PCI-Fast16";
+				break;
+			case (0x0021<<16)|0x12E0:	/* Perle PCI-Fast8 */
+				n = 8;
+				name = "PCI-Fast8";
+				break;
+			case (0x0031<<16)|0x12E0:	/* Perle PCI-Fast4 */
+				n = 4;
+				name = "PCI-Fast4";
+				break;
+			case (0x0021<<16)|0x155F:	/* Perle Ultraport8 */
+				n = 8;
+				name = "Ultraport8";	/* 16C754 UARTs */
+				break;
+			}
+			/* all n ports sit behind BAR 2 */
+			uart = uartpci(ctlrno, p, 2, n, 7372800, name);
+			if(uart == nil)
+				continue;
+			break;
+		}
+
+		/* append this card's uarts and move tail to the new end of the list */
+		if(head != nil)
+			tail->next = uart;
+		else
+			head = uart;
+		for(tail = uart; tail->next != nil; tail = tail->next)
+			;
+		ctlrno++;
+	}
+
+	return head;
+}
+
+/*
+ * Only .name and .pnp are used here: uartpci() points every uart
+ * it configures at i8250physuart, so the remaining operations
+ * are left nil.
+ */
+PhysUart pciphysuart = {
+	.name		= "UartPCI",
+	.pnp		= uartpcipnp,
+	.enable		= nil,
+	.disable	= nil,
+	.kick		= nil,
+	.dobreak	= nil,
+	.baud		= nil,
+	.bits		= nil,
+	.stop		= nil,
+	.parity		= nil,
+	.modemctl	= nil,
+	.rts		= nil,
+	.dtr		= nil,
+	.status		= nil,
+	.fifo		= nil,
+};

+ 22 - 0
sys/src/9/bench/1/kern

@@ -0,0 +1,22 @@
+#!/bin/rc
+# Build and install the kernel used by this bench.
+# Saves the current pxe file as pxeorig, installs this bench's
+# pxe file, then runs mk clean and mk install in the kernel directory.
+
+rfork ne
+
+# import rc functions popular among scripts, e.g. fail
+#
+. ../tools
+
+# Override the source used for this benchmark with a local copy.
+# NOTE(review): the original wording reads like an example, but the
+# bind below is live -- confirm proc.c is meant to exist here.
+bind proc.c ../../port/proc.c
+
+# keep the original pxe file (runbench copies it back) and install ours
+cp /cfg/pxe/003048ff2106 pxeorig
+cp 003048ff2106 /cfg/pxe
+
+# we might change the std source like this:
+# sed 's/initialTCs = [0-9]+/initialTCs = 16/' <  ../../k10/main.c >main.c
+# bind main.c ../../k10/main.c
+
+cd /sys/src/nix/k10
+mk clean
+mk install

+ 20 - 0
sys/src/9/bench/1/output

@@ -0,0 +1,20 @@
+# sleep 2
+0.00u 0.02s 2.02r 	 rc -c sleep 2
+0.00u 0.03s 2.02r 	 rc -c sleep 2
+0.00u 0.02s 2.02r 	 rc -c sleep 2
+0.00u 0.01s 2.03r 	 rc -c sleep 2
+0.00u 0.03s 2.02r 	 rc -c sleep 2
+0.00u 0.02s 2.02r 	 rc -c sleep 2
+0.00u 0.02s 2.02r 	 rc -c sleep 2
+0.00u 0.04s 2.02r 	 rc -c sleep 2
+0.00u 0.02s 2.02r 	 rc -c sleep 2
+0.00u 0.04s 2.03r 	 rc -c sleep 2
+times 0 0.025 2.022
+#cat /dev/debug
+steal 1
+donate 0
+locks 965080
+glare 6450
+inglare 8840
+qlock 73101
+qlockq 66

+ 26 - 0
sys/src/9/bench/1/runbench

@@ -0,0 +1,26 @@
+#!/bin/rc
+
+#
+# kernel as of /n/nixdump/2012/0119/sys/src/nix/bench
+# Single sched, 32 TCs.
+# Time to make a kernel
+#
+# Restores the pxe file saved by kern, then measures the time
+# taken to build a kernel using ../Time.
+#
+
+rfork ne
+
+# restore the pxe file we saved
+cp pxeorig /cfg/pxe/003048ff2106
+
+# import rc functions popular among scripts, e.g. fail
+#
+. ../tools
+
+# How much time does it take to make a kernel
+../Time 'cd /sys/src/nix/k10 ; mk clean ; mk'
+
+
+# What's the value for measures taken from the kernel?
+# echo '#cat /dev/debug'
+# cat /dev/debug
+# NB: this is an example. /dev/debug is reported already by Time
+

+ 15 - 0
sys/src/9/bench/Benchs

@@ -0,0 +1,15 @@
+#!/bin/rc
+
+# change the boot sequence to run benchs.
+# Installs runbenchs in /cfg/$sysname and then runs it once right away.
+
+# remove output from all benchs:
+#  rm -f [0-9]*/^(koutput output FAIL KMESG) k[0-9]*/^(koutput output FAIL KMESG)
+
+# remove output from some benchs, to rerun them:
+# for(t in 93 94 95) rm -f $t/^(koutput output FAIL KMESG)
+
+
+# arrange for them to run after rebooting
+cp runbenchs /cfg/$sysname/runbenchs 
+runbenchs
+

+ 29 - 0
sys/src/9/bench/Locks

@@ -0,0 +1,29 @@
+#!/bin/rc
+# Locks cmd
+# Run cmd once to warm up and once more while collecting the
+# statistics served through /dev/wsctl and /dev/wsdata (bound
+# from '#W'), then print the timing, the collected data, and the
+# contents of /dev/debug and /dev/sysstat.
+rfork e
+cmd=$1
+echo '#' $1
+echo measuring $1... >'#c/cons'
+rm -f /tmp/bench.time
+rc -c $cmd >'#c/cons'	# warm cache and be able to see the output of the cmd...
+#
+# collect stats. A single run this time.
+#
+echo >/dev/sysstat
+bind -a '#W' /dev
+echo start >/dev/wsctl
+time rc -c $cmd >/dev/null >>[2]/tmp/bench.time
+echo stop >/dev/wsctl
+cp /dev/wsdata /tmp/wsdata
+cp /dev/sysstat /tmp/sysstat
+tail -1 /tmp/bench.time
+tail -1 /tmp/bench.time >'#c/cons'
+# strip the u, s and r suffixes from the time output and average each column
+sed 's/[sur]//g' </tmp/bench.time |
+	awk 'BEGIN{u=0.0; s=0.0; t=0.0}
+	{u += $1; s += $2; t += $3}
+	END{printf("times %g %g %g\n", u/NR, s/NR, t/NR);}'
+# append the output of src for the second field of each wsdata line
+awk '{printf("%s\t", $0); system("src -n -s " $2 " /amd64/9k8cpu")}' </tmp/wsdata
+rm -f /tmp/bench.time /tmp/wsdata
+cat /dev/debug
+# The label must be quoted: an unquoted # starts an rc comment, which
+# made this echo print an empty line.  This script measures one run.
+echo '#/dev/sysstat for 1 run'
+cat /tmp/sysstat
+exit ''

+ 5 - 0
sys/src/9/bench/Mean

@@ -0,0 +1,5 @@
+#!/bin/rc
+# Mean [file ...]
+# Print the per-column mean of the first three fields of the input
+# lines, after deleting every s, u and r character from them
+# (e.g. the suffixes of time output).
+sed 's/[sur]//g' $* |
+	awk 'BEGIN{u=0.0; s=0.0; t=0.0}
+	{u += $1; s += $2; t += $3}
+	END{printf("times %g %g %g\n", u/NR, s/NR, t/NR);}'

+ 85 - 0
sys/src/9/bench/README

@@ -0,0 +1,85 @@
+This is performance measurement for nix.
+
+Each directory named with a number  represents a single measure to be taken.
+(e.g., 1/ 2/ ...)
+A bench directory may be also named k1, k2, k....
+All k benchs run after all std. benchs. They are meant to install ad-hoc
+kernels for measurement.
+
+The script actually running the benchs is runbenchs, which should be run from
+cpustart during the machine boot process only if it's found at /cfg/$sysname
+That is, include
+	test -x /cfg/$sysname/runbenchs && /cfg/$sysname/runbenchs
+in your  cpustart file.
+
+The runbenchs script is not meant to be started by hand.
+To (re)run all the benchmarks, you should run ./Benchs instead.
+
+See 1/ for a template. Copy it to your own and tweak it at will.
+
+To start benchs, run the script ./Benchs in clu, which would
+change the boot sequence such that the machine starts to run benchs
+(perhaps installing different kernels and rebooting) until all ones
+have been run or one has failed.
+
+Each directory must contain:
+
+- kern:	a script used to compile and install a kernel.
+ if no such file is found, no new kernel is installed. the current one
+ is used. Otherwise, the indicated kernel is installed and the machine
+ reboots using this kernel.
+
+- runbench: a script used to run a bench. This is mandatory.
+
+- whichever other files must be available for the benchs to run.
+
+Benchs generate within each test directory whichever files they want.
+By convention, repeated times measured get into TIMES, and debug counters
+get into COUNTERS. The last line in TIMES reports the average.
+Repeated timing is done by Time. See the bench in ./1 as a template.
+
+
+As a result of running the benchs, and to control when to run them, these
+files are generated:
+- koutput:	a file keeping the output for a kern that did run
+- output:	a file keeping the output for a test that did run,
+		this includes also /dev/debug and /dev/sysstat
+- FAIL:		an empty file, reporting that a test did fail.
+- KMESG:	copy of /dev/kmesg after booting an installed kernel
+
+BEWARE that if you install a kernel for a bench then that kernel is used
+for all following ones.
+As a convention, benchs installing a kernel should be named k0, k1, ...
+test 1 installs the std kernel, so that all benchs use the regular kernel.
+
+The Time script used to measure the mean of repeated runs also gathers the
+output of /dev/debug, and the cumulative value of /dev/sysstat for 10 runs.
+
+To create benchs for the same test using multiple parameter values, create
+different directories for each value. That way we keep the output for each one
+in a different place, under control. For example:
+
+Replicating bench 1 for 1 to 32 cores, the first one is going to be bench #31:
+
+First create and adapt the first benchmark
+mkdir 31
+dircp 1 31
+rm -f 31/^(koutput output FAIL KMESG)
+B 31/kern
+B 31/runbench
+
+Now replicate this one, changing the parameter for each of the new ones:
+for(x in `{seq 31}){ d=`{echo $x + 31|hoc} {mkdir  $d; dircp 31 $d; sed 's/ck 1$/ck '^$x^'/' < 31/003048ff2106 >$d/003048ff2106}}
+
+Then run ./Benchs
+
+for(x in `{seq 30}){ d=`{echo $x + 67|hoc} ; nc=`{echo $x + 2 | hoc}; echo dir  $d  echo $nc cores ;  ;  sed 's/ck 2$/ck '^$nc^'/'  < 67/003048ff2106 >$d/003048ff2106}
+
+Beware that many benchs are made assuming the kernel is implemented in a
+certain way, at least, those depending on a particular kernel.
+That means that, for example, if you clear benchs 1-99, you might have to
+rely on /n/nixdump/2012/0123/sys/src/nix sources; otherwise the kernel might
+not compile, or you might be measuring something else.
+
+In short: feel free to clear only the benchmarks you are responsible for.
+You should know what you are measuring in any case.

+ 23 - 0
sys/src/9/bench/Time

@@ -0,0 +1,23 @@
+#!/bin/rc
+# Time cmd
+# Run cmd once to warm up, then 10 more times under time(1).
+# Prints each timing, the mean of the timings, and the contents
+# of /dev/debug and /dev/sysstat.
+rfork e
+cmd=$1
+echo '#' $1
+echo measuring $1... >'#c/cons'
+rm -f /tmp/bench.time
+rc -c $cmd >'#c/cons'	# warm cache and be able to see the output of the cmd...
+echo >/dev/sysstat
+for(i in `{seq 10}){
+	time rc -c $cmd >/dev/null >>[2]/tmp/bench.time
+	tail -1 /tmp/bench.time
+	tail -1 /tmp/bench.time >'#c/cons'
+}
+cp /dev/sysstat /tmp/sysstat
+# strip the u, s and r suffixes from the time output and average each column
+sed 's/[sur]//g' </tmp/bench.time |
+	awk 'BEGIN{u=0.0; s=0.0; t=0.0}
+	{u += $1; s += $2; t += $3}
+	END{printf("times %g %g %g\n", u/NR, s/NR, t/NR);}'
+rm -f /tmp/bench.time
+cat /dev/debug
+# The label must be quoted: an unquoted # starts an rc comment, which
+# made this echo print an empty line.
+echo '#/dev/sysstat for 10 runs'
+cat /tmp/sysstat
+exit ''

+ 42 - 0
sys/src/9/bench/runbenchs

@@ -0,0 +1,42 @@
+#!/bin/rc
+# Run every bench that has not produced its output yet: first the
+# numbered directories in numeric order, then the k* directories.
+# A bench with an executable kern script first builds and installs
+# that kernel and reboots; this script then runs again and, finding
+# koutput present, goes on to the bench itself.
+# When the loop completes, the installed copy of this script is removed.
+
+echo benchs...
+cd /sys/src/nix/bench
+. tools
+failed=()
+benchs=`{ls -d [0-9]* | sort -n}
+if(test -e k[0-9]*)
+	benchs=($benchs k[0-9]*)
+
+for(t in $benchs){
+	cd $t || fail cannot cd into benchs $t
+	# a bench that failed earlier is only reported, never rerun
+	if(test -e FAIL)
+		failed=($failed $t)
+	if not{
+		# the presence of output marks a bench as already done
+		if(! test -e output && ! test -e FAIL) {
+			echo running bench $t
+			# the presence of koutput marks the kernel as already installed
+			if(test -x kern && ! test -e koutput){
+				echo running kern for bench $t
+				if(! kern >koutput >[2=1]){
+					touch FAIL
+					fail bench $t failed
+				}
+				reboot
+			}
+			if(test -x kern)
+				cp /dev/kmesg KMESG
+			if(! runbench>output >[2=1]){
+				touch FAIL
+				fail bench $t failed
+			}
+			echo bench $t ok
+		}
+	}
+	cd ..
+}
+if(! ~ $#failed 0)
+	echo benchs $failed failed
+if not
+	echo all benchs done
+
+# all done: stop running at boot
+rm /cfg/$sysname/runbenchs

+ 37 - 0
sys/src/9/bench/tools

@@ -0,0 +1,37 @@
+# fail msg...: print the arguments on standard error and exit with status fail
+fn fail  {
+	echo $* >[1=2]
+	exit fail
+}
+
+# log msg...: print the arguments on standard output and on /dev/cons
+fn log {
+	echo $*
+	echo $* >/dev/cons
+}
+
+
+# Rerun the current bench with one more core each time, up to 32.
+# The count NC is the last field of the bootfile line of the pxe
+# file.  At 32, output and KMESG are copied with a .32 suffix and
+# the script exits.  Otherwise the count after -ck in the pxe file
+# is raised by one, output and KMESG are renamed with a .$NC suffix
+# (so the bench counts as not yet run), the pxe file is installed
+# and the machine is rebooted.
+fn repeatforallnumcores {
+	test -e kern || fail no kernel bench
+	test -e 003048ff2106 || fail no pxe file
+	test -e koutput || fail no koutput
+	test -e output || fail no output
+
+	NC=`{grep '^bootfile' 003048ff2106 | awk '{print $NF}'}
+	if(~ $NC 32){
+		cp output output.32
+		cp KMESG KMESG.32
+		exit ''
+	}
+	@{
+		NNC=`{echo $NC + 1|hoc}
+		mv 003048ff2106 003048ff2106_
+		sed 's/-ck .*/-ck '^$NNC^' '^$NNC^'/' < 003048ff2106_ >003048ff2106
+		
+		mv output output.$NC
+		mv KMESG KMESG.$NC
+		cp 003048ff2106 /cfg/pxe
+#		mv koutput koutput.$NC
+		echo reboot to run `{pwd} with $NNC cores...
+		reboot
+	# NOTE(review): the block opened with @{ is closed with @} rather than } -- confirm this is intended
+	@} >'#c/cons' >[2]'#c/cons'
+	status=''
+}

+ 193 - 0
sys/src/9/boot/aux.c

@@ -0,0 +1,193 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+/*
+int
+plumb(char *dir, char *dest, int *efd, char *here)
+{
+	char buf[128];
+	char name[128];
+	int n;
+
+	sprint(name, "%s/clone", dir);
+	efd[0] = open(name, ORDWR);
+	if(efd[0] < 0)
+		return -1;
+	n = read(efd[0], buf, sizeof(buf)-1);
+	if(n < 0){
+		close(efd[0]);
+		return -1;
+	}
+	buf[n] = 0;
+	sprint(name, "%s/%s/data", dir, buf);
+	if(here){
+		sprint(buf, "announce %s", here);
+		if(sendmsg(efd[0], buf) < 0){
+			close(efd[0]);
+			return -1;
+		}
+	}
+	sprint(buf, "connect %s", dest);
+	if(sendmsg(efd[0], buf) < 0){
+		close(efd[0]);
+		return -1;
+	}
+	efd[1] = open(name, ORDWR);
+	if(efd[1] < 0){
+		close(efd[0]);
+		return -1;
+	}
+	return efd[1];
+}
+ */
+
+/*
+ * Write the string msg (without its terminating NUL) to fd.
+ * Returns 0 if all of it was written, -1 otherwise.
+ */
+int
+sendmsg(int fd, char *msg)
+{
+	int len;
+
+	len = strlen(msg);
+	return write(fd, msg, len) == len ? 0 : -1;
+}
+
+/*
+ * Print "boot: s: <error string>" on file descriptor 2, where the
+ * error string is the one fetched with errstr().
+ */
+void
+warning(char *s)
+{
+	char err[ERRMAX];
+
+	err[0] = '\0';
+	errstr(err, sizeof err);
+	fprint(2, "boot: %s: %s\n", s, err);
+}
+
+/*
+ * Print "boot: s: <error string>" on file descriptor 2, as
+ * warning() does, and then terminate with exits(0).
+ */
+void
+fatal(char *s)
+{
+	char err[ERRMAX];
+
+	err[0] = '\0';
+	errstr(err, sizeof err);
+	fprint(2, "boot: %s: %s\n", s, err);
+	exits(0);
+}
+
+/*
+ * Read at most len-1 bytes of file name into buf and NUL-terminate
+ * the result.  buf is set to the empty string first, so it is a
+ * valid (empty) string whenever the call fails.
+ * Returns 0 on success and -1 if the file cannot be opened or
+ * the read fails.
+ */
+int
+readfile(char *name, char *buf, int len)
+{
+	int f, n;
+
+	buf[0] = 0;
+	f = open(name, OREAD);
+	if(f < 0)
+		return -1;
+	n = read(f, buf, len-1);
+	close(f);
+
+	/* a failed read used to be reported as success */
+	if(n < 0)
+		return -1;
+	buf[n] = 0;
+	return 0;
+}
+
+/*
+ * Write the len bytes at buf to the existing file name.
+ * Returns 0 if the file could be opened and all len bytes were
+ * written, -1 otherwise.
+ */
+int
+writefile(char *name, char *buf, int len)
+{
+	int fd, wrote;
+
+	fd = open(name, OWRITE);
+	if(fd < 0)
+		return -1;
+
+	wrote = write(fd, buf, len);
+	close(fd);
+	if(wrote == len)
+		return 0;
+	return -1;
+}
+
+/*
+ * Set environment variable name to val by creating #e/name and
+ * writing val to it.  A failed create is reported on file
+ * descriptor 2; the result of the write is not checked.
+ */
+void
+setenv(char *name, char *val)
+{
+	int fd;
+	char path[64];
+
+	snprint(path, sizeof path, "#e/%s", name);
+	if((fd = create(path, 1, 0666)) < 0){
+		fprint(2, "create %s: %r\n", path);
+		return;
+	}
+	write(fd, val, strlen(val));
+	close(fd);
+}
+
+/*
+ * Post file descriptor fd in #s under the last path element of
+ * name, by writing the descriptor number in decimal to the newly
+ * created file.  Any failure is fatal.
+ */
+void
+srvcreate(char *name, int fd)
+{
+	char *base;
+	int sfd, len;
+	char buf[64];
+
+	/* keep only what follows the last '/', if there is one */
+	base = strrchr(name, '/');
+	if(base == nil)
+		base = name;
+	else
+		base++;
+
+	snprint(buf, sizeof buf, "#s/%s", base);
+	sfd = create(buf, 1, 0666);
+	if(sfd < 0)
+		fatal(buf);
+
+	sprint(buf, "%d", fd);
+	len = strlen(buf);
+	if(write(sfd, buf, len) != len)
+		fatal("write");
+	close(sfd);
+}
+
+/*
+ * Note handler installed by outin() around its prompt.
+ * An "alarm" note is answered with noted(NCONT); any other note
+ * is answered with noted(NDFLT).
+ */
+void
+catchint(void *a, char *note)
+{
+	USED(a);
+	if(strcmp(note, "alarm") == 0)
+		noted(NCONT);
+	/* presumably reached only for other notes -- noted(NCONT) is not expected to return */
+	noted(NDFLT);
+}
+
+/*
+ * Prompt on standard output, showing def as the default answer,
+ * and read a reply from file descriptor 0.
+ * len is the size of def; at most sizeof(line)-1 bytes are read.
+ * If more than one byte is read, the last byte read is dropped
+ * and the rest replaces def; otherwise def is left unchanged.
+ * When cpuflag is set a 15 second alarm bounds the wait.
+ * Returns the number of bytes read, or 1 if the read failed.
+ */
+int
+outin(char *prompt, char *def, int len)
+{
+	int nr;
+	char line[256];
+
+	if(len >= sizeof line)
+		len = sizeof(line)-1;
+
+	if(cpuflag){
+		notify(catchint);
+		alarm(15*1000);
+	}
+	print("%s[%s]: ", prompt, def[0] != '\0' ? def : "no default");
+	memset(line, 0, sizeof line);
+	nr = read(0, line, len);
+	if(cpuflag){
+		alarm(0);
+		notify(0);
+	}
+
+	if(nr < 0){
+		print("\n");
+		return 1;
+	}
+	if(nr > 1){
+		line[nr-1] = 0;
+		strcpy(def, line);
+	}
+	return nr;
+}

+ 356 - 0
sys/src/9/boot/boot.c

@@ -0,0 +1,356 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include "../boot/boot.h"
+
+char	cputype[64];
+char	sys[2*64];
+char 	reply[256];
+int	printcol;
+int	mflag;
+int	fflag;
+int	kflag;
+
+char	*bargv[Nbarg];
+int	bargc;
+
+static void	swapproc(void);
+static Method	*rootserver(char*);
+static void	usbinit(void);
+static void	kbmap(void);
+
+/*
+ * Entry point of the boot program.
+ * Sets up a minimal name space and the console, chooses and
+ * configures a boot method, starts authentication, connects to
+ * the root file server, mounts it on /root and binds the root
+ * directory onto /, sets the time, and finally execs init (the
+ * value of the init variable, or /<cputype>/init by default).
+ * Does not return: every unrecoverable error ends in fatal().
+ */
+void
+boot(int argc, char *argv[])
+{
+	int fd, afd;
+	Method *mp;
+	char *cmd, cmdbuf[64], *iargv[16];
+	char rootbuf[64];
+	int islocal, ishybrid;
+	char *rp, *rsp;
+	int iargc, n;
+	char buf[32];
+	AuthInfo *ai;
+
+	fmtinstall('r', errfmt);
+
+	/* the three opens of /dev/cons are meant to become fds 0, 1 and 2 */
+	bind("#c", "/dev", MBEFORE);
+	open("/dev/cons", OREAD);
+	open("/dev/cons", OWRITE);
+	open("/dev/cons", OWRITE);
+	/*
+	 * init will reinitialize its namespace.
+	 * #ec gets us plan9.ini settings (*var variables).
+	 */
+	bind("#ec", "/env", MREPL);
+	bind("#e", "/env", MBEFORE|MCREATE);
+	bind("#s", "/srv", MREPL|MCREATE);
+#ifdef DEBUG
+	print("argc=%d\n", argc);
+	for(fd = 0; fd < argc; fd++)
+		print("%#p %s ", argv[fd], argv[fd]);
+	print("\n");
+#endif DEBUG
+
+	ARGBEGIN{
+	case 'k':
+		kflag = 1;
+		break;
+	case 'm':
+		mflag = 1;
+		break;
+	case 'f':
+		fflag = 1;
+		break;
+	}ARGEND
+
+	readfile("#e/cputype", cputype, sizeof(cputype));
+
+	/*
+	 *  set up usb keyboard, mouse and disk, if any.
+	 */
+	usbinit();
+
+	/*
+	 *  pick a method and initialize it
+	 */
+	if(method[0].name == nil)
+		fatal("no boot methods");
+	mp = rootserver(argc ? *argv : 0);
+	(*mp->config)(mp);
+	islocal = strcmp(mp->name, "local") == 0;
+	ishybrid = strcmp(mp->name, "hybrid") == 0;
+
+	/*
+	 *  load keymap if it is there.
+	 */
+	kbmap();
+
+	/*
+	 *  authentication agent
+	 */
+	authentication(cpuflag);
+
+print("connect...");
+	/*
+	 *  connect to the root file system
+	 */
+	fd = (*mp->connect)();
+	if(fd < 0)
+		fatal("can't connect to file server");
+	if(getenv("srvold9p"))
+		fd = old9p(fd);
+	if(!islocal && !ishybrid){
+		if(cfs)
+			fd = (*cfs)(fd);
+	}
+print("\n");
+	print("version...");
+	buf[0] = '\0';
+	n = fversion(fd, 0, buf, sizeof buf);
+	if(n < 0)
+		fatal("can't init 9P");
+	srvcreate("boot", fd);
+
+	/*
+	 *  create the name space, mount the root fs
+	 */
+	if(bind("/", "/", MREPL) < 0)
+		fatal("bind /");
+	rp = getenv("rootspec");
+	if(rp == nil)
+		rp = "";
+
+	/* authentication is optional: a failure here does not stop the mount */
+	afd = fauth(fd, rp);
+	if(afd >= 0){
+		ai = auth_proxy(afd, auth_getkey, "proto=p9any role=client");
+		if(ai == nil)
+			print("authentication failed (%r), trying mount anyways\n");
+	}
+	if(mount(fd, afd, "/root", MREPL|MCREATE, rp) < 0)
+		fatal("mount /");
+	rsp = rp;
+	rp = getenv("rootdir");
+	if(rp == nil)
+		rp = rootdir;
+	if(bind(rp, "/", MAFTER|MCREATE) < 0){
+		if(strncmp(rp, "/root", 5) == 0){
+			fprint(2, "boot: couldn't bind $rootdir=%s to root: %r\n", rp);
+			fatal("second bind /");
+		}
+		/* rootdir did not start with /root: retry relative to /root */
+		snprint(rootbuf, sizeof rootbuf, "/root/%s", rp);
+		rp = rootbuf;
+		if(bind(rp, "/", MAFTER|MCREATE) < 0){
+			fprint(2, "boot: couldn't bind $rootdir=%s to root: %r\n", rp);
+			if(strcmp(rootbuf, "/root//plan9") == 0){
+				fprint(2, "**** warning: remove rootdir=/plan9 entry from plan9.ini\n");
+				rp = "/root";
+				if(bind(rp, "/", MAFTER|MCREATE) < 0)
+					fatal("second bind /");
+			}else
+				fatal("second bind /");
+		}
+	}
+	close(fd);
+	setenv("rootdir", rp);
+
+	settime(islocal, afd, rsp);
+	/* any valid descriptor is >= 0 */
+	if(afd >= 0)
+		close(afd);
+
+	cmd = getenv("init");
+	if(cmd == nil){
+		/*
+		 * cputype can hold up to 63 characters, which would not
+		 * fit in cmdbuf together with the rest of the command,
+		 * so the formatting must be bounded.
+		 */
+		snprint(cmdbuf, sizeof cmdbuf, "/%s/init -%s%s", cputype,
+			cpuflag ? "c" : "t", mflag ? "m" : "");
+		cmd = cmdbuf;
+	}
+	iargc = tokenize(cmd, iargv, nelem(iargv)-1);
+	/* an empty command would leave iargv[0] uninitialised */
+	if(iargc < 1)
+		fatal("empty init command");
+	cmd = iargv[0];
+
+	/* make iargv[0] basename(iargv[0]) */
+	if(iargv[0] = strrchr(iargv[0], '/'))
+		iargv[0]++;
+	else
+		iargv[0] = cmd;
+
+	iargv[iargc] = nil;
+
+	exec(cmd, iargv);
+	fatal(cmd);
+}
+
+/*
+ * Look up the boot method named by a.
+ * Only the part of a before the first '!' is considered, and the
+ * comparison covers just the shorter of that part and the method
+ * name, so either may be a prefix of the other.
+ * Returns the first matching entry of method[], or nil if a is
+ * empty or nothing matches.
+ */
+static Method*
+findmethod(char *a)
+{
+	Method *m;
+	int alen, n;
+	char *bang;
+
+	alen = strlen(a);
+	if(alen == 0)
+		return nil;
+	bang = strchr(a, '!');
+	if(bang != nil)
+		alen = bang - a;
+
+	for(m = method; m->name != nil; m++){
+		n = strlen(m->name);
+		if(n > alen)
+			n = alen;
+		if(strncmp(a, m->name, n) == 0)
+			return m;
+	}
+	return nil;
+}
+
+/*
+ *  ask user from whence cometh the root file system
+ *
+ *  The answer is taken from the nobootprompt variable if it names
+ *  a known method; otherwise the user is prompted, with bootargs
+ *  (or arg, or the first method) offered as the default, until a
+ *  known method is given.
+ *  On return reply has been split into bargv/bargc and the text
+ *  after the first '!', if any, has been copied to sys.
+ *  All string copies are bounded by the size of their destination
+ *  (reply is larger than sys, and arg is of unknown length).
+ */
+static Method*
+rootserver(char *arg)
+{
+	char prompt[256];
+	Method *mp;
+	char *cp;
+	int n;
+
+	/* look for required reply */
+	readfile("#e/nobootprompt", reply, sizeof(reply));
+	if(reply[0]){
+		mp = findmethod(reply);
+		if(mp)
+			goto HaveMethod;
+		print("boot method %s not found\n", reply);
+		reply[0] = 0;
+	}
+
+	/* make list of methods */
+	mp = method;
+	n = snprint(prompt, sizeof(prompt), "root is from (%s", mp->name);
+	for(mp++; mp->name; mp++)
+		n += snprint(prompt+n, sizeof(prompt)-n, ", %s", mp->name);
+	snprint(prompt+n, sizeof(prompt)-n, ")");
+
+	/* create default reply */
+	readfile("#e/bootargs", reply, sizeof(reply));
+	if(reply[0] == 0 && arg != 0)
+		snprint(reply, sizeof(reply), "%s", arg);
+	if(reply[0]){
+		mp = findmethod(reply);
+		if(mp == 0)
+			reply[0] = 0;
+	}
+	if(reply[0] == 0)
+		snprint(reply, sizeof(reply), "%s", method->name);
+
+	/* parse replies */
+	do{
+		outin(prompt, reply, sizeof(reply));
+		mp = findmethod(reply);
+	}while(mp == nil);
+
+HaveMethod:
+	bargc = tokenize(reply, bargv, Nbarg-2);
+	bargv[bargc] = nil;
+	cp = strchr(reply, '!');
+	if(cp)
+		snprint(sys, sizeof(sys), "%s", cp+1);
+	return mp;
+}
+
+/*
+ * Ask the kernel to start swapping by writing "start" to #c/swap.
+ * Failures only produce a warning.
+ */
+static void
+swapproc(void)
+{
+	int sfd;
+
+	if((sfd = open("#c/swap", OWRITE)) < 0){
+		warning("opening #c/swap");
+		return;
+	}
+	if(write(sfd, "start", 5) <= 0)
+		warning("starting swap kproc");
+	close(sfd);
+}
+
+/*
+ * Put /srvold9p between us and the file server connection fd.
+ * A child is forked with fd as its standard output and one end of
+ * a new pipe as its standard input, and execs "/srvold9p -s".
+ * The parent closes fd and returns the other end of the pipe for
+ * the caller to use in place of fd.
+ * Failure to create the pipe, fork or exec is fatal.
+ */
+int
+old9p(int fd)
+{
+	int p[2];
+
+	if(pipe(p) < 0)
+		fatal("pipe");
+
+	print("srvold9p...");
+	switch(fork()) {
+	case -1:
+		fatal("rfork srvold9p");
+	case 0:
+		/* child: server connection on fd 1, our pipe on fd 0 */
+		dup(fd, 1);
+		close(fd);
+		dup(p[0], 0);
+		close(p[0]);
+		close(p[1]);
+		execl("/srvold9p", "srvold9p", "-s", 0);
+		fatal("exec srvold9p");
+	default:
+		/* parent: keep only p[1] */
+		close(fd);
+		close(p[0]);
+	}
+	return p[1];
+}
+
+/*
+ * Start /boot/usbd, but only if #u/usb/ctl is accessible, #u can
+ * be bound after /dev, and the usbd binary exists.
+ */
+static void
+usbinit(void)
+{
+	static char usbd[] = "/boot/usbd";
+
+	if(access("#u/usb/ctl", 0) < 0)
+		return;
+	if(bind("#u", "/dev", MAFTER) < 0)
+		return;
+	if(access(usbd, AEXIST) < 0)
+		return;
+	run(usbd, nil);
+}
+
+/*
+ * Load a keyboard map: if the kbmap variable is set, bind #κ
+ * after /dev and copy the file it names to /dev/kbmap.
+ * Every failure produces a warning and abandons the load.
+ */
+static void
+kbmap(void)
+{
+	char *file;
+	int nr, src, dst;
+	char buf[1024];
+
+	file = getenv("kbmap");
+	if(file == nil)
+		return;
+	if(bind("#κ", "/dev", MAFTER) < 0){
+		warning("can't bind #κ");
+		return;
+	}
+
+	src = open(file, OREAD);
+	if(src < 0){
+		warning("can't open kbd map");
+		return;
+	}
+	dst = open("/dev/kbmap", OWRITE);
+	if(dst < 0) {
+		warning("can't open /dev/kbmap");
+		close(src);
+		return;
+	}
+
+	/* copy until end of file, a read error or a short write */
+	for(;;){
+		nr = read(src, buf, sizeof(buf));
+		if(nr <= 0)
+			break;
+		if(write(dst, buf, nr) != nr){
+			warning("write to /dev/kbmap failed");
+			break;
+		}
+	}
+	close(src);
+	close(dst);
+}

+ 81 - 0
sys/src/9/boot/boot.h

@@ -0,0 +1,81 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * One way of getting at the root file system.  The table method[]
+ * holds these.
+ */
+typedef struct Method	Method;
+struct Method
+{
+	char	*name;			/* name shown in, and matched against, the boot prompt */
+	void	(*config)(Method*);	/* set-up hook, e.g. configtcp or configlocal */
+	int	(*connect)(void);	/* connect hook, e.g. connecttcp; these return an fd, or -1 on failure */
+	char	*arg;			/* optional default; configlocal and configembed fall back to it */
+};
+enum
+{
+	Statsz=	256,	/* size in bytes of the shared statbuf */
+	Nbarg=	16,	/* number of slots in bargv */
+};
+
+extern void	authentication(int);
+extern char*	bootdisk;	/* disk name used as a fallback by the local method and by cache() */
+extern char*	rootdir;
+extern int	(*cfs)(int);
+extern int	cpuflag;
+extern char	cputype[];
+extern int	fflag;	/* when set, cache() starts cfs with -rs instead of -s */
+extern int	kflag;
+extern Method	method[];
+extern void	(*pword)(int, Method*);
+extern char	sys[];	/* receives the text following '!' in the boot method reply */
+extern uchar	hostkey[];
+extern uchar	statbuf[Statsz];	/* scratch buffer for stat(); defined in bootcache.c */
+extern int	bargc;	/* number of tokens in the boot method reply */
+extern char	*bargv[Nbarg];	/* those tokens, nil terminated */
+
+/* libc equivalent */
+extern int	cache(int);
+extern char*	checkkey(Method*, char*, char*);
+extern void	fatal(char*);
+extern void	getpasswd(char*, int);
+extern void	key(int, Method*);
+extern int	outin(char*, char*, int);
+extern int	plumb(char*, char*, int*, char*);
+extern int	readfile(char*, char*, int);
+extern long	readn(int, void*, long);
+extern void	run(char *file, ...);
+extern int	sendmsg(int, char*);
+extern void	setenv(char*, char*);
+extern void	settime(int, int, char*);
+extern void	srvcreate(char*, int);
+extern void	warning(char*);
+extern int	writefile(char*, char*, int);
+extern void	boot(int, char **);
+extern void	doauthenticate(int, Method*);
+extern int		old9p(int);
+extern int	parsefields(char*, char**, int, char*);
+
+/* methods */
+extern void	configtcp(Method*);
+extern int	connecttcp(void);
+
+extern void	configlocal(Method*);
+extern int	connectlocal(void);
+
+extern void	configsac(Method*);
+extern int	connectsac(void);
+
+extern void	configpaq(Method*);
+extern int	connectpaq(void);
+
+extern void	configembed(Method*);
+extern int	connectembed(void);
+
+extern void	configip(int, char**, int);
+
+/* hack for passing authentication address */
+extern char	*authaddr;

+ 82 - 0
sys/src/9/boot/bootauth.c

@@ -0,0 +1,82 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include "../boot/boot.h"
+
+char	*authaddr;
+static void glenda(void);
+
+/*
+ * Start the authentication agent /boot/factotum and wait until
+ * /mnt/factotum is accessible.  If /boot/factotum is not executable,
+ * fall back to glenda() instead.
+ *
+ * cpuflag selects factotum's -S option; otherwise -u is passed.
+ * -p is added when the environment variable debugfactotum is set,
+ * and -a authaddr when authaddr is not nil.
+ */
+void
+authentication(int cpuflag)
+{
+	char *args[16];
+	int n;
+
+	if(access("/boot/factotum", AEXEC) < 0){
+		glenda();
+		return;
+	}
+
+	/* build the agent's argument vector */
+	n = 0;
+	args[n++] = "factotum";
+	if(getenv("debugfactotum"))
+		args[n++] = "-p";
+	/* add "-d" here for debug traces or "-D" for 9p messages */
+	args[n++] = cpuflag ? "-S" : "-u";
+	args[n++] = "-sfactotum";
+	if(authaddr != nil){
+		args[n++] = "-a";
+		args[n++] = authaddr;
+	}
+	args[n] = 0;
+
+	switch(fork()){
+	case -1:
+		fatal("starting factotum");
+	case 0:
+		exec("/boot/factotum", args);
+		fatal("execing /boot/factotum");
+	default:
+		break;
+	}
+
+	/* wait for agent to really be there */
+	while(access("/mnt/factotum", 0) < 0)
+		sleep(250);
+}
+
+/*
+ * Fallback used when there is no factotum: write the user name
+ * ($user, or "glenda" when unset) to #c/hostowner.  A failed open is
+ * ignored; a short or failed write is reported on fd 2.
+ */
+static void
+glenda(void)
+{
+	int fd, len;
+	char *user;
+
+	user = getenv("user");
+	if(user == nil)
+		user = "glenda";
+
+	fd = open("#c/hostowner", OWRITE);
+	if(fd < 0)
+		return;
+	len = strlen(user);
+	if(write(fd, user, len) != len)
+		fprint(2, "setting #c/hostowner to %s: %r\n", user);
+	close(fd);
+}

+ 89 - 0
sys/src/9/boot/bootcache.c

@@ -0,0 +1,89 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+uchar statbuf[Statsz];
+
+/*
+ * Optionally interpose the caching file system /boot/cfs on the file
+ * server connection fd.  Returns fd unchanged when /boot/cfs is
+ * absent, when #e/cfs contains "off", or when no cache partition can
+ * be found.  Otherwise it forks cfs with fd as standard input and
+ * returns the pipe end that replaces the connection.
+ *
+ * The cache partition is the last existing file named in #e/cfs;
+ * else "<disk>cache", where <disk> is #e/bootdisk up to any ':' with
+ * a trailing "disk", "fs" or "fossil" removed; else
+ * "<bootdisk>cache".
+ */
+int
+cache(int fd)
+{
+	int argc, i, p[2];
+	char *argv[5], bd[32], buf[256], partition[64], *pp;
+
+	if(stat("/boot/cfs", statbuf, sizeof statbuf) < 0)
+		return fd;
+
+	*partition = 0;
+
+	bind("#S", "/dev", MAFTER);
+	buf[0] = 0;	/* in case readfile leaves buf untouched on failure */
+	readfile("#e/cfs", buf, sizeof(buf));
+	if(*buf){
+		argc = tokenize(buf, argv, 4);
+		for(i = 0; i < argc; i++){
+			if(strcmp(argv[i], "off") == 0)
+				return fd;
+			else if(stat(argv[i], statbuf, sizeof statbuf) >= 0){
+				strncpy(partition, argv[i], sizeof(partition)-1);
+				partition[sizeof(partition)-1] = 0;
+			}
+		}
+	}
+
+	if(*partition == 0){
+		bd[0] = 0;
+		readfile("#e/bootdisk", bd, sizeof(bd));
+		if(*bd){
+			pp = strchr(bd, ':');
+			if(pp != nil)
+				*pp = 0;
+			/*
+			 * strip a known partition suffix; the length checks
+			 * keep the comparisons inside bd for short names
+			 */
+			i = strlen(bd);
+			if(i >= 4 && strcmp("disk", &bd[i-4]) == 0)
+				bd[i-4] = 0;
+			else if(i >= 2 && strcmp("fs", &bd[i-2]) == 0)
+				bd[i-2] = 0;
+			else if(i >= 6 && strcmp("fossil", &bd[i-6]) == 0)
+				bd[i-6] = 0;
+			snprint(partition, sizeof partition, "%scache", bd);
+			if(stat(partition, statbuf, sizeof statbuf) < 0)
+				*bd = 0;
+		}
+		if(*bd == 0){
+			/* bootdisk has no length limit; don't overrun partition */
+			snprint(partition, sizeof partition, "%scache", bootdisk);
+			if(stat(partition, statbuf, sizeof statbuf) < 0)
+				return fd;
+		}
+	}
+
+	print("cfs...");
+	if(pipe(p)<0)
+		fatal("pipe");
+	switch(fork()){
+	case -1:
+		fatal("fork");
+	case 0:
+		close(p[1]);
+		dup(fd, 0);
+		close(fd);
+		dup(p[0], 1);
+		close(p[0]);
+		if(fflag)
+			execl("/boot/cfs", "bootcfs", "-rs", "-f", partition, 0);
+		else
+			execl("/boot/cfs", "bootcfs", "-s", "-f", partition, 0);
+		/* the child must not carry on booting with a closed fd */
+		fatal("can't exec cfs");
+	default:
+		close(p[0]);
+		close(fd);
+		fd = p[1];
+		break;
+	}
+	return fd;
+}

+ 213 - 0
sys/src/9/boot/bootip.c

@@ -0,0 +1,213 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <ip.h>
+
+#include "boot.h"
+
+static	uchar	fsip[IPaddrlen];
+	uchar	auip[IPaddrlen];
+static	char	mpoint[32];
+
+static int isvalidip(uchar*);
+static void netndb(char*, uchar*);
+static void netenv(char*, uchar*);
+
+
+/*
+ * Configure IP by running /boot/ipconfig with the given arguments
+ * and, when needfs is set, make sure the file server and
+ * authentication server addresses (fsip, auip) are known.
+ *
+ * bargv is copied before the options are scanned.  A -x option
+ * selects the mount point /net<arg>; -g, -b, -h and -m take an
+ * argument that is skipped here.  #I is bound at the mount point,
+ * as is each of #l0..#l3 that is accessible.
+ * The addresses are looked up in <mpoint>/ndb, then in the
+ * environment (#e/fs, #e/auth), and finally the user is asked until
+ * a valid one is entered.
+ */
+void
+configip(int bargc, char **bargv, int needfs)
+{
+	Waitmsg *w;
+	int argc, pid;
+	char **arg, **argv, buf[32], *p;
+
+	fmtinstall('I', eipfmt);
+	fmtinstall('M', eipfmt);
+	fmtinstall('E', eipfmt);
+
+	arg = malloc((bargc+1) * sizeof(char*));
+	if(arg == nil)
+		fatal("malloc");
+	memmove(arg, bargv, bargc * sizeof(char*));
+	arg[bargc] = 0;
+
+	print("ipconfig...");
+	argc = bargc;
+	argv = arg;
+	strcpy(mpoint, "/net");
+	ARGBEGIN {
+	case 'x':
+		p = ARGF();
+		if(p != nil)
+			snprint(mpoint, sizeof(mpoint), "/net%s", p);
+		break;
+	case 'g':
+	case 'b':
+	case 'h':
+	case 'm':
+		p = ARGF();
+		USED(p);
+		break;
+	} ARGEND;
+
+	/* bind in an ip interface */
+	if(bind("#I", mpoint, MAFTER) < 0)
+		fatal("bind #I\n");
+	if(access("#l0", 0) == 0 && bind("#l0", mpoint, MAFTER) < 0)
+		print("bind #l0: %r\n");
+	if(access("#l1", 0) == 0 && bind("#l1", mpoint, MAFTER) < 0)
+		print("bind #l1: %r\n");
+	if(access("#l2", 0) == 0 && bind("#l2", mpoint, MAFTER) < 0)
+		print("bind #l2: %r\n");
+	if(access("#l3", 0) == 0 && bind("#l3", mpoint, MAFTER) < 0)
+		print("bind #l3: %r\n");
+	werrstr("");
+
+	/* let ipconfig configure the ip interface */
+	switch(pid = fork()){
+	case -1:
+		fatal("fork configuring ip");
+	case 0:
+		/* the child gets the unscanned copy, options included */
+		exec("/boot/ipconfig", arg);
+		fatal("execing /boot/ipconfig");
+	default:
+		break;
+	}
+
+	/* wait for ipconfig to finish */
+	for(;;){
+		w = wait();
+		if(w != nil && w->pid == pid){
+			if(w->msg[0] != 0)
+				fatal(w->msg);
+			free(w);
+			break;
+		} else if(w == nil)
+			fatal("configuring ip");
+		free(w);
+	}
+
+	/* argv pointed into arg and is not used past this point */
+	free(arg);
+
+	if(!needfs)
+		return;
+
+	/* if we didn't get a file and auth server, query user */
+	netndb("fs", fsip);
+	if(!isvalidip(fsip))
+		netenv("fs", fsip);
+	while(!isvalidip(fsip)){
+		buf[0] = 0;
+		outin("filesystem IP address", buf, sizeof(buf));
+		if (parseip(fsip, buf) == -1)
+			fprint(2, "configip: can't parse fs ip %s\n", buf);
+	}
+
+	netndb("auth", auip);
+	if(!isvalidip(auip))
+		netenv("auth", auip);
+	while(!isvalidip(auip)){
+		buf[0] = 0;
+		outin("authentication server IP address", buf, sizeof(buf));
+		if (parseip(auip, buf) == -1)
+			fprint(2, "configip: can't parse auth ip %s\n", buf);
+	}
+}
+
+/*
+ * Set the global authaddr to a strdup'd "proto!auip!port" string.
+ */
+static void
+setauthaddr(char *proto, int port)
+{
+	char addr[128];
+
+	snprint(addr, sizeof addr, "%s!%I!%d", proto, auip, port);
+	authaddr = strdup(addr);
+}
+
+/*
+ * Config hook of the tcp method: bring up IP using the boot
+ * arguments, requiring file and auth server addresses, then record
+ * the auth server address as tcp!<auip>!567.  The Method argument is
+ * unused.
+ */
+void
+configtcp(Method*)
+{
+	configip(bargc, bargv, 1);
+	setauthaddr("tcp", 567);
+}
+
+/*
+ * Connect hook of the tcp method: dial tcp!<fsip>!564.
+ * Returns the fd from dial; on failure the error string is prefixed
+ * with the address dialed and the negative fd is returned.
+ */
+int
+connecttcp(void)
+{
+	char addr[64];
+	int fd;
+
+	snprint(addr, sizeof addr, "tcp!%I!564", fsip);
+	fd = dial(addr, 0, 0, 0);
+	if(fd >= 0)
+		return fd;
+	werrstr("dial %s: %r", addr);
+	return fd;
+}
+
+/*
+ * An address counts as valid when it equals neither IPnoaddr nor
+ * v4prefix.  Returns 1 if valid, 0 otherwise.
+ */
+static int
+isvalidip(uchar *ip)
+{
+	return ipcmp(ip, IPnoaddr) != 0 && ipcmp(ip, v4prefix) != 0;
+}
+
+/*
+ * Set ip from the environment variable attr (read from #e/<attr>).
+ * ip is first set to IPnoaddr and stays that way when the variable
+ * cannot be opened or read; an unparsable value is reported on fd 2.
+ */
+static void
+netenv(char *attr, uchar *ip)
+{
+	int fd, n;
+	char buf[128];
+
+	ipmove(ip, IPnoaddr);
+	snprint(buf, sizeof(buf), "#e/%s", attr);
+	fd = open(buf, OREAD);
+	if(fd < 0)
+		return;
+
+	n = read(fd, buf, sizeof(buf)-1);
+	close(fd);	/* the fd is not needed on any later path */
+	if(n <= 0)
+		return;
+	buf[n] = 0;
+	if (parseip(ip, buf) == -1)
+		fprint(2, "netenv: can't parse ip %s\n", buf);
+}
+
+/*
+ * Set ip from the first "attr=value" pair found in <mpoint>/ndb.
+ * ip is first set to IPnoaddr and stays that way when the file
+ * cannot be read or holds no such pair.  Only the first
+ * sizeof(buf)-1 bytes of the file are examined.
+ */
+static void
+netndb(char *attr, uchar *ip)
+{
+	int fd, n;
+	char buf[1024];
+	char *p;
+
+	ipmove(ip, IPnoaddr);
+	snprint(buf, sizeof(buf), "%s/ndb", mpoint);
+	fd = open(buf, OREAD);
+	if(fd < 0)
+		return;
+	n = read(fd, buf, sizeof(buf)-1);
+	close(fd);
+	if(n <= 0)
+		return;
+	buf[n] = 0;
+	n = strlen(attr);
+	for(p = buf; ; p++){
+		p = strstr(p, attr);
+		if(p == nil)
+			break;
+		if(p[n] != '=')
+			continue;
+		/*
+		 * the name must start the buffer or follow white space;
+		 * p[-1] is only looked at once p is known to be past buf
+		 */
+		if(p == buf || p[-1] == '\n' || p[-1] == ' ' || p[-1] == '\t'){
+			p += n+1;
+			if (parseip(ip, p) == -1)
+				fprint(2, "netndb: can't parse ip %s\n", p);
+			return;
+		}
+	}
+}

+ 135 - 0
sys/src/9/boot/doauthenticate.c

@@ -0,0 +1,135 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include "../boot/boot.h"
+
+static char *pbmsg = "AS protocol botch";
+static char *ccmsg = "can't connect to AS";
+
+/*
+ * Read exactly len bytes from fd into buf, repeating read as needed.
+ * Returns the number of bytes read, or -1 as soon as a read returns
+ * zero or less, even if some bytes had already arrived.
+ */
+long
+readn(int fd, void *buf, long len)
+{
+	int got, total;
+	char *dst;
+
+	dst = buf;
+	total = 0;
+	while(total < len){
+		got = read(fd, dst+total, len-total);
+		if(got <= 0)
+			return -1;
+		total += got;
+	}
+	return total;
+}
+
+/*
+ * Send the ticket request trbuf (TICKREQLEN bytes) to the
+ * authentication server reached through mp->auth and collect the
+ * answer.  On AuthOK, 2*TICKETLEN bytes are read into tbuf, so tbuf
+ * must be at least that large.
+ *
+ * Returns 0 on success, otherwise an error string: either pbmsg
+ * itself or the static buffer error, which holds a formatted message
+ * or the server's own error text.
+ *
+ * NOTE(review): struct Method as declared in boot.h has no auth
+ * member; confirm whether this file is still built.
+ */
+static char*
+fromauth(Method *mp, char *trbuf, char *tbuf)
+{
+	int afd;
+	char reply;
+	char *msg;
+	static char error[2*ERRMAX];
+
+	if(mp->auth == 0)
+		fatal("no method for accessing auth server");
+	afd = (*mp->auth)();
+	if(afd < 0) {
+		sprint(error, "%s: %r", ccmsg);
+		return error;
+	}
+
+	if(write(afd, trbuf, TICKREQLEN) < 0 || read(afd, &reply, 1) != 1){
+		close(afd);
+		sprint(error, "%s: %r", pbmsg);
+		return error;
+	}
+
+	msg = 0;
+	if(reply == AuthOK){
+		/* two tickets follow */
+		if(readn(afd, tbuf, 2*TICKETLEN) < 0) {
+			sprint(error, "%s: %r", pbmsg);
+			msg = error;
+		}
+	}else if(reply == AuthErr){
+		/* an error string of ERRMAX bytes follows */
+		if(readn(afd, error, ERRMAX) < 0)
+			sprint(error, "%s: %r", pbmsg);
+		else
+			error[ERRMAX-1] = 0;
+		msg = error;
+	}else
+		msg = pbmsg;
+
+	close(afd);
+	return msg;
+}
+
+/*
+ * Authenticate the file server connection fd.
+ * A session is started with fsession; if the returned ticket request
+ * is empty, no authentication is needed.  Otherwise tickets are
+ * fetched with fromauth and handed to fauth.  If either step fails,
+ * a message is printed on fd 2 and the function returns anyway.
+ */
+void
+doauthenticate(int fd, Method *mp)
+{
+	char *err;
+	char trbuf[TICKREQLEN];
+	char tbuf[2*TICKETLEN];
+
+	print("session...");
+	if(fsession(fd, trbuf, sizeof trbuf) < 0)
+		fatal("session command failed");
+
+	memset(tbuf, 0, 2*TICKETLEN);
+	/* no authentication required? */
+	if(trbuf[0] == 0)
+		return;
+
+	/* try getting to an auth server */
+	print("getting ticket...");
+	err = fromauth(mp, trbuf, tbuf);
+	print("authenticating...");
+	if(err == 0 && fauth(fd, tbuf) >= 0)
+		return;
+
+	/* didn't work, go for the security hole */
+	fprint(2, "no authentication server (%s), using your key as server key\n", err);
+}
+
+/*
+ * Check key against the authentication server by asking for a ticket
+ * in which name is authid, hostid and uid, then decrypting the reply
+ * with key.
+ *
+ * Returns 0 when the key checks out, or when the server could not be
+ * contacted (a notice is printed and the key goes unchecked).
+ * Otherwise returns an error string.
+ */
+char*
+checkkey(Method *mp, char *name, char *key)
+{
+	char *msg;
+	Ticketreq tr;
+	Ticket t;
+	char trbuf[TICKREQLEN];
+	char tbuf[2*TICKETLEN];	/* fromauth reads 2*TICKETLEN bytes on success */
+
+	/* tr is zeroed, so the bounded copies below stay terminated */
+	memset(&tr, 0, sizeof tr);
+	tr.type = AuthTreq;
+	strncpy(tr.authid, name, sizeof(tr.authid)-1);
+	strncpy(tr.hostid, name, sizeof(tr.hostid)-1);
+	strncpy(tr.uid, name, sizeof(tr.uid)-1);
+	convTR2M(&tr, trbuf);
+	msg = fromauth(mp, trbuf, tbuf);
+	/*
+	 * fromauth reports a failed connection as "<ccmsg>: <error>" in
+	 * its own buffer, so compare the prefix rather than the pointer
+	 */
+	if(msg != 0 && strncmp(msg, ccmsg, strlen(ccmsg)) == 0){
+		fprint(2, "boot: can't contact auth server, passwd unchecked\n");
+		return 0;
+	}
+	if(msg)
+		return msg;
+	convM2T(tbuf, &t, key);
+	if(t.num == AuthTc && strcmp(name, t.cuid)==0)
+		return 0;
+	return "no match";
+}

+ 83 - 0
sys/src/9/boot/embed.c

@@ -0,0 +1,83 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+static char *paqfile;
+
+/*
+ * Choose the paq file to serve.  A path given in the boot command or
+ * at the 'root is from' prompt (sys starting with '/' or '#') wins;
+ * otherwise the default supplied when the kernel is made (m->arg) is
+ * used.  paqfile is left unchanged when neither applies.
+ */
+void
+configembed(Method *m)
+{
+	if(*sys == '/' || *sys == '#'){
+		paqfile = sys;
+		return;
+	}
+	if(m->arg)
+		paqfile = m->arg;
+}
+
+/*
+ * Serve the root from paqfile with /boot/paqfs.
+ * Returns -1 when /boot/paqfs is missing, or when paqfile is unset,
+ * missing or a directory.  Otherwise paqfs is started on a pipe
+ * (both pipe ends become its standard input and output), its exit is
+ * awaited, and p[0] is returned.  Any boot arguments after the first
+ * are passed on to paqfs.
+ */
+int
+connectembed(void)
+{
+	int i, p[2];
+	Dir *dir;
+	char **arg, **argp;
+
+	dir = dirstat("/boot/paqfs");
+	if(dir == nil)
+		return -1;
+	free(dir);
+
+	/* configembed leaves paqfile unset when it finds no candidate */
+	if(paqfile == nil)
+		return -1;
+	dir = dirstat(paqfile);
+	if(dir == nil)
+		return -1;
+	if(dir->mode & DMDIR){
+		free(dir);
+		return -1;
+	}
+	free(dir);
+
+	print("paqfs...");
+	if(bind("#c", "/dev", MREPL) < 0)
+		fatal("bind #c");
+	if(bind("#p", "/proc", MREPL) < 0)
+		fatal("bind #p");
+	if(pipe(p)<0)
+		fatal("pipe");
+	switch(fork()){
+	case -1:
+		fatal("fork");
+	case 0:
+		arg = malloc((bargc+5)*sizeof(char*));
+		if(arg == nil)
+			fatal("malloc");
+		argp = arg;
+		*argp++ = "/boot/paqfs";
+		*argp++ = "-iv";
+		*argp++ = paqfile;
+		for(i=1; i<bargc; i++)
+			*argp++ = bargv[i];
+		*argp = 0;
+
+		dup(p[0], 0);
+		dup(p[1], 1);
+		close(p[0]);
+		close(p[1]);
+		exec("/boot/paqfs", arg);
+		fatal("can't exec paqfs");
+	default:
+		break;
+	}
+	waitpid();
+
+	close(p[1]);
+	return p[0];
+}

+ 52 - 0
sys/src/9/boot/getpasswd.c

@@ -0,0 +1,52 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+/*
+ * Prompt for a password on the console and store it, NUL terminated,
+ * in p, which has room for len bytes.  "rawon" is written to
+ * #c/consctl first and that fd is closed when the password is
+ * complete.  Backspace drops the last character, control-u restarts
+ * the prompt, newline ends input.  Characters beyond len-1 are
+ * discarded.
+ */
+void
+getpasswd(char *p, int len)
+{
+	char c;
+	int nread, n, ctl, restart;
+
+	ctl = open("#c/consctl", OWRITE);
+	if(ctl < 0)
+		fatal("can't open consctl; please reboot");
+	write(ctl, "rawon", 5);
+
+	for(;;){
+		print("password: ");
+		n = 0;
+		for(restart = 0; !restart; ){
+			/* keep reading until one character arrives */
+			do{
+				nread = read(0, &c, 1);
+				if(nread < 0)
+					fatal("can't read cons; please reboot");
+			}while(nread == 0);
+
+			if(c == '\n'){
+				p[n] = '\0';
+				close(ctl);
+				print("\n");
+				return;
+			}
+			if(c == '\b'){
+				if(n > 0)
+					n--;
+			}else if(c == 'u' - 'a' + 1){	/* cntrl-u */
+				print("\n");
+				restart = 1;
+			}else if(n < len - 1)
+				p[n++] = c;
+		}
+	}
+}

+ 284 - 0
sys/src/9/boot/local.c

@@ -0,0 +1,284 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+static char diskname[64];
+static char *disk;
+static char **args;
+
+/*
+ * Pick the disk for the local method, in this order of preference:
+ *  1. a path the user gave in the boot cmd or at the 'root is from'
+ *     prompt (sys starting with '/' or '#');
+ *  2. the scsi logical unit number found in argv0 when it starts
+ *     with "dksc(0," (arg0 of the boot command on many mips);
+ *  3. the default supplied when the kernel is made (mp->arg);
+ *  4. bootdisk, an environment variable from a pc's plan9.ini or
+ *     from the mips nvram or generated by the kernel.
+ * When one is chosen it is exported as $bootdisk for all programs.
+ */
+void
+configlocal(Method *mp)
+{
+	char *comma;
+	int unit;
+
+	if(*sys == '/' || *sys == '#'){
+		disk = sys;
+	}else if(strncmp(argv0, "dksc(0,", 7) == 0){
+		comma = strchr(argv0, ',');
+		unit = strtoul(comma+1, 0, 10);
+		sprint(diskname, "#w%d/sd%dfs", unit, unit);
+		disk = diskname;
+	}else if(mp->arg){
+		disk = mp->arg;
+	}else if(*bootdisk){
+		disk = bootdisk;
+	}
+
+	if(disk)
+		setenv("bootdisk", disk);
+
+	USED(mp);
+}
+
+/*
+ * Serve the root from a local kfs partition.
+ * The partition is "<dev>fs" if that opens, else <dev> itself, where
+ * dev is the configured disk or bootdisk.  Returns -1 when /boot/kfs
+ * is missing or the partition cannot be opened, cannot be stat'ed or
+ * is a directory.  Otherwise kfs is started on a pipe, its exit is
+ * awaited, and p[0] is returned.  Any boot arguments after the first
+ * are passed on to kfs.
+ */
+int
+connectlocalkfs(void)
+{
+	int i, pid, fd, p[2];
+	char partition[64];
+	char *dev;
+	char **arg, **argp;
+	Dir *d;
+
+	if(stat("/boot/kfs", statbuf, sizeof statbuf) < 0)
+		return -1;
+
+	dev = disk ? disk : bootdisk;
+	snprint(partition, sizeof partition, "%sfs", dev);
+	fd = open(partition, OREAD);
+	if(fd < 0){
+		/* dev has no length limit; don't overrun partition */
+		snprint(partition, sizeof partition, "%s", dev);
+		fd = open(partition, OREAD);
+		if(fd < 0)
+			return -1;
+	}
+	/*
+	 * can't do this check -- might be some other server posing as kfs.
+	 *
+	memset(buf, 0, sizeof buf);
+	pread(fd, buf, 512, 0);
+	close(fd);
+	if(memcmp(buf+256, "kfs wren device\n", 16) != 0){
+		if(strstr(partition, "/fs"))
+			print("no kfs file system found on %s\n", partition);
+		return -1;
+	}
+	 *
+	 */
+	d = dirfstat(fd);
+	close(fd);
+	if(d == nil)
+		return -1;
+	if(d->mode&DMDIR){
+		free(d);
+		return -1;
+	}
+	free(d);
+
+	print("kfs...");
+	if(pipe(p)<0)
+		fatal("pipe");
+	switch(pid = fork()){
+	case -1:
+		fatal("fork");
+	case 0:
+		arg = malloc((bargc+5)*sizeof(char*));
+		if(arg == nil)
+			fatal("malloc");
+		argp = arg;
+		*argp++ = "kfs";
+		*argp++ = "-f";
+		*argp++ = partition;
+		*argp++ = "-s";
+		for(i=1; i<bargc; i++)
+			*argp++ = bargv[i];
+		*argp = 0;
+
+		dup(p[0], 0);
+		dup(p[1], 1);
+		close(p[0]);
+		close(p[1]);
+		exec("/boot/kfs", arg);
+		fatal("can't exec kfs");
+	default:
+		break;
+	}
+	/* reap children until kfs itself has exited */
+	for(;;){
+		if((i = waitpid()) == -1)
+			fatal("waitpid for kfs failed");
+		if(i == pid)
+			break;
+	}
+
+	close(p[1]);
+	return p[0];
+}
+
+/*
+ * Run file with the given argument list and wait for it to exit.
+ * Callers end the list with nil or 0.  fork, exec and wait failures
+ * are fatal.
+ *
+ * NOTE(review): exec is handed &file as the argument vector, so the
+ * variadic arguments are treated as an array laid out directly after
+ * file in memory, presumably relying on the Plan 9 compilers'
+ * calling convention.  Confirm this holds on every target.  It also
+ * means the program sees file itself as argv[0].
+ */
+void
+run(char *file, ...)
+{
+	int i, pid;
+
+	switch(pid = fork()){
+	case -1:
+		fatal("fork");
+	case 0:
+		exec(file, &file);
+		fatal(smprint("can't exec %s: %r", file));
+	default:
+		/* reap children until ours has exited or waitpid fails */
+		while ((i = waitpid()) != pid && i != -1)
+			;
+		if(i == -1)
+			fatal(smprint("wait failed running %s", file));
+	}
+}
+
+/*
+ * Write the string s, without its terminating NUL, to fd.
+ * Returns whatever write returns.
+ */
+static int
+print1(int fd, char *s)
+{
+	int len;
+
+	len = strlen(s);
+	return write(fd, s, len);
+}
+
+/*
+ * Set up a loopback interface: open /net/ipifc/clone (binding #I
+ * after /net first if the open fails), then write the commands
+ * "bind loopback /dev/null" and "add 127.0.0.1 255.255.255.255".
+ * Failure to open or write is fatal.  The fd is left open.
+ */
+void
+configloopback(void)
+{
+	int ctl;
+
+	ctl = open("/net/ipifc/clone", ORDWR);
+	if(ctl < 0){
+		/* the ip device may simply not be bound yet */
+		bind("#I", "/net", MAFTER);
+		ctl = open("/net/ipifc/clone", ORDWR);
+		if(ctl < 0)
+			fatal("open /net/ipifc/clone for loopback");
+	}
+	if(print1(ctl, "bind loopback /dev/null") < 0
+	|| print1(ctl, "add 127.0.0.1 255.255.255.255") < 0)
+		fatal("write /net/ipifc/clone for loopback");
+}
+
+/*
+ * Serve the root from a local fossil partition.
+ * The partition is "<dev>fossil" if that opens, else <dev> itself,
+ * where dev is the configured disk or bootdisk.  It must carry the
+ * text "fossil config\n" at offset 127*1024.
+ *
+ * If $venti is set, venti is made available first.  When its first
+ * field opens as a file, a local /boot/venti is started on it (the
+ * second and third fields, with defaults, are passed as its -a and
+ * -h arguments).  Otherwise the fields are passed to configip so the
+ * network is up and a remote venti can be reached.
+ *
+ * Returns an fd open on #s/fboot, or -1 on failure.
+ */
+int
+connectlocalfossil(void)
+{
+	int fd;
+	char *venti, *f[32], *p;
+	int nf;
+	char partition[128], buf[512];
+	char *dev;
+
+	if(stat("/boot/fossil", statbuf, sizeof statbuf) < 0)
+		return -1;
+
+	/* look for fossil partition */
+	dev = disk ? disk : bootdisk;
+	snprint(partition, sizeof partition, "%sfossil", dev);
+	fd = open(partition, OREAD);
+	if(fd < 0){
+		/* dev has no length limit; don't overrun partition */
+		snprint(partition, sizeof partition, "%s", dev);
+		fd = open(partition, OREAD);
+		if(fd < 0)
+			return -1;
+	}
+	memset(buf, 0, sizeof buf);
+	pread(fd, buf, 512, 127*1024);
+	close(fd);
+	if(memcmp(buf, "fossil config\n", 14) != 0){
+		if(strstr(partition, "/fossil"))
+			print("no fossil config found on %s\n", partition);
+		return -1;
+	}
+
+	settime(1, -1, nil);
+
+	/* make venti available */
+	if((venti = getenv("venti")) && (nf = tokenize(venti, f, nelem(f)))){
+		if((fd = open(f[0], OREAD)) >= 0){
+			print("venti...");
+			memset(buf, 0, sizeof buf);
+			pread(fd, buf, 512, 248*1024);
+			close(fd);
+			if(memcmp(buf, "venti config\n", 13) != 0){
+				print("no venti config found on %s\n", f[0]);
+				return -1;
+			}
+			if(stat("/boot/venti", statbuf, sizeof statbuf) < 0){
+				print("/boot/venti does not exist\n");
+				return -1;
+			}
+			/* fill in defaults for the fields not given */
+			switch(nf){
+			case 1:
+				f[1] = "tcp!127.1!17034";
+				/* fall through */
+			case 2:
+				f[2] = "tcp!127.1!8000";
+			}
+			configloopback();
+			run("/boot/venti", "-c", f[0], "-a", f[1], "-h", f[2], 0);
+			/*
+			 * If the announce address is tcp!*!foo, then set
+			 * $venti to tcp!127.1!foo instead, which is actually dialable.
+			 */
+			if((p = strstr(f[1], "!*!")) != 0){
+				*p = 0;
+				snprint(buf, sizeof buf, "%s!127.1!%s", f[1], p+3);
+				f[1] = buf;
+			}
+			setenv("venti", f[1]);
+		}else{
+			/* set up the network so we can talk to the venti server */
+			/* this is such a crock. */
+			configip(nf, f, 0);
+			setenv("venti", f[0]);
+		}
+	}
+
+	/* start fossil */
+	print("fossil(%s)...", partition);
+	run("/boot/fossil", "-f", partition, "-c", "srv -A fboot", "-c", "srv -p fscons", 0);
+	fd = open("#s/fboot", ORDWR);
+	if(fd < 0){
+		print("open #s/fboot: %r\n");
+		return -1;
+	}
+	remove("#s/fboot");	/* we'll repost as #s/boot */
+	return fd;
+}
+
+/*
+ * Connect hook of the local method.  #c and #p replace /dev and
+ * /proc (failure is fatal); #S, #k and #æ are bound after /dev.
+ * Fossil is tried first, then kfs.  Returns the fd from whichever
+ * succeeds, or -1 when both fail.
+ */
+int
+connectlocal(void)
+{
+	int fd;
+
+	if(bind("#c", "/dev", MREPL) < 0)
+		fatal("bind #c");
+	if(bind("#p", "/proc", MREPL) < 0)
+		fatal("bind #p");
+	bind("#S", "/dev", MAFTER);
+	bind("#k", "/dev", MAFTER);
+	bind("#æ", "/dev", MAFTER);
+
+	fd = connectlocalfossil();
+	if(fd >= 0)
+		return fd;
+	fd = connectlocalkfs();
+	if(fd < 0)
+		return -1;
+	return fd;
+}

+ 61 - 0
sys/src/9/boot/nopsession.c

@@ -0,0 +1,61 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include "../boot/boot.h"
+
+static Fcall	hdr;
+
+/*
+ * Send a request of the given type with tag NOTAG on fd and check the
+ * reply.  A leading two-byte "OK" read from fd is skipped.  Any
+ * failure is fatal: a write or read error, an unparsable reply, a
+ * tag other than NOTAG, an Rerror reply, or a reply whose type is
+ * not type+1.
+ */
+static void
+rpc(int fd, int type)
+{
+	int n, l;
+	char buf[128], *p;
+
+	hdr.type = type;
+	hdr.tag = NOTAG;
+	n = convS2M(&hdr, buf);
+	if(write(fd, buf, n) != n)
+		fatal("write rpc");
+
+	print("...");
+	p = buf;
+	l = 0;
+	/* collect at least 3 bytes of reply; it may arrive in pieces */
+	while(l < 3) {
+		n = read(fd, p, 3);
+		if(n <= 0)
+			fatal("read rpc");
+		if(n == 2 && l == 0 && buf[0] == 'O' && buf[1] == 'K')
+			continue;
+		p += n;
+		l += n;
+	}
+	/* l is everything accumulated; n is only the size of the last read */
+	if(convM2S(buf, &hdr, l) == 0){
+		print("%ux %ux %ux\n", buf[0], buf[1], buf[2]);
+		fatal("rpc format");
+	}
+	if(hdr.tag != NOTAG)
+		fatal("rpc tag not NOTAG");
+	if(hdr.type == Rerror){
+		print("error %s;", hdr.ename);
+		fatal("remote error");
+	}
+	if(hdr.type != type+1)
+		fatal("not reply");
+}
+
+/*
+ * Print "nop" as progress, then send a Tnop request on fd and check
+ * the reply through rpc().
+ */
+void
+nop(int fd)
+{
+	print("nop");
+	rpc(fd, Tnop);
+}

+ 76 - 0
sys/src/9/boot/paq.c

@@ -0,0 +1,76 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+/*
+ * Control messages that configpaq writes, one per element, to
+ * /dev/flash/flashctl.
+ * NOTE(review): each looks like "add <name> <start> <end>"; confirm
+ * the field meanings against the flash driver.
+ */
+char *fparts[] =
+{
+	"add bootldr	0x0000000 0x0040000",
+	"add params	0x0040000 0x0080000",
+	"add kernel	0x0080000 0x0140000",
+	"add user	0x0140000 0x0200000",
+	"add ramdisk	0x0200000 0x0600000",
+};
+
+/*
+ * Config hook of the paq method: bind #F after /dev and #p onto
+ * /proc, then write every fparts entry to /dev/flash/flashctl.
+ * Any failure is fatal.  The Method argument is unused.
+ */
+void
+configpaq(Method*)
+{
+	int fd;
+	int i;
+
+	if(bind("#F", "/dev", MAFTER) < 0)
+		fatal("bind #F");
+	if(bind("#p", "/proc", MREPL) < 0)
+		fatal("bind #p");
+	fd = open("/dev/flash/flashctl", OWRITE);
+	if(fd < 0)
+		fatal("opening flashctl");
+	for(i = 0; i < nelem(fparts); i++)
+		/* the table entry is data, not a format string */
+		if(fprint(fd, "%s", fparts[i]) < 0)
+			fatal(fparts[i]);
+	close(fd);
+}
+
+/*
+ * Connect hook of the paq method: start /boot/paqfs on
+ * /dev/flash/ramdisk with both ends of a pipe as its standard input
+ * and output, wait for it to exit, and return p[0].
+ * pipe, fork and exec failures are fatal.
+ */
+int
+connectpaq(void)
+{
+	int  p[2];
+	/* fixed argument list, so no allocation is needed */
+	char *arg[5];
+
+	print("paq...");
+	if(pipe(p)<0)
+		fatal("pipe");
+	switch(fork()){
+	case -1:
+		fatal("fork");
+	case 0:
+		arg[0] = "paqfs";
+		arg[1] = "-v";
+		arg[2] = "-i";
+		arg[3] = "/dev/flash/ramdisk";
+		arg[4] = 0;
+
+		dup(p[0], 0);
+		dup(p[1], 1);
+		close(p[0]);
+		close(p[1]);
+		exec("/boot/paqfs", arg);
+		fatal("can't exec paqfs");
+	default:
+		break;
+	}
+	waitpid();
+
+	close(p[1]);
+	return p[0];
+}

+ 31 - 0
sys/src/9/boot/printstub.c

@@ -0,0 +1,31 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+
+/* lock taken by _fmtlock and released by _fmtunlock */
+static Lock fmtl;
+
+/* Acquire fmtl. */
+void
+_fmtlock(void)
+{
+	lock(&fmtl);
+}
+
+/* Release fmtl, the lock taken by _fmtlock. */
+void
+_fmtunlock(void)
+{
+	unlock(&fmtl);
+}
+
+/*
+ * Stub: ignores its argument and always returns -1.
+ * NOTE(review): presumably stands in for the library's
+ * floating-point formatter so boot does not link it; confirm.
+ */
+int
+_efgfmt(Fmt*)
+{
+	return -1;
+}

+ 59 - 0
sys/src/9/boot/sac.c

@@ -0,0 +1,59 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <../boot/boot.h>
+
+/*
+ * HACK - take over from boot since file system is not
+ * available on a pipe
+ */
+
+/*
+ * Config hook of the sac method.  It binds / onto itself and #C
+ * after /, writes "brick" to #c/sysname and to #c/hostowner, then
+ * execs /<cputype>/init with -c.  It returns only by way of fatal();
+ * a successful exec replaces this program.
+ */
+void
+configsac(Method *mp)
+{
+	int fd;
+	char cmd[64];
+
+	USED(mp);
+
+	/*
+	 *  create the name space, mount the root fs
+	 */
+	if(bind("/", "/", MREPL) < 0)
+		fatal("bind /");
+	if(bind("#C", "/", MAFTER) < 0)
+		fatal("bind #C");
+
+	/* fixed sysname - enables correct namespace file */
+	fd = open("#c/sysname", OWRITE);
+	if(fd < 0)
+		fatal("open sysname");
+	write(fd, "brick", 5);
+	close(fd);
+
+	fd = open("#c/hostowner", OWRITE);
+	if(fd < 0)
+		fatal("open hostowner");
+	write(fd, "brick", 5);
+	close(fd);
+
+	/* cputype has no length limit here; don't overrun cmd */
+	snprint(cmd, sizeof cmd, "/%s/init", cputype);
+	print("starting %s\n", cmd);
+	execl(cmd, "init", "-c", 0);
+	fatal(cmd);
+}
+
+/*
+ * Connect hook of the sac method.  configsac execs init or dies, so
+ * this is not expected to run; it returns -1 if it ever does.
+ */
+int
+connectsac(void)
+{
+	/* does not get here */
+	return -1;
+}

+ 158 - 0
sys/src/9/boot/settime.c

@@ -0,0 +1,158 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include "../boot/boot.h"
+
+static long lusertime(char*);
+
+char *timeserver = "#s/boot";
+
+/*
+ * Set the system time by writing a value to #c/time.
+ *
+ * If islocal is set, the value is read from #r/rtc, or, when the
+ * clock cannot be opened, the user is asked for a date until
+ * lusertime accepts it.  If no value has been obtained that way,
+ * timeserver is mounted on /tmp (with afd and rp passed to mount)
+ * and the access time of /tmp is used.  Failure to open timeserver
+ * or to mount it leaves the time unset.
+ */
+void
+settime(int islocal, int afd, char *rp)
+{
+	int n, f;
+	int timeset;
+	Dir *d;
+	char timebuf[64];
+
+	print("time...");
+	timeset = 0;
+	if(islocal){
+		/*
+		 *  set the time from the real time clock
+		 */
+		f = open("#r/rtc", ORDWR);
+		if(f >= 0){
+			if((n = read(f, timebuf, sizeof(timebuf)-1)) > 0){
+				timebuf[n] = '\0';
+				timeset = 1;
+			}
+			close(f);
+		}else do{
+			strcpy(timebuf, "yymmddhhmm[ss]");
+			outin("\ndate/time ", timebuf, sizeof(timebuf));
+		}while((timeset=lusertime(timebuf)) <= 0);
+	}
+	if(timeset == 0){
+		/*
+		 *  set the time from the access time of the root
+		 */
+		f = open(timeserver, ORDWR);
+		if(f < 0)
+			return;
+		if(mount(f, afd, "/tmp", MREPL, rp) < 0){
+			warning("settime mount");
+			close(f);
+			return;
+		}
+		close(f);
+		/*
+		 * dirstat allocates room for the strings in the stat
+		 * entry; unpacking by hand with convM2D left only
+		 * sizeof(Dir) bytes for them
+		 */
+		d = dirstat("/tmp");
+		if(d == nil)
+			fatal("stat");
+		snprint(timebuf, sizeof timebuf, "%ld", d->atime);
+		free(d);
+		unmount(0, "/tmp");
+	}
+
+	f = open("#c/time", OWRITE);
+	if(f < 0)
+		warning("can't set #c/time");
+	else{
+		if(write(f, timebuf, strlen(timebuf)) < 0)
+			warning("can't set #c/time");
+		close(f);
+	}
+	print("\n");
+}
+
+#define SEC2MIN 60L
+#define SEC2HOUR (60L*SEC2MIN)
+#define SEC2DAY (24L*SEC2HOUR)
+
+/*
+ * Convert the two characters at *pp, taken as decimal digits, to an
+ * integer and advance *pp past them.  The characters are not checked.
+ */
+int
+g2(char **pp)
+{
+	char *s;
+
+	s = *pp;
+	*pp = s + 2;
+	return 10*(s[0]-'0') + (s[1]-'0');
+}
+
+/*
+ *  day counts for an ordinary year (dmsize) and a leap year
+ *  (ldmsize): element 0 is the number of days in the year, elements
+ *  1 to 12 the number of days in each month
+ */
+static	int	dmsize[] =
+{
+	365, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+static	int	ldmsize[] =
+{
+	366, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+/*
+ *  return the day-count table for year y: ldmsize when y is
+ *  divisible by 4 and either not by 100 or also by 400, else dmsize
+ */
+static int *
+yrsize(int y)
+{
+	if((y%4) != 0)
+		return dmsize;
+	if((y%100) != 0)
+		return ldmsize;
+	if((y%400) == 0)
+		return ldmsize;
+	return dmsize;
+}
+
+/*
+ *  compute seconds since Jan 1 1970 from a date typed by the user.
+ *
+ *  argbuf holds yymmddhhmm or yymmddhhmmss; years below 70 are taken
+ *  as 20yy, the rest as 19yy.  On success argbuf is overwritten with
+ *  the result in decimal and the result is returned.  Returns -1,
+ *  leaving argbuf alone, when the text has the wrong length, holds a
+ *  non-digit, or names a month, day, hour, minute or second that is
+ *  out of range.
+ */
+static long
+lusertime(char *argbuf)
+{
+	char *buf;
+	ulong secs;
+	int i, y, m, d, h, mi, s;
+	int *d2m;
+
+	buf = argbuf;
+	i = strlen(buf);
+	if(i != 10 && i != 12)
+		return -1;
+	/* g2 does no checking of its own */
+	for(i = 0; buf[i] != 0; i++)
+		if(buf[i] < '0' || buf[i] > '9')
+			return -1;
+
+	y = g2(&buf);
+	m = g2(&buf);
+	d = g2(&buf);
+	h = g2(&buf);
+	mi = g2(&buf);
+	s = 0;
+	if(*buf)
+		s = g2(&buf);
+	if(y < 70)
+		y += 2000;
+	else
+		y += 1900;
+
+	/* the month indexes the 13-entry table below; keep it in range */
+	if(m < 1 || m > 12)
+		return -1;
+	d2m = yrsize(y);
+	if(d < 1 || d > d2m[m] || h > 23 || mi > 59 || s > 59)
+		return -1;
+
+	/*
+	 *  seconds per year
+	 */
+	secs = 0;
+	for(i = 1970; i < y; i++)
+		secs += yrsize(i)[0] * SEC2DAY;
+
+	/*
+	 *  seconds per month
+	 */
+	for(i = 1; i < m; i++)
+		secs += d2m[i] * SEC2DAY;
+
+	secs += (d-1) * SEC2DAY;
+	secs += h * SEC2HOUR;
+	secs += mi * SEC2MIN;
+	secs += s;
+
+	sprint(argbuf, "%ld", secs);
+	return secs;
+}

+ 689 - 0
sys/src/9/ip/arp.c

@@ -0,0 +1,689 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+#include "ipv6.h"
+
+/*
+ *  address resolution tables
+ */
+
+enum
+{
+	NHASH		= (1<<6),	/* buckets in Arp.hash */
+	NCACHE		= 256,		/* entries in Arp.cache */
+
+	AOK		= 1,		/* Arpent.state: mac address is known */
+	AWAIT		= 2,		/* Arpent.state: resolution pending, packets may be held */
+};
+
+char *arpstate[] =	/* printable names indexed by Arpent.state; used by arpread */
+{
+	"UNUSED",
+	"OK",
+	"WAIT",
+};
+
+/*
+ *  one per Fs
+ *  the functions in this file take the embedded QLock (qlock(arp))
+ *  around changes to the hash table, the cache entries and the
+ *  rxmt and drop lists.
+ */
+struct Arp
+{
+	QLock;
+	Fs	*f;
+	Arpent	*hash[NHASH];	/* chains linked through Arpent.hash; bucket chosen by haship() */
+	Arpent	cache[NCACHE];	/* fixed pool of entries; newarp6 recycles the oldest */
+	Arpent	*rxmt;		/* re-transmit chain, linked through Arpent.nextrxt */
+	Proc	*rxmitp;	/* neib sol re-transmit proc */
+	Rendez	rxmtq;		/* rxmitproc sleeps here */
+	Block 	*dropf, *dropl;	/* first/last held packet queued for icmphostunr in rxmitsols */
+};
+
+char *Ebadarp = "bad arp";	/* error string raised by arpwrite */
+
+#define haship(s) ((s)[IPaddrlen-1]%NHASH)	/* bucket index: last address byte mod NHASH */
+
+extern int 	ReTransTimer = RETRANS_TIMER;	/* added to NOW to schedule the next solicitation */
+static void 	rxmitproc(void *v);
+
+/*
+ *  allocate the arp state for f and start its re-transmit kproc
+ */
+void
+arpinit(Fs *f)
+{
+	Arp *arp;
+
+	arp = smalloc(sizeof(Arp));
+	f->arp = arp;
+	arp->f = f;
+	arp->rxmt = nil;
+	arp->dropl = nil;
+	arp->dropf = nil;
+	kproc("rxmitproc", rxmitproc, arp);
+}
+
+/*
+ *  create a new arp entry for an ip address by recycling the cache
+ *  slot with the smallest utime.  packets held by the old entry are
+ *  freed (v4) or queued on the drop list for rxmitproc (v6).  the
+ *  slot is moved to the hash chain for ip and, when addrxt is set
+ *  and ip is not multicast, appended to the re-transmit chain.
+ *  the caller sets a->state.  both callers hold the arp qlock.
+ */
+static Arpent*
+newarp6(Arp *arp, uchar *ip, Ipifc *ifc, int addrxt)
+{
+	uint t;
+	Block *next, *xp;
+	Arpent *a, *e, *f, **l;
+	int empty;
+
+	/* find oldest entry */
+	e = &arp->cache[NCACHE];
+	a = arp->cache;
+	t = a->utime;
+	for(f = a; f < e; f++){
+		if(f->utime < t){
+			t = f->utime;
+			a = f;
+		}
+	}
+
+	/* dump waiting packets */
+	xp = a->hold;
+	a->hold = nil;
+
+	if(isv4(a->ip)){
+		while(xp){
+			next = xp->list;
+			freeblist(xp);
+			xp = next;
+		}
+	}
+	else { /* queue icmp unreachable for rxmitproc later on, w/o arp lock */
+		if(xp){
+			if(arp->dropl == nil)
+				arp->dropf = xp;
+			else
+				arp->dropl->list = xp;
+
+			/* walk to the last packet so dropl stays the tail */
+			for(next = xp->list; next; next = next->list)
+				xp = next;
+			arp->dropl = xp;
+			wakeup(&arp->rxmtq);
+		}
+	}
+
+	/* take out of current chain */
+	l = &arp->hash[haship(a->ip)];
+	for(f = *l; f; f = f->hash){
+		if(f == a){
+			*l = a->hash;
+			break;
+		}
+		l = &f->hash;
+	}
+
+	/*
+	 * take out of re-transmit chain.  this has to happen even when
+	 * the new entry will not be queued (addrxt == 0): a->nextrxt is
+	 * cleared below, and clearing it while the slot is still linked
+	 * would cut every later entry off the chain and leave the
+	 * recycled slot on it.
+	 */
+	empty = (arp->rxmt == nil);
+	l = &arp->rxmt;
+	for(f = *l; f; f = f->nextrxt){
+		if(f == a){
+			*l = a->nextrxt;
+			break;
+		}
+		l = &f->nextrxt;
+	}
+	a->nextrxt = nil;
+
+	/* insert into new chain */
+	l = &arp->hash[haship(ip)];
+	a->hash = *l;
+	*l = a;
+
+	memmove(a->ip, ip, sizeof(a->ip));
+	a->utime = NOW;
+	a->ctime = 0;
+	a->type = ifc->medium;
+
+	a->rtime = NOW + ReTransTimer;
+	a->rxtsrem = MAX_MULTICAST_SOLICIT;
+	a->ifc = ifc;
+	a->ifcid = ifc->ifcid;
+
+	/* put to the end of re-transmit chain; addrxt is 0 when isv4(a->ip) */
+	if(!ipismulticast(a->ip) && addrxt){
+		for(l = &arp->rxmt; *l; l = &(*l)->nextrxt)
+			;
+		*l = a;
+		/* rxmitproc only sleeps on rxready when the chain was empty */
+		if(empty)
+			wakeup(&arp->rxmtq);
+	}
+
+	return a;
+}
+
+/*
+ *  unlink a from its hash chain and from the re-transmit chain and
+ *  mark the slot unused.  a->hold is cleared, not freed.
+ *  called with arp qlocked
+ */
+void
+cleanarpent(Arp *arp, Arpent *a)
+{
+	Arpent **pp;
+
+	/* take out of current chain */
+	for(pp = &arp->hash[haship(a->ip)]; *pp != nil; pp = &(*pp)->hash)
+		if(*pp == a){
+			*pp = a->hash;
+			break;
+		}
+
+	/* take out of re-transmit chain */
+	for(pp = &arp->rxmt; *pp != nil; pp = &(*pp)->nextrxt)
+		if(*pp == a){
+			*pp = a->nextrxt;
+			break;
+		}
+
+	/* reset the slot */
+	a->state = 0;
+	a->type = 0;
+	a->ctime = 0;
+	a->utime = 0;
+	a->ifc = nil;
+	a->last = nil;
+	a->hold = nil;
+	a->hash = nil;
+	a->nextrxt = nil;
+}
+
+/*
+ *  fill in the media address if we have it.  Otherwise return an
+ *  Arpent that represents the state of the address resolution FSM
+ *  for ip.  Add the packet to be sent onto the list of packets
+ *  waiting for ip->mac to be resolved.
+ *
+ *  When an Arpent is returned the arp lock is still held; it is
+ *  given up by arprelease or arpresolve.  When nil is returned
+ *  mac has been filled in and the lock has been released.
+ */
+Arpent*
+arpget(Arp *arp, Block *bp, int version, Ipifc *ifc, uchar *ip, uchar *mac)
+{
+	int hash;
+	Arpent *a;
+	Medium *type;
+	uchar v6ip[IPaddrlen];
+
+	if(version == V4){	/* entries are keyed by the v6 form of the address */
+		v4tov6(v6ip, ip);
+		ip = v6ip;
+	}
+
+	qlock(arp);
+	hash = haship(ip);
+	type = ifc->medium;
+	for(a = arp->hash[hash]; a; a = a->hash){
+		if(memcmp(ip, a->ip, sizeof(a->ip)) == 0)
+		if(type == a->type)
+			break;
+	}
+
+	if(a == nil){	/* unknown address: start resolving it */
+		a = newarp6(arp, ip, ifc, (version != V4));
+		a->state = AWAIT;
+	}
+	a->utime = NOW;
+	if(a->state == AWAIT){
+		if(bp != nil){	/* append bp to the entry's hold list */
+			if(a->hold)
+				a->last->list = bp;
+			else
+				a->hold = bp;
+			a->last = bp;
+			bp->list = nil;
+		}
+		return a;		/* return with arp qlocked */
+	}
+
+	memmove(mac, a->mac, a->type->maclen);
+
+	/* remove old entries */
+	if(NOW - a->ctime > 15*60*1000)
+		cleanarpent(arp, a);
+
+	qunlock(arp);
+	return nil;
+}
+
+/*
+ * called with arp locked
+ * releases the arp lock; the Arpent argument is not used
+ */
+void
+arprelease(Arp *arp, Arpent*)
+{
+	qunlock(arp);
+}
+
+/*
+ * Copy out the mac address from the Arpent.  Return the
+ * block waiting to get sent to this mac address.
+ * The entry is marked AOK and, for a v6 address, taken off the
+ * re-transmit chain.
+ *
+ * called with arp locked; the lock is released before returning
+ */
+Block*
+arpresolve(Arp *arp, Arpent *a, Medium *type, uchar *mac)
+{
+	Block *bp;
+	Arpent *f, **l;
+
+	if(!isv4(a->ip)){
+		l = &arp->rxmt;
+		for(f = *l; f; f = f->nextrxt){
+			if(f == a){
+				*l = a->nextrxt;
+				break;
+			}
+			l = &f->nextrxt;
+		}
+	}
+
+	memmove(a->mac, mac, type->maclen);
+	a->type = type;
+	a->state = AOK;
+	a->utime = NOW;
+	bp = a->hold;	/* hand the held packets to the caller */
+	a->hold = nil;
+	qunlock(arp);
+
+	return bp;
+}
+
+void	/* record that ip is at mac; packets held for ip are written to the interface */
+arpenter(Fs *fs, int version, uchar *ip, uchar *mac, int n, int refresh)
+{
+	Arp *arp;
+	Route *r;
+	Arpent *a, *f, **l;
+	Ipifc *ifc;
+	Medium *type;
+	Block *bp, *next;
+	uchar v6ip[IPaddrlen];
+
+	arp = fs->arp;
+
+	if(n != 6){	/* only 6-byte mac addresses are accepted */
+//		print("arp: len = %d\n", n);
+		return;
+	}
+
+	switch(version){
+	case V4:
+		r = v4lookup(fs, ip, nil);
+		v4tov6(v6ip, ip);
+		ip = v6ip;
+		break;
+	case V6:
+		r = v6lookup(fs, ip, nil);
+		break;
+	default:
+		panic("arpenter: version %d", version);
+		return;	/* to suppress warnings */
+	}
+
+	if(r == nil){
+//		print("arp: no route for entry\n");
+		return;
+	}
+
+	ifc = r->ifc;
+	type = ifc->medium;
+
+	qlock(arp);
+	for(a = arp->hash[haship(ip)]; a; a = a->hash){
+		if(a->type != type || (a->state != AWAIT && a->state != AOK))
+			continue;
+
+		if(ipcmp(a->ip, ip) == 0){
+			a->state = AOK;
+			memmove(a->mac, mac, type->maclen);
+
+			if(version == V6){
+				/* take out of re-transmit chain */
+				l = &arp->rxmt;
+				for(f = *l; f; f = f->nextrxt){
+					if(f == a){
+						*l = a->nextrxt;
+						break;
+					}
+					l = &f->nextrxt;
+				}
+			}
+
+			a->ifc = ifc;
+			a->ifcid = ifc->ifcid;
+			bp = a->hold;
+			a->hold = nil;
+			if(version == V4)
+				ip += IPv4off;	/* bwrite below is given the 4-byte form again */
+			a->utime = NOW;
+			a->ctime = a->utime;
+			qunlock(arp);	/* held packets are sent without the arp lock */
+
+			while(bp){
+				next = bp->list;
+				if(ifc != nil){
+					if(waserror()){
+						runlock(ifc);
+						nexterror();
+					}
+					rlock(ifc);
+					if(ifc->medium != nil)
+						ifc->medium->bwrite(ifc, bp, version, ip);
+					else
+						freeb(bp);
+					runlock(ifc);
+					poperror();
+				} else
+					freeb(bp);
+				bp = next;
+			}
+			return;
+		}
+	}
+
+	if(refresh == 0){	/* no matching entry: create one unless this is only a refresh */
+		a = newarp6(arp, ip, ifc, 0);
+		a->state = AOK;
+		a->type = type;
+		a->ctime = NOW;
+		memmove(a->mac, mac, type->maclen);
+	}
+
+	qunlock(arp);
+}
+
+/*
+ *  handle a write to the arp file.  s holds one request:
+ *	flush			empty the whole cache
+ *	add [medium] ip mac	enter a translation through the medium's ares
+ *	del ip			remove the entry for ip
+ *  at most sizeof(buf)-1 bytes are examined; the (possibly reduced)
+ *  len is returned.  anything else raises Ebadarp or Ebadarg.
+ */
+int
+arpwrite(Fs *fs, char *s, int len)
+{
+	int n;
+	Route *r;
+	Arp *arp;
+	Block *bp;
+	Arpent *a, *fl, **l;
+	Medium *type;
+	char *f[4], buf[256];
+	uchar ip[IPaddrlen], mac[MAClen];
+
+	arp = fs->arp;
+
+	if(len == 0)
+		error(Ebadarp);
+	if(len >= sizeof(buf))
+		len = sizeof(buf)-1;
+	strncpy(buf, s, len);
+	buf[len] = 0;
+	if(len > 0 && buf[len-1] == '\n')
+		buf[len-1] = 0;
+
+	n = getfields(buf, f, 4, 1, " ");
+	/* a blank request yields no fields and leaves f[0] unset */
+	if(n < 1)
+		error(Ebadarp);
+
+	if(strcmp(f[0], "flush") == 0){
+		qlock(arp);
+		for(a = arp->cache; a < &arp->cache[NCACHE]; a++){
+			memset(a->ip, 0, sizeof(a->ip));
+			memset(a->mac, 0, sizeof(a->mac));
+			a->hash = nil;
+			a->state = 0;
+			a->utime = 0;
+			while(a->hold != nil){
+				bp = a->hold->list;
+				freeblist(a->hold);
+				a->hold = bp;
+			}
+		}
+		memset(arp->hash, 0, sizeof(arp->hash));
+		/* free the pkts still queued for rxmitsols, then empty the lists (rxmt, dropf/l) */
+		while(arp->dropf != nil){
+			bp = arp->dropf->list;
+			freeblist(arp->dropf);
+			arp->dropf = bp;
+		}
+		arp->rxmt = nil;
+		arp->dropl = nil;
+		qunlock(arp);
+	} else if(strcmp(f[0], "add") == 0){
+		switch(n){
+		default:
+			error(Ebadarg);
+		case 3:
+			/* no medium given: use the one on the route to ip */
+			parseip(ip, f[1]);
+			if(isv4(ip))
+				r = v4lookup(fs, ip+IPv4off, nil);
+			else
+				r = v6lookup(fs, ip, nil);
+			if(r == nil)
+				error("Destination unreachable");
+			type = r->ifc->medium;
+			n = parsemac(mac, f[2], type->maclen);
+			break;
+		case 4:
+			type = ipfindmedium(f[1]);
+			if(type == nil)
+				error(Ebadarp);
+			parseip(ip, f[2]);
+			n = parsemac(mac, f[3], type->maclen);
+			break;
+		}
+
+		if(type->ares == nil)
+			error(Ebadarp);
+
+		type->ares(fs, V6, ip, mac, n, 0);
+	} else if(strcmp(f[0], "del") == 0){
+		if(n != 2)
+			error(Ebadarg);
+
+		parseip(ip, f[1]);
+		qlock(arp);
+
+		l = &arp->hash[haship(ip)];
+		for(a = *l; a; a = a->hash){
+			if(memcmp(ip, a->ip, sizeof(a->ip)) == 0){
+				*l = a->hash;
+				break;
+			}
+			l = &a->hash;
+		}
+
+		if(a){
+			/* take out of re-transmit chain */
+			l = &arp->rxmt;
+			for(fl = *l; fl; fl = fl->nextrxt){
+				if(fl == a){
+					*l = a->nextrxt;
+					break;
+				}
+				l = &fl->nextrxt;
+			}
+
+			/* packets held for the entry will never be sent: free them */
+			while(a->hold != nil){
+				bp = a->hold->list;
+				freeblist(a->hold);
+				a->hold = bp;
+			}
+
+			/* mark the slot unused, as flush does, so arpread skips it */
+			a->state = 0;
+			a->utime = 0;
+			a->nextrxt = nil;
+			a->hash = nil;
+			a->last = nil;
+			a->ifc = nil;
+			memset(a->ip, 0, sizeof(a->ip));
+			memset(a->mac, 0, sizeof(a->mac));
+		}
+		qunlock(arp);
+	} else
+		error(Ebadarp);
+
+	return len;
+}
+
+enum
+{
+	Alinelen=	90,	/* bytes per line printed with aformat: 6+1+8+1+40+1+32+1 */
+};
+
+char *aformat = "%-6.6s %-8.8s %-40.40I %-32.32s\n";	/* medium name, state, ip, mac */
+
+/*
+ *  print the n bytes at mac into p, two hex digits per byte
+ */
+static void
+convmac(char *p, uchar *mac, int n)
+{
+	int i;
+
+	for(i = 0; i < n; i++)
+		p += sprint(p, "%2.2ux", mac[i]);
+}
+
+/*
+ *  format the entries in use into p, one Alinelen-byte line each.
+ *  offset must be a multiple of Alinelen (otherwise 0 is returned)
+ *  and selects the first in-use entry to print; at most len/Alinelen
+ *  lines are produced.  returns the number of bytes written.
+ *
+ *  the arp lock is held for the whole scan so that an entry cannot
+ *  be cleaned (state and type zeroed by cleanarpent) between the
+ *  state test and the use of a->type.
+ */
+int
+arpread(Arp *arp, char *p, ulong offset, int len)
+{
+	Arpent *a;
+	int n;
+	char mac[2*MAClen+1];
+
+	if(offset % Alinelen)
+		return 0;
+
+	offset = offset/Alinelen;
+	len = len/Alinelen;
+
+	n = 0;
+	qlock(arp);
+	for(a = arp->cache; len > 0 && a < &arp->cache[NCACHE]; a++){
+		if(a->state == 0)
+			continue;
+		if(offset > 0){	/* skip entries before the requested line */
+			offset--;
+			continue;
+		}
+		len--;
+		convmac(mac, a->mac, a->type->maclen);
+		n += sprint(p+n, aformat, a->type->name, arpstate[a->state], a->ip, mac);
+	}
+	qunlock(arp);
+
+	return n;
+}
+
+/*
+ *  one pass of the solicitation re-transmit machinery.
+ *  if the entry at the head of the rxmt chain is out of retries or
+ *  its interface cannot be used, it is cleaned and its held packets
+ *  are moved to the drop list; otherwise a solicitation is sent for
+ *  it with icmpns and it is moved to the tail of the chain.
+ *  packets on the drop list are then passed to icmphostunr.
+ *  returns the time left until the head entry is due (rtime - NOW),
+ *  or 0 when the chain is empty.
+ */
+extern int
+rxmitsols(Arp *arp)
+{
+	uint sflag;
+	Block *next, *xp;
+	Arpent *a, *b, **l;
+	Fs *f;
+	uchar ipsrc[IPaddrlen];
+	Ipifc *ifc = nil;
+	long nrxt;
+
+	qlock(arp);
+	f = arp->f;
+
+	a = arp->rxmt;
+	if(a==nil){
+		nrxt = 0;
+		goto dodrops; 		/* return nrxt; */
+	}
+	nrxt = a->rtime - NOW;
+	if(nrxt > 3*ReTransTimer/4)
+		goto dodrops; 		/* return nrxt; */
+
+	for(; a; a = a->nextrxt){
+		ifc = a->ifc;
+		assert(ifc != nil);
+		if(a->rxtsrem > 0 && canrlock(ifc)){
+			if(a->ifcid == ifc->ifcid)
+				break;		/* usable: ifc stays rlocked for icmpns */
+			/* the entry belongs to an older binding; don't leak the rlock */
+			runlock(ifc);
+		}
+
+		xp = a->hold;
+		a->hold = nil;
+
+		if(xp){
+			if(arp->dropl == nil)
+				arp->dropf = xp;
+			else
+				arp->dropl->list = xp;
+		}
+
+		/* this clears a->nextrxt, so the scan ends after one discarded entry */
+		cleanarpent(arp, a);
+	}
+	if(a == nil)
+		goto dodrops;
+
+	qunlock(arp);	/* for icmpns */
+	if((sflag = ipv6anylocal(ifc, ipsrc)) != SRC_UNSPEC)
+		icmpns(f, ipsrc, sflag, a->ip, TARG_MULTI, ifc->mac);
+
+	runlock(ifc);
+	qlock(arp);
+
+	/* put to the end of re-transmit chain */
+	l = &arp->rxmt;
+	for(b = *l; b; b = b->nextrxt){
+		if(b == a){
+			*l = a->nextrxt;
+			break;
+		}
+		l = &b->nextrxt;
+	}
+	for(b = *l; b; b = b->nextrxt){
+		l = &b->nextrxt;
+	}
+	*l = a;
+	a->rxtsrem--;
+	a->nextrxt = nil;
+	a->rtime = NOW + ReTransTimer;
+
+	a = arp->rxmt;
+	if(a==nil)
+		nrxt = 0;
+	else
+		nrxt = a->rtime - NOW;
+
+dodrops:
+	/* detach the drop list under the lock, process it without */
+	xp = arp->dropf;
+	arp->dropf = nil;
+	arp->dropl = nil;
+	qunlock(arp);
+
+	for(; xp; xp = next){
+		next = xp->list;
+		icmphostunr(f, ifc, xp, icmp6_adr_unreach, 1);
+	}
+
+	return nrxt;
+
+}
+
+/*
+ *  sleep condition for rxmitproc: true when there is an entry on
+ *  the re-transmit chain or a packet on the drop list
+ */
+static int
+rxready(void *v)
+{
+	Arp *arp;
+
+	arp = v;
+	if(arp->rxmt != nil)
+		return 1;
+	return arp->dropf != nil;
+}
+
+static void	/* kproc started by arpinit: runs rxmitsols forever, sleeping between passes */
+rxmitproc(void *v)
+{
+	Arp *arp = v;
+	long wakeupat;
+
+	arp->rxmitp = up;
+	//print("arp rxmitproc started\n");
+	if(waserror()){
+		arp->rxmitp = 0;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		wakeupat = rxmitsols(arp);
+		if(wakeupat == 0)
+			sleep(&arp->rxmtq, rxready, v);	/* wait until rxready is true */
+		else if(wakeupat > ReTransTimer/4)
+			tsleep(&arp->rxmtq, return0, 0, wakeupat);	/* head entry not due yet */
+	}
+}
+

+ 133 - 0
sys/src/9/ip/chandial.c

@@ -0,0 +1,133 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"../ip/ip.h"
+
+typedef struct DS DS;
+static Chan*	call(char*, char*, DS*);
+static void	_dial_string_parse(char*, DS*);
+
+enum
+{
+	Maxstring=	128,
+};
+
+struct DS
+{
+	char	buf[Maxstring];			/* dist string */
+	char	*netdir;			/* points into buf, or 0 when the string names no directory */
+	char	*proto;				/* points into buf, or "net" when the string has no '!' */
+	char	*rem;				/* points into buf: what follows the first '!' */
+	char	*local;				/* other args */
+	char	*dir;				/* if set, call() stores the connection directory name here */
+	Chan	**ctlp;				/* if set, call() stores the ctl channel here instead of closing it */
+};
+
+/*
+ *  the dialstring is of the form '[/net/]proto!dest'
+ *  local, dir and ctlp are handed on to call() through the DS.
+ */
+Chan*
+chandial(char *dest, char *local, char *dir, Chan **ctlp)
+{
+	char clone[Maxpath];
+	DS ds;
+
+	ds.ctlp = ctlp;
+	ds.dir = dir;
+	ds.local = local;
+	_dial_string_parse(dest, &ds);
+
+	/* default network directory */
+	if(ds.netdir == 0)
+		ds.netdir = "/net";
+
+	/* no connection server, don't translate */
+	snprint(clone, sizeof(clone), "%s/%s/clone", ds.netdir, ds.proto);
+
+	return call(clone, ds.rem, &ds);
+}
+
+static Chan*	/* open clone, write a "connect" request to it, return the opened data channel */
+call(char *clone, char *dest, DS *ds)
+{
+	int n;
+	Chan *dchan, *cchan;
+	char name[Maxpath], data[Maxpath], *p;
+
+	cchan = namec(clone, Aopen, ORDWR, 0);
+
+	/* get directory name */
+	if(waserror()){
+		cclose(cchan);
+		nexterror();
+	}
+	n = cchan->dev->read(cchan, name, sizeof(name)-1, 0);
+	name[n] = 0;
+	for(p = name; *p == ' '; p++)
+		;
+	sprint(name, "%lud", strtoul(p, 0, 0));	/* rewrite name as the bare number */
+	p = strrchr(clone, '/');
+	*p = 0;	/* cut the last path element off clone (modifies the caller's buffer) */
+	if(ds->dir)
+		snprint(ds->dir, Maxpath, "%s/%s", clone, name);
+	snprint(data, sizeof(data), "%s/%s/data", clone, name);
+
+	/* connect */
+	if(ds->local)
+		snprint(name, sizeof(name), "connect %s %s", dest, ds->local);
+	else
+		snprint(name, sizeof(name), "connect %s", dest);
+	cchan->dev->write(cchan, name, strlen(name), 0);
+
+	/* open data connection */
+	dchan = namec(data, Aopen, ORDWR, 0);
+	if(ds->ctlp)
+		*ds->ctlp = cchan;	/* caller takes over the ctl channel */
+	else
+		cclose(cchan);
+	poperror();
+	return dchan;
+
+}
+
+/*
+ *  parse a dial string
+ *  str is copied (truncated to Maxstring-1 bytes) into ds->buf and
+ *  split in place:
+ *	no '!'			proto "net", rem is the whole string
+ *	proto!rem		netdir 0
+ *	/dir/proto!rem		netdir is everything before the last '/'
+ *				that precedes the first '!'
+ *  a string that starts with '/' or '#' but has no '/' before the
+ *  first '!' is treated like the second form.
+ */
+static void
+_dial_string_parse(char *str, DS *ds)
+{
+	char *p, *p2;
+
+	strncpy(ds->buf, str, Maxstring);
+	ds->buf[Maxstring-1] = 0;
+
+	p = strchr(ds->buf, '!');
+	if(p == 0) {
+		ds->netdir = 0;
+		ds->proto = "net";
+		ds->rem = ds->buf;
+		return;
+	}
+
+	ds->netdir = 0;
+	ds->proto = ds->buf;
+	if(*ds->buf == '/' || *ds->buf == '#'){
+		/*
+		 * look back from the '!' for the '/' that ends the
+		 * directory, but never step in front of the buffer
+		 */
+		for(p2 = p; p2 > ds->buf && *p2 != '/'; p2--)
+			;
+		if(*p2 == '/'){
+			*p2++ = 0;
+			ds->netdir = ds->buf;
+			ds->proto = p2;
+		}
+	}
+	*p = 0;
+	ds->rem = p + 1;
+}

+ 1425 - 0
sys/src/9/ip/devip.c

@@ -0,0 +1,1425 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"../ip/ip.h"
+
+enum
+{
+	Qtopdir=	1,		/* top level directory */
+	Qtopbase,
+	Qarp=		Qtopbase,
+	Qbootp,
+	Qndb,
+	Qiproute,
+	Qipselftab,
+	Qlog,
+
+	Qprotodir,			/* directory for a protocol */
+	Qprotobase,
+	Qclone=		Qprotobase,
+	Qstats,
+
+	Qconvdir,			/* directory for a conversation */
+	Qconvbase,
+	Qctl=		Qconvbase,
+	Qdata,
+	Qerr,
+	Qlisten,
+	Qlocal,
+	Qremote,
+	Qstatus,
+	Qsnoop,
+
+	Logtype=	5,		/* width in bits of the file type field of qid.path */
+	Masktype=	(1<<Logtype)-1,
+	Logconv=	12,		/* width in bits of the conversation index field */
+	Maskconv=	(1<<Logconv)-1,
+	Shiftconv=	Logtype,
+	Logproto=	8,		/* width in bits of the protocol index field */
+	Maskproto=	(1<<Logproto)-1,
+	Shiftproto=	Logtype + Logconv,
+
+	Nfs=		128,		/* size of ipfs[] */
+};
+#define TYPE(x) 	( ((ulong)(x).path) & Masktype )	/* file type (one of the Q values) */
+#define CONV(x) 	( (((ulong)(x).path) >> Shiftconv) & Maskconv )	/* conversation index */
+#define PROTO(x) 	( (((ulong)(x).path) >> Shiftproto) & Maskproto )	/* protocol index */
+#define QID(p, c, y) 	( ((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y) )	/* pack protocol, conversation, type */
+
+static char network[] = "network";
+
+QLock	fslock;
+Fs	*ipfs[Nfs];	/* attached fs's */
+Queue	*qlog;
+
+extern	void nullmediumlink(void);
+extern	void pktmediumlink(void);
+	long ndbwrite(Fs *f, char *a, ulong off, int n);
+
+static int	/* fill dp for file i of a conversation directory; 1 on success, -1 if there is no such file */
+ip3gen(Chan *c, int i, Dir *dp)
+{
+	Qid q;
+	Conv *cv;
+	char *p;
+
+	cv = ipfs[c->devno]->p[PROTO(c->qid)]->conv[CONV(c->qid)];
+	if(cv->owner == nil)
+		kstrdup(&cv->owner, eve);
+	mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
+
+	switch(i) {
+	default:
+		return -1;
+	case Qctl:
+		devdir(c, q, "ctl", 0, cv->owner, cv->perm, dp);
+		return 1;
+	case Qdata:
+		devdir(c, q, "data", qlen(cv->rq), cv->owner, cv->perm, dp);
+		return 1;
+	case Qerr:
+		devdir(c, q, "err", qlen(cv->eq), cv->owner, cv->perm, dp);
+		return 1;
+	case Qlisten:
+		devdir(c, q, "listen", 0, cv->owner, cv->perm, dp);
+		return 1;
+	case Qlocal:
+		p = "local";
+		break;
+	case Qremote:
+		p = "remote";
+		break;
+	case Qsnoop:
+		if(strcmp(cv->p->name, "ipifc") != 0)	/* snoop exists only under ipifc */
+			return -1;
+		devdir(c, q, "snoop", qlen(cv->sq), cv->owner, 0400, dp);
+		return 1;
+	case Qstatus:
+		p = "status";
+		break;
+	}
+	devdir(c, q, p, 0, cv->owner, 0444, dp);	/* local, remote and status are read-only */
+	return 1;
+}
+
+/*
+ *  fill dp for file i of a protocol directory (clone or stats).
+ *  returns 1 on success, -1 if there is no such file.
+ */
+static int
+ip2gen(Chan *c, int i, Dir *dp)
+{
+	Qid q;
+	char *name;
+	int perm;
+
+	if(i == Qclone){
+		name = "clone";
+		perm = 0666;
+	}else if(i == Qstats){
+		name = "stats";
+		perm = 0444;
+	}else
+		return -1;
+
+	mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
+	devdir(c, q, name, 0, network, perm, dp);
+	return 1;
+}
+
+static int	/* fill dp for top-level file i; 1 on success, -1 if there is no such file */
+ip1gen(Chan *c, int i, Dir *dp)
+{
+	Qid q;
+	char *p;
+	int prot;
+	int len = 0;
+	Fs *f;
+	extern ulong	kerndate;
+
+	f = ipfs[c->devno];
+
+	prot = 0666;	/* default mode, overridden per file below */
+	mkqid(&q, QID(0, 0, i), 0, QTFILE);
+	switch(i) {
+	default:
+		return -1;
+	case Qarp:
+		p = "arp";
+		prot = 0664;
+		break;
+	case Qbootp:
+		p = "bootp";
+		break;
+	case Qndb:
+		p = "ndb";
+		len = strlen(f->ndb);
+		q.vers = f->ndbvers;
+		break;
+	case Qiproute:
+		p = "iproute";
+		prot = 0664;
+		break;
+	case Qipselftab:
+		p = "ipselftab";
+		prot = 0444;
+		break;
+	case Qlog:
+		p = "log";
+		break;
+	}
+	devdir(c, q, p, len, network, prot, dp);
+	if(i == Qndb && f->ndbmtime > kerndate)	/* report ndb's own mtime when it is later than kerndate */
+		dp->mtime = f->ndbmtime;
+	return 1;
+}
+
+static int	/* generator handed to devwalk, devstat and devdirread: fill dp for entry s of the directory c is in */
+ipgen(Chan *c, char*, Dirtab*, int, int s, Dir *dp)
+{
+	Qid q;
+	Conv *cv;
+	Fs *f;
+
+	f = ipfs[c->devno];
+
+	switch(TYPE(c->qid)) {
+	case Qtopdir:
+		if(s == DEVDOTDOT){
+			mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
+			sprint(up->genbuf, "#I%ud", c->devno);
+			devdir(c, q, up->genbuf, 0, network, 0555, dp);
+			return 1;
+		}
+		if(s < f->np) {	/* the protocol directories come first */
+			if(f->p[s]->connect == nil)
+				return 0;	/* protocol with no user interface */
+			mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
+			devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
+			return 1;
+		}
+		s -= f->np;	/* then the top-level files */
+		return ip1gen(c, s+Qtopbase, dp);
+	case Qarp:
+	case Qbootp:
+	case Qndb:
+	case Qlog:
+	case Qiproute:
+	case Qipselftab:
+		return ip1gen(c, TYPE(c->qid), dp);
+	case Qprotodir:
+		if(s == DEVDOTDOT){
+			mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
+			sprint(up->genbuf, "#I%ud", c->devno);
+			devdir(c, q, up->genbuf, 0, network, 0555, dp);
+			return 1;
+		}
+		if(s < f->p[PROTO(c->qid)]->ac) {	/* the conversation directories come first */
+			cv = f->p[PROTO(c->qid)]->conv[s];
+			sprint(up->genbuf, "%d", s);
+			mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
+			devdir(c, q, up->genbuf, 0, cv->owner, 0555, dp);
+			return 1;
+		}
+		s -= f->p[PROTO(c->qid)]->ac;	/* then clone and stats */
+		return ip2gen(c, s+Qprotobase, dp);
+	case Qclone:
+	case Qstats:
+		return ip2gen(c, TYPE(c->qid), dp);
+	case Qconvdir:
+		if(s == DEVDOTDOT){
+			s = PROTO(c->qid);
+			mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
+			devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
+			return 1;
+		}
+		return ip3gen(c, s+Qconvbase, dp);
+	case Qctl:
+	case Qdata:
+	case Qerr:
+	case Qlisten:
+	case Qlocal:
+	case Qremote:
+	case Qstatus:
+	case Qsnoop:
+		return ip3gen(c, TYPE(c->qid), dp);
+	}
+	return -1;
+}
+
+/*
+ *  link in the null and pkt media and install eipfmt as the
+ *  print formatter for the verbs i, I, E, V and M
+ */
+static void
+ipreset(void)
+{
+	static char verbs[] = "iIEVM";
+	char *v;
+
+	nullmediumlink();
+	pktmediumlink();
+
+	for(v = verbs; *v != 0; v++)
+		fmtinstall(*v, eipfmt);
+}
+
+/*
+ *  return the Fs for attach number dev, creating and initialising
+ *  it on first use.  returns nil when dev is not a valid index
+ *  into ipfs[] (negative, or Nfs and above).
+ */
+static Fs*
+ipgetfs(int dev)
+{
+	extern void (*ipprotoinit[])(Fs*);
+	Fs *f;
+	int i;
+
+	/* dev is signed: a negative value would index in front of ipfs[] */
+	if(dev < 0 || dev >= Nfs)
+		return nil;
+
+	qlock(&fslock);
+	if(ipfs[dev] == nil){
+		f = smalloc(sizeof(Fs));
+		ip_init(f);
+		arpinit(f);
+		netloginit(f);
+		for(i = 0; ipprotoinit[i]; i++)
+			ipprotoinit[i](f);
+		f->dev = dev;
+		ipfs[dev] = f;
+	}
+	qunlock(&fslock);
+
+	return ipfs[dev];
+}
+
+/*
+ *  allocate an IPaux holding a copy of owner and of tag.
+ *  a->tag is a fixed-width field: it is padded with spaces and is
+ *  not NUL terminated when tag fills it.  tag may itself be such a
+ *  field (ipwalk passes a->tag back in), so its length is measured
+ *  without reading more than sizeof(a->tag) bytes of it.
+ */
+IPaux*
+newipaux(char *owner, char *tag)
+{
+	IPaux *a;
+	int n;
+
+	a = smalloc(sizeof(*a));
+	kstrdup(&a->owner, owner);
+	memset(a->tag, ' ', sizeof(a->tag));
+	for(n = 0; n < sizeof(a->tag) && tag[n] != 0; n++)
+		;
+	memmove(a->tag, tag, n);
+	return a;
+}
+
+#define ATTACHER(c) (((IPaux*)((c)->aux))->owner)
+
+/*
+ *  attach to the IP stack numbered by spec.  raises
+ *  "bad specification" unless the number is in the range 0..Nfs-1.
+ */
+static Chan*
+ipattach(char* spec)
+{
+	Chan *c;
+	int devno;
+
+	devno = atoi(spec);
+	/* a negative number would index in front of ipfs[] */
+	if(devno < 0 || devno >= Nfs)
+		error("bad specification");
+
+	ipgetfs(devno);
+	c = devattach('I', spec);
+	mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
+	c->devno = devno;
+
+	c->aux = newipaux(commonuser(), "none");
+
+	return c;
+}
+
+/*
+ *  walk with devwalk; a resulting clone gets its own IPaux,
+ *  copied from the one on c
+ */
+static Walkqid*
+ipwalk(Chan* c, Chan *nc, char **name, int nname)
+{
+	IPaux *src;
+	Walkqid *wq;
+
+	src = c->aux;
+	wq = devwalk(c, nc, name, nname, nil, 0, ipgen);
+	if(wq == nil || wq->clone == nil)
+		return wq;
+	wq->clone->aux = newipaux(src->owner, src->tag);
+	return wq;
+}
+
+static long	/* stat through devstat, using ipgen to produce the entry */
+ipstat(Chan* c, uchar* db, long n)
+{
+	return devstat(c, db, n, nil, 0, ipgen);
+}
+
+/*
+ *  sleep condition used when listening: true once the
+ *  conversation has an incoming call queued
+ */
+static int
+incoming(void* arg)
+{
+	return ((Conv*)arg)->incall != nil;
+}
+
+static int m2p[] = {	/* open mode -> permission bits (4 read, 2 write); three entries only */
+	[OREAD]		4,
+	[OWRITE]	2,
+	[ORDWR]		6
+};
+
+/*
+ *  open the file c refers to with mode omode.
+ *	ndb, iproute, arp	writable by eve only
+ *	directories, status, remote, local, stats, bootp, ipselftab
+ *				read-only
+ *	snoop			read-only; owner of the conversation or eve
+ *	clone			allocates a conversation and turns c into
+ *				its ctl file
+ *	data, ctl, err		permission checked against the conversation,
+ *				whose use count is incremented
+ *	listen			waits for an incoming call and turns c into
+ *				the ctl file of the new conversation
+ *  raises an error on failure; returns c.
+ */
+static Chan*
+ipopen(Chan* c, int omode)
+{
+	Conv *cv, *nc;
+	Proto *p;
+	int perm;
+	Fs *f;
+
+	/*
+	 * m2p only has entries for OREAD, OWRITE and ORDWR.  the
+	 * remaining access mode (omode&3 == 3) would read one past
+	 * its end, so give it the execute permission bit instead.
+	 */
+	if((omode&3) < sizeof(m2p)/sizeof(m2p[0]))
+		perm = m2p[omode&3];
+	else
+		perm = 1;
+
+	f = ipfs[c->devno];
+
+	switch(TYPE(c->qid)) {
+	default:
+		break;
+	case Qndb:
+		if(omode & (OWRITE|OTRUNC) && !iseve())
+			error(Eperm);
+		if((omode & (OWRITE|OTRUNC)) == (OWRITE|OTRUNC))
+			f->ndb[0] = 0;
+		break;
+	case Qlog:
+		netlogopen(f);
+		break;
+	case Qiproute:
+	case Qarp:
+		if(omode != OREAD && !iseve())
+			error(Eperm);
+		break;
+	case Qtopdir:
+	case Qprotodir:
+	case Qconvdir:
+	case Qstatus:
+	case Qremote:
+	case Qlocal:
+	case Qstats:
+	case Qbootp:
+	case Qipselftab:
+		if(omode != OREAD)
+			error(Eperm);
+		break;
+	case Qsnoop:
+		if(omode != OREAD)
+			error(Eperm);
+		p = f->p[PROTO(c->qid)];
+		cv = p->conv[CONV(c->qid)];
+		if(strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
+			error(Eperm);
+		incref(&cv->snoopers);
+		break;
+	case Qclone:
+		p = f->p[PROTO(c->qid)];
+		qlock(p);
+		if(waserror()){
+			qunlock(p);
+			nexterror();
+		}
+		cv = Fsprotoclone(p, ATTACHER(c));
+		qunlock(p);
+		poperror();
+		if(cv == nil) {
+			error(Enodev);
+			break;
+		}
+		/* the channel now names the new conversation's ctl file */
+		mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
+		break;
+	case Qdata:
+	case Qctl:
+	case Qerr:
+		p = f->p[PROTO(c->qid)];
+		qlock(p);
+		cv = p->conv[CONV(c->qid)];
+		qlock(cv);
+		if(waserror()) {
+			qunlock(cv);
+			qunlock(p);
+			nexterror();
+		}
+		/* not allowed by the owner bits: must be the owner and pass the other bits */
+		if((perm & (cv->perm>>6)) != perm) {
+			if(strcmp(ATTACHER(c), cv->owner) != 0)
+				error(Eperm);
+			if((perm & cv->perm) != perm)
+				error(Eperm);
+
+		}
+		cv->inuse++;
+		if(cv->inuse == 1){
+			/* first user takes ownership */
+			kstrdup(&cv->owner, ATTACHER(c));
+			cv->perm = 0660;
+		}
+		qunlock(cv);
+		qunlock(p);
+		poperror();
+		break;
+	case Qlisten:
+		cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
+		if((perm & (cv->perm>>6)) != perm) {
+			if(strcmp(ATTACHER(c), cv->owner) != 0)
+				error(Eperm);
+			if((perm & cv->perm) != perm)
+				error(Eperm);
+
+		}
+
+		if(cv->state != Announced)
+			error("not announced");
+
+		if(waserror()){
+			closeconv(cv);
+			nexterror();
+		}
+		/* hold a use of cv while waiting; dropped by closeconv below */
+		qlock(cv);
+		cv->inuse++;
+		qunlock(cv);
+
+		nc = nil;
+		while(nc == nil) {
+			/* give up if we got a hangup */
+			if(qisclosed(cv->rq))
+				error("listen hungup");
+
+			qlock(&cv->listenq);
+			if(waserror()) {
+				qunlock(&cv->listenq);
+				nexterror();
+			}
+
+			/* wait for a connect */
+			sleep(&cv->listenr, incoming, cv);
+
+			qlock(cv);
+			nc = cv->incall;
+			if(nc != nil){
+				cv->incall = nc->next;
+				mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
+				kstrdup(&cv->owner, ATTACHER(c));
+			}
+			qunlock(cv);
+
+			qunlock(&cv->listenq);
+			poperror();
+		}
+		closeconv(cv);
+		poperror();
+		break;
+	}
+	c->mode = openmode(omode);
+	c->flag |= COPEN;
+	c->offset = 0;
+	return c;
+}
+
+static void	/* files cannot be created here: always raises Eperm */
+ipcreate(Chan*, char*, int, int)
+{
+	error(Eperm);
+}
+
+static void	/* files cannot be removed here: always raises Eperm */
+ipremove(Chan*)
+{
+	error(Eperm);
+}
+
+static long	/* wstat, allowed on ctl and data files only: sets the conversation's owner and mode */
+ipwstat(Chan *c, uchar *dp, long n)
+{
+	Dir d;
+	Conv *cv;
+	Fs *f;
+	Proto *p;
+
+	f = ipfs[c->devno];
+	switch(TYPE(c->qid)) {
+	default:
+		error(Eperm);
+		break;
+	case Qctl:
+	case Qdata:
+		break;
+	}
+
+	n = convM2D(dp, n, &d, nil);
+	if(n > 0){
+		p = f->p[PROTO(c->qid)];
+		cv = p->conv[CONV(c->qid)];
+		if(!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)	/* only eve or the owner */
+			error(Eperm);
+		if(d.uid[0])
+			kstrdup(&cv->owner, d.uid);
+		cv->perm = d.mode & 0777;
+	}
+	return n;
+}
+
+void	/* drop one use of cv; when the count reaches zero the conversation is shut down */
+closeconv(Conv *cv)
+{
+	Conv *nc;
+	Ipmulti *mp;
+
+	qlock(cv);
+
+	if(--cv->inuse > 0) {	/* other users remain */
+		qunlock(cv);
+		return;
+	}
+
+	/* close all incoming calls since no listen will ever happen */
+	for(nc = cv->incall; nc; nc = cv->incall){
+		cv->incall = nc->next;
+		closeconv(nc);
+	}
+	cv->incall = nil;
+
+	kstrdup(&cv->owner, network);	/* ownership reverts to "network" */
+	cv->perm = 0660;
+
+	while((mp = cv->multi) != nil)
+		ipifcremmulti(cv, mp->ma, mp->ia);
+
+	cv->r = nil;
+	cv->rgen = 0;
+	cv->p->close(cv);	/* protocol-specific close */
+	cv->state = Idle;
+	qunlock(cv);
+}
+
+static void	/* undo what ipopen did for an opened file, then free the channel's IPaux */
+ipclose(Chan* c)
+{
+	Fs *f;
+
+	f = ipfs[c->devno];
+	switch(TYPE(c->qid)) {
+	default:
+		break;
+	case Qlog:
+		if(c->flag & COPEN)
+			netlogclose(f);
+		break;
+	case Qdata:
+	case Qctl:
+	case Qerr:
+		if(c->flag & COPEN)	/* only opened channels hold a use of the conversation */
+			closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
+		break;
+	case Qsnoop:
+		if(c->flag & COPEN)
+			decref(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
+		break;
+	}
+	free(((IPaux*)c->aux)->owner);
+	free(c->aux);
+}
+
+enum
+{
+	Statelen=	32*1024,	/* size of the scratch buffers allocated in ipread */
+};
+
+static long	/* read from the file ch refers to; dispatches on the file type in ch->qid */
+ipread(Chan *ch, void *a, long n, vlong off)
+{
+	Conv *c;
+	Proto *x;
+	char *buf, *p;
+	long offset, rv;
+	Fs *f;
+
+	f = ipfs[ch->devno];
+
+	p = a;
+	offset = off;
+	switch(TYPE(ch->qid)) {
+	default:
+		error(Eperm);
+	case Qtopdir:
+	case Qprotodir:
+	case Qconvdir:
+		return devdirread(ch, a, n, 0, 0, ipgen);
+	case Qarp:
+		return arpread(f->arp, a, offset, n);
+ 	case Qbootp:
+ 		return bootpread(a, offset, n);
+ 	case Qndb:
+		return readstr(offset, a, n, f->ndb);
+	case Qiproute:
+		return routeread(f, a, offset, n);
+	case Qipselftab:
+		return ipselftabread(f, a, offset, n);
+	case Qlog:
+		return netlogread(f, a, offset, n);
+	case Qctl:	/* reading ctl yields the conversation number */
+		buf = smalloc(16);
+		sprint(buf, "%lud", CONV(ch->qid));
+		rv = readstr(offset, p, n, buf);
+		free(buf);
+		return rv;
+	case Qremote:
+		buf = smalloc(Statelen);
+		x = f->p[PROTO(ch->qid)];
+		c = x->conv[CONV(ch->qid)];
+		if(x->remote == nil) {	/* default format unless the protocol supplies its own */
+			sprint(buf, "%I!%d\n", c->raddr, c->rport);
+		} else {
+			(*x->remote)(c, buf, Statelen-2);
+		}
+		rv = readstr(offset, p, n, buf);
+		free(buf);
+		return rv;
+	case Qlocal:
+		buf = smalloc(Statelen);
+		x = f->p[PROTO(ch->qid)];
+		c = x->conv[CONV(ch->qid)];
+		if(x->local == nil) {
+			sprint(buf, "%I!%d\n", c->laddr, c->lport);
+		} else {
+			(*x->local)(c, buf, Statelen-2);
+		}
+		rv = readstr(offset, p, n, buf);
+		free(buf);
+		return rv;
+	case Qstatus:
+		buf = smalloc(Statelen);
+		x = f->p[PROTO(ch->qid)];
+		c = x->conv[CONV(ch->qid)];
+		(*x->state)(c, buf, Statelen-2);
+		rv = readstr(offset, p, n, buf);
+		free(buf);
+		return rv;
+	case Qdata:	/* data, err and snoop read from the conversation's queues */
+		c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
+		return qread(c->rq, a, n);
+	case Qerr:
+		c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
+		return qread(c->eq, a, n);
+	case Qsnoop:
+		c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
+		return qread(c->sq, a, n);
+	case Qstats:
+		x = f->p[PROTO(ch->qid)];
+		if(x->stats == nil)
+			error("stats not implemented");
+		buf = smalloc(Statelen);
+		(*x->stats)(x, buf, Statelen);
+		rv = readstr(offset, p, n, buf);
+		free(buf);
+		return rv;
+	}
+}
+
+/*
+ *  block read: a data file is read straight from the conversation's
+ *  read queue, everything else goes through devbread
+ */
+static Block*
+ipbread(Chan* ch, long n, vlong offset)
+{
+	Conv *cv;
+
+	if(TYPE(ch->qid) != Qdata)
+		return devbread(ch, n, offset);
+
+	cv = ipfs[ch->devno]->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
+	return qbread(cv->rq, n);
+}
+
+/*
+ *  set local address to be that of the ifc closest to remote address
+ *  (the choice itself is made by findlocalip)
+ */
+static void
+setladdr(Conv* c)
+{
+	findlocalip(c->p->f, c->laddr, c->raddr);
+}
+
+/*
+ *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
+ *  returns nil on success, or the string "address in use" when another
+ *  Connected or Announced conversation of the protocol has the same quad.
+ *  the protocol is qlocked during the search.
+ */
+char*
+setluniqueport(Conv* c, int lport)
+{
+	Proto *p;
+	Conv *xp;
+	int x;
+
+	p = c->p;
+
+	qlock(p);
+	for(x = 0; x < p->nc; x++){
+		xp = p->conv[x];
+		if(xp == nil)	/* the scan stops at the first empty slot */
+			break;
+		if(xp == c)
+			continue;
+		if((xp->state == Connected || xp->state == Announced)
+		&& xp->lport == lport
+		&& xp->rport == c->rport
+		&& ipcmp(xp->raddr, c->raddr) == 0
+		&& ipcmp(xp->laddr, c->laddr) == 0){
+			qunlock(p);
+			return "address in use";
+		}
+	}
+	c->lport = lport;
+	qunlock(p);
+	return nil;
+}
+
+
+/*
+ *  pick a local port and set it
+ *  the candidate comes from the protocol's nextrport (restricted
+ *  conversations) or nextport counter, which is stepped until no
+ *  conversation of the protocol uses that port.  the protocol is
+ *  qlocked throughout.
+ */
+void
+setlport(Conv* c)
+{
+	Proto *p;
+	ushort *pp;
+	int x, found;
+
+	p = c->p;
+	if(c->restricted)
+		pp = &p->nextrport;
+	else
+		pp = &p->nextport;
+	qlock(p);
+	for(;;(*pp)++){
+		/*
+		 * Fsproto initialises p->nextport to 0 and the restricted
+		 * ports (p->nextrport) to 600.
+		 * Restricted ports must lie between 600 and 1024.
+		 * For the initial condition or if the unrestricted port number
+		 * has wrapped round, select a random port between 5000 and 1<<15
+		 * to start at.
+		 */
+		if(c->restricted){
+			if(*pp >= 1024)
+				*pp = 600;
+		}
+		else while(*pp < 5000)
+			*pp = nrand(1<<15);
+
+		found = 0;
+		for(x = 0; x < p->nc; x++){
+			if(p->conv[x] == nil)
+				break;
+			if(p->conv[x]->lport == *pp){
+				found = 1;
+				break;
+			}
+		}
+		if(!found)
+			break;
+	}
+	c->lport = (*pp)++;	/* take the port and step the counter past it */
+	qunlock(p);
+}
+
+/*
+ *  set a local address and port from a string of the form
+ *	[address!]port[!r]
+ *  str is split in place at the first '!'.  returns nil on success
+ *  or an error string; raises Eperm when a process other than eve
+ *  announces port "*".
+ */
+char*
+setladdrport(Conv* c, char* str, int announcing)
+{
+	char *p;
+	char *rv;
+	ushort lport;
+	uchar addr[IPaddrlen];
+
+	rv = nil;
+
+	/*
+	 *  ignore restricted part if it exists.  it's
+	 *  meaningless on local ports.
+	 */
+	p = strchr(str, '!');
+	if(p != nil){
+		*p++ = 0;
+		if(strcmp(p, "r") == 0)
+			p = nil;
+	}
+
+	c->lport = 0;
+	if(p == nil){	/* no address part: str is the port */
+		if(announcing)
+			ipmove(c->laddr, IPnoaddr);
+		else
+			setladdr(c);
+		p = str;
+	} else {
+		if(strcmp(str, "*") == 0)
+			ipmove(c->laddr, IPnoaddr);
+		else {
+			parseip(addr, str);
+			if(ipforme(c->p->f, addr))
+				ipmove(c->laddr, addr);
+			else
+				return "not a local IP address";
+		}
+	}
+
+	/* one process can get all connections */
+	if(announcing && strcmp(p, "*") == 0){
+		if(!iseve())
+			error(Eperm);
+		return setluniqueport(c, 0);
+	}
+
+	lport = atoi(p);
+	if(lport <= 0)	/* no port given: pick one */
+		setlport(c);
+	else
+		rv = setluniqueport(c, lport);
+	return rv;
+}
+
+/*
+ *  set c's remote address and port from "address!port[!r]".
+ *  a "!r" following the port marks the conversation restricted.
+ *  str is modified in place.  returns nil, or an error string
+ *  when str contains no '!'.
+ */
+static char*
+setraddrport(Conv* c, char* str)
+{
+	char *port, *rest;
+
+	port = strchr(str, '!');
+	if(port == nil)
+		return "malformed address";
+	*port++ = 0;
+
+	parseip(c->raddr, str);
+	c->rport = atoi(port);
+
+	rest = strchr(port, '!');
+	if(rest != nil && strstr(rest, "!r") != nil)
+		c->restricted = 1;
+	return nil;
+}
+
+/*
+ *  called by protocol connect routine to set addresses.
+ *  argv[1] is the remote address!port; an optional argv[2] is the
+ *  local [address!]port.  also chooses c->ipversion.
+ *  returns nil on success or an error string.
+ */
+char*
+Fsstdconnect(Conv *c, char *argv[], int argc)
+{
+	char *err;
+	int bothv4;
+
+	if(argc != 2 && argc != 3)
+		return "bad args to connect";
+
+	err = setraddrport(c, argv[1]);
+	if(err != nil)
+		return err;
+
+	if(argc == 2){
+		/* no local part supplied: choose address and port here */
+		setladdr(c);
+		setlport(c);
+	} else {
+		err = setladdrport(c, argv[2], 0);
+		if(err != nil)
+			return err;
+	}
+
+	/* V4 if both ends carry the v4 prefix, or if the remote is unset */
+	bothv4 = memcmp(c->raddr, v4prefix, IPv4off) == 0
+		&& memcmp(c->laddr, v4prefix, IPv4off) == 0;
+	if(bothv4 || ipcmp(c->raddr, IPnoaddr) == 0)
+		c->ipversion = V4;
+	else
+		c->ipversion = V6;
+
+	return nil;
+}
+/*
+ *  initiate connection and sleep till it's set up.
+ *  connected() is the sleep condition: true once the
+ *  conversation is in the Connected state.
+ */
+static int
+connected(void* a)
+{
+	Conv *c;
+
+	c = a;
+	return c->state == Connected;
+}
+/*
+ *  handle a "connect" control request: pass the fields to the
+ *  protocol's connect routine, then sleep until connected() holds.
+ *  ipwrite calls this with c qlock'd; the lock is dropped for the
+ *  sleep and is held again on every way out, including error().
+ */
+static void
+connectctlmsg(Proto *x, Conv *c, Cmdbuf *cb)
+{
+	char *p;
+
+	if(c->state != 0)
+		error(Econinuse);
+	c->state = Connecting;
+	c->cerr[0] = '\0';
+	if(x->connect == nil)
+		error("connect not supported");
+	p = x->connect(c, cb->f, cb->nf);
+	if(p != nil)
+		error(p);
+
+	/* let go of c while asleep; retake it if the sleep raises an error */
+	qunlock(c);
+	if(waserror()){
+		qlock(c);
+		nexterror();
+	}
+	sleep(&c->cr, connected, c);
+	qlock(c);
+	poperror();
+
+	/* a non-empty cerr (see Fsconnected) reports why the connect failed */
+	if(c->cerr[0] != '\0')
+		error(c->cerr);
+}
+
+/*
+ *  called by protocol announce routine to set addresses.
+ *  clears the remote address and port, then sets the local side
+ *  from argv[1].  returns nil on success or an error string.
+ */
+char*
+Fsstdannounce(Conv* c, char* argv[], int argc)
+{
+	/* the remote end is cleared whatever the argument count */
+	memset(c->raddr, 0, sizeof(c->raddr));
+	c->rport = 0;
+
+	if(argc != 2)
+		return "bad args to announce";
+	return setladdrport(c, argv[1], 1);
+}
+
+/*
+ *  initiate announcement and sleep till it's set up.
+ *  announced() is the sleep condition: true once the
+ *  conversation is in the Announced state.
+ */
+static int
+announced(void* a)
+{
+	Conv *c;
+
+	c = a;
+	return c->state == Announced;
+}
+/*
+ *  handle an "announce" control request: pass the fields to the
+ *  protocol's announce routine, then sleep until announced() holds.
+ *  ipwrite calls this with c qlock'd; the lock is dropped for the
+ *  sleep and is held again on every way out, including error().
+ */
+static void
+announcectlmsg(Proto *x, Conv *c, Cmdbuf *cb)
+{
+	char *p;
+
+	if(c->state != 0)
+		error(Econinuse);
+	c->state = Announcing;
+	c->cerr[0] = '\0';
+	if(x->announce == nil)
+		error("announce not supported");
+	p = x->announce(c, cb->f, cb->nf);
+	if(p != nil)
+		error(p);
+
+	/* let go of c while asleep; retake it if the sleep raises an error */
+	qunlock(c);
+	if(waserror()){
+		qlock(c);
+		nexterror();
+	}
+	sleep(&c->cr, announced, c);
+	qlock(c);
+	poperror();
+
+	/* a non-empty cerr (see Fsconnected) reports why the announce failed */
+	if(c->cerr[0] != '\0')
+		error(c->cerr);
+}
+
+/*
+ *  called by protocol bind routine to set addresses.
+ *  sets the local address and port from argv[1] without announcing.
+ *  returns nil on success or an error string.
+ */
+char*
+Fsstdbind(Conv* c, char* argv[], int argc)
+{
+	if(argc != 2)
+		return "bad args to bind";
+	return setladdrport(c, argv[1], 0);
+}
+
+/*
+ *  handle a "bind" control request, using the protocol's own bind
+ *  routine when it has one and Fsstdbind otherwise.  an error
+ *  string from either is raised with error().
+ */
+static void
+bindctlmsg(Proto *x, Conv *c, Cmdbuf *cb)
+{
+	char *err;
+
+	if(x->bind != nil)
+		err = x->bind(c, cb->f, cb->nf);
+	else
+		err = Fsstdbind(c, cb->f, cb->nf);
+	if(err != nil)
+		error(err);
+}
+
+/*
+ *  handle a "tos" control request: set c->tos from the first
+ *  argument, or to 0 when none is given
+ */
+static void
+tosctlmsg(Conv *c, Cmdbuf *cb)
+{
+	if(cb->nf >= 2){
+		c->tos = atoi(cb->f[1]);
+		return;
+	}
+	c->tos = 0;
+}
+
+/*
+ *  handle a "ttl" control request: set c->ttl from the first
+ *  argument, or to MAXTTL when none is given
+ */
+static void
+ttlctlmsg(Conv *c, Cmdbuf *cb)
+{
+	if(cb->nf >= 2){
+		c->ttl = atoi(cb->f[1]);
+		return;
+	}
+	c->ttl = MAXTTL;
+}
+
+/*
+ *  write entry point of the ip device.  dispatches on the kind of
+ *  file behind ch:
+ *	Qdata	queue the bytes on the conversation's write queue
+ *	Qarp, Qiproute, Qlog, Qndb	hand the text to the matching writer
+ *	Qctl	parse and execute one control request
+ *  any other file type raises Eperm.  returns n (or what the
+ *  delegated writer returns); failures are raised with error().
+ */
+static long
+ipwrite(Chan* ch, void *v, long n, vlong off)
+{
+	Conv *c;
+	Proto *x;
+	char *p;
+	Cmdbuf *cb;
+	uchar ia[IPaddrlen], ma[IPaddrlen];
+	Fs *f;
+	char *a;
+	ulong offset = off;
+
+	a = v;
+	f = ipfs[ch->devno];
+
+	switch(TYPE(ch->qid)){
+	default:
+		error(Eperm);
+	case Qdata:
+		x = f->p[PROTO(ch->qid)];
+		c = x->conv[CONV(ch->qid)];
+
+		/* a conversation without a write queue cannot be written */
+		if(c->wq == nil)
+			error(Eperm);
+
+		qwrite(c->wq, a, n);
+		break;
+	case Qarp:
+		return arpwrite(f, a, n);
+	case Qiproute:
+		return routewrite(f, ch, a, n);
+	case Qlog:
+		netlogctl(f, a, n);
+		return n;
+	case Qndb:
+		return ndbwrite(f, a, offset, n);
+		break;
+	case Qctl:
+		x = f->p[PROTO(ch->qid)];
+		c = x->conv[CONV(ch->qid)];
+		cb = parsecmd(a, n);
+
+		/* c stays locked for the whole request; the error path undoes lock and cb */
+		qlock(c);
+		if(waserror()) {
+			qunlock(c);
+			free(cb);
+			nexterror();
+		}
+		if(cb->nf < 1)
+			error("short control request");
+		if(strcmp(cb->f[0], "connect") == 0)
+			connectctlmsg(x, c, cb);
+		else if(strcmp(cb->f[0], "announce") == 0)
+			announcectlmsg(x, c, cb);
+		else if(strcmp(cb->f[0], "bind") == 0)
+			bindctlmsg(x, c, cb);
+		else if(strcmp(cb->f[0], "ttl") == 0)
+			ttlctlmsg(c, cb);
+		else if(strcmp(cb->f[0], "tos") == 0)
+			tosctlmsg(c, cb);
+		else if(strcmp(cb->f[0], "ignoreadvice") == 0)
+			c->ignoreadvice = 1;
+		else if(strcmp(cb->f[0], "addmulti") == 0){
+			/*
+			 * "addmulti ifcaddr" joins the conversation's remote
+			 * address; "addmulti ifcaddr maddr" joins maddr.
+			 */
+			if(cb->nf < 2)
+				error("addmulti needs interface address");
+			if(cb->nf == 2){
+				if(!ipismulticast(c->raddr))
+					error("addmulti for a non multicast address");
+				parseip(ia, cb->f[1]);
+				ipifcaddmulti(c, c->raddr, ia);
+			} else {
+				parseip(ma, cb->f[2]);
+				if(!ipismulticast(ma))
+					error("addmulti for a non multicast address");
+				parseip(ia, cb->f[1]);
+				ipifcaddmulti(c, ma, ia);
+			}
+		} else if(strcmp(cb->f[0], "remmulti") == 0){
+			/* "remmulti ifcaddr" always refers to the remote address */
+			if(cb->nf < 2)
+				error("remmulti needs interface address");
+			if(!ipismulticast(c->raddr))
+				error("remmulti for a non multicast address");
+			parseip(ia, cb->f[1]);
+			ipifcremmulti(c, c->raddr, ia);
+		} else if(x->ctl != nil) {
+			/* anything else goes to the protocol's own ctl routine */
+			p = x->ctl(c, cb->f, cb->nf);
+			if(p != nil)
+				error(p);
+		} else
+			error("unknown control request");
+		qunlock(c);
+		free(cb);
+		poperror();
+	}
+	return n;
+}
+
+/*
+ *  block-write entry point of the ip device.  a data file gets the
+ *  block (flattened to a single Block first) queued on the
+ *  conversation's write queue; every other file goes to devbwrite.
+ *  returns the number of bytes queued.
+ */
+static long
+ipbwrite(Chan* ch, Block* bp, vlong offset)
+{
+	Conv *c;
+	Proto *x;
+	Fs *f;
+	int n;
+
+	if(TYPE(ch->qid) != Qdata)
+		return devbwrite(ch, bp, offset);
+
+	f = ipfs[ch->devno];
+	x = f->p[PROTO(ch->qid)];
+	c = x->conv[CONV(ch->qid)];
+	if(c->wq == nil)
+		error(Eperm);
+
+	if(bp->next)
+		bp = concatblock(bp);
+	/* take the length first: bp is handed over to qbwrite */
+	n = BLEN(bp);
+	qbwrite(c->wq, bp);
+	return n;
+}
+
+/*
+ *  device table entry for the ip device: device character 'I',
+ *  name "ip".  init and shutdown use the generic devinit and
+ *  devshutdown; the remaining slots are the ip* routines of this file.
+ */
+Dev ipdevtab = {
+	'I',
+	"ip",
+
+	ipreset,
+	devinit,
+	devshutdown,
+	ipattach,
+	ipwalk,
+	ipstat,
+	ipopen,
+	ipcreate,
+	ipclose,
+	ipread,
+	ipbread,
+	ipwrite,
+	ipbwrite,
+	ipremove,
+	ipwstat,
+};
+
+/*
+ *  register protocol p with file system f.
+ *  returns 0 on success, -1 when f already holds Maxproto
+ *  protocols or p's IP protocol number is already taken.
+ *  panics if the conversation table cannot be allocated.
+ */
+int
+Fsproto(Fs *f, Proto *p)
+{
+	if(f->np >= Maxproto)
+		return -1;
+
+	p->f = f;
+
+	/* claim the IP protocol number, if the protocol has one */
+	if(p->ipproto > 0){
+		if(f->t2p[p->ipproto] != nil)
+			return -1;
+		f->t2p[p->ipproto] = p;
+	}
+
+	/* nc conversations plus one spare slot at the end of the table */
+	p->conv = malloc(sizeof(Conv*)*(p->nc+1));
+	if(p->conv == nil)
+		panic("Fsproto");
+
+	p->qid.type = QTDIR;
+	p->qid.path = QID(f->np, 0, Qprotodir);
+
+	/* setlport relies on these starting values */
+	p->nextport = 0;
+	p->nextrport = 600;
+
+	p->x = f->np;
+	f->p[f->np] = p;
+	f->np++;
+
+	return 0;
+}
+
+/*
+ *  return true if a protocol has been registered
+ *  (see Fsproto) for IP protocol number proto
+ */
+int
+Fsbuiltinproto(Fs* f, uchar proto)
+{
+	if(f->t2p[proto] == nil)
+		return 0;
+	return 1;
+}
+
+/*
+ *  called with protocol locked
+ *
+ *  find or create a free conversation of protocol p, reset it and
+ *  give it to user.  returns the conversation (unlocked), or nil
+ *  when every slot is busy and the protocol's gc routine, if any,
+ *  could not free one.  raises Enomem if an allocation fails.
+ */
+Conv*
+Fsprotoclone(Proto *p, char *user)
+{
+	Conv *c, **pp, **ep;
+
+retry:
+	c = nil;
+	ep = &p->conv[p->nc];
+	for(pp = p->conv; pp < ep; pp++) {
+		c = *pp;
+		if(c == nil){
+			/* first empty slot: build a new conversation in it */
+			c = malloc(sizeof(Conv));
+			if(c == nil)
+				error(Enomem);
+			/* taken locked, so both ways out of the loop hold c's lock */
+			qlock(c);
+			c->p = p;
+			c->x = pp - p->conv;
+			if(p->ptclsize != 0){
+				c->ptcl = malloc(p->ptclsize);
+				if(c->ptcl == nil) {
+					free(c);
+					error(Enomem);
+				}
+			}
+			*pp = c;
+			p->ac++;
+			c->eq = qopen(1024, Qmsg, 0, 0);
+			(*p->create)(c);
+			break;
+		}
+		/* existing conversation: reusable only if nobody holds it */
+		if(canqlock(c)){
+			/*
+			 *  make sure both processes and protocol
+			 *  are done with this Conv
+			 */
+			if(c->inuse == 0 && (p->inuse == nil || (*p->inuse)(c) == 0))
+				break;
+
+			qunlock(c);
+		}
+	}
+	if(pp >= ep) {
+		/* table full: let the protocol reclaim something and rescan */
+		if(p->gc != nil && (*p->gc)(p))
+			goto retry;
+		return nil;
+	}
+
+	/* reset the conversation to its initial state for the new owner */
+	c->inuse = 1;
+	kstrdup(&c->owner, user);
+	c->perm = 0660;
+	c->state = Idle;
+	ipmove(c->laddr, IPnoaddr);
+	ipmove(c->raddr, IPnoaddr);
+	c->r = nil;
+	c->rgen = 0;
+	c->lport = 0;
+	c->rport = 0;
+	c->restricted = 0;
+	c->ttl = MAXTTL;
+	qreopen(c->rq);
+	qreopen(c->wq);
+	qreopen(c->eq);
+
+	qunlock(c);
+	return c;
+}
+
+/*
+ *  called when a connect or announce has finished.
+ *  a non-empty msg is copied into c->cerr as the failure reason.
+ *  moves Announcing to Announced and Connecting to Connected, then
+ *  wakes whoever sleeps on c->cr.  always returns 0.
+ */
+int
+Fsconnected(Conv* c, char* msg)
+{
+	if(msg != nil && *msg != '\0')
+		strncpy(c->cerr, msg, ERRMAX-1);
+
+	/* any other state is left alone */
+	if(c->state == Announcing)
+		c->state = Announced;
+	else if(c->state == Connecting)
+		c->state = Connected;
+
+	wakeup(&c->cr);
+	return 0;
+}
+
+/*
+ *  protocol that should receive packets with IP protocol number
+ *  proto: f->ipmux when it is set, otherwise the registered protocol
+ */
+Proto*
+Fsrcvpcol(Fs* f, uchar proto)
+{
+	if(f->ipmux)
+		return f->ipmux;
+	return f->t2p[proto];
+}
+
+/*
+ *  like Fsrcvpcol, but always the registered protocol:
+ *  f->ipmux is not consulted
+ */
+Proto*
+Fsrcvpcolx(Fs *f, uchar proto)
+{
+	Proto *p;
+
+	p = f->t2p[proto];
+	return p;
+}
+
+/*
+ *  called with protocol locked
+ *
+ *  create a conversation for a call arriving on c, fill in its
+ *  addresses, append it to c's incoming-call list and wake the
+ *  listener.  returns the new conversation, or nil when Maxincall
+ *  calls are already queued or no conversation is free.
+ */
+Conv*
+Fsnewcall(Conv *c, uchar *raddr, ushort rport, uchar *laddr, ushort lport, uchar version)
+{
+	Conv *nc, **tail;
+	int pending;
+
+	qlock(c);
+
+	/* walk to the end of the incall list, counting as we go */
+	pending = 0;
+	tail = &c->incall;
+	while(*tail != nil){
+		pending++;
+		tail = &(*tail)->next;
+	}
+
+	/* find a free conversation, unless the backlog is already full */
+	nc = nil;
+	if(pending < Maxincall)
+		nc = Fsprotoclone(c->p, network);
+	if(nc == nil){
+		qunlock(c);
+		return nil;
+	}
+
+	ipmove(nc->raddr, raddr);
+	nc->rport = rport;
+	ipmove(nc->laddr, laddr);
+	nc->lport = lport;
+	nc->next = nil;
+	*tail = nc;
+	nc->state = Connected;
+	nc->ipversion = version;
+
+	qunlock(c);
+
+	wakeup(&c->listenr);
+
+	return nc;
+}
+
+/*
+ *  write n bytes from a into f->ndb at offset off, terminate the
+ *  text there, and bump the version and modification time.
+ *  raises Eio when off lies beyond the current text or the write
+ *  (with its terminating 0) would not fit in f->ndb.  returns n.
+ */
+long
+ndbwrite(Fs *f, char *a, ulong off, int n)
+{
+	if(off > strlen(f->ndb) || off+n >= sizeof(f->ndb))
+		error(Eio);
+
+	memmove(f->ndb+off, a, n);
+	f->ndb[off+n] = 0;
+
+	f->ndbvers++;
+	f->ndbmtime = seconds();
+	return n;
+}
+
+/*
+ *  conversation count scaled to the machine: Nchans, or four
+ *  times that on a cpu server with at least 128MB of pages
+ */
+ulong
+scalednconv(void)
+{
+	int big;
+
+	big = cpuserver && conf.npage*PGSZ >= 128*MB;
+	if(big)
+		return Nchans*4;
+	return Nchans;
+}

+ 794 - 0
sys/src/9/ip/ethermedium.c

@@ -0,0 +1,794 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+#include "ipv6.h"
+
+/*
+ *  the 14-byte ethernet header that etherbwrite puts
+ *  in front of every outgoing packet
+ */
+typedef struct Etherhdr Etherhdr;
+struct Etherhdr
+{
+	uchar	d[6];		/* destination mac address */
+	uchar	s[6];		/* source mac address (ifc->mac on output) */
+	uchar	t[2];		/* ether type: 0x0800 for v4, 0x86DD for v6 */
+};
+
+/* all-ones IP address; recvarp refuses to enter it in the arp cache */
+static uchar ipbroadcast[IPaddrlen] = {
+	0xff,0xff,0xff,0xff,
+	0xff,0xff,0xff,0xff,
+	0xff,0xff,0xff,0xff,
+	0xff,0xff,0xff,0xff,
+};
+
+/* all-ones mac address; recvarp refuses it as a sender hardware address */
+static uchar etherbroadcast[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+static void	etherread4(void *a);
+static void	etherread6(void *a);
+static void	etherbind(Ipifc *ifc, int argc, char **argv);
+static void	etherunbind(Ipifc *ifc);
+static void	etherbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip);
+static void	etheraddmulti(Ipifc *ifc, uchar *a, uchar *ia);
+static void	etherremmulti(Ipifc *ifc, uchar *a, uchar *ia);
+static Block*	multicastarp(Fs *f, Arpent *a, Medium*, uchar *mac);
+static void	sendarp(Ipifc *ifc, Arpent *a);
+static void	sendgarp(Ipifc *ifc, uchar*);
+static int	multicastea(uchar *ea, uchar *ip);
+static void	recvarpproc(void*);
+static void	resolveaddr6(Ipifc *ifc, Arpent *a);
+static void	etherpref2addr(uchar *pref, uchar *ea);
+
+/*
+ *  the three media registered by ethermediumlink.  they share every
+ *  routine and differ only in name and maximum transfer unit.
+ */
+
+/* "ether": maxtu 1514 */
+Medium ethermedium =
+{
+.name=		"ether",
+.hsize=		14,
+.mintu=		60,
+.maxtu=		1514,
+.maclen=	6,
+.bind=		etherbind,
+.unbind=	etherunbind,
+.bwrite=	etherbwrite,
+.addmulti=	etheraddmulti,
+.remmulti=	etherremmulti,
+.ares=		arpenter,
+.areg=		sendgarp,
+.pref2addr=	etherpref2addr,
+};
+
+/* "fbe": maxtu 4000 */
+Medium fbemedium =
+{
+.name=		"fbe",
+.hsize=		14,
+.mintu=		60,
+.maxtu=		4000,
+.maclen=	6,
+.bind=		etherbind,
+.unbind=	etherunbind,
+.bwrite=	etherbwrite,
+.addmulti=	etheraddmulti,
+.remmulti=	etherremmulti,
+.ares=		arpenter,
+.areg=		sendgarp,
+.pref2addr=	etherpref2addr,
+};
+
+/* "gbe": maxtu 9014 */
+Medium gbemedium =
+{
+.name=		"gbe",
+.hsize=		14,
+.mintu=		60,
+.maxtu=		9014,
+.maclen=	6,
+.bind=		etherbind,
+.unbind=	etherunbind,
+.bwrite=	etherbwrite,
+.addmulti=	etheraddmulti,
+.remmulti=	etherremmulti,
+.ares=		arpenter,
+.areg=		sendgarp,
+.pref2addr=	etherpref2addr,
+};
+
+/*
+ *  per-interface state of the ether medium, allocated by etherbind
+ *  and kept in ifc->arg.  the three Proc pointers are set by the
+ *  reader processes themselves and cleared when they exit;
+ *  etherunbind waits for all three to become 0.
+ */
+typedef struct	Etherrock Etherrock;
+struct Etherrock
+{
+	Fs	*f;		/* file system we belong to */
+	Proc	*arpp;		/* arp process */
+	Proc	*read4p;	/* reading process (v4)*/
+	Proc	*read6p;	/* reading process (v6)*/
+	Chan	*mchan4;	/* Data channel for v4 */
+	Chan	*achan;		/* Arp channel */
+	Chan	*cchan4;	/* Control channel for v4 */
+	Chan	*mchan6;	/* Data channel for v6 */
+	Chan	*cchan6;	/* Control channel for v6 */
+};
+
+/*
+ *  ether types and arp opcodes used when
+ *  building and parsing ethernet arp packets
+ */
+enum
+{
+	ETARP		= 0x0806,	/* ether type of arp packets */
+	ETIP4		= 0x0800,	/* ether type of IPv4, also the arp protocol type */
+	ETIP6		= 0x86DD,	/* ether type of IPv6 */
+	ARPREQUEST	= 1,		/* arp op: request */
+	ARPREPLY	= 2,		/* arp op: reply */
+};
+
+/*
+ *  an arp packet as it appears on the wire, ethernet header
+ *  included.  multi-byte fields are written with hnputs and
+ *  read with nhgets.
+ */
+typedef struct Etherarp Etherarp;
+struct Etherarp
+{
+	uchar	d[6];		/* ethernet destination */
+	uchar	s[6];		/* ethernet source */
+	uchar	type[2];	/* ether type, ETARP */
+	uchar	hrd[2];		/* hardware type, always written as 1 here */
+	uchar	pro[2];		/* protocol type, ETIP4 */
+	uchar	hln;		/* hardware address length, sizeof(sha) */
+	uchar	pln;		/* protocol address length, sizeof(spa) */
+	uchar	op[2];		/* ARPREQUEST or ARPREPLY */
+	uchar	sha[6];		/* sender hardware address */
+	uchar	spa[4];		/* sender IPv4 address */
+	uchar	tha[6];		/* target hardware address */
+	uchar	tpa[4];		/* target IPv4 address */
+};
+
+/* control request etherbind writes to both ether ctl channels */
+static char *nbmsg = "nonblocking";
+
+/*
+ *  called to bind an IP ifc to an ethernet device
+ *  called with ifc wlock'd
+ *
+ *  argv[2] names the ethernet device; it is used both as a dial
+ *  string prefix ("dev!0x800") and as the directory holding the
+ *  device's stats file ("dev/stats").  argv[0] and argv[1] are not
+ *  examined here.
+ *
+ *  opens the v4, arp and v6 conversations on the device, reads the
+ *  mac address and speed from the stats file, stores the channels
+ *  in a new Etherrock hung off ifc->arg, and starts the three
+ *  reader processes.  on any failure every channel opened so far is
+ *  closed and the error is passed on.
+ */
+static void
+etherbind(Ipifc *ifc, int argc, char **argv)
+{
+	Chan *mchan4, *cchan4, *achan, *mchan6, *cchan6, *schan;
+	char addr[Maxpath];	//char addr[2*KNAMELEN];
+	char dir[Maxpath];	//char dir[2*KNAMELEN];
+	char *buf;
+	int n;
+	char *ptr;
+	Etherrock *er;
+
+	/* argv[2] is dereferenced below, so three fields are required */
+	if(argc < 3)
+		error(Ebadarg);
+
+	mchan4 = cchan4 = achan = mchan6 = cchan6 = nil;
+	buf = nil;
+	if(waserror()){
+		if(mchan4 != nil)
+			cclose(mchan4);
+		if(cchan4 != nil)
+			cclose(cchan4);
+		if(achan != nil)
+			cclose(achan);
+		if(mchan6 != nil)
+			cclose(mchan6);
+		if(cchan6 != nil)
+			cclose(cchan6);
+		if(buf != nil)
+			free(buf);
+		nexterror();
+	}
+
+	/*
+	 *  open ipv4 conversation
+	 *
+	 *  the dial will fail if the type is already open on
+	 *  this device.
+	 */
+	snprint(addr, sizeof(addr), "%s!0x800", argv[2]);
+	mchan4 = chandial(addr, nil, dir, &cchan4);
+
+	/*
+	 *  make it non-blocking
+	 */
+	cchan4->dev->write(cchan4, nbmsg, strlen(nbmsg), 0);
+
+	/*
+	 *  get mac address and speed
+	 */
+	snprint(addr, sizeof(addr), "%s/stats", argv[2]);
+	buf = smalloc(512);
+	schan = namec(addr, Aopen, OREAD, 0);
+	if(waserror()){
+		cclose(schan);
+		nexterror();
+	}
+	/* read at most 511 bytes so the terminating 0 always fits */
+	n = schan->dev->read(schan, buf, 511, 0);
+	cclose(schan);
+	poperror();
+	buf[n] = 0;
+
+	/* the mac address is mandatory */
+	ptr = strstr(buf, "addr: ");
+	if(!ptr)
+		error(Eio);
+	ptr += 6;
+	parsemac(ifc->mac, ptr, 6);
+
+	/* the speed is optional and defaults to 100 */
+	ptr = strstr(buf, "mbps: ");
+	if(ptr){
+		ptr += 6;
+		ifc->mbps = atoi(ptr);
+	} else
+		ifc->mbps = 100;
+
+	/*
+	 *  open arp conversation
+	 */
+	snprint(addr, sizeof(addr), "%s!0x806", argv[2]);
+	achan = chandial(addr, nil, nil, nil);
+
+	/*
+	 *  open ipv6 conversation
+	 *
+	 *  the dial will fail if the type is already open on
+	 *  this device.
+	 */
+	snprint(addr, sizeof(addr), "%s!0x86DD", argv[2]);
+	mchan6 = chandial(addr, nil, dir, &cchan6);
+
+	/*
+	 *  make it non-blocking
+	 */
+	cchan6->dev->write(cchan6, nbmsg, strlen(nbmsg), 0);
+
+	er = smalloc(sizeof(*er));
+	er->mchan4 = mchan4;
+	er->cchan4 = cchan4;
+	er->achan = achan;
+	er->mchan6 = mchan6;
+	er->cchan6 = cchan6;
+	er->f = ifc->conv->p->f;
+	ifc->arg = er;
+
+	free(buf);
+	poperror();
+
+	/* each reader records itself in er so etherunbind can stop it */
+	kproc("etherread4", etherread4, ifc);
+	kproc("recvarpproc", recvarpproc, ifc);
+	kproc("etherread6", etherread6, ifc);
+}
+
+/*
+ *  called with ifc wlock'd
+ *
+ *  undo etherbind: tell the three reader processes to go away,
+ *  wait until they have, close the channels and free the rock.
+ */
+static void
+etherunbind(Ipifc *ifc)
+{
+	Etherrock *er;
+
+	er = ifc->arg;
+
+	if(er->read4p)
+		postnote(er->read4p, 1, "unbind", NUser);
+	if(er->read6p)
+		postnote(er->read6p, 1, "unbind", NUser);
+	if(er->arpp)
+		postnote(er->arpp, 1, "unbind", NUser);
+
+	/* each reader zeroes its own pointer on the way out; poll for that */
+	for(;;){
+		if(er->arpp == 0 && er->read4p == 0 && er->read6p == 0)
+			break;
+		tsleep(&up->sleep, return0, 0, 300);
+	}
+
+	if(er->mchan4 != nil)
+		cclose(er->mchan4);
+	if(er->achan != nil)
+		cclose(er->achan);
+	if(er->cchan4 != nil)
+		cclose(er->cchan4);
+	if(er->mchan6 != nil)
+		cclose(er->mchan6);
+	if(er->cchan6 != nil)
+		cclose(er->cchan6);
+
+	free(er);
+}
+
+/*
+ *  called by ipoput with a single block to write with ifc rlock'd
+ *
+ *  looks up the destination mac for ip, prepends an ethernet
+ *  header and writes the block to the v4 or v6 data channel.
+ *  when the address is not yet known the block is not sent now;
+ *  an arp request (v4) or neighbor solicitation (v6) goes out instead.
+ */
+static void
+etherbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip)
+{
+	Etherhdr *eh;
+	Arpent *a;
+	uchar mac[6];
+	Etherrock *er = ifc->arg;
+
+	/* get mac address of destination */
+	/* NOTE(review): a nil return presumably means mac is resolved — confirm against arpget */
+	a = arpget(er->f->arp, bp, version, ifc, ip, mac);
+	if(a){
+		/* check for broadcast or multicast */
+		bp = multicastarp(er->f, a, ifc->medium, mac);
+		if(bp==nil){
+			/* neither: start address resolution and send nothing now */
+			switch(version){
+			case V4:
+				sendarp(ifc, a);
+				break;
+			case V6:
+				resolveaddr6(ifc, a);
+				break;
+			default:
+				panic("etherbwrite: version %d", version);
+			}
+			return;
+		}
+	}
+
+	/* make it a single block with space for the ether header */
+	bp = padblock(bp, ifc->medium->hsize);
+	if(bp->next)
+		bp = concatblock(bp);
+	/* bring short frames up to the interface's minimum size */
+	if(BLEN(bp) < ifc->mintu)
+		bp = adjustblock(bp, ifc->mintu);
+	eh = (Etherhdr*)bp->rp;
+
+	/* copy in mac addresses and ether type */
+	memmove(eh->s, ifc->mac, sizeof(eh->s));
+	memmove(eh->d, mac, sizeof(eh->d));
+
+ 	switch(version){
+	case V4:
+		eh->t[0] = 0x08;
+		eh->t[1] = 0x00;
+		er->mchan4->dev->bwrite(er->mchan4, bp, 0);
+		break;
+	case V6:
+		eh->t[0] = 0x86;
+		eh->t[1] = 0xDD;
+		er->mchan6->dev->bwrite(er->mchan6, bp, 0);
+		break;
+	default:
+		panic("etherbwrite2: version %d", version);
+	}
+	ifc->out++;
+}
+
+
+/*
+ *  process to read from the ethernet
+ *
+ *  kernel process started by etherbind.  loops reading blocks from
+ *  the v4 data channel, steps over the medium header and hands each
+ *  block to ipiput4.  leaves through the outer waserror when an
+ *  error (such as the note posted by etherunbind) is raised.
+ */
+static void
+etherread4(void *a)
+{
+	Ipifc *ifc;
+	Block *bp;
+	Etherrock *er;
+
+	ifc = a;
+	er = ifc->arg;
+	er->read4p = up;	/* hide identity under a rock for unbind */
+	if(waserror()){
+		/* tell etherunbind this reader is gone */
+		er->read4p = 0;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		bp = er->mchan4->dev->bread(er->mchan4, ifc->maxtu, 0);
+		/* drop the packet rather than wait for the interface lock */
+		if(!canrlock(ifc)){
+			freeb(bp);
+			continue;
+		}
+		/* make sure the read lock is released if input processing errors */
+		if(waserror()){
+			runlock(ifc);
+			nexterror();
+		}
+		ifc->in++;
+		bp->rp += ifc->medium->hsize;
+		/* an interface with no addresses yet discards its input */
+		if(ifc->lifc == nil)
+			freeb(bp);
+		else
+			ipiput4(er->f, ifc, bp);
+		runlock(ifc);
+		poperror();
+	}
+}
+
+
+/*
+ *  process to read from the ethernet, IPv6
+ *
+ *  kernel process started by etherbind.  loops reading blocks from
+ *  the v6 data channel, steps over the medium header and hands each
+ *  block to ipiput6.  leaves through the outer waserror when an
+ *  error (such as the note posted by etherunbind) is raised.
+ */
+static void
+etherread6(void *a)
+{
+	Ipifc *ifc;
+	Block *bp;
+	Etherrock *er;
+
+	ifc = a;
+	er = ifc->arg;
+	er->read6p = up;	/* hide identity under a rock for unbind */
+	if(waserror()){
+		/* tell etherunbind this reader is gone */
+		er->read6p = 0;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		bp = er->mchan6->dev->bread(er->mchan6, ifc->maxtu, 0);
+		/* drop the packet rather than wait for the interface lock */
+		if(!canrlock(ifc)){
+			freeb(bp);
+			continue;
+		}
+		/* make sure the read lock is released if input processing errors */
+		if(waserror()){
+			runlock(ifc);
+			nexterror();
+		}
+		ifc->in++;
+		bp->rp += ifc->medium->hsize;
+		/* an interface with no addresses yet discards its input */
+		if(ifc->lifc == nil)
+			freeb(bp);
+		else
+			ipiput6(er->f, ifc, bp);
+		runlock(ifc);
+		poperror();
+	}
+}
+
+/*
+ *  join the ethernet multicast group that corresponds to IP
+ *  multicast address a, by writing "addmulti <mac>" to the control
+ *  channel of the matching IP version.  the third argument (the
+ *  interface address) is unused.
+ *  panics if a is not a multicast address.
+ */
+static void
+etheraddmulti(Ipifc *ifc, uchar *a, uchar *)
+{
+	uchar mac[6];
+	char buf[64];
+	Etherrock *er = ifc->arg;
+	int version;
+
+	version = multicastea(mac, a);
+	/* bounded formatting, as etherbind does for its own strings */
+	snprint(buf, sizeof(buf), "addmulti %E", mac);
+	switch(version){
+	case V4:
+		er->cchan4->dev->write(er->cchan4, buf, strlen(buf), 0);
+		break;
+	case V6:
+		er->cchan6->dev->write(er->cchan6, buf, strlen(buf), 0);
+		break;
+	default:
+		panic("etheraddmulti: version %d", version);
+	}
+}
+
+/*
+ *  leave the ethernet multicast group that corresponds to IP
+ *  multicast address a, by writing "remmulti <mac>" to the control
+ *  channel of the matching IP version.  the third argument (the
+ *  interface address) is unused.
+ *  panics if a is not a multicast address.
+ */
+static void
+etherremmulti(Ipifc *ifc, uchar *a, uchar *)
+{
+	uchar mac[6];
+	char buf[64];
+	Etherrock *er = ifc->arg;
+	int version;
+
+	version = multicastea(mac, a);
+	/* bounded formatting, as etherbind does for its own strings */
+	snprint(buf, sizeof(buf), "remmulti %E", mac);
+	switch(version){
+	case V4:
+		er->cchan4->dev->write(er->cchan4, buf, strlen(buf), 0);
+		break;
+	case V6:
+		er->cchan6->dev->write(er->cchan6, buf, strlen(buf), 0);
+		break;
+	default:
+		panic("etherremmulti: version %d", version);
+	}
+}
+
+/*
+ *  send an ethernet arp
+ *  (only v4, v6 uses the neighbor discovery, rfc1970)
+ *
+ *  a is the unresolved arp entry handed back by arpget; it is
+ *  given up with arprelease on every path.  at most one request per
+ *  second is sent for an entry, and all but the most recently held
+ *  packet are discarded.
+ */
+static void
+sendarp(Ipifc *ifc, Arpent *a)
+{
+	int n;
+	Block *bp;
+	Etherarp *e;
+	Etherrock *er = ifc->arg;
+	uchar targ[IPv4addrlen];
+
+	/* don't do anything if it's been less than a second since the last */
+	if(NOW - a->ctime < 1000){
+		arprelease(er->f->arp, a);
+		return;
+	}
+
+	/* remove all but the last message */
+	while((bp = a->hold) != nil){
+		if(bp == a->last)
+			break;
+		a->hold = bp->list;
+		freeblist(bp);
+	}
+
+	/* try to keep it around for a second more */
+	a->ctime = NOW;
+
+	/*
+	 *  take everything still needed from the entry before giving
+	 *  it up: nothing here keeps a valid once arprelease has run.
+	 *  NOTE(review): presumably arprelease drops the arp table
+	 *  lock taken by arpget — confirm in arp.c.
+	 */
+	memmove(targ, a->ip+IPv4off, sizeof(targ));
+	n = sizeof(Etherarp);
+	if(n < a->type->mintu)
+		n = a->type->mintu;
+	arprelease(er->f->arp, a);
+
+	/* zero the whole block so padding up to mintu carries no stale data */
+	bp = allocb(n);
+	memset(bp->rp, 0, n);
+	e = (Etherarp*)bp->rp;
+	memmove(e->tpa, targ, sizeof(e->tpa));
+	ipv4local(ifc, e->spa);
+	memmove(e->sha, ifc->mac, sizeof(e->sha));
+	memset(e->d, 0xff, sizeof(e->d));		/* ethernet broadcast */
+	memmove(e->s, ifc->mac, sizeof(e->s));
+
+	hnputs(e->type, ETARP);
+	hnputs(e->hrd, 1);
+	hnputs(e->pro, ETIP4);
+	e->hln = sizeof(e->sha);
+	e->pln = sizeof(e->spa);
+	hnputs(e->op, ARPREQUEST);
+	bp->wp += n;
+
+	er->achan->dev->bwrite(er->achan, bp, 0);
+}
+
+/*
+ *  start or continue IPv6 address resolution for arp entry a by
+ *  sending a neighbor solicitation with icmpns.  a is given up with
+ *  arprelease on every path.  solicitations are spaced at least
+ *  ReTransTimer apart and stop once a->rxtsrem reaches zero.
+ */
+static void
+resolveaddr6(Ipifc *ifc, Arpent *a)
+{
+	int sflag;
+	Block *bp;
+	Etherrock *er = ifc->arg;
+	uchar ipsrc[IPaddrlen], targ[IPaddrlen];
+
+	/* don't do anything if it's been less than ReTransTimer since the last */
+	if(NOW - a->ctime < ReTransTimer){
+		arprelease(er->f->arp, a);
+		return;
+	}
+
+	/* remove all but the last message */
+	while((bp = a->hold) != nil){
+		if(bp == a->last)
+			break;
+		a->hold = bp->list;
+		freeblist(bp);
+	}
+
+	/* try to keep it around for a while more */
+	a->ctime = NOW;
+	a->rtime = NOW + ReTransTimer;
+	if(a->rxtsrem <= 0) {
+		arprelease(er->f->arp, a);
+		return;
+	}
+
+	a->rxtsrem--;
+
+	/*
+	 *  copy the target out before giving the entry up: nothing
+	 *  here keeps a valid once arprelease has run.
+	 *  NOTE(review): presumably arprelease drops the arp table
+	 *  lock taken by arpget — confirm in arp.c.
+	 */
+	memmove(targ, a->ip, IPaddrlen);
+	arprelease(er->f->arp, a);
+
+	/* solicit only if the interface has a usable v6 source address */
+	sflag = ipv6anylocal(ifc, ipsrc);
+	if(sflag)
+		icmpns(er->f, ipsrc, sflag, targ, TARG_MULTI, ifc->mac);
+}
+
+/*
+ *  send a gratuitous arp to refresh arp caches: a broadcast
+ *  request in which sender and target protocol address are both ip
+ */
+static void
+sendgarp(Ipifc *ifc, uchar *ip)
+{
+	int len;
+	Block *b;
+	Etherarp *arp;
+	Etherrock *er;
+
+	/* don't arp for our initial non address */
+	if(ipcmp(ip, IPnoaddr) == 0)
+		return;
+
+	er = ifc->arg;
+
+	/* at least the medium's minimum frame size, zero filled */
+	len = sizeof(Etherarp);
+	if(len < ifc->medium->mintu)
+		len = ifc->medium->mintu;
+	b = allocb(len);
+	memset(b->rp, 0, len);
+	arp = (Etherarp*)b->rp;
+
+	/* ethernet header */
+	memset(arp->d, 0xff, sizeof(arp->d));		/* ethernet broadcast */
+	memmove(arp->s, ifc->mac, sizeof(arp->s));
+	hnputs(arp->type, ETARP);
+
+	/* arp body */
+	hnputs(arp->hrd, 1);
+	hnputs(arp->pro, ETIP4);
+	arp->hln = sizeof(arp->sha);
+	arp->pln = sizeof(arp->spa);
+	hnputs(arp->op, ARPREQUEST);
+	memmove(arp->sha, ifc->mac, sizeof(arp->sha));
+	memmove(arp->spa, ip+IPv4off, sizeof(arp->spa));
+	memmove(arp->tpa, ip+IPv4off, sizeof(arp->tpa));
+	b->wp += len;
+
+	er->achan->dev->bwrite(er->achan, b, 0);
+}
+
+/*
+ *  read one packet from the arp channel and act on it.
+ *  replies are entered in the arp cache.  requests refresh the
+ *  cache entry for the sender and are answered when they ask for
+ *  one of our addresses or one we proxy for.  address conflicts
+ *  and attempts to register a broadcast address are reported with
+ *  print and otherwise ignored.
+ */
+static void
+recvarp(Ipifc *ifc)
+{
+	int n;
+	Block *ebp, *rbp;
+	Etherarp *e, *r;
+	uchar ip[IPaddrlen];
+	static uchar eprinted[4];	/* last conflicting address reported */
+	Etherrock *er = ifc->arg;
+
+	ebp = er->achan->dev->bread(er->achan, ifc->maxtu, 0);
+	if(ebp == nil)
+		return;
+
+	e = (Etherarp*)ebp->rp;
+	switch(nhgets(e->op)) {
+	default:
+		break;
+
+	case ARPREPLY:
+		/* check for machine using my ip address */
+		v4tov6(ip, e->spa);
+		if(iplocalonifc(ifc, ip) || ipproxyifc(er->f, ifc, ip)){
+			if(memcmp(e->sha, ifc->mac, sizeof(e->sha)) != 0){
+				print("arprep: 0x%E/0x%E also has ip addr %V\n",
+					e->s, e->sha, e->spa);
+				break;
+			}
+		}
+
+		/* make sure we're not entering broadcast addresses */
+		if(ipcmp(ip, ipbroadcast) == 0 ||
+			!memcmp(e->sha, etherbroadcast, sizeof(e->sha))){
+			/* e->spa is a 4-byte v4 address, so it is printed with %V */
+			print("arprep: 0x%E/0x%E cannot register broadcast address %V\n",
+				e->s, e->sha, e->spa);
+			break;
+		}
+
+		arpenter(er->f, V4, e->spa, e->sha, sizeof(e->sha), 0);
+		break;
+
+	case ARPREQUEST:
+		/* don't answer arps till we know who we are */
+		if(ifc->lifc == 0)
+			break;
+
+		/* check for machine using my ip or ether address */
+		v4tov6(ip, e->spa);
+		if(iplocalonifc(ifc, ip) || ipproxyifc(er->f, ifc, ip)){
+			if(memcmp(e->sha, ifc->mac, sizeof(e->sha)) != 0){
+				if (memcmp(eprinted, e->spa, sizeof(e->spa))){
+					/* print only once */
+					print("arpreq: 0x%E also has ip addr %V\n", e->sha, e->spa);
+					memmove(eprinted, e->spa, sizeof(e->spa));
+				}
+			}
+		} else {
+			if(memcmp(e->sha, ifc->mac, sizeof(e->sha)) == 0){
+				print("arpreq: %V also has ether addr %E\n", e->spa, e->sha);
+				break;
+			}
+		}
+
+		/* refresh what we know about sender */
+		arpenter(er->f, V4, e->spa, e->sha, sizeof(e->sha), 1);
+
+		/* answer only requests for our address or systems we're proxying for */
+		v4tov6(ip, e->tpa);
+		if(!iplocalonifc(ifc, ip))
+		if(!ipproxyifc(er->f, ifc, ip))
+			break;
+
+		n = sizeof(Etherarp);
+		if(n < ifc->mintu)
+			n = ifc->mintu;
+		rbp = allocb(n);
+		r = (Etherarp*)rbp->rp;
+		/*
+		 *  clear all n bytes, not just the arp header: the block
+		 *  is padded up to mintu and the padding goes on the wire
+		 *  (sendarp and sendgarp clear the whole block too)
+		 */
+		memset(rbp->rp, 0, n);
+		hnputs(r->type, ETARP);
+		hnputs(r->hrd, 1);
+		hnputs(r->pro, ETIP4);
+		r->hln = sizeof(r->sha);
+		r->pln = sizeof(r->spa);
+		hnputs(r->op, ARPREPLY);
+		memmove(r->tha, e->sha, sizeof(r->tha));
+		memmove(r->tpa, e->spa, sizeof(r->tpa));
+		memmove(r->sha, ifc->mac, sizeof(r->sha));
+		memmove(r->spa, e->tpa, sizeof(r->spa));
+		memmove(r->d, e->sha, sizeof(r->d));
+		memmove(r->s, ifc->mac, sizeof(r->s));
+		rbp->wp += n;
+
+		er->achan->dev->bwrite(er->achan, rbp, 0);
+	}
+	freeb(ebp);
+}
+
+/*
+ *  kernel process started by etherbind: calls recvarp forever.
+ *  records itself in er->arpp so etherunbind can post it a note,
+ *  and clears that pointer when an error ends the loop.
+ */
+static void
+recvarpproc(void *v)
+{
+	Ipifc *ifc;
+	Etherrock *er;
+
+	ifc = v;
+	er = ifc->arg;
+	er->arpp = up;
+	if(waserror()){
+		er->arpp = 0;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		recvarp(ifc);
+	}
+}
+
+/*
+ *  fill ea with the ethernet multicast address for IP multicast
+ *  address ip.  returns what ipismulticast returned: V4 or V6 when
+ *  ea was filled in, anything else when ip is not multicast and ea
+ *  is untouched.
+ */
+static int
+multicastea(uchar *ea, uchar *ip)
+{
+	int version;
+
+	version = ipismulticast(ip);
+	if(version == V4){
+		/* 01:00:5e followed by the low 23 bits of the address */
+		ea[0] = 0x01;
+		ea[1] = 0x00;
+		ea[2] = 0x5e;
+		ea[3] = ip[13] & 0x7f;
+		ea[4] = ip[14];
+		ea[5] = ip[15];
+	} else if(version == V6){
+		/* 33:33 followed by the last four bytes of the address */
+		ea[0] = 0x33;
+		ea[1] = 0x33;
+		ea[2] = ip[12];
+		ea[3] = ip[13];
+		ea[4] = ip[14];
+		ea[5] = ip[15];
+	}
+	return version;
+}
+
+/*
+ *  fill in an arp entry for broadcast or multicast
+ *  addresses.  Return the first queued packet for the
+ *  IP address, or nil when the address is neither and
+ *  ordinary arp has to resolve it.
+ */
+static Block*
+multicastarp(Fs *f, Arpent *a, Medium *medium, uchar *mac)
+{
+	int kind;
+
+	/* is it unicast or broadcast? */
+	kind = ipforme(f, a->ip);
+	if(kind == Runi)
+		return nil;
+	if(kind == Rbcast){
+		memset(mac, 0xff, 6);
+		return arpresolve(f->arp, a, medium, mac);
+	}
+
+	/* if multicast, fill in mac */
+	kind = multicastea(mac, a->ip);
+	if(kind == V4 || kind == V6)
+		return arpresolve(f->arp, a, medium, mac);
+
+	/* let arp take care of it */
+	return nil;
+}
+
+/*
+ *  register the ether, fbe and gbe media, in that order
+ */
+void
+ethermediumlink(void)
+{
+	addipmedium(&ethermedium);	/* maxtu 1514 */
+	addipmedium(&fbemedium);	/* maxtu 4000 */
+	addipmedium(&gbemedium);	/* maxtu 9014 */
+}
+
+
+/*
+ *  build the low 8 bytes of pref from the 6-byte ethernet address
+ *  ea: the first three bytes of ea (with bit 0x2 set in the first),
+ *  then FF FE, then the last three bytes of ea.
+ *  pref[0..7] are left alone.
+ */
+static void
+etherpref2addr(uchar *pref, uchar *ea)
+{
+	/* first half of the mac, 0x2 bit forced on */
+	pref[8] = ea[0] | 0x2;
+	pref[9] = ea[1];
+	pref[10] = ea[2];
+
+	/* fixed filler between the two halves */
+	pref[11] = 0xFF;
+	pref[12] = 0xFE;
+
+	/* second half of the mac */
+	pref[13] = ea[3];
+	pref[14] = ea[4];
+	pref[15] = ea[5];
+}

+ 290 - 0
sys/src/9/ip/gre.c

@@ -0,0 +1,290 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Generic Routing Encapsulation over IPv4, rfc1702
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+/*
+ *  sizes and constants for GRE.
+ *  NOTE(review): the IP part of GREhdr is 20 bytes, so GRE_IPONLY
+ *  (12) looks like the bytes of GREhdr before src, and
+ *  GRE_IPPLUSGRE (12) like src, dst, flags and eproto together.
+ *  grekick pads GRE_IPONLY bytes in front of what the user wrote and
+ *  greiput trims the same amount off — confirm the intended meaning.
+ *  GRErxms, GREtickms and GREmaxxmit are not referenced in this file.
+ */
+enum
+{
+	GRE_IPONLY	= 12,		/* size of ip header */
+	GRE_IPPLUSGRE	= 12,		/* minimum size of GRE header */
+	IP_GREPROTO	= 47,		/* IP protocol number of GRE */
+
+	GRErxms		= 200,
+	GREtickms	= 100,
+	GREmaxxmit	= 10,
+};
+
+/*
+ *  an IPv4 header immediately followed by the fixed
+ *  part of a GRE header, as laid out in a packet
+ */
+typedef struct GREhdr
+{
+	/* ip header */
+	uchar	vihl;		/* Version and header length */
+	uchar	tos;		/* Type of service */
+	uchar	len[2];		/* packet length (including headers) */
+	uchar	id[2];		/* Identification */
+	uchar	frag[2];	/* Fragment information */
+	uchar	Unused;		/* byte between frag and proto; never set in this file */
+	uchar	proto;		/* Protocol */
+	uchar	cksum[2];	/* checksum */
+	uchar	src[4];		/* Ip source */
+	uchar	dst[4];		/* Ip destination */
+
+	/* gre header */
+	uchar	flags[2];
+	uchar	eproto[2];	/* encapsulation protocol */
+} GREhdr;
+
+/*
+ *  per-protocol private state, allocated by greinit and
+ *  reached through Proto.priv
+ */
+typedef struct GREpriv GREpriv;
+struct GREpriv
+{
+	int		raw;			/* Raw GRE mode */
+
+	/* non-MIB stats */
+	ulong		csumerr;		/* checksum errors (not updated anywhere in this file) */
+	ulong		lenerr;			/* short packet */
+};
+
+static void grekick(void *x, Block *bp);
+
+/*
+ *  connect routine of the gre protocol.  sets the addresses with
+ *  Fsstdconnect, then refuses the connection if another
+ *  conversation already has the same remote address and port.
+ *  returns nil on success or an error string.
+ */
+static char*
+greconnect(Conv *c, char **argv, int argc)
+{
+	Proto *p;
+	char *err;
+	Conv *other;
+	int i;
+
+	err = Fsstdconnect(c, argv, argc);
+	if(err != nil)
+		return err;
+
+	/* make sure noone's already connected to this other sys */
+	p = c->p;
+	qlock(p);
+	for(i = 0; i < p->nc; i++){
+		other = p->conv[i];
+		if(other == nil)
+			break;
+		if(other == c)
+			continue;
+		if(other->rport != c->rport || ipcmp(other->raddr, c->raddr) != 0)
+			continue;
+
+		/* clash: forget the addresses just set and report it */
+		err = "already connected to that addr/proto";
+		ipmove(c->laddr, IPnoaddr);
+		ipmove(c->raddr, IPnoaddr);
+		break;
+	}
+	qunlock(p);
+
+	if(err != nil)
+		return err;
+	Fsconnected(c, nil);
+
+	return nil;
+}
+
+/*
+ *  create routine of the gre protocol: a 64K message-mode read
+ *  queue, and a bypass write queue that hands every block
+ *  written straight to grekick
+ */
+static void
+grecreate(Conv *c)
+{
+	c->wq = qbypass(grekick, c);
+	c->rq = qopen(64*1024, Qmsg, 0, c);
+}
+
+/*
+ *  state routine of the gre protocol: every conversation
+ *  reports "Datagram".  returns what snprint returns.
+ */
+static int
+grestate(Conv *c, char *state, int n)
+{
+	int len;
+
+	USED(c);
+	len = snprint(state, n, "%s\n", "Datagram");
+	return len;
+}
+
+/*
+ *  announce routine of the gre protocol: always refuses.
+ *  the message names gre so the user can tell which
+ *  protocol rejected the request.
+ */
+static char*
+greannounce(Conv*, char**, int)
+{
+	return "gre does not support announce";
+}
+
+/*
+ *  close routine of the gre protocol: close the three
+ *  queues and forget both ends of the conversation
+ */
+static void
+greclose(Conv *c)
+{
+	qclose(c->rq);
+	qclose(c->wq);
+	qclose(c->eq);
+
+	c->lport = 0;
+	ipmove(c->laddr, IPnoaddr);
+	c->rport = 0;
+	ipmove(c->raddr, IPnoaddr);
+}
+
+/*
+ *  output routine, called for every block written to a gre
+ *  conversation (see grecreate).  x is the Conv.  the block is
+ *  expected to start at the src field of a GREhdr; room for the
+ *  rest of the ip header is added in front.  in cooked mode, zero
+ *  source or destination addresses are filled in from the
+ *  conversation and eproto is set from c->rport.  the packet is
+ *  then sent with ipoput4.
+ */
+static void
+grekick(void *x, Block *bp)
+{
+	Conv *c = x;
+	GREhdr *ghp;
+	uchar laddr[IPaddrlen], raddr[IPaddrlen];
+
+	if(bp == nil)
+		return;
+
+	/* Make space to fit ip header (gre header already there) */
+	bp = padblock(bp, GRE_IPONLY);
+	if(bp == nil)
+		return;
+
+	/* make sure the message has a GRE header */
+	bp = pullupblock(bp, GRE_IPONLY+GRE_IPPLUSGRE);
+	if(bp == nil)
+		return;
+
+	ghp = (GREhdr *)(bp->rp);
+	ghp->vihl = IP_VER4;
+
+	if(!((GREpriv*)c->p->priv)->raw){
+		v4tov6(raddr, ghp->dst);
+		if(ipcmp(raddr, v4prefix) == 0){
+			memmove(ghp->dst, c->raddr + IPv4off, IPv4addrlen);
+			/*
+			 *  keep raddr in step with the destination actually
+			 *  used, so that findlocalip below is asked about the
+			 *  real destination rather than 0.0.0.0
+			 */
+			v4tov6(raddr, ghp->dst);
+		}
+		v4tov6(laddr, ghp->src);
+		if(ipcmp(laddr, v4prefix) == 0){
+			if(ipcmp(c->laddr, IPnoaddr) == 0)
+				findlocalip(c->p->f, c->laddr, raddr); /* pick interface closest to dest */
+			memmove(ghp->src, c->laddr + IPv4off, IPv4addrlen);
+		}
+		hnputs(ghp->eproto, c->rport);
+	}
+
+	ghp->proto = IP_GREPROTO;
+	ghp->frag[0] = 0;
+	ghp->frag[1] = 0;
+
+	ipoput4(c->p->f, bp, 0, c->ttl, c->tos, nil);
+}
+
+/*
+ *  input routine of the gre protocol (installed as gre->rcv).
+ *  finds the in-use conversation whose rport equals the packet's
+ *  encapsulated protocol and, unless in raw mode, whose remote
+ *  address equals the packet's source; trims the first GRE_IPONLY
+ *  bytes off the packet and passes the rest to that conversation's
+ *  read queue.  packets that match nothing, are too short, or
+ *  arrive while the queue holds more than 64K are dropped.
+ */
+static void
+greiput(Proto *gre, Ipifc*, Block *bp)
+{
+	int len;
+	GREhdr *ghp;
+	Conv *c, **p;
+	ushort eproto;
+	uchar raddr[IPaddrlen];
+	GREpriv *gpriv;
+
+	gpriv = gre->priv;
+	ghp = (GREhdr*)(bp->rp);
+
+	v4tov6(raddr, ghp->src);
+	eproto = nhgets(ghp->eproto);
+	qlock(gre);
+
+	/* Look for a conversation structure for this port and address */
+	/* the scan runs until a nil entry in the table */
+	c = nil;
+	for(p = gre->conv; *p; p++) {
+		c = *p;
+		if(c->inuse == 0)
+			continue;
+		if(c->rport == eproto &&
+			(gpriv->raw || ipcmp(c->raddr, raddr) == 0))
+			break;
+	}
+
+	/* ran off the end: nobody wants this packet */
+	if(*p == nil) {
+		qunlock(gre);
+		freeblist(bp);
+		return;
+	}
+
+	/* NOTE(review): c is used below after the protocol lock is dropped */
+	qunlock(gre);
+
+	/*
+	 * Trim the packet down to data size
+	 */
+	len = nhgets(ghp->len) - GRE_IPONLY;
+	/* NOTE(review): this short-packet drop does not count in lenerr */
+	if(len < GRE_IPPLUSGRE){
+		freeblist(bp);
+		return;
+	}
+	bp = trimblock(bp, GRE_IPONLY, len);
+	if(bp == nil){
+		gpriv->lenerr++;
+		return;
+	}
+
+	/*
+	 *  Can't delimit packet so pull it all into one block.
+	 */
+	if(qlen(c->rq) > 64*1024)
+		freeblist(bp);
+	else{
+		bp = concatblock(bp);
+		if(bp == 0)
+			panic("greiput");
+		qpass(c->rq, bp);
+	}
+}
+
+/*
+ *  stats routine of the gre protocol: formats the length-error
+ *  counter into buf.  returns what snprint returns.
+ */
+int
+grestats(Proto *gre, char *buf, int len)
+{
+	GREpriv *priv;
+	int n;
+
+	priv = gre->priv;
+	n = snprint(buf, len, "gre: len %lud\n", priv->lenerr);
+	return n;
+}
+
+/*
+ *  ctl routine of the gre protocol.  understands the one-word
+ *  requests "raw" and "cooked", which set and clear raw mode for
+ *  the whole protocol.  returns nil on success or an error string.
+ */
+char*
+grectl(Conv *c, char **f, int n)
+{
+	GREpriv *gpriv;
+
+	gpriv = c->p->priv;
+	if(n != 1)
+		return "unknown control request";
+
+	if(strcmp(f[0], "raw") == 0)
+		gpriv->raw = 1;
+	else if(strcmp(f[0], "cooked") == 0)
+		gpriv->raw = 0;
+	else
+		return "unknown control request";
+	return nil;
+}
+
+/*
+ *  allocate the gre protocol, fill in its
+ *  routines and register it with fs
+ */
+void
+greinit(Fs *fs)
+{
+	Proto *gre;
+
+	gre = smalloc(sizeof(Proto));
+	gre->priv = smalloc(sizeof(GREpriv));
+
+	/* identity and sizing */
+	gre->name = "gre";
+	gre->ipproto = IP_GREPROTO;
+	gre->nc = 64;
+	gre->ptclsize = 0;
+
+	/* conversation life cycle */
+	gre->create = grecreate;
+	gre->connect = greconnect;
+	gre->announce = greannounce;
+	gre->close = greclose;
+
+	/* input, control and reporting */
+	gre->rcv = greiput;
+	gre->ctl = grectl;
+	gre->advise = nil;
+	gre->state = grestate;
+	gre->stats = grestats;
+
+	Fsproto(fs, gre);
+}

+ 501 - 0
sys/src/9/ip/icmp.c

@@ -0,0 +1,501 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+/*
+ *  Layout of an ICMP packet as handled in this file: the 20 bytes of
+ *  IPv4 header (vihl through dst) immediately followed by the 8 byte
+ *  ICMP header (type through seq) and the start of the ICMP data.
+ *  Block read pointers are cast directly to this type; multi-byte
+ *  fields are byte arrays read and written with nhgets/hnputs.
+ */
+typedef struct Icmp {
+	uchar	vihl;		/* Version and header length */
+	uchar	tos;		/* Type of service */
+	uchar	length[2];	/* packet length */
+	uchar	id[2];		/* Identification */
+	uchar	frag[2];	/* Fragment information */
+	uchar	ttl;		/* Time to live */
+	uchar	proto;		/* Protocol */
+	uchar	ipcksum[2];	/* Header checksum */
+	uchar	src[4];		/* Ip source */
+	uchar	dst[4];		/* Ip destination */
+	uchar	type;		/* ICMP message type, see Packet Types below */
+	uchar	code;		/* ICMP code, meaning depends on type */
+	uchar	cksum[2];	/* ICMP checksum, computed with ptclcsum */
+	uchar	icmpid[2];	/* id; matched against a conversation's lport */
+	uchar	seq[2];		/* sequence number (icmpcantfrag stores the mtu here) */
+	uchar	data[1];	/* first byte of the ICMP data */
+} Icmp;
+
+/*
+ *  Values of Icmp.type.  Maxtype is the largest one known here; it
+ *  sizes icmpnames and the per-type in/out counters in Icmppriv.
+ */
+enum {			/* Packet Types */
+	EchoReply	= 0,
+	Unreachable	= 3,
+	SrcQuench	= 4,
+	Redirect	= 5,
+	EchoRequest	= 8,
+	TimeExceed	= 11,
+	InParmProblem	= 12,
+	Timestamp	= 13,
+	TimestampReply	= 14,
+	InfoRequest	= 15,
+	InfoReply	= 16,
+	AddrMaskRequest	= 17,
+	AddrMaskReply	= 18,
+
+	Maxtype		= 18,
+};
+
+/*
+ *  icmpiput compares the bytes left after the outer IP and ICMP
+ *  headers against MinAdvise before handing the quoted packet to
+ *  another protocol's advise routine.
+ */
+enum
+{
+	MinAdvise	= 24,	/* minimum needed for us to advise another protocol */
+};
+
+/*
+ *  Printable names indexed by ICMP type.  Types without an entry are
+ *  left nil; icmpstats prints those by number instead.
+ */
+char *icmpnames[Maxtype+1] =
+{
+[EchoReply]		"EchoReply",
+[Unreachable]		"Unreachable",
+[SrcQuench]		"SrcQuench",
+[Redirect]		"Redirect",
+[EchoRequest]		"EchoRequest",
+[TimeExceed]		"TimeExceed",
+[InParmProblem]		"InParmProblem",
+[Timestamp]		"Timestamp",
+[TimestampReply]	"TimestampReply",
+[InfoRequest]		"InfoRequest",
+[InfoReply]		"InfoReply",
+[AddrMaskRequest]	"AddrMaskRequest",
+[AddrMaskReply]		"AddrMaskReply",
+};
+
+/*
+ *  Protocol number and the header sizes assumed throughout this file.
+ */
+enum {
+	IP_ICMPPROTO	= 1,	/* value stored in the IP proto field */
+	ICMP_IPSIZE	= 20,	/* bytes of Icmp before type (the IP header) */
+	ICMP_HDRSIZE	= 8,	/* bytes of Icmp from type through seq */
+};
+
+/*
+ *  Indices into Icmppriv.stats; Nstats is the number of counters.
+ */
+enum
+{
+	InMsgs,
+	InErrors,
+	OutMsgs,
+	CsumErrs,
+	LenErrs,
+	HlenErrs,
+
+	Nstats,
+};
+
+/*
+ *  Labels for the counters above, printed by icmpstats.
+ */
+static char *statnames[Nstats] =
+{
+[InMsgs]	"InMsgs",
+[InErrors]	"InErrors",
+[OutMsgs]	"OutMsgs",
+[CsumErrs]	"CsumErrs",
+[LenErrs]	"LenErrs",
+[HlenErrs]	"HlenErrs",
+};
+
+/*
+ *  Per-protocol private state, allocated by icmpinit and reached
+ *  through Proto.priv: general counters plus per-type message counts.
+ */
+typedef struct Icmppriv Icmppriv;
+struct Icmppriv
+{
+	ulong	stats[Nstats];	/* indexed by InMsgs ... HlenErrs */
+
+	/* message counts */
+	ulong	in[Maxtype+1];	/* received, indexed by ICMP type */
+	ulong	out[Maxtype+1];	/* sent, indexed by ICMP type */
+};
+
+static void icmpkick(void *x, Block*);
+
+/*
+ *  Set up the queues of a new ICMP conversation: a 64K message-mode
+ *  read queue, and a write queue that bypasses queueing and passes
+ *  each written block straight to icmpkick.
+ */
+static void
+icmpcreate(Conv *c)
+{
+	c->rq = qopen(64*1024, Qmsg, 0, c);
+	c->wq = qbypass(icmpkick, c);
+}
+
+/*
+ *  Connect an ICMP conversation using the standard address parsing.
+ *  On success the conversation is marked connected and nil is
+ *  returned; otherwise the error string from Fsstdconnect is returned.
+ */
+extern char*
+icmpconnect(Conv *c, char **argv, int argc)
+{
+	char *err = Fsstdconnect(c, argv, argc);
+
+	if(err == nil)
+		Fsconnected(c, err);
+	return err;
+}
+
+/*
+ *  Write the one-line status of conversation c into state (at most n
+ *  bytes): the fixed word "Datagram" and the lengths of the read and
+ *  write queues, counting a missing queue as 0.
+ */
+extern int
+icmpstate(Conv *c, char *state, int n)
+{
+	int inq, outq;
+
+	USED(c);
+
+	inq = 0;
+	if(c->rq)
+		inq = qlen(c->rq);
+	outq = 0;
+	if(c->wq)
+		outq = qlen(c->wq);
+
+	return snprint(state, n, "%s qin %d qout %d\n", "Datagram", inq, outq);
+}
+
+/*
+ *  Announce an ICMP conversation using the standard address parsing.
+ *  On success the conversation is marked connected and nil is
+ *  returned; otherwise the error string from Fsstdannounce is returned.
+ */
+extern char*
+icmpannounce(Conv *c, char **argv, int argc)
+{
+	char *err = Fsstdannounce(c, argv, argc);
+
+	if(err != nil)
+		return err;
+
+	Fsconnected(c, nil);
+	return nil;
+}
+
+/*
+ *  Close an ICMP conversation: close both queues and reset the
+ *  addresses and local port so the slot no longer matches incoming
+ *  packets (goticmpkt and icmpadvise match on lport and raddr).
+ */
+extern void
+icmpclose(Conv *c)
+{
+	qclose(c->rq);
+	qclose(c->wq);
+	ipmove(c->laddr, IPnoaddr);
+	ipmove(c->raddr, IPnoaddr);
+	c->lport = 0;
+}
+
+/*
+ *  Output routine installed on the write queue by icmpcreate.
+ *  x is the conversation; bp is a block written by the user that must
+ *  already be laid out as an Icmp (IP header space followed by the
+ *  ICMP header and data).  Fills in the addresses, protocol, id and
+ *  ICMP checksum, counts the message and hands it to ipoput4.
+ *  Blocks shorter than the two headers are dropped silently.
+ */
+static void
+icmpkick(void *x, Block *bp)
+{
+	Conv *c = x;
+	Icmp *p;
+	Icmppriv *ipriv;
+
+	if(bp == nil)
+		return;
+
+	if(blocklen(bp) < ICMP_IPSIZE + ICMP_HDRSIZE){
+		freeblist(bp);
+		return;
+	}
+	p = (Icmp *)(bp->rp);
+	p->vihl = IP_VER4;
+	ipriv = c->p->priv;
+	/* type comes from the user; only count the ones the table covers */
+	if(p->type <= Maxtype)
+		ipriv->out[p->type]++;
+
+	v6tov4(p->dst, c->raddr);
+	v6tov4(p->src, c->laddr);
+	p->proto = IP_ICMPPROTO;
+	/* the local port doubles as the ICMP id; goticmpkt matches replies on it */
+	hnputs(p->icmpid, c->lport);
+	/* zero the checksum field before summing the ICMP part of the packet */
+	memset(p->cksum, 0, sizeof(p->cksum));
+	hnputs(p->cksum, ptclcsum(bp, ICMP_IPSIZE, blocklen(bp) - ICMP_IPSIZE));
+	ipriv->stats[OutMsgs]++;
+	ipoput4(c->p->f, bp, 0, c->ttl, c->tos, nil);
+}
+
+/*
+ *  Send an ICMP TimeExceed (code 0) to the source of the packet in bp.
+ *  ia is the address to use as the source of the new message, in v6
+ *  form.  The first ICMP_IPSIZE + 8 bytes of bp are quoted in the
+ *  message; bp itself is only read, never freed or modified here.
+ *  NOTE(review): nothing here checks that bp holds that many bytes.
+ */
+extern void
+icmpttlexceeded(Fs *f, uchar *ia, Block *bp)
+{
+	Block	*nbp;
+	Icmp	*p, *np;
+
+	p = (Icmp *)bp->rp;
+
+	netlog(f, Logicmp, "sending icmpttlexceeded -> %V\n", p->src);
+	/* room for our IP + ICMP headers plus the quoted IP header and 8 data bytes */
+	nbp = allocb(ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8);
+	nbp->wp += ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8;
+	np = (Icmp *)nbp->rp;
+	np->vihl = IP_VER4;
+	memmove(np->dst, p->src, sizeof(np->dst));
+	v6tov4(np->src, ia);
+	memmove(np->data, bp->rp, ICMP_IPSIZE + 8);
+	np->type = TimeExceed;
+	np->code = 0;
+	np->proto = IP_ICMPPROTO;
+	hnputs(np->icmpid, 0);
+	hnputs(np->seq, 0);
+	/* zero the checksum field before summing the ICMP part of the packet */
+	memset(np->cksum, 0, sizeof(np->cksum));
+	hnputs(np->cksum, ptclcsum(nbp, ICMP_IPSIZE, blocklen(nbp) - ICMP_IPSIZE));
+	ipoput4(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+
+}
+
+/*
+ *  Send an ICMP Unreachable with the given code back to the source of
+ *  the packet in bp, quoting its first ICMP_IPSIZE + 8 bytes.  seq is
+ *  stored in the seq field of the new message (icmpcantfrag passes the
+ *  mtu there).  Nothing is sent unless the original destination is one
+ *  of our unicast addresses, and nothing is sent if the original
+ *  source is one of our addresses but not a unicast one.
+ *  bp is only read, never freed or modified here.
+ */
+static void
+icmpunreachable(Fs *f, Block *bp, int code, int seq)
+{
+	Block	*nbp;
+	Icmp	*p, *np;
+	int	i;
+	uchar	addr[IPaddrlen];
+
+	p = (Icmp *)bp->rp;
+
+	/* only do this for unicast sources and destinations */
+	v4tov6(addr, p->dst);
+	i = ipforme(f, addr);
+	if((i&Runi) == 0)
+		return;
+	v4tov6(addr, p->src);
+	i = ipforme(f, addr);
+	/* i == 0 (ipforme does not recognise the source) is accepted */
+	if(i != 0 && (i&Runi) == 0)
+		return;
+
+	netlog(f, Logicmp, "sending icmpnoconv -> %V\n", p->src);
+	nbp = allocb(ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8);
+	nbp->wp += ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8;
+	np = (Icmp *)nbp->rp;
+	np->vihl = IP_VER4;
+	/* reply goes from the original destination to the original source */
+	memmove(np->dst, p->src, sizeof(np->dst));
+	memmove(np->src, p->dst, sizeof(np->src));
+	memmove(np->data, bp->rp, ICMP_IPSIZE + 8);
+	np->type = Unreachable;
+	np->code = code;
+	np->proto = IP_ICMPPROTO;
+	hnputs(np->icmpid, 0);
+	hnputs(np->seq, seq);
+	/* zero the checksum field before summing the ICMP part of the packet */
+	memset(np->cksum, 0, sizeof(np->cksum));
+	hnputs(np->cksum, ptclcsum(nbp, ICMP_IPSIZE, blocklen(nbp) - ICMP_IPSIZE));
+	ipoput4(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+}
+
+/*
+ *  Answer the packet in bp with an Unreachable of code 3
+ *  ("port unreachable" in unreachcode).
+ */
+extern void
+icmpnoconv(Fs *f, Block *bp)
+{
+	icmpunreachable(f, bp, 3, 0);
+}
+
+/*
+ *  Answer the packet in bp with an Unreachable of code 4
+ *  ("fragmentation needed and DF set" in unreachcode); mtu is
+ *  carried in the seq field of the message.
+ */
+extern void
+icmpcantfrag(Fs *f, Block *bp, int mtu)
+{
+	icmpunreachable(f, bp, 4, mtu);
+}
+
+/*
+ *  Deliver a received ICMP packet to the first conversation whose
+ *  local port equals the packet's id and whose remote address equals
+ *  the packet's source.  The packet is flattened into one block and
+ *  passed up the read queue; with no match it is freed.
+ */
+static void
+goticmpkt(Proto *icmp, Block *bp)
+{
+	Conv	**cp, *cv;
+	Icmp	*h;
+	uchar	from[IPaddrlen];
+	ushort	id;
+
+	h = (Icmp *) bp->rp;
+	v4tov6(from, h->src);
+	id = nhgets(h->icmpid);
+
+	for(cp = icmp->conv; (cv = *cp) != nil; cp++){
+		if(cv->lport != id || ipcmp(cv->raddr, from) != 0)
+			continue;
+		bp = concatblock(bp);
+		if(bp != nil)
+			qpass(cv->rq, bp);
+		return;
+	}
+	freeblist(bp);
+}
+
+/*
+ *  Turn the echo request in bp into an echo reply in place: swap the
+ *  IP source and destination, set the type and recompute the ICMP
+ *  checksum.  Returns bp.
+ */
+static Block *
+mkechoreply(Block *bp)
+{
+	Icmp	*h = (Icmp *)bp->rp;
+	uchar	from[4];
+
+	/* exchange source and destination */
+	memmove(from, h->src, sizeof(from));
+	memmove(h->src, h->dst, sizeof(h->src));
+	memmove(h->dst, from, sizeof(h->dst));
+
+	h->vihl = IP_VER4;
+	h->type = EchoReply;
+
+	/* zero the checksum field, then sum everything after the IP header */
+	memset(h->cksum, 0, sizeof(h->cksum));
+	hnputs(h->cksum, ptclcsum(bp, ICMP_IPSIZE, blocklen(bp) - ICMP_IPSIZE));
+
+	return bp;
+}
+
+/*
+ *  Messages for the Unreachable codes 0-5, passed to a protocol's
+ *  advise routine by icmpiput.  Codes above 5 are reported there
+ *  with entry 1.
+ */
+static char *unreachcode[] =
+{
+[0]	"net unreachable",
+[1]	"host unreachable",
+[2]	"protocol unreachable",
+[3]	"port unreachable",
+[4]	"fragmentation needed and DF set",
+[5]	"source route failed",
+};
+
+/*
+ *  Receive an ICMP packet from the IP layer.  bp starts at the IP
+ *  header and is always consumed: it is either turned into an echo
+ *  reply and sent, handed to another protocol's advise routine,
+ *  queued on a matching conversation by goticmpkt, or freed.
+ *
+ *  The packet is dropped (and counted in stats) if it is shorter than
+ *  the IP and ICMP headers, if the IP length field is smaller than
+ *  those headers or larger than the data received, or if the ICMP
+ *  checksum fails.  The lower bound on the length field matters
+ *  because the checksum is taken over iplen - ICMP_IPSIZE bytes,
+ *  which must not be negative.
+ */
+static void
+icmpiput(Proto *icmp, Ipifc*, Block *bp)
+{
+	int	n, iplen;
+	Icmp	*p;
+	Block	*r;
+	Proto	*pr;
+	char	*msg;
+	char	m2[128];
+	Icmppriv *ipriv;
+
+	ipriv = icmp->priv;
+
+	ipriv->stats[InMsgs]++;
+
+	p = (Icmp *)bp->rp;
+	n = blocklen(bp);
+	if(n < ICMP_IPSIZE+ICMP_HDRSIZE){
+		ipriv->stats[InErrors]++;
+		ipriv->stats[HlenErrs]++;
+		netlog(icmp->f, Logicmp, "icmp hlen %d\n", n);
+		goto raise;
+	}
+	/* only look at type and code once the ICMP header is known to be present */
+	netlog(icmp->f, Logicmp, "icmpiput %d %d\n", p->type, p->code);
+	iplen = nhgets(p->length);
+	if(iplen < ICMP_IPSIZE+ICMP_HDRSIZE || iplen > n){
+		ipriv->stats[LenErrs]++;
+		ipriv->stats[InErrors]++;
+		netlog(icmp->f, Logicmp, "icmp length error n %d iplen %d\n",
+			n, iplen);
+		goto raise;
+	}
+	if(ptclcsum(bp, ICMP_IPSIZE, iplen - ICMP_IPSIZE)){
+		ipriv->stats[InErrors]++;
+		ipriv->stats[CsumErrs]++;
+		netlog(icmp->f, Logicmp, "icmp checksum error n %d iplen %d\n",
+			n, iplen);
+		goto raise;
+	}
+	if(p->type <= Maxtype)
+		ipriv->in[p->type]++;
+
+	switch(p->type) {
+	case EchoRequest:
+		/* drop anything beyond the length the IP header claims */
+		if (iplen < n)
+			bp = trimblock(bp, 0, iplen);
+		r = mkechoreply(bp);
+		ipriv->out[EchoReply]++;
+		ipoput4(icmp->f, r, 0, MAXTTL, DFLTTOS, nil);
+		break;
+	case Unreachable:
+		if(p->code > 5)
+			msg = unreachcode[1];
+		else
+			msg = unreachcode[p->code];
+
+		/* step over our headers to the quoted packet */
+		bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE;
+		if(blocklen(bp) < MinAdvise){
+			ipriv->stats[LenErrs]++;
+			goto raise;
+		}
+		p = (Icmp *)bp->rp;
+		pr = Fsrcvpcolx(icmp->f, p->proto);
+		if(pr != nil && pr->advise != nil) {
+			/* the advise routine takes over bp */
+			(*pr->advise)(pr, bp, msg);
+			return;
+		}
+
+		/* nobody to advise: restore rp and deliver to a conversation */
+		bp->rp -= ICMP_IPSIZE+ICMP_HDRSIZE;
+		goticmpkt(icmp, bp);
+		break;
+	case TimeExceed:
+		if(p->code == 0){
+			snprint(m2, sizeof(m2), "ttl exceeded at %V", p->src);
+
+			/* step over our headers to the quoted packet */
+			bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE;
+			if(blocklen(bp) < MinAdvise){
+				ipriv->stats[LenErrs]++;
+				goto raise;
+			}
+			p = (Icmp *)bp->rp;
+			pr = Fsrcvpcolx(icmp->f, p->proto);
+			if(pr != nil && pr->advise != nil) {
+				/* the advise routine takes over bp */
+				(*pr->advise)(pr, bp, m2);
+				return;
+			}
+			bp->rp -= ICMP_IPSIZE+ICMP_HDRSIZE;
+		}
+
+		goticmpkt(icmp, bp);
+		break;
+	default:
+		goticmpkt(icmp, bp);
+		break;
+	}
+	return;
+
+raise:
+	freeblist(bp);
+}
+
+/*
+ *  Advise routine for ICMP itself.  bp holds a packet we sent that
+ *  drew an error; the first conversation whose local port equals its
+ *  id and whose remote address equals its destination has both queues
+ *  hung up with msg.  bp is always freed.
+ */
+void
+icmpadvise(Proto *icmp, Block *bp, char *msg)
+{
+	Conv	**cp, *cv;
+	Icmp	*h;
+	uchar	peer[IPaddrlen];
+	ushort	id;
+
+	h = (Icmp *) bp->rp;
+	v4tov6(peer, h->dst);
+	id = nhgets(h->icmpid);
+
+	for(cp = icmp->conv; (cv = *cp) != nil; cp++){
+		if(cv->lport != id || ipcmp(cv->raddr, peer) != 0)
+			continue;
+		qhangup(cv->rq, msg);
+		qhangup(cv->wq, msg);
+		break;
+	}
+	freeblist(bp);
+}
+
+/*
+ *  Format the ICMP statistics into buf (at most len bytes): one line
+ *  per general counter, then one line per message type giving the
+ *  received and sent counts.  Types without a name are printed by
+ *  number.  Returns the number of bytes produced.
+ */
+int
+icmpstats(Proto *icmp, char *buf, int len)
+{
+	Icmppriv *ipriv = icmp->priv;
+	char *cur = buf, *end = buf + len;
+	char *name;
+	int k, t;
+
+	for(k = 0; k < Nstats; k++)
+		cur = seprint(cur, end, "%s: %lud\n", statnames[k], ipriv->stats[k]);
+
+	for(t = 0; t <= Maxtype; t++){
+		name = icmpnames[t];
+		if(name == nil){
+			cur = seprint(cur, end, "%d: %lud %lud\n", t, ipriv->in[t], ipriv->out[t]);
+			continue;
+		}
+		cur = seprint(cur, end, "%s: %lud %lud\n", name, ipriv->in[t], ipriv->out[t]);
+	}
+	return cur - buf;
+}
+
+/*
+ *  Allocate the ICMP protocol descriptor with its private counters,
+ *  fill in its entry points and register it with the IP stack fs.
+ */
+void
+icmpinit(Fs *fs)
+{
+	Proto *p;
+
+	p = smalloc(sizeof(Proto));
+	p->priv = smalloc(sizeof(Icmppriv));
+
+	/* identity and limits */
+	p->name = "icmp";
+	p->ipproto = IP_ICMPPROTO;
+	p->nc = 128;
+	p->ptclsize = 0;
+
+	/* conversation management */
+	p->create = icmpcreate;
+	p->connect = icmpconnect;
+	p->announce = icmpannounce;
+	p->close = icmpclose;
+	p->state = icmpstate;
+	p->ctl = nil;
+	p->gc = nil;
+
+	/* packet input, advice and statistics */
+	p->rcv = icmpiput;
+	p->advise = icmpadvise;
+	p->stats = icmpstats;
+
+	Fsproto(fs, p);
+}

+ 908 - 0
sys/src/9/ip/icmp6.c

@@ -0,0 +1,908 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Internet Control Message Protocol for IPv6
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+#include "ip.h"
+#include "ipv6.h"
+
+/*
+ *  Indices into Icmppriv6.stats; Nstats6 is the number of counters.
+ *  The entries from HoplimErrs6 on are bumped by valid() for the
+ *  neighbor discovery checks.
+ */
+enum
+{
+	InMsgs6,
+	InErrors6,
+	OutMsgs6,
+	CsumErrs6,
+	LenErrs6,
+	HlenErrs6,
+	HoplimErrs6,
+	IcmpCodeErrs6,
+	TargetErrs6,
+	OptlenErrs6,
+	AddrmxpErrs6,
+	RouterAddrErrs6,
+
+	Nstats6,
+};
+
+/*
+ *  Bytes icmpkick6 pulls up in "headers" mode before the packet
+ *  proper: 8 bytes that are skipped, then the local and the remote
+ *  address (IPaddrlen bytes each).
+ */
+enum {
+	ICMP_USEAD6	= 40,
+};
+
+/*
+ *  Flag bits of a neighbor advertisement.  icmpiput6 builds a flag
+ *  byte from Oflag and Sflag and icmpna stores it in icmpid[0].
+ *  Rflag is not referenced in this excerpt.
+ */
+enum {
+	Oflag	= 1<<5,
+	Sflag	= 1<<6,
+	Rflag	= 1<<7,
+};
+
+/*
+ *  Values of the ICMP type byte.  The names ending in V6, together
+ *  with RouterSolicit through NbrAdvert, are the ones icmpiput6 and
+ *  valid() act on; the rest mirror the IPv4 list.  Maxtype6 sizes
+ *  icmpnames6 and the per-type counters in Icmppriv6.
+ */
+enum {
+	/* ICMPv6 types */
+	EchoReply	= 0,
+	UnreachableV6	= 1,
+	PacketTooBigV6	= 2,
+	TimeExceedV6	= 3,
+	SrcQuench	= 4,
+	ParamProblemV6	= 4,	/* same value as SrcQuench; icmpnames6 names slot 4 "SrcQuench" */
+	Redirect	= 5,
+	EchoRequest	= 8,
+	TimeExceed	= 11,
+	InParmProblem	= 12,
+	Timestamp	= 13,
+	TimestampReply	= 14,
+	InfoRequest	= 15,
+	InfoReply	= 16,
+	AddrMaskRequest = 17,
+	AddrMaskReply   = 18,
+	EchoRequestV6	= 128,
+	EchoReplyV6	= 129,
+	RouterSolicit	= 133,
+	RouterAdvert	= 134,
+	NbrSolicit	= 135,
+	NbrAdvert	= 136,
+	RedirectV6	= 137,
+
+	Maxtype6	= 137,
+};
+
+/*
+ *  Packet layouts.  Each structure embeds the previous one as an
+ *  unnamed member, so the fields of the embedded structures are
+ *  accessed directly (np->src, np->type, np->target ...).
+ */
+typedef struct ICMPpkt ICMPpkt;
+typedef struct IPICMP IPICMP;
+typedef struct Ndpkt Ndpkt;
+typedef struct NdiscC NdiscC;
+
+/* the 8 byte ICMP header */
+struct ICMPpkt {
+	uchar	type;
+	uchar	code;
+	uchar	cksum[2];
+	uchar	icmpid[2];	/* icmpna stores the advertisement flags in icmpid[0] */
+	uchar	seq[2];		/* icmppkttoobig6 writes a 4 byte value across icmpid and seq */
+};
+
+/* IPv6 header immediately followed by the ICMP header */
+struct IPICMP {
+	Ip6hdr;
+	ICMPpkt;
+};
+
+/* neighbor solicitation/advertisement without options */
+struct NdiscC
+{
+	IPICMP;
+	uchar	target[IPaddrlen];
+};
+
+/* neighbor solicitation/advertisement with one link-layer address option */
+struct Ndpkt
+{
+	NdiscC;
+	uchar	otype;
+	uchar	olen;		/* length in units of 8 octets(incl type, code),
+				 * 1 for IEEE 802 addresses */
+	uchar	lnaddr[6];	/* link-layer address */
+};
+
+/*
+ *  Per-protocol private state, allocated by icmp6init and reached
+ *  through Proto.priv.
+ */
+typedef struct Icmppriv6
+{
+	ulong	stats[Nstats6];	/* indexed by InMsgs6 ... RouterAddrErrs6 */
+
+	/* message counts */
+	ulong	in[Maxtype6+1];	/* received, indexed by ICMP type */
+	ulong	out[Maxtype6+1];	/* sent, indexed by ICMP type */
+} Icmppriv6;
+
+/*
+ *  Per-conversation control block, reached through Conv.ptcl
+ *  (icmp6init sets ptclsize to its size).
+ */
+typedef struct Icmpcb6
+{
+	QLock;
+	uchar	headers;	/* set to 6 by the "headers" ctl; see icmpkick6 */
+} Icmpcb6;
+
+/*
+ *  Printable names indexed by ICMP type.  Types without an entry are
+ *  left nil and icmpstats6 skips them.
+ */
+char *icmpnames6[Maxtype6+1] =
+{
+[EchoReply]		"EchoReply",
+[UnreachableV6]		"UnreachableV6",
+[PacketTooBigV6]	"PacketTooBigV6",
+[TimeExceedV6]		"TimeExceedV6",
+[SrcQuench]		"SrcQuench",
+[Redirect]		"Redirect",
+[EchoRequest]		"EchoRequest",
+[TimeExceed]		"TimeExceed",
+[InParmProblem]		"InParmProblem",
+[Timestamp]		"Timestamp",
+[TimestampReply]	"TimestampReply",
+[InfoRequest]		"InfoRequest",
+[InfoReply]		"InfoReply",
+[AddrMaskRequest]	"AddrMaskRequest",
+[AddrMaskReply]		"AddrMaskReply",
+[EchoRequestV6]		"EchoRequestV6",
+[EchoReplyV6]		"EchoReplyV6",
+[RouterSolicit]		"RouterSolicit",
+[RouterAdvert]		"RouterAdvert",
+[NbrSolicit]		"NbrSolicit",
+[NbrAdvert]		"NbrAdvert",
+[RedirectV6]		"RedirectV6",
+};
+
+/*
+ *  Labels for the counters in Icmppriv6.stats, printed by icmpstats6.
+ */
+static char *statnames6[Nstats6] =
+{
+[InMsgs6]	"InMsgs",
+[InErrors6]	"InErrors",
+[OutMsgs6]	"OutMsgs",
+[CsumErrs6]	"CsumErrs",
+[LenErrs6]	"LenErrs",
+[HlenErrs6]	"HlenErrs",
+[HoplimErrs6]	"HoplimErrs",
+[IcmpCodeErrs6]	"IcmpCodeErrs",
+[TargetErrs6]	"TargetErrs",
+[OptlenErrs6]	"OptlenErrs",
+[AddrmxpErrs6]	"AddrmxpErrs",
+[RouterAddrErrs6]	"RouterAddrErrs",
+};
+
+/*
+ *  Messages for the UnreachableV6 codes, indexed by the icmp6_*
+ *  constants (defined outside this file).  icmpiput6 indexes it with
+ *  the received code when that is at most 4 and with icmp6_unkn_code
+ *  otherwise.
+ */
+static char *unreachcode[] =
+{
+[icmp6_no_route]	"no route to destination",
+[icmp6_ad_prohib]	"comm with destination administratively prohibited",
+[icmp6_unassigned]	"icmp unreachable: unassigned error code (2)",
+[icmp6_adr_unreach]	"address unreachable",
+[icmp6_port_unreach]	"port unreachable",
+[icmp6_unkn_code]	"icmp unreachable: unknown code",
+};
+
+static void icmpkick6(void *x, Block *bp);
+
+/*
+ *  Set up the queues of a new ICMPv6 conversation: a 64K message-mode
+ *  read queue, and a write queue that bypasses queueing and passes
+ *  each written block straight to icmpkick6.
+ */
+static void
+icmpcreate6(Conv *c)
+{
+	c->rq = qopen(64*1024, Qmsg, 0, c);
+	c->wq = qbypass(icmpkick6, c);
+}
+
+/*
+ *  Compute and store the ICMP checksum of the packet in bp, which
+ *  starts at the IPv6 header.  The IP header is temporarily rewritten
+ *  to serve as the pseudo-header: vcf is zeroed, the payload length is
+ *  filled in, proto is zeroed and the protocol number is placed in
+ *  ttl.  On return proto is ICMPv6 again, but vcf is still zero and
+ *  ttl still holds ICMPv6, so callers set both afterwards.
+ */
+static void
+set_cksum(Block *bp)
+{
+	IPICMP *p = (IPICMP *)(bp->rp);
+
+	hnputl(p->vcf, 0);  	/* borrow IP header as pseudoheader */
+	hnputs(p->ploadlen, blocklen(bp)-IPV6HDR_LEN);
+	p->proto = 0;
+	p->ttl = ICMPv6;	/* ttl gets set later */
+	/* zero the checksum field, then sum the whole block, header included */
+	hnputs(p->cksum, 0);
+	hnputs(p->cksum, ptclcsum(bp, 0, blocklen(bp)));
+	p->proto = ICMPv6;
+}
+
+/*
+ *  Allocate a block holding packetlen zeroed bytes, with the write
+ *  pointer already advanced past them.
+ */
+static Block *
+newIPICMP(int packetlen)
+{
+	Block *b = allocb(packetlen);
+
+	memset(b->rp, 0, packetlen);
+	b->wp += packetlen;
+	return b;
+}
+
+/*
+ *  Advise routine for ICMPv6 itself.  bp holds a packet we sent that
+ *  drew an error; the first conversation whose local port equals its
+ *  id and whose remote address equals its destination has both queues
+ *  hung up with msg.  bp is always freed.
+ */
+void
+icmpadvise6(Proto *icmp, Block *bp, char *msg)
+{
+	IPICMP *h = (IPICMP *)bp->rp;
+	ushort id = nhgets(h->icmpid);
+	Conv **cp, *cv;
+
+	for(cp = icmp->conv; (cv = *cp) != nil; cp++){
+		if(cv->lport != id || ipcmp(cv->raddr, h->dst) != 0)
+			continue;
+		qhangup(cv->rq, msg);
+		qhangup(cv->wq, msg);
+		break;
+	}
+	freeblist(bp);
+}
+
+/*
+ *  Output routine installed on the write queue by icmpcreate6.
+ *  x is the conversation and bp a block written by the user.
+ *
+ *  In "headers" mode (icb->headers == 6) the block begins with
+ *  ICMP_USEAD6 bytes giving the local and remote addresses; these are
+ *  stripped and room for an IPv6 header is prepended.  Otherwise the
+ *  block must already have the IPICMP layout and the conversation's
+ *  addresses and local port are used.
+ *
+ *  Blocks too short for an IPICMP are dropped silently.
+ */
+static void
+icmpkick6(void *x, Block *bp)
+{
+	uchar laddr[IPaddrlen], raddr[IPaddrlen];
+	Conv *c = x;
+	IPICMP *p;
+	Icmppriv6 *ipriv = c->p->priv;
+	Icmpcb6 *icb = (Icmpcb6*)c->ptcl;
+
+	if(bp == nil)
+		return;
+
+	if(icb->headers==6) {
+		/* get user specified addresses */
+		bp = pullupblock(bp, ICMP_USEAD6);
+		if(bp == nil)
+			return;
+		/* skip the first 8 bytes, then read local and remote address */
+		bp->rp += 8;
+		ipmove(laddr, bp->rp);
+		bp->rp += IPaddrlen;
+		ipmove(raddr, bp->rp);
+		bp->rp += IPaddrlen;
+		/* make room in front for the IPv6 header */
+		bp = padblock(bp, sizeof(Ip6hdr));
+	}
+
+	if(blocklen(bp) < sizeof(IPICMP)){
+		freeblist(bp);
+		return;
+	}
+	p = (IPICMP *)(bp->rp);
+	if(icb->headers == 6) {
+		ipmove(p->dst, raddr);
+		ipmove(p->src, laddr);
+	} else {
+		ipmove(p->dst, c->raddr);
+		ipmove(p->src, c->laddr);
+		/* the local port doubles as the ICMP id; goticmpkt6 matches on it */
+		hnputs(p->icmpid, c->lport);
+	}
+
+	set_cksum(bp);
+	/* set_cksum zeroed vcf; put the IP version back */
+	p->vcf[0] = 0x06 << 4;
+	/* type comes from the user; only count the ones the table covers */
+	if(p->type <= Maxtype6)
+		ipriv->out[p->type]++;
+	ipoput6(c->p->f, bp, 0, c->ttl, c->tos, nil);
+}
+
+/*
+ *  Control requests for an ICMPv6 conversation.  The single word
+ *  "headers" switches the conversation to the mode in which icmpkick6
+ *  takes the addresses from the written data.  Returns nil on success
+ *  or an error string for anything else.
+ */
+char*
+icmpctl6(Conv *c, char **argv, int argc)
+{
+	Icmpcb6 *cb = (Icmpcb6*) c->ptcl;
+
+	if(argc != 1 || strcmp(argv[0], "headers") != 0)
+		return "unknown control request";
+
+	cb->headers = 6;
+	return nil;
+}
+
+/*
+ *  Deliver a received ICMPv6 packet to the first conversation whose
+ *  local port and remote address match.  With muxkey 0 the key is the
+ *  packet's id and the address its source; otherwise the key is
+ *  muxkey and the address the packet's destination.  The packet is
+ *  flattened into one block and passed up the read queue; with no
+ *  match it is freed.
+ */
+static void
+goticmpkt6(Proto *icmp, Block *bp, int muxkey)
+{
+	IPICMP *h = (IPICMP *)bp->rp;
+	Conv **cp, *cv;
+	uchar *peer;
+	ushort key;
+
+	key = muxkey;
+	peer = h->dst;
+	if(muxkey == 0){
+		key = nhgets(h->icmpid);
+		peer = h->src;
+	}
+
+	for(cp = icmp->conv; (cv = *cp) != nil; cp++){
+		if(cv->lport != key || ipcmp(cv->raddr, peer) != 0)
+			continue;
+		bp = concatblock(bp);
+		if(bp != nil)
+			qpass(cv->rq, bp);
+		return;
+	}
+
+	freeblist(bp);
+}
+
+/*
+ *  Turn the echo request in bp into an echo reply in place.  The
+ *  reply goes to the request's source.  Its own source is the
+ *  request's destination unless that was multicast, in which case a
+ *  local address of ifc is used.  Returns bp, or nil (bp untouched
+ *  apart from the failed lookup) when no local address is found.
+ */
+static Block *
+mkechoreply6(Block *bp, Ipifc *ifc)
+{
+	IPICMP *h = (IPICMP *)(bp->rp);
+	uchar requester[IPaddrlen];
+
+	ipmove(requester, h->src);
+	if(isv6mcast(h->dst)){
+		if(!ipv6anylocal(ifc, h->src))
+			return nil;
+	}else
+		ipmove(h->src, h->dst);
+	ipmove(h->dst, requester);
+
+	h->type = EchoReplyV6;
+	set_cksum(bp);
+	return bp;
+}
+
+/*
+ * sends out an ICMPv6 neighbor solicitation
+ * 	suni == SRC_UNSPEC or SRC_UNI,
+ *	tuni == TARG_MULTI => multicast for address resolution,
+ * 	and tuni == TARG_UNI => neighbor reachability.
+ *
+ * With suni == SRC_UNSPEC the unspecified address is used as source
+ * and no link-layer address option is sent (src and mac are unused);
+ * otherwise src is the source and mac goes into a source link-layer
+ * address option.  targ is both the solicited target and, depending
+ * on tuni, either the destination itself or the address from which
+ * ipv62smcast derives the destination.
+ */
+extern void
+icmpns(Fs *f, uchar* src, int suni, uchar* targ, int tuni, uchar* mac)
+{
+	Block *nbp;
+	Ndpkt *np;
+	Proto *icmp = f->t2p[ICMPv6];
+	Icmppriv6 *ipriv = icmp->priv;
+
+	nbp = newIPICMP(sizeof(Ndpkt));
+	np = (Ndpkt*) nbp->rp;
+
+	if(suni == SRC_UNSPEC)
+		memmove(np->src, v6Unspecified, IPaddrlen);
+	else
+		memmove(np->src, src, IPaddrlen);
+
+	if(tuni == TARG_UNI)
+		memmove(np->dst, targ, IPaddrlen);
+	else
+		ipv62smcast(np->dst, targ);
+
+	np->type = NbrSolicit;
+	np->code = 0;
+	memmove(np->target, targ, IPaddrlen);
+	if(suni != SRC_UNSPEC) {
+		np->otype = SRC_LLADDR;
+		np->olen = 1;		/* 1+1+6 = 8 = 1 8-octet */
+		memmove(np->lnaddr, mac, sizeof(np->lnaddr));
+	} else
+		/* no option: shorten the packet to end after the target */
+		nbp->wp -= sizeof(Ndpkt) - sizeof(NdiscC);
+
+	set_cksum(nbp);
+	np = (Ndpkt*)nbp->rp;
+	/* set_cksum left ttl and vcf clobbered; fill them in now */
+	np->ttl = HOP_LIMIT;
+	np->vcf[0] = 0x06 << 4;
+	ipriv->out[NbrSolicit]++;
+	netlog(f, Logicmp, "sending neighbor solicitation %I\n", targ);
+	ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+}
+
+/*
+ * sends out an ICMPv6 neighbor advertisement. flags == RSO flags
+ * (Rflag, Sflag, Oflag), stored in the first byte after the checksum.
+ * src and dst are the IP addresses of the message, targ the address
+ * being advertised and mac the link-layer address placed in the
+ * target link-layer address option.  All arguments are copied.
+ */
+extern void
+icmpna(Fs *f, uchar* src, uchar* dst, uchar* targ, uchar* mac, uchar flags)
+{
+	Block *nbp;
+	Ndpkt *np;
+	Proto *icmp = f->t2p[ICMPv6];
+	Icmppriv6 *ipriv = icmp->priv;
+
+	nbp = newIPICMP(sizeof(Ndpkt));
+	np = (Ndpkt*)nbp->rp;
+
+	memmove(np->src, src, IPaddrlen);
+	memmove(np->dst, dst, IPaddrlen);
+
+	np->type = NbrAdvert;
+	np->code = 0;
+	np->icmpid[0] = flags;
+	memmove(np->target, targ, IPaddrlen);
+
+	np->otype = TARGET_LLADDR;
+	np->olen = 1;
+	memmove(np->lnaddr, mac, sizeof(np->lnaddr));
+
+	set_cksum(nbp);
+	np = (Ndpkt*) nbp->rp;
+	/* set_cksum left ttl and vcf clobbered; fill them in now */
+	np->ttl = HOP_LIMIT;
+	np->vcf[0] = 0x06 << 4;
+	ipriv->out[NbrAdvert]++;
+	netlog(f, Logicmp, "sending neighbor advertisement %I\n", src);
+	ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+}
+
+/*
+ *  Build an ICMPv6 destination unreachable message with the given
+ *  code in answer to the packet in bp, quoting as much of bp as fits
+ *  in v6MINTU.  Nothing is built when the packet's source is
+ *  multicast or when ifc has no usable local address.
+ *
+ *  If free is non-zero the new message is fed to ipiput6 (delivered
+ *  locally) and bp is freed on every path; otherwise the message is
+ *  sent with ipoput6 and bp is left untouched for the caller.
+ *
+ *  The read lock on ifc is taken only around the part that uses ifc
+ *  and is released on every path that took it.
+ */
+extern void
+icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free)
+{
+	int osz = BLEN(bp);
+	int sz = MIN(sizeof(IPICMP) + osz, v6MINTU);
+	Block *nbp;
+	IPICMP *np;
+	Ip6hdr *p;
+	Proto *icmp = f->t2p[ICMPv6];
+	Icmppriv6 *ipriv = icmp->priv;
+
+	p = (Ip6hdr *)bp->rp;
+
+	/* the lock is not held yet, so skip straight to the cleanup */
+	if(isv6mcast(p->src))
+		goto clean;
+
+	nbp = newIPICMP(sz);
+	np = (IPICMP *)nbp->rp;
+
+	rlock(ifc);
+	if(!ipv6anylocal(ifc, np->src)){
+		netlog(f, Logicmp, "icmphostunr fail -> s%I d%I\n",
+			p->src, p->dst);
+		runlock(ifc);
+		freeblist(nbp);
+		goto clean;
+	}
+	netlog(f, Logicmp, "send icmphostunr -> s%I d%I\n",
+		p->src, p->dst);
+
+	memmove(np->dst, p->src, IPaddrlen);
+	np->type = UnreachableV6;
+	np->code = code;
+	/* quote the offending packet after our headers */
+	memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP));
+	set_cksum(nbp);
+	/* set_cksum left ttl and vcf clobbered; fill them in now */
+	np->ttl = HOP_LIMIT;
+	np->vcf[0] = 0x06 << 4;
+	ipriv->out[UnreachableV6]++;
+
+	if(free)
+		ipiput6(f, ifc, nbp);
+	else
+		ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+	runlock(ifc);
+
+clean:
+	/* bp belongs to us only when the caller said so */
+	if(free)
+		freeblist(bp);
+}
+
+/*
+ *  Send an ICMPv6 time exceeded message (code 0) to the source of the
+ *  packet in bp, quoting as much of bp as fits in v6MINTU.  Nothing
+ *  is sent when the source is multicast or when ifc has no usable
+ *  local address; in the latter case the block already allocated for
+ *  the reply is released.  bp is only read, never freed here.
+ */
+extern void
+icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp)
+{
+	int osz = BLEN(bp);
+	int sz = MIN(sizeof(IPICMP) + osz, v6MINTU);
+	Block *nbp;
+	IPICMP *np;
+	Ip6hdr *p;
+	Proto *icmp = f->t2p[ICMPv6];
+	Icmppriv6 *ipriv = icmp->priv;
+
+	p = (Ip6hdr *)bp->rp;
+
+	if(isv6mcast(p->src))
+		return;
+
+	nbp = newIPICMP(sz);
+	np = (IPICMP *) nbp->rp;
+
+	if(ipv6anylocal(ifc, np->src))
+		netlog(f, Logicmp, "send icmpttlexceeded6 -> s%I d%I\n",
+			p->src, p->dst);
+	else {
+		netlog(f, Logicmp, "icmpttlexceeded6 fail -> s%I d%I\n",
+			p->src, p->dst);
+		/* no source address to send from: don't leak the reply block */
+		freeblist(nbp);
+		return;
+	}
+
+	memmove(np->dst, p->src, IPaddrlen);
+	np->type = TimeExceedV6;
+	np->code = 0;
+	/* quote the offending packet after our headers */
+	memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP));
+	set_cksum(nbp);
+	/* set_cksum left ttl and vcf clobbered; fill them in now */
+	np->ttl = HOP_LIMIT;
+	np->vcf[0] = 0x06 << 4;
+	ipriv->out[TimeExceedV6]++;
+	ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+}
+
+/*
+ *  Send an ICMPv6 packet too big message to the source of the packet
+ *  in bp, reporting ifc's maximum transfer unit less the medium's
+ *  header size and quoting as much of bp as fits in v6MINTU.  Nothing
+ *  is sent when the source is multicast or when ifc has no usable
+ *  local address; in the latter case the block already allocated for
+ *  the reply is released.  bp is only read, never freed here.
+ */
+extern void
+icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp)
+{
+	int osz = BLEN(bp);
+	int sz = MIN(sizeof(IPICMP) + osz, v6MINTU);
+	Block *nbp;
+	IPICMP *np;
+	Ip6hdr *p;
+	Proto *icmp = f->t2p[ICMPv6];
+	Icmppriv6 *ipriv = icmp->priv;
+
+	p = (Ip6hdr *)bp->rp;
+
+	if(isv6mcast(p->src))
+		return;
+
+	nbp = newIPICMP(sz);
+	np = (IPICMP *)nbp->rp;
+
+	if(ipv6anylocal(ifc, np->src))
+		netlog(f, Logicmp, "send icmppkttoobig6 -> s%I d%I\n",
+			p->src, p->dst);
+	else {
+		netlog(f, Logicmp, "icmppkttoobig6 fail -> s%I d%I\n",
+			p->src, p->dst);
+		/* no source address to send from: don't leak the reply block */
+		freeblist(nbp);
+		return;
+	}
+
+	memmove(np->dst, p->src, IPaddrlen);
+	np->type = PacketTooBigV6;
+	np->code = 0;
+	/* the 4 byte mtu occupies icmpid and seq, which are adjacent */
+	hnputl(np->icmpid, ifc->maxtu - ifc->medium->hsize);
+	/* quote the offending packet after our headers */
+	memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP));
+	set_cksum(nbp);
+	/* set_cksum left ttl and vcf clobbered; fill them in now */
+	np->ttl = HOP_LIMIT;
+	np->vcf[0] = 0x06 << 4;
+	ipriv->out[PacketTooBigV6]++;
+	ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
+}
+
+/*
+ * RFC 2461, pages 39-40, pages 57-58.
+ *
+ * Check a received ICMPv6 packet (bp starts at the IPv6 header).
+ * Returns 1 if it passes, 0 otherwise; every failure bumps InErrors6
+ * and usually a more specific counter in ipriv.  bp is never freed.
+ *
+ * The general checks are header length, payload length and checksum.
+ * While the checksum is computed the first 4 bytes of the header are
+ * zeroed (and stay zero) and proto/ttl are swapped around to form the
+ * pseudo-header; proto and ttl are restored only when the checksum
+ * is good.
+ *
+ * Neighbor discovery messages get extra checks: hop limit, code,
+ * minimum length, target and option sanity.
+ */
+static int
+valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv)
+{
+	int sz, osz, unsp, n, ttl, iplen;
+	int pktsz = BLEN(bp);
+	uchar *packet = bp->rp;
+	IPICMP *p = (IPICMP *) packet;
+	Ndpkt *np;
+
+	USED(ifc);
+	n = blocklen(bp);
+	if(n < sizeof(IPICMP)) {
+		ipriv->stats[HlenErrs6]++;
+		netlog(icmp->f, Logicmp, "icmp hlen %d\n", n);
+		goto err;
+	}
+
+	iplen = nhgets(p->ploadlen);
+	if(iplen > n-IPV6HDR_LEN) {
+		ipriv->stats[LenErrs6]++;
+		netlog(icmp->f, Logicmp, "icmp length %d\n", iplen);
+		goto err;
+	}
+
+	/* Rather than construct explicit pseudoheader, overwrite IPv6 header */
+	if(p->proto != ICMPv6) {
+		/* This code assumes no extension headers!!! */
+		netlog(icmp->f, Logicmp, "icmp error: extension header\n");
+		goto err;
+	}
+	memset(packet, 0, 4);
+	ttl = p->ttl;
+	p->ttl = p->proto;
+	p->proto = 0;
+	if(ptclcsum(bp, 0, iplen + IPV6HDR_LEN)) {
+		ipriv->stats[CsumErrs6]++;
+		netlog(icmp->f, Logicmp, "icmp checksum error\n");
+		goto err;
+	}
+	p->proto = p->ttl;
+	p->ttl = ttl;
+
+	/* additional tests for some pkt types */
+	if (p->type == NbrSolicit   || p->type == NbrAdvert ||
+	    p->type == RouterAdvert || p->type == RouterSolicit ||
+	    p->type == RedirectV6) {
+		if(p->ttl != HOP_LIMIT) {
+			ipriv->stats[HoplimErrs6]++;
+			goto err;
+		}
+		if(p->code != 0) {
+			ipriv->stats[IcmpCodeErrs6]++;
+			goto err;
+		}
+
+		switch (p->type) {
+		case NbrSolicit:
+		case NbrAdvert:
+			/* the ICMP part must at least hold the target address */
+			if(iplen < sizeof(NdiscC) - sizeof(Ip6hdr)) {
+				ipriv->stats[HlenErrs6]++;
+				goto err;
+			}
+			np = (Ndpkt*) p;
+			if(isv6mcast(np->target)) {
+				ipriv->stats[TargetErrs6]++;
+				goto err;
+			}
+			if(optexsts(np) && np->olen == 0) {
+				ipriv->stats[OptlenErrs6]++;
+				goto err;
+			}
+
+			/* solicitation from the unspecified address */
+			if (p->type == NbrSolicit &&
+			    ipcmp(np->src, v6Unspecified) == 0)
+				if(!issmcast(np->dst) || optexsts(np)) {
+					ipriv->stats[AddrmxpErrs6]++;
+					goto err;
+				}
+
+			/*
+			 * a multicast advertisement must not be solicited;
+			 * the flags live in the first byte after the
+			 * checksum, where icmpna puts them
+			 */
+			if(p->type == NbrAdvert)
+				if(isv6mcast(np->dst) &&
+				    (np->icmpid[0] & Sflag)){
+					ipriv->stats[AddrmxpErrs6]++;
+					goto err;
+				}
+			break;
+
+		case RouterAdvert:
+			if(pktsz - sizeof(Ip6hdr) < 16) {
+				ipriv->stats[HlenErrs6]++;
+				goto err;
+			}
+			if(!islinklocal(p->src)) {
+				ipriv->stats[RouterAddrErrs6]++;
+				goto err;
+			}
+			/* walk the options; each carries its length in 8 byte units */
+			sz = sizeof(IPICMP) + 8;
+			while (sz+1 < pktsz) {
+				osz = packet[sz+1];
+				if(osz <= 0) {
+					ipriv->stats[OptlenErrs6]++;
+					goto err;
+				}
+				sz += 8*osz;
+			}
+			break;
+
+		case RouterSolicit:
+			if(pktsz - sizeof(Ip6hdr) < 8) {
+				ipriv->stats[HlenErrs6]++;
+				goto err;
+			}
+			unsp = (ipcmp(p->src, v6Unspecified) == 0);
+			/*
+			 * walk the options; a source link-layer address
+			 * option is rejected when the source is unspecified
+			 */
+			sz = sizeof(IPICMP) + 8;
+			while (sz+1 < pktsz) {
+				osz = packet[sz+1];
+				if(osz <= 0 ||
+				    (unsp && packet[sz] == SRC_LLADDR)) {
+					ipriv->stats[OptlenErrs6]++;
+					goto err;
+				}
+				sz += 8*osz;
+			}
+			break;
+
+		case RedirectV6:
+			/* to be filled in */
+			break;
+
+		default:
+			goto err;
+		}
+	}
+	return 1;
+err:
+	ipriv->stats[InErrors6]++;
+	return 0;
+}
+
+/*
+ *  Classify target with respect to ifc, under ifc's read lock:
+ *  Tuniproxy if ipproxyifc accepts it, Tunitent or Tunirany if it is
+ *  one of ifc's local addresses (tentative or not), 0 otherwise.
+ */
+static int
+targettype(Fs *f, Ipifc *ifc, uchar *target)
+{
+	Iplifc *l;
+	int kind = 0;
+
+	rlock(ifc);
+	if(ipproxyifc(f, ifc, target))
+		kind = Tuniproxy;
+	else
+		for(l = ifc->lifc; l != nil; l = l->next){
+			if(ipcmp(l->local, target) != 0)
+				continue;
+			if(l->tentative)
+				kind = Tunitent;
+			else
+				kind = Tunirany;
+			break;
+		}
+	runlock(ifc);
+
+	return kind;
+}
+
+/*
+ *  Receive an ICMPv6 packet from the IP layer.  bp starts at the IPv6
+ *  header and is always consumed: it is turned into an echo reply and
+ *  sent, handed to another protocol's advise routine, queued on a
+ *  matching conversation by goticmpkt6, or freed.
+ *
+ *  Packets that fail valid() or carry a type above Maxtype6 are
+ *  dropped.  Neighbor solicitations for one of our addresses are
+ *  answered with icmpna; neighbor advertisements are entered into the
+ *  ARP table.
+ */
+static void
+icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp)
+{
+	int refresh = 1;
+	char *msg, m2[128];
+	uchar pktflags;
+	uchar *packet = bp->rp;
+	uchar lsrc[IPaddrlen];
+	Block *r;
+	IPICMP *p = (IPICMP *)packet;
+	Icmppriv6 *ipriv = icmp->priv;
+	Iplifc *lifc;
+	Ndpkt* np;
+	Proto *pr;
+
+	if(!valid(icmp, ipifc, bp, ipriv) || p->type > Maxtype6)
+		goto raise;
+
+	ipriv->in[p->type]++;
+
+	switch(p->type) {
+	case EchoRequestV6:
+		r = mkechoreply6(bp, ipifc);
+		if(r == nil)
+			goto raise;
+		/* mkechoreply6 set the type to EchoReplyV6; count it as such */
+		ipriv->out[EchoReplyV6]++;
+		ipoput6(icmp->f, r, 0, MAXTTL, DFLTTOS, nil);
+		break;
+
+	case UnreachableV6:
+		if(p->code > 4)
+			msg = unreachcode[icmp6_unkn_code];
+		else
+			msg = unreachcode[p->code];
+
+		/* step over our headers to the quoted packet */
+		bp->rp += sizeof(IPICMP);
+		if(blocklen(bp) < 8){
+			ipriv->stats[LenErrs6]++;
+			goto raise;
+		}
+		p = (IPICMP *)bp->rp;
+		pr = Fsrcvpcolx(icmp->f, p->proto);
+		if(pr != nil && pr->advise != nil) {
+			/* the advise routine takes over bp */
+			(*pr->advise)(pr, bp, msg);
+			return;
+		}
+
+		/* nobody to advise: restore rp and deliver to a conversation */
+		bp->rp -= sizeof(IPICMP);
+		goticmpkt6(icmp, bp, 0);
+		break;
+
+	case TimeExceedV6:
+		if(p->code == 0){
+			snprint(m2, sizeof(m2), "ttl exceeded at %I", p->src);
+
+			/* step over our headers to the quoted packet */
+			bp->rp += sizeof(IPICMP);
+			if(blocklen(bp) < 8){
+				ipriv->stats[LenErrs6]++;
+				goto raise;
+			}
+			p = (IPICMP *)bp->rp;
+			pr = Fsrcvpcolx(icmp->f, p->proto);
+			if(pr && pr->advise) {
+				/* the advise routine takes over bp */
+				(*pr->advise)(pr, bp, m2);
+				return;
+			}
+			bp->rp -= sizeof(IPICMP);
+		}
+
+		goticmpkt6(icmp, bp, 0);
+		break;
+
+	case RouterAdvert:
+	case RouterSolicit:
+		/* using lsrc as a temp, munge hdr for goticmp6 */
+		if (0) {
+			memmove(lsrc, p->src, IPaddrlen);
+			memmove(p->src, p->dst, IPaddrlen);
+			memmove(p->dst, lsrc, IPaddrlen);
+		}
+		/* demultiplex on the message type rather than the id */
+		goticmpkt6(icmp, bp, p->type);
+		break;
+
+	case NbrSolicit:
+		np = (Ndpkt*) p;
+		pktflags = 0;
+		switch (targettype(icmp->f, ipifc, np->target)) {
+		case Tunirany:
+			pktflags |= Oflag;
+			/* fall through */
+
+		case Tuniproxy:
+			if(ipcmp(np->src, v6Unspecified) != 0) {
+				arpenter(icmp->f, V6, np->src, np->lnaddr,
+					8*np->olen-2, 0);
+				pktflags |= Sflag;
+			}
+			if(ipv6local(ipifc, lsrc))
+				icmpna(icmp->f, lsrc,
+					(ipcmp(np->src, v6Unspecified) == 0?
+						v6allnodesL: np->src),
+					np->target, ipifc->mac, pktflags);
+			/*
+			 * icmpna copies what it needs into a block of its
+			 * own, so the solicitation is ours to free whether
+			 * or not an advertisement was sent
+			 */
+			freeblist(bp);
+			break;
+
+		case Tunitent:
+			/* not clear what needs to be done. send up
+			 * an icmp mesg saying don't use this address? */
+		default:
+			freeblist(bp);
+		}
+		break;
+
+	case NbrAdvert:
+		np = (Ndpkt*) p;
+
+		/*
+		 * if the target address matches one of the local interface
+		 * addresses and the local interface address has tentative bit
+		 * set, insert into ARP table. this is so the duplicate address
+		 * detection part of ipconfig can discover duplication through
+		 * the arp table.
+		 */
+		lifc = iplocalonifc(ipifc, np->target);
+		if(lifc && lifc->tentative)
+			refresh = 0;
+		arpenter(icmp->f, V6, np->target, np->lnaddr, 8*np->olen-2,
+			refresh);
+		freeblist(bp);
+		break;
+
+	case PacketTooBigV6:
+	default:
+		goticmpkt6(icmp, bp, 0);
+		break;
+	}
+	return;
+raise:
+	freeblist(bp);
+}
+
+/*
+ *  Format the ICMPv6 statistics into buf (at most len bytes): one
+ *  line per general counter, then one line per named message type
+ *  giving the received and sent counts.  Types without a name are
+ *  not printed.  Returns the number of bytes produced.
+ */
+int
+icmpstats6(Proto *icmp6, char *buf, int len)
+{
+	Icmppriv6 *ipriv = icmp6->priv;
+	char *cur = buf, *end = buf + len;
+	int k, t;
+
+	for(k = 0; k < Nstats6; k++)
+		cur = seprint(cur, end, "%s: %lud\n", statnames6[k], ipriv->stats[k]);
+
+	for(t = 0; t <= Maxtype6; t++){
+		if(icmpnames6[t] == nil)
+			continue;
+		cur = seprint(cur, end, "%s: %lud %lud\n", icmpnames6[t],
+			ipriv->in[t], ipriv->out[t]);
+	}
+	return cur - buf;
+}
+
+
+/* import from icmp.c */
+extern int	icmpstate(Conv *c, char *state, int n);
+extern char*	icmpannounce(Conv *c, char **argv, int argc);
+extern char*	icmpconnect(Conv *c, char **argv, int argc);
+extern void	icmpclose(Conv *c);
+
+/*
+ *  Allocate the ICMPv6 protocol descriptor with its private counters,
+ *  fill in its entry points and register it with the IP stack fs.
+ *  Connect, announce, state and close are shared with icmp.c.
+ */
+void
+icmp6init(Fs *fs)
+{
+	Proto *p;
+
+	p = smalloc(sizeof(Proto));
+	p->priv = smalloc(sizeof(Icmppriv6));
+
+	/* identity and limits */
+	p->name = "icmpv6";
+	p->ipproto = ICMPv6;
+	p->nc = 16;
+	p->ptclsize = sizeof(Icmpcb6);
+
+	/* conversation management */
+	p->create = icmpcreate6;
+	p->connect = icmpconnect;
+	p->announce = icmpannounce;
+	p->close = icmpclose;
+	p->state = icmpstate;
+	p->ctl = icmpctl6;
+	p->gc = nil;
+
+	/* packet input, advice and statistics */
+	p->rcv = icmpiput6;
+	p->advise = icmpadvise6;
+	p->stats = icmpstats6;
+
+	Fsproto(fs, p);
+}

+ 50 - 0
sys/src/9/ip/inferno.c

@@ -0,0 +1,50 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"ip.h"
+
+/*
+ *  some hacks for commonality twixt inferno and plan9
+ */
+
+/* Return the user name of the current process (up->user). */
+char*
+commonuser(void)
+{
+	return up->user;
+}
+
+/* Plain pass-through to fdtochan with the same arguments. */
+Chan*
+commonfdtochan(int fd, int mode, int a, int b)
+{
+	return fdtochan(fd, mode, a, b);
+}
+
+/* Return the error string of the current process (up->errstr). */
+char*
+commonerror(void)
+{
+	return up->errstr;
+}
+
+/*
+ *  Stub: always returns a fixed error string and ignores the interface.
+ *  NOTE(review): the string is misspelled ("unimplmented"); left as is
+ *  because it is visible at run time.
+ */
+char*
+bootp(Ipifc*)
+{
+	return "unimplmented";
+}
+
+/* Stub: ignores its arguments and always returns 0. */
+int
+bootpread(char*, ulong, int)
+{
+	return	0;
+}

+ 814 - 0
sys/src/9/ip/ip.c

@@ -0,0 +1,814 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+
+typedef struct Ip4hdr		Ip4hdr;
+typedef struct IP		IP;
+typedef struct Fragment4	Fragment4;
+typedef struct Fragment6	Fragment6;
+typedef struct Ipfrag		Ipfrag;
+
+enum
+{
+	IP4HDR		= 20,		/* sizeof(Ip4hdr) */
+	IP6HDR		= 40,		/* sizeof(Ip6hdr) */
+	IP_HLEN4	= 0x05,		/* Header length in words */
+	IP_DF		= 0x4000,	/* Don't fragment */
+	IP_MF		= 0x2000,	/* More fragments */
+	IP6FHDR		= 8, 		/* sizeof(Fraghdr6) */
+	IP_MAX		= 64*1024,	/* Maximum Internet packet size */
+};
+
+#define BLKIPVER(xp)	(((Ip4hdr*)((xp)->rp))->vihl&0xF0)
+
+struct Ip4hdr
+{
+	uchar	vihl;		/* Version and header length */
+	uchar	tos;		/* Type of service */
+	uchar	length[2];	/* packet length */
+	uchar	id[2];		/* ip->identification */
+	uchar	frag[2];	/* Fragment information */
+	uchar	ttl;		/* Time to live */
+	uchar	proto;		/* Protocol */
+	uchar	cksum[2];	/* Header checksum */
+	uchar	src[4];		/* IP source */
+	uchar	dst[4];		/* IP destination */
+};
+
+/* MIB II counters */
+enum
+{
+	Forwarding,
+	DefaultTTL,
+	InReceives,
+	InHdrErrors,
+	InAddrErrors,
+	ForwDatagrams,
+	InUnknownProtos,
+	InDiscards,
+	InDelivers,
+	OutRequests,
+	OutDiscards,
+	OutNoRoutes,
+	ReasmTimeout,
+	ReasmReqds,
+	ReasmOKs,
+	ReasmFails,
+	FragOKs,
+	FragFails,
+	FragCreates,
+
+	Nstats,
+};
+
+struct Fragment4
+{
+	Block*	blist;		/* fragments held so far; ip4reassemble keeps them ordered by offset */
+	Fragment4*	next;	/* link on the in-use list (flisthead4) or the free list (fragfree4) */
+	ulong 	src;		/* src, dst and id together identify the datagram being rebuilt */
+	ulong 	dst;
+	ushort	id;
+	ulong 	age;		/* deadline on the NOW clock; queues past it are freed */
+};
+
+struct Fragment6
+{
+	Block*	blist;		/* NOTE(review): presumably used like Fragment4.blist; the v6 users are not in this file's visible part */
+	Fragment6*	next;	/* initfrag chains these into ip->fragfree6 */
+	uchar 	src[IPaddrlen];
+	uchar 	dst[IPaddrlen];
+	uint	id;
+	ulong 	age;
+};
+
+struct Ipfrag
+{
+	ushort	foff;	/* byte offset of this fragment's data within the datagram */
+	ushort	flen;	/* bytes of data in the fragment, IP header not counted */
+};
+
+/* an instance of IP */
+struct IP
+{
+	ulong		stats[Nstats];	/* MIB II counters; ipstats prints them by name */
+
+	QLock		fraglock4;	/* held while the two v4 lists below are walked or changed */
+	Fragment4*	flisthead4;	/* reassembly queues in use, newest first */
+	Fragment4*	fragfree4;	/* spare reassembly queues */
+	Ref		id4;		/* incref'd by ipoput4 to number locally generated packets */
+
+	QLock		fraglock6;	/* v6 counterparts of the four fields above; only initfrag touches them in this file */
+	Fragment6*	flisthead6;
+	Fragment6*	fragfree6;
+	Ref		id6;
+
+	int		iprouting;	/* true if we route like a gateway */
+};
+
+static char *statnames[] =
+{
+[Forwarding]	"Forwarding",
+[DefaultTTL]	"DefaultTTL",
+[InReceives]	"InReceives",
+[InHdrErrors]	"InHdrErrors",
+[InAddrErrors]	"InAddrErrors",
+[ForwDatagrams]	"ForwDatagrams",
+[InUnknownProtos]	"InUnknownProtos",
+[InDiscards]	"InDiscards",
+[InDelivers]	"InDelivers",
+[OutRequests]	"OutRequests",
+[OutDiscards]	"OutDiscards",
+[OutNoRoutes]	"OutNoRoutes",
+[ReasmTimeout]	"ReasmTimeout",
+[ReasmReqds]	"ReasmReqds",
+[ReasmOKs]	"ReasmOKs",
+[ReasmFails]	"ReasmFails",
+[FragOKs]	"FragOKs",
+[FragFails]	"FragFails",
+[FragCreates]	"FragCreates",
+};
+
+#define BLKIP(xp)	((Ip4hdr*)((xp)->rp))
+/*
+ * This sleazy macro relies on the media header size being
+ * larger than sizeof(Ipfrag). ipreassemble checks this is true
+ */
+#define BKFG(xp)	((Ipfrag*)((xp)->base))
+
+ushort		ipcsum(uchar*);
+Block*		ip4reassemble(IP*, int, Block*, Ip4hdr*);
+void		ipfragfree4(IP*, Fragment4*);
+Fragment4*	ipfragallo4(IP*);
+
+
+/*
+ * ip_init_6 - allocate the stack's v6params, load the default router
+ * and host parameters, and hang the result on f->v6p.
+ */
+void
+ip_init_6(Fs *f)
+{
+	v6params *v6p;
+	Routerparams *rp;
+
+	v6p = smalloc(sizeof(*v6p));
+	rp = &v6p->rp;
+
+	/* router advertisement timing, in milliseconds */
+	rp->maxraint = 600000;
+	rp->minraint = 200000;
+	rp->routerlt = 3*rp->maxraint;
+
+	/* flags and options that default to off/unset */
+	rp->mflag = 0;			/* default not managed */
+	rp->oflag = 0;
+	rp->linkmtu = 0;		/* no mtu sent */
+	rp->reachtime = 0;
+	rp->rxmitra = 0;
+	rp->ttl = MAXTTL;
+
+	v6p->hp.rxmithost = 1000;	/* v6 RETRANS_TIMER */
+	v6p->cdrouter = -1;
+
+	f->v6p = v6p;
+}
+
+/*
+ * initfrag - allocate the pools of reassembly queues.
+ *
+ * size entries of Fragment4 and size entries of Fragment6 are
+ * allocated as arrays and threaded into ip->fragfree4 and
+ * ip->fragfree6.  Panics if size is not positive or if either
+ * allocation fails.
+ */
+void
+initfrag(IP *ip, int size)
+{
+	int i;
+
+	/* size-1 is used as an index below; anything under 1 would write before the array */
+	if(size < 1)
+		panic("initfrag");
+
+	ip->fragfree4 = malloc(sizeof(Fragment4) * size);
+	if(ip->fragfree4 == nil)
+		panic("initfrag");
+	for(i = 0; i < size-1; i++)
+		ip->fragfree4[i].next = &ip->fragfree4[i+1];
+	ip->fragfree4[size-1].next = nil;
+
+	ip->fragfree6 = malloc(sizeof(Fragment6) * size);
+	if(ip->fragfree6 == nil)
+		panic("initfrag");
+	for(i = 0; i < size-1; i++)
+		ip->fragfree6[i].next = &ip->fragfree6[i+1];
+	ip->fragfree6[size-1].next = nil;
+}
+
+void
+ip_init(Fs *f)
+{
+	IP *ip;
+
+	ip = smalloc(sizeof(IP));
+	initfrag(ip, 100);	/* 100 reassembly queues each for v4 and v6 */
+	f->ip = ip;
+
+	ip_init_6(f);		/* sets f->v6p to the default v6 parameters */
+}
+
+/*
+ * iprouting - switch packet forwarding on or off for this stack and
+ * mirror the setting in the Forwarding counter: 1 when forwarding,
+ * 2 when not.
+ */
+void
+iprouting(Fs *f, int on)
+{
+	IP *ip;
+
+	ip = f->ip;
+	ip->iprouting = on;
+	ip->stats[Forwarding] = on != 0 ? 1 : 2;
+}
+/*
+ * ipoput4 - send the IPv4 packet in bp, splitting it into fragments
+ * when it is larger than the outgoing interface allows.
+ *
+ *	f	stack instance; counters are kept in f->ip->stats
+ *	bp	block list that starts with an Ip4hdr.  The unfragmented
+ *		path hands it to the medium's bwrite; the remaining
+ *		normal return paths end in freeblist(bp)
+ *	gating	non-zero when forwarding someone else's packet: length,
+ *		id, frag, tos and vihl are then taken from the header
+ *		already in bp instead of being filled in
+ *	ttl	stored in the header unconditionally
+ *	tos	stored in the header only when !gating
+ *	c	handed to v4lookup for the destination lookup
+ *
+ * Returns -1 when no route to the destination exists and 0 in every
+ * other case, including packets that were dropped.
+ */
+int
+ipoput4(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
+{
+	Ipifc *ifc;
+	uchar *gate;
+	ulong fragoff;
+	Block *xp, *nb;
+	Ip4hdr *eh, *feh;
+	int lid, len, seglen, chunk, dlen, blklen, offset, medialen;
+	Route *r, *sr;
+	IP *ip;
+	int rv = 0;
+
+	ip = f->ip;
+
+	/* Fill out the ip header */
+	eh = (Ip4hdr*)(bp->rp);
+
+	ip->stats[OutRequests]++;
+
+	/* Number of uchars in data and ip header to write */
+	len = blocklen(bp);
+
+	if(gating){
+		chunk = nhgets(eh->length);
+		if(chunk > len){	/* header claims more bytes than the blocks hold */
+			ip->stats[OutDiscards]++;
+			netlog(f, Logip, "short gated packet\n");
+			goto free;
+		}
+		if(chunk < len)		/* ignore bytes beyond the header's length */
+			len = chunk;
+	}
+	if(len >= IP_MAX){
+		ip->stats[OutDiscards]++;
+		netlog(f, Logip, "exceeded ip max size %V\n", eh->dst);
+		goto free;
+	}
+
+	r = v4lookup(f, eh->dst, c);
+	if(r == nil){
+		ip->stats[OutNoRoutes]++;
+		netlog(f, Logip, "no interface %V\n", eh->dst);
+		rv = -1;
+		goto free;
+	}
+
+	ifc = r->ifc;
+	if(r->type & (Rifc|Runi))	/* directly reachable: next hop is the destination */
+		gate = eh->dst;
+	else
+	if(r->type & (Rbcast|Rmulti)) {
+		gate = eh->dst;
+		sr = v4lookup(f, eh->src, nil);	/* prefer the interface that owns the source address */
+		if(sr != nil && (sr->type & Runi))
+			ifc = sr->ifc;
+	}
+	else
+		gate = r->v4.gate;
+
+	if(!gating)
+		eh->vihl = IP_VER4|IP_HLEN4;
+	eh->ttl = ttl;
+	if(!gating)
+		eh->tos = tos;
+
+	if(!canrlock(ifc))	/* lock not available right now: drop rather than wait */
+		goto free;
+	if(waserror()){		/* an error raised below must still release the lock */
+		runlock(ifc);
+		nexterror();
+	}
+	if(ifc->medium == nil)
+		goto raise;
+
+	/* If we don't need to fragment just send it */
+	medialen = ifc->maxtu - ifc->medium->hsize;
+	if(len <= medialen) {
+		if(!gating)
+			hnputs(eh->id, incref(&ip->id4));
+		hnputs(eh->length, len);
+		if(!gating){
+			eh->frag[0] = 0;
+			eh->frag[1] = 0;
+		}
+		eh->cksum[0] = 0;
+		eh->cksum[1] = 0;
+		hnputs(eh->cksum, ipcsum(&eh->vihl));
+		ifc->medium->bwrite(ifc, bp, V4, gate);	/* bp is handed over; it is not freed here */
+		runlock(ifc);
+		poperror();
+		return 0;
+	}
+	/* console notice when a locally generated packet with DF set is too big */
+if((eh->frag[0] & (IP_DF>>8)) && !gating) print("%V: DF set\n", eh->dst);
+
+	if(eh->frag[0] & (IP_DF>>8)){
+		ip->stats[FragFails]++;
+		ip->stats[OutDiscards]++;
+		icmpcantfrag(f, bp, medialen);
+		netlog(f, Logip, "%V: eh->frag[0] & (IP_DF>>8)\n", eh->dst);
+		goto raise;
+	}
+
+	seglen = (medialen - IP4HDR) & ~7;	/* data per fragment, rounded down to a multiple of 8 */
+	if(seglen < 8){
+		ip->stats[FragFails]++;
+		ip->stats[OutDiscards]++;
+		netlog(f, Logip, "%V seglen < 8\n", eh->dst);
+		goto raise;
+	}
+
+	dlen = len - IP4HDR;
+	xp = bp;
+	if(gating)
+		lid = nhgets(eh->id);
+	else
+		lid = incref(&ip->id4);
+
+	offset = IP4HDR;	/* step xp/rp past the original header to the first data byte */
+	while(xp != nil && offset && offset >= BLEN(xp)) {
+		offset -= BLEN(xp);
+		xp = xp->next;
+	}
+	xp->rp += offset;
+
+	if(gating)
+		fragoff = nhgets(eh->frag)<<3;
+	else
+		fragoff = 0;
+	dlen += fragoff;	/* dlen is now the offset at which the data ends */
+	for(; fragoff < dlen; fragoff += seglen) {
+		nb = allocb(IP4HDR+seglen);
+		feh = (Ip4hdr*)(nb->rp);
+
+		memmove(nb->wp, eh, IP4HDR);	/* every fragment starts as a copy of the original header */
+		nb->wp += IP4HDR;
+
+		if((fragoff + seglen) >= dlen) {	/* last fragment: shorten it and leave IP_MF clear */
+			seglen = dlen - fragoff;
+			hnputs(feh->frag, fragoff>>3);
+		}
+		else
+			hnputs(feh->frag, (fragoff>>3)|IP_MF);
+
+		hnputs(feh->length, seglen + IP4HDR);
+		hnputs(feh->id, lid);
+
+		/* Copy up the data area */
+		chunk = seglen;
+		while(chunk) {
+			if(!xp) {	/* ran out of source blocks before seglen bytes were copied */
+				ip->stats[OutDiscards]++;
+				ip->stats[FragFails]++;
+				freeblist(nb);
+				netlog(f, Logip, "!xp: chunk %d\n", chunk);
+				goto raise;
+			}
+			blklen = chunk;
+			if(BLEN(xp) < chunk)
+				blklen = BLEN(xp);
+			memmove(nb->wp, xp->rp, blklen);
+			nb->wp += blklen;
+			xp->rp += blklen;
+			chunk -= blklen;
+			if(xp->rp == xp->wp)
+				xp = xp->next;
+		}
+
+		feh->cksum[0] = 0;
+		feh->cksum[1] = 0;
+		hnputs(feh->cksum, ipcsum(&feh->vihl));
+		ifc->medium->bwrite(ifc, nb, V4, gate);
+		ip->stats[FragCreates]++;
+	}
+	ip->stats[FragOKs]++;
+raise:	/* reached with the interface lock held and the error label pushed */
+	runlock(ifc);
+	poperror();
+free:	/* reached with neither; the original packet is released here */
+	freeblist(bp);
+	return rv;
+}
+
+/*
+ * ipiput4 - take one packet received on interface ifc.
+ *
+ * A packet whose version nibble is not 4 is passed to ipiput6.
+ * Otherwise the header is checksummed (skipped when the block carries
+ * the Bipck flag) and its length field checked.  Packets not addressed
+ * to this host are forwarded through ipoput4 when routing is enabled
+ * and dropped when it is not.  Packets for this host have any IP
+ * options stripped, are reassembled if fragmented, and are handed to
+ * the rcv routine of the protocol named in the header.
+ *
+ * bp is consumed on every path: freed, queued for reassembly, or
+ * passed on.
+ */
+void
+ipiput4(Fs *f, Ipifc *ifc, Block *bp)
+{
+	int hl;
+	int hop, tos, proto, olen;
+	Ip4hdr *h;
+	Proto *p;
+	ushort frag;
+	int notforme;
+	uchar *dp, v6dst[IPaddrlen];
+	IP *ip;
+	Route *r;
+
+	if(BLKIPVER(bp) != IP_VER4) {
+		ipiput6(f, ifc, bp);
+		return;
+	}
+
+	ip = f->ip;
+	ip->stats[InReceives]++;
+
+	/*
+	 *  Ensure we have all the header info in the first
+	 *  block.  Make life easier for other protocols by
+	 *  collecting up to the first 64 bytes in the first block.
+	 */
+	if(BLEN(bp) < 64) {
+		hl = blocklen(bp);
+		if(hl < IP4HDR)
+			hl = IP4HDR;
+		if(hl > 64)
+			hl = 64;
+		bp = pullupblock(bp, hl);
+		if(bp == nil)
+			return;
+	}
+
+	h = (Ip4hdr*)(bp->rp);
+
+	/* dump anything whose header doesn't checksum */
+	if((bp->flag & Bipck) == 0 && ipcsum(&h->vihl)) {
+		ip->stats[InHdrErrors]++;
+		netlog(f, Logip, "ip: checksum error %V\n", h->src);
+		freeblist(bp);
+		return;
+	}
+	v4tov6(v6dst, h->dst);
+	notforme = ipforme(f, v6dst) == 0;
+
+	/* Check header length and version */
+	if((h->vihl&0x0F) != IP_HLEN4) {
+		hl = (h->vihl&0xF)<<2;
+		if(hl < (IP_HLEN4<<2)) {
+			ip->stats[InHdrErrors]++;
+			netlog(f, Logip, "ip: %V bad hivl %ux\n", h->src, h->vihl);
+			freeblist(bp);
+			return;
+		}
+		/* If this is not routed strip off the options */
+		if(notforme == 0) {
+			olen = nhgets(h->length);
+			/* slide the fixed header up against the data, over the options */
+			dp = bp->rp + (hl - (IP_HLEN4<<2));
+			memmove(dp, h, IP_HLEN4<<2);
+			bp->rp = dp;
+			h = (Ip4hdr*)(bp->rp);
+			h->vihl = (IP_VER4|IP_HLEN4);
+			hnputs(h->length, olen-hl+(IP_HLEN4<<2));
+		}
+	}
+
+	/* route */
+	if(notforme) {
+		Conv conv;
+
+		if(!ip->iprouting){
+			/* bp may be a chain; freeb would release only its first block */
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward to source's network */
+		conv.r = nil;
+		r = v4lookup(f, h->dst, &conv);
+		if(r == nil || r->ifc == ifc){
+			ip->stats[OutDiscards]++;
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward if packet has timed out */
+		hop = h->ttl;
+		if(hop < 1) {
+			ip->stats[InHdrErrors]++;
+			icmpttlexceeded(f, ifc->lifc->local, bp);
+			freeblist(bp);
+			return;
+		}
+
+		/* reassemble if the interface expects it */
+		if(r->ifc == nil)
+			panic("nil route rfc");
+		if(r->ifc->reassemble){
+			frag = nhgets(h->frag);
+			if(frag) {
+				/* tos is borrowed to tell ip4reassemble whether IP_MF was set */
+				h->tos = 0;
+				if(frag & IP_MF)
+					h->tos = 1;
+				bp = ip4reassemble(ip, frag, bp, h);
+				if(bp == nil)
+					return;
+				h = (Ip4hdr*)(bp->rp);
+			}
+		}
+
+		ip->stats[ForwDatagrams]++;
+		tos = h->tos;
+		hop = h->ttl;
+		ipoput4(f, bp, 1, hop - 1, tos, &conv);
+		return;
+	}
+
+	frag = nhgets(h->frag);
+	if(frag) {
+		/* tos is borrowed to tell ip4reassemble whether IP_MF was set */
+		h->tos = 0;
+		if(frag & IP_MF)
+			h->tos = 1;
+		bp = ip4reassemble(ip, frag, bp, h);
+		if(bp == nil)
+			return;
+		h = (Ip4hdr*)(bp->rp);
+	}
+
+	/* don't let any frag info go up the stack */
+	h->frag[0] = 0;
+	h->frag[1] = 0;
+
+	proto = h->proto;
+	p = Fsrcvpcol(f, proto);
+	if(p != nil && p->rcv != nil) {
+		ip->stats[InDelivers]++;
+		(*p->rcv)(p, ifc, bp);
+		return;
+	}
+	ip->stats[InDiscards]++;
+	ip->stats[InUnknownProtos]++;
+	freeblist(bp);
+}
+
+/*
+ * ipstats - format the IP counters as "name: value" lines into buf
+ * (at most len bytes) and return the number of bytes produced.
+ * The DefaultTTL slot is refreshed with MAXTTL before printing.
+ */
+int
+ipstats(Fs *f, char *buf, int len)
+{
+	ulong *st;
+	char *cur, *end;
+	int n;
+
+	st = f->ip->stats;
+	st[DefaultTTL] = MAXTTL;
+
+	cur = buf;
+	end = buf + len;
+	n = 0;
+	while(n < Nstats){
+		cur = seprint(cur, end, "%s: %lud\n", statnames[n], st[n]);
+		n++;
+	}
+	return cur - buf;
+}
+/*
+ * ip4reassemble - add the fragment bp to its reassembly queue and
+ * return the complete datagram once every piece is present.
+ *
+ *	offset	the header's 16-bit frag field as read by the caller
+ *		(flag bits plus offset in 8-byte units)
+ *	ih	header of bp; the callers in this file set ih->tos to 1
+ *		when IP_MF is set and to 0 otherwise before calling
+ *
+ * Returns the block itself (possibly pulled up into one block) when it
+ * is not a fragment, the reassembled block list when bp completes a
+ * datagram, and nil when bp was queued or thrown away.
+ * Takes and releases ip->fraglock4.
+ */
+Block*
+ip4reassemble(IP *ip, int offset, Block *bp, Ip4hdr *ih)
+{
+	int fend;
+	ushort id;
+	Fragment4 *f, *fnext;
+	ulong src, dst;
+	Block *bl, **l, *last, *prev;
+	int ovlap, len, fragsize, pktposn;
+
+	src = nhgetl(ih->src);
+	dst = nhgetl(ih->dst);
+	id = nhgets(ih->id);
+
+	/*
+	 *  block lists are too hard, pullupblock into a single block
+	 */
+	if(bp->next){
+		bp = pullupblock(bp, blocklen(bp));
+		ih = (Ip4hdr*)(bp->rp);
+	}
+
+	qlock(&ip->fraglock4);
+
+	/*
+	 *  find a reassembly queue for this fragment
+	 */
+	for(f = ip->flisthead4; f; f = fnext){
+		fnext = f->next;	/* because ipfragfree4 changes the list */
+		if(f->src == src && f->dst == dst && f->id == id)
+			break;
+		if(f->age < NOW){	/* deadline passed: retire the queue while walking by */
+			ip->stats[ReasmTimeout]++;
+			ipfragfree4(ip, f);
+		}
+	}
+
+	/*
+	 *  if this isn't a fragmented packet, accept it
+	 *  and get rid of any fragments that might go
+	 *  with it.
+	 */
+	if(!ih->tos && (offset & ~(IP_MF|IP_DF)) == 0) {
+		if(f != nil) {
+			ipfragfree4(ip, f);
+			ip->stats[ReasmFails]++;
+		}
+		qunlock(&ip->fraglock4);
+		return bp;
+	}
+
+	if(bp->base+sizeof(Ipfrag) >= bp->rp){	/* no room ahead of rp for the Ipfrag that BKFG keeps at base */
+		bp = padblock(bp, sizeof(Ipfrag));
+		bp->rp += sizeof(Ipfrag);
+	}
+
+	BKFG(bp)->foff = offset<<3;	/* foff is a ushort, so the flag bits fall off the top; bytes remain */
+	BKFG(bp)->flen = nhgets(ih->length)-IP4HDR;
+
+	/* First fragment allocates a reassembly queue */
+	if(f == nil) {
+		f = ipfragallo4(ip);
+		f->id = id;
+		f->src = src;
+		f->dst = dst;
+
+		f->blist = bp;
+
+		qunlock(&ip->fraglock4);
+		ip->stats[ReasmReqds]++;
+		return nil;
+	}
+
+	/*
+	 *  find the new fragment's position in the queue
+	 */
+	prev = nil;
+	l = &f->blist;
+	bl = f->blist;
+	while(bl != nil && BKFG(bp)->foff > BKFG(bl)->foff) {
+		prev = bl;
+		l = &bl->next;
+		bl = bl->next;
+	}
+
+	/* Check overlap of a previous fragment - trim away as necessary */
+	if(prev) {
+		ovlap = BKFG(prev)->foff + BKFG(prev)->flen - BKFG(bp)->foff;
+		if(ovlap > 0) {
+			if(ovlap >= BKFG(bp)->flen) {	/* nothing new in bp: discard it */
+				freeblist(bp);
+				qunlock(&ip->fraglock4);
+				return nil;
+			}
+			BKFG(prev)->flen -= ovlap;	/* shorten prev so it ends where bp begins */
+		}
+	}
+
+	/* Link onto assembly queue */
+	bp->next = *l;
+	*l = bp;
+
+	/* Check to see if succeeding segments overlap */
+	if(bp->next) {
+		l = &bp->next;
+		fend = BKFG(bp)->foff + BKFG(bp)->flen;
+		/* Take completely covered segments out */
+		while(*l) {
+			ovlap = fend - BKFG(*l)->foff;
+			if(ovlap <= 0)
+				break;
+			if(ovlap < BKFG(*l)->flen) {	/* partly covered: drop its first ovlap data bytes */
+				BKFG(*l)->flen -= ovlap;
+				BKFG(*l)->foff += ovlap;
+				/* move up ih hdrs */
+				memmove((*l)->rp + ovlap, (*l)->rp, IP4HDR);
+				(*l)->rp += ovlap;
+				break;
+			}
+			last = (*l)->next;
+			(*l)->next = nil;
+			freeblist(*l);
+			*l = last;
+		}
+	}
+
+	/*
+	 *  look for a complete packet.  if we get to a fragment
+	 *  without IP_MF set, we're done.
+	 */
+	pktposn = 0;
+	for(bl = f->blist; bl; bl = bl->next) {
+		if(BKFG(bl)->foff != pktposn)	/* hole before this fragment: not complete yet */
+			break;
+		if((BLKIP(bl)->frag[0]&(IP_MF>>8)) == 0) {
+			bl = f->blist;
+			len = nhgets(BLKIP(bl)->length);
+			bl->wp = bl->rp + len;
+
+			/* Pullup all the fragment headers and
+			 * return a complete packet
+			 */
+			for(bl = bl->next; bl; bl = bl->next) {
+				fragsize = BKFG(bl)->flen;
+				len += fragsize;
+				bl->rp += IP4HDR;	/* later fragments contribute data only */
+				bl->wp = bl->rp + fragsize;
+			}
+
+			bl = f->blist;
+			f->blist = nil;		/* detach first so ipfragfree4 does not free the blocks */
+			ipfragfree4(ip, f);
+			ih = BLKIP(bl);
+			hnputs(ih->length, len);
+			qunlock(&ip->fraglock4);
+			ip->stats[ReasmOKs]++;
+			return bl;
+		}
+		pktposn += BKFG(bl)->flen;
+	}
+	qunlock(&ip->fraglock4);
+	return nil;
+}
+
+/*
+ * ipfragfree4 - retire a reassembly queue: free any blocks it still
+ * holds, unlink it from the in-use list and push it on the free list.
+ * The caller is expected to hold fraglock4.
+ */
+void
+ipfragfree4(IP *ip, Fragment4 *frag)
+{
+	Fragment4 **pp;
+
+	if(frag->blist != nil)
+		freeblist(frag->blist);
+	frag->blist = nil;
+	frag->id = 0;
+	frag->src = 0;
+
+	/* unlink from the in-use list, if it is on it */
+	for(pp = &ip->flisthead4; *pp != nil; pp = &(*pp)->next)
+		if(*pp == frag){
+			*pp = frag->next;
+			break;
+		}
+
+	frag->next = ip->fragfree4;
+	ip->fragfree4 = frag;
+}
+
+/*
+ * ipfragallo4 - hand out a reassembly queue, already linked at the
+ * head of the in-use list and stamped with a deadline of NOW + 30000.
+ * When no spare queue exists, the queue at the tail of the in-use
+ * list is recycled.  The caller is expected to hold fraglock4.
+ */
+Fragment4 *
+ipfragallo4(IP *ip)
+{
+	Fragment4 *f, *tail;
+
+	while(ip->fragfree4 == nil){
+		/* new queues go on at the head, so the tail is the oldest one */
+		tail = ip->flisthead4;
+		while(tail->next != nil)
+			tail = tail->next;
+		ipfragfree4(ip, tail);
+	}
+
+	f = ip->fragfree4;
+	ip->fragfree4 = f->next;
+
+	f->age = NOW + 30000;
+	f->next = ip->flisthead4;
+	ip->flisthead4 = f;
+	return f;
+}
+
+/*
+ * ipcsum - one's complement checksum over an IPv4 header.
+ * addr points at the first header byte; the number of bytes summed
+ * comes from the header-length nibble found there (32-bit words).
+ * ipiput4 treats a non-zero result over a received header as an error.
+ */
+ushort
+ipcsum(uchar *addr)
+{
+	ulong acc;
+	int i, nbytes;
+
+	nbytes = (addr[0]&0xf)<<2;
+	acc = 0;
+	for(i = 0; i < nbytes; i += 2)
+		acc += addr[i]<<8 | addr[i+1];
+
+	/* fold the carries back into the low 16 bits, twice */
+	acc = (acc >> 16) + (acc & 0xffff);
+	acc = (acc >> 16) + (acc & 0xffff);
+
+	return (acc^0xffff);
+}

+ 654 - 0
sys/src/9/ip/ip.h

@@ -0,0 +1,654 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+typedef struct	Conv	Conv;
+typedef struct	Fs	Fs;
+typedef union	Hwaddr	Hwaddr;
+typedef struct	IP	IP;
+typedef struct	IPaux	IPaux;
+typedef struct	Ipself	Ipself;
+typedef struct	Ipselftab	Ipselftab;
+typedef struct	Iplink	Iplink;
+typedef struct	Iplifc	Iplifc;
+typedef struct	Ipmulti	Ipmulti;
+typedef struct	Ipifc	Ipifc;
+typedef struct	Iphash	Iphash;
+typedef struct	Ipht	Ipht;
+typedef struct	Netlog	Netlog;
+typedef struct	Medium	Medium;
+typedef struct	Proto	Proto;
+typedef struct	Arpent	Arpent;
+typedef struct	Arp Arp;
+typedef struct	Route	Route;
+
+typedef struct	Routerparams	Routerparams;
+typedef struct 	Hostparams	Hostparams;
+typedef struct 	v6router	v6router;
+typedef struct	v6params	v6params;
+
+#pragma incomplete Arp
+#pragma incomplete Ipself
+#pragma incomplete Ipselftab
+#pragma incomplete IP
+#pragma incomplete Netlog
+
+enum
+{
+	Addrlen=	64,
+	Maxproto=	20,
+	Nhash=		64,
+	Maxincall=	128,
+	Nchans=		1024,
+	MAClen=		16,		/* longest mac address */
+
+	MAXTTL=		255,
+	DFLTTOS=	0,
+
+	IPaddrlen=	16,
+	IPv4addrlen=	4,
+	IPv4off=	12,
+	IPllen=		4,
+
+	/* ip versions */
+	V4=		4,
+	V6=		6,
+	IP_VER4= 	0x40,
+	IP_VER6=	0x60,
+
+	/* 2^Lroot trees in the root table */
+	Lroot=		10,
+
+	Maxpath =	64,
+};
+
+enum
+{
+	Idle=		0,
+	Announcing=	1,
+	Announced=	2,
+	Connecting=	3,
+	Connected=	4,
+};
+
+/*
+ *  one per conversation directory
+ */
+struct Conv
+{
+	QLock;
+
+	int	x;			/* conversation index */
+	Proto*	p;
+
+	int	restricted;		/* remote port is restricted */
+	uint	ttl;			/* max time to live */
+	uint	tos;			/* type of service */
+	int	ignoreadvice;		/* don't terminate connection on icmp errors */
+
+	uchar	ipversion;
+	uchar	laddr[IPaddrlen];	/* local IP address */
+	uchar	raddr[IPaddrlen];	/* remote IP address */
+	ushort	lport;			/* local port number */
+	ushort	rport;			/* remote port number */
+
+	char	*owner;			/* protections */
+	int	perm;
+	int	inuse;			/* opens of listen/data/ctl */
+	int	length;
+	int	state;
+
+	/* udp specific */
+	int	headers;		/* data src/dst headers in udp */
+	int	reliable;		/* true if reliable udp */
+
+	Conv*	incall;			/* calls waiting to be listened for */
+	Conv*	next;
+
+	Queue*	rq;			/* queued data waiting to be read */
+	Queue*	wq;			/* queued data waiting to be written */
+	Queue*	eq;			/* returned error packets */
+	Queue*	sq;			/* snooping queue */
+	Ref	snoopers;		/* number of processes with snoop open */
+
+	QLock	car;
+	Rendez	cr;
+	char	cerr[ERRMAX];
+
+	QLock	listenq;
+	Rendez	listenr;
+
+	Ipmulti	*multi;			/* multicast bindings for this interface */
+
+	void*	ptcl;			/* protocol specific stuff */
+
+	Route	*r;			/* last route used */
+	ulong	rgen;			/* routetable generation for *r */
+};
+
+struct Medium
+{
+	char	*name;
+	int	hsize;		/* medium header size */
+	int	mintu;		/* default min mtu */
+	int	maxtu;		/* default max mtu */
+	int	maclen;		/* mac address length */
+	void	(*bind)(Ipifc*, int, char**);
+	void	(*unbind)(Ipifc*);
+	void	(*bwrite)(Ipifc *ifc, Block *b, int version, uchar *ip);
+
+	/* for arming interfaces to receive multicast */
+	void	(*addmulti)(Ipifc *ifc, uchar *a, uchar *ia);
+	void	(*remmulti)(Ipifc *ifc, uchar *a, uchar *ia);
+
+	/* process packets written to 'data' */
+	void	(*pktin)(Fs *f, Ipifc *ifc, Block *bp);
+
+	/* routes for router boards */
+	void	(*addroute)(Ipifc *ifc, int, uchar*, uchar*, uchar*, int);
+	void	(*remroute)(Ipifc *ifc, int, uchar*, uchar*);
+	void	(*flushroutes)(Ipifc *ifc);
+
+	/* for routing multicast groups */
+	void	(*joinmulti)(Ipifc *ifc, uchar *a, uchar *ia);
+	void	(*leavemulti)(Ipifc *ifc, uchar *a, uchar *ia);
+
+	/* address resolution */
+	void	(*ares)(Fs*, int, uchar*, uchar*, int, int);	/* resolve */
+	void	(*areg)(Ipifc*, uchar*);			/* register */
+
+	/* v6 address generation */
+	void	(*pref2addr)(uchar *pref, uchar *ea);
+
+	int	unbindonclose;	/* if non-zero, unbind on last close */
+};
+
+/* logical interface associated with a physical one */
+struct Iplifc
+{
+	uchar	local[IPaddrlen];
+	uchar	mask[IPaddrlen];
+	uchar	remote[IPaddrlen];
+	uchar	net[IPaddrlen];
+	uchar	tentative;	/* =1 => v6 dup disc on, =0 => confirmed unique */
+	uchar	onlink;		/* =1 => onlink, =0 offlink. */
+	uchar	autoflag;	/* v6 autonomous flag */
+	long 	validlt;	/* v6 valid lifetime */
+	long 	preflt;		/* v6 preferred lifetime */
+	long	origint;	/* time when addr was added */
+	Iplink	*link;		/* addresses linked to this lifc */
+	Iplifc	*next;
+};
+
+/* binding twixt Ipself and Iplifc */
+struct Iplink
+{
+	Ipself	*self;
+	Iplifc	*lifc;
+	Iplink	*selflink;	/* next link for this local address */
+	Iplink	*lifclink;	/* next link for this ifc */
+	ulong	expire;
+	Iplink	*next;		/* free list */
+	int	ref;
+};
+
+/* rfc 2461, pp.40—43. */
+
+/* default values, one per stack */
+struct Routerparams {
+	int	mflag;		/* flag: managed address configuration */
+	int	oflag;		/* flag: other stateful configuration */
+	int 	maxraint;	/* max. router adv interval (ms) */
+	int	minraint;	/* min. router adv interval (ms) */
+	int	linkmtu;	/* mtu options */
+	int	reachtime;	/* reachable time */
+	int	rxmitra;	/* retransmit interval */
+	int	ttl;		/* cur hop count limit */
+	int	routerlt;	/* router lifetime */
+};
+
+struct Hostparams {
+	int	rxmithost;	/* host retransmit timer; NOTE(review): presumably milliseconds, like the Routerparams intervals — confirm */
+};
+
+struct Ipifc
+{
+	RWlock;
+
+	Conv	*conv;		/* link to its conversation structure */
+	char	dev[64];	/* device we're attached to */
+	Medium	*medium;		/* medium in use; nil when none is bound */
+	int	maxtu;		/* Maximum transfer unit */
+	int	mintu;		/* Minimum transfer unit */
+	int	mbps;		/* megabits per second */
+	void	*arg;		/* medium specific */
+	int	reassemble;	/* reassemble IP packets before forwarding */
+
+	/* these are used so that we can unbind on the fly */
+	Lock	idlock;
+	uchar	ifcid;		/* incremented each 'bind/unbind/add/remove' */
+	int	ref;		/* number of procs using this ipifc */
+	Rendez	wait;		/* where unbinder waits for ref == 0 */
+	int	unbinding;
+
+	uchar	mac[MAClen];	/* MAC address */
+
+	Iplifc	*lifc;		/* logical interfaces on this physical one */
+
+	ulong	in, out;	/* message statistics */
+	ulong	inerr, outerr;	/* ... */
+
+	uchar	sendra6;	/* flag: send router advs on this ifc */
+	uchar	recvra6;	/* flag: recv router advs on this ifc */
+	Routerparams rp;	/* router parameters as in RFC 2461, pp.40—43.
+					used only if node is router */
+};
+
+/*
+ *  one per multicast-lifc pair used by a Conv
+ */
+struct Ipmulti
+{
+	uchar	ma[IPaddrlen];
+	uchar	ia[IPaddrlen];
+	Ipmulti	*next;
+};
+
+/*
+ *  hash table for 2 ip addresses + 2 ports
+ */
+enum
+{
+	Nipht=		521,	/* convenient prime */
+
+	IPmatchexact=	0,	/* match on 4 tuple */
+	IPmatchany,		/* *!* */
+	IPmatchport,		/* *!port */
+	IPmatchaddr,		/* addr!* */
+	IPmatchpa,		/* addr!port */
+};
+struct Iphash
+{
+	Iphash	*next;
+	Conv	*c;
+	int	match;
+};
+struct Ipht
+{
+	Lock;
+	Iphash	*tab[Nipht];
+};
+void iphtadd(Ipht*, Conv*);
+void iphtrem(Ipht*, Conv*);
+Conv* iphtlook(Ipht *ht, uchar *sa, ushort sp, uchar *da, ushort dp);
+
+/*
+ *  one per multiplexed protocol
+ */
+struct Proto
+{
+	QLock;
+	char*		name;		/* protocol name */
+	int		x;		/* protocol index */
+	int		ipproto;	/* ip protocol type */
+
+	char*		(*connect)(Conv*, char**, int);
+	char*		(*announce)(Conv*, char**, int);
+	char*		(*bind)(Conv*, char**, int);
+	int		(*state)(Conv*, char*, int);
+	void		(*create)(Conv*);
+	void		(*close)(Conv*);
+	void		(*rcv)(Proto*, Ipifc*, Block*);
+	char*		(*ctl)(Conv*, char**, int);
+	void		(*advise)(Proto*, Block*, char*);
+	int		(*stats)(Proto*, char*, int);
+	int		(*local)(Conv*, char*, int);
+	int		(*remote)(Conv*, char*, int);
+	int		(*inuse)(Conv*);
+	int		(*gc)(Proto*);	/* returns true if any conversations are freed */
+
+	Fs		*f;		/* file system this proto is part of */
+	Conv		**conv;		/* array of conversations */
+	int		ptclsize;	/* size of per protocol ctl block */
+	int		nc;		/* number of conversations */
+	int		ac;
+	Qid		qid;		/* qid for protocol directory */
+	ushort		nextport;
+	ushort		nextrport;
+
+	void		*priv;
+};
+
+
+/*
+ *  one per IP protocol stack
+ */
+struct Fs
+{
+	RWlock;
+	int	dev;
+
+	int	np;
+	Proto*	p[Maxproto+1];		/* list of supported protocols */
+	Proto*	t2p[256];		/* vector of all protocols */
+	Proto*	ipifc;			/* kludge for ipifcremroute & ipifcaddroute */
+	Proto*	ipmux;			/* kludge for finding an ip multiplexor */
+
+	IP	*ip;
+	Ipselftab	*self;
+	Arp	*arp;
+	v6params	*v6p;
+
+	Route	*v4root[1<<Lroot];	/* v4 routing forest */
+	Route	*v6root[1<<Lroot];	/* v6 routing forest */
+	Route	*queue;			/* used as temp when reinjecting routes */
+
+	Netlog	*alog;
+
+	char	ndb[1024];		/* an ndb entry for this interface */
+	int	ndbvers;
+	long	ndbmtime;
+};
+
+/* one per default router known to host */
+struct v6router {
+	uchar	inuse;
+	Ipifc	*ifc;
+	int	ifcid;
+	uchar	routeraddr[IPaddrlen];
+	long	ltorigin;
+	Routerparams	rp;
+};
+
+struct v6params
+{
+	Routerparams	rp;		/* v6 params, one copy per node now */
+	Hostparams	hp;
+	v6router	v6rlist[3];	/* max 3 default routers, currently */
+	int		cdrouter;	/* uses only v6rlist[cdrouter] if   */
+					/* cdrouter >= 0. */
+};
+
+
+int	Fsconnected(Conv*, char*);
+Conv*	Fsnewcall(Conv*, uchar*, ushort, uchar*, ushort, uchar);
+int	Fspcolstats(char*, int);
+int	Fsproto(Fs*, Proto*);
+int	Fsbuiltinproto(Fs*, uchar);
+Conv*	Fsprotoclone(Proto*, char*);
+Proto*	Fsrcvpcol(Fs*, uchar);
+Proto*	Fsrcvpcolx(Fs*, uchar);
+char*	Fsstdconnect(Conv*, char**, int);
+char*	Fsstdannounce(Conv*, char**, int);
+char*	Fsstdbind(Conv*, char**, int);
+ulong	scalednconv(void);
+void	closeconv(Conv*);
+/*
+ *  logging
+ */
+enum
+{
+	Logip=		1<<1,
+	Logtcp=		1<<2,
+	Logfs=		1<<3,
+	Logil=		1<<4,
+	Logicmp=	1<<5,
+	Logudp=		1<<6,
+	Logcompress=	1<<7,
+	Logilmsg=	1<<8,
+	Loggre=		1<<9,
+	Logppp=		1<<10,
+	Logtcprxmt=	1<<11,
+	Logigmp=	1<<12,
+	Logudpmsg=	1<<13,
+	Logipmsg=	1<<14,
+	Logrudp=	1<<15,
+	Logrudpmsg=	1<<16,
+	Logesp=		1<<17,
+	Logtcpwin=	1<<18,
+};
+
+void	netloginit(Fs*);
+void	netlogopen(Fs*);
+void	netlogclose(Fs*);
+void	netlogctl(Fs*, char*, int);
+long	netlogread(Fs*, void*, ulong, long);
+void	netlog(Fs*, int, char*, ...);
+void	ifcloginit(Fs*);
+long	ifclogread(Fs*, Chan *,void*, ulong, long);
+void	ifclog(Fs*, uchar *, int);
+void	ifclogopen(Fs*, Chan*);
+void	ifclogclose(Fs*, Chan*);
+
+/*
+ *  iproute.c
+ */
+typedef	struct RouteTree RouteTree;
+typedef struct Routewalk Routewalk;
+typedef struct V4route V4route;
+typedef struct V6route V6route;
+
+enum
+{
+
+	/* type bits */
+	Rv4=		(1<<0),		/* this is a version 4 route */
+	Rifc=		(1<<1),		/* this route is a directly connected interface */
+	Rptpt=		(1<<2),		/* this route is a pt to pt interface */
+	Runi=		(1<<3),		/* a unicast self address */
+	Rbcast=		(1<<4),		/* a broadcast self address */
+	Rmulti=		(1<<5),		/* a multicast self address */
+	Rproxy=		(1<<6),		/* this route should be proxied */
+};
+
+struct Routewalk
+{
+	int	o;
+	int	h;
+	char*	p;
+	char*	e;
+	void*	state;
+	void	(*walk)(Route*, Routewalk*);
+};
+
+struct	RouteTree
+{
+	Route*	right;
+	Route*	left;
+	Route*	mid;
+	uchar	depth;
+	uchar	type;
+	uchar	ifcid;		/* must match ifc->id */
+	Ipifc	*ifc;
+	char	tag[4];
+	int	ref;
+};
+
+struct V4route
+{
+	ulong	address;
+	ulong	endaddress;
+	uchar	gate[IPv4addrlen];
+};
+
+struct V6route
+{
+	ulong	address[IPllen];
+	ulong	endaddress[IPllen];
+	uchar	gate[IPaddrlen];
+};
+
+struct Route
+{
+	RouteTree;
+
+	union {
+		V6route	v6;
+		V4route v4;
+	};
+};
+extern void	v4addroute(Fs *f, char *tag, uchar *a, uchar *mask, uchar *gate, int type);
+extern void	v6addroute(Fs *f, char *tag, uchar *a, uchar *mask, uchar *gate, int type);
+extern void	v4delroute(Fs *f, uchar *a, uchar *mask, int dolock);
+extern void	v6delroute(Fs *f, uchar *a, uchar *mask, int dolock);
+extern Route*	v4lookup(Fs *f, uchar *a, Conv *c);
+extern Route*	v6lookup(Fs *f, uchar *a, Conv *c);
+extern long	routeread(Fs *f, char*, ulong, int);
+extern long	routewrite(Fs *f, Chan*, char*, int);
+extern void	routetype(int, char*);
+extern void	ipwalkroutes(Fs*, Routewalk*);
+extern void	convroute(Route*, uchar*, uchar*, uchar*, char*, int*);
+
+/*
+ *  devip.c
+ */
+
+/*
+ *  Hanging off every ip channel's ->aux is the following structure.
+ *  It maintains the state used by devip and iproute.
+ */
+struct IPaux
+{
+	char	*owner;		/* the user that did the attach */
+	char	tag[4];
+};
+
+extern IPaux*	newipaux(char*, char*);
+
+/*
+ *  arp.c
+ */
+struct Arpent
+{
+	uchar	ip[IPaddrlen];
+	uchar	mac[MAClen];
+	Medium	*type;			/* media type */
+	Arpent*	hash;
+	Block*	hold;
+	Block*	last;
+	uint	ctime;			/* time entry was created or refreshed */
+	uint	utime;			/* time entry was last used */
+	uchar	state;
+	Arpent	*nextrxt;		/* re-transmit chain */
+	uint	rtime;			/* time for next retransmission */
+	uchar	rxtsrem;
+	Ipifc	*ifc;
+	uchar	ifcid;			/* must match ifc->id */
+};
+
+extern void	arpinit(Fs*);
+extern int	arpread(Arp*, char*, ulong, int);
+extern int	arpwrite(Fs*, char*, int);
+extern Arpent*	arpget(Arp*, Block *bp, int version, Ipifc *ifc, uchar *ip, uchar *h);
+extern void	arprelease(Arp*, Arpent *a);
+extern Block*	arpresolve(Arp*, Arpent *a, Medium *type, uchar *mac);
+extern void	arpenter(Fs*, int version, uchar *ip, uchar *mac, int len, int norefresh);
+
+/*
+ * ipaux.c
+ */
+
+extern int	myetheraddr(uchar*, char*);
+extern vlong	parseip(uchar*, char*);
+extern vlong	parseipmask(uchar*, char*);
+extern char*	v4parseip(uchar*, char*);
+extern void	maskip(uchar *from, uchar *mask, uchar *to);
+extern int	parsemac(uchar *to, char *from, int len);
+extern uchar*	defmask(uchar*);
+extern int	isv4(uchar*);
+extern void	v4tov6(uchar *v6, uchar *v4);
+extern int	v6tov4(uchar *v4, uchar *v6);
+extern int	eipfmt(Fmt*);
+
+#define	ipmove(x, y) memmove(x, y, IPaddrlen)
+#define	ipcmp(x, y) ( (x)[IPaddrlen-1] != (y)[IPaddrlen-1] || memcmp(x, y, IPaddrlen) )
+
+extern uchar IPv4bcast[IPaddrlen];
+extern uchar IPv4bcastobs[IPaddrlen];
+extern uchar IPv4allsys[IPaddrlen];
+extern uchar IPv4allrouter[IPaddrlen];
+extern uchar IPnoaddr[IPaddrlen];
+extern uchar v4prefix[IPaddrlen];
+extern uchar IPallbits[IPaddrlen];
+
+#define	NOW	TK2MS(sys->machptr[0]->ticks)
+
+/*
+ *  media
+ */
+extern Medium	ethermedium;
+extern Medium	nullmedium;
+extern Medium	pktmedium;
+
+/*
+ *  ipifc.c
+ */
+extern Medium*	ipfindmedium(char *name);
+extern void	addipmedium(Medium *med);
+extern int	ipforme(Fs*, uchar *addr);
+extern int	iptentative(Fs*, uchar *addr);
+extern int	ipisbm(uchar *);
+extern int	ipismulticast(uchar *);
+extern Ipifc*	findipifc(Fs*, uchar *remote, int type);
+extern void	findlocalip(Fs*, uchar *local, uchar *remote);
+extern int	ipv4local(Ipifc *ifc, uchar *addr);
+extern int	ipv6local(Ipifc *ifc, uchar *addr);
+extern int	ipv6anylocal(Ipifc *ifc, uchar *addr);
+extern Iplifc*	iplocalonifc(Ipifc *ifc, uchar *ip);
+extern int	ipproxyifc(Fs *f, Ipifc *ifc, uchar *ip);
+extern int	ipismulticast(uchar *ip);
+extern int	ipisbooting(void);
+extern int	ipifccheckin(Ipifc *ifc, Medium *med);
+extern void	ipifccheckout(Ipifc *ifc);
+extern int	ipifcgrab(Ipifc *ifc);
+extern void	ipifcaddroute(Fs*, int, uchar*, uchar*, uchar*, int);
+extern void	ipifcremroute(Fs*, int, uchar*, uchar*);
+extern void	ipifcremmulti(Conv *c, uchar *ma, uchar *ia);
+extern void	ipifcaddmulti(Conv *c, uchar *ma, uchar *ia);
+extern char*	ipifcrem(Ipifc *ifc, char **argv, int argc);
+extern char*	ipifcadd(Ipifc *ifc, char **argv, int argc, int tentative, Iplifc *lifcp);
+extern long	ipselftabread(Fs*, char *a, ulong offset, int n);
+extern char*	ipifcadd6(Ipifc *ifc, char**argv, int argc);
+/*
+ *  ip.c
+ */
+extern void	iprouting(Fs*, int);
+extern void	icmpnoconv(Fs*, Block*);
+extern void	icmpcantfrag(Fs*, Block*, int);
+extern void	icmpttlexceeded(Fs*, uchar*, Block*);
+extern ushort	ipcsum(uchar*);
+extern void	ipiput4(Fs*, Ipifc*, Block*);
+extern void	ipiput6(Fs*, Ipifc*, Block*);
+extern int	ipoput4(Fs*, Block*, int, int, int, Conv*);
+extern int	ipoput6(Fs*, Block*, int, int, int, Conv*);
+extern int	ipstats(Fs*, char*, int);
+extern ushort	ptclbsum(uchar*, int);
+extern ushort	ptclcsum(Block*, int, int);
+extern void	ip_init(Fs*);
+/*
+ * bootp.c
+ */
+extern char*	bootp(Ipifc*);
+extern int	bootpread(char*, ulong, int);
+
+/*
+ *  resolving inferno/plan9 differences
+ */
+Chan*		commonfdtochan(int, int, int, int);
+char*		commonuser(void);
+char*		commonerror(void);
+
+/*
+ * chandial.c
+ */
+extern Chan*	chandial(char*, char*, char*, Chan**);
+
+/*
+ *  global to all of the stack
+ */
+extern void	(*igmpreportfn)(Ipifc*, uchar*);

+ 377 - 0
sys/src/9/ip/ipaux.c

@@ -0,0 +1,377 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"ip.h"
+#include	"ipv6.h"
+
+/*
+ *  printable names for header types, indexed by the header-type
+ *  constants (HBH, ICMP, ...; presumably declared in ipv6.h).
+ *  Slots without an initializer are left nil.
+ */
+char *v6hdrtypes[Maxhdrtype] =
+{
+	[HBH]		"HopbyHop",
+	[ICMP]		"ICMP",
+	[IGMP]		"IGMP",
+	[GGP]		"GGP",
+	[IPINIP]	"IP",
+	[ST]		"ST",
+	[TCP]		"TCP",
+	[UDP]		"UDP",
+	[ISO_TP4]	"ISO_TP4",
+	[RH]		"Routinghdr",
+	[FH]		"Fraghdr",
+	[IDRP]		"IDRP",
+	[RSVP]		"RSVP",
+	[AH]		"Authhdr",
+	[ESP]		"ESP",
+	[ICMPv6]	"ICMPv6",
+	[NNH]		"Nonexthdr",
+	[ISO_IP]	"ISO_IP",
+	[IGRP]		"IGRP",
+	[OSPF]		"OSPF",
+};
+
+/*
+ *  well known IPv6 addresses
+ *
+ *  each address is followed by its mask where one is defined;
+ *  the *preflen variables give the number of leading 0xff bytes
+ *  in the corresponding mask.
+ */
+/* :: */
+uchar v6Unspecified[IPaddrlen] = {
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+/* ::1 */
+uchar v6loopback[IPaddrlen] = {
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0x01
+};
+
+/* fe80:: */
+uchar v6linklocal[IPaddrlen] = {
+	0xfe, 0x80, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+/* first 8 bytes set, i.e. /64 */
+uchar v6linklocalmask[IPaddrlen] = {
+	0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+int v6llpreflen = 8;	/* link-local prefix length in bytes */
+
+/* ff00:: */
+uchar v6multicast[IPaddrlen] = {
+	0xff, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+/* first byte set, i.e. /8 */
+uchar v6multicastmask[IPaddrlen] = {
+	0xff, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+int v6mcpreflen = 1;	/* multicast prefix length */
+
+/* ff01::1 */
+uchar v6allnodesN[IPaddrlen] = {
+	0xff, 0x01, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0x01
+};
+/* ff01::2 */
+uchar v6allroutersN[IPaddrlen] = {
+	0xff, 0x01, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0x02
+};
+/* first 2 bytes set, i.e. /16 */
+uchar v6allnodesNmask[IPaddrlen] = {
+	0xff, 0xff, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+int v6aNpreflen = 2;	/* all nodes (N) prefix */
+
+/* ff02::1 */
+uchar v6allnodesL[IPaddrlen] = {
+	0xff, 0x02, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0x01
+};
+/* ff02::2 */
+uchar v6allroutersL[IPaddrlen] = {
+	0xff, 0x02, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0x02
+};
+/* first 2 bytes set, i.e. /16 */
+uchar v6allnodesLmask[IPaddrlen] = {
+	0xff, 0xff, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+int v6aLpreflen = 2;	/* all nodes (L) prefix */
+
+/* ff02::1:ff00:0, template completed by ipv62smcast() */
+uchar v6solicitednode[IPaddrlen] = {
+	0xff, 0x02, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0, 0x01,
+	0xff, 0, 0, 0
+};
+/* first 13 bytes set, i.e. /104 */
+uchar v6solicitednodemask[IPaddrlen] = {
+	0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff,
+	0xff, 0x0, 0x0, 0x0
+};
+int v6snpreflen = 13;	/* solicited-node prefix length in bytes */
+
+/*
+ *  checksum up to len bytes of the block chain bp, starting
+ *  offset bytes into its data.  Each block is summed with
+ *  ptclbsum(); the result is the complement of the folded
+ *  16 bit sum.  Returns 0 if the chain ends before offset.
+ */
+ushort
+ptclcsum(Block *bp, int offset, int len)
+{
+	uchar *p;
+	ulong even, swapped;
+	ushort part;
+	int misaligned, avail, take;
+
+	/* skip whole blocks that lie before offset */
+	for(; bp != nil; bp = bp->next){
+		if(offset == 0 || offset < BLEN(bp))
+			break;
+		offset -= BLEN(bp);
+	}
+	if(bp == nil)
+		return 0;
+
+	p = bp->rp + offset;
+	avail = BLEN(bp) - offset;
+
+	/* single remaining block: clamp len to what is there */
+	if(bp->next == nil)
+		return ~ptclbsum(p, len < avail ? len : avail) & 0xffff;
+
+	/*
+	 *  sums of pieces that start on an odd byte of the overall
+	 *  stream are collected separately and byte swapped below
+	 */
+	even = 0;
+	swapped = 0;
+	misaligned = 0;
+	for(; len != 0; p = bp->rp, avail = BLEN(bp)){
+		take = len < avail ? len : avail;
+
+		part = ptclbsum(p, take);
+		if(misaligned)
+			swapped += part;
+		else
+			even += part;
+		misaligned = (misaligned+take) & 1;
+		len -= take;
+
+		bp = bp->next;
+		if(bp == nil)
+			break;
+	}
+
+	even += swapped>>8;
+	even += (swapped&0xff)<<8;
+	/* fold carries back into the low 16 bits */
+	for(part = even>>16; part != 0; part = even>>16)
+		even = part + (even & 0xffff);
+
+	return ~even & 0xffff;
+}
+
+/* NOTE(review): neither Isprefix nor CLASS is used in the visible part of this file */
+enum
+{
+	Isprefix= 16,
+};
+
+/* top two bits of the first byte at p */
+#define CLASS(p) ((*(uchar*)(p))>>6)
+
+/*
+ *  build in smcast the solicited-node multicast address for a:
+ *  the v6solicitednode template with the last three bytes of a.
+ */
+void
+ipv62smcast(uchar *smcast, uchar *a)
+{
+	int i;
+
+	assert(IPaddrlen == 16);
+	memmove(smcast, v6solicitednode, IPaddrlen);
+	for(i = 13; i < 16; i++)
+		smcast[i] = a[i];
+}
+
+
+/*
+ *  parse a hex mac address: pairs of hex digits, each pair
+ *  optionally followed by ':'.  to is zeroed first; parsing
+ *  stops after len bytes or when fewer than two characters
+ *  remain.  Returns the number of bytes stored.
+ */
+int
+parsemac(uchar *to, char *from, int len)
+{
+	char digits[4];
+	char *s;
+	int n;
+
+	s = from;
+	memset(to, 0, len);
+	n = 0;
+	while(n < len && s[0] != '\0' && s[1] != '\0'){
+		digits[0] = *s++;
+		digits[1] = *s++;
+		digits[2] = '\0';
+
+		to[n++] = strtoul(digits, 0, 16);
+		if(*s == ':')
+			s++;
+	}
+	return n;
+}
+
+/*
+ *  hashing tcp, udp, ... connections
+ *
+ *  mixes the last byte of each address with the two ports and
+ *  reduces the result modulo Nhash to pick a bucket.
+ *  The operands are widened to ulong before shifting: uchar and
+ *  ushort promote to int, so sa[..]<<24 or sp<<16 with the top
+ *  bit set would shift into the sign bit, and % on the resulting
+ *  negative value gives a negative remainder, i.e. an out of
+ *  range index once converted to ulong.
+ */
+ulong
+iphash(uchar *sa, ushort sp, uchar *da, ushort dp)
+{
+	ulong h;
+
+	h = (ulong)sa[IPaddrlen-1]<<24;
+	h ^= (ulong)sp<<16;
+	h ^= (ulong)da[IPaddrlen-1]<<8;
+	h ^= dp;
+	return h % Nhash;
+}
+
+/*
+ *  enter conversation c into hash table ht.  The entry records
+ *  which kind of lookup it can satisfy, derived from whether
+ *  raddr, laddr and lport are set.
+ */
+void
+iphtadd(Ipht *ht, Conv *c)
+{
+	ulong bucket;
+	Iphash *ent;
+
+	bucket = iphash(c->raddr, c->rport, c->laddr, c->lport);
+	ent = smalloc(sizeof(*ent));
+	ent->c = c;
+
+	if(ipcmp(c->raddr, IPnoaddr) != 0)
+		ent->match = IPmatchexact;
+	else if(ipcmp(c->laddr, IPnoaddr) != 0)
+		ent->match = c->lport == 0 ? IPmatchaddr : IPmatchpa;
+	else
+		ent->match = c->lport == 0 ? IPmatchany : IPmatchport;
+
+	/* push on the front of the bucket's chain */
+	lock(ht);
+	ent->next = ht->tab[bucket];
+	ht->tab[bucket] = ent;
+	unlock(ht);
+}
+
+/*
+ *  remove the first entry for conversation c from its bucket
+ *  in ht and free it; does nothing if c is not found there.
+ */
+void
+iphtrem(Ipht *ht, Conv *c)
+{
+	ulong bucket;
+	Iphash **pp, *victim;
+
+	bucket = iphash(c->raddr, c->rport, c->laddr, c->lport);
+	lock(ht);
+	pp = &ht->tab[bucket];
+	while((victim = *pp) != nil){
+		if(victim->c == c){
+			*pp = victim->next;
+			free(victim);
+			break;
+		}
+		pp = &victim->next;
+	}
+	unlock(ht);
+}
+
+/*
+ *  find the conversation for a packet from sa!sp to da!dp.
+ *  The buckets are probed in this order, first hit wins:
+ *	connected && raddr,rport,laddr,lport
+ *	announced && laddr,lport
+ *	announced && *,lport
+ *	announced && laddr,*
+ *	announced && *,*
+ *  returns nil if nothing matches.
+ */
+Conv*
+iphtlook(Ipht *ht, uchar *sa, ushort sp, uchar *da, ushort dp)
+{
+	Iphash *e;
+	Conv *c;
+
+	lock(ht);
+
+	/* exact 4 pair match (connection) */
+	for(e = ht->tab[iphash(sa, sp, da, dp)]; e != nil; e = e->next){
+		if(e->match != IPmatchexact)
+			continue;
+		c = e->c;
+		if(sp == c->rport && dp == c->lport
+		&& ipcmp(sa, c->raddr) == 0 && ipcmp(da, c->laddr) == 0)
+			goto found;
+	}
+
+	/* match local address and port */
+	for(e = ht->tab[iphash(IPnoaddr, 0, da, dp)]; e != nil; e = e->next){
+		if(e->match != IPmatchpa)
+			continue;
+		c = e->c;
+		if(dp == c->lport && ipcmp(da, c->laddr) == 0)
+			goto found;
+	}
+
+	/* match just port */
+	for(e = ht->tab[iphash(IPnoaddr, 0, IPnoaddr, dp)]; e != nil; e = e->next){
+		if(e->match != IPmatchport)
+			continue;
+		c = e->c;
+		if(dp == c->lport)
+			goto found;
+	}
+
+	/* match local address */
+	for(e = ht->tab[iphash(IPnoaddr, 0, da, 0)]; e != nil; e = e->next){
+		if(e->match != IPmatchaddr)
+			continue;
+		c = e->c;
+		if(ipcmp(da, c->laddr) == 0)
+			goto found;
+	}
+
+	/* look for something that matches anything */
+	for(e = ht->tab[iphash(IPnoaddr, 0, IPnoaddr, 0)]; e != nil; e = e->next){
+		if(e->match != IPmatchany)
+			continue;
+		c = e->c;
+		goto found;
+	}
+
+	c = nil;
+found:
+	unlock(ht);
+	return c;
+}

+ 1663 - 0
sys/src/9/ip/ipifc.c

@@ -0,0 +1,1663 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+#include "ipv6.h"
+
+/* debugging print, compiled out by the constant 0 */
+#define DPRINT if(0)print
+
+enum {
+	Maxmedia	= 32,		/* size of media[]; also ipifc->nc in ipifcinit */
+	Nself		= Maxmedia*5,
+	NHASH		= 1<<6,		/* number of chains in Ipselftab.hash */
+	NCACHE		= 256,
+	QMAX		= 64*1024-1,	/* queue limit passed to qopen in ipifccreate */
+};
+
+/*
+ *  registered media, filled from the front by addipmedium();
+ *  the last slot is never assigned, so the table stays nil
+ *  terminated for ipfindmedium().
+ */
+Medium *media[Maxmedia] = { 0 };
+
+/*
+ *  cache of local addresses (addresses we answer to)
+ *
+ *  NOTE(review): the code in this file chains hash entries
+ *  through `next' (see addselfcache/remselfcache) and reuses the
+ *  same field for the deferred-free list in ipselffree; `hnext'
+ *  is not referenced in the visible part of the file.
+ */
+struct Ipself
+{
+	uchar	a[IPaddrlen];
+	Ipself	*hnext;		/* next address in the hash table */
+	Iplink	*link;		/* binding twixt Ipself and Ipifc */
+	ulong	expire;		/* time after which a freed entry may be released */
+	uchar	type;		/* type of address */
+	int	ref;
+	Ipself	*next;		/* free list */
+};
+
+/*
+ *  table of all Ipself entries, hashed with hashipa();
+ *  writers hold the embedded QLock.
+ */
+struct Ipselftab
+{
+	QLock;
+	int	inited;
+	int	acceptall;	/* true if an interface has the null address */
+	Ipself	*hash[NHASH];	/* hash chains */
+};
+
+/*
+ *  Multicast addresses are chained onto a Chan so that
+ *  we can remove them when the Chan is closed.
+ */
+typedef struct Ipmcast Ipmcast;
+struct Ipmcast
+{
+	Ipmcast	*next;
+	uchar	ma[IPaddrlen];	/* multicast address */
+	uchar	ia[IPaddrlen];	/* interface address */
+};
+
+/* quick hash for ip addresses: last two bytes of a, modulo NHASH */
+#define hashipa(a) ( ( ((a)[IPaddrlen-2]<<8) | (a)[IPaddrlen-1] )%NHASH )
+
+/* tag passed to v4addroute/v6addroute for routes created by this file */
+static char tifc[] = "ifc ";
+
+/* forward declarations of helpers defined later in this file */
+static void	addselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a, int type);
+static void	remselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a);
+static char*	ipifcjoinmulti(Ipifc *ifc, char **argv, int argc);
+static char*	ipifcleavemulti(Ipifc *ifc, char **argv, int argc);
+static void	ipifcregisterproxy(Fs*, Ipifc*, uchar*);
+static char*	ipifcremlifc(Ipifc*, Iplifc*);
+
+/*
+ *  link in a new medium: store med in the first free slot of
+ *  media[].  The final slot is never used so the table keeps a
+ *  nil terminator; if no slot is free med is silently dropped.
+ */
+void
+addipmedium(Medium *med)
+{
+	Medium **slot, **last;
+
+	last = &media[nelem(media)-1];
+	for(slot = media; slot < last; slot++){
+		if(*slot != nil)
+			continue;
+		*slot = med;
+		return;
+	}
+}
+
+/*
+ *  find the medium with this name; returns nil if the
+ *  nil terminated media[] table has no such entry.
+ */
+Medium*
+ipfindmedium(char *name)
+{
+	int i;
+
+	for(i = 0; media[i] != nil; i++)
+		if(strcmp(media[i]->name, name) == 0)
+			return media[i];
+	return nil;
+}
+
+/*
+ *  attach a device (or pkt driver) to the interface.
+ *  called with c locked
+ *
+ *  argv[1] names the medium; argv[2], if present, becomes the
+ *  device name, otherwise one is made from the medium name and
+ *  c->x.  Returns nil on success or an error string; an error
+ *  raised by the medium's bind routine is passed on with
+ *  nexterror() after the interface lock is dropped.
+ */
+static char*
+ipifcbind(Conv *c, char **argv, int argc)
+{
+	Ipifc *ifc;
+	Medium *medium;
+
+	if(argc < 2)
+		return Ebadarg;
+
+	ifc = (Ipifc*)c->ptcl;
+
+	/* bind the device to the interface */
+	medium = ipfindmedium(argv[1]);
+	if(medium == nil)
+		return "unknown interface type";
+
+	wlock(ifc);
+	if(ifc->medium != nil){
+		wunlock(ifc);
+		return "interface already bound";
+	}
+	if(waserror()){
+		wunlock(ifc);
+		nexterror();
+	}
+
+	/* do medium specific binding */
+	(*medium->bind)(ifc, argc, argv);
+
+	/* set the bound device name */
+	if(argc > 2)
+		strncpy(ifc->dev, argv[2], sizeof(ifc->dev));
+	else
+		snprint(ifc->dev, sizeof ifc->dev, "%s%d", medium->name, c->x);
+	/* strncpy does not terminate on truncation */
+	ifc->dev[sizeof(ifc->dev)-1] = 0;
+
+	/* set up parameters */
+	ifc->medium = medium;
+	ifc->mintu = ifc->medium->mintu;
+	ifc->maxtu = ifc->medium->maxtu;
+	/* balanced by the inuse-- in ipifcunbind */
+	if(ifc->medium->unbindonclose == 0)
+		ifc->conv->inuse++;
+	ifc->rp.mflag = 0;		/* default not managed */
+	ifc->rp.oflag = 0;
+	ifc->rp.maxraint = 600000;	/* millisecs */
+	ifc->rp.minraint = 200000;
+	ifc->rp.linkmtu = 0;		/* no mtu sent */
+	ifc->rp.reachtime = 0;
+	ifc->rp.rxmitra = 0;
+	ifc->rp.ttl = MAXTTL;
+	ifc->rp.routerlt = 3 * ifc->rp.maxraint;
+
+	/* any ancillary structures (like routes) no longer pertain */
+	ifc->ifcid++;
+
+	/*
+	 * reopen all the queues closed by a previous unbind
+	 * NOTE(review): ipifcunbind closes rq, wq and sq, but rq, eq
+	 * and sq are reopened here; confirm wq is meant to stay closed.
+	 */
+	qreopen(c->rq);
+	qreopen(c->eq);
+	qreopen(c->sq);
+
+	wunlock(ifc);
+	poperror();
+
+	return nil;
+}
+
+/*
+ *  detach a device from an interface, close the interface
+ *  called with ifc->conv closed
+ *
+ *  removes every logical interface, calls the medium's unbind
+ *  routine if it has one, closes the conversation's queues and
+ *  clears ifc->medium.  Always returns nil; failures are raised
+ *  with error().
+ */
+static char*
+ipifcunbind(Ipifc *ifc)
+{
+	char *err;
+
+	/* error label is set up before the lock is taken */
+	if(waserror()){
+		wunlock(ifc);
+		nexterror();
+	}
+	wlock(ifc);
+
+	/* dissociate routes */
+	if(ifc->medium != nil && ifc->medium->unbindonclose == 0)
+		ifc->conv->inuse--;
+	ifc->ifcid++;
+
+	/* disassociate logical interfaces (before zeroing ifc->arg) */
+	while(ifc->lifc){
+		err = ipifcremlifc(ifc, ifc->lifc);
+		/*
+		 * note: err non-zero means lifc not found,
+		 * which can't happen in this case.
+		 */
+		if(err)
+			error(err);
+	}
+
+	/* disassociate device */
+	if(ifc->medium && ifc->medium->unbind)
+		(*ifc->medium->unbind)(ifc);
+	memset(ifc->dev, 0, sizeof(ifc->dev));
+	ifc->arg = nil;
+	ifc->reassemble = 0;
+
+	/* close queues to stop queuing of packets */
+	qclose(ifc->conv->rq);
+	qclose(ifc->conv->wq);
+	qclose(ifc->conv->sq);
+
+	/* ipifcinuse() reports the interface free from here on */
+	ifc->medium = nil;
+	wunlock(ifc);
+	poperror();
+	return nil;
+}
+
+/* first line of ipifcstate output: per-interface parameters and counters */
+char sfixedformat[] = "device %s maxtu %d sendra %d recvra %d mflag %d oflag"
+" %d maxraint %d minraint %d linkmtu %d reachtime %d rxmitra %d ttl %d routerlt"
+" %d pktin %lud pktout %lud errin %lud errout %lud\n";
+
+/* one line per logical interface: local, mask, remote, validlt, preflt */
+char slineformat[] = "	%-40I %-10M %-40I %-12lud %-12lud\n";
+
+/*
+ *  format the interface's status into state (at most n bytes):
+ *  the fixed parameter line, then one line per logical
+ *  interface, or an empty line if there is none.
+ *  Returns the sum of the snprint results.
+ */
+static int
+ipifcstate(Conv *c, char *state, int n)
+{
+	Ipifc *ifc;
+	Iplifc *lp;
+	int len;
+
+	ifc = (Ipifc*)c->ptcl;
+	len = snprint(state, n, sfixedformat,
+		ifc->dev, ifc->maxtu, ifc->sendra6, ifc->recvra6,
+		ifc->rp.mflag, ifc->rp.oflag, ifc->rp.maxraint,
+		ifc->rp.minraint, ifc->rp.linkmtu, ifc->rp.reachtime,
+		ifc->rp.rxmitra, ifc->rp.ttl, ifc->rp.routerlt,
+		ifc->in, ifc->out, ifc->inerr, ifc->outerr);
+
+	rlock(ifc);
+	lp = ifc->lifc;
+	while(lp != nil && len < n){
+		len += snprint(state+len, n - len, slineformat, lp->local,
+			lp->mask, lp->remote, lp->validlt, lp->preflt);
+		lp = lp->next;
+	}
+	if(ifc->lifc == nil)
+		len += snprint(state+len, n - len, "\n");
+	runlock(ifc);
+	return len;
+}
+
+/*
+ *  format, one line per logical interface, its local address
+ *  followed by every self-cache address linked to it.
+ *  Returns the sum of the snprint results.
+ */
+static int
+ipifclocal(Conv *c, char *state, int n)
+{
+	Ipifc *ifc;
+	Iplifc *lp;
+	Iplink *lk;
+	int len;
+
+	len = 0;
+	ifc = (Ipifc*)c->ptcl;
+
+	rlock(ifc);
+	lp = ifc->lifc;
+	while(lp != nil){
+		len += snprint(state+len, n - len, "%-40.40I ->", lp->local);
+		lk = lp->link;
+		while(lk != nil){
+			len += snprint(state+len, n - len, " %-40.40I", lk->self->a);
+			lk = lk->lifclink;
+		}
+		len += snprint(state+len, n - len, "\n");
+		lp = lp->next;
+	}
+	runlock(ifc);
+	return len;
+}
+
+/*
+ *  an interface counts as in use while a medium is bound to it
+ */
+static int
+ipifcinuse(Conv *c)
+{
+	return ((Ipifc*)c->ptcl)->medium != nil;
+}
+
+/*
+ *  called when a process writes to an interface's 'data'
+ *
+ *  takes one block from the write queue and hands it to the
+ *  medium's pktin routine; the block is freed instead if the
+ *  interface lock cannot be taken or no pktin routine exists.
+ */
+static void
+ipifckick(void *x)
+{
+	Conv *c = x;
+	Block *bp;
+	Ipifc *ifc;
+
+	bp = qget(c->wq);
+	if(bp == nil)
+		return;
+
+	ifc = (Ipifc*)c->ptcl;
+	/* don't wait for the lock: drop the packet instead */
+	if(!canrlock(ifc)){
+		freeb(bp);
+		return;
+	}
+	if(waserror()){
+		runlock(ifc);
+		nexterror();
+	}
+	if(ifc->medium == nil || ifc->medium->pktin == nil)
+		freeb(bp);
+	else
+		(*ifc->medium->pktin)(c->p->f, ifc, bp);
+	runlock(ifc);
+	poperror();
+}
+
+/*
+ *  called when a new ipifc structure is created:
+ *  open the conversation's queues (writes to wq kick
+ *  ipifckick) and reset the interface to the unbound state.
+ */
+static void
+ipifccreate(Conv *c)
+{
+	Ipifc *ifc;
+
+	c->rq = qopen(QMAX, 0, 0, 0);
+	c->sq = qopen(2*QMAX, 0, 0, 0);
+	c->wq = qopen(QMAX, Qkick, ipifckick, c);
+
+	ifc = (Ipifc*)c->ptcl;
+	ifc->medium = nil;
+	ifc->reassemble = 0;
+	ifc->unbinding = 0;
+	ifc->conv = c;
+}
+
+/*
+ *  called after last close of ipifc data or ctl.
+ *  Unbinds the interface if its medium asks for that
+ *  (unbindonclose); otherwise does nothing.
+ */
+static void
+ipifcclose(Conv *c)
+{
+	Ipifc *ifc;
+	Medium *m;
+
+	ifc = (Ipifc*)c->ptcl;
+	m = ifc->medium;
+	if(m == nil || !m->unbindonclose)
+		return;
+	ipifcunbind(ifc);
+}
+
+/*
+ *  change an interface's mtu to argv[1], which must lie within
+ *  the bound medium's [mintu, maxtu].  Returns nil on success,
+ *  Ebadarg for a missing argument, an unbound interface or an
+ *  out of range value.
+ */
+char*
+ipifcsetmtu(Ipifc *ifc, char **argv, int argc)
+{
+	int mtu;
+
+	if(argc < 2)
+		return Ebadarg;
+	if(ifc->medium == nil)
+		return Ebadarg;
+
+	mtu = strtoul(argv[1], 0, 0);
+	if(mtu >= ifc->medium->mintu && mtu <= ifc->medium->maxtu){
+		ifc->maxtu = mtu;
+		return nil;
+	}
+	return Ebadarg;
+}
+
+/*
+ *  add an address to an interface.
+ *
+ *  argv[1] is the local address; optional further arguments
+ *  are argv[2] mask, argv[3] remote address, argv[4] mtu and
+ *  argv[5] "proxy".  tentative marks the address as tentative
+ *  (forced to 0 for v4 addresses).  lifcp, if not nil, supplies
+ *  onlink, autoflag, validlt, preflt and origint for the entry.
+ *  If the address is already on the interface only those fields
+ *  and the tentative flag are refreshed.
+ *  Returns nil on success or an error string.
+ */
+char*
+ipifcadd(Ipifc *ifc, char **argv, int argc, int tentative, Iplifc *lifcp)
+{
+	int i, type, mtu, sendnbrdisc = 0;
+	uchar ip[IPaddrlen], mask[IPaddrlen], rem[IPaddrlen];
+	uchar bcast[IPaddrlen], net[IPaddrlen];
+	Iplifc *lifc, **l;
+	Fs *f;
+
+	if(ifc->medium == nil)
+		return "ipifc not yet bound to device";
+
+	f = ifc->conv->p->f;
+
+	type = Rifc;
+	memset(ip, 0, IPaddrlen);
+	memset(mask, 0, IPaddrlen);
+	memset(rem, 0, IPaddrlen);
+	/* the cases fall through from most to fewest arguments */
+	switch(argc){
+	case 6:
+		if(strcmp(argv[5], "proxy") == 0)
+			type |= Rproxy;
+		/* fall through */
+	case 5:
+		/* an out of range mtu is ignored rather than rejected */
+		mtu = strtoul(argv[4], 0, 0);
+		if(mtu >= ifc->medium->mintu && mtu <= ifc->medium->maxtu)
+			ifc->maxtu = mtu;
+		/* fall through */
+	case 4:
+		parseip(ip, argv[1]);
+		parseipmask(mask, argv[2]);
+		parseip(rem, argv[3]);
+		maskip(rem, mask, net);
+		break;
+	case 3:
+		/* no remote given: use the masked local address */
+		parseip(ip, argv[1]);
+		parseipmask(mask, argv[2]);
+		maskip(ip, mask, rem);
+		maskip(rem, mask, net);
+		break;
+	case 2:
+		/* no mask given: use the default mask for the address */
+		parseip(ip, argv[1]);
+		memmove(mask, defmask(ip), IPaddrlen);
+		maskip(ip, mask, rem);
+		maskip(rem, mask, net);
+		break;
+	default:
+		return Ebadarg;
+	}
+	if(isv4(ip))
+		tentative = 0;
+	wlock(ifc);
+
+	/* ignore if this is already a local address for this ifc */
+	for(lifc = ifc->lifc; lifc; lifc = lifc->next) {
+		if(ipcmp(lifc->local, ip) == 0) {
+			if(lifc->tentative != tentative)
+				lifc->tentative = tentative;
+			if(lifcp) {
+				lifc->onlink = lifcp->onlink;
+				lifc->autoflag = lifcp->autoflag;
+				lifc->validlt = lifcp->validlt;
+				lifc->preflt = lifcp->preflt;
+				lifc->origint = lifcp->origint;
+			}
+			goto out;
+		}
+	}
+
+	/* add the address to the list of logical ifc's for this ifc */
+	lifc = smalloc(sizeof(Iplifc));
+	ipmove(lifc->local, ip);
+	ipmove(lifc->mask, mask);
+	ipmove(lifc->remote, rem);
+	ipmove(lifc->net, net);
+	lifc->tentative = tentative;
+	if(lifcp) {
+		lifc->onlink = lifcp->onlink;
+		lifc->autoflag = lifcp->autoflag;
+		lifc->validlt = lifcp->validlt;
+		lifc->preflt = lifcp->preflt;
+		lifc->origint = lifcp->origint;
+	} else {		/* default values */
+		lifc->onlink = lifc->autoflag = 1;
+		lifc->validlt = lifc->preflt = ~0L;
+		lifc->origint = NOW / 1000;
+	}
+	lifc->next = nil;
+
+	/* append to the end of the interface's chain */
+	for(l = &ifc->lifc; *l; l = &(*l)->next)
+		;
+	*l = lifc;
+
+	/* check for point-to-point interface */
+	if(ipcmp(ip, v6loopback)) /* skip v6 loopback, it's a special address */
+	if(ipcmp(mask, IPallbits) == 0)
+		type |= Rptpt;
+
+	/* add local routes */
+	if(isv4(ip))
+		v4addroute(f, tifc, rem+IPv4off, mask+IPv4off, rem+IPv4off, type);
+	else
+		v6addroute(f, tifc, rem, mask, rem, type);
+
+	addselfcache(f, ifc, lifc, ip, Runi);
+
+	/* proxied point-to-point link: register the proxy and stop here */
+	if((type & (Rproxy|Rptpt)) == (Rproxy|Rptpt)){
+		ipifcregisterproxy(f, ifc, rem);
+		goto out;
+	}
+
+	if(isv4(ip) || ipcmp(ip, IPnoaddr) == 0) {
+		/* add subnet directed broadcast address to the self cache */
+		for(i = 0; i < IPaddrlen; i++)
+			bcast[i] = (ip[i] & mask[i]) | ~mask[i];
+		addselfcache(f, ifc, lifc, bcast, Rbcast);
+
+		/* add subnet directed network address to the self cache */
+		for(i = 0; i < IPaddrlen; i++)
+			bcast[i] = (ip[i] & mask[i]) & mask[i];
+		addselfcache(f, ifc, lifc, bcast, Rbcast);
+
+		/* add network directed broadcast address to the self cache */
+		/* from here on mask holds the default mask, not the one given */
+		memmove(mask, defmask(ip), IPaddrlen);
+		for(i = 0; i < IPaddrlen; i++)
+			bcast[i] = (ip[i] & mask[i]) | ~mask[i];
+		addselfcache(f, ifc, lifc, bcast, Rbcast);
+
+		/* add network directed network address to the self cache */
+		memmove(mask, defmask(ip), IPaddrlen);
+		for(i = 0; i < IPaddrlen; i++)
+			bcast[i] = (ip[i] & mask[i]) & mask[i];
+		addselfcache(f, ifc, lifc, bcast, Rbcast);
+
+		addselfcache(f, ifc, lifc, IPv4bcast, Rbcast);
+	}
+	else {
+		if(ipcmp(ip, v6loopback) == 0) {
+			/* add node-local mcast address */
+			addselfcache(f, ifc, lifc, v6allnodesN, Rmulti);
+
+			/* add route for all node multicast */
+			v6addroute(f, tifc, v6allnodesN, v6allnodesNmask,
+				v6allnodesN, Rmulti);
+		}
+
+		/* add all nodes multicast address */
+		addselfcache(f, ifc, lifc, v6allnodesL, Rmulti);
+
+		/* add route for all nodes multicast */
+		v6addroute(f, tifc, v6allnodesL, v6allnodesLmask, v6allnodesL,
+			Rmulti);
+
+		/* add solicited-node multicast address */
+		ipv62smcast(bcast, ip);
+		addselfcache(f, ifc, lifc, bcast, Rmulti);
+
+		sendnbrdisc = 1;
+	}
+
+	/* register the address on this network for address resolution */
+	if(isv4(ip) && ifc->medium->areg != nil)
+		(*ifc->medium->areg)(ifc, ip);
+
+out:
+	wunlock(ifc);
+	/* the neighbor solicitation is sent after the lock is released */
+	if(tentative && sendnbrdisc)
+		icmpns(f, 0, SRC_UNSPEC, ip, TARG_MULTI, ifc->mac);
+	return nil;
+}
+
+/*
+ *  remove a logical interface from an ifc
+ *  always called with ifc wlock'd
+ *
+ *  unlinks lifc from ifc's chain, drops its self-cache entries
+ *  and routes, and frees it.  Returns nil, or an error string
+ *  if lifc is not on ifc's chain.
+ */
+static char*
+ipifcremlifc(Ipifc *ifc, Iplifc *lifc)
+{
+	Iplifc **pp;
+	Fs *f;
+
+	f = ifc->conv->p->f;
+
+	/* locate lifc on the interface's chain and unlink it */
+	pp = &ifc->lifc;
+	while(*pp != nil && *pp != lifc)
+		pp = &(*pp)->next;
+	if(*pp == nil)
+		return "address not on this interface";
+	*pp = lifc->next;
+
+	/* disassociate any addresses */
+	while(lifc->link)
+		remselfcache(f, ifc, lifc, lifc->link->self->a);
+
+	/* remove the route for this logical interface */
+	if(isv4(lifc->local)){
+		v4delroute(f, lifc->remote+IPv4off, lifc->mask+IPv4off, 1);
+	}else{
+		v6delroute(f, lifc->remote, lifc->mask, 1);
+		if(ipcmp(lifc->local, v6loopback) == 0){
+			/* remove route for all node multicast */
+			v6delroute(f, v6allnodesN, v6allnodesNmask, 1);
+		}else if(memcmp(lifc->local, v6linklocal, v6llpreflen) == 0){
+			/* remove route for all link multicast */
+			v6delroute(f, v6allnodesL, v6allnodesLmask, 1);
+		}
+	}
+
+	free(lifc);
+	return nil;
+}
+
+/*
+ *  remove an address from an interface.
+ *  called with c->car locked
+ *
+ *  argv[1] is the local address, argv[2] the mask and the
+ *  optional argv[3] the remote address (default: the masked
+ *  local address); all three must match a logical interface.
+ *  Returns nil or an error string.
+ */
+char*
+ipifcrem(Ipifc *ifc, char **argv, int argc)
+{
+	char *err;
+	uchar ip[IPaddrlen], mask[IPaddrlen], rem[IPaddrlen];
+	Iplifc *lp;
+
+	if(argc < 3)
+		return Ebadarg;
+
+	parseip(ip, argv[1]);
+	parseipmask(mask, argv[2]);
+	if(argc >= 4)
+		parseip(rem, argv[3]);
+	else
+		maskip(ip, mask, rem);
+
+	wlock(ifc);
+
+	/* look for the logical interface with exactly these addresses */
+	lp = ifc->lifc;
+	while(lp != nil){
+		if(memcmp(ip, lp->local, IPaddrlen) == 0
+		&& memcmp(mask, lp->mask, IPaddrlen) == 0
+		&& memcmp(rem, lp->remote, IPaddrlen) == 0)
+			break;
+		lp = lp->next;
+	}
+
+	/* lp is nil if nothing matched; ipifcremlifc reports that */
+	err = ipifcremlifc(ifc, lp);
+	wunlock(ifc);
+	return err;
+}
+
+/*
+ * distribute routes to active interfaces like the
+ * TRIP linecards: every bound interface whose medium
+ * has an addroute routine is told about the new route.
+ */
+void
+ipifcaddroute(Fs *f, int vers, uchar *addr, uchar *mask, uchar *gate, int type)
+{
+	Medium *m;
+	Conv **cp, **end;
+	Ipifc *ifc;
+
+	end = &f->ipifc->conv[f->ipifc->nc];
+	for(cp = f->ipifc->conv; cp < end; cp++){
+		if(*cp == nil)
+			continue;
+		ifc = (Ipifc*)(*cp)->ptcl;
+		m = ifc->medium;
+		if(m == nil || m->addroute == nil)
+			continue;
+		m->addroute(ifc, vers, addr, mask, gate, type);
+	}
+}
+
+/*
+ * counterpart of ipifcaddroute: tell every bound interface
+ * whose medium has a remroute routine that a route is gone.
+ */
+void
+ipifcremroute(Fs *f, int vers, uchar *addr, uchar *mask)
+{
+	Medium *m;
+	Conv **cp, **end;
+	Ipifc *ifc;
+
+	end = &f->ipifc->conv[f->ipifc->nc];
+	for(cp = f->ipifc->conv; cp < end; cp++){
+		if(*cp == nil)
+			continue;
+		ifc = (Ipifc*)(*cp)->ptcl;
+		m = ifc->medium;
+		if(m == nil || m->remroute == nil)
+			continue;
+		m->remroute(ifc, vers, addr, mask);
+	}
+}
+
+/*
+ *  associate an address with the interface.  This wipes out any previous
+ *  addresses.  This is a macro that means, remove all the old interfaces
+ *  and add a new one.
+ *
+ *  argv/argc are passed unchanged to ipifcadd().  Returns nil on
+ *  success or an error string.
+ */
+static char*
+ipifcconnect(Conv* c, char **argv, int argc)
+{
+	char *err;
+	Ipifc *ifc;
+
+	ifc = (Ipifc*)c->ptcl;
+
+	if(ifc->medium == nil)
+		 return "ipifc not yet bound to device";
+
+	/* error label is set up before the lock is taken */
+	if(waserror()){
+		wunlock(ifc);
+		nexterror();
+	}
+	wlock(ifc);
+	while(ifc->lifc){
+		err = ipifcremlifc(ifc, ifc->lifc);
+		if(err)
+			error(err);
+	}
+	wunlock(ifc);
+	poperror();
+
+	/* ipifcadd takes the interface lock itself */
+	err = ipifcadd(ifc, argv, argc, 0, nil);
+	if(err)
+		return err;
+
+	Fsconnected(c, nil);
+	return nil;
+}
+
+/*
+ *  set router advertisement parameters from name/value pairs
+ *  in argv[1..].  Returns Ebadarg for an odd number of
+ *  arguments or an unknown name (pairs before the bad one
+ *  remain applied).  If the result has maxraint < minraint,
+ *  both are put back to their values on entry and Ebadarg is
+ *  returned.
+ */
+char*
+ipifcra6(Ipifc *ifc, char **argv, int argc)
+{
+	int i, vmax, vmin;
+	char *key, *val;
+
+	vmax = ifc->rp.maxraint;
+	vmin = ifc->rp.minraint;
+
+	if((argc - 1) % 2 != 0)
+		return Ebadarg;
+
+	for(i = 1; argc - i > 1; i += 2){
+		key = argv[i];
+		val = argv[i+1];
+
+		if(strcmp(key, "recvra") == 0)
+			ifc->recvra6 = (atoi(val) != 0);
+		else if(strcmp(key, "sendra") == 0)
+			ifc->sendra6 = (atoi(val) != 0);
+		else if(strcmp(key, "mflag") == 0)
+			ifc->rp.mflag = (atoi(val) != 0);
+		else if(strcmp(key, "oflag") == 0)
+			ifc->rp.oflag = (atoi(val) != 0);
+		else if(strcmp(key, "maxraint") == 0)
+			ifc->rp.maxraint = atoi(val);
+		else if(strcmp(key, "minraint") == 0)
+			ifc->rp.minraint = atoi(val);
+		else if(strcmp(key, "linkmtu") == 0)
+			ifc->rp.linkmtu = atoi(val);
+		else if(strcmp(key, "reachtime") == 0)
+			ifc->rp.reachtime = atoi(val);
+		else if(strcmp(key, "rxmitra") == 0)
+			ifc->rp.rxmitra = atoi(val);
+		else if(strcmp(key, "ttl") == 0)
+			ifc->rp.ttl = atoi(val);
+		else if(strcmp(key, "routerlt") == 0)
+			ifc->rp.routerlt = atoi(val);
+		else
+			return Ebadarg;
+	}
+
+	/* consistency check */
+	if(ifc->rp.maxraint >= ifc->rp.minraint)
+		return nil;
+	ifc->rp.maxraint = vmax;
+	ifc->rp.minraint = vmin;
+	return Ebadarg;
+}
+
+/*
+ *  non-standard control messages.
+ *  called with c->car locked.
+ *
+ *  dispatches on the verb in argv[0]; returns the handler's
+ *  result, nil, or "unsupported ctl" for an unknown verb.
+ */
+static char*
+ipifcctl(Conv* c, char**argv, int argc)
+{
+	Ipifc *ifc;
+	char *verb;
+	int on;
+
+	ifc = (Ipifc*)c->ptcl;
+	verb = argv[0];
+
+	if(strcmp(verb, "add") == 0)
+		return ipifcadd(ifc, argv, argc, 0, nil);
+	if(strcmp(verb, "try") == 0)
+		return ipifcadd(ifc, argv, argc, 1, nil);
+	if(strcmp(verb, "remove") == 0)
+		return ipifcrem(ifc, argv, argc);
+	if(strcmp(verb, "unbind") == 0)
+		return ipifcunbind(ifc);
+	if(strcmp(verb, "joinmulti") == 0)
+		return ipifcjoinmulti(ifc, argv, argc);
+	if(strcmp(verb, "leavemulti") == 0)
+		return ipifcleavemulti(ifc, argv, argc);
+	if(strcmp(verb, "mtu") == 0)
+		return ipifcsetmtu(ifc, argv, argc);
+	if(strcmp(verb, "reassemble") == 0){
+		ifc->reassemble = 1;
+		return nil;
+	}
+	if(strcmp(verb, "iprouting") == 0){
+		/* routing is switched on unless an argument says otherwise */
+		on = 1;
+		if(argc > 1)
+			on = atoi(argv[1]);
+		iprouting(c->p->f, on);
+		return nil;
+	}
+	if(strcmp(verb, "add6") == 0)
+		return ipifcadd6(ifc, argv, argc);
+	if(strcmp(verb, "ra6") == 0)
+		return ipifcra6(ifc, argv, argc);
+	return "unsupported ctl";
+}
+
+/*
+ *  the ipifc protocol's stats are those of the ip stack it belongs to
+ */
+int
+ipifcstats(Proto *ipifc, char *buf, int len)
+{
+	Fs *f;
+
+	f = ipifc->f;
+	return ipstats(f, buf, len);
+}
+
+/*
+ *  create the "ipifc" protocol for stack f, allocate f's
+ *  self-address table and register the protocol with Fsproto().
+ */
+void
+ipifcinit(Fs *f)
+{
+	Proto *p;
+
+	p = smalloc(sizeof(Proto));
+	p->name = "ipifc";
+	p->ipproto = -1;
+	p->nc = Maxmedia;
+	p->ptclsize = sizeof(Ipifc);
+
+	/* entry points */
+	p->connect = ipifcconnect;
+	p->bind = ipifcbind;
+	p->state = ipifcstate;
+	p->create = ipifccreate;
+	p->close = ipifcclose;
+	p->ctl = ipifcctl;
+	p->stats = ipifcstats;
+	p->inuse = ipifcinuse;
+	p->local = ipifclocal;
+
+	/* operations ipifc does not provide */
+	p->announce = nil;
+	p->rcv = nil;
+	p->advise = nil;
+
+	f->ipifc = p;	/* hack for ipifcremroute, findipifc, ... */
+	f->self = smalloc(sizeof(Ipselftab));	/* hack for ipforme */
+
+	Fsproto(f, p);
+}
+
+/*
+ *  add to self routing cache
+ *	called with c->car locked
+ *
+ *  makes address a (of the given route type) one that lifc
+ *  answers to: creates the Ipself entry if a is new, and either
+ *  creates the Iplink joining it to lifc or bumps that link's
+ *  reference count.
+ */
+static void
+addselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a, int type)
+{
+	Ipself *p;
+	Iplink *lp;
+	int h;
+
+	qlock(f->self);
+
+	/* see if the address already exists */
+	h = hashipa(a);
+	for(p = f->self->hash[h]; p; p = p->next)
+		if(memcmp(a, p->a, IPaddrlen) == 0)
+			break;
+
+	/* allocate a local address and add to hash chain */
+	if(p == nil){
+		p = smalloc(sizeof(*p));
+		ipmove(p->a, a);
+		p->type = type;
+		p->next = f->self->hash[h];
+		f->self->hash[h] = p;
+
+		/* if the null address, accept all packets */
+		if(ipcmp(a, v4prefix) == 0 || ipcmp(a, IPnoaddr) == 0)
+			f->self->acceptall = 1;
+	}
+
+	/* look for a link for this lifc */
+	for(lp = p->link; lp; lp = lp->selflink)
+		if(lp->lifc == lifc)
+			break;
+
+	/* allocate a lifc-to-local link and link to both */
+	if(lp == nil){
+		lp = smalloc(sizeof(*lp));
+		lp->ref = 1;
+		lp->lifc = lifc;
+		lp->self = p;
+		/* front of the address's list of links */
+		lp->selflink = p->link;
+		p->link = lp;
+		/* front of the logical interface's list of links */
+		lp->lifclink = lifc->link;
+		lifc->link = lp;
+
+		/* add to routing table */
+		if(isv4(a))
+			v4addroute(f, tifc, a+IPv4off, IPallbits+IPv4off,
+				a+IPv4off, type);
+		else
+			v6addroute(f, tifc, a, IPallbits, a, type);
+
+		/* let the medium know about a newly joined multicast address */
+		if((type & Rmulti) && ifc->medium->addmulti != nil)
+			(*ifc->medium->addmulti)(ifc, a, lifc->local);
+	} else
+		lp->ref++;
+
+	qunlock(f->self);
+}
+
+/*
+ *  These structures are unlinked from their chains while
+ *  other threads may be using them.  To avoid excessive locking,
+ *  just put them aside for a while before freeing them.
+ *	called with f->self locked
+ */
+static Iplink *freeiplink;
+static Ipself *freeipself;
+
+/*
+ *  put p on the deferred-free list, first releasing every
+ *  queued link whose grace period has run out.
+ */
+static void
+iplinkfree(Iplink *p)
+{
+	Iplink **l, *np;
+	ulong now = NOW;
+
+	l = &freeiplink;
+	for(np = *l; np; np = *l){
+		/*
+		 * free only entries that have expired; the signed
+		 * difference keeps the test right across a wrap of NOW
+		 */
+		if((long)(now - np->expire) >= 0){
+			*l = np->next;
+			free(np);
+			continue;
+		}
+		l = &np->next;
+	}
+	p->expire = now + 5000;	/* give other threads 5 secs to get out */
+	p->next = nil;
+	*l = p;
+}
+
+/*
+ *  put p on the deferred-free list, first releasing every
+ *  queued entry whose grace period has run out.
+ */
+static void
+ipselffree(Ipself *p)
+{
+	Ipself **l, *np;
+	ulong now = NOW;
+
+	l = &freeipself;
+	for(np = *l; np; np = *l){
+		/*
+		 * free only entries that have expired; the signed
+		 * difference keeps the test right across a wrap of NOW
+		 */
+		if((long)(now - np->expire) >= 0){
+			*l = np->next;
+			free(np);
+			continue;
+		}
+		l = &np->next;
+	}
+	p->expire = now + 5000;	/* give other threads 5 secs to get out */
+	p->next = nil;
+	*l = p;
+}
+
+/*
+ *  Decrement reference for this address on this link.
+ *  Unlink from selftab if this is the last ref.
+ *	called with c->car locked
+ *
+ *  Does nothing if a is not in the table or has no link to lifc.
+ */
+static void
+remselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a)
+{
+	Ipself *p, **l;
+	Iplink *link, **l_self, **l_lifc;
+
+	qlock(f->self);
+
+	/* find the unique selftab entry */
+	l = &f->self->hash[hashipa(a)];
+	for(p = *l; p; p = *l){
+		if(ipcmp(p->a, a) == 0)
+			break;
+		l = &p->next;
+	}
+
+	if(p == nil)
+		goto out;
+
+	/*
+	 *  walk down links from an ifc looking for one
+	 *  that matches the selftab entry
+	 */
+	l_lifc = &lifc->link;
+	for(link = *l_lifc; link; link = *l_lifc){
+		if(link->self == p)
+			break;
+		l_lifc = &link->lifclink;
+	}
+
+	if(link == nil)
+		goto out;
+
+	/*
+	 *  walk down the links from the selftab looking for
+	 *  the one we just found
+	 */
+	l_self = &p->link;
+	for(link = *l_self; link; link = *l_self){
+		if(link == *l_lifc)
+			break;
+		l_self = &link->selflink;
+	}
+
+	/* a link on lifc's list must also be on the address's list */
+	if(link == nil)
+		panic("remselfcache");
+
+	if(--(link->ref) != 0)
+		goto out;
+
+	if((p->type & Rmulti) && ifc->medium->remmulti != nil)
+		(*ifc->medium->remmulti)(ifc, a, lifc->local);
+
+	/* ref == 0, remove from both chains and free the link */
+	*l_lifc = link->lifclink;
+	*l_self = link->selflink;
+	iplinkfree(link);
+
+	/* other logical interfaces still use this address */
+	if(p->link != nil)
+		goto out;
+
+	/* remove from routing table */
+	if(isv4(a))
+		v4delroute(f, a+IPv4off, IPallbits+IPv4off, 1);
+	else
+		v6delroute(f, a, IPallbits, 1);
+
+	/* no more links, remove from hash and free */
+	*l = p->next;
+	ipselffree(p);
+
+	/* if IPnoaddr, forget */
+	if(ipcmp(a, v4prefix) == 0 || ipcmp(a, IPnoaddr) == 0)
+		f->self->acceptall = 0;
+
+out:
+	qunlock(f->self);
+}
+
+/* one line of ipselftabread output: address, number of links, route type */
+static char *stformat = "%-44.44I %2.2d %4.4s\n";
+enum
+{
+	/* NOTE(review): not referenced in the visible part of this file */
+	Nstformat= 41,
+};
+
+/*
+ *  format the self-address table into cp (at most n bytes),
+ *  one stformat line per address.  While the remaining offset
+ *  is positive, whatever has been formatted so far is counted
+ *  against it and discarded.  Returns the sum of the snprint
+ *  results that were kept.
+ */
+long
+ipselftabread(Fs *f, char *cp, ulong offset, int n)
+{
+	int bucket, len, nlinks, skip;
+	Ipself *sp;
+	Iplink *lk;
+	char rtype[8];
+
+	len = 0;
+	skip = offset;
+	qlock(f->self);
+	for(bucket = 0; bucket < NHASH && len < n; bucket++){
+		sp = f->self->hash[bucket];
+		while(sp != nil && len < n){
+			/* count the logical interfaces using this address */
+			nlinks = 0;
+			for(lk = sp->link; lk != nil; lk = lk->selflink)
+				nlinks++;
+
+			routetype(sp->type, rtype);
+			len += snprint(cp + len, n - len, stformat, sp->a, nlinks, rtype);
+			if(skip > 0){
+				skip -= len;
+				len = 0;
+			}
+			sp = sp->next;
+		}
+	}
+	qunlock(f->self);
+	return len;
+}
+
+/*
+ *  returns the tentative flag of the logical interface behind
+ *  the first link of addr's self-cache entry, or 0 if addr is
+ *  not in the cache.  The table is read without taking its lock.
+ */
+int
+iptentative(Fs *f, uchar *addr)
+{
+	Ipself *sp;
+
+	for(sp = f->self->hash[hashipa(addr)]; sp != nil; sp = sp->next)
+		if(ipcmp(addr, sp->a) == 0)
+			return sp->link->lifc->tentative;
+	return 0;
+}
+
+/*
+ *  is addr one of our addresses?  returns
+ *	0	- no match
+ *	the route type stored for the address by addselfcache
+ *	  (this file stores Runi, Rbcast and Rmulti)
+ *	Runi	- no entry, but the table is in accept-all mode
+ */
+int
+ipforme(Fs *f, uchar *addr)
+{
+	Ipself *sp;
+
+	for(sp = f->self->hash[hashipa(addr)]; sp != nil; sp = sp->next)
+		if(ipcmp(addr, sp->a) == 0)
+			return sp->type;
+
+	/* hack to say accept anything */
+	return f->self->acceptall ? Runi : 0;
+}
+
+/*
+ *  find the ifc on same net as the remote system; among
+ *  several candidates the one whose mask compares greatest
+ *  wins.  If there is none and type includes Rbcast or Rmulti,
+ *  the first interface with any address is used.
+ *  Otherwise return nil.
+ */
+Ipifc*
+findipifc(Fs *f, uchar *remote, int type)
+{
+	Ipifc *ifc, *best;
+	Iplifc *lp;
+	Conv **cp, **end;
+	uchar net[IPaddrlen], bestmask[IPaddrlen];
+
+	best = nil;
+	memset(bestmask, 0, IPaddrlen);
+
+	/* find most specific match */
+	end = &f->ipifc->conv[f->ipifc->nc];
+	for(cp = f->ipifc->conv; cp < end; cp++){
+		if(*cp == 0)
+			continue;
+		ifc = (Ipifc*)(*cp)->ptcl;
+		for(lp = ifc->lifc; lp != nil; lp = lp->next){
+			maskip(remote, lp->mask, net);
+			if(ipcmp(net, lp->net) != 0)
+				continue;
+			if(best != nil && ipcmp(lp->mask, bestmask) <= 0)
+				continue;
+			best = ifc;
+			ipmove(bestmask, lp->mask);
+		}
+	}
+	if(best != nil)
+		return best;
+
+	/* for now for broadcast and multicast, just use first interface */
+	if((type & (Rbcast|Rmulti)) == 0)
+		return nil;
+	for(cp = f->ipifc->conv; cp < end; cp++){
+		if(*cp == 0)
+			continue;
+		ifc = (Ipifc*)(*cp)->ptcl;
+		if(ifc->lifc != nil)
+			return ifc;
+	}
+	return nil;
+}
+
+enum {	/* v6 address classes; order matters, they are compared with < and > */
+	unknownv6,		/* UGH */
+//	multicastv6,
+	unspecifiedv6,
+	linklocalv6,
+	globalv6,
+};
+
+/*
+ *  classify a v6 address: link-local unicast, or multicast whose
+ *  scope nibble is at most Link_local_scop, is linklocalv6;
+ *  everything else is globalv6.
+ */
+int
+v6addrtype(uchar *addr)
+{
+	if(islinklocal(addr))
+		return linklocalv6;
+	if(isv6mcast(addr) && (addr[1] & 0xF) <= Link_local_scop)
+		return linklocalv6;
+	return globalv6;
+}
+
+#define v6addrcurr(lifc) ((lifc)->preflt == ~0L || \
+			(lifc)->origint + (lifc)->preflt >= NOW/1000)
+
+/*
+ *  copy into local the "best" current v6 address configured on any
+ *  interface (global > link local > unspecified); local is set to
+ *  v6Unspecified when there is none.
+ */
+static void
+findprimaryipv6(Fs *f, uchar *local)
+{
+	int best, kind;
+	Conv **cv, **end;
+	Ipifc *ifc;
+	Iplifc *lp;
+
+	best = unspecifiedv6;
+	ipmove(local, v6Unspecified);
+
+	end = &f->ipifc->conv[f->ipifc->nc];
+	for(cv = f->ipifc->conv; cv < end; cv++){
+		if(*cv == 0)
+			continue;
+		ifc = (Ipifc*)(*cv)->ptcl;
+		for(lp = ifc->lifc; lp; lp = lp->next){
+			kind = v6addrtype(lp->local);
+			/* only a better class, and only a current address */
+			if(kind <= best || !v6addrcurr(lp))
+				continue;
+			ipmove(local, lp->local);
+			best = kind;
+			if(best == globalv6)
+				return;		/* nothing ranks higher */
+		}
+	}
+}
+
+/*
+ *  copy the first logical address of the first interface that has
+ *  one into local; local is left untouched if no interface has any.
+ */
+static void
+findprimaryipv4(Fs *f, uchar *local)
+{
+	Conv **cv, **end;
+	Ipifc *ifc;
+	Iplifc *first;
+
+	end = &f->ipifc->conv[f->ipifc->nc];
+	for(cv = f->ipifc->conv; cv < end; cv++){
+		if(*cv == 0)
+			continue;
+		ifc = (Ipifc*)(*cv)->ptcl;
+		first = ifc->lifc;
+		if(first == nil)
+			continue;
+		ipmove(local, first->local);
+		return;
+	}
+}
+
+/*
+ *  find the local address 'closest' to the remote system and copy it
+ *  to local.  nothing is returned; the only result is the address
+ *  written to local.  the route to remote picks the interface; when
+ *  there is no route, or no address on that interface is chosen, the
+ *  primary v4 or v6 address is used instead.  f->ipifc is held
+ *  qlocked for the duration.
+ */
+void
+findlocalip(Fs *f, uchar *local, uchar *remote)
+{
+	int version, atype = unspecifiedv6, atypel = unknownv6;
+	int atyper, deprecated;
+	uchar gate[IPaddrlen], gnet[IPaddrlen];
+	Ipifc *ifc;
+	Iplifc *lifc;
+	Route *r;
+
+	USED(atype);
+	USED(atypel);
+	qlock(f->ipifc);
+	r = v6lookup(f, remote, nil);
+ 	version = (memcmp(remote, v4prefix, IPv4off) == 0)? V4: V6;	/* remote starting with v4prefix is v4 */
+
+	if(r != nil){
+		ifc = r->ifc;
+		if(r->type & Rv4)
+			v4tov6(gate, r->v4.gate);
+		else {
+			ipmove(gate, r->v6.gate);
+			ipmove(local, v6Unspecified);
+		}
+
+		switch(version) {
+		case V4:
+			/* find ifc address closest to the gateway to use */
+			for(lifc = ifc->lifc; lifc; lifc = lifc->next){
+				maskip(gate, lifc->mask, gnet);
+				if(ipcmp(gnet, lifc->net) == 0){
+					ipmove(local, lifc->local);
+					goto out;
+				}
+			}
+			break;	/* no match: fall back to the primary address below */
+		case V6:
+			/* find ifc address with scope matching the destination */
+			atyper = v6addrtype(remote);
+			deprecated = 0;
+			for(lifc = ifc->lifc; lifc; lifc = lifc->next){
+				atypel = v6addrtype(lifc->local);
+				/* prefer appropriate scope */
+				if(atypel > atype && atype < atyper ||
+				   atypel < atype && atype > atyper){
+					ipmove(local, lifc->local);
+					deprecated = !v6addrcurr(lifc);
+					atype = atypel;
+				} else if(atypel == atype){
+					/* avoid deprecated addresses */
+					if(deprecated && v6addrcurr(lifc)){
+						ipmove(local, lifc->local);
+						atype = atypel;
+						deprecated = 0;
+					}
+				}
+				if(atype == atyper && !deprecated)
+					goto out;	/* exact scope and current: done */
+			}
+			if(atype >= atyper)
+				goto out;	/* settle for what was found */
+			break;
+		default:
+			panic("findlocalip: version %d", version);
+		}
+	}
+
+	switch(version){
+	case V4:
+		findprimaryipv4(f, local);
+		break;
+	case V6:
+		findprimaryipv6(f, local);
+		break;
+	default:
+		panic("findlocalip2: version %d", version);
+	}
+
+out:
+	qunlock(f->ipifc);
+}
+
+/*
+ *  copy the first v4 address on the interface into addr
+ *  (IPv4addrlen bytes); returns 1 if there was one, 0 if not
+ */
+int
+ipv4local(Ipifc *ifc, uchar *addr)
+{
+	Iplifc *lp;
+
+	lp = ifc->lifc;
+	while(lp && !isv4(lp->local))
+		lp = lp->next;
+	if(!lp)
+		return 0;
+	memmove(addr, lp->local+IPv4off, IPv4addrlen);
+	return 1;
+}
+
+/*
+ *  copy the first non-tentative v6 address on the interface into
+ *  addr; returns 1 if there was one, 0 if not
+ */
+int
+ipv6local(Ipifc *ifc, uchar *addr)
+{
+	Iplifc *lp;
+
+	lp = ifc->lifc;
+	while(lp && (isv4(lp->local) || (lp->tentative)))
+		lp = lp->next;
+	if(!lp)
+		return 0;
+	ipmove(addr, lp->local);
+	return 1;
+}
+
+/*
+ *  copy the first v6 address on the interface, tentative or not,
+ *  into addr and return SRC_UNI; return SRC_UNSPEC (addr untouched)
+ *  if the interface has no v6 address
+ */
+int
+ipv6anylocal(Ipifc *ifc, uchar *addr)
+{
+	Iplifc *lp;
+
+	lp = ifc->lifc;
+	while(lp && isv4(lp->local))
+		lp = lp->next;
+	if(!lp)
+		return SRC_UNSPEC;
+	ipmove(addr, lp->local);
+	return SRC_UNI;
+}
+
+/*
+ *  return the logical interface on ifc whose local address is ip,
+ *  or nil if the address is not bound to this interface
+ */
+Iplifc*
+iplocalonifc(Ipifc *ifc, uchar *ip)
+{
+	Iplifc *lp;
+
+	lp = ifc->lifc;
+	while(lp){
+		if(ipcmp(ip, lp->local) == 0)
+			return lp;
+		lp = lp->next;
+	}
+	return nil;
+}
+
+
+/*
+ *  are we proxying for ip on ifc?  true (1) when the route to ip
+ *  has both Rifc and Rproxy set and ip, masked by one of ifc's
+ *  logical interfaces, equals that interface's remote address.
+ */
+int
+ipproxyifc(Fs *f, Ipifc *ifc, uchar *ip)
+{
+	Route *rt;
+	Iplifc *lp;
+	uchar masked[IPaddrlen];
+
+	/* must be a directly connected pt to pt address */
+	rt = v6lookup(f, ip, nil);
+	if(rt == nil)
+		return 0;
+	if((rt->type & (Rifc|Rproxy)) != (Rifc|Rproxy))
+		return 0;
+
+	/* and on the right interface */
+	lp = ifc->lifc;
+	while(lp){
+		maskip(ip, lp->mask, masked);
+		if(ipcmp(masked, lp->remote) == 0)
+			return 1;
+		lp = lp->next;
+	}
+	return 0;
+}
+
+/*
+ *  multicast test: returns V4 for a v4 address whose first byte is
+ *  0xe0 through 0xef, V6 for a v6 address starting with 0xff,
+ *  and 0 for anything else
+ */
+int
+ipismulticast(uchar *ip)
+{
+	if(!isv4(ip))
+		return ip[0] == 0xff ? V6 : 0;
+	if(ip[IPv4off] >= 0xe0 && ip[IPv4off] < 0xf0)
+		return V4;
+	return 0;
+}
+/*
+ *  broadcast-or-multicast test: like ipismulticast, but a v4
+ *  address equal to IPv4bcast also yields V4
+ */
+int
+ipisbm(uchar *ip)
+{
+	if(!isv4(ip))
+		return ip[0] == 0xff ? V6 : 0;
+	if((ip[IPv4off] >= 0xe0 && ip[IPv4off] < 0xf0) || ipcmp(ip, IPv4bcast) == 0)
+		return V4;
+	return 0;
+}
+
+
+/*
+ *  add a multicast address to an interface, called with c->car locked
+ *
+ *  the (ma, ia) pair is appended to the conversation's multi list
+ *  unless it is already there; then every in-use interface that has
+ *  ia as a local address gets ma entered in the self cache as Rmulti.
+ */
+void
+ipifcaddmulti(Conv *c, uchar *ma, uchar *ia)
+{
+	Ipifc *ifc;
+	Iplifc *lifc;
+	Conv **p;
+	Ipmulti *multi, **l;
+	Fs *f;
+
+	f = c->p->f;
+
+	for(l = &c->multi; *l; l = &(*l)->next)
+		if(ipcmp(ma, (*l)->ma) == 0 && ipcmp(ia, (*l)->ia) == 0)
+			return;		/* it's already there */
+
+	multi = *l = smalloc(sizeof(*multi));	/* l now addresses the nil link at the tail */
+	ipmove(multi->ma, ma);
+	ipmove(multi->ia, ia);
+	multi->next = nil;
+
+	for(p = f->ipifc->conv; *p; p++){	/* stops at the first nil slot */
+		if((*p)->inuse == 0)
+			continue;
+		ifc = (Ipifc*)(*p)->ptcl;
+		if(waserror()){	/* error path: drop the write lock and re-raise */
+			wunlock(ifc);
+			nexterror();
+		}
+		wlock(ifc);
+		for(lifc = ifc->lifc; lifc; lifc = lifc->next)
+			if(ipcmp(ia, lifc->local) == 0)
+				addselfcache(f, ifc, lifc, ma, Rmulti);
+		wunlock(ifc);
+		poperror();
+	}
+}
+
+
+/*
+ *  remove a multicast address from an interface, called with c->car locked
+ *
+ *  the (ma, ia) pair is unlinked from the conversation's multi list
+ *  (nothing happens if it is not on it); then ma is removed from the
+ *  self cache of every in-use interface that has ia as a local
+ *  address, and the list entry is freed.
+ */
+void
+ipifcremmulti(Conv *c, uchar *ma, uchar *ia)
+{
+	Ipmulti *multi, **l;
+	Iplifc *lifc;
+	Conv **p;
+	Ipifc *ifc;
+	Fs *f;
+
+	f = c->p->f;
+
+	for(l = &c->multi; *l; l = &(*l)->next)
+		if(ipcmp(ma, (*l)->ma) == 0 && ipcmp(ia, (*l)->ia) == 0)
+			break;
+
+	multi = *l;
+	if(multi == nil)
+		return; 	/* we don't have it open */
+
+	*l = multi->next;	/* unlink; multi itself is freed at the end */
+
+	for(p = f->ipifc->conv; *p; p++){	/* stops at the first nil slot */
+		if((*p)->inuse == 0)
+			continue;
+
+		ifc = (Ipifc*)(*p)->ptcl;
+		if(waserror()){	/* error path: drop the write lock and re-raise */
+			wunlock(ifc);
+			nexterror();
+		}
+		wlock(ifc);
+		for(lifc = ifc->lifc; lifc; lifc = lifc->next)
+			if(ipcmp(ia, lifc->local) == 0)
+				remselfcache(f, ifc, lifc, ma);
+		wunlock(ifc);
+		poperror();
+	}
+
+	free(multi);
+}
+
+/*
+ *  make lifc's join and leave multicast groups
+ *
+ *  both routines below are stubs: they ignore their arguments
+ *  and return nil.
+ */
+static char*
+ipifcjoinmulti(Ipifc *ifc, char **argv, int argc)
+{
+	USED(ifc, argv, argc);
+	return nil;
+}
+
+static char*
+ipifcleavemulti(Ipifc *ifc, char **argv, int argc)
+{
+	USED(ifc, argv, argc);
+	return nil;
+}
+
+static void
+ipifcregisterproxy(Fs *f, Ipifc *ifc, uchar *ip)
+{
+	Conv **cp, **e;
+	Ipifc *nifc;
+	Iplifc *lifc;
+	Medium *medium;
+	uchar net[IPaddrlen];
+
+	/*
+	 * register the address on any network that will proxy for us:
+	 * every interface other than ifc itself is examined under its
+	 * read lock, and the first logical interface whose remote
+	 * address equals ip masked by its mask is used.
+	 */
+	e = &f->ipifc->conv[f->ipifc->nc];
+
+	if(!isv4(ip)) {				/* V6 */
+		for(cp = f->ipifc->conv; cp < e; cp++){
+			if(*cp == nil || (nifc = (Ipifc*)(*cp)->ptcl) == ifc)
+				continue;
+			rlock(nifc);
+			medium = nifc->medium;
+			if(medium == nil || medium->addmulti == nil) {	/* medium can't do multicast: skip */
+				runlock(nifc);
+				continue;
+			}
+			for(lifc = nifc->lifc; lifc; lifc = lifc->next){
+				maskip(ip, lifc->mask, net);
+				if(ipcmp(net, lifc->remote) == 0) {
+					/* add solicited-node multicast addr */
+					ipv62smcast(net, ip);	/* net is reused to hold the multicast address */
+					addselfcache(f, nifc, lifc, net, Rmulti);
+					arpenter(f, V6, ip, nifc->mac, 6, 0);
+					// (*medium->addmulti)(nifc, net, ip);
+					break;
+				}
+			}
+			runlock(nifc);
+		}
+	}
+	else {					/* V4 */
+		for(cp = f->ipifc->conv; cp < e; cp++){
+			if(*cp == nil || (nifc = (Ipifc*)(*cp)->ptcl) == ifc)
+				continue;
+			rlock(nifc);
+			medium = nifc->medium;
+			if(medium == nil || medium->areg == nil){	/* medium has no areg routine: skip */
+				runlock(nifc);
+				continue;
+			}
+			for(lifc = nifc->lifc; lifc; lifc = lifc->next){
+				maskip(ip, lifc->mask, net);
+				if(ipcmp(net, lifc->remote) == 0){
+					(*medium->areg)(nifc, ip);
+					break;
+				}
+			}
+			runlock(nifc);
+		}
+	}
+}
+
+
+/*
+ *  added for new v6 mesg types: install gate as the v6 default
+ *  route, tagged "ra".  unless force is set, an existing default
+ *  route with any other tag is left alone, since route entries
+ *  generated by all other means take precedence over router
+ *  announcements.
+ */
+static void
+adddefroute6(Fs *f, uchar *gate, int force)
+{
+	Route *cur;
+	int keep;
+
+	cur = v6lookup(f, v6Unspecified, nil);
+	keep = cur && !force && strcmp(cur->tag, "ra") != 0;
+	if(!keep){
+		v6delroute(f, v6Unspecified, v6Unspecified, 1);
+		v6addroute(f, "ra", v6Unspecified, v6Unspecified, gate, 0);
+	}
+}
+
+enum {
+	Ngates = 3,
+};
+
+/*
+ *  handle the v6 "add6" request:
+ *	argv[1]	prefix (must parse as v6 and not be link-local)
+ *	argv[2]	prefix length, 0..64		(default 64)
+ *	argv[3]	on-link flag			(default 1)
+ *	argv[4]	autonomous flag			(default 1)
+ *	argv[5]	valid lifetime			(default ~0)
+ *	argv[6]	preferred lifetime		(default ~0)
+ *  the medium turns the prefix and the interface's mac into an
+ *  address, which is then added through ipifcadd together with a
+ *  freshly allocated lifc carrying the flags and lifetimes.
+ *  returns Ebadarg for bad arguments or a medium that cannot build
+ *  the address, otherwise whatever ipifcadd returns.
+ */
+char*
+ipifcadd6(Ipifc *ifc, char**argv, int argc)
+{
+	int plen = 64;
+	long origint = NOW / 1000, preflt = ~0L, validlt = ~0L;
+	char addr[40], preflen[6];
+	char *params[3];
+	uchar autoflag = 1, onlink = 1;
+	uchar prefix[IPaddrlen];
+	Iplifc *lifc;
+
+	switch(argc) {
+	case 7:
+		preflt = atoi(argv[6]);
+		/* fall through */
+	case 6:
+		validlt = atoi(argv[5]);
+		/* fall through */
+	case 5:
+		autoflag = atoi(argv[4]);
+		/* fall through */
+	case 4:
+		onlink = atoi(argv[3]);
+		/* fall through */
+	case 3:
+		plen = atoi(argv[2]);
+		/* fall through */
+	case 2:
+		break;
+	default:
+		return Ebadarg;
+	}
+
+	if (parseip(prefix, argv[1]) != 6 || validlt < preflt || plen < 0 ||
+	    plen > 64 || islinklocal(prefix))
+		return Ebadarg;
+
+	/*
+	 * check the medium before allocating anything: bailing out
+	 * after the smalloc would leak the lifc, and an interface
+	 * with no medium must not be dereferenced.
+	 */
+	if(ifc->medium == nil || ifc->medium->pref2addr == nil)
+		return Ebadarg;
+
+	lifc = smalloc(sizeof(Iplifc));
+	lifc->onlink = (onlink != 0);
+	lifc->autoflag = (autoflag != 0);
+	lifc->validlt = validlt;
+	lifc->preflt = preflt;
+	lifc->origint = origint;
+
+	/* issue "add" ctl msg for the address built from prefix and mac */
+	ifc->medium->pref2addr(prefix, ifc->mac);
+	snprint(addr, sizeof addr, "%I", prefix);
+	snprint(preflen, sizeof preflen, "/%d", plen);
+	params[0] = "add";
+	params[1] = addr;
+	params[2] = preflen;
+
+	/* lifc is handed to ipifcadd from here on */
+	return ipifcadd(ifc, params, 3, 0, lifc);
+}

+ 861 - 0
sys/src/9/ip/iproute.c

@@ -0,0 +1,861 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+
+static void	walkadd(Fs*, Route**, Route*);
+static void	addnode(Fs*, Route**, Route*);
+static void	calcd(Route*);
+
+/* these are used for all instances of IP */
+Route*	v4freelist;
+Route*	v6freelist;
+RWlock	routelock;
+ulong	v4routegeneration, v6routegeneration;
+
+/*
+ *  put a route node on the v4 or v6 free list, chosen by its type;
+ *  the free lists are threaded through the mid pointer
+ */
+static void
+freeroute(Route *r)
+{
+	Route **head;
+
+	head = (r->type & Rv4) ? &v4freelist : &v6freelist;
+	r->left = nil;
+	r->right = nil;
+	r->mid = *head;
+	*head = r;
+}
+
+/*
+ *  get a zeroed route node of the given type, with ref set to 1.
+ *  nodes come from the matching free list when it is not empty,
+ *  else from malloc; running out of memory is a panic.
+ */
+static Route*
+allocroute(int type)
+{
+	Route *node, **freelist;
+	int size;
+
+	size = sizeof(RouteTree);
+	if(type & Rv4){
+		size += sizeof(V4route);
+		freelist = &v4freelist;
+	} else {
+		size += sizeof(V6route);
+		freelist = &v6freelist;
+	}
+
+	node = *freelist;
+	if(node == nil){
+		node = malloc(size);
+		if(node == nil)
+			panic("out of routing nodes");
+	} else
+		*freelist = node->mid;
+
+	memset(node, 0, size);
+	node->type = type;
+	node->ifc = nil;
+	node->ref = 1;
+
+	return node;
+}
+
+/*
+ *  push subtree r on queue q for later re-insertion.  a freshly
+ *  allocated node serves as the queue cell: its mid links the
+ *  queue and its left holds r.  a nil r is ignored.
+ */
+static void
+addqueue(Route **q, Route *r)
+{
+	Route *cell;
+
+	if(r != nil){
+		cell = allocroute(r->type);
+		cell->mid = *q;
+		*q = cell;
+		cell->left = r;
+	}
+}
+
+/*
+ *  compare 2 v6 addresses held as IPllen longs, most significant
+ *  first; returns 1, -1 or 0 as a is greater than, less than or
+ *  equal to b
+ */
+static int
+lcmp(ulong *a, ulong *b)
+{
+	int i;
+
+	for(i = 0; i < IPllen; i++)
+		if(a[i] != b[i])
+			return a[i] > b[i] ? 1 : -1;
+	return 0;
+}
+
+/*
+ *  compare 2 v4 or v6 ranges
+ *
+ *  the values below are the results of rangecompare(a, b)
+ */
+enum
+{
+	Rpreceeds,	/* a ends before b starts */
+	Rfollows,	/* a starts after b ends */
+	Requals,	/* same start and same end */
+	Rcontains,	/* a covers all of b and more */
+	Rcontained,	/* anything else: a lies inside b or partly overlaps it */
+};
+
+/*
+ *  relate the address range of a to that of b.  a's type decides
+ *  whether the v4 or the v6 fields of both are compared.
+ */
+static int
+rangecompare(Route *a, Route *b)
+{
+	int lo, hi;
+
+	if((a->type & Rv4) == 0){
+		if(lcmp(a->v6.endaddress, b->v6.address) < 0)
+			return Rpreceeds;
+		if(lcmp(a->v6.address, b->v6.endaddress) > 0)
+			return Rfollows;
+
+		/* the ranges overlap; compare the two ends */
+		lo = lcmp(a->v6.address, b->v6.address);
+		hi = lcmp(a->v6.endaddress, b->v6.endaddress);
+		if(lo > 0 || hi < 0)
+			return Rcontained;
+		if(lo == 0 && hi == 0)
+			return Requals;
+		return Rcontains;
+	}
+
+	if(a->v4.endaddress < b->v4.address)
+		return Rpreceeds;
+	if(a->v4.address > b->v4.endaddress)
+		return Rfollows;
+
+	/* the ranges overlap; compare the two ends */
+	if(a->v4.address > b->v4.address || a->v4.endaddress < b->v4.endaddress)
+		return Rcontained;
+	if(a->v4.address == b->v4.address && a->v4.endaddress == b->v4.endaddress)
+		return Requals;
+	return Rcontains;
+}
+
+/*
+ *  copy the gateway of new into old; new's type selects whether
+ *  the v4 or the v6 gateway field is copied
+ */
+static void
+copygate(Route *old, Route *new)
+{
+	if((new->type & Rv4) == 0){
+		memmove(old->v6.gate, new->v6.gate, IPaddrlen);
+		return;
+	}
+	memmove(old->v4.gate, new->v4.gate, IPv4addrlen);
+}
+
+/*
+ *  re-insert a detached subtree into the tree at root: p goes in
+ *  first, stripped of its left and right children, which are then
+ *  re-inserted recursively (left before right).  p's mid subtree
+ *  stays attached to p.
+ */
+static void
+walkadd(Fs *f, Route **root, Route *p)
+{
+	Route *lsub, *rsub;
+
+	lsub = p->left;
+	rsub = p->right;
+	p->left = 0;
+	p->right = 0;
+
+	addnode(f, root, p);
+	if(lsub != 0)
+		walkadd(f, root, lsub);
+	if(rsub != 0)
+		walkadd(f, root, rsub);
+}
+
+/*
+ *  recompute the depth of p from its children: one more than the
+ *  depth of the left child, raised to that of the right or mid
+ *  child if either is deeper.  a nil p is ignored.
+ */
+static void
+calcd(Route *p)
+{
+	int deepest;
+
+	if(p == 0)
+		return;
+
+	deepest = 0;
+	if(p->left)
+		deepest = p->left->depth;
+	if(p->right && p->right->depth > deepest)
+		deepest = p->right->depth;
+	if(p->mid && p->mid->depth > deepest)
+		deepest = p->mid->depth;
+	p->depth = deepest+1;
+}
+
+/*
+ *  balance the tree at the current node: when the left and right
+ *  subtrees differ in depth by more than one, the child on the
+ *  deeper side is lifted into the node's place and the node
+ *  becomes its child on the other side.  depths are recomputed.
+ */
+static void
+balancetree(Route **cur)
+{
+	Route *node, *lchild, *rchild;
+	int ldepth, rdepth;
+
+	node = *cur;
+	lchild = node->left;
+	rchild = node->right;
+	ldepth = lchild ? lchild->depth : 0;
+	rdepth = rchild ? rchild->depth : 0;
+
+	if(ldepth > rdepth+1){
+		/* left too deep: left child takes over */
+		node->left = lchild->right;
+		lchild->right = node;
+		*cur = lchild;
+		calcd(node);
+		calcd(lchild);
+		return;
+	}
+	if(rdepth > ldepth+1){
+		/* right too deep: right child takes over */
+		node->right = rchild->left;
+		rchild->left = node;
+		*cur = rchild;
+		calcd(node);
+		calcd(rchild);
+		return;
+	}
+	calcd(node);
+}
+
+/*
+ *  add a new node to the tree
+ *
+ *  cur is the link (a root or a child pointer) being descended.
+ *  ranges wholly before or after the node there go down its left or
+ *  right link, ranges that rangecompare reports as Rcontained go
+ *  down mid.  subtrees displaced by a containing range are put on
+ *  f->queue; the callers drain that queue with walkadd.  the link
+ *  is rebalanced on the way back up.
+ */
+static void
+addnode(Fs *f, Route **cur, Route *new)
+{
+	Route *p;
+
+	p = *cur;
+	if(p == 0) {	/* empty link: new becomes a leaf here */
+		*cur = new;
+		new->depth = 1;
+		return;
+	}
+
+	switch(rangecompare(new, p)){
+	case Rpreceeds:
+		addnode(f, &p->left, new);
+		break;
+	case Rfollows:
+		addnode(f, &p->right, new);
+		break;
+	case Rcontains:
+		/*
+		 *  if new node is superset
+		 *  of tree node,
+		 *  replace tree node and
+		 *  queue tree node to be
+		 *  merged into root.
+		 */
+		*cur = new;
+		new->depth = 1;
+		addqueue(&f->queue, p);
+		break;
+	case Requals:
+		/*
+		 *  supersede the old entry if the old one isn't
+		 *  a local interface.
+		 */
+		if((p->type & Rifc) == 0){
+			p->type = new->type;
+			p->ifcid = -1;	/* presumably so the next lookup re-resolves the interface */
+			copygate(p, new);
+		} else if(new->type & Rifc)
+			p->ref++;	/* interface route added again: just count it */
+		freeroute(new);	/* new itself is never linked in on this path */
+		break;
+	case Rcontained:
+		addnode(f, &p->mid, new);
+		break;
+	}
+
+	balancetree(cur);
+}
+
+#define	V4H(a)	(((a)&0x07ffffff)>>(32-Lroot-5))
+
+/*
+ *  add a v4 route for the range a/mask (both IPv4addrlen bytes)
+ *  through gate, labelled with tag.  a node is inserted in every
+ *  hash root from V4H of the range's start to V4H of its end, each
+ *  under the routelock write lock; subtrees displaced by the insert
+ *  are merged back from f->queue.  the v4 route generation is
+ *  bumped and the interfaces are told through ipifcaddroute.
+ */
+void
+v4addroute(Fs *f, char *tag, uchar *a, uchar *mask, uchar *gate, int type)
+{
+	Route *p;
+	ulong sa;
+	ulong m;
+	ulong ea;
+	int h, eh;
+
+	m = nhgetl(mask);
+	sa = nhgetl(a) & m;
+	ea = sa | ~m;
+
+	eh = V4H(ea);
+	for(h=V4H(sa); h<=eh; h++) {
+		p = allocroute(Rv4 | type);
+		p->v4.address = sa;
+		p->v4.endaddress = ea;
+		memmove(p->v4.gate, gate, sizeof(p->v4.gate));
+		/*
+		 * tag is a string that may be shorter than the tag
+		 * field, so sizeof(p->tag) bytes must not be read
+		 * from it.  strncpy stops at the NUL and pads the
+		 * rest of the field with zeros.
+		 */
+		strncpy(p->tag, tag, sizeof(p->tag));
+
+		wlock(&routelock);
+		addnode(f, &f->v4root[h], p);
+		while(p = f->queue) {
+			f->queue = p->mid;
+			walkadd(f, &f->v4root[h], p->left);
+			freeroute(p);
+		}
+		wunlock(&routelock);
+	}
+	v4routegeneration++;
+
+	ipifcaddroute(f, Rv4, a, mask, gate, type);
+}
+
+#define	V6H(a)	(((a)[IPllen-1] & 0x07ffffff)>>(32-Lroot-5))
+#define ISDFLT(a, mask, tag) ((ipcmp((a),v6Unspecified)==0) && (ipcmp((mask),v6Unspecified)==0) && (strcmp((tag), "ra")!=0))
+
+/*
+ *  add a v6 route for the range a/mask (both IPaddrlen bytes)
+ *  through gate, labelled with tag.  a node is inserted in every
+ *  hash root from V6H of the range's start to V6H of its end, each
+ *  under the routelock write lock; subtrees displaced by the insert
+ *  are merged back from f->queue.  the v6 route generation is
+ *  bumped and the interfaces are told through ipifcaddroute.
+ */
+void
+v6addroute(Fs *f, char *tag, uchar *a, uchar *mask, uchar *gate, int type)
+{
+	Route *p;
+	ulong sa[IPllen], ea[IPllen];
+	ulong x, y;
+	int h, eh;
+
+	/*
+	if(ISDFLT(a, mask, tag))
+		f->v6p->cdrouter = -1;
+	*/
+
+
+	for(h = 0; h < IPllen; h++){
+		x = nhgetl(a+4*h);
+		y = nhgetl(mask+4*h);
+		sa[h] = x & y;
+		ea[h] = x | ~y;
+	}
+
+	eh = V6H(ea);
+	for(h = V6H(sa); h <= eh; h++) {
+		p = allocroute(type);
+		memmove(p->v6.address, sa, IPaddrlen);
+		memmove(p->v6.endaddress, ea, IPaddrlen);
+		memmove(p->v6.gate, gate, IPaddrlen);
+		/*
+		 * tag is a string that may be shorter than the tag
+		 * field ("ra" is passed by adddefroute6), so
+		 * sizeof(p->tag) bytes must not be read from it.
+		 * strncpy stops at the NUL and pads with zeros.
+		 */
+		strncpy(p->tag, tag, sizeof(p->tag));
+
+		wlock(&routelock);
+		addnode(f, &f->v6root[h], p);
+		while(p = f->queue) {
+			f->queue = p->mid;
+			walkadd(f, &f->v6root[h], p->left);
+			freeroute(p);
+		}
+		wunlock(&routelock);
+	}
+	v6routegeneration++;
+
+	ipifcaddroute(f, 0, a, mask, gate, type);
+}
+
+/*
+ *  find the node whose range equals that of r, starting at the
+ *  link cur.  returns the link pointing at the node, or 0 if the
+ *  search runs off the tree or meets a node that r contains.
+ */
+Route**
+looknode(Route **cur, Route *r)
+{
+	Route *node;
+
+	while((node = *cur) != 0){
+		switch(rangecompare(r, node)){
+		case Requals:
+			return cur;
+		case Rcontains:
+			return 0;
+		case Rpreceeds:
+			cur = &node->left;
+			break;
+		case Rfollows:
+			cur = &node->right;
+			break;
+		case Rcontained:
+			cur = &node->mid;
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ *  drop one reference to the v4 route for the range a/mask in each
+ *  hash root the range spans.  when a node's count reaches zero it
+ *  is unlinked and freed and its three subtrees are merged back
+ *  into the root.  routelock is write locked around each root only
+ *  if dolock is set.  the v4 route generation is bumped and the
+ *  interfaces are told through ipifcremroute.
+ */
+void
+v4delroute(Fs *f, uchar *a, uchar *mask, int dolock)
+{
+	Route **slot, *node;
+	Route key;
+	int h, lasth;
+	ulong m;
+
+	/* search key covering the masked range */
+	m = nhgetl(mask);
+	key.v4.address = nhgetl(a) & m;
+	key.v4.endaddress = key.v4.address | ~m;
+	key.type = Rv4;
+
+	lasth = V4H(key.v4.endaddress);
+	for(h = V4H(key.v4.address); h <= lasth; h++){
+		if(dolock)
+			wlock(&routelock);
+		slot = looknode(&f->v4root[h], &key);
+		if(slot != 0 && --((*slot)->ref) == 0){
+			node = *slot;
+			*slot = 0;
+			addqueue(&f->queue, node->left);
+			addqueue(&f->queue, node->mid);
+			addqueue(&f->queue, node->right);
+			freeroute(node);
+			/* put the orphaned subtrees back */
+			while((node = f->queue) != 0){
+				f->queue = node->mid;
+				walkadd(f, &f->v4root[h], node->left);
+				freeroute(node);
+			}
+		}
+		if(dolock)
+			wunlock(&routelock);
+	}
+	v4routegeneration++;
+
+	ipifcremroute(f, Rv4, a, mask);
+}
+
+/*
+ *  drop one reference to the v6 route for the range a/mask in each
+ *  hash root the range spans.  when a node's count reaches zero it
+ *  is unlinked and freed and its three subtrees are merged back
+ *  into the root.  routelock is write locked around each root only
+ *  if dolock is set.  the v6 route generation is bumped and the
+ *  interfaces are told through ipifcremroute.
+ */
+void
+v6delroute(Fs *f, uchar *a, uchar *mask, int dolock)
+{
+	Route **slot, *node;
+	Route key;
+	int h, lasth;
+	ulong addr, m;
+
+	/* search key covering the masked range */
+	for(h = 0; h < IPllen; h++){
+		addr = nhgetl(a+4*h);
+		m = nhgetl(mask+4*h);
+		key.v6.address[h] = addr & m;
+		key.v6.endaddress[h] = addr | ~m;
+	}
+	key.type = 0;
+
+	lasth = V6H(key.v6.endaddress);
+	for(h = V6H(key.v6.address); h <= lasth; h++){
+		if(dolock)
+			wlock(&routelock);
+		slot = looknode(&f->v6root[h], &key);
+		if(slot != 0 && --((*slot)->ref) == 0){
+			node = *slot;
+			*slot = 0;
+			addqueue(&f->queue, node->left);
+			addqueue(&f->queue, node->mid);
+			addqueue(&f->queue, node->right);
+			freeroute(node);
+			/* put the orphaned subtrees back */
+			while((node = f->queue) != 0){
+				f->queue = node->mid;
+				walkadd(f, &f->v6root[h], node->left);
+				freeroute(node);
+			}
+		}
+		if(dolock)
+			wunlock(&routelock);
+	}
+	v6routegeneration++;
+
+	ipifcremroute(f, 0, a, mask);
+}
+/*
+ *  look up the route for the v4 address a (IPv4addrlen bytes).
+ *  the tree is descended remembering the last range that holds a
+ *  and following mid for ranges nested inside it.  a route not
+ *  bound to a current interface is bound through findipifc; nil
+ *  is returned if that fails or no range holds a.  c, when not
+ *  nil, caches the result along with the route generation.
+ */
+Route*
+v4lookup(Fs *f, uchar *a, Conv *c)
+{
+	Route *p, *q;
+	ulong la;
+	uchar gate[IPaddrlen];
+	Ipifc *ifc;
+
+	if(c != nil && c->r != nil && c->r->ifc != nil && c->rgen == v4routegeneration)
+		return c->r;	/* cached route is from the current generation */
+
+	la = nhgetl(a);
+	q = nil;
+	for(p=f->v4root[V4H(la)]; p;)
+		if(la >= p->v4.address) {
+			if(la <= p->v4.endaddress) {
+				q = p;	/* best so far; look for a nested range */
+				p = p->mid;
+			} else
+				p = p->right;
+		} else
+			p = p->left;
+
+	if(q && (q->ifc == nil || q->ifcid != q->ifc->ifcid)){	/* interface unset or its id changed */
+		if(q->type & Rifc) {
+			hnputl(gate+IPv4off, q->v4.address);
+			memmove(gate, v4prefix, IPv4off);
+		} else
+			v4tov6(gate, q->v4.gate);
+		ifc = findipifc(f, gate, q->type);
+		if(ifc == nil)
+			return nil;
+		q->ifc = ifc;
+		q->ifcid = ifc->ifcid;
+	}
+
+	if(c != nil){
+		c->r = q;
+		c->rgen = v4routegeneration;
+	}
+
+	return q;
+}
+/*
+ *  look up the route for the address a (IPaddrlen bytes).  an
+ *  address starting with v4prefix is first tried with v4lookup;
+ *  if that finds nothing the v6 tree is searched as well.  the
+ *  tree is descended remembering the last range that holds a and
+ *  following mid for ranges nested inside it.  a route not bound
+ *  to a current interface is bound through findipifc; nil is
+ *  returned if that fails or no range holds a.  c, when not nil,
+ *  caches the result along with the route generation.
+ */
+Route*
+v6lookup(Fs *f, uchar *a, Conv *c)
+{
+	Route *p, *q;
+	ulong la[IPllen];
+	int h;
+	ulong x, y;
+	uchar gate[IPaddrlen];
+	Ipifc *ifc;
+
+	if(memcmp(a, v4prefix, IPv4off) == 0){
+		q = v4lookup(f, a+IPv4off, c);
+		if(q != nil)
+			return q;
+	}
+
+	if(c != nil && c->r != nil && c->r->ifc != nil && c->rgen == v6routegeneration)
+		return c->r;	/* cached route is from the current generation */
+
+	for(h = 0; h < IPllen; h++)
+		la[h] = nhgetl(a+4*h);
+
+	q = 0;
+	for(p=f->v6root[V6H(la)]; p;){
+		for(h = 0; h < IPllen; h++){	/* la below the start of p's range? */
+			x = la[h];
+			y = p->v6.address[h];
+			if(x == y)
+				continue;
+			if(x < y){
+				p = p->left;
+				goto next;
+			}
+			break;
+		}
+		for(h = 0; h < IPllen; h++){	/* la above the end of p's range? */
+			x = la[h];
+			y = p->v6.endaddress[h];
+			if(x == y)
+				continue;
+			if(x > y){
+				p = p->right;
+				goto next;
+			}
+			break;
+		}
+		q = p;	/* la is inside p's range; look for a nested range */
+		p = p->mid;
+next:		;
+	}
+
+	if(q && (q->ifc == nil || q->ifcid != q->ifc->ifcid)){	/* interface unset or its id changed */
+		if(q->type & Rifc) {
+			for(h = 0; h < IPllen; h++)
+				hnputl(gate+4*h, q->v6.address[h]);
+			ifc = findipifc(f, gate, q->type);
+		} else
+			ifc = findipifc(f, q->v6.gate, q->type);
+		if(ifc == nil)
+			return nil;
+		q->ifc = ifc;
+		q->ifcid = ifc->ifcid;
+	}
+	if(c != nil){
+		c->r = q;
+		c->rgen = v6routegeneration;
+	}
+
+	return q;
+}
+
+/*
+ *  render route type flags as a 4 character, blank padded string
+ *  in p (which must hold 5 bytes): '4' or '6', then 'i' for Rifc,
+ *  one of 'u', 'b', 'm' for Runi, Rbcast, Rmulti, then 'p' for
+ *  Rptpt.
+ */
+void
+routetype(int type, char *p)
+{
+	int i;
+
+	memset(p, ' ', 4);
+	p[4] = 0;
+
+	i = 0;
+	p[i++] = (type & Rv4) ? '4' : '6';
+	if(type & Rifc)
+		p[i++] = 'i';
+	if(type & Runi)
+		p[i++] = 'u';
+	else if(type & Rbcast)
+		p[i++] = 'b';
+	else if(type & Rmulti)
+		p[i++] = 'm';
+	if(type & Rptpt)
+		p[i] = 'p';
+}
+
+char *rformat = "%-15I %-4M %-15I %4.4s %4.4s %3s\n";
+
+/*
+ *  unpack a route: addr, mask and gate (each IPaddrlen bytes) get
+ *  the start of the range, the mask derived from the range's start
+ *  and end, and the gateway, v4 values being given the v4prefix
+ *  (mask: 0xff) lead-in.  t gets the routetype string.  *nifc gets
+ *  the index of the bound interface's conversation, or -1 if the
+ *  route has no interface.
+ */
+void
+convroute(Route *r, uchar *addr, uchar *mask, uchar *gate, char *t, int *nifc)
+{
+	int i;
+	ulong lo, hi;
+
+	if((r->type & Rv4) == 0){
+		for(i = 0; i < IPllen; i++){
+			lo = r->v6.address[i];
+			hi = r->v6.endaddress[i];
+			hnputl(addr + 4*i, lo);
+			hnputl(mask + 4*i, ~(hi ^ lo));
+		}
+		memmove(gate, r->v6.gate, IPaddrlen);
+	} else {
+		lo = r->v4.address;
+		hi = r->v4.endaddress;
+		memmove(addr, v4prefix, IPv4off);
+		hnputl(addr+IPv4off, lo);
+		memset(mask, 0xff, IPv4off);
+		hnputl(mask+IPv4off, ~(hi ^ lo));
+		memmove(gate, v4prefix, IPv4off);
+		memmove(gate+IPv4off, r->v4.gate, IPv4addrlen);
+	}
+
+	routetype(r->type, t);
+
+	*nifc = r->ifc ? r->ifc->conv->x : -1;
+}
+
+/*
+ *  this code is not in rr to reduce stack size
+ *
+ *  format one route at rw->p (bounded by rw->e) using rformat.
+ *  rw->o is the negated count of output bytes still to be skipped
+ *  (routeread sets it to -offset): while it is negative the line
+ *  is dropped, or cut down to the part lying past the offset.
+ */
+static void
+sprintroute(Route *r, Routewalk *rw)
+{
+	int nifc, n;
+	char t[5], *iname, ifbuf[5];
+	uchar addr[IPaddrlen], mask[IPaddrlen], gate[IPaddrlen];
+	char *p;
+
+	convroute(r, addr, mask, gate, t, &nifc);
+	iname = "-";	/* shown when the route has no interface */
+	if(nifc != -1) {
+		iname = ifbuf;
+		sprint(ifbuf, "%d", nifc);
+	}
+	p = seprint(rw->p, rw->e, rformat, addr, mask, gate, t, r->tag, iname);
+	if(rw->o < 0){	/* still inside the part the reader asked to skip */
+		n = p - rw->p;
+		if(n > -rw->o){	/* line straddles the offset: keep only its tail */
+			memmove(rw->p, rw->p-rw->o, n+rw->o);
+			rw->p = p + rw->o;
+		}
+		rw->o += n;
+	} else
+		rw->p = p;
+}
+
+/*
+ *  recurse descending tree (left, node, mid, right), applying the
+ *  function in Routewalk to each node whose start address hashes
+ *  to the root being walked (rw->h).  returns 0 as soon as the
+ *  output buffer is found full, 1 otherwise.
+ */
+static int
+rr(Route *r, Routewalk *rw)
+{
+	int bucket;
+
+	if(rw->e <= rw->p)
+		return 0;
+	if(r == nil)
+		return 1;
+
+	if(!rr(r->left, rw))
+		return 0;
+
+	if(r->type & Rv4)
+		bucket = V4H(r->v4.address);
+	else
+		bucket = V6H(r->v6.address);
+	if(bucket == rw->h)
+		(*rw->walk)(r, rw);
+
+	if(!rr(r->mid, rw))
+		return 0;
+	return rr(r->right, rw);
+}
+
+/*
+ *  apply rw->walk to all routes, v4 roots first and then v6 roots,
+ *  holding routelock for reading.  each table is walked only if
+ *  the output buffer still has room, and a walk stops at the first
+ *  root for which rr reports the buffer full.
+ */
+void
+ipwalkroutes(Fs *f, Routewalk *rw)
+{
+	rlock(&routelock);
+	if(rw->e > rw->p){
+		rw->h = 0;
+		while(rw->h < nelem(f->v4root) && rr(f->v4root[rw->h], rw) != 0)
+			rw->h++;
+	}
+	if(rw->e > rw->p){
+		rw->h = 0;
+		while(rw->h < nelem(f->v6root) && rr(f->v6root[rw->h], rw) != 0)
+			rw->h++;
+	}
+	runlock(&routelock);
+}
+
+/*
+ *  read the routing table as text: up to n bytes of the listing,
+ *  starting offset bytes in, are formatted into p by sprintroute.
+ *  returns the number of bytes produced.
+ */
+long
+routeread(Fs *f, char *p, ulong offset, int n)
+{
+	Routewalk walker;
+
+	walker.walk = sprintroute;
+	walker.o = -offset;
+	walker.p = p;
+	walker.e = p+n;
+
+	ipwalkroutes(f, &walker);
+
+	return walker.p - p;
+}
+
+/*
+ *  delete route r by converting it back to address and mask form
+ *  and calling v4delroute or v6delroute; dolock is passed through.
+ *  this code is not in routeflush to reduce stack size
+ */
+void
+delroute(Fs *f, Route *r, int dolock)
+{
+	uchar addr[IPaddrlen], mask[IPaddrlen], gate[IPaddrlen];
+	char flags[5];
+	int ifcno;
+
+	convroute(r, addr, mask, gate, flags, &ifcno);
+	if((r->type & Rv4) == 0){
+		v6delroute(f, addr, mask, dolock);
+		return;
+	}
+	v4delroute(f, addr+IPv4off, mask+IPv4off, dolock);
+}
+
+/*
+ *  recurse (mid, left, right, then the node itself) until one
+ *  route is deleted.  only routes without Rifc are candidates, and
+ *  when tag is not nil only those carrying that tag.  the delete
+ *  is done without taking routelock.
+ *  returns 0 if nothing is deleted, 1 otherwise
+ */
+int
+routeflush(Fs *f, Route *r, char *tag)
+{
+	if(r == nil)
+		return 0;
+	if(routeflush(f, r->mid, tag)
+	|| routeflush(f, r->left, tag)
+	|| routeflush(f, r->right, tag))
+		return 1;
+
+	if(r->type & Rifc)
+		return 0;
+	if(tag != nil && strncmp(tag, r->tag, sizeof(r->tag)) != 0)
+		return 0;
+
+	delroute(f, r, 0);
+	return 1;
+}
+
+/*
+ *  handle a write of n bytes at p to the route file.  requests:
+ *	flush [tag]		delete every route without Rifc, or only
+ *				those carrying tag
+ *	remove addr mask	delete one route
+ *	add addr mask gate	add a route, tagged with the channel's
+ *				tag ("none" if c is nil)
+ *	tag name		replace the channel's IPaux by a new one
+ *				made from the old owner and name
+ *  an empty request, too few fields, or "tag" without a channel
+ *  raise Ebadarg; any other request word is ignored.  returns n.
+ */
+long
+routewrite(Fs *f, Chan *c, char *p, int n)
+{
+	int h, changed;
+	char *tag;
+	Cmdbuf *cb;
+	uchar addr[IPaddrlen];
+	uchar mask[IPaddrlen];
+	uchar gate[IPaddrlen];
+	IPaux *a, *na;
+
+	cb = parsecmd(p, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+
+	/* with no fields at all there is no cb->f[0] to compare */
+	if(cb->nf < 1)
+		error(Ebadarg);
+
+	if(strcmp(cb->f[0], "flush") == 0){
+		tag = cb->f[1];
+		/* routeflush deletes one route per call: repeat until none is left */
+		for(h = 0; h < nelem(f->v4root); h++)
+			for(changed = 1; changed;){
+				wlock(&routelock);
+				changed = routeflush(f, f->v4root[h], tag);
+				wunlock(&routelock);
+			}
+		for(h = 0; h < nelem(f->v6root); h++)
+			for(changed = 1; changed;){
+				wlock(&routelock);
+				changed = routeflush(f, f->v6root[h], tag);
+				wunlock(&routelock);
+			}
+	} else if(strcmp(cb->f[0], "remove") == 0){
+		if(cb->nf < 3)
+			error(Ebadarg);
+		parseip(addr, cb->f[1]);
+		parseipmask(mask, cb->f[2]);
+		if(memcmp(addr, v4prefix, IPv4off) == 0)
+			v4delroute(f, addr+IPv4off, mask+IPv4off, 1);
+		else
+			v6delroute(f, addr, mask, 1);
+	} else if(strcmp(cb->f[0], "add") == 0){
+		if(cb->nf < 4)
+			error(Ebadarg);
+		parseip(addr, cb->f[1]);
+		parseipmask(mask, cb->f[2]);
+		parseip(gate, cb->f[3]);
+		tag = "none";
+		if(c != nil){
+			a = c->aux;
+			tag = a->tag;
+		}
+		if(memcmp(addr, v4prefix, IPv4off) == 0)
+			v4addroute(f, tag, addr+IPv4off, mask+IPv4off, gate+IPv4off, 0);
+		else
+			v6addroute(f, tag, addr, mask, gate, 0);
+	} else if(strcmp(cb->f[0], "tag") == 0) {
+		/* the tag lives in the channel's aux, so a channel is required */
+		if(cb->nf < 2 || c == nil)
+			error(Ebadarg);
+
+		a = c->aux;
+		na = newipaux(a->owner, cb->f[1]);
+		c->aux = na;
+		free(a);
+	}
+
+	poperror();
+	free(cb);
+	return n;
+}

+ 738 - 0
sys/src/9/ip/ipv6.c

@@ -0,0 +1,738 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+#include	"ipv6.h"
+
+enum
+{
+	IP4HDR		= 20,		/* sizeof(Ip4hdr) */
+	IP6HDR		= 40,		/* sizeof(Ip6hdr) */
+	IP_HLEN4	= 0x05,		/* Header length in words */
+	IP_DF		= 0x4000,	/* Don't fragment */
+	IP_MF		= 0x2000,	/* More fragments */
+	IP6FHDR		= 8, 		/* sizeof(Fraghdr6) */
+	IP_MAX		= 32*1024,	/* Maximum Internet packet size */
+};
+
+#define IPV6CLASS(hdr)	(((hdr)->vcf[0]&0x0F)<<2 | ((hdr)->vcf[1]&0xF0)>>2)
+#define BLKIPVER(xp)	(((Ip6hdr*)((xp)->rp))->vcf[0] & 0xF0)
+/*
+ * This sleazy macro is stolen shamelessly from ip.c, see comment there.
+ */
+#define BKFG(xp)	((Ipfrag*)((xp)->base))
+
+typedef struct	IP	IP;
+typedef struct	Fragment4	Fragment4;
+typedef struct	Fragment6	Fragment6;
+typedef struct	Ipfrag	Ipfrag;
+
+Block*		ip6reassemble(IP*, int, Block*, Ip6hdr*);
+Fragment6*	ipfragallo6(IP*);
+void		ipfragfree6(IP*, Fragment6*);
+Block*		procopts(Block *bp);
+static Block*	procxtns(IP *ip, Block *bp, int doreasm);
+int		unfraglen(Block *bp, uchar *nexthdr, int setfh);
+
+/* MIB II counters */
+enum
+{
+	Forwarding,
+	DefaultTTL,
+	InReceives,
+	InHdrErrors,
+	InAddrErrors,
+	ForwDatagrams,
+	InUnknownProtos,
+	InDiscards,
+	InDelivers,
+	OutRequests,
+	OutDiscards,
+	OutNoRoutes,
+	ReasmTimeout,
+	ReasmReqds,
+	ReasmOKs,
+	ReasmFails,
+	FragOKs,
+	FragFails,
+	FragCreates,
+
+	Nstats,
+};
+
+static char *statnames[] =
+{
+[Forwarding]	"Forwarding",
+[DefaultTTL]	"DefaultTTL",
+[InReceives]	"InReceives",
+[InHdrErrors]	"InHdrErrors",
+[InAddrErrors]	"InAddrErrors",
+[ForwDatagrams]	"ForwDatagrams",
+[InUnknownProtos]	"InUnknownProtos",
+[InDiscards]	"InDiscards",
+[InDelivers]	"InDelivers",
+[OutRequests]	"OutRequests",
+[OutDiscards]	"OutDiscards",
+[OutNoRoutes]	"OutNoRoutes",
+[ReasmTimeout]	"ReasmTimeout",
+[ReasmReqds]	"ReasmReqds",
+[ReasmOKs]	"ReasmOKs",
+[ReasmFails]	"ReasmFails",
+[FragOKs]	"FragOKs",
+[FragFails]	"FragFails",
+[FragCreates]	"FragCreates",
+};
+
+struct Fragment4
+{
+	Block*	blist;
+	Fragment4*	next;
+	ulong 	src;
+	ulong 	dst;
+	ushort	id;
+	ulong 	age;
+};
+
+struct Fragment6
+{
+	Block*	blist;
+	Fragment6*	next;
+	uchar 	src[IPaddrlen];
+	uchar 	dst[IPaddrlen];
+	uint	id;
+	ulong 	age;
+};
+
+struct Ipfrag
+{
+	ushort	foff;
+	ushort	flen;
+};
+
+/* an instance of IP */
+struct IP
+{
+	ulong		stats[Nstats];
+
+	QLock		fraglock4;
+	Fragment4*	flisthead4;
+	Fragment4*	fragfree4;
+	Ref		id4;
+
+	QLock		fraglock6;
+	Fragment6*	flisthead6;
+	Fragment6*	fragfree6;
+	Ref		id6;
+
+	int		iprouting;	/* true if we route like a gateway */
+};
+
+/*
+ * Send the IPv6 packet in bp, which starts with its Ip6hdr.
+ *
+ * ttl is always written into the header; the version and tos are
+ * written only when not gating (gating != 0 means the packet is being
+ * forwarded and its header is otherwise kept).  The route for eh->dst,
+ * looked up on behalf of c, chooses the interface and next hop.
+ * Packets that do not fit the medium are fragmented; gated packets are
+ * fragmented only if ifc->reassemble is set, otherwise they are
+ * answered with icmppkttoobig6 and dropped.
+ *
+ * Returns -1 when there is no route and 0 otherwise (including the
+ * other drop cases).  On every return path bp has either been handed
+ * to the medium or freed.
+ */
+int
+ipoput6(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
+{
+	int medialen, len, chunk, uflen, flen, seglen, lid, offset, fragoff;
+	int morefrags, blklen, rv = 0, tentative;
+	uchar *gate, nexthdr;
+	Block *xp, *nb;
+	Fraghdr6 fraghdr;
+	IP *ip;
+	Ip6hdr *eh;
+	Ipifc *ifc;
+	Route *r, *sr;
+
+	ip = f->ip;
+
+	/* Fill out the ip header */
+	eh = (Ip6hdr*)(bp->rp);
+
+	ip->stats[OutRequests]++;
+
+	/* Number of uchars in data and ip header to write */
+	len = blocklen(bp);
+
+	tentative = iptentative(f, eh->src);
+	if(tentative){
+		netlog(f, Logip, "reject tx of packet with tentative src address\n");
+		goto free;
+	}
+
+	if(gating){
+		/* trust the header's length, but never beyond the data we hold */
+		chunk = nhgets(eh->ploadlen);
+		if(chunk > len){
+			ip->stats[OutDiscards]++;
+			netlog(f, Logip, "short gated packet\n");
+			goto free;
+		}
+		if(chunk + IPV6HDR_LEN < len)
+			len = chunk + IPV6HDR_LEN;
+	}
+
+	if(len >= IP_MAX){
+//		print("len > IP_MAX, free\n");
+		ip->stats[OutDiscards]++;
+		netlog(f, Logip, "exceeded ip max size %I\n", eh->dst);
+		goto free;
+	}
+
+	r = v6lookup(f, eh->dst, c);
+	if(r == nil){
+//		print("no route for %I, src %I free\n", eh->dst, eh->src);
+		ip->stats[OutNoRoutes]++;
+		netlog(f, Logip, "no interface %I\n", eh->dst);
+		rv = -1;
+		goto free;
+	}
+
+	/*
+	 * pick the next hop: the destination itself for interface,
+	 * unicast, broadcast and multicast routes, the route's gateway
+	 * otherwise.  for broadcast/multicast, send on the interface
+	 * that owns the source address when that is a unicast route.
+	 */
+	ifc = r->ifc;
+	if(r->type & (Rifc|Runi))
+		gate = eh->dst;
+	else if(r->type & (Rbcast|Rmulti)) {
+		gate = eh->dst;
+		sr = v6lookup(f, eh->src, nil);
+		if(sr && (sr->type & Runi))
+			ifc = sr->ifc;
+	}
+	else
+		gate = r->v6.gate;
+
+	if(!gating)
+		eh->vcf[0] = IP_VER6;
+	eh->ttl = ttl;
+	if(!gating) {
+		eh->vcf[0] |= tos >> 4;
+		eh->vcf[1]  = tos << 4;
+	}
+
+	if(!canrlock(ifc))
+		goto free;
+
+	if(waserror()){
+		runlock(ifc);
+		nexterror();
+	}
+
+	if(ifc->medium == nil)
+		goto raise;
+
+	/* If we dont need to fragment just send it */
+	medialen = ifc->maxtu - ifc->medium->hsize;
+	if(len <= medialen) {
+		hnputs(eh->ploadlen, len-IPV6HDR_LEN);
+		ifc->medium->bwrite(ifc, bp, V6, gate);
+		runlock(ifc);
+		poperror();
+		return 0;
+	}
+
+	if(gating && ifc->reassemble <= 0) {
+		/*
+		 * v6 intermediate nodes are not supposed to fragment pkts;
+		 * we fragment if ifc->reassemble is turned on; an exception
+		 * needed for nat.
+		 */
+		ip->stats[OutDiscards]++;
+		icmppkttoobig6(f, ifc, bp);
+		netlog(f, Logip, "%I: gated pkts not fragmented\n", eh->dst);
+		goto raise;
+	}
+
+	/* start v6 fragmentation */
+	uflen = unfraglen(bp, &nexthdr, 1);
+	if(uflen > medialen) {
+		ip->stats[FragFails]++;
+		ip->stats[OutDiscards]++;
+		netlog(f, Logip, "%I: unfragmentable part too big\n", eh->dst);
+		goto raise;
+	}
+
+	/* fragment data sizes must be multiples of 8, except the last */
+	flen = len - uflen;
+	seglen = (medialen - (uflen + IP6FHDR)) & ~7;
+	if(seglen < 8) {
+		ip->stats[FragFails]++;
+		ip->stats[OutDiscards]++;
+		netlog(f, Logip, "%I: seglen < 8\n", eh->dst);
+		goto raise;
+	}
+
+	lid = incref(&ip->id6);
+	fraghdr.nexthdr = nexthdr;
+	fraghdr.res = 0;
+	hnputl(fraghdr.id, lid);
+
+	/* advance xp/rp to the first byte of the fragmentable part */
+	xp = bp;
+	offset = uflen;
+	while (xp && offset && offset >= BLEN(xp)) {
+		offset -= BLEN(xp);
+		xp = xp->next;
+	}
+	xp->rp += offset;
+
+	fragoff = 0;
+	morefrags = 1;
+
+	for(; fragoff < flen; fragoff += seglen) {
+		nb = allocb(uflen + IP6FHDR + seglen);
+
+		if(fragoff + seglen >= flen) {
+			seglen = flen - fragoff;
+			morefrags = 0;
+		}
+
+		/*
+		 * the payload length covers everything after the fixed
+		 * header: any extension headers in the unfragmentable
+		 * part, the fragment header and this fragment's data.
+		 * (ip6reassemble derives the fragment data length from
+		 * it as ploadlen + IP6HDR - uflen - IP6FHDR.)
+		 */
+		hnputs(eh->ploadlen, uflen-IP6HDR + IP6FHDR + seglen);
+		memmove(nb->wp, eh, uflen);
+		nb->wp += uflen;
+
+		hnputs(fraghdr.offsetRM, fragoff); /* last 3 bits must be 0 */
+		fraghdr.offsetRM[1] |= morefrags;
+		memmove(nb->wp, &fraghdr, IP6FHDR);
+		nb->wp += IP6FHDR;
+
+		/* Copy data */
+		chunk = seglen;
+		while (chunk) {
+			if(!xp) {
+				ip->stats[OutDiscards]++;
+				ip->stats[FragFails]++;
+				freeblist(nb);
+				netlog(f, Logip, "!xp: chunk in v6%d\n", chunk);
+				goto raise;
+			}
+			blklen = chunk;
+			if(BLEN(xp) < chunk)
+				blklen = BLEN(xp);
+			memmove(nb->wp, xp->rp, blklen);
+
+			nb->wp += blklen;
+			xp->rp += blklen;
+			chunk -= blklen;
+			if(xp->rp == xp->wp)
+				xp = xp->next;
+		}
+
+		ifc->medium->bwrite(ifc, nb, V6, gate);
+		ip->stats[FragCreates]++;
+	}
+	ip->stats[FragOKs]++;
+
+raise:
+	runlock(ifc);
+	poperror();
+free:
+	freeblist(bp);
+	return rv;
+}
+
+/*
+ * Receive the IPv6 packet bp that arrived on ifc.
+ *
+ * Packets for a tentative address are dropped unless they are ICMPv6.
+ * Packets not addressed to us are forwarded through ipoput6 when
+ * ip->iprouting is set (with the hop limit decremented) and dropped
+ * otherwise.  Packets for us have their extension headers processed,
+ * are reassembled if fragmented, and are passed to the receive routine
+ * of the protocol named by the header; unknown protocols are counted
+ * and dropped.  bp is consumed on every path.
+ */
+void
+ipiput6(Fs *f, Ipifc *ifc, Block *bp)
+{
+	int hl, hop, tos, notforme, tentative;
+	uchar proto;
+	uchar v6dst[IPaddrlen];
+	IP *ip;
+	Ip6hdr *h;
+	Proto *p;
+	Route *r, *sr;
+
+	ip = f->ip;
+	ip->stats[InReceives]++;
+
+	/*
+	 *  Ensure we have all the header info in the first
+	 *  block.  Make life easier for other protocols by
+	 *  collecting up to the first 64 bytes in the first block.
+	 */
+	if(BLEN(bp) < 64) {
+		hl = blocklen(bp);
+		if(hl < IP6HDR)
+			hl = IP6HDR;
+		if(hl > 64)
+			hl = 64;
+		bp = pullupblock(bp, hl);
+		if(bp == nil)
+			return;
+	}
+
+	h = (Ip6hdr *)bp->rp;
+
+	memmove(&v6dst[0], &h->dst[0], IPaddrlen);
+	notforme = ipforme(f, v6dst) == 0;
+	tentative = iptentative(f, v6dst);
+
+	if(tentative && h->proto != ICMPv6) {
+		print("tentative addr, drop\n");
+		freeblist(bp);
+		return;
+	}
+
+	/* Check header version */
+	if(BLKIPVER(bp) != IP_VER6) {
+		ip->stats[InHdrErrors]++;
+		/* the version is the top nibble of vcf[0] */
+		netlog(f, Logip, "ip: bad version %ux\n", (h->vcf[0]&0xF0)>>4);
+		freeblist(bp);
+		return;
+	}
+
+	/* route */
+	if(notforme) {
+		if(!ip->iprouting){
+			/* bp may be a chain; free all of it */
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward to link-local destinations */
+		if(islinklocal(h->dst) ||
+		   (isv6mcast(h->dst) && (h->dst[1]&0xF) <= Link_local_scop)){
+			ip->stats[OutDiscards]++;
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward to source's network */
+		sr = v6lookup(f, h->src, nil);
+		r  = v6lookup(f, h->dst, nil);
+
+		if(r == nil || sr == r){
+			ip->stats[OutDiscards]++;
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward if packet has timed out */
+		hop = h->ttl;
+		if(hop < 1) {
+			ip->stats[InHdrErrors]++;
+			icmpttlexceeded6(f, ifc, bp);
+			freeblist(bp);
+			return;
+		}
+
+		/* process headers & reassemble if the interface expects it */
+		bp = procxtns(ip, bp, r->ifc->reassemble);
+		if(bp == nil)
+			return;
+
+		/* procxtns may have returned a different block; reload h */
+		ip->stats[ForwDatagrams]++;
+		h = (Ip6hdr *)bp->rp;
+		tos = IPV6CLASS(h);
+		hop = h->ttl;
+		ipoput6(f, bp, 1, hop-1, tos, nil);
+		return;
+	}
+
+	/* reassemble & process headers if needed */
+	bp = procxtns(ip, bp, 1);
+	if(bp == nil)
+		return;
+
+	h = (Ip6hdr *) (bp->rp);
+	proto = h->proto;
+	p = Fsrcvpcol(f, proto);
+	if(p && p->rcv) {
+		ip->stats[InDelivers]++;
+		(*p->rcv)(p, ifc, bp);
+		return;
+	}
+
+	ip->stats[InDiscards]++;
+	ip->stats[InUnknownProtos]++;
+	freeblist(bp);
+}
+
+/*
+ * ipfragfree6 - release a reassembly queue: free its blocks, take it
+ * off the active list and put it on the free list.
+ * copied from ipfragfree4 - assume hold fraglock6
+ */
+void
+ipfragfree6(IP *ip, Fragment6 *frag)
+{
+	Fragment6 **link;
+
+	/* drop any queued fragments and forget whose they were */
+	if(frag->blist != nil)
+		freeblist(frag->blist);
+	frag->blist = nil;
+	frag->id = 0;
+	memset(frag->src, 0, IPaddrlen);
+
+	/* unlink from the active list, if it is on it */
+	for(link = &ip->flisthead6; *link != nil; link = &(*link)->next)
+		if(*link == frag){
+			*link = frag->next;
+			break;
+		}
+
+	/* push onto the free list */
+	frag->next = ip->fragfree6;
+	ip->fragfree6 = frag;
+}
+
+/*
+ * ipfragallo6 - take a reassembly queue from the free list, put it at
+ * the head of the active list and give it 30000 ticks of NOW to live.
+ * copied from ipfragalloc4
+ */
+Fragment6*
+ipfragallo6(IP *ip)
+{
+	Fragment6 *f, *tail;
+
+	/* nothing free: recycle the entry at the tail of the active list */
+	while(ip->fragfree6 == nil){
+		tail = ip->flisthead6;
+		while(tail->next != nil)
+			tail = tail->next;
+		ipfragfree6(ip, tail);
+	}
+
+	/* move the first free entry to the head of the active list */
+	f = ip->fragfree6;
+	ip->fragfree6 = f->next;
+	f->next = ip->flisthead6;
+	ip->flisthead6 = f;
+
+	f->age = NOW + 30000;
+	return f;
+}
+
+/*
+ * Process the extension headers of the packet in bp.  If the
+ * unfragmentable part is followed by a fragment header and doreasm is
+ * non-zero the packet goes through ip6reassemble; nil is returned when
+ * reassembly is not yet complete (or the fragment was dropped).
+ * Otherwise returns the (possibly different) block to carry on with.
+ */
+static Block*
+procxtns(IP *ip, Block *bp, int doreasm)
+{
+	int offset;
+	uchar proto, *p, *q, *e;
+	Ip6hdr *h;
+
+	h = (Ip6hdr *)bp->rp;
+	offset = unfraglen(bp, &proto, 0);
+
+	if(proto == FH) {
+		if(doreasm != 0) {
+			bp = ip6reassemble(ip, offset, bp, h);
+			if(bp == nil)
+				return nil;
+			offset = unfraglen(bp, &proto, 0);
+		} else {
+			/*
+			 * unfraglen has replaced the next-header value that
+			 * named the fragment header, as if that header were
+			 * about to be removed.  we are passing the fragment
+			 * on untouched, so put FH back: find the last header
+			 * of the unfragmentable part and fix its first byte
+			 * (or the proto field if there are no such headers).
+			 */
+			q = &h->proto;
+			e = bp->rp + offset;
+			for(p = bp->rp + IP6HDR; p < e; p += ((int)p[1] + 1) * 8)
+				q = p;
+			*q = FH;
+		}
+	}
+
+	if(proto == DOH || offset > IP6HDR)
+		bp = procopts(bp);
+	return bp;
+}
+
+/*
+ * returns length of "Unfragmentable part", i.e., sum of lengths of ipv6 hdr,
+ * hop-by-hop & routing headers if present; *nexthdr is set to nexthdr value
+ * of the last header in the "Unfragmentable part"; if setfh != 0, nexthdr
+ * field of the last header in the "Unfragmentable part" is set to FH.
+ *
+ * only the first block of bp is examined.  a hop-by-hop or routing header
+ * that does not lie wholly within that block's data ends the walk: the
+ * length returned covers the headers before it and *nexthdr is left naming
+ * the header that did not fit (HBH or RH).
+ *
+ * side effect: when the unfragmentable part is followed by a fragment
+ * header (and setfh == 0), the nexthdr field of the last unfragmentable
+ * header is overwritten with the fragment header's own nexthdr value.
+ */
+int
+unfraglen(Block *bp, uchar *nexthdr, int setfh)
+{
+	uchar *p, *q;
+	int ufl, hs;
+
+	p = bp->rp;
+	q = p+6;   /* proto, = p+sizeof(Ip6hdr.vcf)+sizeof(Ip6hdr.ploadlen) */
+	*nexthdr = *q;
+	ufl = IP6HDR;
+	p += ufl;
+
+	while(*nexthdr == HBH || *nexthdr == RH) {
+		/* need the nexthdr and length bytes before trusting them */
+		if(bp->wp - p < 2)
+			break;
+		hs = ((int)*(p+1) + 1) * 8;
+		/* and the whole header must be inside the block's data */
+		if(bp->wp - p < hs)
+			break;
+		*nexthdr = *p;
+		ufl += hs;
+		q = p;
+		p += hs;
+	}
+
+	/* p is the fragment header; only read it if it is really there */
+	if(*nexthdr == FH && p < bp->wp)
+		*q = *p;
+	if(setfh)
+		*q = FH;
+	return ufl;
+}
+
+/*
+ * placeholder for option processing: returns bp unchanged.
+ */
+Block*
+procopts(Block *bp)
+{
+	return bp;
+}
+
+/*
+ * Add the fragment in bp to its reassembly queue.
+ *
+ * uflen is the length of the unfragmentable part (as returned by
+ * unfraglen), so the fragment header is at bp->rp + uflen; ih is the
+ * packet's Ip6hdr.  Fragments are queued by source, destination and
+ * fragment id.  Each queued block records its data offset and length
+ * in an Ipfrag kept at the block's base (see BKFG).
+ *
+ * Returns the complete packet, with the fragment header removed and
+ * the payload length corrected, once all fragments have arrived;
+ * returns nil while the packet is incomplete or when the fragment is
+ * dropped.  bp is consumed either way.
+ *
+ * NOTE(review): removing the fragment header relies on the caller
+ * having already replaced the next-header value that named it, which
+ * is what unfraglen does when it meets a fragment header.
+ */
+Block*
+ip6reassemble(IP* ip, int uflen, Block* bp, Ip6hdr* ih)
+{
+	int fend, offset, ovlap, len, fragsize, pktposn, plen;
+	uint id;
+	uchar src[IPaddrlen], dst[IPaddrlen];
+	Block *bl, **l, *last, *prev;
+	Fraghdr6 *fraghdr;
+	Fragment6 *f, *fnext;
+
+	/*
+	 *  block lists are too hard, pullupblock into a single block
+	 */
+	if(bp->next){
+		bp = pullupblock(bp, blocklen(bp));
+		if(bp == nil){
+			ip->stats[ReasmFails]++;
+			return nil;
+		}
+		ih = (Ip6hdr *)bp->rp;
+	}
+
+	/*
+	 *  the header's payload length is used below to size the
+	 *  fragment's data; make sure it leaves room for the fragment
+	 *  header and does not claim more than the block holds.
+	 */
+	plen = nhgets(ih->ploadlen);
+	if(uflen + IP6FHDR > IP6HDR + plen || IP6HDR + plen > BLEN(bp)){
+		ip->stats[ReasmFails]++;
+		freeblist(bp);
+		return nil;
+	}
+
+	/* taken after the pullup, which may have moved the data */
+	fraghdr = (Fraghdr6 *)(bp->rp + uflen);
+	memmove(src, ih->src, IPaddrlen);
+	memmove(dst, ih->dst, IPaddrlen);
+	id = nhgetl(fraghdr->id);
+	offset = nhgets(fraghdr->offsetRM) & ~7;
+
+	qlock(&ip->fraglock6);
+
+	/*
+	 *  find a reassembly queue for this fragment
+	 *  (expiring stale queues met on the way)
+	 */
+	for(f = ip->flisthead6; f; f = fnext){
+		fnext = f->next;
+		if(ipcmp(f->src, src)==0 && ipcmp(f->dst, dst)==0 && f->id == id)
+			break;
+		if(f->age < NOW){
+			ip->stats[ReasmTimeout]++;
+			ipfragfree6(ip, f);
+		}
+	}
+
+	/*
+	 *  if this isn't a fragmented packet, accept it
+	 *  and get rid of any fragments that might go
+	 *  with it.
+	 */
+	if(nhgets(fraghdr->offsetRM) == 0) {	/* 1st frag is also last */
+		if(f) {
+			ipfragfree6(ip, f);
+			ip->stats[ReasmFails]++;
+		}
+		qunlock(&ip->fraglock6);
+
+		/*
+		 * remove the fragment header, as is done for the first
+		 * fragment of a reassembled packet, so that what follows
+		 * the unfragmentable part is the upper-layer header.
+		 */
+		memmove(bp->rp + IP6FHDR, bp->rp, uflen);
+		bp->rp += IP6FHDR;
+		ih = (Ip6hdr *)bp->rp;
+		hnputs(ih->ploadlen, plen - IP6FHDR);
+		return bp;
+	}
+
+	/* make room at the base of the block for the Ipfrag bookkeeping */
+	if(bp->base+sizeof(Ipfrag) >= bp->rp){
+		bp = padblock(bp, sizeof(Ipfrag));
+		bp->rp += sizeof(Ipfrag);
+	}
+
+	/* use plen, not ih: padblock may have replaced the block */
+	BKFG(bp)->foff = offset;
+	BKFG(bp)->flen = plen + IP6HDR - uflen - IP6FHDR;
+
+	/* First fragment allocates a reassembly queue */
+	if(f == nil) {
+		f = ipfragallo6(ip);
+		f->id = id;
+		memmove(f->src, src, IPaddrlen);
+		memmove(f->dst, dst, IPaddrlen);
+
+		f->blist = bp;
+
+		qunlock(&ip->fraglock6);
+		ip->stats[ReasmReqds]++;
+		return nil;
+	}
+
+	/*
+	 *  find the new fragment's position in the queue
+	 */
+	prev = nil;
+	l = &f->blist;
+	bl = f->blist;
+	while(bl != nil && BKFG(bp)->foff > BKFG(bl)->foff) {
+		prev = bl;
+		l = &bl->next;
+		bl = bl->next;
+	}
+
+	/* Check overlap of a previous fragment - trim away as necessary */
+	if(prev) {
+		ovlap = BKFG(prev)->foff + BKFG(prev)->flen - BKFG(bp)->foff;
+		if(ovlap > 0) {
+			if(ovlap >= BKFG(bp)->flen) {
+				freeblist(bp);
+				qunlock(&ip->fraglock6);
+				return nil;
+			}
+			BKFG(prev)->flen -= ovlap;
+		}
+	}
+
+	/* Link onto assembly queue */
+	bp->next = *l;
+	*l = bp;
+
+	/* Check to see if succeeding segments overlap */
+	if(bp->next) {
+		l = &bp->next;
+		fend = BKFG(bp)->foff + BKFG(bp)->flen;
+
+		/* Take completely covered segments out */
+		while(*l) {
+			ovlap = fend - BKFG(*l)->foff;
+			if(ovlap <= 0)
+				break;
+			if(ovlap < BKFG(*l)->flen) {
+				BKFG(*l)->flen -= ovlap;
+				BKFG(*l)->foff += ovlap;
+				/*
+				 * move up the headers: the unfragmentable
+				 * part and the fragment header, whose M bit
+				 * is still examined below.
+				 */
+				memmove((*l)->rp + ovlap, (*l)->rp, uflen + IP6FHDR);
+				(*l)->rp += ovlap;
+				break;
+			}
+			last = (*l)->next;
+			(*l)->next = nil;
+			freeblist(*l);
+			*l = last;
+		}
+	}
+
+	/*
+	 *  look for a complete packet.  if we get to a fragment
+	 *  with the trailing bit of fraghdr->offsetRM[1] set, we're done.
+	 */
+	pktposn = 0;
+	for(bl = f->blist; bl && BKFG(bl)->foff == pktposn; bl = bl->next) {
+		fraghdr = (Fraghdr6 *)(bl->rp + uflen);
+		if((fraghdr->offsetRM[1] & 1) == 0) {
+			bl = f->blist;
+
+			/* get rid of frag header in first fragment */
+			memmove(bl->rp + IP6FHDR, bl->rp, uflen);
+			bl->rp += IP6FHDR;
+			len = nhgets(((Ip6hdr*)bl->rp)->ploadlen) - IP6FHDR;
+			bl->wp = bl->rp + len + IP6HDR;
+			/*
+			 * Pullup all the fragment headers and
+			 * return a complete packet
+			 */
+			for(bl = bl->next; bl; bl = bl->next) {
+				fragsize = BKFG(bl)->flen;
+				len += fragsize;
+				bl->rp += uflen + IP6FHDR;
+				bl->wp = bl->rp + fragsize;
+			}
+
+			/* detach the blocks before the queue is freed */
+			bl = f->blist;
+			f->blist = nil;
+			ipfragfree6(ip, f);
+			ih = (Ip6hdr*)bl->rp;
+			hnputs(ih->ploadlen, len);
+			qunlock(&ip->fraglock6);
+			ip->stats[ReasmOKs]++;
+			return bl;
+		}
+		pktposn += BKFG(bl)->flen;
+	}
+	qunlock(&ip->fraglock6);
+	return nil;
+}

+ 194 - 0
sys/src/9/ip/ipv6.h

@@ -0,0 +1,194 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Internet Protocol Version 6
+ *
+ * rfc2460 defines the protocol.
+ * rfc4291 defines the address prefices.
+ *
+ * global unicast is anything but unspecified (::), loopback (::1),
+ * multicast (ff00::/8), and link-local unicast (fe80::/10).
+ *
+ * site-local (fec0::/10) is now deprecated by rfc3879.
+ *
+ * Unique Local IPv6 Unicast Addresses are defined by rfc4193.
+ * prefix is fc00::/7, scope is global, routing is limited to roughly a site.
+ */
+#define isv6mcast(addr)	  ((addr)[0] == 0xff)
+#define islinklocal(addr) ((addr)[0] == 0xfe && ((addr)[1] & 0xc0) == 0x80)
+
+#define optexsts(np)	(nhgets((np)->ploadlen) > 24)
+#define issmcast(addr)	(memcmp((addr), v6solicitednode, 13) == 0)
+
+#ifndef MIN
+#define MIN(a, b) ((a) <= (b)? (a): (b))
+#endif
+
+enum {				/* Header Types */
+	HBH		= 0,	/* hop-by-hop multicast routing protocol */
+	ICMP		= 1,
+	IGMP		= 2,
+	GGP		= 3,
+	IPINIP		= 4,
+	ST		= 5,
+	TCP		= 6,
+	UDP		= 17,
+	ISO_TP4		= 29,
+	RH		= 43,
+	FH		= 44,
+	IDRP		= 45,
+	RSVP		= 46,
+	AH		= 51,
+	ESP		= 52,
+	ICMPv6		= 58,
+	NNH		= 59,
+	DOH		= 60,
+	ISO_IP		= 80,
+	IGRP		= 88,
+	OSPF		= 89,
+
+	Maxhdrtype	= 256,
+};
+
+enum {
+	/* multicast flags and scopes */
+
+//	Well_known_flg	= 0,
+//	Transient_flg	= 1,
+
+//	Interface_local_scop = 1,
+	Link_local_scop	= 2,
+//	Site_local_scop	= 5,
+//	Org_local_scop	= 8,
+	Global_scop	= 14,
+
+	/* various prefix lengths */
+	SOLN_PREF_LEN	= 13,
+
+	/* icmpv6 unreach codes */
+	icmp6_no_route		= 0,
+	icmp6_ad_prohib		= 1,
+	icmp6_unassigned	= 2,
+	icmp6_adr_unreach	= 3,
+	icmp6_port_unreach	= 4,
+	icmp6_unkn_code		= 5,
+
+	/* various flags & constants */
+	v6MINTU		= 1280,
+	HOP_LIMIT	= 255,
+	ETHERHDR_LEN	= 14,
+	IPV6HDR_LEN	= 40,
+	IPV4HDR_LEN	= 20,
+
+	/* option types */
+
+	SRC_LLADDR	= 1,
+	TARGET_LLADDR	= 2,
+	PREFIX_INFO	= 3,
+	REDIR_HEADER	= 4,
+	MTU_OPTION	= 5,
+
+	SRC_UNSPEC	= 0,
+	SRC_UNI		= 1,
+	TARG_UNI	= 2,
+	TARG_MULTI	= 3,
+
+	Tunitent	= 1,
+	Tuniproxy	= 2,
+	Tunirany	= 3,
+
+	/* Router constants (all times in milliseconds) */
+	MAX_INIT_RTR_ADVERT_INTVL = 16000,
+	MAX_INIT_RTR_ADVERTS	= 3,
+	MAX_FINAL_RTR_ADVERTS	= 3,
+	MIN_DELAY_BETWEEN_RAS	= 3000,
+	MAX_RA_DELAY_TIME	= 500,
+
+	/* Host constants */
+	MAX_RTR_SOLICIT_DELAY	= 1000,
+	RTR_SOLICIT_INTVL	= 4000,
+	MAX_RTR_SOLICITS	= 3,
+
+	/* Node constants */
+	MAX_MULTICAST_SOLICIT	= 3,
+	MAX_UNICAST_SOLICIT	= 3,
+	MAX_ANYCAST_DELAY_TIME	= 1000,
+	MAX_NEIGHBOR_ADVERT	= 3,
+	REACHABLE_TIME		= 30000,
+	RETRANS_TIMER		= 1000,
+	DELAY_FIRST_PROBE_TIME	= 5000,
+};
+
+typedef struct Ip6hdr	Ip6hdr;
+typedef struct Opthdr	Opthdr;
+typedef struct Routinghdr Routinghdr;
+typedef struct Fraghdr6	Fraghdr6;
+
+struct	Ip6hdr {
+	uchar	vcf[4];		/* version:4, traffic class:8, flow label:20 */
+	uchar	ploadlen[2];	/* payload length: packet length - 40 */
+	uchar	proto;		/* next header type */
+	uchar	ttl;		/* hop limit */
+	uchar	src[IPaddrlen];
+	uchar	dst[IPaddrlen];
+};
+
+struct	Opthdr {
+	uchar	nexthdr;
+	uchar	len;
+};
+
+struct	Routinghdr {
+	uchar	nexthdr;
+	uchar	len;
+	uchar	rtetype;
+	uchar	segrem;
+};
+
+struct	Fraghdr6 {
+	uchar	nexthdr;
+	uchar	res;
+	uchar	offsetRM[2];	/* Offset, Res, M flag */
+	uchar	id[4];
+};
+
+extern uchar v6allnodesN[IPaddrlen];
+extern uchar v6allnodesL[IPaddrlen];
+extern uchar v6allroutersN[IPaddrlen];
+extern uchar v6allroutersL[IPaddrlen];
+extern uchar v6allnodesNmask[IPaddrlen];
+extern uchar v6allnodesLmask[IPaddrlen];
+extern uchar v6allroutersS[IPaddrlen];
+extern uchar v6solicitednode[IPaddrlen];
+extern uchar v6solicitednodemask[IPaddrlen];
+extern uchar v6Unspecified[IPaddrlen];
+extern uchar v6loopback[IPaddrlen];
+extern uchar v6loopbackmask[IPaddrlen];
+extern uchar v6linklocal[IPaddrlen];
+extern uchar v6linklocalmask[IPaddrlen];
+extern uchar v6glunicast[IPaddrlen];
+extern uchar v6multicast[IPaddrlen];
+extern uchar v6multicastmask[IPaddrlen];
+
+extern int v6llpreflen;
+extern int v6lbpreflen;
+extern int v6mcpreflen;
+extern int v6snpreflen;
+extern int v6aNpreflen;
+extern int v6aLpreflen;
+
+extern int ReTransTimer;
+
+void ipv62smcast(uchar *, uchar *);
+void icmpns(Fs *f, uchar* src, int suni, uchar* targ, int tuni, uchar* mac);
+void icmpna(Fs *f, uchar* src, uchar* dst, uchar* targ, uchar* mac, uchar flags);
+void icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp);
+void icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp);
+void icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free);

+ 129 - 0
sys/src/9/ip/loopbackmedium.c

@@ -0,0 +1,129 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+enum
+{
+	Maxtu=	16*1024,
+};
+
+typedef struct LB LB;
+struct LB
+{
+	Proc	*readp;
+	Queue	*q;
+	Fs	*f;
+};
+
+static void loopbackread(void *a);
+
+/*
+ *  bind the loopback medium to ifc: allocate its state and queue,
+ *  then start the kproc that feeds queued packets back into IP.
+ */
+static void
+loopbackbind(Ipifc *ifc, int, char**)
+{
+	LB *lb;
+
+	lb = smalloc(sizeof(LB));
+	lb->q = qopen(1024*1024, Qmsg, nil, nil);
+	lb->f = ifc->conv->p->f;
+
+	ifc->mbps = 10001;
+	ifc->arg = lb;
+
+	/* the reader finds lb through ifc->arg, so start it last */
+	kproc("loopbackread", loopbackread, ifc);
+}
+
+/*
+ *  undo loopbackbind: stop the reader kproc, then free the queue
+ *  and the LB state.
+ */
+static void
+loopbackunbind(Ipifc *ifc)
+{
+	LB *lb = ifc->arg;
+
+	/* the note makes the reader take its error path and exit */
+	if(lb->readp)
+		postnote(lb->readp, 1, "unbind", NUser);
+
+	/* wait for reader to die */
+	while(lb->readp != 0)
+		tsleep(&up->sleep, return0, 0, 300);
+
+	/* clean up */
+	qfree(lb->q);
+	free(lb);
+}
+
+/*
+ *  "transmit" bp by passing it to the loopback queue, from which
+ *  loopbackread picks it up.  a refused block counts as an output error.
+ */
+static void
+loopbackbwrite(Ipifc *ifc, Block *bp, int, uchar*)
+{
+	LB *lb = ifc->arg;
+	int r;
+
+	r = qpass(lb->q, bp);
+	if(r < 0)
+		ifc->outerr++;
+	ifc->out++;
+}
+
+/*
+ *  kproc body: read blocks from the loopback queue and hand them to
+ *  ipiput4 as received packets.  a is the Ipifc.  exits through its
+ *  outer error handler, which clears lb->readp so that
+ *  loopbackunbind can tell the reader has gone.
+ */
+static void
+loopbackread(void *a)
+{
+	Ipifc *ifc;
+	Block *bp;
+	LB *lb;
+
+	ifc = a;
+	lb = ifc->arg;
+	lb->readp = up;	/* hide identity under a rock for unbind */
+	if(waserror()){
+		lb->readp = 0;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		bp = qbread(lb->q, Maxtu);
+		if(bp == nil)
+			continue;
+		ifc->in++;
+		/* interface is busy (write-locked): drop the packet */
+		if(!canrlock(ifc)){
+			freeb(bp);
+			continue;
+		}
+		/* make sure the read lock is released if input raises an error */
+		if(waserror()){
+			runlock(ifc);
+			nexterror();
+		}
+		/* no addresses configured on the interface: nothing to deliver to */
+		if(ifc->lifc == nil)
+			freeb(bp);
+		else
+			ipiput4(lb->f, ifc, bp);
+		runlock(ifc);
+		poperror();
+	}
+}
+
+Medium loopbackmedium =
+{
+.hsize=		0,
+.mintu=		0,
+.maxtu=		Maxtu,
+.maclen=	0,
+.name=		"loopback",
+.bind=		loopbackbind,
+.unbind=	loopbackunbind,
+.bwrite=	loopbackbwrite,
+};
+
+/*
+ *  register the loopback medium with the IP stack
+ */
+void
+loopbackmediumlink(void)
+{
+	addipmedium(&loopbackmedium);
+}

+ 162 - 0
sys/src/9/ip/netdevmedium.c

@@ -0,0 +1,162 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+static void	netdevbind(Ipifc *ifc, int argc, char **argv);
+static void	netdevunbind(Ipifc *ifc);
+static void	netdevbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip);
+static void	netdevread(void *a);
+
+typedef struct	Netdevrock Netdevrock;
+struct Netdevrock
+{
+	Fs	*f;		/* file system we belong to */
+	Proc	*readp;		/* reading process */
+	Chan	*mchan;		/* Data channel */
+};
+
+Medium netdevmedium =
+{
+.name=		"netdev",
+.hsize=		0,
+.mintu=	0,
+.maxtu=	64000,
+.maclen=	0,
+.bind=		netdevbind,
+.unbind=	netdevunbind,
+.bwrite=	netdevbwrite,
+.unbindonclose=	0,
+};
+
+/*
+ *  called to bind an IP ifc to a generic network device
+ *  called with ifc qlock'd
+ *
+ *  argv[2] names the device file to open for reading and writing,
+ *  so at least three arguments are required; raises Ebadarg otherwise.
+ */
+static void
+netdevbind(Ipifc *ifc, int argc, char **argv)
+{
+	Chan *mchan;
+	Netdevrock *er;
+
+	/* argv[2] is used below */
+	if(argc < 3)
+		error(Ebadarg);
+
+	mchan = namec(argv[2], Aopen, ORDWR, 0);
+
+	er = smalloc(sizeof(*er));
+	er->mchan = mchan;
+	er->f = ifc->conv->p->f;
+
+	ifc->arg = er;
+
+	/* the reader finds er through ifc->arg, so start it last */
+	kproc("netdevread", netdevread, ifc);
+}
+
+/*
+ *  undo netdevbind: stop the reader kproc, close the device
+ *  channel and free the rock.
+ *  called with ifc wlock'd
+ */
+static void
+netdevunbind(Ipifc *ifc)
+{
+	Netdevrock *er = ifc->arg;
+
+	/* the note makes the reader take its error path and exit */
+	if(er->readp != nil)
+		postnote(er->readp, 1, "unbind", NUser);
+
+	/* wait for readers to die */
+	while(er->readp != nil)
+		tsleep(&up->sleep, return0, 0, 300);
+
+	if(er->mchan != nil)
+		cclose(er->mchan);
+
+	free(er);
+}
+
+/*
+ *  called by ipoput with a single packet to write: flatten it into
+ *  one block, pad it up to the interface's minimum size and hand it
+ *  to the device behind the data channel.
+ */
+static void
+netdevbwrite(Ipifc *ifc, Block *bp, int, uchar*)
+{
+	Netdevrock *er;
+	Chan *mc;
+
+	if(bp->next != nil)
+		bp = concatblock(bp);
+	if(ifc->mintu > BLEN(bp))
+		bp = adjustblock(bp, ifc->mintu);
+
+	er = ifc->arg;
+	mc = er->mchan;
+	mc->dev->bwrite(mc, bp, 0);
+	ifc->out++;
+}
+
+/*
+ *  process to read from the device.  a is the Ipifc.  each block
+ *  read from the data channel is handed to ipiput4.  exits through
+ *  its outer error handler (clearing er->readp for netdevunbind), or
+ *  after unbinding the interface itself when the device returns nil.
+ */
+static void
+netdevread(void *a)
+{
+	Ipifc *ifc;
+	Block *bp;
+	Netdevrock *er;
+	char *argv[1];
+
+	ifc = a;
+	er = ifc->arg;
+	er->readp = up;	/* hide identity under a rock for unbind */
+	if(waserror()){
+		er->readp = nil;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		bp = er->mchan->dev->bread(er->mchan, ifc->maxtu, 0);
+		if(bp == nil){
+			/*
+			 * get here if mchan is a pipe and other side hangs up
+			 * clean up this interface & get out
+			 * ZZZ is this a good idea?
+			 */
+			poperror();
+			/* clear readp first so the unbind does not wait for us */
+			er->readp = nil;
+			argv[0] = "unbind";
+			if(!waserror())
+				ifc->conv->p->ctl(ifc->conv, argv, 1);
+			pexit("hangup", 1);
+		}
+		/* interface is busy (write-locked): drop the packet */
+		if(!canrlock(ifc)){
+			freeb(bp);
+			continue;
+		}
+		/* make sure the read lock is released if input raises an error */
+		if(waserror()){
+			runlock(ifc);
+			nexterror();
+		}
+		ifc->in++;
+		/* no addresses configured on the interface: nothing to deliver to */
+		if(ifc->lifc == nil)
+			freeb(bp);
+		else
+			ipiput4(er->f, ifc, bp);
+		runlock(ifc);
+		poperror();
+	}
+}
+
+/*
+ *  register the netdev medium with the IP stack
+ */
+void
+netdevmediumlink(void)
+{
+	addipmedium(&netdevmedium);
+}

+ 272 - 0
sys/src/9/ip/netlog.c

@@ -0,0 +1,272 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"../ip/ip.h"
+
+enum {
+	Nlog		= 4*1024,
+};
+
+/*
+ *  action log: a circular buffer of formatted messages.
+ *  netlog() appends (discarding the oldest bytes when full) and
+ *  netlogread() consumes from rptr.
+ */
+struct Netlog {
+	Lock;				/* guards the buffer fields and opens */
+	int	opens;			/* number of current opens; buf exists while > 0 */
+	char*	buf;			/* Nlog bytes, allocated on first open */
+	char	*end;			/* buf + Nlog */
+	char	*rptr;			/* oldest unread byte */
+	int	len;			/* number of unread bytes */
+
+	int	logmask;			/* mask of things to debug */
+	uchar	iponly[IPaddrlen];		/* ip address to print debugging for */
+	int	iponlyset;		/* non-zero once "only" has set a real address */
+
+	QLock;				/* serializes readers in netlogread */
+	Rendez;				/* readers sleep here until len != 0 */
+};
+
+typedef struct Netlogflag {
+	char*	name;
+	int	mask;
+} Netlogflag;
+
+static Netlogflag flags[] =
+{
+	{ "ppp",	Logppp, },
+	{ "ip",		Logip, },
+	{ "fs",		Logfs, },
+	{ "tcp",	Logtcp, },
+	{ "il",		Logil, },
+	{ "icmp",	Logicmp, },
+	{ "udp",	Logudp, },
+	{ "compress",	Logcompress, },
+	{ "ilmsg",	Logil|Logilmsg, },
+	{ "gre",	Loggre, },
+	{ "tcpwin",	Logtcp|Logtcpwin, },
+	{ "tcprxmt",	Logtcp|Logtcprxmt, },
+	{ "udpmsg",	Logudp|Logudpmsg, },
+	{ "ipmsg",	Logip|Logipmsg, },
+	{ "esp",	Logesp, },
+	{ nil,		0, },
+};
+
+char Ebadnetctl[] = "too few arguments for netlog control message";
+
+enum
+{
+	CMset,
+	CMclear,
+	CMonly,
+};
+
+static
+Cmdtab routecmd[] = {
+	CMset,		"set",		0,
+	CMclear,	"clear",	0,
+	CMonly,		"only",		0,
+};
+
+/*
+ *  allocate the log state for f; the buffer itself is allocated
+ *  by netlogopen.
+ */
+void
+netloginit(Fs *f)
+{
+	f->alog = smalloc(sizeof(Netlog));
+}
+
+/*
+ *  note another open of the log.  the first open allocates the
+ *  buffer and starts with an empty log; raises Enomem if the buffer
+ *  cannot be allocated.
+ */
+void
+netlogopen(Fs *f)
+{
+	lock(f->alog);
+	if(waserror()){
+		unlock(f->alog);
+		nexterror();
+	}
+	if(f->alog->opens == 0){
+		if(f->alog->buf == nil)
+			f->alog->buf = malloc(Nlog);
+		/* without a buffer netlog() would write through a nil pointer */
+		if(f->alog->buf == nil)
+			error(Enomem);
+		f->alog->rptr = f->alog->buf;
+		f->alog->end = f->alog->buf + Nlog;
+		/* anything counted from before the last close is gone */
+		f->alog->len = 0;
+	}
+	f->alog->opens++;
+	unlock(f->alog);
+	poperror();
+}
+
+/*
+ *  drop one open of the log; the last close frees the buffer.
+ */
+void
+netlogclose(Fs *f)
+{
+	Netlog *nl;
+
+	nl = f->alog;
+	lock(nl);
+	if(waserror()){
+		unlock(nl);
+		nexterror();
+	}
+	if(--nl->opens == 0){
+		free(nl->buf);
+		nl->buf = nil;
+	}
+	unlock(nl);
+	poperror();
+}
+
+/*
+ *  sleep condition for netlogread: non-zero when the log of the
+ *  Fs passed in a holds unread bytes.
+ */
+static int
+netlogready(void *a)
+{
+	return ((Fs*)a)->alog->len;
+}
+
+/*
+ *  read up to n bytes of log into a, sleeping until there is
+ *  something to read.  the offset argument is ignored.  returns the
+ *  number of bytes copied.
+ */
+long
+netlogread(Fs *f, void *a, ulong, long n)
+{
+	int i, d;
+	char *p, *rptr;
+
+	qlock(f->alog);
+	if(waserror()){
+		qunlock(f->alog);
+		nexterror();
+	}
+
+	for(;;){
+		lock(f->alog);
+		if(f->alog->len){
+			if(n > f->alog->len)
+				n = f->alog->len;
+			/* d is the part of the read that wraps to the start of buf */
+			d = 0;
+			rptr = f->alog->rptr;
+			f->alog->rptr += n;
+			if(f->alog->rptr >= f->alog->end){
+				d = f->alog->rptr - f->alog->end;
+				f->alog->rptr = f->alog->buf + d;
+			}
+			f->alog->len -= n;
+			unlock(f->alog);
+
+			/* copy out after dropping the lock: tail piece, then wrapped piece */
+			i = n-d;
+			p = a;
+			memmove(p, rptr, i);
+			memmove(p+i, f->alog->buf, d);
+			break;
+		}
+		else
+			unlock(f->alog);
+
+		sleep(f->alog, netlogready, f);
+	}
+
+	qunlock(f->alog);
+	poperror();
+
+	return n;
+}
+
+/*
+ *  handle a control message (n bytes at s) for the log:
+ *	set flag ...	turn on logging for the named flags
+ *	clear flag ...	turn it off
+ *	only addr	record addr in iponly; the unspecified address
+ *			clears iponlyset
+ *  flag names not in the flags table are ignored.  raises Ebadnetctl
+ *  when there are fewer than two fields.
+ */
+void
+netlogctl(Fs *f, char* s, int n)
+{
+	int i, set;
+	Netlogflag *fp;
+	Cmdbuf *cb;
+	Cmdtab *ct;
+
+	cb = parsecmd(s, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+
+	if(cb->nf < 2)
+		error(Ebadnetctl);
+
+	ct = lookupcmd(cb, routecmd, nelem(routecmd));
+
+	SET(set);
+
+	switch(ct->index){
+	case CMset:
+		set = 1;
+		break;
+
+	case CMclear:
+		set = 0;
+		break;
+
+	case CMonly:
+		parseip(f->alog->iponly, cb->f[1]);
+		if(ipcmp(f->alog->iponly, IPnoaddr) == 0)
+			f->alog->iponlyset = 0;
+		else
+			f->alog->iponlyset = 1;
+		/* leaving early: the waserror above must still be popped */
+		poperror();
+		free(cb);
+		return;
+
+	default:
+		cmderror(cb, "unknown ip control message");
+	}
+
+	for(i = 1; i < cb->nf; i++){
+		for(fp = flags; fp->name; fp++)
+			if(strcmp(fp->name, cb->f[i]) == 0)
+				break;
+		if(fp->name == nil)
+			continue;
+		if(set)
+			f->alog->logmask |= fp->mask;
+		else
+			f->alog->logmask &= ~fp->mask;
+	}
+
+	free(cb);
+	poperror();
+}
+
+/*
+ *  append a formatted message to the log of f if any of the bits
+ *  in mask is enabled in logmask and the log is open.  messages are
+ *  limited to the size of the local buffer; when the log is full the
+ *  oldest bytes are discarded to make room.  wakes any reader.
+ */
+void
+netlog(Fs *f, int mask, char *fmt, ...)
+{
+	char buf[128], *t, *fp;
+	int i, n;
+	va_list arg;
+
+	if(!(f->alog->logmask & mask))
+		return;
+
+	/* cheap unlocked test; repeated under the lock below */
+	if(f->alog->opens == 0)
+		return;
+
+	va_start(arg, fmt);
+	n = vseprint(buf, buf+sizeof(buf), fmt, arg) - buf;
+	va_end(arg);
+
+	lock(f->alog);
+	/*
+	 * netlogclose frees the buffer under this lock, so the log may
+	 * have been closed since the test above.
+	 */
+	if(f->alog->opens == 0 || f->alog->buf == nil){
+		unlock(f->alog);
+		return;
+	}
+	/* i is the number of old bytes that must go to fit n new ones */
+	i = f->alog->len + n - Nlog;
+	if(i > 0){
+		f->alog->len -= i;
+		f->alog->rptr += i;
+		if(f->alog->rptr >= f->alog->end)
+			f->alog->rptr = f->alog->buf + (f->alog->rptr - f->alog->end);
+	}
+	/* t is the write position; it wraps inside the copy loop */
+	t = f->alog->rptr + f->alog->len;
+	fp = buf;
+	f->alog->len += n;
+	while(n-- > 0){
+		if(t >= f->alog->end)
+			t = f->alog->buf + (t - f->alog->end);
+		*t++ = *fp++;
+	}
+	unlock(f->alog);
+
+	wakeup(f->alog);
+}

+ 48 - 0
sys/src/9/ip/nullmedium.c

@@ -0,0 +1,48 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+/*
+ *  the null medium cannot be bound: always raises an error
+ */
+static void
+nullbind(Ipifc*, int, char**)
+{
+	error("cannot bind null device");
+}
+
+/*
+ *  nothing to undo
+ */
+static void
+nullunbind(Ipifc*)
+{
+}
+
+/*
+ *  the null medium cannot send: always raises an error
+ */
+static void
+nullbwrite(Ipifc*, Block*, int, uchar*)
+{
+	error("nullbwrite");
+}
+
+Medium nullmedium =
+{
+.name=		"null",
+.bind=		nullbind,
+.unbind=	nullunbind,
+.bwrite=	nullbwrite,
+};
+
+/*
+ *  register the null medium with the IP stack
+ */
+void
+nullmediumlink(void)
+{
+	addipmedium(&nullmedium);
+}

+ 88 - 0
sys/src/9/ip/pktmedium.c

@@ -0,0 +1,88 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+
+/* entry points of the pkt medium, defined below */
+static void	pktbind(Ipifc*, int, char**);
+static void	pktunbind(Ipifc*);
+static void	pktbwrite(Ipifc*, Block*, int, uchar*);
+static void	pktin(Fs*, Ipifc*, Block*);
+
+/*
+ *  medium named "pkt": outgoing packets are queued on the owning
+ *  conversation (see pktbwrite) and packets handed to pktin are
+ *  passed to ipiput4
+ */
+Medium pktmedium =
+{
+.name=		"pkt",
+.hsize=		14,
+.mintu=		40,
+.maxtu=		4*1024,
+.maclen=	6,
+.bind=		pktbind,
+.unbind=	pktunbind,
+.bwrite=	pktbwrite,
+.pktin=		pktin,
+.unbindonclose=	1,
+};
+
+/*
+ *  called to bind an IP ifc to the pkt medium; there is nothing to set up
+ *  called with ifc wlock'd
+ */
+static void
+pktbind(Ipifc*, int, char**)
+{
+}
+
+/*
+ *  called to unbind an IP ifc from the pkt medium; nothing to undo
+ *  called with ifc wlock'd
+ */
+static void
+pktunbind(Ipifc*)
+{
+}
+
+/*
+ *  called by ipoput with a single packet to write.
+ *  The packet is flattened with concatblock and queued on the read
+ *  queue of the conversation owning the interface; when snoopers are
+ *  present a copy is queued on the snoop queue first.
+ */
+static void
+pktbwrite(Ipifc *ifc, Block *bp, int, uchar*)
+{
+	Conv *cv;
+	Block *pkt;
+
+	cv = ifc->conv;
+	pkt = concatblock(bp);
+	if(cv->snoopers.ref > 0)
+		qpass(cv->sq, copyblock(pkt, BLEN(pkt)));
+
+	/* enqueue onto the conversation's rq */
+	qpass(cv->rq, pkt);
+}
+
+/*
+ *  called with ifc rlocked when someone writes to 'data'.
+ *  The block is dropped when the interface has no local address
+ *  (lifc is nil); otherwise it is copied to any snoopers and
+ *  handed to ipiput4.
+ */
+static void
+pktin(Fs *f, Ipifc *ifc, Block *bp)
+{
+	if(ifc->lifc == nil){
+		freeb(bp);
+		return;
+	}
+
+	if(ifc->conv->snoopers.ref > 0)
+		qpass(ifc->conv->sq, copyblock(bp, BLEN(bp)));
+	ipiput4(f, ifc, bp);
+}
+
+/*
+ *  hand the pkt medium to addipmedium
+ */
+void
+pktmediumlink(void)
+{
+	addipmedium(&pktmedium);
+}

+ 81 - 0
sys/src/9/ip/ptclbsum.c

@@ -0,0 +1,81 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"ip.h"
+
+/*
+ *  run-time byte order probe: the first byte of a short holding 1
+ *  is non-zero only on a little-endian machine
+ */
+static	short	endian	= 1;
+static	uchar*	aendian	= (uchar*)&endian;
+#define	LITTLE	*aendian
+
+/*
+ *  Sum len bytes at addr as 16-bit quantities and return the sum
+ *  folded to 16 bits with end-around carry.  The result is not
+ *  complemented here.
+ *
+ *  Aligned 16-bit words are accumulated in mdsum in host byte order;
+ *  stray leading/trailing bytes go to hisum or losum.  At the end
+ *  hisum is byte-swapped into losum, so mdsum is steered to whichever
+ *  of the two gives the right byte order for this machine and for the
+ *  parity of the starting address.
+ */
+ushort
+ptclbsum(uchar *addr, int len)
+{
+	ulong losum, hisum, mdsum, x;
+	ulong t1, t2;
+
+	losum = 0;
+	hisum = 0;
+	mdsum = 0;
+
+	/* x records that addr started on an odd address */
+	x = 0;
+	if(PTR2UINT(addr) & 1) {
+		if(len) {
+			/* consume one byte so the ushort loads below are aligned */
+			hisum += addr[0];
+			len--;
+			addr++;
+		}
+		x = 1;
+	}
+	/* unrolled: eight 16-bit words per iteration */
+	while(len >= 16) {
+		t1 = *(ushort*)(addr+0);
+		t2 = *(ushort*)(addr+2);	mdsum += t1;
+		t1 = *(ushort*)(addr+4);	mdsum += t2;
+		t2 = *(ushort*)(addr+6);	mdsum += t1;
+		t1 = *(ushort*)(addr+8);	mdsum += t2;
+		t2 = *(ushort*)(addr+10);	mdsum += t1;
+		t1 = *(ushort*)(addr+12);	mdsum += t2;
+		t2 = *(ushort*)(addr+14);	mdsum += t1;
+		mdsum += t2;
+		len -= 16;
+		addr += 16;
+	}
+	/* remaining whole words */
+	while(len >= 2) {
+		mdsum += *(ushort*)addr;
+		len -= 2;
+		addr += 2;
+	}
+	if(x) {
+		/* odd start: the roles of losum and hisum are exchanged */
+		if(len)
+			losum += addr[0];
+		if(LITTLE)
+			losum += mdsum;
+		else
+			hisum += mdsum;
+	} else {
+		if(len)
+			hisum += addr[0];
+		if(LITTLE)
+			hisum += mdsum;
+		else
+			losum += mdsum;
+	}
+
+	/* add hisum into losum with its two low bytes exchanged */
+	losum += hisum >> 8;
+	losum += (hisum & 0xff) << 8;
+	/* fold carries above bit 15 back in until none remain */
+	while(hisum = losum>>16)
+		losum = hisum + (losum & 0xffff);
+
+	return losum & 0xffff;
+}

+ 3264 - 0
sys/src/9/ip/tcp.c

@@ -0,0 +1,3264 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+
+/*
+ *  sizes, timer values, header flag bits, option codes, Tcpctl flag
+ *  bits and connection states used throughout this file
+ */
+enum
+{
+	QMAX		= 64*1024-1,
+	IP_TCPPROTO	= 6,
+
+	TCP4_IPLEN	= 8,
+	TCP4_PHDRSIZE	= 12,
+	TCP4_HDRSIZE	= 20,
+	TCP4_TCBPHDRSZ	= 40,
+	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,
+
+	TCP6_IPLEN	= 0,
+	TCP6_PHDRSIZE	= 40,
+	TCP6_HDRSIZE	= 20,
+	TCP6_TCBPHDRSZ	= 60,
+	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,
+
+	TcptimerOFF	= 0,
+	TcptimerON	= 1,
+	TcptimerDONE	= 2,
+	MAX_TIME 	= (1<<20),	/* Forever */
+	TCP_ACK		= 50,		/* Timed ack sequence in ms */
+	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */
+
+	URG		= 0x20,		/* Data marked urgent */
+	ACK		= 0x10,		/* Acknowledge is valid */
+	PSH		= 0x08,		/* Whole data pipe is pushed */
+	RST		= 0x04,		/* Reset connection */
+	SYN		= 0x02,		/* Pkt. is synchronise */
+	FIN		= 0x01,		/* Start close down */
+
+	EOLOPT		= 0,
+	NOOPOPT		= 1,
+	MSSOPT		= 2,
+	MSS_LENGTH	= 4,		/* Length in bytes of the MSS option */
+	WSOPT		= 3,
+	WS_LENGTH	= 3,		/* Length in bytes of the window scale option */
+	MSL2		= 10,
+	MSPTICK		= 50,		/* Milliseconds per timer tick */
+	DEF_MSS		= 1460,		/* Default maximum segment size */
+	DEF_MSS6	= 1280,		/* Default maximum segment size (min) for v6 */
+	DEF_RTT		= 500,		/* Default round trip */
+	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
+	TCP_LISTEN	= 0,		/* Listen connection */
+	TCP_CONNECT	= 1,		/* Outgoing connection */
+	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */
+
+	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */
+	TCPMAXBURST	= 4,
+
+	/* bits in Tcpctl.flags */
+	FORCE		= 1,
+	CLONE		= 2,
+	RETRAN		= 4,
+	ACTIVE		= 8,
+	SYNACK		= 16,
+
+	/* srtt is kept shifted left by LOGAGAIN (see inittcpctl) */
+	LOGAGAIN	= 3,
+	LOGDGAIN	= 2,
+
+	Closed		= 0,		/* Connection states */
+	Listen,
+	Syn_sent,
+	Syn_received,
+	Established,
+	Finwait1,
+	Finwait2,
+	Close_wait,
+	Closing,
+	Last_ack,
+	Time_wait,
+
+	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
+	NLHT		= 256,		/* hash table size, must be a power of 2 */
+	LHTMASK		= NLHT-1,
+	HaveWS		= 1<<8,		/* set in a ws value when a window scale option is present */
+};
+
+/*
+ *  Printable names of the connection states, indexed by state value
+ *  (see tcpstate).  Must correspond to the enumeration above.
+ */
+char *tcpstates[] =
+{
+	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
+	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
+	"Closing", 	"Last_ack", 	"Time_wait"
+};
+
+/*
+ *  a countdown timer ticked by tcpackproc once every MSPTICK ms
+ */
+typedef struct Tcptimer Tcptimer;
+struct Tcptimer
+{
+	Tcptimer	*next;		/* chain of running timers (Tcppriv.timers) */
+	Tcptimer	*prev;
+	Tcptimer	*readynext;	/* list of expired timers built by tcpackproc */
+	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
+	int	start;			/* tick count loaded into count by tcpgo */
+	int	count;			/* ticks left; decremented by tcpackproc */
+	void	(*func)(void*);		/* called with arg when count reaches 0 */
+	void	*arg;
+};
+
+/*
+ *  v4 and v6 pseudo headers used for
+ *  checksumming tcp
+ */
+typedef struct Tcp4hdr Tcp4hdr;
+struct Tcp4hdr
+{
+	uchar	vihl;		/* Version and header length */
+	uchar	tos;		/* Type of service */
+	uchar	length[2];	/* packet length */
+	uchar	id[2];		/* Identification */
+	uchar	frag[2];	/* Fragment information */
+	uchar	Unused;
+	uchar	proto;
+	uchar	tcplen[2];
+	uchar	tcpsrc[4];
+	uchar	tcpdst[4];
+	uchar	tcpsport[2];
+	uchar	tcpdport[2];
+	uchar	tcpseq[4];
+	uchar	tcpack[4];
+	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
+	uchar	tcpwin[2];
+	uchar	tcpcksum[2];
+	uchar	tcpurg[2];
+	/* Options segment */
+	uchar	tcpopt[1];
+};
+
+/*
+ *  v6 counterpart of Tcp4hdr; htontcp6 temporarily reuses vcf,
+ *  ploadlen, proto and ttl as the pseudo header while checksumming
+ */
+typedef struct Tcp6hdr Tcp6hdr;
+struct Tcp6hdr
+{
+	uchar	vcf[4];
+	uchar	ploadlen[2];
+	uchar	proto;
+	uchar	ttl;
+	uchar	tcpsrc[IPaddrlen];
+	uchar	tcpdst[IPaddrlen];
+	uchar	tcpsport[2];
+	uchar	tcpdport[2];
+	uchar	tcpseq[4];
+	uchar	tcpack[4];
+	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
+	uchar	tcpwin[2];
+	uchar	tcpcksum[2];
+	uchar	tcpurg[2];
+	/* Options segment */
+	uchar	tcpopt[1];
+};
+
+/*
+ *  this represents the control info
+ *  for a single packet.  It is derived from
+ *  a packet in ntohtcp{4,6}() and stuck into
+ *  a packet in htontcp{4,6}().
+ */
+typedef struct Tcp Tcp;
+struct	Tcp
+{
+	ushort	source;	/* source port */
+	ushort	dest;	/* destination port */
+	ulong	seq;
+	ulong	ack;
+	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
+	ushort	ws;	/* window scale option (if not zero); HaveWS is or'ed in on input */
+	ulong	wnd;
+	ushort	urg;
+	ushort	mss;	/* max segment size option (if not zero) */
+	ushort	len;	/* size of data */
+};
+
+/*
+ *  this header is malloc'd to thread together fragments
+ *  waiting to be coalesced
+ */
+typedef struct Reseq Reseq;
+struct Reseq
+{
+	Reseq	*next;		/* next entry; the list hangs off Tcpctl.reseq */
+	Tcp	seg;
+	Block	*bp;		/* released with freeblist in localclose */
+	ushort	length;
+};
+
+/*
+ *  per-conversation TCP control block, reached through Conv.ptcl.
+ *  the qlock in the Conv locks this structure
+ */
+typedef struct Tcpctl Tcpctl;
+struct Tcpctl
+{
+	uchar	state;			/* Connection state */
+	uchar	type;			/* Listening or active connection */
+	uchar	code;			/* Icmp code */
+	struct {
+		ulong	una;		/* Unacked data pointer */
+		ulong	nxt;		/* Next sequence expected */
+		ulong	ptr;		/* Data pointer */
+		ulong	wnd;		/* Tcp send window */
+		ulong	urg;		/* Urgent data pointer */
+		ulong	wl2;
+		int	scale;		/* how much to right shift window in xmitted packets */
+		/* to implement tahoe and reno TCP */
+		ulong	dupacks;	/* number of duplicate acks rcvd */
+		int	recovery;	/* loss recovery flag */
+		ulong	rxt;		/* right window marker for recovery */
+	} snd;
+	struct {
+		ulong	nxt;		/* Receive pointer to next uchar slot */
+		ulong	wnd;		/* Receive window incoming */
+		ulong	urg;		/* Urgent pointer */
+		int	blocked;	/* set by tcprcvwin when the window closes to 0 */
+		int	una;		/* unacked data segs, for delayed acks */
+		int	scale;		/* how much to left shift window in rcved packets */
+	} rcv;
+	ulong	iss;			/* Initial sequence number */
+	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
+	ulong	cwind;			/* Congestion window */
+	ulong	abcbytes;		/* appropriate byte counting */
+	int	scale;			/* desired snd.scale */
+	ulong	ssthresh;		/* Slow start threshold */
+	int	resent;			/* Bytes just resent */
+	int	irs;			/* Initial received sequence */
+	ushort	mss;			/* Maximum segment size */
+	int	rerecv;			/* Overlap of data re-received */
+	ulong	window;			/* Receive window */
+	uchar	backoff;		/* Exponential backoff counter */
+	int	backedoff;		/* ms we've backed off for rexmits */
+	uchar	flags;			/* State flags */
+	Reseq	*reseq;			/* Resequencing queue */
+	Tcptimer	timer;			/* Activity timer */
+	Tcptimer	acktimer;		/* Acknowledge timer */
+	Tcptimer	rtt_timer;		/* Round trip timer */
+	Tcptimer	katimer;		/* keep alive timer */
+	ulong	rttseq;			/* Round trip sequence */
+	int	srtt;			/* Shortened round trip (kept shifted left by LOGAGAIN) */
+	int	mdev;			/* Mean deviation of round trip */
+	int	kacounter;		/* count down for keep alive */
+	uint	sndsyntime;		/* time syn sent */
+	ulong	time;			/* time Finwait2 or Syn_received was sent */
+	int	nochecksum;		/* non-zero means don't send checksums */
+	int	flgcnt;			/* number of flags in the sequence (SYN,FIN) */
+
+	union {
+		Tcp4hdr	tcp4hdr;
+		Tcp6hdr	tcp6hdr;
+	} protohdr;		/* prototype header */
+};
+
+/*
+ *  New calls are put in limbo rather than having a conversation structure
+ *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
+ *  any real Conv structures mucking things up.  Calls in limbo rexmit their
+ *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
+ *
+ *  In particular they aren't on a listener's queue so that they don't figure
+ *  in the input queue limit.
+ *
+ *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
+ *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
+ *  there is no hashing of this list.
+ */
+typedef struct Limbo Limbo;
+struct Limbo
+{
+	Limbo	*next;
+
+	uchar	laddr[IPaddrlen];
+	uchar	raddr[IPaddrlen];
+	ushort	lport;
+	ushort	rport;
+	ulong	irs;		/* initial received sequence */
+	ulong	iss;		/* initial sent sequence */
+	ushort	mss;		/* mss from the other end */
+	ushort	rcvscale;	/* how much to scale rcvd windows */
+	ushort	sndscale;	/* how much to scale sent windows */
+	ulong	lastsend;	/* last time we sent a synack */
+	uchar	version;	/* v4 or v6 */
+	uchar	rexmits;	/* number of retransmissions */
+};
+
+/* defaults with external linkage; tcp_irtt seeds srtt and the activity timer in inittcpctl */
+int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
+ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
+
+/*
+ *  indices into Tcppriv.stats; statnames below holds the matching names
+ */
+enum {
+	/* MIB stats */
+	MaxConn,
+	ActiveOpens,
+	PassiveOpens,
+	EstabResets,
+	CurrEstab,
+	InSegs,
+	OutSegs,
+	RetransSegs,
+	RetransTimeouts,
+	InErrs,
+	OutRsts,
+
+	/* non-MIB stats */
+	CsumErrs,
+	HlenErrs,
+	LenErrs,
+	OutOfOrder,
+
+	Nstats
+};
+
+/*
+ *  printable name of each statistic, indexed by the enumeration above
+ */
+static char *statnames[] =
+{
+[MaxConn]	"MaxConn",
+[ActiveOpens]	"ActiveOpens",
+[PassiveOpens]	"PassiveOpens",
+[EstabResets]	"EstabResets",
+[CurrEstab]	"CurrEstab",
+[InSegs]	"InSegs",
+[OutSegs]	"OutSegs",
+[RetransSegs]	"RetransSegs",
+[RetransTimeouts]	"RetransTimeouts",
+[InErrs]	"InErrs",
+[OutRsts]	"OutRsts",
+[CsumErrs]	"CsumErrs",
+[HlenErrs]	"HlenErrs",
+[LenErrs]	"LenErrs",
+[OutOfOrder]	"OutOfOrder",
+};
+
+/*
+ *  state private to one tcp protocol instance, reached through Proto.priv
+ */
+typedef struct Tcppriv Tcppriv;
+struct Tcppriv
+{
+	/* List of active timers */
+	QLock 	tl;
+	Tcptimer *timers;
+
+	/* hash table for matching conversations */
+	Ipht	ht;
+
+	/* calls in limbo waiting for an ACK to our SYN ACK */
+	int	nlimbo;
+	Limbo	*lht[NLHT];
+
+	/* for keeping track of tcpackproc */
+	QLock	apl;
+	int	ackprocstarted;
+
+	/* counters indexed by the stats enumeration above */
+	ulong	stats[Nstats];
+};
+
+/*
+ *  Setting tcpporthogdefense to non-zero enables Dong Lin's
+ *  solution to hijacked systems staking out ports as a form
+ *  of DoS attack.
+ *
+ *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
+ *  that number gets acked by the other end, we shut down the connection.
+ *  Look for tcpporthogdefense in the code.
+ */
+int tcpporthogdefense = 0;
+
+/* forward declarations of file-local routines used before their definition */
+static int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
+static void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
+static void localclose(Conv*, char*);
+static void procsyn(Conv*, Tcp*);
+static void tcpacktimer(void*);
+static void tcpiput(Proto*, Ipifc*, Block*);
+static void tcpkeepalive(void*);
+static void tcpoutput(Conv*);
+static void tcprcvwin(Conv*);
+static void tcprxmit(Conv*);
+static void tcpsetkacounter(Tcpctl*);
+static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
+static void tcpsettimer(Tcpctl*);
+static void tcpsndsyn(Conv*, Tcpctl*);
+static void tcpstart(Conv*, int);
+static void tcpsynackrtt(Conv*);
+static void tcptimeout(void*);
+static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
+
+/* limbo (half-open call) handling */
+static void limborexmit(Proto*);
+static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
+
+/*
+ *  Move conversation s to newstate.  A no-op when the state does not
+ *  change.  Maintains the CurrEstab statistic, closes the queues on
+ *  entering Closed, hangs up the read queue on entering Close_wait,
+ *  and reports a completed connect when leaving Syn_sent for anything
+ *  other than Closed.
+ */
+static void
+tcpsetstate(Conv *s, uchar newstate)
+{
+	Tcpctl *ctl;
+	Tcppriv *priv;
+	uchar prev;
+
+	ctl = (Tcpctl*)s->ptcl;
+	priv = s->p->priv;
+
+	prev = ctl->state;
+	if(prev == newstate)
+		return;
+
+	/* keep the count of established connections in step */
+	if(prev == Established)
+		priv->stats[CurrEstab]--;
+	if(newstate == Established)
+		priv->stats[CurrEstab]++;
+
+	if(newstate == Closed){
+		qclose(s->rq);
+		qclose(s->wq);
+		qclose(s->eq);
+	} else if(newstate == Close_wait){
+		/* Remote closes */
+		qhangup(s->rq, nil);
+	}
+
+	ctl->state = newstate;
+
+	if(prev == Syn_sent && newstate != Closed)
+		Fsconnected(s, nil);
+}
+
+/*
+ *  connect request for conversation c.  Returns Econinuse when the
+ *  control block is not Closed, the error string from Fsstdconnect
+ *  when argument handling fails, and nil once the active open has
+ *  been started.
+ */
+static char*
+tcpconnect(Conv *c, char **argv, int argc)
+{
+	Tcpctl *ctl;
+	char *err;
+
+	ctl = (Tcpctl*)(c->ptcl);
+	if(ctl->state != Closed)
+		return Econinuse;
+
+	err = Fsstdconnect(c, argv, argc);
+	if(err == nil)
+		tcpstart(c, TCP_CONNECT);
+
+	return err;
+}
+
+/*
+ *  Format a one line status report for conversation c into state
+ *  (at most n bytes) and return snprint's result.
+ *
+ *  Note the pairing of the scale arguments: swin (snd.wnd) is printed
+ *  with rcv.scale and rwin (rcv.wnd) with snd.scale.
+ *  NOTE(review): presumably deliberate, since snd.scale is the shift
+ *  applied to the window we advertise (see htontcp4) -- confirm.
+ */
+static int
+tcpstate(Conv *c, char *state, int n)
+{
+	Tcpctl *tcb;
+
+	tcb = (Tcpctl*)(c->ptcl);
+
+	return snprint(state, n,
+		"%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d"
+		" rwin %lud>>%d timer.start %d timer.count %d rerecv %d"
+		" katimer.start %d katimer.count %d ssthresh %lud\n",
+		tcpstates[tcb->state],
+		c->rq ? qlen(c->rq) : 0,
+		c->wq ? qlen(c->wq) : 0,
+		tcb->srtt, tcb->mdev,
+		tcb->cwind, tcb->snd.wnd, tcb->rcv.scale, tcb->rcv.wnd,
+		tcb->snd.scale, tcb->timer.start, tcb->timer.count, tcb->rerecv,
+		tcb->katimer.start, tcb->katimer.count, tcb->ssthresh);
+}
+
+/*
+ *  a conversation counts as in use while its control block is not Closed
+ */
+static int
+tcpinuse(Conv *c)
+{
+	return ((Tcpctl*)(c->ptcl))->state != Closed;
+}
+
+/*
+ *  announce request for conversation c.  Returns Econinuse when the
+ *  control block is not Closed, the error string from Fsstdannounce
+ *  when argument handling fails, and nil once the listener has been
+ *  started and reported as connected.
+ */
+static char*
+tcpannounce(Conv *c, char **argv, int argc)
+{
+	Tcpctl *ctl;
+	char *err;
+
+	ctl = (Tcpctl*)(c->ptcl);
+	if(ctl->state != Closed)
+		return Econinuse;
+
+	err = Fsstdannounce(c, argv, argc);
+	if(err == nil){
+		tcpstart(c, TCP_LISTEN);
+		Fsconnected(c, nil);
+	}
+
+	return err;
+}
+
+/*
+ *  Local close of conversation c: hang up all three queues, discard
+ *  unread input, then act according to the connection state.
+ *  States not listed in the switch are left as they are.
+ *
+ *  tcpclose is always called with the q locked
+ */
+static void
+tcpclose(Conv *c)
+{
+	Tcpctl *tcb;
+
+	tcb = (Tcpctl*)c->ptcl;
+
+	qhangup(c->rq, nil);
+	qhangup(c->wq, nil);
+	qhangup(c->eq, nil);
+	qflush(c->rq);
+
+	switch(tcb->state) {
+	case Listen:
+		/*
+		 *  reset any incoming calls to this listener
+		 */
+		Fsconnected(c, "Hangup");
+
+		localclose(c, nil);
+		break;
+	case Closed:
+	case Syn_sent:
+		localclose(c, nil);
+		break;
+	case Syn_received:
+	case Established:
+		/* one more flag (presumably the FIN) takes a sequence number */
+		tcb->flgcnt++;
+		tcb->snd.nxt++;
+		tcpsetstate(c, Finwait1);
+		tcpoutput(c);
+		break;
+	case Close_wait:
+		/* as above, but the remote side has already closed */
+		tcb->flgcnt++;
+		tcb->snd.nxt++;
+		tcpsetstate(c, Last_ack);
+		tcpoutput(c);
+		break;
+	}
+}
+
+/*
+ *  kick routine of the write queue (installed by tcpcreate); x is the
+ *  Conv.  Takes the conversation qlock, then either pushes output or,
+ *  in states where no data may be sent, closes locally with "Hangup".
+ */
+static void
+tcpkick(void *x)
+{
+	Conv *s = x;
+	Tcpctl *tcb;
+
+	tcb = (Tcpctl*)s->ptcl;
+
+	/* error handler: drop the qlock taken below before re-raising */
+	if(waserror()){
+		qunlock(s);
+		nexterror();
+	}
+	qlock(s);
+
+	switch(tcb->state) {
+	case Syn_sent:
+	case Syn_received:
+	case Established:
+	case Close_wait:
+		/*
+		 * Push data
+		 */
+		tcprcvwin(s);
+		tcpoutput(s);
+		break;
+	default:
+		localclose(s, "Hangup");
+		break;
+	}
+
+	qunlock(s);
+	poperror();
+}
+
+/*
+ *  recompute the receive window as the configured window less what
+ *  is already queued on rq, never below zero; a window of zero also
+ *  marks the receiver as blocked
+ */
+static void
+tcprcvwin(Conv *s)				/* Call with tcb locked */
+{
+	Tcpctl *ctl;
+	int room;
+
+	ctl = (Tcpctl*)s->ptcl;
+	room = ctl->window - qlen(s->rq);
+	if(room <= 0){
+		room = 0;
+		ctl->rcv.blocked = 1;
+	}
+	ctl->rcv.wnd = room;
+}
+
+/*
+ *  callback of the ack timer and of the read queue (see inittcpctl
+ *  and tcpcreate); v is the Conv.  Unless the connection is Closed,
+ *  set FORCE, refresh the receive window and call tcpoutput.
+ */
+static void
+tcpacktimer(void *v)
+{
+	Tcpctl *tcb;
+	Conv *s;
+
+	s = v;
+	tcb = (Tcpctl*)s->ptcl;
+
+	/* error handler: drop the qlock taken below before re-raising */
+	if(waserror()){
+		qunlock(s);
+		nexterror();
+	}
+	qlock(s);
+	if(tcb->state != Closed){
+		tcb->flags |= FORCE;
+		tcprcvwin(s);
+		tcpoutput(s);
+	}
+	qunlock(s);
+	poperror();
+}
+
+/*
+ *  qio.c wakes up the writer when the queue is half-empty.
+ *  wq holds data until it gets acked, thus wq needs to
+ *  be able to contain twice the full window of data,
+ *  plus more to guarantee saturation: 2.5 windows in all.
+ */
+static int
+tcpwqsize(int maxwin)
+{
+	int fivewin;
+
+	fivewin = 5*maxwin;
+	return fivewin/2;
+}
+
+/*
+ *  limit maximum packet bursts due to stretched acks
+ *  and application limited periods: cwind is capped at the
+ *  data in flight plus three segments.  ssthresh is raised
+ *  to the old cwind if necessary to allow reasonably quick
+ *  cwind reclamation.
+ */
+static void
+tcplimitmaxburst(Tcpctl *tcb)
+{
+	ulong cap;
+
+	cap = tcb->snd.nxt - tcb->snd.una + 3*tcb->mss;
+	if(tcb->cwind <= cap)
+		return;
+
+	if(tcb->ssthresh < tcb->cwind)
+		tcb->ssthresh = tcb->cwind;
+	tcb->cwind = cap;
+}
+
+/*
+ *  set ssthresh to half of the data in flight (itself limited
+ *  to cwind), but never to less than two segments
+ */
+static void
+tcpcongestion(Tcpctl *tcb)
+{
+	ulong outstanding, half, minthresh;
+
+	outstanding = tcb->snd.nxt - tcb->snd.una;
+	if(outstanding > tcb->cwind)
+		outstanding = tcb->cwind;
+
+	half = outstanding / 2;
+	minthresh = 2*tcb->mss;
+	tcb->ssthresh = half < minthresh ? minthresh : half;
+}
+
+/*
+ *  byte counting: accumulate acked bytes and open cwind by one
+ *  segment each time the running total reaches limit, keeping
+ *  the remainder for next time
+ */
+static void
+tcpabcincr(Tcpctl *tcb, ulong acked, ulong limit)
+{
+	ulong total;
+
+	total = tcb->abcbytes + acked;
+	if(total >= limit){
+		tcb->cwind += tcb->mss;
+		total %= limit;
+	}
+	tcb->abcbytes = total;
+}
+
+
+/*
+ *  open the queues of a new conversation: rq is limited to QMAX and
+ *  calls tcpacktimer, wq is limited to tcpwqsize(QMAX) and calls tcpkick
+ */
+static void
+tcpcreate(Conv *c)
+{
+	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
+	c->wq = qopen(tcpwqsize(QMAX), Qkick, tcpkick, c);
+}
+
+/*
+ *  Change timer t to newstate, linking it into or out of the doubly
+ *  linked list priv->timers as needed: only timers in state TcptimerON
+ *  are on the list.  The callers in this file hold priv->tl.
+ *  Panics if the list links are inconsistent.
+ */
+static void
+timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
+{
+	if(newstate != TcptimerON){
+		if(t->state == TcptimerON){
+			/* unchain */
+			if(priv->timers == t){
+				priv->timers = t->next;
+				/* the head of the list must have no predecessor */
+				if(t->prev != nil)
+					panic("timerstate1");
+			}
+			if(t->next)
+				t->next->prev = t->prev;
+			if(t->prev)
+				t->prev->next = t->next;
+			t->next = t->prev = nil;
+		}
+	} else {
+		if(t->state != TcptimerON){
+			/* chain */
+			/* a timer that is not running must not be linked */
+			if(t->prev != nil || t->next != nil)
+				panic("timerstate2");
+			/* insert at the head */
+			t->prev = nil;
+			t->next = priv->timers;
+			if(t->next)
+				t->next->prev = t;
+			priv->timers = t;
+		}
+	}
+	t->state = newstate;
+}
+
+/*
+ *  Kernel process started by tcpstart; a is the Proto.  Every MSPTICK
+ *  ms it decrements each running timer, collects the ones that reach
+ *  zero on a private list while holding priv->tl, then calls their
+ *  functions with the lock released.  Finally it calls limborexmit.
+ *  Never returns.
+ */
+static void
+tcpackproc(void *a)
+{
+	Tcptimer *t, *tp, *timeo;
+	Proto *tcp;
+	Tcppriv *priv;
+	int loop;
+
+	tcp = a;
+	priv = tcp->priv;
+
+	for(;;) {
+		tsleep(&up->sleep, return0, 0, MSPTICK);
+
+		qlock(&priv->tl);
+		timeo = nil;
+		loop = 0;
+		for(t = priv->timers; t != nil; t = tp) {
+			/* guard against a corrupted (looping) list */
+			if(loop++ > 10000)
+				panic("tcpackproc1");
+			/* fetch next first: timerstate below unlinks t */
+			tp = t->next;
+ 			if(t->state == TcptimerON) {
+				t->count--;
+				if(t->count == 0) {
+					timerstate(priv, t, TcptimerDONE);
+					t->readynext = timeo;
+					timeo = t;
+				}
+			}
+		}
+		qunlock(&priv->tl);
+
+		loop = 0;
+		for(t = timeo; t != nil; t = t->readynext) {
+			if(loop++ > 10000)
+				panic("tcpackproc2");
+			/* skip timers whose state changed since they expired; errors raised by func are swallowed */
+			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
+				(*t->func)(t->arg);
+				poperror();
+			}
+		}
+
+		limborexmit(tcp);
+	}
+}
+
+/*
+ *  (re)start timer t from its full start count; a nil timer or
+ *  one whose start count is zero is left untouched
+ */
+static void
+tcpgo(Tcppriv *priv, Tcptimer *t)
+{
+	if(t != nil && t->start != 0){
+		qlock(&priv->tl);
+		t->count = t->start;
+		timerstate(priv, t, TcptimerON);
+		qunlock(&priv->tl);
+	}
+}
+
+/*
+ *  stop timer t; a nil timer is ignored
+ */
+static void
+tcphalt(Tcppriv *priv, Tcptimer *t)
+{
+	if(t != nil){
+		qlock(&priv->tl);
+		timerstate(priv, t, TcptimerOFF);
+		qunlock(&priv->tl);
+	}
+}
+
+/*
+ *  exponential backoff multiplier: 2 to the power n.
+ *  The shift is done on an int, so n must be non-negative and
+ *  smaller than the number of value bits in an int.
+ */
+static int
+backoff(int n)
+{
+	return 1 << n;
+}
+
+/*
+ *  Tear down the local end of conversation s: remove it from the
+ *  conversation hash table, stop all four timers, free the
+ *  resequencing queue, hang up rq and wq with reason and enter Closed.
+ *  A pending connect (Syn_sent) is completed with reason, and a
+ *  sleeper on an announced conversation is woken.
+ */
+static void
+localclose(Conv *s, char *reason)	/* called with tcb locked */
+{
+	Tcpctl *tcb;
+	Reseq *rp,*rp1;
+	Tcppriv *tpriv;
+
+	tpriv = s->p->priv;
+	tcb = (Tcpctl*)s->ptcl;
+
+	iphtrem(&tpriv->ht, s);
+
+	tcphalt(tpriv, &tcb->timer);
+	tcphalt(tpriv, &tcb->rtt_timer);
+	tcphalt(tpriv, &tcb->acktimer);
+	tcphalt(tpriv, &tcb->katimer);
+
+	/* Flush reassembly queue; nothing more can arrive */
+	for(rp = tcb->reseq; rp != nil; rp = rp1) {
+		/* save the link before rp is freed */
+		rp1 = rp->next;
+		freeblist(rp->bp);
+		free(rp);
+	}
+	tcb->reseq = nil;
+
+	if(tcb->state == Syn_sent)
+		Fsconnected(s, reason);
+	if(s->state == Announced)
+		wakeup(&s->listenr);
+
+	qhangup(s->rq, reason);
+	qhangup(s->wq, reason);
+
+	tcpsetstate(s, Closed);
+}
+
+/*
+ *  mtu (- TCP + IP hdr len) of 1st hop.
+ *  The interface is looked up from addr; without one the per-version
+ *  default segment size is returned.  *scale is set to HaveWS or'ed
+ *  with a shift chosen from the interface speed (0 without an
+ *  interface).  Any version other than V6 is treated as V4.
+ */
+static int
+tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
+{
+	Ipifc *ifc;
+	int mtu, hdrs, shift;
+
+	ifc = findipifc(tcp->f, addr, 0);
+
+	if(version == V6){
+		mtu = DEF_MSS6;
+		hdrs = TCP6_PKT + TCP6_HDRSIZE;
+	} else {
+		mtu = DEF_MSS;
+		hdrs = TCP4_PKT + TCP4_HDRSIZE;
+	}
+
+	shift = 0;
+	if(ifc != nil){
+		mtu = ifc->maxtu - ifc->medium->hsize - hdrs;
+
+		if(ifc->mbps > 1000)
+			shift = 4;
+		else if(ifc->mbps > 100)
+			shift = 3;
+		else if(ifc->mbps > 10)
+			shift = 1;
+	}
+	*scale = HaveWS | shift;
+
+	return mtu;
+}
+
+/*
+ *  Reset the control block of conversation s to its initial values:
+ *  zero it, load the timer start counts and callbacks, build the
+ *  prototype header (active opens only) and set the default segment
+ *  size, windows and queue limits.  mode is TCP_LISTEN or TCP_CONNECT.
+ *  Panics on an ip version other than V4 or V6 for an active open.
+ */
+static void
+inittcpctl(Conv *s, int mode)
+{
+	Tcpctl *tcb;
+	Tcp4hdr* h4;
+	Tcp6hdr* h6;
+	int mss;
+
+	tcb = (Tcpctl*)s->ptcl;
+
+	memset(tcb, 0, sizeof(Tcpctl));
+
+	tcb->ssthresh = QMAX;
+	/* srtt is kept shifted left by LOGAGAIN */
+	tcb->srtt = tcp_irtt<<LOGAGAIN;
+	tcb->mdev = 0;
+
+	/* setup timers */
+	/* start counts are in ticks of MSPTICK ms */
+	tcb->timer.start = tcp_irtt / MSPTICK;
+	tcb->timer.func = tcptimeout;
+	tcb->timer.arg = s;
+	tcb->rtt_timer.start = MAX_TIME;
+	tcb->acktimer.start = TCP_ACK / MSPTICK;
+	tcb->acktimer.func = tcpacktimer;
+	tcb->acktimer.arg = s;
+	tcb->katimer.start = DEF_KAT / MSPTICK;
+	tcb->katimer.func = tcpkeepalive;
+	tcb->katimer.arg = s;
+
+	mss = DEF_MSS;
+
+	/* create a prototype(pseudo) header */
+	if(mode != TCP_LISTEN){
+		/* no local address chosen yet: pick one that suits the remote */
+		if(ipcmp(s->laddr, IPnoaddr) == 0)
+			findlocalip(s->p->f, s->laddr, s->raddr);
+
+		switch(s->ipversion){
+		case V4:
+			h4 = &tcb->protohdr.tcp4hdr;
+			memset(h4, 0, sizeof(*h4));
+			h4->proto = IP_TCPPROTO;
+			hnputs(h4->tcpsport, s->lport);
+			hnputs(h4->tcpdport, s->rport);
+			v6tov4(h4->tcpsrc, s->laddr);
+			v6tov4(h4->tcpdst, s->raddr);
+			break;
+		case V6:
+			h6 = &tcb->protohdr.tcp6hdr;
+			memset(h6, 0, sizeof(*h6));
+			h6->proto = IP_TCPPROTO;
+			hnputs(h6->tcpsport, s->lport);
+			hnputs(h6->tcpdport, s->rport);
+			ipmove(h6->tcpsrc, s->laddr);
+			ipmove(h6->tcpdst, s->raddr);
+			mss = DEF_MSS6;
+			break;
+		default:
+			panic("inittcpctl: version %d", s->ipversion);
+		}
+	}
+
+	/* the congestion window starts at one segment */
+	tcb->mss = tcb->cwind = mss;
+	tcb->abcbytes = 0;
+
+	/* default is no window scaling */
+	tcb->window = QMAX;
+	tcb->rcv.wnd = QMAX;
+	tcb->rcv.scale = 0;
+	tcb->snd.scale = 0;
+	qsetlimit(s->rq, QMAX);
+	qsetlimit(s->wq, tcpwqsize(QMAX));
+}
+
+/*
+ *  Start conversation s in the given mode (TCP_LISTEN or TCP_CONNECT):
+ *  initialise its control block, enter it in the conversation hash
+ *  table and move it to Listen, or send a SYN and move it to Syn_sent.
+ *  The first call for a protocol instance also starts the tcpack
+ *  kernel process that drives the timers.
+ *
+ *  called with s qlocked
+ */
+static void
+tcpstart(Conv *s, int mode)
+{
+	Tcpctl *tcb;
+	Tcppriv *tpriv;
+	char kpname[KNAMELEN];
+
+	tpriv = s->p->priv;
+
+	if(tpriv->ackprocstarted == 0){
+		qlock(&tpriv->apl);
+		/* test again under the lock: another proc may have got here first */
+		if(tpriv->ackprocstarted == 0){
+			/* bounded format: kpname holds only KNAMELEN bytes */
+			snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
+			kproc(kpname, tcpackproc, s->p);
+			tpriv->ackprocstarted = 1;
+		}
+		qunlock(&tpriv->apl);
+	}
+
+	tcb = (Tcpctl*)s->ptcl;
+
+	inittcpctl(s, mode);
+
+	iphtadd(&tpriv->ht, s);
+	switch(mode) {
+	case TCP_LISTEN:
+		tpriv->stats[PassiveOpens]++;
+		tcb->flags |= CLONE;
+		tcpsetstate(s, Listen);
+		break;
+
+	case TCP_CONNECT:
+		tpriv->stats[ActiveOpens]++;
+		tcb->flags |= ACTIVE;
+		tcpsndsyn(s, tcb);
+		tcpsetstate(s, Syn_sent);
+		tcpoutput(s);
+		break;
+	}
+}
+
+/*
+ *  Render the 16-bit header-length/flags field as text: the header
+ *  length followed by the name of every flag bit that is set.
+ *  The result lives in a static buffer that the next call overwrites.
+ */
+static char*
+tcpflag(ushort flag)
+{
+	static char buf[128];
+	static struct {
+		int	bit;
+		char	*name;
+	} names[] = {
+		{ URG,	" URG" },
+		{ ACK,	" ACK" },
+		{ PSH,	" PSH" },
+		{ RST,	" RST" },
+		{ SYN,	" SYN" },
+		{ FIN,	" FIN" },
+	};
+	int i;
+
+	sprint(buf, "%d", flag>>10);	/* Head len */
+	for(i = 0; i < sizeof(names)/sizeof(names[0]); i++)
+		if(flag & names[i].bit)
+			strcat(buf, names[i].name);
+
+	return buf;
+}
+
+/*
+ *  Build an IPv6 TCP packet from the control info in tcph.
+ *  data, if not nil, is the payload and is padded in front to make
+ *  room for the headers; otherwise a header-only block is allocated.
+ *  ph is the prototype header whose first TCP6_TCBPHDRSZ bytes are
+ *  copied in.  tcb may be nil, in which case the window is not scaled
+ *  and the checksum is always computed.  MSS and window scale options
+ *  are emitted only on SYN segments.
+ *  Returns the packet, or nil if padblock or allocb returned nil.
+ */
+static Block*
+htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
+{
+	int dlen;
+	Tcp6hdr *h;
+	ushort csum;
+	ushort hdrlen, optpad = 0;
+	uchar *opt;
+
+	hdrlen = TCP6_HDRSIZE;
+	if(tcph->flags & SYN){
+		if(tcph->mss)
+			hdrlen += MSS_LENGTH;
+		if(tcph->ws)
+			hdrlen += WS_LENGTH;
+		/* pad the options to a multiple of 4 bytes */
+		optpad = hdrlen & 3;
+		if(optpad)
+			optpad = 4 - optpad;
+		hdrlen += optpad;
+	}
+
+	if(data) {
+		dlen = blocklen(data);
+		data = padblock(data, hdrlen + TCP6_PKT);
+		if(data == nil)
+			return nil;
+	}
+	else {
+		dlen = 0;
+		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
+		if(data == nil)
+			return nil;
+		data->wp += hdrlen + TCP6_PKT;
+	}
+
+	/* copy in pseudo ip header plus port numbers */
+	h = (Tcp6hdr *)(data->rp);
+	memmove(h, ph, TCP6_TCBPHDRSZ);
+
+	/* compose pseudo tcp header, do cksum calculation */
+	/* the first 8 bytes temporarily hold length, zeros and protocol */
+	hnputl(h->vcf, hdrlen + dlen);
+	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
+	h->ttl = ph->proto;
+
+	/* copy in variable bits */
+	hnputl(h->tcpseq, tcph->seq);
+	hnputl(h->tcpack, tcph->ack);
+	/* hdrlen is in bytes: <<10 puts hdrlen/4 in the top four bits */
+	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
+	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
+	hnputs(h->tcpurg, tcph->urg);
+
+	if(tcph->flags & SYN){
+		opt = h->tcpopt;
+		if(tcph->mss != 0){
+			*opt++ = MSSOPT;
+			*opt++ = MSS_LENGTH;
+			hnputs(opt, tcph->mss);
+			opt += 2;
+		}
+		if(tcph->ws != 0){
+			*opt++ = WSOPT;
+			*opt++ = WS_LENGTH;
+			/* storing into a uchar drops the HaveWS bit */
+			*opt++ = tcph->ws;
+		}
+		while(optpad-- > 0)
+			*opt++ = NOOPOPT;
+	}
+
+	if(tcb != nil && tcb->nochecksum){
+		h->tcpcksum[0] = h->tcpcksum[1] = 0;
+	} else {
+		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
+		hnputs(h->tcpcksum, csum);
+	}
+
+	/* move from pseudo header back to normal ip header */
+	memset(h->vcf, 0, 4);
+	h->vcf[0] = IP_VER6;
+	hnputs(h->ploadlen, hdrlen+dlen);
+	h->proto = ph->proto;
+
+	return data;
+}
+
+/*
+ *  Build an IPv4 TCP packet from the control info in tcph.
+ *  data, if not nil, is the payload and is padded in front to make
+ *  room for the headers; otherwise a header-only block is allocated.
+ *  ph is the prototype header whose first TCP4_TCBPHDRSZ bytes are
+ *  copied in.  tcb may be nil, in which case the window is not scaled
+ *  and the checksum is always computed.  MSS and window scale options
+ *  are emitted only on SYN segments.
+ *  Returns the packet, or nil if padblock or allocb returned nil.
+ */
+static Block*
+htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
+{
+	int dlen;
+	Tcp4hdr *h;
+	ushort csum;
+	ushort hdrlen, optpad = 0;
+	uchar *opt;
+
+	hdrlen = TCP4_HDRSIZE;
+	if(tcph->flags & SYN){
+		if(tcph->mss)
+			hdrlen += MSS_LENGTH;
+		if(tcph->ws)
+			hdrlen += WS_LENGTH;
+		/* pad the options to a multiple of 4 bytes */
+		optpad = hdrlen & 3;
+		if(optpad)
+			optpad = 4 - optpad;
+		hdrlen += optpad;
+	}
+
+	if(data) {
+		dlen = blocklen(data);
+		data = padblock(data, hdrlen + TCP4_PKT);
+		if(data == nil)
+			return nil;
+	}
+	else {
+		dlen = 0;
+		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
+		if(data == nil)
+			return nil;
+		data->wp += hdrlen + TCP4_PKT;
+	}
+
+	/* copy in pseudo ip header plus port numbers */
+	h = (Tcp4hdr *)(data->rp);
+	memmove(h, ph, TCP4_TCBPHDRSZ);
+
+	/* copy in variable bits */
+	hnputs(h->tcplen, hdrlen + dlen);
+	hnputl(h->tcpseq, tcph->seq);
+	hnputl(h->tcpack, tcph->ack);
+	/* hdrlen is in bytes: <<10 puts hdrlen/4 in the top four bits */
+	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
+	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
+	hnputs(h->tcpurg, tcph->urg);
+
+	if(tcph->flags & SYN){
+		opt = h->tcpopt;
+		if(tcph->mss != 0){
+			*opt++ = MSSOPT;
+			*opt++ = MSS_LENGTH;
+			hnputs(opt, tcph->mss);
+			opt += 2;
+		}
+		if(tcph->ws != 0){
+			*opt++ = WSOPT;
+			*opt++ = WS_LENGTH;
+			/* storing into a uchar drops the HaveWS bit */
+			*opt++ = tcph->ws;
+		}
+		while(optpad-- > 0)
+			*opt++ = NOOPOPT;
+	}
+
+	if(tcb != nil && tcb->nochecksum){
+		h->tcpcksum[0] = h->tcpcksum[1] = 0;
+	} else {
+		/* the sum covers the pseudo header, which starts TCP4_IPLEN bytes in */
+		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
+		hnputs(h->tcpcksum, csum);
+	}
+
+	return data;
+}
+
+/*
+ *  Extract the TCP control info of an IPv6 packet into tcph.
+ *  *bpp is pulled up so that the whole TCP header, options included,
+ *  is contiguous; the block it points to may therefore change.
+ *  Only the MSS and window scale options are interpreted.
+ *  Returns the TCP header length in bytes, or -1 on a short or
+ *  malformed header, in which case the packet has been released
+ *  and *bpp is nil.
+ */
+static int
+ntohtcp6(Tcp *tcph, Block **bpp)
+{
+	Tcp6hdr *h;
+	uchar *optr;
+	ushort hdrlen;
+	ushort optlen;
+	int n;
+
+	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
+	if(*bpp == nil)
+		return -1;
+
+	h = (Tcp6hdr *)((*bpp)->rp);
+	tcph->source = nhgets(h->tcpsport);
+	tcph->dest = nhgets(h->tcpdport);
+	tcph->seq = nhgetl(h->tcpseq);
+	tcph->ack = nhgetl(h->tcpack);
+	/* top four bits hold the length in words; >>2 & ~3 yields bytes */
+	hdrlen = (h->tcpflag[0]>>2) & ~3;
+	if(hdrlen < TCP6_HDRSIZE) {
+		freeblist(*bpp);
+		/* don't leave the caller with a pointer to freed blocks */
+		*bpp = nil;
+		return -1;
+	}
+
+	tcph->flags = h->tcpflag[1];
+	tcph->wnd = nhgets(h->tcpwin);
+	tcph->urg = nhgets(h->tcpurg);
+	tcph->mss = 0;
+	tcph->ws = 0;
+	tcph->len = nhgets(h->ploadlen) - hdrlen;
+
+	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
+	if(*bpp == nil)
+		return -1;
+
+	/* pullupblock may have returned a different block: refresh h */
+	h = (Tcp6hdr *)((*bpp)->rp);
+
+	optr = h->tcpopt;
+	n = hdrlen - TCP6_HDRSIZE;
+	while(n > 0 && *optr != EOLOPT) {
+		if(*optr == NOOPOPT) {
+			n--;
+			optr++;
+			continue;
+		}
+		/* a length byte must follow the option kind */
+		if(n < 2)
+			break;
+		optlen = optr[1];
+		if(optlen < 2 || optlen > n)
+			break;
+		switch(*optr) {
+		case MSSOPT:
+			if(optlen == MSS_LENGTH)
+				tcph->mss = nhgets(optr+2);
+			break;
+		case WSOPT:
+			if(optlen == WS_LENGTH && *(optr+2) <= 14)
+				tcph->ws = HaveWS | *(optr+2);
+			break;
+		}
+		n -= optlen;
+		optr += optlen;
+	}
+	return hdrlen;
+}
+
+/*
+ *  Extract the TCP control info of an IPv4 packet into tcph.
+ *  *bpp is pulled up so that the whole TCP header, options included,
+ *  is contiguous; the block it points to may therefore change.
+ *  Only the MSS and window scale options are interpreted.
+ *  Returns the TCP header length in bytes, or -1 on a short or
+ *  malformed header, in which case the packet has been released
+ *  and *bpp is nil.
+ */
+static int
+ntohtcp4(Tcp *tcph, Block **bpp)
+{
+	Tcp4hdr *h;
+	uchar *optr;
+	ushort hdrlen;
+	ushort optlen;
+	int n;
+
+	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
+	if(*bpp == nil)
+		return -1;
+
+	h = (Tcp4hdr *)((*bpp)->rp);
+	tcph->source = nhgets(h->tcpsport);
+	tcph->dest = nhgets(h->tcpdport);
+	tcph->seq = nhgetl(h->tcpseq);
+	tcph->ack = nhgetl(h->tcpack);
+
+	/* top four bits hold the length in words; >>2 & ~3 yields bytes */
+	hdrlen = (h->tcpflag[0]>>2) & ~3;
+	if(hdrlen < TCP4_HDRSIZE) {
+		freeblist(*bpp);
+		/* don't leave the caller with a pointer to freed blocks */
+		*bpp = nil;
+		return -1;
+	}
+
+	tcph->flags = h->tcpflag[1];
+	tcph->wnd = nhgets(h->tcpwin);
+	tcph->urg = nhgets(h->tcpurg);
+	tcph->mss = 0;
+	tcph->ws = 0;
+	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
+
+	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
+	if(*bpp == nil)
+		return -1;
+
+	/* pullupblock may have returned a different block: refresh h */
+	h = (Tcp4hdr *)((*bpp)->rp);
+
+	optr = h->tcpopt;
+	n = hdrlen - TCP4_HDRSIZE;
+	while(n > 0 && *optr != EOLOPT) {
+		if(*optr == NOOPOPT) {
+			n--;
+			optr++;
+			continue;
+		}
+		/* a length byte must follow the option kind */
+		if(n < 2)
+			break;
+		optlen = optr[1];
+		if(optlen < 2 || optlen > n)
+			break;
+		switch(*optr) {
+		case MSSOPT:
+			if(optlen == MSS_LENGTH)
+				tcph->mss = nhgets(optr+2);
+			break;
+		case WSOPT:
+			if(optlen == WS_LENGTH && *(optr+2) <= 14)
+				tcph->ws = HaveWS | *(optr+2);
+			break;
+		}
+		n -= optlen;
+		optr += optlen;
+	}
+	return hdrlen;
+}
+
+/*
+ *  For outgoing calls, generate an initial sequence
+ *  number and put a SYN on the send queue.
+ *  All send pointers start at the new iss, the SYN is counted
+ *  in flgcnt, FORCE is set, and the desired mss and window scale
+ *  are taken from tcpmtu for the local address.
+ */
+static void
+tcpsndsyn(Conv *s, Tcpctl *tcb)
+{
+	/*
+	 *  two 16-bit random halves; the shift is done in ulong so
+	 *  that no bit is shifted into the sign bit of an int
+	 */
+	tcb->iss = ((ulong)nrand(1<<16)<<16)|(ulong)nrand(1<<16);
+	tcb->rttseq = tcb->iss;
+	tcb->snd.wl2 = tcb->iss;
+	tcb->snd.una = tcb->iss;
+	tcb->snd.ptr = tcb->rttseq;
+	tcb->snd.nxt = tcb->rttseq;
+	tcb->flgcnt++;
+	tcb->flags |= FORCE;
+	tcb->sndsyntime = NOW;
+
+	/* set desired mss and scale */
+	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
+}
+
+/*
+ *  Send a reset in reply to the segment described by seg.
+ *  source and dest are the addresses of the offending segment and
+ *  are swapped for the reply, as are its ports; length is added to
+ *  the acknowledged sequence number when the segment carried no ACK.
+ *  reason is only logged.  Nothing is sent in reply to a segment that
+ *  itself carries RST.  seg is overwritten in place with the fields
+ *  of the reset.  Panics on a version other than V4 or V6.
+ */
+void
+sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
+{
+	Block *hbp;
+	uchar rflags;
+	Tcppriv *tpriv;
+	Tcp4hdr ph4;
+	Tcp6hdr ph6;
+
+	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
+
+	tpriv = tcp->priv;
+
+	if(seg->flags & RST)
+		return;
+
+	/* make pseudo header */
+	switch(version) {
+	case V4:
+		memset(&ph4, 0, sizeof(ph4));
+		ph4.vihl = IP_VER4;
+		v6tov4(ph4.tcpsrc, dest);
+		v6tov4(ph4.tcpdst, source);
+		ph4.proto = IP_TCPPROTO;
+		hnputs(ph4.tcplen, TCP4_HDRSIZE);
+		hnputs(ph4.tcpsport, seg->dest);
+		hnputs(ph4.tcpdport, seg->source);
+		break;
+	case V6:
+		memset(&ph6, 0, sizeof(ph6));
+		ph6.vcf[0] = IP_VER6;
+		ipmove(ph6.tcpsrc, dest);
+		ipmove(ph6.tcpdst, source);
+		ph6.proto = IP_TCPPROTO;
+		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
+		hnputs(ph6.tcpsport, seg->dest);
+		hnputs(ph6.tcpdport, seg->source);
+		break;
+	default:
+		panic("sndrst: version %d", version);
+	}
+
+	tpriv->stats[OutRsts]++;
+	rflags = RST;
+
+	/* convince the other end that this reset is in band */
+	if(seg->flags & ACK) {
+		/* take our sequence number from what the peer acknowledged */
+		seg->seq = seg->ack;
+		seg->ack = 0;
+	}
+	else {
+		/* acknowledge everything the segment occupied: SYN, data and FIN */
+		rflags |= ACK;
+		seg->ack = seg->seq;
+		seg->seq = 0;
+		if(seg->flags & SYN)
+			seg->ack++;
+		seg->ack += length;
+		if(seg->flags & FIN)
+			seg->ack++;
+	}
+	seg->flags = rflags;
+	seg->wnd = 0;
+	seg->urg = 0;
+	seg->mss = 0;
+	seg->ws = 0;
+	switch(version) {
+	case V4:
+		hbp = htontcp4(seg, nil, &ph4, nil);
+		if(hbp == nil)
+			return;
+		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
+		break;
+	case V6:
+		hbp = htontcp6(seg, nil, &ph6, nil);
+		if(hbp == nil)
+			return;
+		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
+		break;
+	default:
+		panic("sndrst2: version %d", version);
+	}
+}
+
+/*
+ *  send a reset to the remote side and close the conversation
+ *  called with s qlocked
+ *
+ *  The reset is best effort: it is skipped if there is no remote
+ *  address or no header block could be built, and an error raised
+ *  while sending it is swallowed by the inner waserror().
+ *  The conversation is closed locally either way.
+ *  Returns nil, or commonerror() if an error escapes localclose.
+ */
+static char*
+tcphangup(Conv *s)
+{
+	Tcp seg;
+	Tcpctl *tcb;
+	Block *hbp;
+
+	tcb = (Tcpctl*)s->ptcl;
+	if(waserror())
+		return commonerror();
+	if(ipcmp(s->raddr, IPnoaddr) != 0) {
+		if(!waserror()){
+			seg.flags = RST | ACK;
+			seg.ack = tcb->rcv.nxt;
+			tcb->rcv.una = 0;
+			seg.seq = tcb->snd.ptr;
+			seg.wnd = 0;
+			seg.urg = 0;
+			seg.mss = 0;
+			seg.ws = 0;
+			switch(s->ipversion) {
+			case V4:
+				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
+				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
+				/* like every other caller, don't hand a nil block to ip */
+				if(hbp != nil)
+					ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
+				break;
+			case V6:
+				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
+				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
+				if(hbp != nil)
+					ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
+				break;
+			default:
+				panic("tcphangup: version %d", s->ipversion);
+			}
+			poperror();
+		}
+	}
+	localclose(s, nil);
+	poperror();
+	return nil;
+}
+
+/*
+ *  (re)send a SYN ACK for the half-open call described by lp.
+ *
+ *  Sets lp->sndscale: our window scale if the peer offered one
+ *  (lp->rcvscale != 0), otherwise 0.  On success records the send
+ *  time in lp->lastsend and returns 0; returns -1 if no header
+ *  block could be built.
+ */
+static int
+sndsynack(Proto *tcp, Limbo *lp)
+{
+	Block *hbp;
+	Tcp4hdr ph4;
+	Tcp6hdr ph6;
+	Tcp seg;
+	int scale;
+
+	/* make pseudo header */
+	switch(lp->version) {
+	case V4:
+		memset(&ph4, 0, sizeof(ph4));
+		ph4.vihl = IP_VER4;
+		v6tov4(ph4.tcpsrc, lp->laddr);
+		v6tov4(ph4.tcpdst, lp->raddr);
+		ph4.proto = IP_TCPPROTO;
+		hnputs(ph4.tcplen, TCP4_HDRSIZE);
+		hnputs(ph4.tcpsport, lp->lport);
+		hnputs(ph4.tcpdport, lp->rport);
+		break;
+	case V6:
+		memset(&ph6, 0, sizeof(ph6));
+		ph6.vcf[0] = IP_VER6;
+		ipmove(ph6.tcpsrc, lp->laddr);
+		ipmove(ph6.tcpdst, lp->raddr);
+		ph6.proto = IP_TCPPROTO;
+		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
+		hnputs(ph6.tcpsport, lp->lport);
+		hnputs(ph6.tcpdport, lp->rport);
+		break;
+	default:
+		/* name this function, not sndrst, so the panic can be traced */
+		panic("sndsynack: version %d", lp->version);
+	}
+
+	seg.seq = lp->iss;
+	seg.ack = lp->irs+1;
+	seg.flags = SYN|ACK;
+	seg.urg = 0;
+	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
+	seg.wnd = QMAX;
+
+	/* if the other side set scale, we should too */
+	if(lp->rcvscale){
+		seg.ws = scale;
+		lp->sndscale = scale;
+	} else {
+		seg.ws = 0;
+		lp->sndscale = 0;
+	}
+
+	switch(lp->version) {
+	case V4:
+		hbp = htontcp4(&seg, nil, &ph4, nil);
+		if(hbp == nil)
+			return -1;
+		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
+		break;
+	case V6:
+		hbp = htontcp6(&seg, nil, &ph6, nil);
+		if(hbp == nil)
+			return -1;
+		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
+		break;
+	default:
+		panic("sndsynack2: version %d", lp->version);
+	}
+	lp->lastsend = NOW;
+	return 0;
+}
+
+/* limbo hash: last two address bytes plus the port, masked to the table size */
+#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
+
+/*
+ *  put a call into limbo and respond with a SYN ACK
+ *
+ *  called with proto locked
+ *
+ *  s is the listening conversation; source/dest and seg describe
+ *  the incoming SYN.  A repeated SYN for a call already in limbo
+ *  only refreshes its irs.  When limbo is full, the entry at the
+ *  head of this hash chain is recycled instead of allocating.
+ *  If the SYN ACK cannot be sent the entry is dropped again.
+ */
+static void
+limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
+{
+	Limbo *lp, **l;
+	Tcppriv *tpriv;
+	int h;
+
+	tpriv = s->p->priv;
+	h = hashipa(source, seg->source);
+
+	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
+		lp = *l;
+		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
+			continue;
+		if(ipcmp(lp->raddr, source) != 0)
+			continue;
+		if(ipcmp(lp->laddr, dest) != 0)
+			continue;
+
+		/* each new SYN restarts the retransmits */
+		lp->irs = seg->seq;
+		break;
+	}
+	/* l is now the link to the matching entry, or the chain's tail link */
+	lp = *l;
+	if(lp == nil){
+		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
+			/* full: unlink the head of the chain and reuse it */
+			lp = tpriv->lht[h];
+			tpriv->lht[h] = lp->next;
+			lp->next = nil;
+			/*
+			 *  if lp was the only entry, l is &lp->next and
+			 *  storing through it would link lp to itself and
+			 *  lose it from the table; the tail is now the head.
+			 */
+			if(l == &lp->next)
+				l = &tpriv->lht[h];
+			/* this is a new call; don't inherit the old retransmit count */
+			lp->rexmits = 0;
+		} else {
+			lp = malloc(sizeof(*lp));
+			if(lp == nil)
+				return;
+			tpriv->nlimbo++;
+		}
+		*l = lp;
+		lp->version = version;
+		ipmove(lp->laddr, dest);
+		ipmove(lp->raddr, source);
+		lp->lport = seg->dest;
+		lp->rport = seg->source;
+		lp->mss = seg->mss;
+		lp->rcvscale = seg->ws;
+		lp->irs = seg->seq;
+		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
+	}
+
+	if(sndsynack(s->p, lp) < 0){
+		*l = lp->next;
+		tpriv->nlimbo--;
+		free(lp);
+	}
+}
+
+/*
+ *  resend SYN ACK's once every SYNACK_RXTIMER ms.
+ *
+ *  Walks every limbo hash chain.  An entry whose retransmit
+ *  interval has elapsed is retransmitted, or dropped after more
+ *  than 5 retransmits or if the send fails.  Does nothing if the
+ *  proto lock cannot be taken without blocking.
+ */
+static void
+limborexmit(Proto *tcp)
+{
+	Tcppriv *tpriv;
+	Limbo **l, *lp;
+	int h;
+	int seen;
+	ulong now;
+
+	tpriv = tcp->priv;
+
+	if(!canqlock(tcp))
+		return;
+	seen = 0;	/* live entries passed so far; lets us stop early */
+	now = NOW;
+	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
+		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
+			lp = *l;
+
+			/* not due yet: step past it, don't look at it again */
+			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER){
+				seen++;
+				l = &lp->next;
+				continue;
+			}
+
+			/* give up after too many retransmits */
+			if(++(lp->rexmits) > 5){
+				tpriv->nlimbo--;
+				*l = lp->next;
+				free(lp);
+				continue;
+			}
+
+			/* if we're being attacked, don't bother resending SYN ACK's */
+			if(tpriv->nlimbo > 100){
+				seen++;
+				l = &lp->next;
+				continue;
+			}
+
+			if(sndsynack(tcp, lp) < 0){
+				tpriv->nlimbo--;
+				*l = lp->next;
+				free(lp);
+				continue;
+			}
+
+			seen++;
+			l = &lp->next;
+		}
+	}
+	qunlock(tcp);
+}
+
+/*
+ *  A RST arrived at listener s.  If it matches a call in limbo
+ *  (same ports, version and addresses) and its sequence number
+ *  directly follows the SYN, throw that call out.
+ *
+ *  called with proto locked
+ */
+static void
+limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
+{
+	Tcppriv *tpriv;
+	Limbo **l, *lp;
+	int h;
+
+	tpriv = s->p->priv;
+	h = hashipa(src, segp->source);
+
+	/* search the hash chain for the call */
+	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
+		if(lp->lport == segp->dest && lp->rport == segp->source && lp->version == version
+		&& ipcmp(lp->laddr, dst) == 0
+		&& ipcmp(lp->raddr, src) == 0){
+			/* RST can only follow the SYN */
+			if(segp->seq == lp->irs+1){
+				tpriv->nlimbo--;
+				*l = lp->next;
+				free(lp);
+			}
+			break;
+		}
+	}
+}
+
+/*
+ *  come here when we finally get an ACK to our SYN-ACK.
+ *  lookup call in limbo.  if found, create a new conversation
+ *
+ *  called with proto locked
+ *
+ *  s is the listener.  Returns the new conversation, already set
+ *  to Established and entered in the conversation hash table, or
+ *  nil if the segment is not a bare ACK, matches no call in limbo,
+ *  carries the wrong sequence/ack numbers, or no conversation
+ *  could be created.
+ */
+static Conv*
+tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
+{
+	Conv *new;
+	Tcpctl *tcb;
+	Tcppriv *tpriv;
+	Tcp4hdr *h4;
+	Tcp6hdr *h6;
+	Limbo *lp, **l;
+	int h;
+
+	/* unless it's just an ack, it can't be someone coming out of limbo */
+	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
+		return nil;
+
+	tpriv = s->p->priv;
+
+	/* find a call in limbo */
+	h = hashipa(src, segp->source);
+	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
+		netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n",
+			src, segp->source, lp->raddr, lp->rport,
+			dst, segp->dest, lp->laddr, lp->lport,
+			version, lp->version
+		);
+
+		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
+			continue;
+		if(ipcmp(lp->laddr, dst) != 0)
+			continue;
+		if(ipcmp(lp->raddr, src) != 0)
+			continue;
+
+		/* we're assuming no data with the initial SYN */
+		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
+			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
+				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
+			lp = nil;
+		} else {
+			/* take it out of limbo; from here on we own lp */
+			tpriv->nlimbo--;
+			*l = lp->next;
+		}
+		break;
+	}
+	if(lp == nil)
+		return nil;
+
+	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
+	if(new == nil){
+		/* lp is already unlinked; free it or it is lost for good */
+		free(lp);
+		return nil;
+	}
+
+	/* start from a copy of the listener's control block */
+	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
+	tcb = (Tcpctl*)new->ptcl;
+	tcb->flags &= ~CLONE;
+	tcb->timer.arg = new;
+	tcb->timer.state = TcptimerOFF;
+	tcb->acktimer.arg = new;
+	tcb->acktimer.state = TcptimerOFF;
+	tcb->katimer.arg = new;
+	tcb->katimer.state = TcptimerOFF;
+	tcb->rtt_timer.arg = new;
+	tcb->rtt_timer.state = TcptimerOFF;
+
+	tcb->irs = lp->irs;
+	tcb->rcv.nxt = tcb->irs+1;
+	tcb->rcv.urg = tcb->rcv.nxt;
+
+	/* our SYN has been sent and acked, so everything sits at iss+1 */
+	tcb->iss = lp->iss;
+	tcb->rttseq = tcb->iss;
+	tcb->snd.wl2 = tcb->iss;
+	tcb->snd.una = tcb->iss+1;
+	tcb->snd.ptr = tcb->iss+1;
+	tcb->snd.nxt = tcb->iss+1;
+	tcb->flgcnt = 0;
+	tcb->flags |= SYNACK;
+
+	/* our sending max segment size cannot be bigger than what he asked for */
+	if(lp->mss != 0 && lp->mss < tcb->mss)
+		tcb->mss = lp->mss;
+
+	/* window scaling */
+	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
+
+	/* the congestion window always starts out as a single segment */
+	tcb->snd.wnd = segp->wnd;
+	tcb->cwind = tcb->mss;
+
+	/* set initial round trip time */
+	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
+	tcpsynackrtt(new);
+
+	free(lp);
+
+	/* set up proto header */
+	switch(version){
+	case V4:
+		h4 = &tcb->protohdr.tcp4hdr;
+		memset(h4, 0, sizeof(*h4));
+		h4->proto = IP_TCPPROTO;
+		hnputs(h4->tcpsport, new->lport);
+		hnputs(h4->tcpdport, new->rport);
+		v6tov4(h4->tcpsrc, dst);
+		v6tov4(h4->tcpdst, src);
+		break;
+	case V6:
+		h6 = &tcb->protohdr.tcp6hdr;
+		memset(h6, 0, sizeof(*h6));
+		h6->proto = IP_TCPPROTO;
+		hnputs(h6->tcpsport, new->lport);
+		hnputs(h6->tcpdport, new->rport);
+		ipmove(h6->tcpsrc, dst);
+		ipmove(h6->tcpdst, src);
+		break;
+	default:
+		panic("tcpincoming: version %d", new->ipversion);
+	}
+
+	tcpsetstate(new, Established);
+
+	iphtadd(&tpriv->ht, new);
+
+	return new;
+}
+
+/*
+ *  Is x in the closed interval [low, high]?  If low > high the
+ *  interval is taken to wrap around the end of the number space.
+ */
+static int
+seq_within(ulong x, ulong low, ulong high)
+{
+	if(low > high)
+		return x >= low || x <= high;	/* wrapped interval */
+	return low <= x && x <= high;
+}
+
+/* x before y: the difference, taken as a signed int, is negative */
+static int
+seq_lt(ulong x, ulong y)
+{
+	int d;
+
+	d = (int)(x-y);
+	return d < 0;
+}
+
+/* x before or equal to y, by the sign of the int difference */
+static int
+seq_le(ulong x, ulong y)
+{
+	int d;
+
+	d = (int)(x-y);
+	return d <= 0;
+}
+
+/* x after y: the difference, taken as a signed int, is positive */
+static int
+seq_gt(ulong x, ulong y)
+{
+	int d;
+
+	d = (int)(x-y);
+	return d > 0;
+}
+
+/* x after or equal to y, by the sign of the int difference */
+static int
+seq_ge(ulong x, ulong y)
+{
+	int d;
+
+	d = (int)(x-y);
+	return d >= 0;
+}
+
+/*
+ *  Seed the round trip estimate from the time elapsed since
+ *  sndsyntime (when the SYN went out), then stop the round
+ *  trip timer.
+ */
+static void
+tcpsynackrtt(Conv *s)
+{
+	Tcpctl *tcb;
+	int delta;
+
+	tcb = (Tcpctl*)s->ptcl;
+
+	/* time from SYN to now, stored in the scaled srtt/mdev form */
+	delta = NOW - tcb->sndsyntime;
+	tcb->srtt = delta<<LOGAGAIN;
+	tcb->mdev = delta<<LOGDGAIN;
+
+	/* halt round trip timer */
+	tcphalt(s->p->priv, &tcb->rtt_timer);
+}
+
+/*
+ *  Process the acknowledgement and window fields of incoming
+ *  segment *seg for conversation s: count duplicate acks and
+ *  start fast retransmit, update the send window, adjust the
+ *  congestion window, take a round trip sample, discard acked
+ *  data from the write queue and advance snd.una.
+ *
+ *  Callers in this file hold s qlocked.
+ */
+static void
+update(Conv *s, Tcp *seg)
+{
+	int rtt, delta;
+	Tcpctl *tcb;
+	ulong acked;
+	Tcppriv *tpriv;
+
+	tpriv = s->p->priv;
+	tcb = (Tcpctl*)s->ptcl;
+
+	/* ack is beyond anything we have sent: force output and ignore it */
+	if(seq_gt(seg->ack, tcb->snd.nxt)) {
+		tcb->flags |= FORCE;
+		return;
+	}
+
+	/*
+	 *  duplicate ack: same ack as before, data outstanding, no data
+	 *  and no window change in this segment.  note that the else
+	 *  binds to the innermost if: cwind grows by one mss on every
+	 *  duplicate ack other than the one that hits the threshold.
+	 */
+	if(seg->ack == tcb->snd.una)
+	if(tcb->snd.una != tcb->snd.nxt)
+	if(seg->len == 0)
+	if(seg->wnd == tcb->snd.wnd)
+	if(++tcb->snd.dupacks == TCPREXMTTHRESH){
+		tcb->snd.recovery = 1;
+		tcb->snd.rxt = tcb->snd.nxt;
+		tcpcongestion(tcb);
+		tcprxmit(s);
+		tcb->cwind = tcb->ssthresh;
+	} else
+		tcb->cwind += tcb->mss;
+
+	/*
+	 *  update window
+	 */
+	if(seq_gt(seg->ack, tcb->snd.wl2)
+	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
+		tcb->snd.wnd = seg->wnd;
+		tcb->snd.wl2 = seg->ack;
+	}
+
+	/* nothing new acked; we are done after the window update */
+	if(!seq_gt(seg->ack, tcb->snd.una)){
+		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
+			tcb->backedoff = MAXBACKMS/4;
+		return;
+	}
+
+	acked = seg->ack - tcb->snd.una;
+
+	/* avoid slow start and timers for SYN acks */
+	if((tcb->flags & SYNACK) == 0) {
+		tcb->flags |= SYNACK;
+		/* one of the acked sequence numbers was the SYN, not data */
+		acked--;
+		tcb->flgcnt--;
+		goto done;
+	}
+
+	/*
+	 *  congestion control
+	 */
+	if(tcb->snd.recovery){
+		if(seq_ge(seg->ack, tcb->snd.rxt)){
+			/* recovery finished */
+			tcb->snd.dupacks = 0;
+			tcb->snd.recovery = 0;
+			tcb->cwind = (tcb->snd.nxt - seg->ack) + tcb->mss;
+			if(tcb->ssthresh < tcb->cwind)
+				tcb->cwind = tcb->ssthresh;
+		} else {
+			/* partial ack */
+			tcb->cwind -= acked;
+			tcb->cwind += tcb->mss;
+		}
+	} else {
+		tcb->snd.dupacks = 0;
+		if(tcb->cwind < tcb->ssthresh)
+			tcpabcincr(tcb, acked, 2*tcb->mss);	/* slow start */
+		else
+			tcpabcincr(tcb, acked, tcb->cwind);	/* congestion dance */
+	}
+
+	/* never let the congestion window exceed the peer's window */
+	if(tcb->cwind > tcb->snd.wnd)
+		tcb->cwind = tcb->snd.wnd;
+
+	/* Adjust the timers according to the round trip time */
+	/* todo: fix sloppy treatment of overflow cases here. */
+	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
+		tcphalt(tpriv, &tcb->rtt_timer);
+		/* only sample when no retransmission is in progress */
+		if((tcb->flags&RETRAN) == 0) {
+			tcb->backoff = 0;
+			tcb->backedoff = 0;
+			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
+			if(rtt == 0)
+				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
+			rtt *= MSPTICK;
+			if(tcb->srtt == 0) {
+				/* first sample */
+				tcb->srtt = rtt << LOGAGAIN;
+				tcb->mdev = rtt << LOGDGAIN;
+			} else {
+				/* srtt and mdev are kept scaled by LOGAGAIN and LOGDGAIN bits */
+				delta = rtt - (tcb->srtt>>LOGAGAIN);
+				tcb->srtt += delta;
+				if(tcb->srtt <= 0)
+					tcb->srtt = 1;
+
+				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
+				tcb->mdev += delta;
+				if(tcb->mdev <= 0)
+					tcb->mdev = 1;
+			}
+			tcpsettimer(tcb);
+		}
+	}
+
+done:
+	/*
+	 *  drop the acked bytes from the write queue.  if fewer were
+	 *  discarded than acked, flgcnt is decremented (presumably the
+	 *  ack also covered a SYN or FIN - confirm).
+	 */
+	if(qdiscard(s->wq, acked) < acked)
+		tcb->flgcnt--;
+	tcb->snd.una = seg->ack;
+
+	/* newreno fast recovery */
+	if(tcb->snd.recovery)
+		tcprxmit(s);
+
+	/*tcplimitmaxburst(tcb);*/
+
+	if(seq_gt(seg->ack, tcb->snd.urg))
+		tcb->snd.urg = seg->ack;
+
+	/* keep the retransmit timer running only while data is outstanding */
+	if(tcb->snd.una != tcb->snd.nxt)
+		tcpgo(tpriv, &tcb->timer);
+	else
+		tcphalt(tpriv, &tcb->timer);
+
+	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
+		tcb->snd.ptr = tcb->snd.una;
+
+	if(!tcb->snd.recovery)
+		tcb->flags &= ~RETRAN;
+	tcb->backoff = 0;
+	tcb->backedoff = 0;
+}
+
+/*
+ *  Input routine for one received TCP segment in bp (IPv4 or IPv6,
+ *  told apart by the version nibble of the header).
+ *
+ *  Verifies the checksum, unpacks the header into seg and trims the
+ *  block to the length claimed by the datagram.  Then, with the
+ *  proto locked, looks up the conversation; listeners get RST/SYN
+ *  handled through limbo or a new conversation from tcpincoming.
+ *  The rest runs with the conversation locked: the per-state
+ *  processing, delivery of data to s->rq, FIN handling, and
+ *  draining of the resequence queue, ending with tcpoutput().
+ *
+ *  bp is consumed on every path.
+ */
+static void
+tcpiput(Proto *tcp, Ipifc*, Block *bp)
+{
+	Tcp seg;
+	Tcp4hdr *h4;
+	Tcp6hdr *h6;
+	int hdrlen;
+	Tcpctl *tcb;
+	ushort length;
+	uchar source[IPaddrlen], dest[IPaddrlen];
+	Conv *s;
+	Fs *f;
+	Tcppriv *tpriv;
+	uchar version;
+
+	f = tcp->f;
+	tpriv = tcp->priv;
+
+	tpriv->stats[InSegs]++;
+
+	/* same bytes viewed both ways; the version nibble decides which is real */
+	h4 = (Tcp4hdr*)(bp->rp);
+	h6 = (Tcp6hdr*)(bp->rp);
+
+	if((h4->vihl&0xF0)==IP_VER4) {
+		version = V4;
+		length = nhgets(h4->length);
+		v4tov6(dest, h4->tcpdst);
+		v4tov6(source, h4->tcpsrc);
+
+		/* turn the ip header into the pseudo header for the checksum */
+		h4->Unused = 0;
+		hnputs(h4->tcplen, length-TCP4_PKT);
+		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
+			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
+			tpriv->stats[CsumErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "bad tcp proto cksum\n");
+			freeblist(bp);
+			return;
+		}
+
+		/* on failure ntohtcp4 has already disposed of bp */
+		hdrlen = ntohtcp4(&seg, &bp);
+		if(hdrlen < 0){
+			tpriv->stats[HlenErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "bad tcp hdr len\n");
+			return;
+		}
+
+		/* trim the packet to the size claimed by the datagram */
+		length -= hdrlen+TCP4_PKT;
+		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
+		if(bp == nil){
+			tpriv->stats[LenErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "tcp len < 0 after trim\n");
+			return;
+		}
+	}
+	else {
+		int ttl = h6->ttl;
+		int proto = h6->proto;
+
+		version = V6;
+		length = nhgets(h6->ploadlen);
+		ipmove(dest, h6->tcpdst);
+		ipmove(source, h6->tcpsrc);
+
+		/* rewrite the header fields into pseudo header form for the checksum */
+		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
+		h6->ttl = proto;
+		hnputl(h6->vcf, length);
+		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
+			ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) {
+			tpriv->stats[CsumErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "bad tcp proto cksum\n");
+			freeblist(bp);
+			return;
+		}
+		/* put back the fields clobbered for the checksum */
+		h6->ttl = ttl;
+		h6->proto = proto;
+		hnputs(h6->ploadlen, length);
+
+		hdrlen = ntohtcp6(&seg, &bp);
+		if(hdrlen < 0){
+			tpriv->stats[HlenErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "bad tcp hdr len\n");
+			return;
+		}
+
+		/* trim the packet to the size claimed by the datagram */
+		length -= hdrlen;
+		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
+		if(bp == nil){
+			tpriv->stats[LenErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "tcp len < 0 after trim\n");
+			return;
+		}
+	}
+
+	/* lock protocol while searching for a conversation */
+	qlock(tcp);
+
+	/* Look for a matching conversation */
+	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
+	if(s == nil){
+		netlog(f, Logtcp, "iphtlook failed\n");
+reset:
+		/* also reached by goto from the listener code below, proto still locked */
+		qunlock(tcp);
+		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
+		freeblist(bp);
+		return;
+	}
+
+	/* if it's a listener, look for the right flags and get a new conv */
+	tcb = (Tcpctl*)s->ptcl;
+	if(tcb->state == Listen){
+		if(seg.flags & RST){
+			limborst(s, &seg, source, dest, version);
+			qunlock(tcp);
+			freeblist(bp);
+			return;
+		}
+
+		/* if this is a new SYN, put the call into limbo */
+		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
+			limbo(s, source, dest, &seg, version);
+			qunlock(tcp);
+			freeblist(bp);
+			return;
+		}
+
+		/*
+		 *  if there's a matching call in limbo, tcpincoming will
+		 *  return a new conversation for it (tcpincoming sets its
+		 *  state to Established); otherwise answer with a reset
+		 */
+		s = tcpincoming(s, &seg, source, dest, version);
+		if(s == nil)
+			goto reset;
+	}
+
+	/* The rest of the input state machine is run with the control block
+	 * locked and implements the state machine directly out of the RFC.
+	 * Out-of-band data is ignored - it was always a bad idea.
+	 */
+	tcb = (Tcpctl*)s->ptcl;
+	if(waserror()){
+		qunlock(s);
+		nexterror();
+	}
+	/* take the conversation lock before letting go of the proto lock */
+	qlock(s);
+	qunlock(tcp);
+
+	/* fix up window */
+	seg.wnd <<= tcb->rcv.scale;
+
+	/* every input packet in puts off the keep alive time out */
+	tcpsetkacounter(tcb);
+
+	switch(tcb->state) {
+	case Closed:
+		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
+		goto raise;
+	case Syn_sent:
+		if(seg.flags & ACK) {
+			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
+				sndrst(tcp, source, dest, length, &seg, version,
+					 "bad seq in Syn_sent");
+				goto raise;
+			}
+		}
+		if(seg.flags & RST) {
+			if(seg.flags & ACK)
+				localclose(s, Econrefused);
+			goto raise;
+		}
+
+		if(seg.flags & SYN) {
+			procsyn(s, &seg);
+			if(seg.flags & ACK){
+				update(s, &seg);
+				tcpsynackrtt(s);
+				tcpsetstate(s, Established);
+				tcpsetscale(s, tcb, seg.ws, tcb->scale);
+			}
+			else {
+				tcb->time = NOW;
+				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
+			}
+
+			/* data or FIN came with the SYN: fall into the general processing */
+			if(length != 0 || (seg.flags & FIN))
+				break;
+
+			freeblist(bp);
+			goto output;
+		}
+		else
+			freeblist(bp);
+
+		qunlock(s);
+		poperror();
+		return;
+	case Syn_received:
+		/* doesn't matter if it's the correct ack, we're just trying to set timing */
+		if(seg.flags & ACK)
+			tcpsynackrtt(s);
+		break;
+	}
+
+	/*
+	 *  One DOS attack is to open connections to us and then forget about them,
+	 *  thereby tying up a conv at no long term cost to the attacker.
+	 *  This is an attempt to defeat these stateless DOS attacks.  See
+	 *  corresponding code in tcpsendka().
+	 */
+	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
+		if(tcpporthogdefense
+		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
+			print("stateless hog %I.%d->%I.%d f %#ux %#lux - %#lux - %#lux\n",
+				source, seg.source, dest, seg.dest, seg.flags,
+				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
+			localclose(s, "stateless hog");
+		}
+	}
+
+	/* Cut the data to fit the receive window */
+	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
+		netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
+		/* the segment is unusable, but still process its ack */
+		update(s, &seg);
+		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
+			tcphalt(tpriv, &tcb->rtt_timer);
+			tcphalt(tpriv, &tcb->acktimer);
+			tcphalt(tpriv, &tcb->katimer);
+			tcpsetstate(s, Time_wait);
+			tcb->timer.start = MSL2*(1000 / MSPTICK);
+			tcpgo(tpriv, &tcb->timer);
+		}
+		if(!(seg.flags & RST)) {
+			tcb->flags |= FORCE;
+			goto output;
+		}
+		qunlock(s);
+		poperror();
+		return;
+	}
+
+	/* Cannot accept so answer with a rst */
+	if(length && tcb->state == Closed) {
+		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
+		goto raise;
+	}
+
+	/* The segment is beyond the current receive pointer so
+	 * queue the data in the resequence queue
+	 */
+	if(seg.seq != tcb->rcv.nxt)
+	if(length != 0 || (seg.flags & (SYN|FIN))) {
+		update(s, &seg);
+		if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
+			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
+		tcb->flags |= FORCE;
+		goto output;
+	}
+
+	/*
+	 *  keep looping till we've processed this packet plus any
+	 *  adjacent packets in the resequence queue
+	 */
+	for(;;) {
+		if(seg.flags & RST) {
+			if(tcb->state == Established) {
+				tpriv->stats[EstabResets]++;
+				if(tcb->rcv.nxt != seg.seq)
+					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %#lux seq %#lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
+			}
+			localclose(s, Econrefused);
+			goto raise;
+		}
+
+		/* past this point every segment must carry an ack */
+		if((seg.flags&ACK) == 0)
+			goto raise;
+
+		switch(tcb->state) {
+		case Syn_received:
+			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
+				sndrst(tcp, source, dest, length, &seg, version,
+					"bad seq in Syn_received");
+				goto raise;
+			}
+			update(s, &seg);
+			tcpsetstate(s, Established);
+			/* no break: falls into Established, which calls update again */
+		case Established:
+		case Close_wait:
+			update(s, &seg);
+			break;
+		case Finwait1:
+			update(s, &seg);
+			/* nothing left unacked: move on to Finwait2 */
+			if(qlen(s->wq)+tcb->flgcnt == 0){
+				tcphalt(tpriv, &tcb->rtt_timer);
+				tcphalt(tpriv, &tcb->acktimer);
+				tcpsetkacounter(tcb);
+				tcb->time = NOW;
+				tcpsetstate(s, Finwait2);
+				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
+				tcpgo(tpriv, &tcb->katimer);
+			}
+			break;
+		case Finwait2:
+			update(s, &seg);
+			break;
+		case Closing:
+			update(s, &seg);
+			if(qlen(s->wq)+tcb->flgcnt == 0) {
+				tcphalt(tpriv, &tcb->rtt_timer);
+				tcphalt(tpriv, &tcb->acktimer);
+				tcphalt(tpriv, &tcb->katimer);
+				tcpsetstate(s, Time_wait);
+				tcb->timer.start = MSL2*(1000 / MSPTICK);
+				tcpgo(tpriv, &tcb->timer);
+			}
+			break;
+		case Last_ack:
+			update(s, &seg);
+			if(qlen(s->wq)+tcb->flgcnt == 0) {
+				localclose(s, nil);
+				goto raise;
+			}
+			/* no break: falls into Time_wait */
+		case Time_wait:
+			tcb->flags |= FORCE;
+			if(tcb->timer.state != TcptimerON)
+				tcpgo(tpriv, &tcb->timer);
+		}
+
+		/* urgent data is not delivered; the bytes up to the urgent pointer are pulled off bp */
+		if((seg.flags&URG) && seg.urg) {
+			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
+				tcb->rcv.urg = seg.urg + seg.seq;
+				pullblock(&bp, seg.urg);
+			}
+		}
+		else
+		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
+			tcb->rcv.urg = tcb->rcv.nxt;
+
+		if(length == 0) {
+			if(bp != nil)
+				freeblist(bp);
+		}
+		else {
+			switch(tcb->state){
+			default:
+				/* Ignore segment text */
+				if(bp != nil)
+					freeblist(bp);
+				break;
+
+			case Syn_received:
+			case Established:
+			case Finwait1:
+				/* If we still have some data place on
+				 * receive queue
+				 */
+				if(bp) {
+					bp = packblock(bp);
+					if(bp == nil)
+						panic("tcp packblock");
+					qpassnolim(s->rq, bp);
+					bp = nil;
+
+					/*
+					 *  Force an ack every 2 data messages.  This is
+					 *  a hack for rob to make his home system run
+					 *  faster.
+					 *
+					 *  this also keeps the standard TCP congestion
+					 *  control working since it needs an ack every
+					 *  2 max segs worth.  This is not quite that,
+					 *  but under a real stream is equivalent since
+					 *  every packet has a max seg in it.
+					 */
+					if(++(tcb->rcv.una) >= 2)
+						tcb->flags |= FORCE;
+				}
+				tcb->rcv.nxt += length;
+
+				/*
+				 *  update our rcv window
+				 */
+				tcprcvwin(s);
+
+				/*
+				 *  turn on the acktimer if there's something
+				 *  to ack
+				 */
+				if(tcb->acktimer.state != TcptimerON)
+					tcpgo(tpriv, &tcb->acktimer);
+
+				break;
+			case Finwait2:
+				/* no process to read the data, send a reset */
+				if(bp != nil)
+					freeblist(bp);
+				sndrst(tcp, source, dest, length, &seg, version,
+					"send to Finwait2");
+				qunlock(s);
+				poperror();
+				return;
+			}
+		}
+
+		if(seg.flags & FIN) {
+			tcb->flags |= FORCE;
+
+			/* the FIN takes one sequence number in the states that accept it */
+			switch(tcb->state) {
+			case Syn_received:
+			case Established:
+				tcb->rcv.nxt++;
+				tcpsetstate(s, Close_wait);
+				break;
+			case Finwait1:
+				tcb->rcv.nxt++;
+				if(qlen(s->wq)+tcb->flgcnt == 0) {
+					tcphalt(tpriv, &tcb->rtt_timer);
+					tcphalt(tpriv, &tcb->acktimer);
+					tcphalt(tpriv, &tcb->katimer);
+					tcpsetstate(s, Time_wait);
+					tcb->timer.start = MSL2*(1000/MSPTICK);
+					tcpgo(tpriv, &tcb->timer);
+				}
+				else
+					tcpsetstate(s, Closing);
+				break;
+			case Finwait2:
+				tcb->rcv.nxt++;
+				tcphalt(tpriv, &tcb->rtt_timer);
+				tcphalt(tpriv, &tcb->acktimer);
+				tcphalt(tpriv, &tcb->katimer);
+				tcpsetstate(s, Time_wait);
+				tcb->timer.start = MSL2 * (1000/MSPTICK);
+				tcpgo(tpriv, &tcb->timer);
+				break;
+			case Close_wait:
+			case Closing:
+			case Last_ack:
+				break;
+			case Time_wait:
+				tcpgo(tpriv, &tcb->timer);
+				break;
+			}
+		}
+
+		/*
+		 *  get next adjacent segment from the resequence queue.
+		 *  dump/trim any overlapping segments
+		 */
+		for(;;) {
+			if(tcb->reseq == nil)
+				goto output;
+
+			/* the next queued segment starts beyond rcv.nxt: still a gap */
+			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
+				goto output;
+
+			getreseq(tcb, &seg, &bp, &length);
+
+			if(tcptrim(tcb, &seg, &bp, &length) == 0)
+				break;
+		}
+	}
+output:
+	/* normal exit: send whatever is now due, bp has been consumed */
+	tcpoutput(s);
+	qunlock(s);
+	poperror();
+	return;
+raise:
+	/* abnormal exit: drop the segment and call tcpkick */
+	qunlock(s);
+	poperror();
+	freeblist(bp);
+	tcpkick(s);
+}
+
+/*
+ *  always enters and exits with the s locked.  We drop
+ *  the lock to ipoput the packet so some care has to be
+ *  taken by callers.
+ *
+ *  Sends as many segments as the congestion window, the peer's
+ *  window and the queued data allow, at most 100 per call.  With
+ *  FORCE set in tcb->flags one segment goes out even if there is
+ *  nothing to send.  Nothing is sent in the Listen, Closed and
+ *  Finwait2 states.
+ */
+static void
+tcpoutput(Conv *s)
+{
+	Tcp seg;
+	int msgs;
+	Tcpctl *tcb;
+	Block *hbp, *bp;
+	int sndcnt;
+	ulong ssize, dsize, sent;
+	Fs *f;
+	Tcppriv *tpriv;
+	uchar version;
+
+	f = s->p->f;
+	tpriv = s->p->priv;
+	version = s->ipversion;
+
+	for(msgs = 0; msgs < 100; msgs++) {
+		tcb = (Tcpctl*)s->ptcl;
+
+		switch(tcb->state) {
+		case Listen:
+		case Closed:
+		case Finwait2:
+			return;
+		}
+
+		/* Don't send anything else until our SYN has been acked */
+		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
+			break;
+
+		/* force an ack when a window has opened up */
+		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
+			tcb->rcv.blocked = 0;
+			tcb->flags |= FORCE;
+		}
+
+		/*
+		 *  sndcnt: queued bytes plus flgcnt
+		 *  sent:   sequence space between snd.una and snd.ptr
+		 *  ssize:  sequence space this segment will consume
+		 */
+		sndcnt = qlen(s->wq)+tcb->flgcnt;
+		sent = tcb->snd.ptr - tcb->snd.una;
+		ssize = sndcnt;
+		if(tcb->snd.wnd == 0){
+			/* zero window probe */
+			if(sent > 0)
+			if(!(tcb->flags & FORCE))
+				break;	/* already probing, rto re-probes */
+			ssize -= sent;
+			if(ssize > 0)
+				ssize = 1;
+		} else {
+			/* calculate usable segment size */
+			if(ssize > tcb->cwind)
+				ssize = tcb->cwind;
+			if(ssize > tcb->snd.wnd)
+				ssize = tcb->snd.wnd;
+
+			if(ssize < sent)
+				ssize = 0;
+			else {
+				ssize -= sent;
+				if(ssize > tcb->mss)
+					ssize = tcb->mss;
+			}
+		}
+
+		/* dsize: how many bytes to copy from the write queue */
+		dsize = ssize;
+		seg.urg = 0;
+
+		if(!(tcb->flags & FORCE)){
+			if(ssize == 0)
+				break;
+			/*
+			 *  hold back a short segment when nothing is being
+			 *  retransmitted and a lot is already outstanding
+			 */
+			if(ssize < tcb->mss)
+			if(tcb->snd.nxt == tcb->snd.ptr)
+			if(sent > TCPREXMTTHRESH*tcb->mss)
+				break;
+		}
+
+		tcb->flags &= ~FORCE;
+		tcprcvwin(s);
+
+		/* By default we will generate an ack */
+		tcphalt(tpriv, &tcb->acktimer);
+		tcb->rcv.una = 0;
+		seg.source = s->lport;
+		seg.dest = s->rport;
+		seg.flags = ACK;
+		seg.mss = 0;
+		seg.ws = 0;
+		switch(tcb->state){
+		case Syn_sent:
+			seg.flags = 0;
+			if(tcb->snd.ptr == tcb->iss){
+				seg.flags |= SYN;
+				/* one unit of ssize is the SYN, not data */
+				dsize--;
+				seg.mss = tcb->mss;
+				seg.ws = tcb->scale;
+			}
+			break;
+		case Syn_received:
+			/*
+			 *  don't send any data with a SYN/ACK packet
+			 *  because Linux rejects the packet in its
+			 *  attempt to solve the SYN attack problem
+			 */
+			if(tcb->snd.ptr == tcb->iss){
+				seg.flags |= SYN;
+				dsize = 0;
+				ssize = 1;
+				seg.mss = tcb->mss;
+				seg.ws = tcb->scale;
+			}
+			break;
+		}
+		seg.seq = tcb->snd.ptr;
+		seg.ack = tcb->rcv.nxt;
+		seg.wnd = tcb->rcv.wnd;
+
+		/* Pull out data to send */
+		bp = nil;
+		if(dsize != 0) {
+			bp = qcopy(s->wq, dsize, sent);
+			/* fewer bytes than asked for: the missing one is treated as the FIN */
+			if(BLEN(bp) != dsize) {
+				seg.flags |= FIN;
+				dsize--;
+			}
+		}
+
+		/* this segment empties the queue: push */
+		if(sent+dsize == sndcnt)
+			seg.flags |= PSH;
+
+		tcb->snd.ptr += ssize;
+
+		/* Pull up the send pointer so we can accept acks
+		 * for this window
+		 */
+		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
+			tcb->snd.nxt = tcb->snd.ptr;
+
+		/* Build header, link data and compute cksum */
+		switch(version){
+		case V4:
+			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
+			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
+			if(hbp == nil) {
+				freeblist(bp);
+				return;
+			}
+			break;
+		case V6:
+			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
+			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
+			if(hbp == nil) {
+				freeblist(bp);
+				return;
+			}
+			break;
+		default:
+			hbp = nil;	/* to suppress a warning */
+			panic("tcpoutput: version %d", version);
+		}
+
+		/* Start the transmission timers if there is new data and we
+		 * expect acknowledges
+		 */
+		if(ssize != 0){
+			if(tcb->timer.state != TcptimerON)
+				tcpgo(tpriv, &tcb->timer);
+
+			/*  If round trip timer isn't running, start it.
+			 *  measure the longest packet only in case the
+			 *  transmission time dominates RTT
+			 */
+			if(tcb->rtt_timer.state != TcptimerON)
+			if(ssize == tcb->mss) {
+				tcpgo(tpriv, &tcb->rtt_timer);
+				tcb->rttseq = tcb->snd.ptr;
+			}
+		}
+
+		tpriv->stats[OutSegs]++;
+
+		/* put off the next keep alive */
+		tcpgo(tpriv, &tcb->katimer);
+
+		switch(version){
+		case V4:
+			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
+				/* a negative return means no route */
+				localclose(s, "no route");
+			}
+			break;
+		case V6:
+			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
+				/* a negative return means no route */
+				localclose(s, "no route");
+			}
+			break;
+		default:
+			panic("tcpoutput2: version %d", version);
+		}
+	}
+}
+
+/*
+ *  Send a keep alive probe, BSD style: an ACK|PSH segment whose
+ *  sequence number lies before snd.una, carrying one byte, or a
+ *  FIN and no data when in Finwait2.  With tcpporthogdefense set
+ *  the sequence number is pushed far below snd.una instead.
+ */
+static void
+tcpsendka(Conv *s)
+{
+	Tcpctl *tcb;
+	Tcp seg;
+	Block *hbp, *dbp;
+	int v4;
+
+	tcb = (Tcpctl*)s->ptcl;
+
+	seg.source = s->lport;
+	seg.dest = s->rport;
+	seg.flags = ACK|PSH;
+	seg.urg = 0;
+	seg.mss = 0;
+	seg.ws = 0;
+	seg.seq = tcpporthogdefense ? tcb->snd.una-(1<<30)-nrand(1<<20) : tcb->snd.una-1;
+	seg.ack = tcb->rcv.nxt;
+	tcb->rcv.una = 0;
+	seg.wnd = tcb->rcv.wnd;
+
+	dbp = nil;
+	if(tcb->state != Finwait2){
+		/* one byte of payload */
+		dbp = allocb(1);
+		dbp->wp++;
+	} else
+		seg.flags |= FIN;
+
+	/* Build header, link data and compute cksum */
+	v4 = isv4(s->raddr);
+	if(v4) {
+		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
+		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
+	} else {
+		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
+		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
+	}
+	if(hbp == nil) {
+		freeblist(dbp);
+		return;
+	}
+
+	if(v4)
+		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
+	else
+		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
+}
+
+/*
+ *  Reload the keep alive counter: the number of keep alive timer
+ *  periods that fit in 12 minutes, but never fewer than 3.
+ */
+static void
+tcpsetkacounter(Tcpctl *tcb)
+{
+	enum {
+		Kalivems	= 12 * 60 * 1000,	/* 12 minutes in ms */
+		Kalivemin	= 3,
+	};
+
+	tcb->kacounter = Kalivems / (tcb->katimer.start*MSPTICK);
+	if(tcb->kacounter < Kalivemin)
+		tcb->kacounter = Kalivemin;
+}
+
+/*
+ *  keepalive timer callback; v is the Conv.
+ *  if the connection is not Closed, use up one of the remaining
+ *  keepalive periods: when none is left close the connection
+ *  with Etimedout, otherwise send a probe and restart the timer.
+ */
+static void
+tcpkeepalive(void *v)
+{
+	Conv *s;
+	Tcpctl *tcb;
+
+	s = v;
+	tcb = (Tcpctl*)s->ptcl;
+	if(waserror()){
+		qunlock(s);
+		nexterror();
+	}
+	qlock(s);
+	if(tcb->state != Closed){
+		tcb->kacounter--;
+		if(tcb->kacounter > 0){
+			tcpsendka(s);
+			tcpgo(s->p->priv, &tcb->katimer);
+		} else
+			localclose(s, Etimedout);
+	}
+	qunlock(s);
+	poperror();
+}
+
+/*
+ *  start keepalive timer (the "keepalive" ctl request).
+ *
+ *  f[1], when present (n > 1), is parsed with atoi and, if it is
+ *  at least MSPTICK, divided by MSPTICK to give the new timer
+ *  period in ticks; smaller values leave the period unchanged.
+ *  the keepalive budget is then recomputed and the timer started.
+ *
+ *  returns nil on success, or an error string when the
+ *  connection is not in the Established state.
+ */
+static char*
+tcpstartka(Conv *s, char **f, int n)
+{
+	Tcpctl *tcb;
+	int x;
+
+	tcb = (Tcpctl*)s->ptcl;
+	if(tcb->state != Established)
+		return "connection must be in Established state";
+	if(n > 1){
+		x = atoi(f[1]);
+		if(x >= MSPTICK)
+			tcb->katimer.start = x/MSPTICK;
+	}
+	tcpsetkacounter(tcb);
+	tcpgo(s->p->priv, &tcb->katimer);
+
+	return nil;
+}
+
+/*
+ *  turn checksums on/off (the "checksum" ctl request).
+ *
+ *  f[1] is parsed with atoi: zero sets tcb->nochecksum,
+ *  anything else clears it.  n is the number of fields in f;
+ *  a request without an argument is rejected instead of
+ *  reading past the supplied fields.
+ *
+ *  returns nil on success or an error string.
+ */
+static char*
+tcpsetchecksum(Conv *s, char **f, int n)
+{
+	Tcpctl *tcb;
+
+	/* f[1] is only meaningful when two fields were supplied */
+	if(n < 2)
+		return "checksum: missing argument";
+
+	tcb = (Tcpctl*)s->ptcl;
+	tcb->nochecksum = !atoi(f[1]);
+
+	return nil;
+}
+
+/*
+ *  retransmit (at most) one segment at snd.una.
+ *  cwind and snd.ptr are narrowed to one mss starting at snd.una
+ *  for the duration of the tcpoutput call and put back afterwards.
+ */
+static void
+tcprxmit(Conv *s)
+{
+	Tcpctl *tcb;
+	ulong savedptr, savedcwind;
+
+	tcb = (Tcpctl*)s->ptcl;
+	tcb->flags |= RETRAN|FORCE;
+
+	/* remember what we are about to overwrite */
+	savedcwind = tcb->cwind;
+	savedptr = tcb->snd.ptr;
+
+	tcb->cwind = tcb->mss;
+	tcb->snd.ptr = tcb->snd.una;
+	tcpoutput(s);
+
+	tcb->snd.ptr = savedptr;
+	tcb->cwind = savedcwind;
+}
+
+/*
+ *  retransmit timer callback; arg is the Conv.
+ *
+ *  Closed: nothing to do.
+ *  Time_wait: close the conversation.
+ *  any other state: account for the time backed off so far and
+ *  close with Etimedout once it reaches MAXBACKMS (half that in
+ *  Syn_sent); otherwise recompute the timer, retransmit from
+ *  snd.una and fall back to a one mss congestion window.
+ *
+ *  todo: RFC 4138 F-RTO
+ */
+static void
+tcptimeout(void *arg)
+{
+	int limit;
+	Conv *s;
+	Tcpctl *tcb;
+	Tcppriv *tpriv;
+
+	s = (Conv*)arg;
+	tcb = (Tcpctl*)s->ptcl;
+	tpriv = s->p->priv;
+
+	if(waserror()){
+		qunlock(s);
+		nexterror();
+	}
+	qlock(s);
+	switch(tcb->state){
+	case Closed:
+		break;
+	case Time_wait:
+		localclose(s, nil);
+		break;
+	default:
+		tcb->backoff++;
+		limit = MAXBACKMS;
+		if(tcb->state == Syn_sent)
+			limit = MAXBACKMS/2;
+		tcb->backedoff += tcb->timer.start * MSPTICK;
+		if(tcb->backedoff >= limit){
+			/* backed off for too long: give up */
+			localclose(s, Etimedout);
+			break;
+		}
+		netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n",
+			tcb->snd.una, tcb->timer.start, NOW);
+		tcpsettimer(tcb);
+		tcpcongestion(tcb);
+		tcprxmit(s);
+		/* tcprxmit put these back; now restart from snd.una for real */
+		tcb->snd.ptr = tcb->snd.una;
+		tcb->cwind = tcb->mss;
+		tpriv->stats[RetransTimeouts]++;
+		tcb->snd.dupacks = 0;
+		tcb->snd.recovery = 0;
+		break;
+	}
+	qunlock(s);
+	poperror();
+}
+
+/*
+ *  report whether seq lies in the receive window, that is in
+ *  the range rcv.nxt .. rcv.nxt+rcv.wnd-1 as judged by seq_within.
+ */
+static int
+inwindow(Tcpctl *tcb, int seq)
+{
+	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
+}
+
+/*
+ *  set up state for a received SYN (or SYN ACK) packet:
+ *  record the peer's initial sequence number, clamp our mss to
+ *  the one the peer offered, take its window and pick the
+ *  initial congestion window.
+ */
+static void
+procsyn(Conv *s, Tcp *seg)
+{
+	Tcpctl *tcb;
+
+	tcb = (Tcpctl*)s->ptcl;
+	tcb->flags |= FORCE;
+
+	tcb->irs = seg->seq;
+	tcb->rcv.nxt = seg->seq + 1;
+	tcb->rcv.urg = tcb->rcv.nxt;
+
+	/* our sending max segment size cannot be bigger than what he asked for */
+	if(seg->mss != 0 && seg->mss < tcb->mss)
+		tcb->mss = seg->mss;
+
+	tcb->snd.wnd = seg->wnd;
+
+	/* RFC 3390 initial window */
+	if(tcb->mss >= 2190)
+		tcb->cwind = 2*tcb->mss;
+	else if(tcb->mss >= 1095)
+		tcb->cwind = 4380;
+	else
+		tcb->cwind = 4*tcb->mss;
+}
+
+/*
+ *  add an out of order segment (header seg, data bp of the given
+ *  length) to tcb's resequence queue, which is kept sorted by
+ *  starting sequence number.  bp is always consumed.
+ *
+ *  returns 0 normally (including when the Reseq cannot be
+ *  allocated and the data is simply dropped), and -1 when the
+ *  queue was found to exceed QMAX<<rcv.scale bytes and has been
+ *  thrown away in its entirety.
+ */
+static int
+addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
+{
+	Reseq *rp, *rp1;
+	int i, rqlen, qmax;
+
+	rp = malloc(sizeof(Reseq));
+	if(rp == nil){
+		freeblist(bp);	/* bp is always consumed by addreseq */
+		return 0;
+	}
+
+	rp->seg = *seg;
+	rp->bp = bp;
+	rp->length = length;
+
+	/* Place on reassembly list sorting by starting seq number */
+	rp1 = tcb->reseq;
+	if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
+		/* new head of the list; the size check below is not reached */
+		rp->next = rp1;
+		tcb->reseq = rp;
+		if(rp->next != nil)
+			tpriv->stats[OutOfOrder]++;
+		return 0;
+	}
+
+	/*
+	 * walk to the insertion point.
+	 * NOTE(review): rqlen only sums the entries up to the
+	 * insertion point, not the new entry or those after it.
+	 */
+	rqlen = 0;
+	for(i = 0;; i++) {
+		rqlen += rp1->length;
+		if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
+			rp->next = rp1->next;
+			rp1->next = rp;
+			if(rp->next != nil)
+				tpriv->stats[OutOfOrder]++;
+			break;
+		}
+		rp1 = rp1->next;
+	}
+	qmax = QMAX<<tcb->rcv.scale;
+	if(rqlen > qmax){
+		/* dump the first dozen or so entries for diagnosis */
+		print("resequence queue > window: %d > %d\n", rqlen, qmax);
+		i = 0;
+	  	for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
+	  		print("%#lux %#lux %#ux\n", rp1->seg.seq,
+	  			rp1->seg.ack, rp1->seg.flags);
+			if(i++ > 10){
+				print("...\n");
+				break;
+			}
+		}
+
+		/*
+		 * delete entire reassembly queue; wait for retransmit.
+		 * - should we be smarter and only delete the tail?
+		 */
+		for(rp = tcb->reseq; rp != nil; rp = rp1){
+			rp1 = rp->next;
+			freeblist(rp->bp);
+			free(rp);
+		}
+		tcb->reseq = nil;
+
+	  	return -1;
+	}
+	return 0;
+}
+
+/*
+ *  pop the first entry off tcb's resequence queue, handing its
+ *  segment header, data and length back through seg, bp and
+ *  length.  the outputs are left untouched if the queue is empty.
+ */
+static void
+getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
+{
+	Reseq *head;
+
+	head = tcb->reseq;
+	if(head != nil){
+		tcb->reseq = head->next;
+
+		*length = head->length;
+		*bp = head->bp;
+		*seg = head->seg;
+
+		free(head);
+	}
+}
+
+/*
+ *  trim an incoming segment so that it fits the receive window.
+ *
+ *  seg, *bp and *length describe the segment and are updated in
+ *  place: data (and a SYN) before rcv.nxt is dropped from the
+ *  front, data beyond rcv.nxt+rcv.wnd is cut from the tail
+ *  (which also clears FIN).  trimmed octets are counted in
+ *  tcb->rerecv.
+ *
+ *  returns 0 if the segment is acceptable, or -1 if no part of
+ *  it lies in the window, in which case *bp has been freed.
+ */
+static int
+tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
+{
+	ushort len;
+	uchar accept;
+	int dupcnt, excess;
+
+	accept = 0;
+	/* len is the sequence space used: data plus SYN and FIN */
+	len = *length;
+	if(seg->flags & SYN)
+		len++;
+	if(seg->flags & FIN)
+		len++;
+
+	if(tcb->rcv.wnd == 0) {
+		/* closed window: only an empty segment at rcv.nxt is accepted */
+		if(len == 0 && seg->seq == tcb->rcv.nxt)
+			return 0;
+	}
+	else {
+		/* Some part of the segment should be in the window */
+		if(inwindow(tcb,seg->seq))
+			accept++;
+		else
+		if(len != 0) {
+			if(inwindow(tcb, seg->seq+len-1) ||
+			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
+				accept++;
+		}
+	}
+	if(!accept) {
+		freeblist(*bp);
+		return -1;
+	}
+	/* drop what we have already received from the front */
+	dupcnt = tcb->rcv.nxt - seg->seq;
+	if(dupcnt > 0){
+		tcb->rerecv += dupcnt;
+		if(seg->flags & SYN){
+			/* the SYN takes the first duplicate sequence number */
+			seg->flags &= ~SYN;
+			seg->seq++;
+
+			if(seg->urg > 1)
+				seg->urg--;
+			else
+				seg->flags &= ~URG;
+			dupcnt--;
+		}
+		if(dupcnt > 0){
+			pullblock(bp, (ushort)dupcnt);
+			seg->seq += dupcnt;
+			*length -= dupcnt;
+
+			if(seg->urg > dupcnt)
+				seg->urg -= dupcnt;
+			else {
+				seg->flags &= ~URG;
+				seg->urg = 0;
+			}
+		}
+	}
+	/* cut off what extends past the right edge of the window */
+	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
+	if(excess > 0) {
+		tcb->rerecv += excess;
+		*length -= excess;
+		*bp = trimblock(*bp, 0, *length);
+		if(*bp == nil)
+			panic("presotto is a boofhead");
+		seg->flags &= ~FIN;
+	}
+	return 0;
+}
+
+/*
+ *  handle advice (msg) about a packet we sent; bp holds that
+ *  packet's ip and tcp headers, so its destination is our remote
+ *  side and its source our local side.
+ *  the matching conversation, if any and if still in Syn_sent,
+ *  is closed with msg.  bp is always freed.
+ */
+static void
+tcpadvise(Proto *tcp, Block *bp, char *msg)
+{
+	Tcp4hdr *h4;
+	Tcp6hdr *h6;
+	Tcpctl *tcb;
+	uchar source[IPaddrlen];
+	uchar dest[IPaddrlen];
+	ushort psource, pdest;
+	Conv *s, **p;
+
+	h4 = (Tcp4hdr*)(bp->rp);
+	h6 = (Tcp6hdr*)(bp->rp);
+
+	if((h4->vihl&0xF0) != IP_VER4) {
+		ipmove(dest, h6->tcpdst);
+		ipmove(source, h6->tcpsrc);
+		psource = nhgets(h6->tcpsport);
+		pdest = nhgets(h6->tcpdport);
+	}
+	else {
+		v4tov6(dest, h4->tcpdst);
+		v4tov6(source, h4->tcpsrc);
+		psource = nhgets(h4->tcpsport);
+		pdest = nhgets(h4->tcpdport);
+	}
+
+	/* Look for a connection */
+	qlock(tcp);
+	for(p = tcp->conv; (s = *p) != nil; p++) {
+		tcb = (Tcpctl*)s->ptcl;
+		if(s->rport != pdest || s->lport != psource)
+			continue;
+		if(tcb->state == Closed)
+			continue;
+		if(ipcmp(s->raddr, dest) != 0 || ipcmp(s->laddr, source) != 0)
+			continue;
+
+		/* found it: swap the protocol lock for the conversation's */
+		qlock(s);
+		qunlock(tcp);
+		if(tcb->state == Syn_sent)
+			localclose(s, msg);
+		qunlock(s);
+		freeblist(bp);
+		return;
+	}
+	qunlock(tcp);
+	freeblist(bp);
+}
+
+/*
+ *  set the global tcpporthogdefense flag from val, which must
+ *  be "on" or "off".  returns nil on success or an error string
+ *  (leaving the flag alone) for any other value.
+ */
+static char*
+tcpporthogdefensectl(char *val)
+{
+	int on;
+
+	if(strcmp(val, "on") == 0)
+		on = 1;
+	else if(strcmp(val, "off") == 0)
+		on = 0;
+	else
+		return "unknown value for tcpporthogdefense";
+
+	tcpporthogdefense = on;
+	return nil;
+}
+
+/*
+ *  dispatch a ctl request for conversation c.
+ *  f[0] is the request name and f[1..n-1] its arguments.
+ *  requests that need an argument ("checksum" and
+ *  "tcpporthogdefense") are rejected when none was supplied,
+ *  rather than handing a missing f[1] to atoi or strcmp.
+ *  returns nil on success or an error string.
+ *
+ *  called with c qlocked
+ */
+static char*
+tcpctl(Conv* c, char** f, int n)
+{
+	if(n < 1)
+		return "unknown control request";
+
+	if(n == 1 && strcmp(f[0], "hangup") == 0)
+		return tcphangup(c);
+	if(strcmp(f[0], "keepalive") == 0)
+		return tcpstartka(c, f, n);	/* the argument is optional */
+	if(strcmp(f[0], "checksum") == 0){
+		if(n < 2)
+			return "checksum: missing argument";
+		return tcpsetchecksum(c, f, n);
+	}
+	if(strcmp(f[0], "tcpporthogdefense") == 0){
+		if(n < 2)
+			return "tcpporthogdefense: missing argument";
+		return tcpporthogdefensectl(f[1]);
+	}
+	return "unknown control request";
+}
+
+/*
+ *  format the Nstats tcp counters, one "name: value" line each,
+ *  into buf (at most len bytes) and return the length written.
+ */
+static int
+tcpstats(Proto *tcp, char *buf, int len)
+{
+	Tcppriv *priv;
+	char *cur, *end;
+	int i;
+
+	priv = tcp->priv;
+	end = buf+len;
+	cur = buf;
+	i = 0;
+	while(i < Nstats){
+		cur = seprint(cur, end, "%s: %lud\n", statnames[i], priv->stats[i]);
+		i++;
+	}
+	return cur - buf;
+}
+
+/*
+ *  garbage collect any stale conversations:
+ *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
+ *	- Finwait2 after 5 minutes
+ *
+ *  this is called whenever we run out of channels.  Both checks are
+ *  of questionable validity so we try to use them only when we're
+ *  up against the wall.
+ *
+ *  conversations that cannot be locked right away are skipped.
+ *  returns the number of conversations closed.
+ */
+static int
+tcpgc(Proto *tcp)
+{
+	Conv *c, **pp, **end;
+	Tcpctl *tcb;
+	int nclosed, stale;
+
+	nclosed = 0;
+	end = &tcp->conv[tcp->nc];
+	for(pp = tcp->conv; pp < end; pp++){
+		c = *pp;
+		if(c == nil)
+			break;
+		if(!canqlock(c))
+			continue;
+		tcb = (Tcpctl*)c->ptcl;
+		stale = 0;
+		if(tcb->state == Syn_received)
+			stale = NOW - tcb->time > 5000;
+		else if(tcb->state == Finwait2)
+			stale = NOW - tcb->time > 5*60*1000;
+		if(stale){
+			localclose(c, Etimedout);
+			nclosed++;
+		}
+		qunlock(c);
+	}
+	return nclosed;
+}
+
+/*
+ *  recompute the retransmit timer period (in ticks) from the
+ *  current backoff, mean deviation and smoothed round trip time.
+ */
+static void
+tcpsettimer(Tcpctl *tcb)
+{
+	int ticks;
+
+	/* round trip dependency */
+	ticks = backoff(tcb->backoff) *
+		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
+
+	/* bounded twixt 1/2 and 64 seconds */
+	if(ticks > (64000/MSPTICK))
+		ticks = 64000/MSPTICK;
+	else if(ticks < 500/MSPTICK)
+		ticks = 500/MSPTICK;
+	tcb->timer.start = ticks;
+}
+
+/*
+ *  allocate the tcp Proto and its private state, fill in the
+ *  protocol's entry points and register it with fs.
+ */
+void
+tcpinit(Fs *fs)
+{
+	Proto *tcp;
+	Tcppriv *tpriv;
+
+	tcp = smalloc(sizeof(Proto));
+	tpriv = smalloc(sizeof(Tcppriv));
+	tcp->priv = tpriv;
+
+	/* identity and sizing */
+	tcp->name = "tcp";
+	tcp->ipproto = IP_TCPPROTO;
+	tcp->nc = scalednconv();
+	tcp->ptclsize = sizeof(Tcpctl);
+	tpriv->stats[MaxConn] = tcp->nc;
+
+	/* entry points */
+	tcp->create = tcpcreate;
+	tcp->connect = tcpconnect;
+	tcp->announce = tcpannounce;
+	tcp->close = tcpclose;
+	tcp->ctl = tcpctl;
+	tcp->state = tcpstate;
+	tcp->stats = tcpstats;
+	tcp->rcv = tcpiput;
+	tcp->advise = tcpadvise;
+	tcp->inuse = tcpinuse;
+	tcp->gc = tcpgc;
+
+	Fsproto(fs, tcp);
+}
+
+/*
+ *  record the window scale factors for a connection and size
+ *  the window, ssthresh and the conversation's queues to match.
+ *  a zero rcvscale means no scaling at all: both factors are
+ *  then set to 0, whatever sndscale says.
+ */
+static void
+tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
+{
+	if(rcvscale != 0){
+		tcb->rcv.scale = rcvscale & 0xff;
+		tcb->snd.scale = sndscale & 0xff;
+	} else {
+		tcb->rcv.scale = 0;
+		tcb->snd.scale = 0;
+	}
+
+	/* with a zero scale this is just QMAX */
+	tcb->window = QMAX<<tcb->snd.scale;
+	tcb->ssthresh = tcb->window;
+	qsetlimit(s->rq, tcb->window);
+	qsetlimit(s->wq, tcpwqsize(tcb->window));
+}
+

+ 660 - 0
sys/src/9/ip/udp.c

@@ -0,0 +1,660 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+#include	"ipv6.h"
+
+
+#define DPRINT if(0)print
+
+/*
+ *  sizes and offsets used when building and checking udp packets.
+ */
+enum
+{
+	UDP_UDPHDR_SZ	= 8,	/* ports, length and checksum */
+
+	UDP4_PHDR_OFF = 8,	/* offset of Unused in Udp4hdr, where the v4 pseudo header starts */
+	UDP4_PHDR_SZ = 12,	/* Unused, udpproto, udpplen, udpsrc, udpdst */
+	UDP4_IPHDR_SZ = 20,	/* ip part of Udp4hdr */
+	UDP6_IPHDR_SZ = 40,	/* ip part of Udp6hdr */
+	UDP6_PHDR_SZ = 40,	/* the v6 pseudo header is built over the whole ip header */
+	UDP6_PHDR_OFF = 0,
+
+	IP_UDPPROTO	= 17,
+	UDP_USEAD7	= 52,	/* "headers": raddr, laddr, ifc addr, rport, lport */
+	UDP_USEAD6	= 36,	/* "oldheaders": raddr, laddr, rport, lport */
+
+	/* NOTE(review): the next three are not referenced in the code visible here */
+	Udprxms		= 200,
+	Udptickms	= 100,
+	Udpmaxxmit	= 10,
+};
+
+/*
+ *  ipv4 header followed by the udp header, as laid out in a packet.
+ *  while a checksum is computed the fields from Unused through
+ *  udpdst are used as the udp pseudo header: Unused is zeroed and
+ *  udpplen holds the udp length.  on input udpiput saves and
+ *  restores the values found in Unused and udpplen around the check.
+ */
+typedef struct Udp4hdr Udp4hdr;
+struct Udp4hdr
+{
+	/* ip header */
+	uchar	vihl;		/* Version and header length */
+	uchar	tos;		/* Type of service */
+	uchar	length[2];	/* packet length */
+	uchar	id[2];		/* Identification */
+	uchar	frag[2];	/* Fragment information */
+	uchar	Unused;
+	uchar	udpproto;	/* Protocol */
+	uchar	udpplen[2];	/* Header plus data length */
+	uchar	udpsrc[IPv4addrlen];	/* Ip source */
+	uchar	udpdst[IPv4addrlen];	/* Ip destination */
+
+	/* udp header */
+	uchar	udpsport[2];	/* Source port */
+	uchar	udpdport[2];	/* Destination port */
+	uchar	udplen[2];	/* data length */
+	uchar	udpcksum[2];	/* Checksum */
+};
+
+/*
+ *  ipv6 header followed by the udp header, as laid out in a packet.
+ *  while a checksum is computed the first 8 bytes are reused for
+ *  the pseudo header: viclfl holds the udp length and hoplimit
+ *  the protocol number; they are put back afterwards.
+ */
+typedef struct Udp6hdr Udp6hdr;
+struct Udp6hdr {
+	uchar viclfl[4];	/* first header word; viclfl[0] is set to IP_VER6 on output */
+	uchar len[2];		/* payload length */
+	uchar nextheader;	/* set to IP_UDPPROTO on output */
+	uchar hoplimit;
+	uchar udpsrc[IPaddrlen];	/* Ip source */
+	uchar udpdst[IPaddrlen];	/* Ip destination */
+
+	/* udp header */
+	uchar	udpsport[2];	/* Source port */
+	uchar	udpdport[2];	/* Destination port */
+	uchar	udplen[2];	/* data length */
+	uchar	udpcksum[2];	/* Checksum */
+};
+
+/* MIB II counters */
+typedef struct Udpstats Udpstats;
+struct Udpstats
+{
+	ulong	udpInDatagrams;		/* every packet handed to udpiput */
+	ulong	udpNoPorts;		/* input with no matching conversation */
+	ulong	udpInErrors;		/* input dropped for a bad checksum */
+	ulong	udpOutDatagrams;	/* packets sent by udpkick */
+};
+
+/*
+ *  per protocol instance private state, hung off Proto.priv
+ */
+typedef struct Udppriv Udppriv;
+struct Udppriv
+{
+	Ipht		ht;	/* table of conversations, searched by iphtlook on input */
+
+	/* MIB counters */
+	Udpstats	ustats;
+
+	/* non-MIB stats */
+	ulong		csumerr;		/* checksum errors; NOTE(review): not updated in the code visible here */
+	ulong		lenerr;			/* short packet */
+};
+
+void (*etherprofiler)(char *name, int qlen);
+void udpkick(void *x, Block *bp);
+
+/*
+ *  protocol specific part of Conv
+ */
+typedef struct Udpcb Udpcb;
+struct Udpcb
+{
+	QLock;
+	uchar	headers;	/* 0: none; 7: "headers" format; 6: "oldheaders" format (obsolete) */
+};
+
+/*
+ *  connect request: do the standard connect processing, report
+ *  its outcome with Fsconnected and, if it worked, enter the
+ *  conversation in the protocol's hash table.
+ *  returns nil on success or the error string.
+ */
+static char*
+udpconnect(Conv *c, char **argv, int argc)
+{
+	char *err;
+	Udppriv *upriv;
+
+	err = Fsstdconnect(c, argv, argc);
+	Fsconnected(c, err);
+	if(err == nil){
+		upriv = c->p->priv;
+		iphtadd(&upriv->ht, c);
+	}
+	return err;
+}
+
+
+/*
+ *  format the conversation's status line into state (n bytes):
+ *  Open or Closed, followed by the lengths of the read and
+ *  write queues (0 for a queue that does not exist).
+ */
+static int
+udpstate(Conv *c, char *state, int n)
+{
+	char *what;
+	int nin, nout;
+
+	what = "Closed";
+	if(c->inuse)
+		what = "Open";
+	nin = 0;
+	if(c->rq)
+		nin = qlen(c->rq);
+	nout = 0;
+	if(c->wq)
+		nout = qlen(c->wq);
+
+	return snprint(state, n, "%s qin %d qout %d\n", what, nin, nout);
+}
+
+/*
+ *  announce request: do the standard announce processing and,
+ *  if it worked, mark the conversation connected and enter it
+ *  in the protocol's hash table.
+ *  returns nil on success or the error string.
+ */
+static char*
+udpannounce(Conv *c, char** argv, int argc)
+{
+	char *err;
+	Udppriv *upriv;
+
+	err = Fsstdannounce(c, argv, argc);
+	if(err != nil)
+		return err;
+
+	Fsconnected(c, nil);
+	upriv = c->p->priv;
+	iphtadd(&upriv->ht, c);
+	return nil;
+}
+
+/*
+ *  create the queues of a new conversation: a 128KB read queue
+ *  opened with Qmsg, and a write queue made by qbypass so that
+ *  written blocks are presumably handed straight to udpkick
+ *  with c as its argument (confirm against qbypass).
+ */
+static void
+udpcreate(Conv *c)
+{
+	c->rq = qopen(128*1024, Qmsg, 0, 0);
+	c->wq = qbypass(udpkick, c);
+}
+
+/*
+ *  close a conversation: take it out of the hash table, close
+ *  its three queues and reset its addresses, ports and header
+ *  mode.
+ */
+static void
+udpclose(Conv *c)
+{
+	Udppriv *upriv;
+
+	upriv = c->p->priv;
+	iphtrem(&upriv->ht, c);
+
+	c->state = 0;
+	qclose(c->rq);
+	qclose(c->wq);
+	qclose(c->eq);
+
+	/* forget both end points */
+	ipmove(c->laddr, IPnoaddr);
+	ipmove(c->raddr, IPnoaddr);
+	c->lport = 0;
+	c->rport = 0;
+
+	((Udpcb*)c->ptcl)->headers = 0;
+}
+
+/*
+ *  output routine, installed on the write queue by udpcreate:
+ *  x is the Conv and bp the data to send as one datagram.
+ *
+ *  in headers mode (6 or 7) the addresses and remote port are
+ *  taken from the front of bp and stripped; otherwise those of
+ *  the conversation are used.  an ip+udp header is then put in
+ *  front of the data, the checksum is computed over the pseudo
+ *  header and the packet is handed to ipoput4 or ipoput6.
+ */
+void
+udpkick(void *x, Block *bp)
+{
+	Conv *c = x;
+	Udp4hdr *uh4;
+	Udp6hdr *uh6;
+	ushort rport;
+	uchar laddr[IPaddrlen], raddr[IPaddrlen];
+	Udpcb *ucb;
+	int dlen, ptcllen;
+	Udppriv *upriv;
+	Fs *f;
+	int version;
+	Conv *rc;
+
+	upriv = c->p->priv;
+	f = c->p->f;
+
+	netlog(c->p->f, Logudp, "udp: kick\n");
+	if(bp == nil)
+		return;
+
+	ucb = (Udpcb*)c->ptcl;
+	switch(ucb->headers) {
+	case 7:
+		/* get user specified addresses */
+		bp = pullupblock(bp, UDP_USEAD7);
+		if(bp == nil)
+			return;
+		ipmove(raddr, bp->rp);
+		bp->rp += IPaddrlen;
+		ipmove(laddr, bp->rp);
+		bp->rp += IPaddrlen;
+		/* pick interface closest to dest */
+		if(ipforme(f, laddr) != Runi)
+			findlocalip(f, laddr, raddr);
+		bp->rp += IPaddrlen;		/* Ignore ifc address */
+		rport = nhgets(bp->rp);
+		bp->rp += 2+2;			/* Ignore local port */
+		break;
+	case 6:					/* OBS */
+		/* get user specified addresses */
+		bp = pullupblock(bp, UDP_USEAD6);
+		if(bp == nil)
+			return;
+		ipmove(raddr, bp->rp);
+		bp->rp += IPaddrlen;
+		ipmove(laddr, bp->rp);
+		bp->rp += IPaddrlen;
+		/* pick interface closest to dest */
+		if(ipforme(f, laddr) != Runi)
+			findlocalip(f, laddr, raddr);
+		rport = nhgets(bp->rp);
+		bp->rp += 2+2;			/* Ignore local port */
+		break;
+	default:
+		rport = 0;
+		break;
+	}
+
+	/* choose the ip version from the local (and remote) address in use */
+	if(ucb->headers) {
+		if(memcmp(laddr, v4prefix, IPv4off) == 0
+		|| ipcmp(laddr, IPnoaddr) == 0)
+			version = 4;
+		else
+			version = 6;
+	} else {
+		if( (memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
+			memcmp(c->laddr, v4prefix, IPv4off) == 0)
+			|| ipcmp(c->raddr, IPnoaddr) == 0)
+			version = 4;
+		else
+			version = 6;
+	}
+
+	dlen = blocklen(bp);
+
+	/* fill in pseudo header and compute checksum */
+	switch(version){
+	case V4:
+		bp = padblock(bp, UDP4_IPHDR_SZ+UDP_UDPHDR_SZ);
+		if(bp == nil)
+			return;
+
+		uh4 = (Udp4hdr *)(bp->rp);
+		ptcllen = dlen + UDP_UDPHDR_SZ;
+		uh4->Unused = 0;
+		uh4->udpproto = IP_UDPPROTO;
+		uh4->frag[0] = 0;
+		uh4->frag[1] = 0;
+		hnputs(uh4->udpplen, ptcllen);
+		if(ucb->headers) {
+			v6tov4(uh4->udpdst, raddr);
+			hnputs(uh4->udpdport, rport);
+			v6tov4(uh4->udpsrc, laddr);
+			rc = nil;
+		} else {
+			v6tov4(uh4->udpdst, c->raddr);
+			hnputs(uh4->udpdport, c->rport);
+			if(ipcmp(c->laddr, IPnoaddr) == 0)
+				findlocalip(f, c->laddr, c->raddr);
+			v6tov4(uh4->udpsrc, c->laddr);
+			rc = c;
+		}
+		hnputs(uh4->udpsport, c->lport);
+		hnputs(uh4->udplen, ptcllen);
+		/* the checksum field must be zero while the sum is taken */
+		uh4->udpcksum[0] = 0;
+		uh4->udpcksum[1] = 0;
+		hnputs(uh4->udpcksum,
+		       ptclcsum(bp, UDP4_PHDR_OFF, dlen+UDP_UDPHDR_SZ+UDP4_PHDR_SZ));
+		uh4->vihl = IP_VER4;
+		ipoput4(f, bp, 0, c->ttl, c->tos, rc);
+		break;
+
+	case V6:
+		bp = padblock(bp, UDP6_IPHDR_SZ+UDP_UDPHDR_SZ);
+		if(bp == nil)
+			return;
+
+		/*
+		 * using the v6 ip header to create pseudo header
+		 * first then reset it to the normal ip header
+		 */
+		uh6 = (Udp6hdr *)(bp->rp);
+		memset(uh6, 0, 8);
+		ptcllen = dlen + UDP_UDPHDR_SZ;
+		hnputl(uh6->viclfl, ptcllen);
+		uh6->hoplimit = IP_UDPPROTO;
+		if(ucb->headers) {
+			ipmove(uh6->udpdst, raddr);
+			hnputs(uh6->udpdport, rport);
+			ipmove(uh6->udpsrc, laddr);
+			rc = nil;
+		} else {
+			ipmove(uh6->udpdst, c->raddr);
+			hnputs(uh6->udpdport, c->rport);
+			if(ipcmp(c->laddr, IPnoaddr) == 0)
+				findlocalip(f, c->laddr, c->raddr);
+			ipmove(uh6->udpsrc, c->laddr);
+			rc = c;
+		}
+		hnputs(uh6->udpsport, c->lport);
+		hnputs(uh6->udplen, ptcllen);
+		uh6->udpcksum[0] = 0;
+		uh6->udpcksum[1] = 0;
+		hnputs(uh6->udpcksum,
+		       ptclcsum(bp, UDP6_PHDR_OFF, dlen+UDP_UDPHDR_SZ+UDP6_PHDR_SZ));
+		/* pseudo header no longer needed: rebuild the real first 8 bytes */
+		memset(uh6, 0, 8);
+		uh6->viclfl[0] = IP_VER6;
+		hnputs(uh6->len, ptcllen);
+		uh6->nextheader = IP_UDPPROTO;
+		ipoput6(f, bp, 0, c->ttl, c->tos, rc);
+		break;
+
+	default:
+		panic("udpkick: version %d", version);
+	}
+	upriv->ustats.udpOutDatagrams++;
+}
+
+/*
+ *  input routine: bp is a udp packet, ip header included, that
+ *  arrived on interface ifc.
+ *
+ *  the checksum is verified, the conversation is looked up by
+ *  addresses and ports (a new one is made for an announced
+ *  conversation that is not in headers mode), the packet is
+ *  trimmed down to its data, the address header is prepended
+ *  when in headers mode, and the result is passed up the
+ *  conversation's read queue.  bp is consumed in every case.
+ */
+void
+udpiput(Proto *udp, Ipifc *ifc, Block *bp)
+{
+	int len;
+	Udp4hdr *uh4;
+	Udp6hdr *uh6;
+	Conv *c;
+	Udpcb *ucb;
+	uchar raddr[IPaddrlen], laddr[IPaddrlen];
+	ushort rport, lport;
+	Udppriv *upriv;
+	Fs *f;
+	int version;
+	int ottl, oviclfl, olen;
+	uchar *p;
+
+	upriv = udp->priv;
+	f = udp->f;
+	upriv->ustats.udpInDatagrams++;
+
+	uh4 = (Udp4hdr*)(bp->rp);
+	version = ((uh4->vihl&0xF0)==IP_VER6) ? 6 : 4;
+
+	/* Put back pseudo header for checksum
+	 * (remember old values for icmpnoconv()) */
+	switch(version) {
+	case V4:
+		ottl = uh4->Unused;
+		uh4->Unused = 0;
+		len = nhgets(uh4->udplen);
+		olen = nhgets(uh4->udpplen);
+		hnputs(uh4->udpplen, len);
+
+		v4tov6(raddr, uh4->udpsrc);
+		v4tov6(laddr, uh4->udpdst);
+		lport = nhgets(uh4->udpdport);
+		rport = nhgets(uh4->udpsport);
+
+		/* a zero checksum field skips the check on v4 */
+		if(nhgets(uh4->udpcksum)) {
+			if(ptclcsum(bp, UDP4_PHDR_OFF, len+UDP4_PHDR_SZ)) {
+				upriv->ustats.udpInErrors++;
+				netlog(f, Logudp, "udp: checksum error %I\n", raddr);
+				DPRINT("udp: checksum error %I\n", raddr);
+				freeblist(bp);
+				return;
+			}
+		}
+		uh4->Unused = ottl;
+		hnputs(uh4->udpplen, olen);
+		break;
+	case V6:
+		uh6 = (Udp6hdr*)(bp->rp);
+		len = nhgets(uh6->udplen);
+		oviclfl = nhgetl(uh6->viclfl);
+		olen = nhgets(uh6->len);
+		ottl = uh6->hoplimit;
+		ipmove(raddr, uh6->udpsrc);
+		ipmove(laddr, uh6->udpdst);
+		lport = nhgets(uh6->udpdport);
+		rport = nhgets(uh6->udpsport);
+		memset(uh6, 0, 8);
+		hnputl(uh6->viclfl, len);
+		uh6->hoplimit = IP_UDPPROTO;
+		/* unlike v4, the checksum is always verified */
+		if(ptclcsum(bp, UDP6_PHDR_OFF, len+UDP6_PHDR_SZ)) {
+			upriv->ustats.udpInErrors++;
+			netlog(f, Logudp, "udp: checksum error %I\n", raddr);
+			DPRINT("udp: checksum error %I\n", raddr);
+			freeblist(bp);
+			return;
+		}
+		/* restore the real ip header */
+		hnputl(uh6->viclfl, oviclfl);
+		hnputs(uh6->len, olen);
+		uh6->nextheader = IP_UDPPROTO;
+		uh6->hoplimit = ottl;
+		break;
+	default:
+		panic("udpiput: version %d", version);
+		return;	/* to avoid a warning */
+	}
+
+	qlock(udp);
+
+	c = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
+	if(c == nil){
+		/* no conversation found */
+		upriv->ustats.udpNoPorts++;
+		qunlock(udp);
+		netlog(f, Logudp, "udp: no conv %I!%d -> %I!%d\n", raddr, rport,
+		       laddr, lport);
+
+		switch(version){
+		case V4:
+			icmpnoconv(f, bp);
+			break;
+		case V6:
+			icmphostunr(f, ifc, bp, icmp6_port_unreach, 0);
+			break;
+		default:
+			panic("udpiput2: version %d", version);
+		}
+
+		freeblist(bp);
+		return;
+	}
+	ucb = (Udpcb*)c->ptcl;
+
+	if(c->state == Announced){
+		if(ucb->headers == 0){
+			/* create a new conversation */
+			if(ipforme(f, laddr) != Runi) {
+				/* not addressed to one of our unicast addresses: use the interface's */
+				switch(version){
+				case V4:
+					v4tov6(laddr, ifc->lifc->local);
+					break;
+				case V6:
+					ipmove(laddr, ifc->lifc->local);
+					break;
+				default:
+					panic("udpiput3: version %d", version);
+				}
+			}
+			c = Fsnewcall(c, raddr, rport, laddr, lport, version);
+			if(c == nil){
+				qunlock(udp);
+				freeblist(bp);
+				return;
+			}
+			iphtadd(&upriv->ht, c);
+			ucb = (Udpcb*)c->ptcl;
+		}
+	}
+
+	/* take the conversation's lock before letting go of the protocol's */
+	qlock(c);
+	qunlock(udp);
+
+	/*
+	 * Trim the packet down to data size
+	 */
+	len -= UDP_UDPHDR_SZ;
+	switch(version){
+	case V4:
+		bp = trimblock(bp, UDP4_IPHDR_SZ+UDP_UDPHDR_SZ, len);
+		break;
+	case V6:
+		bp = trimblock(bp, UDP6_IPHDR_SZ+UDP_UDPHDR_SZ, len);
+		break;
+	default:
+		bp = nil;
+		panic("udpiput4: version %d", version);
+	}
+	if(bp == nil){
+		qunlock(c);
+		netlog(f, Logudp, "udp: len err %I.%d -> %I.%d\n", raddr, rport,
+		       laddr, lport);
+		upriv->lenerr++;
+		return;
+	}
+
+	netlog(f, Logudpmsg, "udp: %I.%d -> %I.%d l %d\n", raddr, rport,
+	       laddr, lport, len);
+
+	switch(ucb->headers){
+	case 7:
+		/* pass the src address */
+		bp = padblock(bp, UDP_USEAD7);
+		p = bp->rp;
+		ipmove(p, raddr); p += IPaddrlen;
+		ipmove(p, laddr); p += IPaddrlen;
+		ipmove(p, ifc->lifc->local); p += IPaddrlen;
+		hnputs(p, rport); p += 2;
+		hnputs(p, lport);
+		break;
+	case 6:					/* OBS */
+		/* pass the src address */
+		bp = padblock(bp, UDP_USEAD6);
+		p = bp->rp;
+		ipmove(p, raddr); p += IPaddrlen;
+		ipmove(p, ipforme(f, laddr)==Runi ? laddr : ifc->lifc->local); p += IPaddrlen;
+		hnputs(p, rport); p += 2;
+		hnputs(p, lport);
+		break;
+	}
+
+	if(bp->next)
+		bp = concatblock(bp);
+
+	/* drop the datagram rather than overfill the reader's queue */
+	if(qfull(c->rq)){
+		qunlock(c);
+		netlog(f, Logudp, "udp: qfull %I.%d -> %I.%d\n", raddr, rport,
+		       laddr, lport);
+		freeblist(bp);
+		return;
+	}
+
+	qpass(c->rq, bp);
+	qunlock(c);
+
+}
+
+/*
+ *  ctl requests.  both take no argument:
+ *	headers		prefix data with the 52 byte address header
+ *	oldheaders	the obsolete 36 byte form; also prints a
+ *			complaint naming the offending program
+ *  returns nil on success or an error string.
+ */
+char*
+udpctl(Conv *c, char **f, int n)
+{
+	Udpcb *ucb;
+
+	if(n != 1)
+		return "unknown control request";
+
+	ucb = (Udpcb*)c->ptcl;
+	if(strcmp(f[0], "oldheaders") == 0){	/* OBS */
+		ucb->headers = 6;
+		if (up)
+			print("program %s wrote `oldheaders' to udp "
+				"ctl file; fix or recompile it\n",
+				up->text);
+		return nil;
+	}
+	if(strcmp(f[0], "headers") == 0){
+		ucb->headers = 7;	/* new headers format */
+		return nil;
+	}
+	return "unknown control request";
+}
+
+/*
+ *  handle advice (msg) about a packet we sent; bp holds that
+ *  packet's ip and udp headers, so its destination is our remote
+ *  side and its source our local side.
+ *  the first conversation matching all four values has its read
+ *  and write queues hung up with msg, unless it asked to ignore
+ *  advice.  bp is always freed.
+ */
+void
+udpadvise(Proto *udp, Block *bp, char *msg)
+{
+	Udp4hdr *h4;
+	Udp6hdr *h6;
+	uchar source[IPaddrlen], dest[IPaddrlen];
+	ushort psource, pdest;
+	Conv *s, **p;
+	int version;
+
+	h4 = (Udp4hdr*)(bp->rp);
+	version = ((h4->vihl&0xF0)==IP_VER6) ? 6 : 4;
+
+	switch(version) {
+	case V6:
+		h6 = (Udp6hdr*)(bp->rp);
+		ipmove(dest, h6->udpdst);
+		ipmove(source, h6->udpsrc);
+		psource = nhgets(h6->udpsport);
+		pdest = nhgets(h6->udpdport);
+		break;
+	case V4:
+		v4tov6(dest, h4->udpdst);
+		v4tov6(source, h4->udpsrc);
+		psource = nhgets(h4->udpsport);
+		pdest = nhgets(h4->udpdport);
+		break;
+	default:
+		panic("udpadvise: version %d", version);
+		return;  /* to avoid a warning */
+	}
+
+	/* Look for a connection */
+	qlock(udp);
+	for(p = udp->conv; (s = *p) != nil; p++) {
+		if(s->rport != pdest || s->lport != psource)
+			continue;
+		if(ipcmp(s->raddr, dest) != 0 || ipcmp(s->laddr, source) != 0)
+			continue;
+
+		/* this is the one; stop looking even if it won't listen */
+		if(s->ignoreadvice)
+			break;
+		qlock(s);
+		qunlock(udp);
+		qhangup(s->rq, msg);
+		qhangup(s->wq, msg);
+		qunlock(s);
+		freeblist(bp);
+		return;
+	}
+	qunlock(udp);
+	freeblist(bp);
+}
+
+/*
+ *  format the four MIB counters into buf (at most len bytes)
+ *  and return what snprint returns.
+ */
+int
+udpstats(Proto *udp, char *buf, int len)
+{
+	Udppriv *upriv;
+	Udpstats *st;
+
+	upriv = udp->priv;
+	st = &upriv->ustats;
+	return snprint(buf, len, "InDatagrams: %lud\nNoPorts: %lud\nInErrors: %lud\nOutDatagrams: %lud\n",
+		st->udpInDatagrams,
+		st->udpNoPorts,
+		st->udpInErrors,
+		st->udpOutDatagrams);
+}
+
+/*
+ *  allocate the udp Proto and its private state, fill in the
+ *  protocol's entry points and register it with fs.
+ */
+void
+udpinit(Fs *fs)
+{
+	Proto *udp;
+
+	udp = smalloc(sizeof(Proto));
+	udp->priv = smalloc(sizeof(Udppriv));
+
+	/* identity and sizing */
+	udp->name = "udp";
+	udp->ipproto = IP_UDPPROTO;
+	udp->nc = Nchans;
+	udp->ptclsize = sizeof(Udpcb);
+
+	/* entry points */
+	udp->create = udpcreate;
+	udp->connect = udpconnect;
+	udp->announce = udpannounce;
+	udp->close = udpclose;
+	udp->ctl = udpctl;
+	udp->state = udpstate;
+	udp->stats = udpstats;
+	udp->rcv = udpiput;
+	udp->advise = udpadvise;
+
+	Fsproto(fs, udp);
+}

+ 2 - 0
sys/src/9/k10/Linux

@@ -0,0 +1,2 @@
+Linux support was removed from this kernel.
+It may be found in /n/nixdump/2011/1114/sys/src/nix

+ 332 - 0
sys/src/9/k10/acore.c

@@ -0,0 +1,332 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include <tos.h>
+#include <pool.h>
+#include "amd64.h"
+#include "ureg.h"
+#include "io.h"
+#include "../port/pmc.h"
+
+/*
+ * NIX code run at the AC.
+ * This is the "AC kernel".
+ */
+
+/*
+ * FPU:
+ *
+ * The TC handles the FPU by keeping track of the state for the
+ * current process. If it has been used and must be saved, it is saved, etc.
+ * When a process gets to the AC, we handle the FPU directly, and save its
+ * state before going back to the TC (or the TC state would be stale).
+ *
+ * Because of this, each time the process comes back to the AC and
+ * uses the FPU it will get a device not available trap and
+ * the state will be restored. This could be optimized because the AC
+ * is single-process, and we do not have to disable the FPU while
+ * saving, so it does not have to be restored.
+ */
+
+extern char* acfpunm(Ureg* ureg, void*);
+extern char* acfpumf(Ureg* ureg, void*);
+extern char* acfpuxf(Ureg* ureg, void*);
+extern void acfpusysprocsetup(Proc*);
+
+extern void _acsysret(void);
+extern void _actrapret(void);
+
+ACVctl *acvctl[256];
+
+/*
+ * Test inter core calls by calling a core to print something, and then
+ * waiting for it to complete.
+ * testiccfn is the function the called core runs: it prints the
+ * string left by the caller in the core's icc data area.
+ */
+static void
+testiccfn(void)
+{
+	print("called: %s\n", (char*)m->icc->data);
+}
+
+/*
+ * Ask core i to run testiccfn and wait for it.
+ * Nothing is done if the core does not exist or is not online;
+ * a core that is not an AC is reported and left alone.
+ */
+void
+testicc(int i)
+{
+	Mach *mp;
+
+	mp = sys->machptr[i];
+	if(mp == nil || mp->online == 0)
+		return;
+	if(mp->nixtype != NIXAC){
+		print("testicc: core %d is not an AC\n", i);
+		return;
+	}
+
+	print("calling core %d... ", i);
+	mp->icc->flushtlb = 0;
+	snprint((char*)mp->icc->data, ICCLNSZ, "<%d>", i);
+	/* fence, so the argument is written before the function is posted */
+	mfence();
+	mp->icc->fn = testiccfn;
+	mwait(&mp->icc->fn);
+}
+
+/*
+ * Check if the AC kernel (mach) stack has more than 4*KiB free.
+ * Do not call panic, the stack is gigantic.
+ * On overflow a message is printed and DONE() is called.
+ */
+static void
+acstackok(void)
+{
+	char here;	/* its address stands in for the stack pointer */
+	char *bottom;
+
+	bottom = (char *)m - PGSZ - 4*PTSZ - MACHSTKSZ;
+	if(bottom + 4*KiB > &here){
+		print("ac kernel stack overflow, cpu%d stopped\n", m->machno);
+		DONE();
+	}
+}
+
+/*
+ * Main scheduling loop done by the application core.
+ * Some of the functions run will not return.
+ * The system call handler will reset the stack and
+ * call acsched again.
+ * We loop because some functions may return and we should
+ * wait for another call.
+ */
+void
+acsched(void)
+{
+	acmmuswitch();
+	for(;;){
+		acstackok();
+		/* presumably blocks until another core posts a function in icc->fn */
+		mwait(&m->icc->fn);
+		if(m->icc->flushtlb)
+			acmmuswitch();
+		DBG("acsched: cpu%d: fn %#p\n", m->machno, m->icc->fn);
+		m->icc->fn();
+		DBG("acsched: cpu%d: idle\n", m->machno);
+		/* fence before clearing fn, which marks the call as complete */
+		mfence();
+		m->icc->fn = nil;
+	}
+}
+
+/*
+ * Load cr3 with the physical address of this core's pml4.
+ */
+void
+acmmuswitch(void)
+{
+	extern Page mach0pml4;	/* only referenced by the DBG below */
+
+	DBG("acmmuswitch mpl4 %#p mach0pml4 %#p m0pml4 %#p\n", m->pml4->pa, mach0pml4.pa, sys->machptr[0]->pml4->pa);
+
+
+	cr3put(m->pml4->pa);
+}
+
+/*
+ * Set up the FPU for m->proc and enter user mode through
+ * xactouser, using the stack pointer saved in the process's
+ * dbgreg.  Reaching the panic means xactouser came back.
+ *
+ * Beware: up is not set when this function is called.
+ */
+void
+actouser(void)
+{
+	void xactouser(u64int);
+	Ureg *ureg;
+
+	acfpusysprocsetup(m->proc);
+
+	ureg = m->proc->dbgreg;
+	DBG("cpu%d: touser usp = %#p entry %#p\n", m->machno, ureg->sp, ureg->ip);
+	xactouser(ureg->sp);
+	panic("actouser");
+}
+
+/*
+ * Deliberately empty: actrap() compares the function posted in
+ * icc->fn against the address of this one to decide whether to
+ * return from the trap, and does the work itself.
+ */
+void
+actrapret(void)
+{
+	/* done by actrap() */
+}
+
+/*
+ * Entered in AP core context, upon traps (system calls go through acsyscall)
+ * using up->dbgreg means cores MUST be homogeneous.
+ *
+ * BUG: We should setup some trapenable() mechanism for the AC,
+ * so that code like fpu.c could arrange for handlers specific for
+ * the AC, instead of doing that by hand here.
+ *
+ * All interrupts are masked while in the "kernel"
+ *
+ * A handler registered in acvctl[] for the trap type is tried
+ * first; if it returns nil the trap is done with.  Anything else
+ * is reported through the icc (rc = ICCTRAP, with the note, if
+ * any), the process is made ready and the core waits for the
+ * next function: actrapret resumes the trapped code with the
+ * registers found in dbgreg, anything else goes to acsched().
+ */
+void
+actrap(Ureg *u)
+{
+	char *n;
+	ACVctl *v;
+
+	n = nil;
+
+	_pmcupdate(m);
+	if(m->proc != nil){
+		m->proc->nactrap++;
+		m->proc->actime1 = fastticks(nil);
+	}
+	if(u->type < nelem(acvctl)){
+		v = acvctl[u->type];
+		if(v != nil){
+			DBG("actrap: cpu%d: %ulld\n", m->machno, u->type);
+			n = v->f(u, v->a);
+			if(n != nil)
+				goto Post;
+			return;
+		}
+	}
+	switch(u->type){
+	case IdtDF:
+		print("AC: double fault\n");
+		dumpregs(u);
+		ndnr();
+		/* NOTE(review): no break; presumably ndnr() does not return - confirm */
+	case IdtIPI:
+		m->intr++;
+		DBG("actrap: cpu%d: IPI\n", m->machno);
+		apiceoi(IdtIPI);
+		break;
+	case IdtTIMER:
+		apiceoi(IdtTIMER);
+		panic("timer interrupt in an AC");
+		break;
+	case IdtPF:
+		/* this case is here for debug only */
+		m->pfault++;
+		DBG("actrap: cpu%d: PF cr2 %#ullx\n", m->machno, cr2get());
+		break;
+	default:
+		print("actrap: cpu%d: %ulld\n", m->machno, u->type);
+	}
+Post:
+	/* NOTE(review): m->proc is used unchecked from here on, though it was tested against nil above */
+	m->icc->rc = ICCTRAP;
+	m->cr2 = cr2get();
+	memmove(m->proc->dbgreg, u, sizeof *u);
+	m->icc->note = n;
+	fpuprocsave(m->proc);
+	_pmcupdate(m);
+	/* fence before clearing fn, so the results above are written first */
+	mfence();
+	m->icc->fn = nil;
+	ready(m->proc);
+
+	mwait(&m->icc->fn);
+
+	if(m->icc->flushtlb)
+		acmmuswitch();
+	if(m->icc->fn != actrapret)
+		acsched();
+	DBG("actrap: ret\n");
+	memmove(u, m->proc->dbgreg, sizeof *u);
+	if(m->proc)
+		m->proc->actime += fastticks2us(fastticks(nil) - m->proc->actime1);
+}
+
+/*
+ * Entered in AC context upon a system call: report it through
+ * the icc (rc = ICCSYSCALL), save the FPU state, make the
+ * process ready and go back to the scheduling loop to wait for
+ * whatever is posted next.
+ */
+void
+acsyscall(void)
+{
+	Proc *p;
+
+	/*
+	 * If we saved the Ureg into m->proc->dbgregs,
+	 * There's nothing else we have to do.
+	 * Otherwise, we should m->proc->dbgregs = u;
+	 */
+	DBG("acsyscall: cpu%d\n", m->machno);
+
+	_pmcupdate(m);
+	p = m->proc;
+	p->actime1 = fastticks(nil);
+	m->syscall++;	/* would also count it in the TS core */
+	m->icc->rc = ICCSYSCALL;
+	m->cr2 = cr2get();
+	fpuprocsave(p);
+	_pmcupdate(m);
+	/* fence before clearing fn, so the results above are written first */
+	mfence();
+	m->icc->fn = nil;
+	ready(p);
+	/*
+	 * The next call is probably going to make us jmp
+	 * into user code, forgetting all our state in this
+	 * stack, upon the next syscall.
+	 * We don't nest calls in the current stack for too long.
+	 */
+	acsched();
+}
+
+/*
+ * Called in AP core context, to return from system call.
+ * Adds the time elapsed since actime1 was taken to the
+ * process's actime, then leaves through _acsysret.
+ */
+void
+acsysret(void)
+{
+	Proc *p;
+
+	DBG("acsysret\n");
+	p = m->proc;
+	if(p != nil)
+		p->actime += fastticks2us(fastticks(nil) - p->actime1);
+	_acsysret();
+}
+
+/*
+ * Debugging aid: print the pointer u and call ndnr().
+ */
+void
+dumpreg(void *u)
+{
+	print("reg is %p\n", u);
+	ndnr();
+}
+
+/* printable names of the core roles, indexed by the NIX* constants */
+char *rolename[] = 
+{
+	[NIXAC]	"AC",
+	[NIXTC]	"TC",
+	[NIXKC]	"KC",
+	[NIXXC]	"XC",
+};
+
+/*
+ * Set the role of the current core.
+ * mode must be one of NIXAC, NIXKC, NIXTC or NIXXC;
+ * anything else is a panic.
+ */
+void
+acmodeset(int mode)
+{
+	if(mode != NIXAC && mode != NIXKC && mode != NIXTC && mode != NIXXC)
+		panic("acmodeset: bad mode %d", mode);
+	m->nixtype = mode;
+}
+
+/*
+ * Initialisation for the AC code: open the apic up to
+ * interrupts and verify the structure offsets that the
+ * assembly code relies on.
+ */
+void
+acinit(void)
+{
+	Mach *mp;
+	Proc *pp;
+
+	/*
+	 * Lower the priority of the apic to 0,
+	 * to accept interrupts.
+	 * Raise it later if needed to disable them.
+	 */
+	apicpri(0);
+
+	/*
+	 * Be sure a few  assembler assumptions still hold.
+	 * Someone moved m->stack and I had fun debugging...
+	 */
+	/* null based pointers: the member addresses below are the member offsets */
+	mp = 0;
+	pp = 0;
+	assert((uintptr)&mp->proc == 16);
+	assert((uintptr)&pp->dbgreg == 24);
+	assert((uintptr)&mp->stack == 24);
+}

+ 391 - 0
sys/src/9/k10/acore.c.old

@@ -0,0 +1,391 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include <tos.h>
+#include <pool.h>
+#include "amd64.h"
+#include "ureg.h"
+#include "io.h"
+
+/*
+ * BUG:
+ * The AC must not accept interrupts while in the kernel,
+ * or we must be prepared for nesting them, which we are not.
+ * This is important for note handling, because postnote()
+ * assumes that it's ok to send an IPI to an AC, no matter its
+ * state.
+ * 
+ */
+
+/*
+ * Send an IPI to the core recorded in p->ac.
+ * Nothing is sent when the process has no AC attached.
+ */
+void
+intrac(Proc *p)
+{
+	Mach *mach;
+
+	if((mach = p->ac) != nil){
+		/*
+		 * The AC may go idle before the IPI lands; that is fine.
+		 */
+		DBG("intrac: ipi to cpu%d\n", mach->machno);
+		apicipi(mach->apicno);
+		return;
+	}
+	DBG("intrac: Proc.ac is nil. no ipi sent.\n");
+}
+
+/*
+ * Functions starting with ac... are run in the application core.
+ * All other functions are run by the time-sharing cores.
+ */
+
+typedef void (*APfunc)(void);
+extern int notify(Ureg*);
+extern void _acsysret(void);
+extern void _actrapret(void);
+
+static char *acnames[] = { "Ok", "Trap", "Syscall"};
+
+/*
+ * Reload CR3 with the physical address of this Mach's pml4.
+ * actrap() calls it when the ICC request has flushtlb set.
+ */
+void
+acmmuswitch(void)
+{
+	cr3put(m->pml4->pa);
+}
+void xactouser(u64int);
+/*
+ * ICC function: take the user stack pointer from the ICC data
+ * area and enter user mode through xactouser().
+ * Reaching the end of this function is a panic.
+ */
+void
+actouser(void)
+{
+	Ureg *ureg;
+	uintptr usp;
+
+	memmove(&usp, m->icc->data, sizeof usp);
+	ureg = m->proc->dbgreg;
+	DBG("cpu%d: touser usp = %#p entry %#p\n", m->machno, usp, ureg->ip);
+
+	/*
+	 * Updating tos here is wrong and does not belong in this
+	 * function: a process runs on a core because it was assigned
+	 * to it, not because actouser() was called. We may not even
+	 * call actouser here in the future. -Nemo.
+	 */
+
+	/* BUG: add a function, called here and kexit */
+	m->load = 100;
+	xactouser(usp);
+	panic("actouser");
+}
+
+/*
+ * Marker ICC function with an intentionally empty body.
+ * runacore() stores its address in the ICC after handling a trap and
+ * actrap() compares m->icc->fn against it: when they match, actrap()
+ * simply returns instead of calling acsched().
+ */
+void
+actrapret(void)
+{
+	/* done by actrap() */
+}
+
+/*
+ * Entered in AP core context, upon traps and system calls.
+ * using up->dbgreg means cores MUST be homogeneous.
+ *
+ * Counts and prints the trap, copies *u into m->proc->dbgreg,
+ * posts ICCTRAP in m->icc->rc, readies the process and then spins
+ * until m->icc->fn becomes non-nil. On the way out the (possibly
+ * modified) dbgreg is copied back into *u.
+ */
+void
+actrap(Ureg *u)
+{
+	/* print instead of DBG, so we see any trap by now */
+	switch(u->type){
+	case IdtIPI:
+		m->intr++;
+		print("actrap: cpu%d: IPI\n", m->machno);
+		/*
+		 * Beware: BUG: we can get now IPIs while in kernel mode,
+		 * after declaring the end of the interrupt.
+		 * The code is not prepared for that.
+		 */
+		apiceoi(IdtIPI);
+		break;
+	case IdtPF:
+		m->pfault++;
+		print("actrap: cpu%d: PF\n", m->machno);
+		break;
+	default:
+		print("actrap: cpu%d: %ulld\n", m->machno, u->type);
+	}
+	m->icc->rc = ICCTRAP;
+	m->cr2 = cr2get();
+	memmove(m->proc->dbgreg, u, sizeof *u);
+	m->icc->fn = nil;
+	mfence();
+	ready(m->proc);
+	m->load = 0;
+	/*
+	 * Busy-wait for the next ICC function. The '*' applied to the
+	 * function pointer does not change the value being compared.
+	 */
+	while(*m->icc->fn == nil)
+		;
+	m->load = 100;
+	if(m->icc->flushtlb)
+		acmmuswitch();
+	DBG("actrap: ret\n");
+	/* anything other than actrapret is handed to acsched() */
+	if(m->icc->fn != actrapret)
+		acsched();
+	memmove(u, m->proc->dbgreg, sizeof *u);
+}
+
+/*
+ * Report a system call made on this core: post ICCSYSCALL in
+ * m->icc->rc, save cr2, clear m->icc->fn, ready() the process,
+ * mark the core idle (m->load = 0) and call acsched().
+ */
+void
+acsyscall(void)
+{
+	/*
+	 * If we saved the Ureg into m->proc->dbgregs,
+	 * There's nothing else we have to do.
+	 * Otherwise, we should m->proc->dbgregs = u;
+	 */
+	DBG("acsyscall: cpu%d\n", m->machno);
+	m->syscall++;	/* would also count it in the TS core */
+	m->icc->rc = ICCSYSCALL;
+	m->cr2 = cr2get();
+	m->icc->fn = nil;
+	ready(m->proc);
+	m->load = 0;
+	/*
+	 * The next call is probably going to make us jmp
+	 * into user code, forgetting all our state in this
+	 * stack, upon the next syscall.
+	 * We don't nest calls in the current stack for too long.
+	 */
+	acsched();
+}
+
+/*
+ * Called in AP core context, to return from system call.
+ * runacore() installs it as the ICC function after syscall();
+ * it only logs and hands over to _acsysret(), which is declared
+ * extern in this file and defined elsewhere.
+ */
+void
+acsysret(void)
+{
+	DBG("acsysret\n");
+	_acsysret();
+}
+
+/*
+ * Debugging aid: print the pointer u and then call ndnr().
+ * NOTE(review): ndnr() is defined elsewhere; presumably it does
+ * not return -- confirm.
+ */
+void
+dumpreg(void *u)
+{
+	print("reg is %p\n", u);
+	ndnr();
+}
+
+
+/*
+ * run an arbitrary function with arbitrary args on an ap core
+ * first argument is always pml4 for process
+ * make a field and a struct for the args cache line.
+ *
+ * core selects the Mach through sys->machptr[core]; func is stored
+ * in its ICC once the n bytes at a have been copied into the ICC
+ * data area. With flushtlb set, the caller's pml4 page is copied
+ * over the target core's pml4 first. The calling process is marked
+ * Exotic and gives up this core with sched().
+ *
+ * Returns the return-code for the ICC or -1 if the process was
+ * interrupted while issuing the ICC.
+ */
+int
+runac(int core, APfunc func, int flushtlb, void *a, long n)
+{
+	Mach *mp;
+	uchar *dpg, *spg;
+
+	/* sizeof does not evaluate mp, so using it before assignment is fine */
+	if (n > sizeof(mp->icc->data))
+		panic("runac: args too long");
+
+	if((mp = sys->machptr[core]) == nil || mp->online == 0)
+		panic("runac: bad core %d", core);
+	if(mp->proc != nil && mp->proc != up)
+		panic("runac: mach is busy with another proc?");
+
+	memmove(mp->icc->data, a, n);
+	if(flushtlb){
+		dpg = UINT2PTR(mp->pml4->va);
+		spg = UINT2PTR(m->pml4->va);
+		/* We should copy only user space mappings:
+		 *	memmove(dpg, spg, m->pml4->daddr * sizeof(PTE));
+		 */
+		memmove(dpg, spg, PTPGSZ);
+	}
+	mp->icc->flushtlb = flushtlb;
+	mp->icc->rc = ICCOK;
+
+	DBG("runac: exotic proc on cpu%d\n", mp->machno);
+	if(waserror()){
+		qunlock(&up->debug);
+		nexterror();
+	}
+	qlock(&up->debug);
+	up->ac = mp;
+	up->nicc++;
+	up->state = Exotic;
+	up->psstate = 0;
+	qunlock(&up->debug);
+	poperror();
+	/* everything above must be in place before fn is set */
+	mfence();
+	mp->icc->fn = func;
+	sched();
+	return mp->icc->rc;
+}
+
+/*
+ * Cleanup done by runacore to pretend we are going back to user space.
+ * We won't return and won't do what syscall() would normally do.
+ * Do it here instead.
+ *
+ * Pops one error label, honours a pending Proc_tracesyscall, clears
+ * up->insyscall, runs a delayed sched() if one is pending and ends
+ * with kexit(ureg).
+ */
+static void
+fakeretfromsyscall(Ureg *ureg)
+{
+	int s;
+
+	poperror();	/* as syscall() would do if we would return */
+	if(up->procctl == Proc_tracesyscall){	/* Would this work? */
+		up->procctl = Proc_stopme;
+		s = splhi();
+		procctl(up);
+		splx(s);
+	}
+
+	up->insyscall = 0;
+	/* if we delayed sched because we held a lock, sched now */
+	if(up->delaysched){
+		sched();
+		splhi();
+	}
+	kexit(ureg);
+}
+
+/*
+ * Body of the IPI test kproc: wait on a 10000 tick tsleep, then
+ * poke the AC of the process passed in a with intrac().
+ */
+static void
+testproc(void *a)
+{
+	Proc *target;
+
+	if((target = a) == nil){
+		print("no proc to intr\n");
+		return;
+	}
+	tsleep(&up->sleep, return0, 0, 10000);
+	print("testproc: sending ipi to proc\n");
+	intrac(target);
+	print("sent\n");
+}
+
+/*
+ * Move the current process to an application core.
+ * This is performed at the end of execac(), and
+ * we pretend to be returning to user-space, but instead we
+ * dispatch the process to another core.
+ * 1. We do the final bookkeeping that syscall() would do after
+ *    a return from sysexec(), because we are not returning.
+ * 2. We dispatch the process to an AC using an ICC.
+ *
+ * This function won't return unless the process is reclaimed back
+ * to the time-sharing core, and is the handler for the process
+ * to deal with traps and system calls until the process dies.
+ *
+ * Remember that this function is the "line" between user and kernel
+ * space, it's not expected to raise|handle any error.
+ *
+ * We install a safety error label, just in case we raise errors,
+ * which we shouldn't. (noerrorsleft knows that for exotic processes
+ * there is an error label pushed by us).
+ *
+ * core is the index handed to runac(); ar0p is stored in ureg->ax
+ * before the first dispatch.
+ */
+void
+runacore(int core, u64int ar0p)
+{
+	Ureg *ureg;
+	void (*fn)(void);
+	int rc, flush, becometimesharing;
+
+	if(waserror())
+		panic("runacore: error: %s\n", up->errstr);
+	ureg = up->dbgreg;
+	ureg->ax = ar0p;		/* see xactouser */
+	fakeretfromsyscall(ureg);
+
+/* IPI testing */
+if(0)
+kproc("testproc", testproc, up);
+
+	rc = runac(core, actouser, 1, &ureg->sp, sizeof ureg->sp);
+	/*
+	 * NOTE(review): nothing below ever sets becometimesharing,
+	 * so as written the loop has no exit through the break.
+	 */
+	becometimesharing = 0;
+	for(;;){
+		flush = 0;
+		fn = nil;
+		switch(rc){
+		case ICCTRAP:
+			/* cr2 was saved on the AC by actrap() */
+			m->cr2 = up->ac->cr2;
+			DBG("runacore: trap %ulld cr2 %#ullx ureg %#p\n",
+				ureg->type, m->cr2, ureg);
+			if(ureg->type == IdtIPI){
+				if(up->procctl || up->nnote)
+					notify(up->dbgreg);
+				kexit(up->dbgreg);
+			}else
+				trap(ureg);
+			flush = 1;
+			fn = actrapret;
+			break;
+		case ICCSYSCALL:
+			DBG("runacore: syscall ax %#ullx ureg %#p\n",
+				ureg->ax, ureg);
+			syscall(ureg->ax, ureg);
+			flush = 1;
+			fn = acsysret;
+			break;
+		default:
+			panic("runacore: unexpected rc = %d", rc);
+		}
+		if(becometimesharing)
+			break;
+		/* the ICC data carries the Ureg pointer itself this time */
+		rc = runac(core, fn, flush, &ureg, sizeof ureg);
+	}
+
+	fakeretfromsyscall(up->dbgreg);
+	/*
+	 * dettach from the AC.
+	 */
+	up->ac->proc = nil;
+	up->ac = nil;
+	/* And we return to syscall, which would do nothing but
+	 * returning, and we'd be back to the TS core.
+	 */
+}
+
+/*
+ * Record the role of the current core in m->nixtype.
+ * Only NIXAC, NIXKC and NIXTC are accepted; anything else panics.
+ */
+void
+acmodeset(int mode)
+{
+	switch(mode){
+	case NIXAC:
+	case NIXKC:
+	case NIXTC:
+		break;
+	default:
+		/* name this function in the message, not "apmodeset" */
+		panic("acmodeset: bad mode %d", mode);
+	}
+	m->nixtype = mode;
+}
+
+/*
+ * Break the link between p and its application core.
+ * Does nothing unless p->ac is set and that core still runs p.
+ */
+void
+stopac(Proc *p)
+{
+	Mach *ac;
+
+	ac = p->ac;
+	if(ac == nil || ac->proc != p)
+		return;
+	DBG("stopac: cpu%d\n", ac->machno);
+	p->ac = nil;
+	ac->proc = nil;
+//	send sipi to p->ac, it would rerun squidboy(), and
+//	wait for us to give it a function to run.
+}
+
+/*
+ * Core initialisation: the only step is setting the local APIC
+ * priority to 0 through apicpri().
+ */
+void
+acinit(void)
+{
+	/*
+	 * Lower the priority of the apic to 0,
+	 * to accept interrupts.
+	 * Raise it later if needed to disable them.
+	 */
+	apicpri(0);
+}

+ 419 - 0
sys/src/9/k10/acpi.h

@@ -0,0 +1,419 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+typedef struct Atable Atable;
+typedef struct Facs Facs;
+typedef struct Fadt Fadt;
+typedef struct Gas Gas;
+typedef struct Gpe Gpe;
+typedef struct Rsdp Rsdp;
+typedef struct Sdthdr Sdthdr;
+typedef struct Parse Parse;
+typedef struct Xsdt Xsdt;
+typedef struct Regio Regio;
+typedef struct Reg Reg;
+typedef struct Madt Madt;
+typedef struct Msct Msct;
+typedef struct Mdom Mdom;
+typedef struct Apicst Apicst;
+typedef struct Srat Srat;
+typedef struct Slit Slit;
+typedef struct SlEntry SlEntry;
+
+enum
+{
+
+	Sdthdrsz	= 36,	/* size of SDT header */
+
+	/* ACPI regions. Gas ids */
+	Rsysmem	= 0,
+	Rsysio,
+	Rpcicfg,
+	Rembed,
+	Rsmbus,
+	Rcmos,
+	Rpcibar,
+	Ripmi,
+	Rfixedhw	= 0x7f,
+
+	/* ACPI PM1 control */
+	Pm1SciEn		= 0x1,		/* Generate SCI and not SMI */
+
+	/* ACPI tbdf as encoded in acpi region base addresses */
+	Rpciregshift	= 0,
+	Rpciregmask	= 0xFFFF,
+	Rpcifunshift	= 16,
+	Rpcifunmask	= 0xFFFF,
+	Rpcidevshift	= 32,
+	Rpcidevmask	= 0xFFFF,
+	Rpcibusshift	= 48,
+	Rpcibusmask	= 0xFFFF,
+
+	/* Apic structure types */
+	ASlapic = 0,	/* processor local apic */
+	ASioapic,	/* I/O apic */
+	ASintovr,	/* Interrupt source override */
+	ASnmi,		/* NMI source */
+	ASlnmi,		/* local apic nmi */
+	ASladdr,	/* local apic address override */
+	ASiosapic,	/* I/O sapic */
+	ASlsapic,	/* local sapic */
+	ASintsrc,	/* platform interrupt sources */
+	ASlx2apic,	/* local x2 apic */
+	ASlx2nmi,	/* local x2 apic NMI */
+
+	/* Apic flags */
+	AFbus	= 0,	/* polarity/trigger like in ISA */
+	AFhigh	= 1,	/* active high */
+	AFlow	= 3,	/* active low */
+	AFpmask	= 3,	/* polarity bits */
+	AFedge	= 1<<2,	/* edge triggered */
+	AFlevel	= 3<<2,	/* level triggered */
+	AFtmask	= 3<<2,	/* trigger bits */
+
+	/* SRAT types */
+	SRlapic = 0,	/* Local apic/sapic affinity */
+	SRmem,		/* Memory affinity */
+	SRlx2apic,	/* x2 apic affinity */
+
+	/* Arg for _PIC */
+	Ppic = 0,	/* PIC interrupt model */
+	Papic,		/* APIC interrupt model */
+	Psapic,		/* SAPIC interrupt model */
+
+
+	CMregion = 0,			/* regio name spc base len accsz*/
+	CMgpe,				/* gpe name id */
+
+	Qdir = 0,
+	Qctl,
+	Qtbl,
+	Qio,
+};
+
+/*
+ * ACPI table (sw)
+ * One node of a singly linked list (next) of tables.
+ * The sig/oemid/oemtblid arrays are each one byte longer than the
+ * matching Sdthdr field (4, 6 and 8 bytes), presumably for a
+ * terminating NUL -- confirm.
+ */
+struct Atable
+{
+	Atable*	next;		/* next table in list */
+	int	is64;		/* uses 64bits */
+	char	sig[5];		/* signature */
+	char	oemid[7];	/* oem id str. */
+	char	oemtblid[9];	/* oem tbl. id str. */
+	uchar* tbl;		/* pointer to table in memory */
+	long	dlen;		/* size of data in table, after Sdthdr */
+};
+
+/*
+ * One general purpose event: where its status and enable bits
+ * live (port and bit number) and the handler object tied to it.
+ */
+struct Gpe
+{
+	uintptr	stsio;		/* port used for status */
+	int	stsbit;		/* bit number */
+	uintptr	enio;		/* port used for enable */
+	int	enbit;		/* bit number */
+	int	nb;		/* event number */
+	char*	obj;		/* handler object  */
+	int	id;		/* id as supplied by user */
+};
+
+/*
+ * Pairs a table signature with the function that parses tables
+ * carrying it.
+ * NOTE(review): the (uchar*, int) arguments are presumably the
+ * table and its length -- confirm against the callers.
+ */
+struct Parse
+{
+	char*	sig;
+	Atable*	(*f)(uchar*, int);	/* return nil to keep vmap */
+};
+
+/*
+ * Table of accessors for one kind of region: a get/set pair for
+ * each of 8, 16, 32 and 64 bit accesses.
+ * NOTE(review): arg is presumably what is passed as the trailing
+ * void* of every accessor -- confirm.
+ */
+struct Regio{
+	void	*arg;
+	u8int	(*get8)(uintptr, void*);
+	void	(*set8)(uintptr, u8int, void*);
+	u16int	(*get16)(uintptr, void*);
+	void	(*set16)(uintptr, u16int, void*);
+	u32int	(*get32)(uintptr, void*);
+	void	(*set32)(uintptr, u32int, void*);
+	u64int	(*get64)(uintptr, void*);
+	void	(*set64)(uintptr, u64int, void*);
+};
+
+/*
+ * A named region: address space, physical base, kernel mapping,
+ * length and access size.
+ * NOTE(review): spc presumably holds one of the R* region ids
+ * from the enum above -- confirm.
+ */
+struct Reg
+{
+	char*	name;
+	int	spc;		/* io space */
+	u64int	base;		/* address, physical */
+	uchar*	p;		/* address, kmapped */
+	u64int	len;
+	int	tbdf;
+	int	accsz;		/* access size */
+};
+
+/* Generic address structure. 
+ */
+#pragma pack on
+/*
+ * Declared inside the #pragma pack region: with packing the five
+ * fields occupy 1+1+1+1+8 = 12 bytes.
+ */
+struct Gas
+{
+	u8int	spc;	/* address space id */
+	u8int	len;	/* register size in bits */
+	u8int	off;	/* bit offset */
+	u8int	accsz;	/* 1: byte; 2: word; 3: dword; 4: qword */
+	u64int	addr;	/* address (or acpi encoded tbdf + reg) */
+};
+
+/* Root system description table pointer.
+ * Used to locate the root system description table RSDT
+ * (or the extended system description table from version 2) XSDT.
+ * The XSDT contains (after the SDT header) a list of pointers to tables:
+ *	- FADT	fixed acpi description table.
+ *		It points to the DSDT, AML code making the acpi namespace.
+ *	- SSDTs	tables with AML code to add to the acpi namespace.
+ *	- pointers to other tables for apics, etc.
+ */
+
+/*
+ * Every field is a byte array, so the structure has no alignment
+ * holes; the fields add up to 36 bytes.
+ */
+struct Rsdp
+{
+	u8int	signature[8];			/* "RSD PTR " */
+	u8int	rchecksum;
+	u8int	oemid[6];
+	u8int	revision;
+	u8int	raddr[4];			/* RSDT */
+	u8int	length[4];
+	u8int	xaddr[8];			/* XSDT */
+	u8int	xchecksum;			/* XSDT */
+	u8int	_33_[3];			/* reserved */
+};
+
+/* Header for ACPI description tables
+ * All fields are byte arrays; they add up to 36 bytes, which is
+ * the value of Sdthdrsz.
+ */
+struct Sdthdr
+{
+	u8int	sig[4];			/* "FACP" or whatever */
+	u8int	length[4];
+	u8int	rev;
+	u8int	csum;
+	u8int	oemid[6];
+	u8int	oemtblid[8];
+	u8int	oemrev[4];
+	u8int	creatorid[4];
+	u8int	creatorrev[4];
+};
+
+/* Firmware control structure
+ * Last structure inside the #pragma pack region, so ospmflags
+ * follows the single vers byte without padding.
+ */
+struct Facs
+{
+	u32int	hwsig;
+	u32int	wakingv;
+	u32int	glock;
+	u32int	flags;
+	u64int	xwakingv;
+	u8int	vers;
+	u32int	ospmflags;
+};
+
+#pragma pack off
+
+/* Maximum System Characteristics table
+ * dom heads a list of Mdom entries chained through Mdom.next.
+ */
+struct Msct
+{
+	int	ndoms;		/* number of domains */
+	int	nclkdoms;	/* number of clock domains */
+	u64int	maxpa;		/* max physical address */
+
+	Mdom*	dom;		/* domain information list */
+};
+
+/*
+ * One entry of the Msct domain list: a range of domain ids
+ * (start..end) with its processor and memory capacity.
+ */
+struct Mdom
+{
+	Mdom*	next;
+	int	start;		/* start dom id */
+	int	end;		/* end dom id */
+	int	maxproc;	/* max processor capacity */
+	u64int	maxmem;		/* max memory capacity */
+};
+
+/* Multiple APIC description table
+ * Interrupts are virtualized by ACPI and each APIC has
+ * a `virtual interrupt base' where its interrupts start.
+ * Addresses are processor-relative physical addresses.
+ * Only enabled devices are linked, others are filtered out.
+ * st heads a list of Apicst entries chained through Apicst.next.
+ */
+struct Madt
+{
+	u64int	lapicpa;		/* local APIC addr */
+	int	pcat;		/* the machine has PC/AT 8259s */
+	Apicst*	st;		/* list of Apic related structures */
+};
+
+/*
+ * One APIC-related structure from the Madt list.
+ * NOTE(review): type is presumably one of the AS* constants from
+ * the enum above and tells which member of the anonymous union
+ * is valid -- confirm.
+ */
+struct Apicst
+{
+	int	type;
+	Apicst*	next;
+	union{
+		struct{
+			int	pid;	/* processor id */
+			int	id;	/* apic no */
+		} lapic;
+		struct{
+			int	id;	/* io apic id */
+			u32int	ibase;	/* interrupt base addr. */
+			u64int	addr;	/* base address */
+		} ioapic, iosapic;
+		struct{
+			int	irq;	/* bus intr. source (ISA only) */
+			int	intr;	/* system interrupt */
+			int	flags;	/* apic flags */
+		} intovr;
+		struct{
+			int	intr;	/* system interrupt */
+			int	flags;	/* apic flags */
+		} nmi;
+		struct{
+			int	pid;	/* processor id */
+			int	flags;	/* lapic flags */
+			int	lint;	/* lapic LINTn for nmi */
+		} lnmi;
+		struct{
+			int	pid;	/* processor id */
+			int	id;	/* apic id */
+			int	eid;	/* apic eid */
+			int	puid;	/* processor uid */
+			char*	puids;	/* same thing */
+		} lsapic;
+		struct{
+			int	pid;	/* processor id */
+			int	peid;	/* processor eid */
+			int	iosv;	/* io sapic vector */
+			int	intr;	/* global sys intr. */
+			int	type;	/* intr type */
+			int	flags;	/* apic flags */
+			int	any;	/* err sts at any proc */
+		} intsrc;
+		struct{
+			int	id;	/* x2 apic id */
+			int	puid;	/* processor uid */
+		} lx2apic;
+		struct{
+			int	puid;
+			int	flags;
+			int	intr;
+		} lx2nmi;
+	};
+};
+
+/* System resource affinity table
+ * Entries are chained through next.
+ * NOTE(review): type is presumably one of the SR* constants from
+ * the enum above and selects the union member -- confirm.
+ */
+struct Srat
+{
+	int	type;
+	Srat*	next;
+	union{
+		struct{
+			int	dom;	/* proximity domain */
+			int	apic;	/* apic id */
+			int	sapic;	/* sapic id */
+			int	clkdom;	/* clock domain */
+		} lapic;
+		struct{
+			int	dom;	/* proximity domain */
+			u64int	addr;	/* base address */
+			u64int	len;
+			int	hplug;	/* hot pluggable */
+			int	nvram;	/* non volatile */	
+		} mem;
+		struct{
+			int	dom;	/* proximity domain */
+			int	apic;	/* x2 apic id */
+			int	clkdom;	/* clock domain */
+		} lx2apic;
+	};
+};
+
+/* System locality information table
+ * NOTE(review): e is presumably an array of rowlen rows, each an
+ * array of SlEntry -- confirm against the code that fills it.
+ */
+struct Slit {
+	uvlong rowlen;
+	SlEntry **e;
+};
+
+/*
+ * One cell of the Slit: a proximity domain and the distance to it.
+ */
+struct SlEntry {
+	int dom;	/* proximity domain */
+	uint dist;	/* distance to proximity domain */
+};
+
+/* Fixed ACPI description table.
+ * Describes implementation and hardware registers.
+ * PM* blocks are low level functions.
+ * GPE* blocks refer to general purpose events.
+ * P_* blocks are for processor features.
+ * Has address for the DSDT.
+ *
+ * NOTE(review): declared after "#pragma pack off" and the reserved
+ * bytes are only mentioned in comments, so this is presumably a
+ * decoded copy and not an overlay of the raw table -- confirm.
+ */
+struct Fadt
+{
+	u32int	facs;
+	u32int	dsdt;
+	/* 1 reserved */
+	u8int	pmprofile;
+	u16int	sciint;
+	u32int	smicmd;
+	u8int	acpienable;
+	u8int	acpidisable;
+	u8int	s4biosreq;
+	u8int	pstatecnt;
+	u32int	pm1aevtblk;
+	u32int	pm1bevtblk;
+	u32int	pm1acntblk;
+	u32int	pm1bcntblk;
+	u32int	pm2cntblk;
+	u32int	pmtmrblk;
+	u32int	gpe0blk;
+	u32int	gpe1blk;
+	u8int	pm1evtlen;
+	u8int	pm1cntlen;
+	u8int	pm2cntlen;
+	u8int	pmtmrlen;
+	u8int	gpe0blklen;
+	u8int	gpe1blklen;
+	u8int	gp1base;
+	u8int	cstcnt;
+	u16int	plvl2lat;
+	u16int	plvl3lat;
+	u16int	flushsz;
+	u16int	flushstride;
+	u8int	dutyoff;
+	u8int	dutywidth;
+	u8int	dayalrm;
+	u8int	monalrm;
+	u8int	century;
+	u16int	iapcbootarch;
+	/* 1 reserved */
+	u32int	flags;
+	Gas	resetreg;
+	u8int	resetval;
+	/* 3 reserved */
+	u64int	xfacs;
+	u64int	xdsdt;
+	Gas	xpm1aevtblk;
+	Gas	xpm1bevtblk;
+	Gas	xpm1acntblk;
+	Gas	xpm1bcntblk;
+	Gas	xpm2cntblk;
+	Gas	xpmtmrblk;
+	Gas	xgpe0blk;
+	Gas	xgpe1blk;
+};
+
+/* XSDT/RSDT. 4/8 byte addresses starting at p.
+ * NOTE(review): asize is presumably the size of one address
+ * (4 or 8) and len the size of the area at p -- confirm.
+ */
+struct Xsdt
+{
+	int	len;
+	int	asize;
+	u8int*	p;
+};
+
+extern uintmem acpimblocksize(uintmem, int*);

+ 205 - 0
sys/src/9/k10/amd64.h

@@ -0,0 +1,205 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+enum {						/* Cr0 */
+	Pe		= 0x00000001,		/* Protected Mode Enable */
+	Mp		= 0x00000002,		/* Monitor Coprocessor */
+	Em		= 0x00000004,		/* Emulate Coprocessor */
+	Ts		= 0x00000008,		/* Task Switched */
+	Et		= 0x00000010,		/* Extension Type */
+	Ne		= 0x00000020,		/* Numeric Error  */
+	Wp		= 0x00010000,		/* Write Protect */
+	Am		= 0x00040000,		/* Alignment Mask */
+	Nw		= 0x20000000,		/* Not Writethrough */
+	Cd		= 0x40000000,		/* Cache Disable */
+	Pg		= 0x80000000,		/* Paging Enable */
+};
+
+enum {						/* Cr3 */
+	Pwt		= 0x00000008,		/* Page-Level Writethrough */
+	Pcd		= 0x00000010,		/* Page-Level Cache Disable */
+};
+
+enum {						/* Cr4 */
+	Vme		= 0x00000001,		/* Virtual-8086 Mode Extensions */
+	Pvi		= 0x00000002,		/* Protected Mode Virtual Interrupts */
+	Tsd		= 0x00000004,		/* Time-Stamp Disable */
+	De		= 0x00000008,		/* Debugging Extensions */
+	Pse		= 0x00000010,		/* Page-Size Extensions */
+	Pae		= 0x00000020,		/* Physical Address Extension */
+	Mce		= 0x00000040,		/* Machine Check Enable */
+	Pge		= 0x00000080,		/* Page-Global Enable */
+	Pce		= 0x00000100,		/* Performance Monitoring Counter Enable */
+	Osfxsr		= 0x00000200,		/* FXSAVE/FXRSTOR Support */
+	Osxmmexcpt	= 0x00000400,		/* Unmasked Exception Support */
+};
+
+enum {						/* Rflags */
+	Cf		= 0x00000001,		/* Carry Flag */
+	Pf		= 0x00000004,		/* Parity Flag */
+	Af		= 0x00000010,		/* Auxiliary Flag */
+	Zf		= 0x00000040,		/* Zero Flag */
+	Sf		= 0x00000080,		/* Sign Flag */
+	Tf		= 0x00000100,		/* Trap Flag */
+	If		= 0x00000200,		/* Interrupt Flag */
+	Df		= 0x00000400,		/* Direction Flag */
+	Of		= 0x00000800,		/* Overflow Flag */
+	Iopl0		= 0x00000000,		/* I/O Privilege Level */
+	Iopl1		= 0x00001000,
+	Iopl2		= 0x00002000,
+	Iopl3		= 0x00003000,
+	Nt		= 0x00004000,		/* Nested Task */
+	Rf		= 0x00010000,		/* Resume Flag */
+	Vm		= 0x00020000,		/* Virtual-8086 Mode */
+	Ac		= 0x00040000,		/* Alignment Check */
+	Vif		= 0x00080000,		/* Virtual Interrupt Flag */
+	Vip		= 0x00100000,		/* Virtual Interrupt Pending */
+	Id		= 0x00200000,		/* ID Flag */
+};
+
+enum {						/* MSRs */
+	PerfEvtbase	= 0xc0010000,		/* Performance Event Select */
+	PerfCtrbase	= 0xc0010004,		/* Performance Counters */
+
+	Efer		= 0xc0000080,		/* Extended Feature Enable */
+	Star		= 0xc0000081,		/* Legacy Target IP and [CS]S */
+	Lstar		= 0xc0000082,		/* Long Mode Target IP */
+	Cstar		= 0xc0000083,		/* Compatibility Target IP */
+	Sfmask		= 0xc0000084,		/* SYSCALL Flags Mask */
+	FSbase		= 0xc0000100,		/* 64-bit FS Base Address */
+	GSbase		= 0xc0000101,		/* 64-bit GS Base Address */
+	KernelGSbase	= 0xc0000102,		/* SWAPGS instruction */
+};
+
+enum {						/* Efer */
+	Sce		= 0x00000001,		/* System Call Extension */
+	Lme		= 0x00000100,		/* Long Mode Enable */
+	Lma		= 0x00000400,		/* Long Mode Active */
+	Nxe		= 0x00000800,		/* No-Execute Enable */
+	Svme		= 0x00001000,		/* SVM Extension Enable */
+	Ffxsr		= 0x00004000,		/* Fast FXSAVE/FXRSTOR */
+};
+
+enum {						/* PML4E/PDPE/PDE/PTE */
+	PteP		= 0x0000000000000001ull,/* Present */
+	PteRW		= 0x0000000000000002ull,/* Read/Write */
+	PteU		= 0x0000000000000004ull,/* User/Supervisor */
+	PtePWT		= 0x0000000000000008ull,/* Page-Level Write Through */
+	PtePCD		= 0x0000000000000010ull,/* Page Level Cache Disable */
+	PteA		= 0x0000000000000020ull,/* Accessed */
+	PteD		= 0x0000000000000040ull,/* Dirty */
+	PtePS		= 0x0000000000000080ull,/* Page Size */
+	Pte4KPAT	= PtePS,		/* PTE PAT */
+	PteG		= 0x0000000000000100ull,/* Global */
+	Pte2MPAT	= 0x0000000000001000ull,/* PDE PAT */
+	Pte1GPAT	= Pte2MPAT,		/* PDPE PAT */
+	PteNX		= 0x8000000000000000ull,/* No Execute */
+};
+
+enum {						/* Exceptions */
+	IdtDE		= 0,			/* Divide-by-Zero Error */
+	IdtDB		= 1,			/* Debug */
+	IdtNMI		= 2,			/* Non-Maskable-Interrupt */
+	IdtBP		= 3,			/* Breakpoint */
+	IdtOF		= 4,			/* Overflow */
+	IdtBR		= 5,			/* Bound-Range */
+	IdtUD		= 6,			/* Invalid-Opcode */
+	IdtNM		= 7,			/* Device-Not-Available */
+	IdtDF		= 8,			/* Double-Fault */
+	Idt09		= 9,			/* unsupported */
+	IdtTS		= 10,			/* Invalid-TSS */
+	IdtNP		= 11,			/* Segment-Not-Present */
+	IdtSS		= 12,			/* Stack */
+	IdtGP		= 13,			/* General-Protection */
+	IdtPF		= 14,			/* Page-Fault */
+	Idt0F		= 15,			/* reserved */
+	IdtMF		= 16,			/* x87 FPE-Pending */
+	IdtAC		= 17,			/* Alignment-Check */
+	IdtMC		= 18,			/* Machine-Check */
+	IdtXF		= 19,			/* SIMD Floating-Point */
+};
+
+/*
+ * Vestigial Segmented Virtual Memory.
+ */
+enum {						/* Segment Descriptor */
+	SdISTM		= 0x0000000700000000ull,/* Interrupt Stack Table Mask */
+	SdA		= 0x0000010000000000ull,/* Accessed */
+	SdR		= 0x0000020000000000ull,/* Readable (Code) */
+	SdW		= 0x0000020000000000ull,/* Writeable (Data) */
+	SdE		= 0x0000040000000000ull,/* Expand Down */
+	SdaTSS		= 0x0000090000000000ull,/* Available TSS */
+	SdbTSS		= 0x00000b0000000000ull,/* Busy TSS */
+	SdCG		= 0x00000c0000000000ull,/* Call Gate */
+	SdIG		= 0x00000e0000000000ull,/* Interrupt Gate */
+	SdTG		= 0x00000f0000000000ull,/* Trap Gate */
+	SdCODE		= 0x0000080000000000ull,/* Code/Data */
+	SdS		= 0x0000100000000000ull,/* System/User */
+	SdDPL0		= 0x0000000000000000ull,/* Descriptor Privilege Level */
+	SdDPL1		= 0x0000200000000000ull,
+	SdDPL2		= 0x0000400000000000ull,
+	SdDPL3		= 0x0000600000000000ull,
+	SdP		= 0x0000800000000000ull,/* Present */
+	Sd4G		= 0x000f00000000ffffull,/* 4G Limit */
+	SdL		= 0x0020000000000000ull,/* Long Attribute */
+	SdD		= 0x0040000000000000ull,/* Default Operand Size */
+	SdG		= 0x0080000000000000ull,/* Granularity */
+};
+
+/*
+ * Performance Counter Configuration
+ */
+enum {						/* Performance Event Selector */
+    				 
+	PeHo		= 0x0000020000000000ull,/* Host only */
+	PeGo		= 0x0000010000000000ull,/* Guest only */
+	PeEvMskH	= 0x0000000f00000000ull,/* Event mask H */
+	PeCtMsk		= 0x00000000ff000000ull,/* Counter mask */
+	PeInMsk		= 0x0000000000800000ull,/* Invert mask */
+	PeCtEna		= 0x0000000000400000ull,/* Counter enable */
+	PeInEna		= 0x0000000000100000ull,/* Interrupt enable */
+	PePnCtl		= 0x0000000000080000ull,/* Pin control */
+	PeEdg		= 0x0000000000040000ull,/* Edge detect */
+	PeOS		= 0x0000000000020000ull,/* OS mode */
+	PeUsr		= 0x0000000000010000ull,/* User mode */
+	PeUnMsk		= 0x000000000000ff00ull,/* Unit Mask */
+	PeEvMskL	= 0x00000000000000ffull,/* Event Mask L */
+
+	PeEvMsksh	= 32ull,		/* Event mask shift */
+};
+
+enum {						/* Segment Selector */
+	SsRPL0		= 0x0000,		/* Requestor Privilege Level */
+	SsRPL1		= 0x0001,
+	SsRPL2		= 0x0002,
+	SsRPL3		= 0x0003,
+	SsTIGDT		= 0x0000,		/* GDT Table Indicator  */
+	SsTILDT		= 0x0004,		/* LDT Table Indicator */
+	SsSIM		= 0xfff8,		/* Selector Index Mask */
+};
+
+#define SSEL(si, tirpl)	(((si)<<3)|(tirpl))	/* Segment Selector */
+
+/*
+ * Descriptor table indices. SSEL() above shifts an index left by
+ * three and ors in the table-indicator/RPL bits to form a selector.
+ */
+enum {
+	SiNULL		= 0,			/* NULL selector index */
+	SiCS		= 1,			/* CS selector index */
+	SiDS		= 2,			/* DS selector index */
+	SiU32CS		= 3,			/* User CS selector index (presumably the 32-bit one - confirm) */
+	SiUDS		= 4,			/* User DS selector index */
+	SiUCS		= 5,			/* User CS selector index */
+	SiFS		= 6,			/* FS selector index */
+	SiGS		= 7,			/* GS selector index */
+	SiTSS		= 8,			/* TSS selector index */
+};
+
+/*
+ * Extern registers.
+ */
+#define RMACH		R15			/* m-> */
+#define RUSER		R14			/* up-> */

+ 416 - 0
sys/src/9/k10/apic.c

@@ -0,0 +1,416 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "apic.h"
+#include "io.h"
+
+enum {						/* Local APIC registers */
+	Id		= 0x0020,		/* Identification */
+	Ver		= 0x0030,		/* Version */
+	Tp		= 0x0080,		/* Task Priority */
+	Ap		= 0x0090,		/* Arbitration Priority */
+	Pp		= 0x00a0,		/* Processor Priority */
+	Eoi		= 0x00b0,		/* EOI */
+	Ld		= 0x00d0,		/* Logical Destination */
+	Df		= 0x00e0,		/* Destination Format */
+	Siv		= 0x00f0,		/* Spurious Interrupt Vector */
+	Is		= 0x0100,		/* Interrupt Status (8) */
+	Tm		= 0x0180,		/* Trigger Mode (8) */
+	Ir		= 0x0200,		/* Interrupt Request (8) */
+	Es		= 0x0280,		/* Error Status */
+	Iclo		= 0x0300,		/* Interrupt Command */
+	Ichi		= 0x0310,		/* Interrupt Command [63:32] */
+	Lvt0		= 0x0320,		/* Local Vector Table 0 */
+	Lvt5		= 0x0330,		/* Local Vector Table 5 */
+	Lvt4		= 0x0340,		/* Local Vector Table 4 */
+	Lvt1		= 0x0350,		/* Local Vector Table 1 */
+	Lvt2		= 0x0360,		/* Local Vector Table 2 */
+	Lvt3		= 0x0370,		/* Local Vector Table 3 */
+	Tic		= 0x0380,		/* Timer Initial Count */
+	Tcc		= 0x0390,		/* Timer Current Count */
+	Tdc		= 0x03e0,		/* Timer Divide Configuration */
+
+	Tlvt		= Lvt0,			/* Timer */
+	Lint0		= Lvt1,			/* Local Interrupt 0 */
+	Lint1		= Lvt2,			/* Local Interrupt 1 */
+	Elvt		= Lvt3,			/* Error */
+	Pclvt		= Lvt4,			/* Performance Counter */
+	Tslvt		= Lvt5,			/* Thermal Sensor */
+};
+
+enum {						/* Siv */
+	Swen		= 0x00000100,		/* Software Enable */
+	Fdis		= 0x00000200,		/* Focus Disable */
+};
+
+enum {						/* Iclo */
+	Lassert		= 0x00004000,		/* Assert level */
+
+	DSnone		= 0x00000000,		/* Use Destination Field */
+	DSself		= 0x00040000,		/* Self is only destination */
+	DSallinc	= 0x00080000,		/* All including self */
+	DSallexc	= 0x000c0000,		/* All Excluding self */
+};
+
+enum {						/* Tlvt */
+	Periodic	= 0x00020000,		/* Periodic Timer Mode */
+};
+
+enum {						/* Tdc */
+	DivX2		= 0x00000000,		/* Divide by 2 */
+	DivX4		= 0x00000001,		/* Divide by 4 */
+	DivX8		= 0x00000002,		/* Divide by 8 */
+	DivX16		= 0x00000003,		/* Divide by 16 */
+	DivX32		= 0x00000008,		/* Divide by 32 */
+	DivX64		= 0x00000009,		/* Divide by 64 */
+	DivX128		= 0x0000000a,		/* Divide by 128 */
+	DivX1		= 0x0000000b,		/* Divide by 1 */
+};
+
+/* mapping of the local APIC registers, set up once by apicinit() */
+static u8int* apicbase;
+/* next machno apicinit() hands out; 0 is kept for the bootstrap processor */
+static int apmachno = 1;
+
+/* local APIC state, indexed by APIC id */
+Apic	xlapic[Napic];
+Mach	*xlapicmachptr[Napic];		/* maintained, but unused */
+
+/*
+ * Read the 32-bit local APIC register at byte offset r from apicbase.
+ */
+static u32int
+apicrget(int r)
+{
+	return *((u32int*)(apicbase+r));
+}
+
+/*
+ * Write data to the 32-bit local APIC register at byte offset r
+ * from apicbase.
+ */
+static void
+apicrput(int r, u32int data)
+{
+	*((u32int*)(apicbase+r)) = data;
+}
+
+/*
+ * Signal end-of-interrupt by writing 0 to the Eoi register.
+ * vecno is not used for the write; it is returned unchanged.
+ */
+int
+apiceoi(int vecno)
+{
+	apicrput(Eoi, 0);
+
+	return vecno;
+}
+
+/*
+ * Test the bit for vecno in the Interrupt Status registers.
+ * Each register holds 32 vectors and the registers are 16 bytes
+ * apart. Returns non-zero when the bit is set, zero otherwise.
+ */
+int
+apicisr(int vecno)
+{
+	u32int isr;
+
+	isr = apicrget(Is + (vecno/32)*16);
+
+	/*
+	 * Build the mask as an unsigned value: shifting a signed 1
+	 * by 31 (vecno%32 == 31) overflows the int.
+	 */
+	return isr & ((u32int)1<<(vecno%32));
+}
+
+/*
+ * Register the local APIC with id apicno: map the register page
+ * at pa the first time through, mark the entry useable and give
+ * it a machno. isbp is non-zero for the bootstrap processor.
+ * Problems are reported with print() and the entry is left alone.
+ */
+void
+apicinit(int apicno, uintmem pa, int isbp)
+{
+	Apic *apic;
+
+	/*
+	 * Mark the APIC useable if it has a good ID
+	 * and the registers can be mapped.
+	 * The APIC Extended Broadcast and ID bits in the HyperTransport
+	 * Transaction Control register determine whether 4 or 8 bits
+	 * are used for the APIC ID. There is also xAPIC and x2APIC
+	 * to be dealt with sometime.
+	 */
+	DBG("apicinit: apicno %d pa %#p isbp %d\n", apicno, pa, isbp);
+	if(apicno >= Napic){
+		print("apicinit%d: out of range\n", apicno);
+		return;
+	}
+	if((apic = &xlapic[apicno])->useable){
+		print("apicinit%d: already initialised\n", apicno);
+		return;
+	}
+	/* the mapping is shared: only the first call creates it */
+	if(apicbase == nil){
+		if((apicbase = vmap(pa, 1024)) == nil){
+			print("apicinit%d: can't map apicbase\n", apicno);
+			return;
+		}
+		DBG("apicinit%d: apicbase %#p -> %#p\n", apicno, pa, apicbase);
+	}
+	apic->useable = 1;
+
+	/*
+	 * Assign a machno to the processor associated with this
+	 * APIC, it may not be an identity map.
+	 * Machno 0 is always the bootstrap processor.
+	 */
+	if(isbp){
+		apic->machno = 0;
+		m->apicno = apicno;
+	}
+	else
+		apic->machno = apmachno++;
+}
+
+/*
+ * Debug-print one Apic entry, together with the LVT and SIV
+ * registers read from the local APIC of the running processor.
+ * Entries that are not useable or have addr set are skipped.
+ */
+static void
+apicdump0(Apic *apic, int i)
+{
+	if(apic->useable && apic->addr == 0){
+		DBG("apic%d: machno %d lint0 %#8.8ux lint1 %#8.8ux\n",
+			i, apic->machno, apic->lvt[0], apic->lvt[1]);
+		DBG(" tslvt %#8.8ux pclvt %#8.8ux elvt %#8.8ux\n",
+			apicrget(Tslvt), apicrget(Pclvt), apicrget(Elvt));
+		DBG(" tlvt %#8.8ux lint0 %#8.8ux lint1 %#8.8ux siv %#8.8ux\n",
+			apicrget(Tlvt), apicrget(Lint0),
+			apicrget(Lint1), apicrget(Siv));
+	}
+}
+
+/*
+ * With DBGFLG set, dump apicbase/apmachno and then every entry of
+ * xlapic followed by every entry of xioapic (declared elsewhere).
+ */
+void
+apicdump(void)
+{
+	int i;
+
+	if(!DBGFLG)
+		return;
+
+	DBG("apicbase %#p apmachno %d\n", apicbase, apmachno);
+	for(i = 0; i < Napic; i++)
+		apicdump0(xlapic + i, i);
+	for(i = 0; i < Napic; i++)
+		apicdump0(xioapic + i, i);
+}
+
+/*
+ * Interrupt handler installed for IdtTIMER by apiconline();
+ * forwards to timerintr(). The second argument is unused.
+ */
+static void
+apictimer(Ureg* ureg, void*)
+{
+	timerintr(ureg, 0);
+}
+
+/*
+ * Bring the local APIC of the running processor online: record
+ * version and LVT count, set DFR/LDR, software-enable the APIC,
+ * calibrate the timer against the TSC and program the LVT entries.
+ * Returns 1 on success and 0 when there is no mapping, the APIC id
+ * is out of range or the entry was not prepared by apicinit().
+ */
+int
+apiconline(void)
+{
+	Apic *apic;
+	u64int tsc;
+	u32int dfr, ver;
+	int apicno, nlvt;
+
+	if(apicbase == nil)
+		return 0;
+	if((apicno = ((apicrget(Id)>>24) & 0xff)) >= Napic)
+		return 0;
+	apic = &xlapic[apicno];
+	if(!apic->useable || apic->addr != nil)
+		return 0;
+
+	/*
+	 * Things that can only be done when on the processor
+	 * owning the APIC, apicinit above runs on the bootstrap
+	 * processor.
+	 */
+	ver = apicrget(Ver);
+	nlvt = ((ver>>16) & 0xff) + 1;
+	if(nlvt > nelem(apic->lvt)){
+		/* NOTE(review): the message says "apicinit" although this is apiconline */
+		print("apicinit%d: nlvt %d > max (%d)\n",
+			apicno, nlvt, nelem(apic->lvt));
+		nlvt = nelem(apic->lvt);
+	}
+	apic->nlvt = nlvt;
+	apic->ver = ver & 0xff;
+
+	/*
+	 * These don't really matter in Physical mode;
+	 * set the defaults anyway.
+	 */
+	if(memcmp(m->cpuinfo, "AuthenticAMD", 12) == 0)
+		dfr = 0xf0000000;
+	else
+		dfr = 0xffffffff;
+	apicrput(Df, dfr);
+	apicrput(Ld, 0x00000000);
+
+	/*
+	 * Disable interrupts until ready by setting the Task Priority
+	 * register to 0xff.
+	 */
+	apicrput(Tp, 0xff);
+
+	/*
+	 * Software-enable the APIC in the Spurious Interrupt Vector
+	 * register and set the vector number. The vector number must have
+	 * bits 3-0 0x0f unless the Extended Spurious Vector Enable bit
+	 * is set in the HyperTransport Transaction Control register.
+	 */
+	apicrput(Siv, Swen|IdtSPURIOUS);
+
+	/*
+	 * Acknowledge any outstanding interrupts.
+	 */
+	apicrput(Eoi, 0);
+
+	/*
+	 * Use the TSC to determine the APIC timer frequency.
+	 * It might be possible to snarf this from a chipset
+	 * register instead.
+	 */
+	apicrput(Tdc, DivX1);
+	apicrput(Tlvt, Im);
+	/* count down from 0xffffffff for cpuhz/10 TSC ticks, i.e. a tenth of a second */
+	tsc = rdtsc() + m->cpuhz/10;
+	apicrput(Tic, 0xffffffff);
+
+	while(rdtsc() < tsc)
+		;
+
+	/* ticks consumed in a tenth of a second, times ten */
+	apic->hz = (0xffffffff-apicrget(Tcc))*10;
+	apic->max = apic->hz/HZ;
+	apic->min = apic->hz/(100*HZ);
+	/* NOTE(review): divides by apic->max, which is 0 if the timer did not count */
+	apic->div = ((m->cpuhz/apic->max)+HZ/2)/HZ;
+
+	if(m->machno == 0 || DBGFLG){
+		print("apic%d: hz %lld max %lld min %lld div %lld\n", apicno,
+			apic->hz, apic->max, apic->min, apic->div);
+	}
+
+	/*
+	 * Mask interrupts on Performance Counter overflow and
+	 * Thermal Sensor if implemented, and on Lintr0 (Legacy INTR),
+	 * and Lintr1 (Legacy NMI).
+	 * Clear any Error Status (write followed by read) and enable
+	 * the Error interrupt.
+	 */
+	switch(apic->nlvt){
+	case 6:
+		apicrput(Tslvt, Im);
+		/*FALLTHROUGH*/
+	case 5:
+		apicrput(Pclvt, Im);
+		/*FALLTHROUGH*/
+	default:
+		break;
+	}
+	apicrput(Lint1, apic->lvt[1]|Im|IdtLINT1);
+	apicrput(Lint0, apic->lvt[0]|Im|IdtLINT0);
+
+	apicrput(Es, 0);
+	apicrget(Es);
+	apicrput(Elvt, IdtERROR);
+
+	/*
+	 * Issue an INIT Level De-Assert to synchronise arbitration ID's.
+	 * (Necessary in this implementation? - not if Pentium 4 or Xeon
+	 * (APIC Version >= 0x14), or AMD).
+	apicrput(Ichi, 0);
+	apicrput(Iclo, DSallinc|Lassert|MTir);
+	while(apicrget(Iclo) & Ds)
+		;
+	 */
+
+	/*
+	 * Reload the timer to de-synchronise the processors,
+	 * then lower the task priority to allow interrupts to be
+	 * accepted by the APIC.
+	 */
+	microdelay((TK2MS(1)*1000/apmachno) * m->machno);
+
+	/* only machno 0 gets its timer started and its priority lowered here */
+	if(apic->machno == 0){
+		apicrput(Tic, apic->max);
+		intrenable(IdtTIMER, apictimer, 0, -1, "APIC timer");
+		apicrput(Tlvt, Periodic|IrqTIMER);
+	}
+
+	if(m->machno == 0)
+		apicrput(Tp, 0);
+
+	xlapicmachptr[apicno] = m;
+
+	return 1;
+}
+
+/*
+ * To start timers on TCs as part of the boot process.
+ * Acknowledges IdtTIMER, loads the initial count with apic->max
+ * and switches the timer LVT to periodic mode.
+ */
+void
+apictimerenab(void)
+{
+	Apic *apic;
+	int apicno;
+
+	/*
+	 * The 8-bit id can be 254 or 255 but xlapic only has Napic
+	 * entries; ignore such ids, as apiconline() does.
+	 */
+	apicno = (apicrget(Id)>>24) & 0xff;
+	if(apicno >= Napic)
+		return;
+	apic = &xlapic[apicno];
+
+	apiceoi(IdtTIMER);
+	apicrput(Tic, apic->max);
+	apicrput(Tlvt, Periodic|IrqTIMER);
+
+}
+
+/*
+ * Program the local APIC timer of the running processor.
+ * With next == 0 the initial count is apic->max. Otherwise next is
+ * an absolute fastticks() value: the distance from now is scaled
+ * by apic->div and clamped to the range apic->min .. apic->max.
+ */
+void
+apictimerset(uvlong next)
+{
+	Mpl pl;
+	Apic *apic;
+	vlong period;
+	int apicno;
+
+	/*
+	 * The 8-bit id can be 254 or 255 but xlapic only has Napic
+	 * entries; ignore such ids, as apiconline() does.
+	 */
+	apicno = (apicrget(Id)>>24) & 0xff;
+	if(apicno >= Napic)
+		return;
+	apic = &xlapic[apicno];
+
+	pl = splhi();
+	lock(&m->apictimerlock);
+
+	period = apic->max;
+	if(next != 0){
+		period = next - fastticks(nil);	/* fastticks is just rdtsc() */
+		period /= apic->div;
+
+		/* a deadline already in the past also ends up at min */
+		if(period < apic->min)
+			period = apic->min;
+		else if(period > apic->max - apic->min)
+			period = apic->max;
+	}
+	apicrput(Tic, period);
+
+	unlock(&m->apictimerlock);
+	splx(pl);
+}
+
+/*
+ * Start the processor behind apicno: assert and then de-assert an
+ * INIT (MTir), and send the Startup IPI twice. The low bits of the
+ * SIPI command carry pa/(4*KiB), the page number of the startup code.
+ */
+void
+apicsipi(int apicno, uintmem pa)
+{
+	int i;
+	u32int crhi, crlo;
+
+	/*
+	 * SIPI - Start-up IPI.
+	 * To do: checks on apic validity.
+	 */
+	crhi = apicno<<24;
+	apicrput(Ichi, crhi);
+	apicrput(Iclo, DSnone|TMlevel|Lassert|MTir);
+	microdelay(200);
+	apicrput(Iclo, DSnone|TMlevel|MTir);
+	millidelay(10);
+
+	crlo = DSnone|TMedge|MTsipi|((u32int)pa/(4*KiB));
+	for(i = 0; i < 2; i++){
+		apicrput(Ichi, crhi);
+		apicrput(Iclo, crlo);
+		microdelay(200);
+	}
+}
+
+/*
+ * Send a fixed-delivery IPI with vector IdtIPI to apicno and spin
+ * while the Ds bit reads back set in Iclo.
+ * NOTE(review): Ds is defined outside this view; presumably the
+ * delivery-status bit -- confirm.
+ */
+void
+apicipi(int apicno)
+{
+	apicrput(Ichi, apicno<<24);
+	apicrput(Iclo, DSnone|TMedge|Lassert|MTf|IdtIPI);
+	while(apicrget(Iclo) & Ds)
+		;
+}
+
+/*
+ * Write pri to the Task Priority register of the local APIC.
+ */
+void
+apicpri(int pri)
+{
+	apicrput(Tp, pri);
+}

+ 101 - 0
sys/src/9/k10/apic.h

@@ -0,0 +1,101 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * There are 2 flavours of APIC: Local APIC and IOAPIC.
+ * Each I/O APIC has a unique physical address;
+ * Local APICs are all at the same physical address, as they can only be
+ * accessed by the local CPU.  APIC ids are unique within an
+ * APIC type, so an IOAPIC and a Local APIC both with id 0 is ok.
+ */
+typedef	struct	Ioapic	Ioapic;
+typedef	struct	Lapic	Lapic;
+typedef	struct	Apic	Apic;
+
+struct Ioapic {
+	Lock;					/* IOAPIC: register access */
+	u32int*	addr;				/* IOAPIC: register base */
+	int	nrdt;				/* IOAPIC: size of RDT */
+	int	gsib;				/* IOAPIC: global RDT index */
+};
+
+struct Lapic {
+	int	machno;				/* APIC */
+
+	u32int	lvt[6];				/* NOTE(review): presumably Local Vector Table values -- confirm */
+	int	nlvt;				/* NOTE(review): presumably the number of lvt[] entries in use -- confirm */
+	int	ver;				/* NOTE(review): presumably the APIC version -- confirm */
+
+	vlong	hz;				/* APIC Timer frequency */
+	vlong	max;				/* timer count used when no deadline is given; upper clamp */
+	vlong	min;				/* lower clamp for the timer count */
+	vlong	div;				/* divisor turning a fastticks interval into a timer count */
+};
+
+struct Apic {
+	int	useable;			/* en */
+	Ioapic;					/* unnamed member: the IOAPIC fields */
+	Lapic;					/* unnamed member: the Local APIC fields */
+};
+
+enum {
+	Nbus		= 256,			/* NOTE(review): presumably the number of bus numbers -- confirm */
+	Napic		= 254,			/* xAPIC architectural limit; size of the xlapic and xioapic arrays */
+	Nrdt		= 64,			/* NOTE(review): presumably the largest RDT supported -- confirm */
+};
+
+/*
+ * Common bits for
+ *	IOAPIC Redirection Table Entry (RDT);
+ *	APIC Local Vector Table Entry (LVT);
+ *	APIC Interrupt Command Register (ICR).
+ * [10:8] Message Type
+ * [11] Destination Mode (RW)
+ * [12] Delivery Status (RO)
+ * [13] Interrupt Input Pin Polarity (RW)
+ * [14] Remote IRR (RO)
+ * [15] Trigger Mode (RW)
+ * [16] Interrupt Mask
+ * The constants below are field values already shifted into
+ * position, so they can be or'ed together directly.
+ */
+enum {
+	MTf		= 0x00000000,		/* Fixed */
+	MTlp		= 0x00000100,		/* Lowest Priority */
+	MTsmi		= 0x00000200,		/* SMI */
+	MTrr		= 0x00000300,		/* Remote Read */
+	MTnmi		= 0x00000400,		/* NMI */
+	MTir		= 0x00000500,		/* INIT/RESET */
+	MTsipi		= 0x00000600,		/* Startup IPI */
+	MTei		= 0x00000700,		/* ExtINT */
+
+	Pm		= 0x00000000,		/* Physical Mode */
+	Lm		= 0x00000800,		/* Logical Mode */
+
+	Ds		= 0x00001000,		/* Delivery Status */
+	IPhigh		= 0x00000000,		/* IIPP High */
+	IPlow		= 0x00002000,		/* IIPP Low */
+	Rirr		= 0x00004000,		/* Remote IRR */
+	TMedge		= 0x00000000,		/* Trigger Mode Edge */
+	TMlevel		= 0x00008000,		/* Trigger Mode Level */
+	Im		= 0x00010000,		/* Interrupt Mask */
+};
+
+extern	Apic	xlapic[Napic];			/* Local APICs */
+extern	Apic	xioapic[Napic];			/* I/O APICs */
+extern	Mach	*xlapicmachptr[Napic];		/* maintained, but unused */
+
+/*
+ * Little-endian 16, 32 and 64 bit loads from a byte pointer p.
+ * p is evaluated more than once, so it must have no side effects;
+ * it is parenthesised everywhere so any pointer expression is safe.
+ */
+#define	l16get(p)	(((p)[1]<<8)|(p)[0])
+#define	l32get(p)	(((u32int)l16get((p)+2)<<16)|l16get(p))
+#define	l64get(p)	(((u64int)l32get((p)+4)<<32)|l32get(p))
+
+extern void apicdump(void);
+extern void apictimerenab(void);		/* start the APIC timer on the calling core */
+extern void ioapicdump(void);
+
+extern int pcimsienable(Pcidev*, uvlong);	/* NOTE(review): presumably enables MSI for a PCI device -- confirm */
+extern int pcimsimask(Pcidev*, int);		/* NOTE(review): presumably masks or unmasks MSI -- confirm */

+ 115 - 0
sys/src/9/k10/arch.c

@@ -0,0 +1,115 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * EPISODE 12B
+ * How to recognise different types of trees from quite a long way away.
+ * NO. 1
+ * THE LARCH
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+/*
+ * Add one reference to r while holding its lock;
+ * the new count is returned.
+ */
+int
+incref(Ref *r)
+{
+	int n;
+
+	lock(r);
+	r->ref++;
+	n = r->ref;
+	unlock(r);
+
+	return n;
+}
+
+/*
+ * Drop one reference from r while holding its lock;
+ * the new count is returned.  A count below zero is a
+ * bug in the caller, whose pc is reported in the panic.
+ */
+int
+decref(Ref *r)
+{
+	int n;
+
+	lock(r);
+	r->ref--;
+	n = r->ref;
+	unlock(r);
+
+	if(n < 0)
+		panic("decref pc=%#p", getcallerpc(&r));
+	return n;
+}
+
+/*
+ *  Restore the mach dependent part of the process state:
+ *  cycle accounting and the fpu.  Nothing is done for
+ *  processes with kp set.
+ */
+void
+procrestore(Proc *p)
+{
+	uvlong now;
+
+	if(!p->kp){
+		cycles(&now);
+		p->pcycles -= now;
+
+		fpuprocrestore(p);
+	}
+}
+
+/*
+ *  Save the mach dependent part of the process state:
+ *  cycle accounting and the fpu.
+ *  NB: the caller should mmuflushtlb after procsave();
+ *  neither procsave nor procrestore touches the mmu.
+ */
+void
+procsave(Proc *p)
+{
+	uvlong now;
+
+	cycles(&now);
+	p->pcycles += now;
+
+	fpuprocsave(p);
+}
+
+static void
+linkproc(void)				/* first code run by a new kproc; kprocchild points sched.pc here */
+{
+	spllo();
+	up->kpfun(up->kparg);		/* function and argument recorded by kprocchild */
+	pexit("kproc dying", 0);	/* reached only if the kproc function returns */
+}
+
+/*
+ * Prepare p to start life as a kernel process running func(arg):
+ * record the function and its argument, and aim the saved
+ * scheduling label at linkproc() on a fresh kernel stack.
+ */
+void
+kprocchild(Proc* p, void (*func)(void*), void* arg)
+{
+	p->kpfun = func;
+	p->kparg = arg;
+
+	/*
+	 * gotolabel() needs a word on the stack in
+	 * which to place the return PC used to jump
+	 * to linkproc().
+	 */
+	p->sched.sp = PTR2UINT(p->kstack+KSTACK-BY2SE);
+	p->sched.sp = STACKALIGN(p->sched.sp);
+	p->sched.pc = PTR2UINT(linkproc);
+}
+
+/*
+ *  put the processor in the halt state if we've no processes to run.
+ *  an interrupt will get us going again.
+ *  The boot TC in nix can't halt, because it must stay alert in
+ *  case an AC makes a handler process ready.
+ *  We should probably use mwait in that case.
+ */
+void
+idlehands(void)
+{
+	/* the constant 0 keeps halting switched off on every core for now */
+	if(0 && m->machno != 0)
+		halt();
+}

+ 380 - 0
sys/src/9/k10/archk10.c

@@ -0,0 +1,380 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Cache the CPUID results that are needed repeatedly and record
+ * how many standard and extended functions the processor has.
+ * Returns 0 if CPUID function 0 reports no functions, otherwise 1.
+ */
+static int
+cpuidinit(void)
+{
+	u32int maxext, scratch[4];
+
+	/*
+	 * Standard CPUID functions.
+	 * Functions 0 and 1 will be needed multiple times
+	 * so cache the info now.
+	 */
+	m->ncpuinfos = cpuid(0, 0, m->cpuinfo[0]);
+	if(m->ncpuinfos == 0)
+		return 0;
+	m->ncpuinfos++;
+
+	if(memcmp(&m->cpuinfo[0][1], "GenuntelineI", 12) == 0)
+		m->isintelcpu = 1;
+	cpuid(1, 0, m->cpuinfo[1]);
+
+	/*
+	 * Extended CPUID functions.
+	 */
+	maxext = cpuid(0x80000000, 0, scratch);
+	if(maxext >= 0x80000000)
+		m->ncpuinfoe = (maxext & ~0x80000000) + 1;
+
+	/* is monitor supported? */
+	if(m->cpuinfo[1][2] & 8){
+		cpuid(5, 0, m->cpuinfo[2]);
+		mwait = k10mwait;
+	}
+
+	return 1;
+}
+
+/*
+ * Run CPUID function eax (sub-function ecx) into info, provided the
+ * processor implements that function.  Returns 1 if info was filled
+ * in, 0 if the function is out of range or CPUID is unusable.
+ */
+static int
+cpuidinfo(u32int eax, u32int ecx, u32int info[4])
+{
+	int extended;
+
+	if(m->ncpuinfos == 0 && cpuidinit() == 0)
+		return 0;
+
+	extended = (eax & 0x80000000) != 0;
+	if(!extended && eax >= m->ncpuinfos)
+		return 0;
+	if(extended && eax >= (0x80000000|m->ncpuinfoe))
+		return 0;
+
+	cpuid(eax, ecx, info);
+
+	return 1;
+}
+
+static vlong
+cpuidhz(u32int info[2][4])	/* info[0], info[1]: CPUID functions 0 and 1; returns Hz, or 0 if the processor is not recognised */
+{
+	int f, r;
+	vlong hz;
+	u64int msr;
+
+	if(memcmp(&info[0][1], "GenuntelineI", 12) == 0){	/* vendor string, in the order the registers are stored */
+		switch(info[1][0] & 0x0fff3ff0){
+		default:
+			return 0;
+		case 0x00000f30:		/* Xeon (MP), Pentium [4D] */
+		case 0x00000f40:		/* Xeon (MP), Pentium [4D] */
+		case 0x00000f60:		/* Xeon 7100, 5000 or above */
+			msr = rdmsr(0x2c);
+			r = (msr>>16) & 0x07;
+			switch(r){
+			default:
+				return 0;
+			case 0:
+				hz = 266666666666ll;
+				break;
+			case 1:
+				hz = 133333333333ll;
+				break;
+			case 2:
+				hz = 200000000000ll;
+				break;
+			case 3:
+				hz = 166666666666ll;
+				break;
+			case 4:
+				hz = 333333333333ll;
+				break;
+			}
+
+			/*
+			 * Hz is *1000 at this point.
+			 * Do the scaling then round it.
+			 * The manual is conflicting about
+			 * the size of the msr field.
+			 */
+			hz = (((hz*(msr>>24))/100)+5)/10;
+			break;
+		case 0x00000690:		/* Pentium M, Celeron M */
+		case 0x000006d0:		/* Pentium M, Celeron M */
+			hz = ((rdmsr(0x2a)>>22) & 0x1f)*100 * 1000000ll;
+			break;
+		case 0x000006e0:		/* Core Duo */
+		case 0x000006f0:		/* Core 2 Duo/Quad/Extreme */
+		case 0x00010670:		/* Core 2 Extreme */
+		case 0x000006a0:		/* i7 paurea... */
+			/*
+			 * Get the FSB frequency.
+			 * If processor has Enhanced Intel Speedstep Technology
+			 * then non-integer bus frequency ratios are possible.
+			 */
+			if(info[1][2] & 0x00000080){
+				msr = rdmsr(0x198);
+				r = (msr>>40) & 0x1f;
+			}
+			else{
+				msr = 0;
+				r = rdmsr(0x2a) & 0x1f;
+			}
+			f = rdmsr(0xcd) & 0x07;
+			switch(f){
+			default:
+				return 0;
+			case 5:
+				hz = 100000000000ll;
+				break;
+			case 1:
+				hz = 133333333333ll;
+				break;
+			case 3:
+				hz = 166666666666ll;
+				break;
+			case 2:
+				hz = 200000000000ll;
+				break;
+			case 0:
+				hz = 266666666666ll;
+				break;
+			case 4:
+				hz = 333333333333ll;
+				break;
+			case 6:
+				hz = 400000000000ll;
+				break;
+			}
+
+			/*
+			 * Hz is *1000 at this point.
+			 * Do the scaling then round it.
+			 */
+			if(msr & 0x0000400000000000ll)	/* bit 46 of msr 0x198 adds half a bus multiple */
+				hz = hz*r + hz/2;
+			else
+				hz = hz*r;
+			hz = ((hz/100)+5)/10;
+			break;
+		}
+		DBG("cpuidhz: 0x2a: %#llux hz %lld\n", rdmsr(0x2a), hz);
+	}
+	else if(memcmp(&info[0][1], "AuthcAMDenti", 12) == 0){	/* vendor string, in the order the registers are stored */
+		switch(info[1][0] & 0x0fff0ff0){
+		default:
+			return 0;
+		case 0x00000f50:		/* K8 */
+			msr = rdmsr(0xc0010042);
+			if(msr == 0)
+				return 0;
+			hz = (800 + 200*((msr>>1) & 0x1f)) * 1000000ll;
+			break;
+		case 0x00100f90:		/* K10 */
+		case 0x00000620:		/* QEMU64 */
+			msr = rdmsr(0xc0010064);
+			r = (msr>>6) & 0x07;
+			hz = (((msr & 0x3f)+0x10)*100000000ll)/(1<<r);
+			break;
+		}
+		DBG("cpuidhz: %#llux hz %lld\n", msr, hz);
+	}
+	else
+		return 0;
+
+	return hz;
+}
+
+/*
+ * When debugging is on, print the four result registers of every
+ * standard and then every extended CPUID function.
+ */
+void
+cpuiddump(void)
+{
+	int i;
+	u32int fn, regs[4];
+
+	if(!DBGFLG || (m->ncpuinfos == 0 && cpuidinit() == 0))
+		return;
+
+	/* standard functions */
+	i = 0;
+	while(i < m->ncpuinfos){
+		cpuid(i, 0, regs);
+		DBG("eax = %#8.8ux: %8.8ux %8.8ux %8.8ux %8.8ux\n",
+			i, regs[0], regs[1], regs[2], regs[3]);
+		i++;
+	}
+
+	/* extended functions */
+	i = 0;
+	while(i < m->ncpuinfoe){
+		fn = 0x80000000|i;
+		cpuid(fn, 0, regs);
+		DBG("eax = %#8.8ux: %8.8ux %8.8ux %8.8ux %8.8ux\n",
+			fn, regs[0], regs[1], regs[2], regs[3]);
+		i++;
+	}
+}
+
+/*
+ * Return the processor frequency in Hz.
+ * cpuidhz() is tried first; only on mach 0 is i8254hz() used
+ * when cpuidhz() gives 0.  Returns 0 if CPUID is unusable.
+ */
+vlong
+archhz(void)
+{
+	vlong hz;
+	u32int info[2][4];
+
+	if(cpuidinfo(0, 0, info[0]) == 0)
+		return 0;
+	if(cpuidinfo(1, 0, info[1]) == 0)
+		return 0;
+
+	hz = cpuidhz(info);
+	if(hz == 0 && m->machno == 0)
+		return i8254hz(info);
+
+	return hz;
+}
+
+/*
+ * Fill in the page-size tables of m (pgszlg2, pgszmask, pgsz, npgsz)
+ * from what CPUID reports, and return the number of page sizes.
+ */
+int
+archmmu(void)
+{
+	u32int ext[4];
+
+	/*
+	 * Should the check for m->machno != 0 be here
+	 * or in the caller (mmuinit)?
+	 *
+	 * To do here:
+	 * check and enable Pse;
+	 * Pge; Nxe.
+	 */
+
+	/*
+	 * 4*KiB pages are always there, but check that
+	 * the kernel was configured to match.
+	 */
+	assert(PGSZ == 4*KiB);
+
+	m->pgszlg2[0] = 12;
+	m->pgszmask[0] = (1<<12)-1;
+	m->pgsz[0] = 1<<12;
+	m->npgsz = 1;
+	if(m->ncpuinfos == 0 && cpuidinit() == 0)
+		return 1;
+
+	/*
+	 * 2*MiB pages need the Pse bit in function 1 DX;
+	 * without it only 4*KiB is available.
+	 */
+	if((m->cpuinfo[1][3] & 0x00000008) == 0)
+		return 1;
+	m->pgszlg2[1] = 21;
+	m->pgszmask[1] = (1<<21)-1;
+	m->pgsz[1] = 1<<21;
+	m->npgsz = 2;
+
+	/*
+	 * 1*GiB pages need the Page1GB bit in function 0x80000001 DX.
+	 */
+	if(!cpuidinfo(0x80000001, 0, ext))
+		return m->npgsz;
+	if(ext[3] & 0x04000000){
+		m->pgszlg2[2] = 30;
+		m->pgszmask[2] = (1<<30)-1;
+		m->pgsz[2] = 1<<30;
+		m->npgsz = 3;
+	}
+
+	return m->npgsz;
+}
+
+/*
+ * %P: a uintmem physical address.
+ * With the # flag it is 16 hex digits, otherwise unsigned decimal.
+ */
+static int
+fmtP(Fmt* f)
+{
+	uintmem pa;
+	char *how;
+
+	pa = va_arg(f->args, uintmem);
+
+	how = "%llud";
+	if(f->flags & FmtSharp)
+		how = "%#16.16llux";
+
+	return fmtprint(f, how, pa);
+}
+
+/*
+ * %L: an Mpl, printed as 16 hex digits.
+ */
+static int
+fmtL(Fmt* f)
+{
+	Mpl level;
+
+	level = va_arg(f->args, Mpl);
+	return fmtprint(f, "%#16.16llux", level);
+}
+
+/*
+ * %R: a 64-bit register value, printed as 16 hex digits.
+ */
+static int
+fmtR(Fmt* f)
+{
+	u64int reg;
+
+	reg = va_arg(f->args, u64int);
+	return fmtprint(f, "%#16.16llux", reg);
+}
+
+/*
+ * %W: a virtual address, printed whole and then split into
+ * PTLX(va, 3) down to PTLX(va, 0) followed by the offset
+ * within the page.
+ */
+static int
+fmtW(Fmt *f)
+{
+	u64int va, off;
+
+	va = va_arg(f->args, u64int);
+	off = va & ((1<<PGSHFT)-1);
+
+	return fmtprint(f, "%#ullx=0x[%ullx][%ullx][%ullx][%ullx][%ullx]", va,
+		PTLX(va, 3), PTLX(va, 2), PTLX(va, 1), PTLX(va, 0),
+		off);
+}
+
+void
+archfmtinstall(void)
+{
+	/*
+	 * Architecture-specific formatting. Not as neat as they
+	 * could be (e.g. there's no defined type for a 'register'):
+	 *	L - Mpl, mach priority level
+	 *	P - uintmem, physical address
+	 *	R - register
+	 *	W - u64int, virtual address
+	 * With a little effort these routines could be written
+	 * in a fairly architecturally-independent manner, relying
+	 * on the compiler to optimise-away impossible conditions,
+	 * and/or by exploiting the innards of the fmt library.
+	 */
+	fmtinstall('P', fmtP);
+
+	fmtinstall('L', fmtL);
+	fmtinstall('R', fmtR);
+	fmtinstall('W', fmtW);
+}
+
+void
+archidle(void)		/* idle this processor */
+{
+	halt();		/* presumably returns once an interrupt arrives -- confirm */
+}
+
+/*
+ * Spin until at least microsecs microseconds of TSC time have passed;
+ * m->cpumhz is used as the number of TSC ticks per microsecond.
+ * The tick count is computed in 64 bits, as millidelay does, so a
+ * long delay cannot overflow narrower arithmetic.
+ */
+void
+microdelay(int microsecs)
+{
+	u64int r, t;
+
+	r = rdtsc();
+	for(t = r + (u64int)m->cpumhz*microsecs; r < t; r = rdtsc())
+		;
+}
+
+/*
+ * Spin until at least millisecs milliseconds of TSC time have passed;
+ * m->cpumhz is used as the number of TSC ticks per microsecond.
+ */
+void
+millidelay(int millisecs)
+{
+	u64int now, deadline;
+
+	now = rdtsc();
+	deadline = now + m->cpumhz*1000ull*millisecs;
+	while(now < deadline)
+		now = rdtsc();
+}

+ 256 - 0
sys/src/9/k10/archk8.c

@@ -0,0 +1,256 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Cache the results of the standard and extended CPUID functions
+ * in m->cpuinfo: standard functions first, then the extended ones.
+ * m->ncpuinfos and m->ncpuinfoe count the entries of each kind that
+ * were actually cached, so neither ever indexes past m->cpuinfo.
+ * Returns 0 if CPUID function 0 reports no functions, otherwise 1.
+ */
+static int
+cpuidinit(void)
+{
+	int i, n;
+	u32int eax, scratch[4];
+
+	if((m->ncpuinfos = cpuid(0, m->cpuinfo[0])) == 0)
+		return 0;
+
+	/* never claim more standard functions than there is room to cache */
+	n = ++m->ncpuinfos;
+	if(n > nelem(m->cpuinfo)){
+		n = nelem(m->cpuinfo);
+		m->ncpuinfos = n;
+	}
+
+	/* probe for extended functions without touching the cache */
+	eax = cpuid(0x80000000, scratch);
+	if(eax >= 0x80000000){
+		eax &= ~0x80000000;
+		n += ++eax;
+		if(n > nelem(m->cpuinfo))
+			n = nelem(m->cpuinfo);
+		m->ncpuinfoe = n - m->ncpuinfos;
+	}
+
+	for(i = 1; i < n; i++){
+		eax = i;
+		if(i >= m->ncpuinfos)
+			eax = 0x80000000|(i - m->ncpuinfos);
+		cpuid(eax, m->cpuinfo[i]);
+	}
+
+	return 1;
+}
+
+/*
+ * Return the cached result of CPUID function eax, or nil if that
+ * function is not cached or CPUID is unusable.  Extended functions
+ * are stored after the standard ones in m->cpuinfo.
+ */
+static u32int*
+cpuidinfo(u32int eax)
+{
+	u32int idx;
+
+	if(m->ncpuinfos == 0 && cpuidinit() == 0)
+		return nil;
+
+	if(eax & 0x80000000){
+		idx = eax & ~0x80000000;
+		if(idx >= m->ncpuinfoe)
+			return nil;
+		idx += m->ncpuinfos;
+		return m->cpuinfo[idx];
+	}
+
+	if(eax >= m->ncpuinfos)
+		return nil;
+	return m->cpuinfo[eax];
+}
+
+static vlong
+cpuidhz(u32int* info[2])	/* info[0], info[1]: CPUID functions 0 and 1; returns Hz, or 0 if the processor is not recognised */
+{
+	int f, r;
+	vlong hz;
+	u64int msr;
+
+	if(memcmp(&info[0][1], "GenuntelineI", 12) == 0){	/* vendor string, in the order the registers are stored */
+		switch(info[1][0] & 0x0fff3ff0){
+		default:
+			return 0;
+		case 0x00000f30:		/* Xeon (MP), Pentium [4D] */
+		case 0x00000f40:		/* Xeon (MP), Pentium [4D] */
+		case 0x00000f60:		/* Xeon 7100, 5000 or above */
+			msr = rdmsr(0x2c);
+			r = (msr>>16) & 0x07;
+			switch(r){
+			default:
+				return 0;
+			case 0:
+				hz = 266666666666ll;
+				break;
+			case 1:
+				hz = 133333333333ll;
+				break;
+			case 2:
+				hz = 200000000000ll;
+				break;
+			case 3:
+				hz = 166666666666ll;
+				break;
+			case 4:
+				hz = 333333333333ll;
+				break;
+			}
+			/*
+			 * Hz is *1000 at this point.
+			 * Do the scaling then round it.
+			 * The manual is conflicting about
+			 * the size of the msr field.
+			 */
+			hz = (((hz*(msr>>24))/100)+5)/10;
+			break;
+		case 0x00000690:		/* Pentium M, Celeron M */
+		case 0x000006d0:		/* Pentium M, Celeron M */
+			hz = ((rdmsr(0x2a)>>22) & 0x1f)*100 * 1000000ll;
+			break;
+		case 0x000006e0:		/* Core Duo */
+		case 0x000006f0:		/* Core 2 Duo/Quad/Extreme */
+		case 0x00010670:		/* Core 2 Extreme */
+		case 0x000006a0:		/* i7 paurea... */
+			/*
+			 * Get the FSB frequency.
+			 * If processor has Enhanced Intel Speedstep Technology
+			 * then non-integer bus frequency ratios are possible.
+			 */
+			if(info[1][2] & 0x00000080){
+				msr = rdmsr(0x198);
+				r = (msr>>40) & 0x1f;
+			}
+			else{
+				msr = 0;
+				r = rdmsr(0x2a) & 0x1f;
+			}
+			f = rdmsr(0xcd) & 0x07;
+			switch(f){
+			default:
+				return 0;
+			case 5:
+				hz = 100000000000ll;
+				break;
+			case 1:
+				hz = 133333333333ll;
+				break;
+			case 3:
+				hz = 166666666666ll;
+				break;
+			case 2:
+				hz = 200000000000ll;
+				break;
+			case 0:
+				hz = 266666666666ll;
+				break;
+			case 4:
+				hz = 333333333333ll;
+				break;
+			case 6:
+				hz = 400000000000ll;
+				break;
+			}
+
+			/*
+			 * Hz is *1000 at this point.
+			 * Do the scaling then round it.
+			 */
+			if(msr & 0x0000400000000000ll)	/* bit 46 of msr 0x198 adds half a bus multiple */
+				hz = hz*r + hz/2;
+			else
+				hz = hz*r;
+			hz = ((hz/100)+5)/10;
+			break;
+		}
+		DBG("cpuidhz: 0x2a: %#llux hz %lld\n", rdmsr(0x2a), hz);
+	}
+	else if(memcmp(&info[0][1], "AuthcAMDenti", 12) == 0){	/* vendor string, in the order the registers are stored */
+		switch(info[1][0] & 0x0fff0ff0){
+		default:
+			return 0;
+		case 0x00000f50:		/* K8 */
+			msr = rdmsr(0xc0010042);
+			if(msr == 0)
+				return 0;
+			hz = (800 + 200*((msr>>1) & 0x1f)) * 1000000ll;
+			break;
+		case 0x00100f90:		/* K10 */
+		case 0x00000620:		/* QEMU64 */
+			msr = rdmsr(0xc0010064);
+			r = (msr>>6) & 0x07;
+			hz = (((msr & 0x3f)+0x10)*100000000ll)/(1<<r);
+			break;
+		}
+		DBG("cpuidhz: %#llux hz %lld\n", msr, hz);
+	}
+	else
+		return 0;
+
+	return hz;
+}
+
+/*
+ * When debugging is on, print every cached CPUID result:
+ * the standard functions, then the extended ones.
+ */
+void
+cpuiddump(void)
+{
+	int i, total;
+	u32int *regs;
+
+	if(!DBGFLG || (m->ncpuinfos == 0 && cpuidinit() == 0))
+		return;
+
+	total = m->ncpuinfos+m->ncpuinfoe;
+	i = 0;
+	while(i < total){
+		regs = m->cpuinfo[i];
+		DBG("eax = %#8.8ux: %8.8ux %8.8ux %8.8ux %8.8ux\n",
+			(i >= m->ncpuinfos ? 0x80000000|(i - m->ncpuinfos): i),
+			regs[0], regs[1],
+			regs[2], regs[3]);
+		i++;
+	}
+}
+
+/*
+ * Return the processor frequency in Hz: cpuidhz() if it knows the
+ * processor, otherwise i8254hz().  Returns 0 if CPUID functions
+ * 0 and 1 are not available.
+ */
+vlong
+archhz(void)
+{
+	vlong hz;
+	u32int *info[2];
+
+	info[0] = cpuidinfo(0);
+	if(info[0] == nil)
+		return 0;
+	info[1] = cpuidinfo(1);
+	if(info[1] == nil)
+		return 0;
+
+	hz = cpuidhz(info);
+	if(hz == 0)
+		hz = i8254hz(info);
+
+	return hz;
+}
+
+void
+archidle(void)		/* idle this processor */
+{
+	halt();		/* presumably returns once an interrupt arrives -- confirm */
+}
+
+/*
+ * Spin until at least microsecs microseconds of TSC time have passed;
+ * m->cpumhz is used as the number of TSC ticks per microsecond.
+ * The tick count is computed in 64 bits, as millidelay does, so a
+ * long delay cannot overflow narrower arithmetic.
+ */
+void
+microdelay(int microsecs)
+{
+	u64int r, t;
+
+	r = rdtsc();
+	for(t = r + (u64int)m->cpumhz*microsecs; r < t; r = rdtsc())
+		;
+}
+
+/*
+ * Spin until at least millisecs milliseconds of TSC time have passed;
+ * m->cpumhz is used as the number of TSC ticks per microsecond.
+ */
+void
+millidelay(int millisecs)
+{
+	u64int now, deadline;
+
+	now = rdtsc();
+	deadline = now + m->cpumhz*1000ull*millisecs;
+	while(now < deadline)
+		now = rdtsc();
+}

+ 438 - 0
sys/src/9/k10/asm.c

@@ -0,0 +1,438 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * To do:
+ *	find a purpose for this...
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "amd64.h"
+
+/*
+ * Address Space Map.
+ * Low duty cycle.
+ * Each Asm describes one range of physical address space;
+ * the entries are kept on a list sorted by address.
+ */
+typedef struct Asm Asm;
+struct Asm {
+	uintmem	addr;		/* start of the range */
+	uintmem	size;		/* length of the range in bytes */
+	int	type;		/* one of the Asm* types */
+	int	location;	/* not used in this file */
+	Asm*	next;		/* next entry, or nil */
+};
+
+enum {
+	AsmNONE		= 0,	/* not yet given a type; the initial map is one AsmNONE entry */
+	AsmMEMORY	= 1,	/* memory: the type asmmeminit maps and asmumeminit hands to physinit */
+	AsmRESERVED	= 2,	/* NOTE(review): types 2-4 presumably follow the firmware memory map types -- confirm */
+	AsmACPIRECLAIM	= 3,
+	AsmACPINVS	= 4,
+
+	AsmDEV		= 5,
+};
+
+static Lock asmlock;		/* protects the map and the free list */
+
+/*
+ * Storage for map entries.  The map starts out as a single AsmNONE
+ * entry at address 0 whose size is all ones.
+ * Initialiser order: addr, size, type, location, next.
+ */
+static Asm asmarray[64] = {
+	{ 0, ~0, AsmNONE, 0, nil, },
+};
+static int asmindex = 1;		/* next never-used slot in asmarray */
+static Asm* asmlist = &asmarray[0];	/* the map, sorted by address */
+static Asm* asmfreelist;		/* entries available for reuse */
+
+/*
+ * Print the map: start, end, type and size of each entry.
+ */
+/*static*/ void
+asmdump(void)
+{
+	Asm *ap;
+
+	print("asm: index %d:\n", asmindex);
+	ap = asmlist;
+	while(ap != nil){
+		print(" %#P %#P %d (%P)\n",
+			ap->addr, ap->addr+ap->size,
+			ap->type, ap->size);
+		ap = ap->next;
+	}
+}
+
+/*
+ * Get a map entry, from the free list if it has one and otherwise
+ * from the unused part of asmarray, and fill in addr, size and type.
+ * Returns nil when neither source has an entry left.
+ */
+static Asm*
+asmnew(uintmem addr, uintmem size, int type)
+{
+	Asm *ap;
+
+	ap = asmfreelist;
+	if(ap != nil){
+		asmfreelist = ap->next;
+		ap->next = nil;
+	}
+	else if(asmindex < nelem(asmarray))
+		ap = &asmarray[asmindex++];
+	else
+		return nil;
+
+	ap->addr = addr;
+	ap->size = size;
+	ap->type = type;
+
+	return ap;
+}
+
+/*
+ * Put the range [addr, addr+size) into the map with the given type,
+ * merging it with a neighbouring entry of the same type when the two
+ * are contiguous.
+ * Returns 0 on success (a size of 0 is trivially successful) and -1
+ * if the range overlaps an existing entry or no map entry is left
+ * to describe it.
+ */
+int
+asmfree(uintmem addr, uintmem size, int type)
+{
+	Asm *np, *pp, **ppp;
+
+	DBG("asmfree: %#P@%#P, type %d\n", size, addr, type);
+	if(size == 0)
+		return 0;
+
+	lock(&asmlock);
+
+	/*
+	 * Find either a map entry with an address greater
+	 * than that being returned, or the end of the map.
+	 * pp is the entry before the insertion point (if any),
+	 * np the one after, ppp the link to rewrite on insertion.
+	 */
+	pp = nil;
+	ppp = &asmlist;
+	for(np = *ppp; np != nil && np->addr <= addr; np = np->next){
+		pp = np;
+		ppp = &np->next;
+	}
+
+	if((pp != nil && pp->addr+pp->size > addr)
+	|| (np != nil && addr+size > np->addr)){
+		unlock(&asmlock);
+		DBG("asmfree: overlap %#P@%#P, type %d\n", size, addr, type);
+		return -1;
+	}
+
+	/* extend the previous entry, swallowing the next if it now touches */
+	if(pp != nil && pp->type == type && pp->addr+pp->size == addr){
+		pp->size += size;
+		if(np != nil && np->type == type && addr+size == np->addr){
+			pp->size += np->size;
+			pp->next = np->next;
+
+			np->next = asmfreelist;
+			asmfreelist = np;
+		}
+
+		unlock(&asmlock);
+		return 0;
+	}
+
+	/* extend the next entry downwards */
+	if(np != nil && np->type == type && addr+size == np->addr){
+		np->addr -= size;
+		np->size += size;
+
+		unlock(&asmlock);
+		return 0;
+	}
+
+	/* no neighbour to merge with: insert a new entry */
+	if((pp = asmnew(addr, size, type)) == nil){
+		unlock(&asmlock);
+		DBG("asmfree: losing %#P@%#P, type %d\n", size, addr, type);
+		return -1;
+	}
+	*ppp = pp;
+	pp->next = np;
+
+	unlock(&asmlock);
+
+	return 0;
+}
+
+/*
+ * Take size bytes out of a map entry of the given type.
+ * If addr is non-zero the range must start exactly there; otherwise
+ * the first entry that fits is used.  If align is greater than 0 the
+ * start is rounded up to a multiple of it.  Any space skipped at the
+ * front of the entry is given back to the map with asmfree.
+ * Returns the start of the allocated range, or 0 if nothing suitable
+ * was found (so a successful allocation at address 0 cannot be told
+ * apart from failure).
+ */
+uintmem
+asmalloc(uintmem addr, uintmem size, int type, int align)
+{
+	uintmem a, o;
+	Asm *asm, *pp;
+
+	DBG("asmalloc: %#P@%#P, type %d\n", size, addr, type);
+	lock(&asmlock);
+	for(pp = nil, asm = asmlist; asm != nil; pp = asm, asm = asm->next){
+		if(asm->type != type)
+			continue;
+		a = asm->addr;
+
+		if(addr != 0){
+			/*
+			 * A specific address range has been given:
+			 *   if the current map entry is greater then
+			 *   the address is not in the map;
+			 *   if the current map entry does not overlap
+			 *   the beginning of the requested range then
+			 *   continue on to the next map entry;
+			 *   if the current map entry does not entirely
+			 *   contain the requested range then the range
+			 *   is not in the map.
+			 * The comparisons are strange to prevent
+			 * overflow.
+			 */
+			if(a > addr)
+				break;
+			if(asm->size < addr - a)
+				continue;
+			if(addr - a > asm->size - size)
+				break;
+			a = addr;
+		}
+
+		if(align > 0)
+			a = ((a+align-1)/align)*align;
+		if(asm->addr+asm->size-a < size)
+			continue;
+
+		/* the entry keeps only what lies above the allocation */
+		o = asm->addr;
+		asm->addr = a+size;
+		asm->size -= a-o+size;
+		if(asm->size == 0){
+			/*
+			 * Unlink the emptied entry before it goes on the
+			 * free list, including when it heads the map.
+			 */
+			if(pp != nil)
+				pp->next = asm->next;
+			else
+				asmlist = asm->next;
+			asm->next = asmfreelist;
+			asmfreelist = asm;
+		}
+
+		unlock(&asmlock);
+		/* give back whatever was skipped in front of the allocation */
+		if(o != a)
+			asmfree(o, a-o, type);
+		return a;
+	}
+	unlock(&asmlock);
+
+	return 0;
+}
+
+/*
+ * Give the range [addr, addr+size) the given type: carve it out
+ * of AsmNONE space and put it back with the new type.  If that
+ * fails the range is returned as AsmNONE again.
+ */
+static void
+asminsert(uintmem addr, uintmem size, int type)
+{
+	if(type == AsmNONE)
+		return;
+	if(asmalloc(addr, size, AsmNONE, 0) == 0)
+		return;
+	if(asmfree(addr, size, type) != 0)
+		asmfree(addr, size, 0);
+}
+
+void
+asminit(void)
+{
+	sys->pmstart = ROUNDUP(PADDR(end), PGSZ);	/* first page boundary at or above the physical address of end */
+	sys->pmend = sys->pmstart;			/* raised later by asmmapinit as memory is found */
+	asmalloc(0, sys->pmstart, AsmNONE, 0);		/* take everything below pmstart out of the AsmNONE entry */
+}
+
+/*
+ * Notes:
+ * asmmapinit and asmmodinit called from multiboot;
+ * subject to change; the numerology here is probably suspect.
+ * Multiboot defines the alignment of modules as 4096.
+ *
+ * asmmapinit records one range of the memory map.  Ranges that
+ * are not AsmMEMORY go straight into the map; AsmMEMORY ranges are
+ * first trimmed to start at or above sys->pmstart.
+ */
+void
+asmmapinit(uintmem addr, uintmem size, int type)
+{
+	if(type != AsmMEMORY){
+		asminsert(addr, size, type);
+		return;
+	}
+
+	/*
+	 * Adjust things for the peculiarities of this
+	 * architecture.
+	 * Sys->pmend is the largest physical memory address found,
+	 * there may be gaps between it and sys->pmstart, the range
+	 * and how much of it is occupied, might need to be known
+	 * for setting up allocators later.
+	 */
+	if(addr < 1*MiB || addr+size < sys->pmstart)
+		return;
+	if(addr < sys->pmstart){
+		size -= sys->pmstart - addr;
+		addr = sys->pmstart;
+	}
+	asminsert(addr, size, type);
+	sys->pmoccupied += size;
+	if(addr+size > sys->pmend)
+		sys->pmend = addr+size;
+}
+
+/*
+ * Account for a module occupying [start, end): if it begins at
+ * or above sys->pmstart, take the space up to its end (rounded up
+ * to 4096) out of the map and move sys->pmstart past it.
+ */
+void
+asmmodinit(u32int start, u32int end, char* s)
+{
+	DBG("asmmodinit: %#ux -> %#ux: <%s> %#ux\n",
+		start, end, s, ROUNDUP(end, 4096));
+
+	if(start < sys->pmstart)
+		return;
+
+	end = ROUNDUP(end, 4096);
+	if(end <= sys->pmstart)
+		return;
+
+	asmalloc(sys->pmstart, end-sys->pmstart, AsmNONE, 0);
+	sys->pmstart = end;
+}
+
+static int npg[4];	/* pages mapped by asmmeminit, counted per page-size index */
+
+/*
+ * Hand out size zeroed bytes from the unused part of the kernel
+ * address range that is already mapped (sys->vmunused up to
+ * sys->vmunmapped).  Asserts if there is not enough left.
+ */
+void*
+asmbootalloc(usize size)
+{
+	uintptr start;
+	void *p;
+
+	assert(sys->vmunused+size <= sys->vmunmapped);
+
+	start = sys->vmunused;
+	sys->vmunused += size;
+
+	p = UINT2PTR(start);
+	memset(p, 0, size);
+	return p;
+}
+
+/*
+ * Allocator passed to mmuwalk: returns the physical address of
+ * a PTSZ-aligned, PTSZ-sized piece of the unused mapped kernel
+ * range, or ~0 if mmuphysaddr cannot translate it.
+ */
+static PTE
+asmwalkalloc(usize size)
+{
+	uintmem pa;
+
+	assert(size == PTSZ && sys->vmunused+size <= sys->vmunmapped);
+
+	/* skip ahead to the next PTSZ boundary if necessary */
+	if(!ALIGNED(sys->vmunused, PTSZ)){
+		DBG("asmwalkalloc: %ulld wasted\n",
+			ROUNDUP(sys->vmunused, PTSZ) - sys->vmunused);
+		sys->vmunused = ROUNDUP(sys->vmunused, PTSZ);
+	}
+
+	pa = mmuphysaddr(sys->vmunused);
+	if(pa != ~0)
+		sys->vmunused += size;
+
+	return pa;
+}
+
+// still needed so iallocb gets initialised correctly. needs to go.
+#define ConfCrap
+
+void
+asmmeminit(void)	/* map the rest of the kernel range, map every AsmMEMORY entry at KSEG2, fill in conf */
+{
+	int i, l;
+	Asm* asm;
+	PTE *pte, *pml4;
+	uintptr va;
+	uintmem hi, lo, mem, nextmem, pa;
+#ifdef ConfCrap
+	int cx;
+#endif /* ConfCrap */
+
+	assert(!((sys->vmunmapped|sys->vmend) & m->pgszmask[1]));
+
+	if((pa = mmuphysaddr(sys->vmunused)) == ~0)
+		panic("asmmeminit 1");
+	pa += sys->vmunmapped - sys->vmunused;	/* physical address backing vmunmapped */
+	mem = asmalloc(pa, sys->vmend - sys->vmunmapped, 1, 0);	/* type 1 is AsmMEMORY */
+	if(mem != pa)
+		panic("asmmeminit 2");
+	DBG("pa %#llux mem %#llux\n", pa, mem);
+
+	/* assume already 2MiB aligned*/
+	assert(ALIGNED(sys->vmunmapped, 2*MiB));
+	pml4 = UINT2PTR(m->pml4->va);
+	while(sys->vmunmapped < sys->vmend){	/* map up to vmend, 2*MiB at a time */
+		l = mmuwalk(pml4, sys->vmunmapped, 1, &pte, asmwalkalloc);	/* NOTE(review): result is not checked here */
+		DBG("%#p l %d\n", sys->vmunmapped, l);
+		*pte = pa|PtePS|PteRW|PteP;
+		sys->vmunmapped += 2*MiB;
+		pa += 2*MiB;
+	}
+
+#ifdef ConfCrap
+	cx = 0;
+#endif /* ConfCrap */
+	for(asm = asmlist; asm != nil; asm = asm->next){
+		if(asm->type != AsmMEMORY)
+			continue;
+		va = KSEG2+asm->addr;
+		print("asm: addr %#P end %#P type %d size %P\n",
+			asm->addr, asm->addr+asm->size,
+			asm->type, asm->size);
+
+		lo = asm->addr;
+		hi = asm->addr+asm->size;
+		/* Convert a range into pages */
+		for(mem = lo; mem < hi; mem = nextmem){
+			nextmem = (mem + PGLSZ(0)) & ~m->pgszmask[0];	/* default step when no page size fits */
+
+			/* Try large pages first */
+			for(i = m->npgsz - 1; i >= 0; i--){
+				if((mem & m->pgszmask[i]) != 0)
+					continue;
+				if(mem + PGLSZ(i) > hi)
+					continue;
+				/* This page fits entirely within the range. */
+				/* Mark it as usable */
+				if((l = mmuwalk(pml4, va, i, &pte, asmwalkalloc)) < 0)
+					panic("asmmeminit 3");
+
+				*pte = mem|PteRW|PteP;
+				if(l > 0)
+					*pte |= PtePS;
+
+				nextmem = mem + PGLSZ(i);
+				va += PGLSZ(i);
+				npg[i]++;
+
+				break;
+			}
+		}
+
+#ifdef ConfCrap
+		/*
+		 * Fill in conf crap.
+		 */
+		if(cx >= nelem(conf.mem))
+			continue;
+		lo = ROUNDUP(asm->addr, PGSZ);
+//if(lo >= 600ull*MiB)
+//    continue;
+		conf.mem[cx].base = lo;
+		hi = ROUNDDN(hi, PGSZ);
+//if(hi > 600ull*MiB)
+//  hi = 600*MiB;
+		conf.mem[cx].npage = (hi - lo)/PGSZ;
+		conf.npage += conf.mem[cx].npage;
+		print("cm %d: addr %#llux npage %lud\n",
+			cx, conf.mem[cx].base, conf.mem[cx].npage);
+		cx++;
+#endif /* ConfCrap */
+	}
+	print("%d %d %d\n", npg[0], npg[1], npg[2]);
+
+#ifdef ConfCrap
+	/*
+	 * Fill in more conf crap.
+	 * This is why I hate Plan 9.
+	 */
+	conf.upages = conf.npage;
+	i = (sys->vmend - sys->vmstart)/PGSZ;		/* close enough */
+	conf.ialloc = (i/2)*PGSZ;			/* half the kernel range, in bytes */
+	print("npage %llud upage %lud kpage %d\n",
+		conf.npage, conf.upages, i);
+
+#endif /* ConfCrap */
+}
+
+/*
+ * Pass every AsmMEMORY range in the map to physinit,
+ * then dump the resulting physical allocator state.
+ */
+void
+asmumeminit(void)
+{
+	Asm *ap;
+	extern void physallocdump(void);
+
+	for(ap = asmlist; ap != nil; ap = ap->next)
+		if(ap->type == AsmMEMORY)
+			physinit(ap->addr, ap->size);
+
+	physallocdump();
+}

+ 5 - 0
sys/src/9/k10/boot.fs

@@ -0,0 +1,5 @@
+#!/boot/rc -m /boot/rcmain
+/boot/echo Morning
+# boot script for file servers, including standalone ones:
+# set the command search path, then replace this script with an interactive rc
+path=(/boot /$cputype/bin /rc/bin .)
+exec /boot/rc -m/boot/rcmain -i

+ 172 - 0
sys/src/9/k10/cga.c

@@ -0,0 +1,172 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+enum {
+	Black		= 0x00,
+	Blue		= 0x01,
+	Green		= 0x02,
+	Cyan		= 0x03,
+	Red		= 0x04,
+	Magenta		= 0x05,
+	Brown		= 0x06,
+	Grey		= 0x07,
+
+	Bright 		= 0x08,			/* NOTE(review): presumably or'ed into a colour to brighten it -- confirm */
+	Blinking	= 0x80,			/* NOTE(review): presumably the blink bit of an attribute byte -- confirm */
+
+	Attr		= (Black<<4)|Grey,	/* (background<<4)|foreground */
+};
+
+enum {
+	Index		= 0x3d4,		/* port: selects the register that Data accesses */
+	Data		= Index+1,		/* port: reads or writes the selected register */
+
+	Width		= 80*2,			/* bytes per row: 80 cells of character byte + attribute byte */
+	Height		= 25,			/* rows */
+
+	Poststrlen	= 0,
+	Postcodelen	= 2,			/* cells at the very end of the screen written by cgapost */
+	Postlen		= Poststrlen+Postcodelen,	/* cells at the end of the screen that cgaputc stays out of */
+};
+
+#define CGA		(BIOSSEG(0xb800))	/* base of the text screen memory */
+
+static Lock cgalock;		/* taken by cgaconsputs and cgainit */
+static int cgapos;		/* byte offset into CGA of the next cell to write */
+static int cgainitdone;		/* set once cgainit has run */
+
+/*
+ * Read the controller register numbered index: select it
+ * through the Index port, then fetch the low byte from Data.
+ */
+static int
+cgaregr(int index)
+{
+	int v;
+
+	outb(Index, index);
+	v = inb(Data);
+	return v & 0xff;
+}
+
+static void
+cgaregw(int index, int data)	/* write data to the controller register numbered index */
+{
+	outb(Index, index);	/* select the register */
+	outb(Data, data);	/* then store its new value */
+}
+
+/*
+ * Move the hardware cursor to the cell at cgapos and give
+ * that cell the default attribute.
+ */
+static void
+cgacursor(void)
+{
+	int cell;
+	uchar *screen;
+
+	/* registers 0x0e and 0x0f take the high and low bytes of the cell number */
+	cell = cgapos/2;
+	cgaregw(0x0e, (cell>>8) & 0xff);
+	cgaregw(0x0f, cell & 0xff);
+
+	screen = CGA;
+	screen[cgapos+1] = Attr;
+}
+
+/*
+ * extern, so we could use it to debug things like
+ * lock() if necessary.
+ * Writes c at cgapos, handling newline, tab and backspace,
+ * scrolls when the end of the screen is reached, and
+ * then updates the cursor.  Takes no lock itself.
+ */
+void
+cgaputc(int c)
+{
+	int i;
+	uchar *cga, *p;
+
+	cga = CGA;
+
+	if(c == '\n'){
+		cgapos = cgapos/Width;		/* current row */
+		cgapos = (cgapos+1)*Width;	/* first byte of the next row */
+	}
+	else if(c == '\t'){
+		i = 8 - ((cgapos/2)&7);		/* cells up to the next multiple of 8 */
+		while(i-- > 0)
+			cgaputc(' ');
+	}
+	else if(c == '\b'){
+		if(cgapos >= 2)
+			cgapos -= 2;		/* back one cell */
+		cgaputc(' ');			/* blank it, which advances again */
+		cgapos -= 2;			/* so step back once more */
+	}
+	else{
+		cga[cgapos++] = c;		/* character byte */
+		cga[cgapos++] = Attr;		/* attribute byte */
+	}
+	if(cgapos >= (Width*Height)-Postlen*2){	/* reached the reserved cells at the end: scroll up one row */
+		memmove(cga, &cga[Width], Width*(Height-1));
+		p = &cga[Width*(Height-1)-Postlen*2];
+		for(i = 0; i < Width/2; i++){	/* blank Width bytes, stopping short of the reserved cells */
+			*p++ = ' ';
+			*p++ = Attr;
+		}
+		cgapos -= Width;
+	}
+	cgacursor();
+}
+
+/*
+ * debug
+ * Print x in lower-case hex, without leading zeros, followed by
+ * a newline.  A value of 0 prints as "0" rather than as nothing.
+ */
+void
+cgaprinthex(uintptr x)
+{
+	char str[30];
+	char *s;
+	static char dig[] = "0123456789abcdef";
+
+	/* build the digits backwards from the end of the buffer */
+	str[29] = 0;
+	s = &str[29];
+	do{
+		*--s = dig[x&0xF];
+		x >>= 4;
+	}while(x != 0);
+
+	while(*s != 0)
+		cgaputc(*s++);
+	cgaputc('\n');
+}
+
+/*
+ * Write the n bytes at s to the screen while holding cgalock.
+ */
+void
+cgaconsputs(char* s, int n)
+{
+	int i;
+
+	ilock(&cgalock);
+	for(i = 0; i < n; i++)
+		cgaputc(s[i]);
+	iunlock(&cgalock);
+}
+
+/*
+ * Show the low byte of code as two upper-case hex digits in
+ * the last Postcodelen cells of the screen.
+ */
+void
+cgapost(int code)
+{
+	uchar *cell;
+	static char hex[] = "0123456789ABCDEF";
+
+	cell = CGA;
+	cell += Width*Height-Postcodelen*2;
+
+	cell[0] = hex[(code>>4) & 0x0f];
+	cell[1] = Attr;
+	cell[2] = hex[code & 0x0f];
+	cell[3] = Attr;
+}
+
+/*
+ * Pick up the current cursor position from the controller so
+ * that output carries on from where the cursor already is.
+ */
+void
+cgainit(void)
+{
+	int cell;
+
+	ilock(&cgalock);
+	/* registers 0x0e and 0x0f hold the high and low bytes of the cell number */
+	cell = cgaregr(0x0e)<<8;
+	cell |= cgaregr(0x0f);
+	cgapos = cell*2;
+	cgainitdone = 1;
+	iunlock(&cgalock);
+}

+ 25 - 0
sys/src/9/k10/cpuidamd64.s

@@ -0,0 +1,25 @@
+/*
+ * The CPUID instruction is always supported on the amd64.
+ * The first argument is the function number, the second a pointer
+ * to four 32-bit words that receive AX, BX, CX and DX in that order.
+ * NOTE(review): AX is left as CPUID set it, presumably the return value -- confirm.
+ */
+TEXT cpuid(SB), 1, $-4
+	MOVL	RARG, AX			/* function number */
+
+	CPUID					/* argument in AX */
+
+	MOVQ	info+8(FP), BP			/* where to store the results */
+	MOVL	AX, 0(BP)
+	MOVL	BX, 4(BP)
+	MOVL	CX, 8(BP)
+	MOVL	DX, 12(BP)
+	RET
+
+/*
+ * Basic timing loop to determine CPU frequency.
+ * The AAM instruction is not available in 64-bit mode.
+ * Runs the loop body c times.
+ */
+TEXT aamloop(SB), 1, $-4
+	MOVL	c+0(FP), CX			/* iteration count */
+aaml1:
+	XORQ	AX, AX				/* close enough */
+	LOOP	aaml1				/* decrement CX, repeat until it reaches 0 */
+	RET

+ 138 - 0
sys/src/9/k10/crap.c

@@ -0,0 +1,138 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+Conf conf;
+char *confname[1] = {
+	"console",
+};
+char *confval[1] = {
+	"0 b115200",
+};
+int nconf = nelem(confname);
+ */
+
+/*
+ * Where configuration info is left for the loaded programme.
+ * This will turn into a structure as more is done by the boot loader
+ * (e.g. why parse the .ini file twice?).
+ * There are 3584 bytes available at CONFADDR.
+ */
+#define	CONFADDR	PTR2UINT(KADDR(0x0001200))
+
+#define BOOTLINE	((char*)CONFADDR)
+#define BOOTLINELEN	64
+#define BOOTARGS	((char*)(CONFADDR+BOOTLINELEN))
+#define	BOOTARGSLEN	(4096-0x200-BOOTLINELEN)
+#define	MAXCONF		64
+
+char *confname[MAXCONF];
+char *confval[MAXCONF];
+int nconf;
+
+void
+crapoptions(void)
+{
+	/*
+	 * Parse the plan9.ini text left at BOOTARGS into the
+	 * confname[]/confval[] tables: one "name=value" per line,
+	 * lines starting with '#' or lacking '=' are ignored.
+	 * The text is edited in place; the tables point into it.
+	 */
+	char *args, *src, *dst, *eq, *line[MAXCONF];
+	long nline, l;
+
+	args = BOOTARGS;	/* where b.com leaves its config */
+	args[BOOTARGSLEN-1] = 0;
+
+	/* compact in place: drop '\r', turn '\t' into ' ' */
+	dst = args;
+	src = args;
+	while(*src != 0){
+		if(*src != '\r'){
+			if(*src == '\t')
+				*src = ' ';
+			*dst++ = *src;
+		}
+		src++;
+	}
+	*dst = 0;
+
+	nline = getfields(args, line, MAXCONF, 1, "\n");
+	for(l = 0; l < nline; l++){
+		if(line[l][0] == '#')
+			continue;
+		eq = strchr(line[l], '=');
+		if(eq != nil){
+			/* split the line at '=' into name and value */
+			*eq = '\0';
+			confname[nconf] = line[l];
+			confval[nconf] = eq+1;
+			nconf++;
+		}
+	}
+}
+
+char*
+getconf(char *name)
+{
+	/*
+	 * Case-insensitive lookup of name in the configuration table.
+	 * Returns the value of the first match, or nil if there is none.
+	 */
+	int i;
+
+	i = 0;
+	while(i < nconf){
+		if(cistrcmp(confname[i], name) == 0)
+			return confval[i];
+		i++;
+	}
+	return nil;
+}
+
+void
+confsetenv(void)
+{
+	/*
+	 * Export every configuration entry through ksetenv.
+	 * Each entry is set with last argument 1; entries whose name
+	 * does not begin with '*' are first also set with argument 0.
+	 */
+	char *name, *val;
+	int i;
+
+	for(i = 0; i < nconf; i++){
+		name = confname[i];
+		val = confval[i];
+		if(*name != '*')
+			ksetenv(name, val, 0);
+		ksetenv(name, val, 1);
+	}
+}
+
+int
+isaconfig(char *class, int ctlrno, ISAConf *isa)
+{
+	/*
+	 * Look up the configuration entry named class followed by
+	 * ctlrno (e.g. "ether0") and parse its options into isa.
+	 * Returns 0 if there is no such entry, 1 otherwise.
+	 * Recognised options fill the matching ISAConf field; all
+	 * tokens, recognised or not, are left in isa->opt[].
+	 */
+	char key[32], *opt, *end;
+	int n;
+
+	snprint(key, sizeof key, "%s%d", class, ctlrno);
+	opt = getconf(key);
+	if(opt == nil)
+		return 0;
+
+	isa->type = "";
+	isa->nopt = tokenize(opt, isa->opt, NISAOPT);
+	for(n = 0; n < isa->nopt; n++){
+		opt = isa->opt[n];
+		if(cistrncmp(opt, "type=", 5) == 0){
+			isa->type = opt + 5;
+			continue;
+		}
+		if(cistrncmp(opt, "port=", 5) == 0){
+			isa->port = strtoul(opt+5, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "irq=", 4) == 0){
+			isa->irq = strtoul(opt+4, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "dma=", 4) == 0){
+			isa->dma = strtoul(opt+4, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "mem=", 4) == 0){
+			isa->mem = strtoul(opt+4, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "size=", 5) == 0){
+			isa->size = strtoul(opt+5, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "freq=", 5) == 0)
+			isa->freq = strtoul(opt+5, &end, 0);
+	}
+	return 1;
+}

+ 430 - 0
sys/src/9/k10/dat.h

@@ -0,0 +1,430 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+typedef struct ACVctl ACVctl;
+typedef struct Conf Conf;
+typedef struct Confmem Confmem;
+typedef struct Fxsave Fxsave;
+typedef struct ICC ICC;
+typedef struct ICCparms ICCparms;
+typedef struct ISAConf ISAConf;
+typedef struct Label Label;
+typedef struct Lock Lock;
+typedef struct MCPU MCPU;
+typedef struct MFPU MFPU;
+typedef struct MMMU MMMU;
+typedef struct NIX NIX;
+typedef struct Mach Mach;
+typedef u64int Mpl;
+typedef struct Page Page;
+typedef struct Pcidev Pcidev;
+typedef struct PFPU PFPU;
+typedef struct PmcCtr PmcCtr;
+typedef struct PmcCtl PmcCtl;
+typedef struct PmcWait PmcWait;
+typedef struct PMMU PMMU;
+typedef struct PNOTIFY PNOTIFY;
+typedef u64int PTE;
+typedef struct Proc Proc;
+typedef struct Sys Sys;
+typedef u64int uintmem;				/* Physical address (hideous) */
+typedef struct Ureg Ureg;
+typedef struct Vctl Vctl;
+
+#pragma incomplete Ureg
+
+#define MAXSYSARG	5	/* for mount(fd, afd, mpt, flag, arg) */
+
+/*
+ *  parameters for sysproc.c
+ */
+#define AOUT_MAGIC	(S_MAGIC)
+
+/*
+ *  machine dependent definitions used by ../port/portdat.h
+ */
+
+
+/*
+ * Machine-dependent lock representation.
+ * NOTE(review): field meanings below are inferred from their names
+ * and types only; the code that uses them is not in this file.
+ */
+struct Lock
+{
+	u32int	key;
+	int	isilock;		/* presumably set when taken via ilock */
+	Mpl	pl;			/* presumably the saved priority level */
+	uintptr	pc;
+	Proc*	p;
+	Mach*	m;
+	uvlong	lockcycles;
+};
+
+/*
+ * A saved execution point: a stack pointer and a program counter.
+ */
+struct Label
+{
+	uintptr	sp;
+	uintptr	pc;
+};
+
+/*
+ * Floating point save area image; the field sizes below add up
+ * to 512 bytes.
+ */
+struct Fxsave {
+	u16int	fcw;			/* x87 control word */
+	u16int	fsw;			/* x87 status word */
+	u8int	ftw;			/* x87 tag word */
+	u8int	zero;			/* 0 */
+	u16int	fop;			/* last x87 opcode */
+	u64int	rip;			/* last x87 instruction pointer */
+	u64int	rdp;			/* last x87 data pointer */
+	u32int	mxcsr;			/* MMX control and status */
+	u32int	mxcsrmask;		/* supported MMX feature bits */
+	uchar	st[128];		/* shared 64-bit media and x87 regs */
+	uchar	xmm[256];		/* 128-bit media regs */
+	uchar	ign[96];		/* reserved, ignored */
+};
+
+/*
+ *  FPU stuff in Proc
+ */
+struct PFPU {
+	int	fpustate;
+	/* 15 spare bytes: room to place an Fxsave on a 16-byte boundary */
+	uchar	fxsave[sizeof(Fxsave)+15];
+	void*	fpusave;		/* presumably the aligned address within fxsave -- confirm */
+};
+
+/*
+ *  MMU stuff in Proc
+ */
+#define NCOLOR 1
+/* Per-process MMU state: one page list head per page table level. */
+struct PMMU
+{
+	Page*	mmuptp[4];		/* page table pages for each level */
+};
+
+/*
+ *  things saved in the Proc structure during a notify
+ */
+/* Nothing needs saving on this architecture; the member is a placeholder. */
+struct PNOTIFY
+{
+	void	emptiness;
+};
+
+/*
+ * One bank of memory as recorded in Conf.mem[].
+ * NOTE(review): base/npage presumably give the physical start and
+ * size in pages, kbase/klimit the part used by the kernel -- confirm.
+ */
+struct Confmem
+{
+	uintptr	base;
+	usize	npage;
+	uintptr	kbase;
+	uintptr	klimit;
+};
+
+/* System-wide configuration parameters. */
+struct Conf
+{
+	ulong	nproc;		/* processes */
+	Confmem	mem[4];		/* physical memory */
+	uvlong	npage;		/* total physical pages of memory */
+	usize	upages;		/* user page pool */
+	ulong	copymode;	/* 0 is copy on write, 1 is copy on reference */
+	ulong	ialloc;		/* max interrupt time allocation in bytes */
+	ulong	nimage;		/* number of page cache image headers */
+};
+
+enum
+{
+	NPGSZ = 4	/* # of supported page sizes in Mach (sizes the MMMU pgsz arrays) */
+};
+
+#include "../port/portdat.h"
+
+/*
+ *  CPU stuff in Mach.
+ */
+struct MCPU
+{
+	/* each row holds the four 32-bit words of one CPUID result */
+	u32int	cpuinfo[3][4];			/*  CPUID Functions 0, 1, and 5 (n.b.: 2-4 are invalid) */
+	int	ncpuinfos;			/* number of standard entries */
+	int	ncpuinfoe;			/* number of extended entries */
+	int	isintelcpu;			/* presumably non-zero on Intel processors -- confirm */
+};
+
+/*
+ *  FPU stuff in Mach.
+ */
+/* Per-processor copies of FPU control values; same names as in Fxsave. */
+struct MFPU
+{
+	u16int	fcw;			/* x87 control word */
+	u32int	mxcsr;			/* MMX control and status */
+	u32int	mxcsrmask;		/* supported MMX feature bits */
+};
+
+/* NIX-specific per-processor state, embedded in Mach. */
+struct NIX
+{
+	ICC*	icc;			/* inter-core call */
+	int	nixtype;	
+};
+
+/*
+ *  MMU stuff in Mach.
+ */
+struct MMMU
+{
+	uintptr cr2;
+	Page*	pml4;			/* pml4 for this processor */
+	PTE*	pmap;			/* unused as of yet */
+
+	/* page size tables, NPGSZ slots each; npgsz presumably counts the slots in use */
+	uint	pgszlg2[NPGSZ];		/* per Mach or per Sys? */
+	uint	pgszmask[NPGSZ];
+	uint	pgsz[NPGSZ];
+	int	npgsz;
+
+	Page	pml4kludge;		/* NIX KLUDGE: we need a page */
+};
+
+/*
+ * Inter core calls
+ */
+enum
+{
+	ICCLNSZ =	128,	/* Cache line size for inter core calls */
+
+
+	/* values for ICC.rc */
+	ICCOK = 0,		/* Return codes: Ok; trap; syscall */
+	ICCTRAP,
+	ICCSYSCALL
+};
+
+/*
+ * Inter-core call descriptor, shared between a core issuing a call
+ * (TC) and the core running it (AC), per the field comments below.
+ */
+struct ICC
+{
+	/* fn is kept in its own cache line */
+	union{
+		void	(*fn)(void);
+		uchar	_ln1_[ICCLNSZ];	/* pads the union to ICCLNSZ bytes */
+	};
+	int	flushtlb;	/* on the AC, before running fn */
+	int	rc;		/* return code from AC to TC */
+	char*	note;		/* to be posted in the TC after returning */
+	uchar	data[ICCLNSZ];	/* sent to the AC */
+};
+
+/*
+ * hw perf counters
+ */
+/*
+ * Control settings for one performance counter.
+ * NOTE(review): field meanings are not shown in this file; names
+ * suggest enable, user/os mode selection and a description string.
+ */
+struct PmcCtl {
+	Ref;
+	u32int coreno;
+	int enab;
+	int user;
+	int os;
+	int nodesc;
+	char descstr[KNAMELEN];
+	int reset;
+};
+
+/* Element of a singly linked wait list (see PmcCtr.wq). */
+struct PmcWait{
+	Ref;
+	Rendez r;
+	PmcWait*	next;
+};
+
+/*
+ * One performance counter: its value, its embedded PmcCtl settings
+ * and a list of waiters.
+ * NOTE(review): ctrset/ctlset presumably take the PmcIgn/PmcGet/PmcSet
+ * values -- confirm against the pmc code.
+ */
+struct PmcCtr{
+	int stale;
+	PmcWait *wq;
+	u64int ctr;
+	int ctrset;
+	PmcCtl;
+	int ctlset;
+};
+
+enum {
+	PmcMaxCtrs = 4,		/* size of Mach.pmc[] */
+	PmcIgn = 0,
+	PmcGet = 1,
+	PmcSet = 2,
+};
+
+/*
+ * Per processor information.
+ *
+ * The offsets of the first few elements may be known
+ * to low-level assembly code, so do not re-order:
+ *	machno	- no dependency, convention
+ *	splpc	- splhi, spllo, splx
+ *	proc	- syscallentry
+ *	stack	- acsyscall
+ */
+struct Mach
+{
+	/* the first four members have offsets known to assembly; keep their order */
+	int	machno;			/* physical id of processor */
+	uintptr	splpc;			/* pc of last caller to splhi */
+
+	Proc*	proc;			/* current process on this processor */
+	uintptr	stack;
+
+	int	apicno;
+	int	online;
+
+	MMMU;
+
+	uchar*	vsvm;
+	void*	gdt;
+	void*	tss;
+
+	ulong	ticks;			/* of the clock since boot time */
+	Label	sched;			/* scheduler wakeup */
+	Lock	alarmlock;		/* access to alarm list */
+	void*	alarm;			/* alarms bound to this clock */
+	int	inclockintr;
+
+	ulong	qstart;			/* time when up started running */
+	int	qexpired;		/* quantum expired */
+
+	int	tlbfault;
+	int	tlbpurge;
+	int	pfault;
+	int	cs;
+	int	syscall;
+	int	intr;
+	int	mmuflush;		/* make current proc flush its mmu state */
+	int	ilockdepth;
+	Perf	perf;			/* performance counters */
+	int	inidle;			/* profiling */
+	int	lastintr;
+
+	Lock	apictimerlock;
+	uvlong	cyclefreq;		/* Frequency of user readable cycle counter */
+	vlong	cpuhz;
+	int	cpumhz;
+	u64int	rdtsc;
+
+	Lock	pmclock;
+	PmcCtr	pmc[PmcMaxCtrs];
+
+	MFPU;
+	MCPU;
+
+	NIX;
+};
+
+/*
+ * This is the low memory map, between 0x100000 and 0x110000.
+ * It is located there to allow fundamental datastructures to be
+ * created and used before knowing where free memory begins
+ * (e.g. there may be modules located after the kernel BSS end).
+ * The layout is known in the bootstrap code in l32p.s.
+ * It is logically two parts: the per processor data structures
+ * for the bootstrap processor (stack, Mach, vsvm, and page tables),
+ * and the global information about the system (syspage, ptrpage).
+ * Some of the elements must be aligned on page boundaries, hence
+ * the unions.
+ */
+struct Sys {
+	uchar	machstk[MACHSTKSZ];	/* stack for the bootstrap processor */
+
+	/* bootstrap page tables, one table of PTSZ bytes each */
+	PTE	pml4[PTSZ/sizeof(PTE)];	/*  */
+	PTE	pdp[PTSZ/sizeof(PTE)];
+	PTE	pd[PTSZ/sizeof(PTE)];
+	PTE	pt[PTSZ/sizeof(PTE)];
+
+	uchar	vsvmpage[4*KiB];
+
+	/* Mach for the bootstrap processor, padded to MACHSZ by the union */
+	union {
+		Mach	mach;
+		uchar	machpage[MACHSZ];
+	};
+
+	/* global system information, padded to one 4KiB page */
+	union {
+		struct {
+			u64int	pmstart;	/* physical memory */
+			u64int	pmoccupied;	/* how much is occupied */
+			u64int	pmend;		/* total span */
+
+			uintptr	vmstart;	/* base address for malloc */
+			uintptr	vmunused;	/* 1st unused va */
+			uintptr	vmunmapped;	/* 1st unmapped va */
+			uintptr	vmend;		/* 1st unusable va */
+			u64int	epoch;		/* crude time synchronisation */
+
+			int		nc[NIXROLES];		/* number of online processors */
+			int		nmach;
+			int		load;
+			ulong	ticks;			/* of the clock since boot time */
+		};
+		uchar	syspage[4*KiB];
+	};
+
+	/* table of Mach pointers indexed by processor, padded to one 4KiB page */
+	union {
+		Mach*	machptr[MACHMAX];
+		uchar	ptrpage[4*KiB];
+	};
+
+	uchar	_57344_[2][4*KiB];		/* unused */
+};
+
+extern Sys* sys;
+
+/*
+ * KMap
+ */
+typedef void KMap;
+extern KMap* kmap(Page*);
+
+#define kunmap(k)
+#define VA(k)		PTR2UINT(k)
+
+/* Global processor and shutdown state; the embedded Lock presumably guards it. */
+struct
+{
+	Lock;
+	int	nonline;			/* # of active CPUs */
+	int nbooting;			/* # of CPUs waiting for the bTC to go */
+	int	exiting;			/* shutdown */
+	int	ispanic;			/* shutdown in response to a panic */
+	int	thunderbirdsarego;	/* lets the added processors continue */
+}active;
+
+/*
+ *  a parsed plan9.ini line
+ */
+#define NISAOPT		8
+
+/*
+ * Result of isaconfig(): the named fields are filled from the
+ * matching "name=" options, and every token of the line is kept
+ * in opt[] (at most NISAOPT).
+ */
+struct ISAConf {
+	char	*type;		/* "type=" value, or "" if absent */
+	uintptr	port;		/* "port=" */
+	int	irq;		/* "irq=" */
+	ulong	dma;		/* "dma=" */
+	uintptr	mem;		/* "mem=" */
+	usize	size;		/* "size=" */
+	ulong	freq;		/* "freq=" */
+
+	int	nopt;		/* number of tokens in opt[] */
+	char	*opt[NISAOPT];
+};
+
+/*
+ * The Mach structures must be available via the per-processor
+ * MMU information array machptr, mainly for disambiguation and access to
+ * the clock which is only maintained by the bootstrap processor (0).
+ */
+extern register Mach* m;			/* R15 */
+extern register Proc* up;			/* R14 */
+
+extern uintptr kseg0;
+
+extern char*rolename[];
+
+
+#pragma	varargck	type	"P"	uintmem
+
+/*
+ * Horrid.
+ */
+#ifdef _DBGC_
+#define DBGFLG		(dbgflg[_DBGC_])
+#else
+#define DBGFLG		(0)
+#endif /* _DBGC_ */
+
+#define DBG(...)	if(!DBGFLG){}else dbgprint(__VA_ARGS__)
+
+extern char dbgflg[256];
+
+#define dbgprint	print		/* for now */
+

+ 1721 - 0
sys/src/9/k10/devacpi.c

@@ -0,0 +1,1721 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"../port/error.h"
+#include "mp.h"
+#include "acpi.h"
+
+/*
+ * ACPI 4.0 Support.
+ * Still WIP.
+ *
+ * This driver locates tables and parses only the FADT
+ * and the XSDT. All other tables are mapped and kept there
+ * for the user-level interpreter.
+ */
+
+
+#define l16get(p)	(((p)[1]<<8)|(p)[0])
+#define l32get(p)	(((u32int)l16get(p+2)<<16)|l16get(p))
+static Atable* acpifadt(uchar*, int);
+static Atable* acpitable(uchar*, int);
+static Atable* acpimadt(uchar*, int);
+static Atable* acpimsct(uchar*, int);
+static Atable* acpisrat(uchar*, int);
+static Atable* acpislit(uchar*, int);
+
+#pragma	varargck	type	"G"	Gas*
+
+static Cmdtab ctls[] =
+{
+	{CMregion,	"region",	6},
+	{CMgpe,		"gpe",		3},
+};
+
+static Dirtab acpidir[]={
+	".",		{Qdir, 0, QTDIR},	0,	DMDIR|0555,
+	"acpictl",	{Qctl},			0,	0666,
+	"acpitbl",	{Qtbl},			0,	0444,
+	"acpiregio",	{Qio},			0,	0666,
+};
+
+/*
+ * The DSDT is always given to the user interpreter.
+ * Tables listed here are also loaded from the XSDT:
+ * the FADT, MADT, SRAT, SLIT and MSCT are parsed by us, because
+ * they are required to do early initialization before we have
+ * user processes.
+ * Other tables (e.g. SSDTs) are given to the user level
+ * interpreter for execution.
+ */
+static Parse ptables[] =
+{
+	"FACP", acpifadt,
+	"APIC",	acpimadt,
+	"SRAT",	acpisrat,
+	"SLIT",	acpislit,
+	"MSCT",	acpimsct,
+	"SSDT", acpitable,
+};
+
+static Facs*	facs;	/* Firmware ACPI control structure */
+static Fadt	fadt;	/* Fixed ACPI description. To reach ACPI registers */
+static Xsdt*	xsdt;	/* XSDT table */
+static Atable*	tfirst;	/* loaded DSDT/SSDT/... tables */
+static Atable*	tlast;	/* pointer to last table */
+static Madt*	apics;	/* APIC info */
+static Srat*	srat;	/* System resource affinity, used by physalloc */
+static Slit*	slit;	/* System locality information table used by the scheduler */
+static Msct*	msct;	/* Maximum system characteristics table */
+static Reg*	reg;	/* region used for I/O */
+static Gpe*	gpes;	/* General purpose events */
+static int	ngpes;
+
+/*
+ * Names of region address spaces, indexed by space id
+ * (see acpiregstr and acpiregid).
+ */
+static char* regnames[] = {
+	"mem", "io", "pcicfg", "embed",
+	"smb", "cmos", "pcibar",
+};
+
+static char*
+acpiregstr(int id)
+{
+	/*
+	 * Name of region space id.  Ids without an entry in regnames
+	 * are formatted as "spc:<hex>" into a single static buffer,
+	 * so that result is overwritten by the next such call.
+	 */
+	static char buf[20];	/* BUG */
+
+	if(id < 0 || id >= nelem(regnames)){
+		seprint(buf, buf+sizeof(buf), "spc:%#x", id);
+		return buf;
+	}
+	return regnames[id];
+}
+
+static int
+acpiregid(char *s)
+{
+	/*
+	 * Inverse of acpiregstr for known names: index of s in
+	 * regnames (exact, case-sensitive match), or -1.
+	 */
+	int id;
+
+	id = 0;
+	while(id < nelem(regnames)){
+		if(strcmp(regnames[id], s) == 0)
+			return id;
+		id++;
+	}
+	return -1;
+}
+
+/*
+ * Read a little-endian 64-bit value from the 8 bytes at p,
+ * built from two l32get halves.
+ */
+static u64int
+l64get(u8int* p)
+{
+	/*
+	 * Doing this as a define
+	 * #define l64get(p)	(((u64int)l32get(p+4)<<32)|l32get(p))
+	 * causes 8c to abort with "out of fixed registers" in
+	 * rsdlink() below.
+	 */
+	return (((u64int)l32get(p+4)<<32)|l32get(p));
+}
+
+/*
+ * Memory accessors for Regio: p is used directly as the address
+ * of an 8/16/32/64-bit object; the last argument is unused.
+ */
+static u8int
+mget8(uintptr p, void*)
+{
+	return *(u8int*)p;
+}
+
+static void
+mset8(uintptr p, u8int v, void*)
+{
+	*(u8int*)p = v;
+}
+
+static u16int
+mget16(uintptr p, void*)
+{
+	return *(u16int*)p;
+}
+
+static void
+mset16(uintptr p, u16int v, void*)
+{
+	*(u16int*)p = v;
+}
+
+static u32int
+mget32(uintptr p, void*)
+{
+	return *(u32int*)p;
+}
+
+static void
+mset32(uintptr p, u32int v, void*)
+{
+	*(u32int*)p = v;
+}
+
+static u64int
+mget64(uintptr p, void*)
+{
+	return *(u64int*)p;
+}
+
+static void
+mset64(uintptr p, u64int v, void*)
+{
+	*(u64int*)p = v;
+}
+
+/*
+ * I/O port accessors for Regio: p is passed as the port to
+ * inb/ins/inl and outb/outs/outl; the last argument is unused.
+ */
+static u8int
+ioget8(uintptr p, void*)
+{
+	return inb(p);
+}
+
+static void
+ioset8(uintptr p, u8int v, void*)
+{
+	outb(p, v);
+}
+
+static u16int
+ioget16(uintptr p, void*)
+{
+	return ins(p);
+}
+
+static void
+ioset16(uintptr p, u16int v, void*)
+{
+	outs(p, v);
+}
+
+static u32int
+ioget32(uintptr p, void*)
+{
+	return inl(p);
+}
+
+static void
+ioset32(uintptr p, u32int v, void*)
+{
+	outl(p, v);
+}
+
+/*
+ * PCI configuration space accessors for Regio: r is the Reg being
+ * accessed; its tbdf is copied into a scratch Pcidev (no other
+ * field is set) which is handed to the pcicfg routines with p as
+ * the register offset.
+ */
+static u8int
+cfgget8(uintptr p, void* r)
+{
+	Pcidev d;
+
+	d.tbdf = ((Reg*)r)->tbdf;
+	return pcicfgr8(&d, p);
+}
+
+static void
+cfgset8(uintptr p, u8int v, void* r)
+{
+	Pcidev d;
+
+	d.tbdf = ((Reg*)r)->tbdf;
+	pcicfgw8(&d, p, v);
+}
+
+static u16int
+cfgget16(uintptr p, void* r)
+{
+	Pcidev d;
+
+	d.tbdf = ((Reg*)r)->tbdf;
+	return pcicfgr16(&d, p);
+}
+
+static void
+cfgset16(uintptr p, u16int v, void* r)
+{
+	Pcidev d;
+
+	d.tbdf = ((Reg*)r)->tbdf;
+	pcicfgw16(&d, p, v);
+}
+
+static u32int
+cfgget32(uintptr p, void* r)
+{
+	Pcidev d;
+
+	d.tbdf = ((Reg*)r)->tbdf;
+	return pcicfgr32(&d, p);
+}
+
+static void
+cfgset32(uintptr p, u32int v, void* r)
+{
+	Pcidev d;
+
+	d.tbdf = ((Reg*)r)->tbdf;
+	pcicfgw32(&d, p, v);
+}
+
+/*
+ * Accessor tables, one per supported region space.
+ * The leading nil is presumably the arg member (regio sets
+ * rio.arg for the PCI case); the rest are the get/set pairs for
+ * 8, 16, 32 and 64 bits.  ioio and cfgio have no 64-bit accessors.
+ */
+static Regio memio = 
+{
+	nil,
+	mget8, mset8, mget16, mset16,
+	mget32, mset32, mget64, mset64
+};
+
+static Regio ioio = 
+{
+	nil,
+	ioget8, ioset8, ioget16, ioset16,
+	ioget32, ioset32, nil, nil
+};
+
+static Regio cfgio = 
+{
+	nil,
+	cfgget8, cfgset8, cfgget16, cfgset16,
+	cfgget32, cfgset32, nil, nil
+};
+
+/*
+ * Copy memory, 1/2/4/8-bytes at a time, to/from a region.
+ */
+/*
+ * Copy len bytes from address sa (read through sio) to address da
+ * (written through dio) in units of align bytes.  A trailing
+ * partial unit is not copied.  Returns the number of bytes in the
+ * whole units processed.  Panics if align is not 1, 2, 4 or 8 and
+ * there is at least one unit to copy.
+ * NOTE(review): align == 0 divides by zero before the switch can
+ * reject it.
+ */
+static long
+regcpy(Regio *dio, uintptr da, Regio *sio, uintptr sa, long len, int align)
+{
+	int n, i;
+
+	DBG("regcpy %#ullx %#ullx %#ulx %#ux\n", da, sa, len, align);
+	if((len%align) != 0)
+		print("regcpy: bug: copy not aligned. truncated\n");
+	n = len/align;
+	for(i = 0; i < n; i++){
+		switch(align){
+		case 1:
+			DBG("cpy8 %#p %#p\n", da, sa);
+			dio->set8(da, sio->get8(sa, sio->arg), dio->arg);
+			break;
+		case 2:
+			DBG("cpy16 %#p %#p\n", da, sa);
+			dio->set16(da, sio->get16(sa, sio->arg), dio->arg);
+			break;
+		case 4:
+			DBG("cpy32 %#p %#p\n", da, sa);
+			dio->set32(da, sio->get32(sa, sio->arg), dio->arg);
+			break;
+		case 8:
+			/*
+			 * NOTE(review): the 64-bit transfer is disabled, so
+			 * nothing is copied here although the bytes are still
+			 * counted in the return value.
+			 */
+			DBG("cpy64 %#p %#p\n", da, sa);
+		//	dio->set64(da, sio->get64(sa, sio->arg), dio->arg);
+			break;
+		default:
+			panic("regcpy: align bug");
+		}
+		da += align;
+		sa += align;
+	}
+	return n*align;
+}
+
+/*
+ * Perform I/O within region in access units of accsz bytes.
+ * All units in bytes.
+ */
+/*
+ * Transfer len bytes between the buffer p and region r, starting
+ * off bytes into the region; iswr selects buffer-to-region.
+ * Requests running past the end of the region are truncated;
+ * a request starting at or beyond the end transfers nothing.
+ * Returns the (possibly truncated) length, or 0 if nothing was done.
+ * Raises an error for region spaces without an accessor table.
+ */
+static long
+regio(Reg *r, void *p, ulong len, uintptr off, int iswr)
+{
+	Regio rio;
+	uintptr rp;
+
+	DBG("reg%s %s %#p %#ullx %#lx sz=%d\n",
+		iswr ? "out" : "in", r->name, p, off, len, r->accsz);
+	rp = 0;
+	/*
+	 * len is unsigned: check off first so that r->len - off
+	 * cannot wrap around to a huge length.
+	 */
+	if(off >= r->len){
+		print("regio: offset outside limits\n");
+		return 0;
+	}
+	if(len > r->len - off){
+		print("regio: access outside limits\n");
+		len = r->len - off;
+	}
+	if(len == 0){
+		print("regio: zero len\n");
+		return 0;
+	}
+	switch(r->spc){
+	case Rsysmem:
+		/*
+		 * The mapping is cached in r->p and indexed by off, so
+		 * it has to cover the whole region, not just the length
+		 * of the first request.
+		 * XXX a region might be too large to map at once.
+		 */
+		if(r->p == nil)
+			r->p = vmap(r->base, r->len);
+		if(r->p == nil)
+			error("regio: vmap failed");
+		rp = (uintptr)r->p + off;
+		rio = memio;
+		break;
+	case Rsysio:
+		rp = r->base + off;
+		rio = ioio;
+		break;
+	case Rpcicfg:
+		rp = r->base + off;
+		rio = cfgio;
+		rio.arg = r;	/* the cfg accessors need the region's tbdf */
+		break;
+	case Rpcibar:
+	case Rembed:
+	case Rsmbus:
+	case Rcmos:
+	case Ripmi:
+	case Rfixedhw:
+	default:
+		/* never fall through with rio uninitialised */
+		print("regio: reg %s not supported\n", acpiregstr(r->spc));
+		error("region not supported");
+	}
+	if(iswr)
+		regcpy(&rio, rp, &memio, (uintptr)p, len, r->accsz);
+	else
+		regcpy(&memio, (uintptr)p, &rio, rp, len, r->accsz);
+	return len;
+}
+
+/*
+ * Allocate an Atable describing the table whose header starts at p
+ * and append it to the tfirst/tlast list.  The signature and OEM
+ * strings are copied out of the header and NUL-terminated; dlen is
+ * the table length minus the header.  Panics if out of memory.
+ */
+static Atable*
+newtable(uchar *p)
+{
+	Atable *t;
+	Sdthdr *h;
+
+	t = malloc(sizeof(Atable));
+	if(t == nil)
+		panic("no memory for more aml tables");
+	t->tbl = p;
+	h = (Sdthdr*)t->tbl;
+	t->is64 = h->rev >= 2;
+	t->dlen = l32get(h->length) - Sdthdrsz;
+	memmove(t->sig, h->sig, sizeof(h->sig));
+	t->sig[sizeof(t->sig)-1] = 0;
+	memmove(t->oemid, h->oemid, sizeof(h->oemid));
+	/* terminate oemid itself (this used to write to oemtblid) */
+	t->oemid[sizeof(t->oemid)-1] = 0;
+	memmove(t->oemtblid, h->oemtblid, sizeof(h->oemtblid));
+	t->oemtblid[sizeof(t->oemtblid)-1] = 0;
+	t->next = nil;
+	if(tfirst == nil)
+		tfirst = tlast = t;
+	else{
+		tlast->next = t;
+		tlast = t;
+	}
+	return t;
+}
+
+/*
+ * Sum the len bytes at addr modulo 256.
+ * Returns addr if the sum is zero (valid table), nil otherwise.
+ * A non-positive len sums nothing and so returns addr.
+ */
+static void*
+sdtchecksum(void* addr, int len)
+{
+	u8int *b, total;
+	int i;
+
+	b = addr;
+	total = 0;
+	for(i = 0; i < len; i++)
+		total += b[i];
+	if(total != 0)
+		return nil;
+	return addr;
+}
+
+/*
+ * Map the system description table at physical address pa.
+ * The header is mapped first to learn the table length, which is
+ * stored in *n; the whole table is then mapped and, if cksum is
+ * non-zero, its checksum verified.
+ * Returns the mapped table, or nil on mapping or checksum failure
+ * (in which case nothing is left mapped).
+ */
+static void *
+sdtmap(uintptr pa, int *n, int cksum)
+{
+	Sdthdr* sdt;
+
+	sdt = vmap(pa, sizeof(Sdthdr));
+	if(sdt == nil){
+		DBG("acpi: vmap1: nil\n");
+		return nil;
+	}
+	*n = l32get(sdt->length);
+	vunmap(sdt, sizeof(Sdthdr));
+	if((sdt = vmap(pa, *n)) == nil){
+		DBG("acpi: nil vmap\n");
+		return nil;
+	}
+	if(cksum != 0 && sdtchecksum(sdt, *n) == nil){
+		DBG("acpi: SDT: bad checksum\n");
+		/* undo the full-table mapping, not just a header's worth */
+		vunmap(sdt, *n);
+		return nil;
+	}
+	return sdt;
+}
+
+/*
+ * Map the FACS at physical address pa into the global facs.
+ * The table is mapped without checksum verification and must
+ * start with the signature "FACS".
+ * Returns 0 on success (the table stays mapped), -1 otherwise
+ * (facs is left nil).
+ */
+static int
+loadfacs(uintptr pa)
+{
+	int n;
+
+	facs = sdtmap(pa, &n, 0);
+	if(facs == nil)
+		return -1;
+	if(memcmp(facs, "FACS", 4) == 0){
+		/* no unmap */
+		DBG("acpi: facs: hwsig: %#ux\n", facs->hwsig);
+		DBG("acpi: facs: wakingv: %#ux\n", facs->wakingv);
+		DBG("acpi: facs: flags: %#ux\n", facs->flags);
+		DBG("acpi: facs: glock: %#ux\n", facs->glock);
+		DBG("acpi: facs: xwakingv: %#llux\n", facs->xwakingv);
+		DBG("acpi: facs: vers: %#ux\n", facs->vers);
+		DBG("acpi: facs: ospmflags: %#ux\n", facs->ospmflags);
+		return 0;
+	}
+
+	/* wrong signature: drop the mapping */
+	vunmap(facs, n);
+	facs = nil;
+	return -1;
+}
+
+/*
+ * Map the DSDT at physical address pa (checksum verified) and
+ * hand it to acpitable; if acpitable does not keep it (returns
+ * nil) the mapping is released.
+ */
+static void
+loaddsdt(uintptr pa)
+{
+	int len;
+	uchar *tbl;
+
+	tbl = sdtmap(pa, &len, 1);
+	if(tbl != nil && acpitable(tbl, len) == nil)
+		vunmap(tbl, len);
+}
+
+/*
+ * Decode the 12-byte address structure at p into gas:
+ * four single-byte fields (spc, len, off, accsz) followed by a
+ * little-endian 64-bit address.
+ */
+static void
+gasget(Gas *gas, uchar *p)
+{
+	gas->spc = *p++;
+	gas->len = *p++;
+	gas->off = *p++;
+	gas->accsz = *p++;
+	gas->addr = l64get(p);
+}
+
+/*
+ * Debug print of every parsed FADT field, one per line.
+ * Does nothing unless debugging is enabled for this file.
+ */
+static void
+dumpfadt(Fadt *fp)
+{
+	/* skip the whole list in one test instead of one per DBG */
+	if(DBGFLG == 0)
+		return;
+
+	DBG("acpi: fadt: facs: %#ux\n", fp->facs);
+	DBG("acpi: fadt: dsdt: %#ux\n", fp->dsdt);
+	DBG("acpi: fadt: pmprofile: %#ux\n", fp->pmprofile);
+	DBG("acpi: fadt: sciint: %#ux\n", fp->sciint);
+	DBG("acpi: fadt: smicmd: %#ux\n", fp->smicmd);
+	DBG("acpi: fadt: acpienable: %#ux\n", fp->acpienable);
+	DBG("acpi: fadt: acpidisable: %#ux\n", fp->acpidisable);
+	DBG("acpi: fadt: s4biosreq: %#ux\n", fp->s4biosreq);
+	DBG("acpi: fadt: pstatecnt: %#ux\n", fp->pstatecnt);
+	DBG("acpi: fadt: pm1aevtblk: %#ux\n", fp->pm1aevtblk);
+	DBG("acpi: fadt: pm1bevtblk: %#ux\n", fp->pm1bevtblk);
+	DBG("acpi: fadt: pm1acntblk: %#ux\n", fp->pm1acntblk);
+	DBG("acpi: fadt: pm1bcntblk: %#ux\n", fp->pm1bcntblk);
+	DBG("acpi: fadt: pm2cntblk: %#ux\n", fp->pm2cntblk);
+	DBG("acpi: fadt: pmtmrblk: %#ux\n", fp->pmtmrblk);
+	DBG("acpi: fadt: gpe0blk: %#ux\n", fp->gpe0blk);
+	DBG("acpi: fadt: gpe1blk: %#ux\n", fp->gpe1blk);
+	DBG("acpi: fadt: pm1evtlen: %#ux\n", fp->pm1evtlen);
+	DBG("acpi: fadt: pm1cntlen: %#ux\n", fp->pm1cntlen);
+	DBG("acpi: fadt: pm2cntlen: %#ux\n", fp->pm2cntlen);
+	DBG("acpi: fadt: pmtmrlen: %#ux\n", fp->pmtmrlen);
+	DBG("acpi: fadt: gpe0blklen: %#ux\n", fp->gpe0blklen);
+	DBG("acpi: fadt: gpe1blklen: %#ux\n", fp->gpe1blklen);
+	DBG("acpi: fadt: gp1base: %#ux\n", fp->gp1base);
+	DBG("acpi: fadt: cstcnt: %#ux\n", fp->cstcnt);
+	DBG("acpi: fadt: plvl2lat: %#ux\n", fp->plvl2lat);
+	DBG("acpi: fadt: plvl3lat: %#ux\n", fp->plvl3lat);
+	DBG("acpi: fadt: flushsz: %#ux\n", fp->flushsz);
+	DBG("acpi: fadt: flushstride: %#ux\n", fp->flushstride);
+	DBG("acpi: fadt: dutyoff: %#ux\n", fp->dutyoff);
+	DBG("acpi: fadt: dutywidth: %#ux\n", fp->dutywidth);
+	DBG("acpi: fadt: dayalrm: %#ux\n", fp->dayalrm);
+	DBG("acpi: fadt: monalrm: %#ux\n", fp->monalrm);
+	DBG("acpi: fadt: century: %#ux\n", fp->century);
+	DBG("acpi: fadt: iapcbootarch: %#ux\n", fp->iapcbootarch);
+	DBG("acpi: fadt: flags: %#ux\n", fp->flags);
+	DBG("acpi: fadt: resetreg: %G\n", &fp->resetreg);
+	DBG("acpi: fadt: resetval: %#ux\n", fp->resetval);
+	DBG("acpi: fadt: xfacs: %#llux\n", fp->xfacs);
+	DBG("acpi: fadt: xdsdt: %#llux\n", fp->xdsdt);
+	DBG("acpi: fadt: xpm1aevtblk: %G\n", &fp->xpm1aevtblk);
+	DBG("acpi: fadt: xpm1bevtblk: %G\n", &fp->xpm1bevtblk);
+	DBG("acpi: fadt: xpm1acntblk: %G\n", &fp->xpm1acntblk);
+	DBG("acpi: fadt: xpm1bcntblk: %G\n", &fp->xpm1bcntblk);
+	DBG("acpi: fadt: xpm2cntblk: %G\n", &fp->xpm2cntblk);
+	DBG("acpi: fadt: xpmtmrblk: %G\n", &fp->xpmtmrblk);
+	DBG("acpi: fadt: xgpe0blk: %G\n", &fp->xgpe0blk);
+	DBG("acpi: fadt: xgpe1blk: %G\n", &fp->xgpe1blk);
+}
+
+/*
+ * Parse the FADT at p into the global fadt, then load the FACS
+ * and the DSDT it points to.  Numbers are byte offsets from the
+ * start of the table.  Always returns nil: the caller may unmap
+ * the table afterwards.
+ * NOTE(review): the table length (second argument) is ignored, so
+ * the extended fields at offsets 116..243 are read even if the
+ * table is shorter than that.
+ */
+static Atable*
+acpifadt(uchar *p, int)
+{
+	Fadt *fp;
+
+	fp = &fadt;
+	fp->facs = l32get(p + 36);
+	fp->dsdt = l32get(p + 40);
+	fp->pmprofile = p[45];
+	fp->sciint = l16get(p+46);
+	fp->smicmd = l32get(p+48);
+	fp->acpienable = p[52];
+	fp->acpidisable = p[53];
+	fp->s4biosreq = p[54];
+	fp->pstatecnt = p[55];
+	fp->pm1aevtblk = l32get(p+56);
+	fp->pm1bevtblk = l32get(p+60);
+	fp->pm1acntblk = l32get(p+64);
+	fp->pm1bcntblk = l32get(p+68);
+	fp->pm2cntblk = l32get(p+72);
+	fp->pmtmrblk = l32get(p+76);
+	fp->gpe0blk = l32get(p+80);
+	fp->gpe1blk = l32get(p+84);
+	fp->pm1evtlen = p[88];
+	fp->pm1cntlen = p[89];
+	fp->pm2cntlen = p[90];
+	fp->pmtmrlen = p[91];
+	fp->gpe0blklen = p[92];
+	fp->gpe1blklen = p[93];
+	fp->gp1base = p[94];
+	fp->cstcnt = p[95];
+	fp->plvl2lat = l16get(p+96);
+	fp->plvl3lat = l16get(p+98);
+	fp->flushsz = l16get(p+100);
+	fp->flushstride = l16get(p+102);
+	fp->dutyoff = p[104];
+	fp->dutywidth = p[105];
+	fp->dayalrm = p[106];
+	fp->monalrm = p[107];
+	fp->century = p[108];
+	fp->iapcbootarch = l16get(p+109);
+	fp->flags = l32get(p+112);
+	gasget(&fp->resetreg, p+116);
+	fp->resetval = p[128];
+	fp->xfacs = l64get(p+132);
+	fp->xdsdt = l64get(p+140);
+	gasget(&fp->xpm1aevtblk, p+148);
+	gasget(&fp->xpm1bevtblk, p+160);
+	gasget(&fp->xpm1acntblk, p+172);
+	gasget(&fp->xpm1bcntblk, p+184);
+	gasget(&fp->xpm2cntblk, p+196);
+	gasget(&fp->xpmtmrblk, p+208);
+	gasget(&fp->xgpe0blk, p+220);
+	gasget(&fp->xgpe1blk, p+232);
+
+	dumpfadt(fp);
+	/* prefer the 64-bit FACS address when one is given */
+	if(fp->xfacs != 0)
+		loadfacs(fp->xfacs);
+	else
+		loadfacs(fp->facs);
+
+	/*
+	 * NOTE(review): both branches end up loading the address in
+	 * fp->dsdt (xdsdt is only used when it equals dsdt), unlike
+	 * the FACS selection above -- confirm this is intended.
+	 */
+	if(fp->xdsdt == ((u64int)fp->dsdt)) /* acpica */
+		loaddsdt(fp->xdsdt);
+	else
+		loaddsdt(fp->dsdt);
+
+	return nil;	/* can be unmapped once parsed */
+}
+
+/*
+ * Debug print of the MSCT summary followed by one line per
+ * domain entry.
+ */
+static void
+dumpmsct(Msct *msct)
+{
+	Mdom *d;
+
+	DBG("acpi: msct: %d doms %d clkdoms %#ullx maxpa\n",
+		msct->ndoms, msct->nclkdoms, msct->maxpa);
+	d = msct->dom;
+	while(d != nil){
+		DBG("\t[%d:%d] %d maxproc %#ullx maxmmem\n",
+			d->start, d->end, d->maxproc, d->maxmem);
+		d = d->next;
+	}
+	DBG("\n");
+}
+
+/*
+ * XXX: should perhaps update our idea of available memory.
+ * Else we should remove this code.
+ */
+/*
+ * Parse the MSCT at p (len bytes) into the global msct: the
+ * domain counts, the maximum physical address and a list of
+ * 22-byte domain entries starting at the offset stored at byte 36.
+ * Always returns nil: the caller may unmap the table afterwards.
+ * Panics if out of memory.
+ */
+static Atable*
+acpimsct(uchar *p, int len)
+{
+	uchar *pe;
+	Mdom **stl, *st;
+	int off;
+
+	msct = mallocz(sizeof(Msct), 1);
+	if(msct == nil)
+		panic("acpi: no memory for msct");
+	msct->ndoms = l32get(p+40) + 1;
+	msct->nclkdoms = l32get(p+44) + 1;
+	msct->maxpa = l64get(p+48);
+	msct->dom = nil;
+	stl = &msct->dom;
+	pe = p + len;
+	off = l32get(p+36);
+	if(off < 0 || off > len){
+		print("acpi: msct: bad entry offset %#ux\n", off);
+		off = len;	/* no entries */
+	}
+	/* only take entries that lie entirely within the table */
+	for(p += off; pe - p >= 22; p += 22){
+		st = mallocz(sizeof(Mdom), 1);
+		if(st == nil)
+			panic("acpi: no memory for msct");
+		st->next = nil;
+		st->start = l32get(p+2);
+		st->end = l32get(p+6);
+		st->maxproc = l32get(p+10);
+		st->maxmem = l64get(p+14);
+		*stl = st;
+		stl = &st->next;
+	}
+
+	dumpmsct(msct);
+	return nil;	/* can be unmapped once parsed */
+}
+
+/*
+ * Debug print of the SRAT entry list starting at st, one line
+ * per entry, formatted according to the entry type.
+ */
+static void
+dumpsrat(Srat *st)
+{
+	DBG("acpi: srat:\n");
+	for(; st != nil; st = st->next)
+		switch(st->type){
+		case SRlapic:
+			DBG("\tlapic: dom %d apic %d sapic %d clk %d\n",
+				st->lapic.dom, st->lapic.apic,
+				st->lapic.sapic, st->lapic.clkdom);
+			break;
+		case SRmem:
+			/* trailing flags: 'h' if hplug is set, 'n' if nvram is set */
+			DBG("\tmem: dom %d %#ullx %#ullx %c%c\n",
+				st->mem.dom, st->mem.addr, st->mem.len,
+				st->mem.hplug?'h':'-',
+				st->mem.nvram?'n':'-');
+			break;
+		case SRlx2apic:
+			DBG("\tlx2apic: dom %d apic %d clk %d\n",
+				st->lx2apic.dom, st->lx2apic.apic,
+				st->lx2apic.clkdom);
+			break;
+		default:
+			DBG("\t<unknown srat entry>\n");
+		}
+	DBG("\n");
+}
+
+/*
+ * Parse the SRAT at p (len bytes) into the global list srat.
+ * Entries start at offset 48; each begins with a type byte and a
+ * length byte.  Disabled entries (enable flag clear) and entries
+ * of unknown type are skipped.  A second SRAT is ignored.
+ * Always returns nil: the caller may unmap the table afterwards.
+ * Panics if out of memory.
+ */
+static Atable*
+acpisrat(uchar *p, int len)
+{
+	Srat **stl, *st;
+	uchar *pe;
+	int stlen, flags;
+
+	if(srat != nil){
+		print("acpi: two SRATs?\n");
+		return nil;
+	}
+
+	stl = &srat;
+	pe = p + len;
+	for(p += 48; p < pe; p += stlen){
+		/*
+		 * Validate the entry length first: a zero length
+		 * would otherwise never advance p.
+		 */
+		if(pe - p < 2)
+			break;
+		stlen = p[1];
+		if(stlen < 2 || stlen > pe - p){
+			print("acpi: srat: bad structure length %d\n", stlen);
+			break;
+		}
+		st = mallocz(sizeof(Srat), 1);
+		if(st == nil)
+			panic("acpi: no memory for srat");
+		st->type = p[0];
+		st->next = nil;
+		switch(st->type){
+		case SRlapic:
+			/*
+			 * Domain bits 0-7 are at byte 2; bits 8-31 are
+			 * at bytes 9..11, least significant byte first.
+			 */
+			st->lapic.dom = p[2] | p[9]<<8 | p[10]<<16 | (u32int)p[11]<<24;
+			st->lapic.apic = p[3];
+			st->lapic.sapic = p[8];
+			st->lapic.clkdom = l32get(p+12);
+			if(l32get(p+4) == 0){	/* not enabled */
+				free(st);
+				st = nil;
+			}
+			break;
+		case SRmem:
+			st->mem.dom = l32get(p+2);
+			st->mem.addr = l64get(p+8);
+			st->mem.len = l64get(p+16);
+			flags = l32get(p+28);
+			if((flags&1) == 0){	/* not enabled */
+				free(st);
+				st = nil;
+			}else{
+				st->mem.hplug = flags & 2;
+				st->mem.nvram = flags & 4;
+			}
+			break;
+		case SRlx2apic:
+			st->lx2apic.dom = l32get(p+4);
+			st->lx2apic.apic = l32get(p+8);
+			st->lx2apic.clkdom = l32get(p+16);
+			if(l32get(p+12) == 0){	/* not enabled */
+				free(st);
+				st = nil;
+			}
+			break;
+		default:
+			print("unknown SRAT structure\n");
+			free(st);
+			st = nil;
+		}
+		if(st != nil){
+			*stl = st;
+			stl = &st->next;
+		}
+	}
+
+	dumpsrat(srat);
+	return nil;	/* can be unmapped once parsed */
+}
+
+/*
+ * Debug print of every distance in the rowlen x rowlen SLIT
+ * matrix, row by row, one value per line.
+ */
+static void
+dumpslit(Slit *sl)
+{
+	int i, w, total;
+
+	DBG("acpi slit:\n");
+	w = sl->rowlen;
+	total = w*w;
+	for(i = 0; i < total; i++)
+		DBG("slit: %ux\n", sl->e[i/w][i%w].dist);
+}
+
+/*
+ * qsort comparison for SlEntry: orders by increasing dist
+ * (returns the difference of the two distances).
+ */
+static int
+cmpslitent(void* v1, void* v2)
+{
+	SlEntry *a, *b;
+
+	a = v1;
+	b = v2;
+	return a->dist - b->dist;
+}
+
+/*
+ * Parse the SLIT at p (len bytes) into the global slit: the
+ * locality count is at offset 36 and a count x count matrix of
+ * one-byte distances follows at offset 44.  Each matrix row is then
+ * sorted by increasing distance, so that e[i][k].dom is the k-th
+ * closest domain to domain i.
+ * A table whose matrix does not fit in len bytes is rejected and
+ * slit is left nil.
+ * Always returns nil: the caller may unmap the table afterwards.
+ * Panics if out of memory.
+ */
+static Atable*
+acpislit(uchar *p, int len)
+{
+	int i, j, k, n;
+	SlEntry *se;
+
+	n = l64get(p+36);
+	/* n > (len-44)/n is n*n > len-44 without the overflow */
+	if(n <= 0 || len < 44 || n > (len-44)/n){
+		print("acpi: slit: bad number of localities %d\n", n);
+		return nil;
+	}
+	slit = malloc(sizeof(*slit));
+	if(slit == nil)
+		panic("acpi: no memory for slit");
+	slit->rowlen = n;
+	slit->e = malloc(n*sizeof(SlEntry*));
+	if(slit->e == nil)
+		panic("acpi: no memory for slit");
+	for(i = 0; i < n; i++){
+		slit->e[i] = malloc(sizeof(SlEntry)*n);
+		if(slit->e[i] == nil)
+			panic("acpi: no memory for slit");
+	}
+
+	/* exactly n*n entries: bytes past the matrix are ignored */
+	p += 44;
+	for(i = 0; i < n*n; i++){
+		j = i/n;
+		k = i%n;
+		se = &slit->e[j][k];
+		se->dom = k;
+		se->dist = p[i];
+	}
+	dumpslit(slit);
+	for(i = 0; i < n; i++)
+		qsort(slit->e[i], n, sizeof(slit->e[0][0]), cmpslitent);
+	
+	dumpslit(slit);
+	return nil;	/* can be unmapped once parsed */
+}
+
+/*
+ * Find the SRAT memory entry containing physical address addr.
+ * On a match, stores the entry's domain in *dom and returns the
+ * number of bytes from addr to the end of that entry.
+ * Returns 0 (leaving *dom untouched) if no entry contains addr.
+ */
+uintmem
+acpimblocksize(uintmem addr, int *dom)
+{
+	Srat *sl;
+
+	for(sl = srat; sl != nil; sl = sl->next){
+		if(sl->type != SRmem)
+			continue;
+		if(addr < sl->mem.addr || sl->mem.addr + sl->mem.len <= addr)
+			continue;
+		*dom = sl->mem.dom;
+		return sl->mem.len - (addr - sl->mem.addr);
+	}
+	return 0;
+}
+
+
+/*
+ * we use mp->machno (or index in Mach array) as the identifier,
+ * but ACPI relies on the apic identifier.
+ */
+/*
+ * Return the SRAT domain ("color") of processor number core,
+ * found by matching the processor's apicno against the SRAT
+ * local APIC entries, or -1 if core is out of range, has no Mach
+ * or has no matching entry.
+ * Results for the first nelem(colors) cores are cached, stored
+ * as domain+1 so that 0 means "not yet known".
+ */
+int
+corecolor(int core)
+{
+	Srat *sl;
+	Mach *mp;
+	int cached;
+	static int colors[32];
+
+	if(core < 0 || core >= MACHMAX)
+		return -1;
+	mp = sys->machptr[core];
+	if(mp == nil)
+		return -1;
+
+	cached = core < nelem(colors);
+	if(cached && colors[core] != 0)
+		return colors[core] - 1;
+
+	for(sl = srat; sl != nil; sl = sl->next){
+		if(sl->type != SRlapic || sl->lapic.apic != mp->apicno)
+			continue;
+		if(cached)
+			colors[core] = 1 + sl->lapic.dom;
+		return sl->lapic.dom;
+	}
+	return -1;
+}
+
+
+/*
+ * Pick a core number for a process of color mycolor: MACHMAX is
+ * split evenly among the SLIT localities and index walks through
+ * them in order of increasing distance from mycolor.
+ * Returns 0 when there is no usable SLIT or when mycolor or index
+ * do not fall inside it (mycolor may be -1 if the caller's color
+ * is unknown).
+ */
+int
+pickcore(int mycolor, int index)
+{
+	int color;
+	int ncorepercol;
+
+	if(slit == nil || slit->rowlen <= 0)
+		return 0;
+	ncorepercol = MACHMAX/slit->rowlen;
+	/* more localities than cores: nothing sensible to divide by */
+	if(ncorepercol == 0)
+		return 0;
+	if(mycolor < 0 || mycolor >= slit->rowlen)
+		return 0;
+	if(index < 0 || index/ncorepercol >= slit->rowlen)
+		return 0;
+	color = slit->e[mycolor][index/ncorepercol].dom;
+	return color * ncorepercol + index % ncorepercol;
+}
+
+
+/*
+ * Debug print of the MADT: the local APIC address and pcat flag,
+ * then one line per entry formatted according to the entry type.
+ */
+static void
+dumpmadt(Madt *apics)
+{
+	Apicst *st;
+
+	DBG("acpi: madt lapic paddr %llux pcat %d:\n", apics->lapicpa, apics->pcat);
+	for(st = apics->st; st != nil; st = st->next)
+		switch(st->type){
+		case ASlapic:
+			DBG("\tlapic pid %d id %d\n", st->lapic.pid, st->lapic.id);
+			break;
+		/* both kinds are printed through the ioapic member */
+		case ASioapic:
+		case ASiosapic:
+			DBG("\tioapic id %d addr %#llux ibase %d\n",
+				st->ioapic.id, st->ioapic.addr, st->ioapic.ibase);
+			break;
+		case ASintovr:
+			DBG("\tintovr irq %d intr %d flags %#ux\n",
+				st->intovr.irq, st->intovr.intr,st->intovr.flags);
+			break;
+		case ASnmi:
+			DBG("\tnmi intr %d flags %#ux\n",
+				st->nmi.intr, st->nmi.flags);
+			break;
+		case ASlnmi:
+			DBG("\tlnmi pid %d lint %d flags %#ux\n",
+				st->lnmi.pid, st->lnmi.lint, st->lnmi.flags);
+			break;
+		case ASlsapic:
+			DBG("\tlsapic pid %d id %d eid %d puid %d puids %s\n",
+				st->lsapic.pid, st->lsapic.id,
+				st->lsapic.eid, st->lsapic.puid,
+				st->lsapic.puids);
+			break;
+		case ASintsrc:
+			DBG("\tintr type %d pid %d peid %d iosv %d intr %d %#x\n",
+				st->type, st->intsrc.pid,
+				st->intsrc.peid, st->intsrc.iosv,
+				st->intsrc.intr, st->intsrc.flags);
+			break;
+		case ASlx2apic:
+			DBG("\tlx2apic puid %d id %d\n", st->lx2apic.puid, st->lx2apic.id);
+			break;
+		case ASlx2nmi:
+			DBG("\tlx2nmi puid %d intr %d flags %#ux\n",
+				st->lx2nmi.puid, st->lx2nmi.intr, st->lx2nmi.flags);
+			break;
+		default:
+			DBG("\t<unknown madt entry>\n");
+		}
+	DBG("\n");
+}
+
+/*
+ * Parse the MADT found at p (len bytes, header included) into the
+ * global apics: the local apic address and pcat flag come from the
+ * fixed part, and every interrupt controller structure after byte
+ * 44 becomes an Apicst appended to apics->st.  Entries whose flags
+ * word is 0 (lapic, lsapic, lx2apic) and entries of unknown type
+ * are dropped.  An iosapic entry overrides an ioapic entry with
+ * the same id.
+ *
+ * Each entry starts with a type byte and a length byte.  The length
+ * is validated before use: a length below 2 would never advance p
+ * (endless loop) and a length running past the table would make
+ * the field reads below go out of bounds.  Parsing stops at the
+ * first such entry.
+ *
+ * Always returns nil, so the caller may unmap the table.
+ */
+static Atable*
+acpimadt(uchar *p, int len)
+{
+	uchar *pe;
+	Apicst *st, *l, **stl;
+	int stlen, id;
+
+	apics = mallocz(sizeof(Madt), 1);
+	if(apics == nil){
+		print("acpi: madt: no memory\n");
+		return nil;
+	}
+	apics->lapicpa = l32get(p+36);
+	apics->pcat = l32get(p+40);
+	apics->st = nil;
+	stl = &apics->st;
+	pe = p + len;
+	for(p += 44; p < pe; p += stlen){
+		if(pe - p < 2 || p[1] < 2 || p[1] > pe - p){
+			print("acpi: madt: bad entry length\n");
+			break;
+		}
+		st = mallocz(sizeof(Apicst), 1);
+		if(st == nil){
+			print("acpi: madt: no memory\n");
+			break;
+		}
+		st->type = p[0];
+		st->next = nil;
+		stlen = p[1];
+		switch(st->type){
+		case ASlapic:
+			st->lapic.pid = p[2];
+			st->lapic.id = p[3];
+			/* flags == 0: processor not usable */
+			if(l32get(p+4) == 0){
+				free(st);
+				st = nil;
+			}
+			break;
+		case ASioapic:
+			st->ioapic.id = id = p[2];
+			st->ioapic.addr = l32get(p+4);
+			st->ioapic.ibase = l32get(p+8);
+			/* iosapic overrides any ioapic entry for the same id */
+			for(l = apics->st; l != nil; l = l->next)
+				if(l->type == ASiosapic && l->iosapic.id == id){
+					st->ioapic = l->iosapic;
+					/* we leave it linked; could be removed */
+					break;
+				}
+			break;
+		case ASintovr:
+			st->intovr.irq = p[3];
+			st->intovr.intr = l32get(p+4);
+			st->intovr.flags = l16get(p+8);
+			break;
+		case ASnmi:
+			st->nmi.flags = l16get(p+2);
+			st->nmi.intr = l32get(p+4);
+			break;
+		case ASlnmi:
+			st->lnmi.pid = p[2];
+			st->lnmi.flags = l16get(p+3);
+			st->lnmi.lint = p[5];
+			break;
+		case ASladdr:
+			/* This is for 64 bits, perhaps we should not
+			 * honor it on 32 bits.
+			 */
+			apics->lapicpa = l64get(p+8);
+			break;
+		case ASiosapic:
+			id = st->iosapic.id = p[2];
+			st->iosapic.ibase = l32get(p+4);
+			st->iosapic.addr = l64get(p+8);
+			/* iosapic overrides any ioapic entry for the same id */
+			for(l = apics->st; l != nil; l = l->next)
+				if(l->type == ASioapic && l->ioapic.id == id){
+					l->ioapic = st->iosapic;
+					free(st);
+					st = nil;
+					break;
+				}
+			break;
+		case ASlsapic:
+			st->lsapic.pid = p[2];
+			st->lsapic.id = p[3];
+			st->lsapic.eid = p[4];
+			st->lsapic.puid = l32get(p+12);
+			if(l32get(p+8) == 0){
+				free(st);
+				st = nil;
+			}else
+				kstrdup(&st->lsapic.puids, (char*)p+16);
+			break;
+		case ASintsrc:
+			st->intsrc.flags = l16get(p+2);
+			/*
+			 * NOTE(review): this replaces the entry type
+			 * (ASintsrc) with the byte at p[4]; confirm that
+			 * is intended, since dumpmadt switches on st->type.
+			 */
+			st->type = p[4];
+			st->intsrc.pid = p[5];
+			st->intsrc.peid = p[6];
+			st->intsrc.iosv = p[7];
+			st->intsrc.intr = l32get(p+8);
+			st->intsrc.any = l32get(p+12);
+			break;
+		case ASlx2apic:
+			st->lx2apic.id = l32get(p+4);
+			st->lx2apic.puid = l32get(p+12);
+			if(l32get(p+8) == 0){
+				free(st);
+				st = nil;
+			}
+			break;
+		case ASlx2nmi:
+			st->lx2nmi.flags = l16get(p+2);
+			st->lx2nmi.puid = l32get(p+4);
+			st->lx2nmi.intr = p[8];
+			break;
+		default:
+			print("unknown APIC structure\n");
+			free(st);
+			st = nil;
+		}
+		/* append surviving entries, preserving table order */
+		if(st != nil){
+			*stl = st;
+			stl = &st->next;
+		}
+	}
+
+	dumpmadt(apics);
+	return nil;	/* can be unmapped once parsed */
+}
+
+/*
+ * Handler for tables that are simply kept: hand the mapped table
+ * to newtable and return what it returns.  Anything shorter than
+ * an SDT header is refused with nil.  A non-nil result tells
+ * acpixsdtload to leave the table mapped.
+ */
+static Atable*
+acpitable(uchar *p, int len)
+{
+	if(len >= Sdthdrsz)
+		return newtable(p);
+	return nil;
+}
+
+/*
+ * Debugging aid: hex dump the table at p (l bytes) labelled with
+ * sig, 16 bytes per line.  Nothing is printed unless DBGFLG > 1.
+ * With DBGFLG > 2 the whole table is dumped; otherwise at most the
+ * first 256 bytes.  The dump never goes beyond l bytes, so short
+ * tables are not read past their mapping.
+ */
+static void
+dumptable(char *sig, uchar *p, int l)
+{
+	int n, i;
+
+	if(DBGFLG > 1){
+		DBG("%s @ %#p\n", sig, p);
+		n = l;
+		if(DBGFLG <= 2 && n > 256)
+			n = 256;
+		for(i = 0; i < n; i++){
+			if((i % 16) == 0)
+				DBG("%x: ", i);
+			DBG(" %2.2ux", p[i]);
+			if((i % 16) == 15)
+				DBG("\n");
+		}
+		DBG("\n");
+		DBG("\n");
+	}
+}
+
+/*
+ * Format table t (header plus t->dlen data bytes) as a hex dump
+ * into the buffer s..e with seprint, 16 bytes per row, each row
+ * preceded by its offset.  Returns the seprint cursor after the
+ * trailing blank line.
+ */
+static char*
+seprinttable(char *s, char *e, Atable *t)
+{
+	uchar *bytes;
+	int row, col, total;
+
+	bytes = (uchar*)t->tbl;	/* include header */
+	total = Sdthdrsz + t->dlen;
+	s = seprint(s, e, "%s @ %#p\n", t->sig, bytes);
+	for(row = 0; row < total; row += 16){
+		s = seprint(s, e, "%x: ", row);
+		for(col = 0; col < 16 && row+col < total; col++)
+			s = seprint(s, e, " %2.2ux", bytes[row+col]);
+		/* only complete rows are terminated here */
+		if(col == 16)
+			s = seprint(s, e, "\n");
+	}
+	return seprint(s, e, "\n\n");
+}
+
+/*
+ * Process the xsdt table and load tables with signature sig, or
+ * all of them if sig is nil.
+ * (XXX: should be able to search for sig, oemid, oemtblid)
+ *
+ * xsdt->p/len describe the array of table addresses and
+ * xsdt->asize the size of each address (8 or 4 bytes).  Each
+ * table is mapped with sdtmap; if its signature is selected and
+ * matches an entry of ptables, that entry's handler is called.
+ * The table is unmapped again unless a handler returned non-nil.
+ * Returns 1 if at least one handler was called, 0 otherwise.
+ */
+static int
+acpixsdtload(char *sig)
+{
+	int i, l, t, unmap, found;
+	uintptr dhpa;
+	uchar *sdt;
+	char tsig[5];
+
+	found = 0;
+	for(i = 0; i < xsdt->len; i += xsdt->asize){
+		if(xsdt->asize == 8)
+			dhpa = l64get(xsdt->p+i);
+		else
+			dhpa = l32get(xsdt->p+i);
+		/* tables that cannot be mapped are silently skipped */
+		if((sdt = sdtmap(dhpa, &l, 1)) == nil)
+			continue;
+		unmap = 1;
+		/* the first 4 bytes of the table are its signature */
+		memmove(tsig, sdt, 4);
+		tsig[4] = 0;
+		if(sig == nil || strcmp(sig, tsig) == 0){
+			DBG("acpi: %s addr %#p\n", tsig, sdt);
+			for(t = 0; t < nelem(ptables); t++)
+				if(strcmp(tsig, ptables[t].sig) == 0){
+					dumptable(tsig, sdt, l);
+					/* a non-nil result means the handler keeps the mapping */
+					unmap = ptables[t].f(sdt, l) == nil;
+					found = 1;
+					break;
+				}
+		}
+		if(unmap)
+			vunmap(sdt, l);
+	}
+	return found;
+}
+
+/*
+ * Scan len bytes starting at addr, in 16-byte steps, for the given
+ * signature string.  Returns a pointer to the first match or nil.
+ */
+static void*
+rsdscan(u8int* addr, int len, char* signature)
+{
+	int siglen;
+	u8int *end, *cur;
+
+	siglen = strlen(signature);
+	end = addr+len;
+	cur = addr;
+	while(cur+siglen < end){
+		if(memcmp(cur, signature, siglen) == 0)
+			return cur;
+		cur += 16;
+	}
+	return nil;
+}
+
+/*
+ * Locate a BIOS data structure by its signature.  Returns a
+ * pointer to it, or nil if it was not found.
+ */
+static void*
+rsdsearch(char* signature)
+{
+	uintptr p;
+	u8int *bda;
+	void *rsd;
+
+	/*
+	 * Search for the data structure signature:
+	 * 1) in the first KB of the EBDA;
+	 * 2) in the BIOS ROM between 0xE0000 and 0xFFFFF.
+	 *
+	 * The 16-bit value at 40:0E in the BIOS data area is the
+	 * real-mode segment of the EBDA, so it has to be shifted
+	 * left by 4 to obtain the physical address to scan.
+	 */
+	if(strncmp((char*)KADDR(0xFFFD9), "EISA", 4) == 0){
+		bda = BIOSSEG(0x40);
+		if((p = (bda[0x0F]<<8)|bda[0x0E])){
+			if(rsd = rsdscan(KADDR(p<<4), 1024, signature))
+				return rsd;
+		}
+	}
+	return rsdscan(BIOSSEG(0xE000), 0x20000, signature);
+}
+
+/*
+ * Find the RSD PTR structure, verify its checksum, then map the
+ * RSDT (revision < 2, 4-byte entries) or XSDT (revision >= 2,
+ * 8-byte entries) it points to and load every table listed there
+ * through acpixsdtload(nil).
+ *
+ * On success the global xsdt describes the entry array (header
+ * skipped) and stays mapped.  On any failure xsdt is left nil and
+ * nothing stays allocated or mapped.
+ */
+static void
+acpirsdptr(void)
+{
+	Rsdp *rsd;
+	int asize;
+	uintptr sdtpa;
+
+	if((rsd = rsdsearch("RSD PTR ")) == nil)
+		return;
+
+	assert(sizeof(Sdthdr) == 36);
+
+	DBG("acpi: RSD PTR@ %#p, physaddr %#ux length %ud %#llux rev %d\n",
+		rsd, l32get(rsd->raddr), l32get(rsd->length),
+		l64get(rsd->xaddr), rsd->revision);
+
+	if(rsd->revision >= 2){
+		if(sdtchecksum(rsd, 36) == nil){
+			DBG("acpi: RSD: bad checksum\n");
+			return;
+		}
+		sdtpa = l64get(rsd->xaddr);
+		asize = 8;
+	}
+	else{
+		if(sdtchecksum(rsd, 20) == nil){
+			DBG("acpi: RSD: bad checksum\n");
+			return;
+		}
+		sdtpa = l32get(rsd->raddr);
+		asize = 4;
+	}
+
+	/*
+	 * process the RSDT or XSDT table.
+	 */
+	xsdt = malloc(sizeof(Xsdt));
+	if(xsdt == nil){
+		DBG("acpi: malloc failed\n");
+		return;
+	}
+	if((xsdt->p = sdtmap(sdtpa, &xsdt->len, 1)) == nil){
+		DBG("acpi: sdtmap failed\n");
+		/* do not leave a half-initialized xsdt behind */
+		free(xsdt);
+		xsdt = nil;
+		return;
+	}
+	if((xsdt->p[0] != 'R' && xsdt->p[0] != 'X') || memcmp(xsdt->p+1, "SDT", 3) != 0){
+		DBG("acpi: xsdt sig: %c%c%c%c\n",
+			xsdt->p[0], xsdt->p[1], xsdt->p[2], xsdt->p[3]);
+		/* unmap the table itself, and do so before xsdt is freed */
+		vunmap(xsdt->p, xsdt->len);
+		free(xsdt);
+		xsdt = nil;
+		return;
+	}
+	/* from here on p/len describe only the array of table addresses */
+	xsdt->p += sizeof(Sdthdr);
+	xsdt->len -= sizeof(Sdthdr);
+	xsdt->asize = asize;
+	DBG("acpi: XSDT %#p\n", xsdt);
+	acpixsdtload(nil);
+	/* xsdt is kept and not unmapped */
+
+}
+
+/*
+ * Directory generator for the acpi device.  DEVDOTDOT yields the
+ * directory itself; otherwise entry i of the listing is element
+ * i+1 of tab, because element 0 of tab is "." itself.  Returns 1
+ * after filling dp and -1 past the end of the table.
+ */
+static int
+acpigen(Chan *c, char*, Dirtab *tab, int ntab, int i, Dir *dp)
+{
+	Qid qid;
+	Dirtab *ent;
+	int slot;
+
+	if(i == DEVDOTDOT){
+		mkqid(&qid, Qdir, 0, QTDIR);
+		devdir(c, qid, ".", 0, eve, 0555, dp);
+		return 1;
+	}
+
+	slot = i + 1;	/* skip first element for . itself */
+	if(tab == nil || slot >= ntab)
+		return -1;
+
+	ent = &tab[slot];
+	qid = ent->qid;
+	qid.path &= ~Qdir;
+	qid.vers = 0;
+	devdir(c, qid, ent->name, ent->length, eve, ent->perm, dp);
+	return 1;
+}
+
+/*
+ * Format routine installed for the 'G' verb (see acpiinit): prints
+ * a Gas* argument as "[space off N len N addr A szN]".
+ */
+static int
+Gfmt(Fmt* f)
+{
+	/* names indexed by g->spc for the spaces printed via rnames below */
+	static char* rnames[] = {
+			"mem", "io", "pcicfg", "embed",
+			"smb", "cmos", "pcibar", "ipmi"};
+	Gas *g;
+
+	g = va_arg(f->args, Gas*);
+	switch(g->spc){
+	case Rsysmem:
+	case Rsysio:
+	case Rembed:
+	case Rsmbus:
+	case Rcmos:
+	case Rpcibar:
+	case Ripmi:
+		/* assumes these constants are all < nelem(rnames) — TODO confirm against the enum */
+		fmtprint(f, "[%s ", rnames[g->spc]);
+		break;
+	case Rpcicfg:
+		/* for pci config space the address is split into dev, fn and register fields */
+		fmtprint(f, "[pci ");
+		fmtprint(f, "dev %#ulx ", (ulong)(g->addr >> 32) & 0xFFFF);
+		fmtprint(f, "fn %#ulx ", (ulong)(g->addr & 0xFFFF0000) >> 16);
+		fmtprint(f, "adr %#ulx ", (ulong)(g->addr &0xFFFF));
+		break;
+	case Rfixedhw:
+		fmtprint(f, "[hw ");
+		break;
+	default:
+		fmtprint(f, "[spc=%#ux ", g->spc);
+	}
+	return fmtprint(f, "off %d len %d addr %#ullx sz%d]",
+		g->off, g->len, g->addr, g->accsz);
+}
+
+/*
+ * Read a register that may be split over two I/O port blocks:
+ * the values read from ra and rb (each skipped when 0) are or'ed
+ * together.  sz is the access width in bytes (1, 2 or 4); any
+ * other width prints a complaint and yields 0.
+ */
+static uint
+getbanked(uintptr ra, uintptr rb, int sz)
+{
+	uint v;
+
+	v = 0;
+	if(sz == 1){
+		if(ra != 0)
+			v |= inb(ra);
+		if(rb != 0)
+			v |= inb(rb);
+	}else if(sz == 2){
+		if(ra != 0)
+			v |= ins(ra);
+		if(rb != 0)
+			v |= ins(rb);
+	}else if(sz == 4){
+		if(ra != 0)
+			v |= inl(ra);
+		if(rb != 0)
+			v |= inl(rb);
+	}else
+		print("getbanked: wrong size\n");
+	return v;
+}
+
+/*
+ * Write v to a register that may be split over two I/O port
+ * blocks: both ra and rb (each skipped when 0) receive v.  sz is
+ * the access width in bytes (1, 2 or 4); any other width prints a
+ * complaint.  The return value is always (uint)-1.
+ */
+static uint
+setbanked(uintptr ra, uintptr rb, int sz, int v)
+{
+	if(sz == 1){
+		if(ra != 0)
+			outb(ra, v);
+		if(rb != 0)
+			outb(rb, v);
+	}else if(sz == 2){
+		if(ra != 0)
+			outs(ra, v);
+		if(rb != 0)
+			outs(rb, v);
+	}else if(sz == 4){
+		if(ra != 0)
+			outl(ra, v);
+		if(rb != 0)
+			outl(rb, v);
+	}else
+		print("setbanked: wrong size\n");
+	return (uint)-1;
+}
+
+/* Read the PM1 control register from the pm1a/pm1b control blocks. */
+static uint
+getpm1ctl(void)
+{
+	uint ctl;
+
+	ctl = getbanked(fadt.pm1acntblk, fadt.pm1bcntblk, fadt.pm1cntlen);
+	return ctl;
+}
+
+/*
+ * Write v to the PM1 status register, which occupies the first
+ * half of the pm1a/pm1b event blocks.
+ */
+static void
+setpm1sts(uint v)
+{
+	int half;
+
+	DBG("acpi: setpm1sts %#ux\n", v);
+	half = fadt.pm1evtlen/2;
+	setbanked(fadt.pm1aevtblk, fadt.pm1bevtblk, half, v);
+}
+
+/*
+ * Read the PM1 status register, which occupies the first half of
+ * the pm1a/pm1b event blocks.
+ */
+static uint
+getpm1sts(void)
+{
+	int half;
+
+	half = fadt.pm1evtlen/2;
+	return getbanked(fadt.pm1aevtblk, fadt.pm1bevtblk, half);
+}
+
+/*
+ * Read the PM1 enable register, which occupies the second half of
+ * the pm1a/pm1b event blocks (right after the status register).
+ */
+static uint
+getpm1en(void)
+{
+	int half;
+	uintptr ena, enb;
+
+	half = fadt.pm1evtlen/2;
+	ena = fadt.pm1aevtblk+half;
+	enb = fadt.pm1bevtblk+half;
+	return getbanked(ena, enb, half);
+}
+
+/* Return non-zero if the enable bit of gpe n is set. */
+static int
+getgpeen(int n)
+{
+	Gpe *g;
+
+	g = &gpes[n];
+	return inb(g->enio) & 1<<g->enbit;
+}
+
+/*
+ * Set (v != 0) or clear (v == 0) the enable bit of gpe n with a
+ * read-modify-write of its enable port.
+ */
+static void
+setgpeen(int n, uint v)
+{
+	Gpe *g;
+	int cur, bit;
+
+	DBG("acpi: setgpe %d %d\n", n, v);
+	g = &gpes[n];
+	bit = 1<<g->enbit;
+	cur = inb(g->enio);
+	if(v)
+		cur |= bit;
+	else
+		cur &= ~bit;
+	outb(g->enio, cur);
+}
+
+/* Write the status bit of gpe n to its status port. */
+static void
+clrgpests(int n)
+{
+	Gpe *g;
+
+	g = &gpes[n];
+	outb(g->stsio, 1<<g->stsbit);
+}
+
+/* Return non-zero if the status bit of gpe n is set. */
+static uint
+getgpests(int n)
+{
+	Gpe *g;
+
+	g = &gpes[n];
+	return inb(g->stsio) & 1<<g->stsbit;
+}
+
+/*
+ * ACPI (SCI) interrupt handler.  Every gpe whose status bit is
+ * set is disabled and has its status cleared; then the PM1 status
+ * and enable registers are reported and the status value is
+ * written back.  For now events are only printed, not served.
+ */
+static void
+acpiintr(Ureg*, void*)
+{
+	int i;
+	uint sts, en;
+
+	print("acpi: intr\n");
+
+	for(i = 0; i < ngpes; i++){
+		if(!getgpests(i))
+			continue;
+		print("gpe %d on\n", i);
+		en = getgpeen(i);
+		setgpeen(i, 0);
+		clrgpests(i);
+		if(en != 0)
+			print("acpiitr: calling gpe %d\n", i);
+		/*
+		 * TODO: queue gpe for calling gpe->ho in the
+		 * aml process and enable it again when it returns.
+		 */
+	}
+
+	sts = getpm1sts();
+	en = getpm1en();
+	print("acpiitr: pm1sts %#ux pm1en %#ux\n", sts, en);
+	if(sts&en)
+		print("have enabled events\n");
+	if(sts&1)
+		print("power button\n");
+	/* XXX serve other interrupts here. */
+	setpm1sts(sts);
+}
+
+/*
+ * Build the gpes array from the two FADT gpe blocks and leave
+ * every gpe disabled with its status cleared.  For each block the
+ * status bits come first and the enable bits follow at an offset
+ * of half the block length.
+ * NOTE(review): n0/n1 are half the block length (a byte count)
+ * yet are used as the number of events and as a bit offset;
+ * confirm against the FADT definition.
+ */
+static void
+initgpes(void)
+{
+	int i, n0, n1;
+	Gpe *g;
+
+	n0 = fadt.gpe0blklen/2;
+	n1 = fadt.gpe1blklen/2;
+	ngpes = n0 + n1;
+	gpes = mallocz(sizeof(Gpe) * ngpes, 1);
+
+	/* block 0 */
+	for(i = 0; i < n0; i++){
+		g = &gpes[i];
+		g->nb = i;
+		g->stsbit = i&7;
+		g->stsio = fadt.gpe0blk + (i>>3);
+		g->enbit = (n0 + i)&7;
+		g->enio = fadt.gpe0blk + ((n0 + i)>>3);
+	}
+
+	/* block 1, numbered from fadt.gp1base */
+	for(i = 0; i < n1; i++){
+		g = &gpes[n0 + i];
+		g->nb = fadt.gp1base + i;
+		g->stsbit = i&7;
+		g->stsio = fadt.gpe1blk + (i>>3);
+		g->enbit = (n1 + i)&7;
+		g->enio = fadt.gpe1blk + ((n1 + i)>>3);
+	}
+
+	for(i = 0; i < ngpes; i++){
+		setgpeen(i, 0);
+		clrgpests(i);
+	}
+}
+
+/* Claim len I/O ports at addr for "acpi"; an address of 0 is ignored. */
+static void
+acpiioalloc(uint addr, int len)
+{
+	if(addr == 0)
+		return;
+	ioalloc(addr, len, 0, "acpi");
+}
+
+/*
+ * Make sure the ACPI tables have been parsed.  The work is done
+ * only while fadt.smicmd is still 0: the 'G' format is installed
+ * and the tables are loaded through acpirsdptr.  Returns 0 when
+ * fadt.smicmd is set, -1 otherwise.
+ */
+int
+acpiinit(void)
+{
+	if(fadt.smicmd != 0)
+		return 0;
+
+	fmtinstall('G', Gfmt);
+	acpirsdptr();
+	return fadt.smicmd == 0 ? -1 : 0;
+}
+
+/*
+ * Attach to the acpi device.  As written this always fails with
+ * "no acpi": the "1 ||" below disables the device, so everything
+ * after the first test is currently unreachable.
+ *
+ * The disabled part claims the I/O ports named by the FADT, sets
+ * up the gpes, asks the firmware to enable ACPI through the SMI
+ * command port, and installs the SCI interrupt handler.
+ */
+static Chan*
+acpiattach(char *spec)
+{
+	int i;
+
+	/*
+	 * This was written for the stock kernel.
+	 * This code must use 64-bit registers to be acpi ready in nix.
+	 */
+	if(1 || acpiinit() < 0)
+		error("no acpi");
+
+	/*
+	 * should use fadt->xpm* and fadt->xgpe* registers for 64 bits.
+	 * We are not ready in this kernel for that.
+	 */
+	DBG("acpi io alloc\n");
+	acpiioalloc(fadt.smicmd, 1);
+	acpiioalloc(fadt.pm1aevtblk, fadt.pm1evtlen);
+	acpiioalloc(fadt.pm1bevtblk, fadt.pm1evtlen );
+	acpiioalloc(fadt.pm1acntblk, fadt.pm1cntlen);
+	acpiioalloc(fadt.pm1bcntblk, fadt.pm1cntlen);
+	acpiioalloc(fadt.pm2cntblk, fadt.pm2cntlen);
+	acpiioalloc(fadt.pmtmrblk, fadt.pmtmrlen);
+	acpiioalloc(fadt.gpe0blk, fadt.gpe0blklen);
+	acpiioalloc(fadt.gpe1blk, fadt.gpe1blklen);
+
+	DBG("acpi init gpes\n");
+	initgpes();
+
+	/*
+	 * This starts ACPI, which may require we handle
+	 * power mgmt events ourselves. Use with care.
+	 */
+	DBG("acpi starting\n");
+	outb(fadt.smicmd, fadt.acpienable);
+	/* poll up to 10 times, back to back, for the SCI enable bit */
+	for(i = 0; i < 10; i++)
+		if(getpm1ctl() & Pm1SciEn)
+			break;
+	if(i == 10)
+		error("acpi: failed to enable\n");
+	if(fadt.sciint != 0)
+		intrenable(fadt.sciint, acpiintr, 0, BUSUNKNOWN, "acpi");
+	return devattach(L'α', spec);
+}
+
+/* Walk within the acpi directory using the generic devwalk. */
+static Walkqid*
+acpiwalk(Chan *c, Chan *nc, char **name, int nname)
+{
+	Walkqid *wq;
+
+	wq = devwalk(c, nc, name, nname, acpidir, nelem(acpidir), acpigen);
+	return wq;
+}
+
+/* Stat an acpi file using the generic devstat. */
+static long
+acpistat(Chan *c, uchar *dp, long n)
+{
+	long nstat;
+
+	nstat = devstat(c, dp, n, acpidir, nelem(acpidir), acpigen);
+	return nstat;
+}
+
+/* Open an acpi file using the generic devopen. */
+static Chan*
+acpiopen(Chan *c, int omode)
+{
+	Chan *oc;
+
+	oc = devopen(c, omode, acpidir, nelem(acpidir), acpigen);
+	return oc;
+}
+
+/* Closing an acpi file requires no work. */
+static void
+acpiclose(Chan *)
+{
+}
+
+/*
+ * Text served by reads of Qtbl: built once, on the first read,
+ * and kept.  tlen is the current size of the ttext buffer.
+ */
+static char*ttext;
+static int tlen;
+
+/*
+ * Read from an acpi file.
+ *	Qdir: directory listing.
+ *	Qtbl: hex dump of every table on the tfirst list (or
+ *	      "no tables\n" if the list is empty).
+ *	Qio:  data from the configured region, through regio.
+ * Other files raise Eperm.
+ */
+static long
+acpiread(Chan *c, void *a, long n, vlong off)
+{
+	long q;
+	Atable *t;
+	char *ns, *s, *e, *ntext;
+	int used;
+
+	q = c->qid.path;
+	switch(q){
+	case Qdir:
+		return devdirread(c, a, n, acpidir, nelem(acpidir), acpigen);
+	case Qtbl:
+		if(ttext == nil){
+			tlen = 1024;
+			ttext = malloc(tlen);
+			if(ttext == nil){
+				print("acpi: no memory\n");
+				return 0;
+			}
+			s = ttext;
+			e = ttext + tlen;
+			strcpy(s, "no tables\n");
+			for(t = tfirst; t != nil; t = t->next){
+				ns = seprinttable(s, e, t);
+				/*
+				 * seprint stops at e-1 when it runs out of
+				 * room: double the buffer and format this
+				 * table again from the same offset.
+				 */
+				while(ns == e - 1){
+					DBG("acpiread: allocated %d\n", tlen*2);
+					/*
+					 * remember where this table starts as an
+					 * offset; the old pointers are invalid
+					 * once realloc has moved the buffer.
+					 */
+					used = s - ttext;
+					ntext = realloc(ttext, tlen*2);
+					if(ntext == nil)
+						panic("acpi: no memory\n");
+					ttext = ntext;
+					tlen *= 2;
+					s = ttext + used;
+					e = ttext + tlen;
+					ns = seprinttable(s, e, t);
+				}
+				s = ns;
+			}
+
+		}
+		return readstr(off, a, n, ttext);
+	case Qio:
+		if(reg == nil)
+			error("region not configured");
+		return regio(reg, a, n, off, 0);
+	}
+	error(Eperm);
+	return -1;
+}
+
+/*
+ * Write to an acpi file.
+ *	Qio:  data for the configured region, through regio.
+ *	Qctl: a control message, one of
+ *		CMregion: fields 1-5 are name, space, base, len and
+ *			access size; (re)configures the region used by
+ *			Qio.
+ *		CMgpe: fields 1-2 are the gpe index and an object
+ *			name; records the name and enables that gpe.
+ * Writes to any other file raise Eperm.  Returns n on success.
+ */
+static long
+acpiwrite(Chan *c, void *a, long n, vlong off)
+{
+	Cmdtab *ct;
+	Cmdbuf *cb;
+	Reg *r;
+	uint rno, fun, dev, bus, i;
+
+	if(c->qid.path == Qio){
+		if(reg == nil)
+			error("region not configured");
+		return regio(reg, a, n, off, 1);
+	}
+	if(c->qid.path != Qctl)
+		error(Eperm);
+
+	cb = parsecmd(a, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+	ct = lookupcmd(cb, ctls, nelem(ctls));
+	DBG("acpi ctl %s\n", cb->f[0]);
+	switch(ct->index){
+	case CMregion:
+		/* reuse the current region if there is one */
+		r = reg;
+		if(r == nil){
+			r = smalloc(sizeof(Reg));
+			r->name = nil;
+		}
+		kstrdup(&r->name, cb->f[1]);
+		r->spc = acpiregid(cb->f[2]);
+		if(r->spc < 0){
+			free(r->name);
+			free(r);
+			reg = nil;
+			error("bad region type");
+		}
+		r->base = strtoull(cb->f[3], nil, 0);
+		r->len = strtoull(cb->f[4], nil, 0);
+		r->accsz = strtoul(cb->f[5], nil, 0);
+		if(r->accsz < 1 || r->accsz > 4){
+			free(r->name);
+			free(r);
+			reg = nil;
+			error("bad region access size");
+		}
+		/*
+		 * For pci spaces the base just parsed encodes bus, device,
+		 * function and register.  Decode it only now that it has
+		 * been set, and keep the register as the base.
+		 */
+		if(r->spc == Rpcicfg || r->spc == Rpcibar){
+			rno = r->base>>Rpciregshift & Rpciregmask;
+			fun = r->base>>Rpcifunshift & Rpcifunmask;
+			dev = r->base>>Rpcidevshift & Rpcidevmask;
+			bus = r->base>>Rpcibusshift & Rpcibusmask;
+			r->tbdf = MKBUS(BusPCI, bus, dev, fun);
+			r->base = rno;	/* register ~ our base addr */
+		}
+		reg = r;
+		DBG("region %s %s %llux %llux sz%d",
+			r->name, acpiregstr(r->spc), r->base, r->len, r->accsz);
+		break;
+	case CMgpe:
+		i = strtoul(cb->f[1], nil, 0);
+		if(i >= ngpes)
+			error("gpe out of range");
+		kstrdup(&gpes[i].obj, cb->f[2]);
+		DBG("gpe %d %s\n", i, gpes[i].obj);
+		setgpeen(i, 1);
+		break;
+	default:
+		panic("acpi: unknown ctl");
+	}
+	poperror();
+	free(cb);
+	return n;
+}
+
+
+/*
+ * Device table entry for the acpi device, '#α'.  Operations
+ * without an acpi-specific version use the generic dev* routines.
+ */
+Dev acpidevtab = {
+	L'α',
+	"acpi",
+
+	devreset,
+	devinit,
+	devshutdown,
+	acpiattach,
+	acpiwalk,
+	acpistat,
+	acpiopen,
+	devcreate,
+	acpiclose,
+	acpiread,
+	devbread,
+	acpiwrite,
+	devbwrite,
+	devremove,
+	devwstat,
+};

+ 632 - 0
sys/src/9/k10/devarch.c

@@ -0,0 +1,632 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ureg.h"
+
+/*
+ * One allocated (or reserved) range of I/O ports.  Ranges are kept
+ * on a singly linked list.
+ */
+typedef struct IOMap IOMap;
+struct IOMap
+{
+	IOMap	*next;		/* next range on the list */
+	int	reserved;	/* set by ioreserve, cleared when ioalloc claims the range */
+	char	tag[13];	/* owner name, NUL terminated (at most 12 characters) */
+	ulong	start;		/* first port of the range */
+	ulong	end;		/* one past the last port (start + size) */
+};
+
+/*
+ * The I/O port allocation state.  The embedded Lock is taken by
+ * ioreserve, ioalloc, iofree and the ioalloc read in archread.
+ */
+static struct
+{
+	Lock;
+	IOMap	*map;			// list of ranges in use
+	IOMap	*free;			// list of unused IOMap structures
+	IOMap	maps[32];		// some initial free maps
+
+	QLock	ql;			// lock for reading map
+} iomap;
+
+/*
+ * Qid paths of the files in #P.  A file's path is also its index
+ * into readfn and writefn.
+ */
+enum {
+	Qdir = 0,
+	Qioalloc = 1,
+	Qiob,
+	Qiow,
+	Qiol,
+	Qbase,		/* initial value of narchdir: first slot handed out by addarchfile */
+	Qmapram,
+
+	Qmax = 16,	/* size of archdir, readfn and writefn */
+};
+
+/* Signature of the read and write handlers registered with addarchfile. */
+typedef long Rdwrfn(Chan*, void*, long, vlong);
+
+/* Handlers of files added by addarchfile, indexed by qid path. */
+static Rdwrfn *readfn[Qmax];
+static Rdwrfn *writefn[Qmax];
+
+/*
+ * Directory of #P.  Only the first narchdir entries are visible.
+ * NOTE(review): "mapram" sits at index 5, which equals Qbase, the
+ * initial narchdir; it is therefore not listed and its slot is
+ * overwritten by the first addarchfile call.  Confirm whether
+ * that is intended.
+ */
+static Dirtab archdir[Qmax] = {
+	".",		{ Qdir, 0, QTDIR },	0,	0555,
+	"ioalloc",	{ Qioalloc, 0 },	0,	0444,
+	"iob",		{ Qiob, 0 },		0,	0660,
+	"iow",		{ Qiow, 0 },		0,	0660,
+	"iol",		{ Qiol, 0 },		0,	0660,
+	"mapram",	{ Qmapram, 0 },	0,	0444,
+};
+Lock archwlock;	/* the lock is only for changing archdir */
+int narchdir = Qbase;	/* number of archdir entries in use */
+
+/*
+ * Add a file to the #P listing.  Once added, you can't delete it.
+ * You can't add a file with the same name as one already there,
+ * and you get a pointer to the Dirtab entry so you can do things
+ * like change the Qid version.  Changing the Qid path is disallowed.
+ *
+ * rdfn and wrfn are called for reads and writes of the new file;
+ * either may be nil.  Returns nil if the name does not fit in a
+ * Dirtab name, if the table is full, or if the name is taken.
+ */
+Dirtab*
+addarchfile(char *name, int perm, Rdwrfn *rdfn, Rdwrfn *wrfn)
+{
+	int i;
+	Dirtab d;
+	Dirtab *dp;
+
+	/* refuse names that would overflow the fixed-size name field */
+	if(strlen(name) >= sizeof d.name)
+		return nil;
+
+	memset(&d, 0, sizeof d);
+	strcpy(d.name, name);
+	d.perm = perm;
+
+	lock(&archwlock);
+	if(narchdir >= Qmax){
+		unlock(&archwlock);
+		return nil;
+	}
+
+	for(i=0; i<narchdir; i++)
+		if(strcmp(archdir[i].name, name) == 0){
+			unlock(&archwlock);
+			return nil;
+		}
+
+	/* the qid path doubles as the index into readfn/writefn */
+	d.qid.path = narchdir;
+	archdir[narchdir] = d;
+	readfn[narchdir] = rdfn;
+	writefn[narchdir] = wrfn;
+	dp = &archdir[narchdir++];
+	unlock(&archwlock);
+
+	return dp;
+}
+
+/*
+ * Set up the I/O port allocator: chain all of iomap.maps onto the
+ * free list, allocate the historical "dummy" port, and then
+ * pre-allocate every range named by the "ioexclude" configuration
+ * variable, a comma separated list of start-end pairs.
+ */
+void
+ioinit(void)
+{
+	char *excluded, *s, *ends;
+	int i, io_s, io_e;
+
+	for(i = 0; i < nelem(iomap.maps)-1; i++)
+		iomap.maps[i].next = &iomap.maps[i+1];
+	iomap.maps[i].next = nil;
+	iomap.free = iomap.maps;
+
+	/*
+	 * Someone needs to explain why this was here...
+	 * i82557 is at 0x1000, the dummy entry is needed for
+	 * swappable devs.
+	 */
+	ioalloc(0x0fff, 1, 0, "dummy");
+
+	excluded = getconf("ioexclude");
+	if(excluded == nil)
+		return;
+
+	for(s = excluded; s != nil && *s != '\0' && *s != '\n'; s = ends){
+		/* start of the range, which must be followed by '-' */
+		io_s = (int)strtol(s, &ends, 0);
+		if(ends == nil || ends == s || *ends != '-'){
+			print("ioinit: cannot parse option string\n");
+			break;
+		}
+		s = ++ends;
+
+		/* end of the range, optionally followed by ',' */
+		io_e = (int)strtol(s, &ends, 0);
+		if(ends != nil && *ends == ',')
+			*ends++ = '\0';
+
+		ioalloc(io_s, io_e - io_s + 1, 0, "pre-allocated");
+	}
+}
+
+// Reserve a range to be ioalloced later.
+// This is in particular useful for exchangeable cards, such
+// as pcmcia and cardbus cards.
+//
+// The first (unnamed) argument is ignored: a port is always
+// searched for, starting at 0x400.  On success the range is
+// entered in the map with the reserved flag set and its first
+// port is returned.  Returns -1 if the search runs off the end
+// of the map list.  If no free IOMap is left, a message is
+// printed and the port is returned without being recorded.
+int
+ioreserve(int, int size, int align, char *tag)
+{
+	IOMap *map, **l;
+	int i, port;
+
+	lock(&iomap);
+	// find a free port above 0x400 and below 0x1000
+	// NOTE(review): when align > 0 the candidate port is only
+	// rounded up and never moved past map->end, and the gap test
+	// uses "> size"; confirm the search cannot return a port
+	// inside an existing range.
+	port = 0x400;
+	for(l = &iomap.map; *l; l = &(*l)->next){
+		map = *l;
+		if (map->start < 0x400)
+			continue;
+		i = map->start - port;
+		if(i > size)
+			break;
+		if(align > 0)
+			port = ((port+align-1)/align)*align;
+		else
+			port = map->end;
+	}
+	if(*l == nil){
+		unlock(&iomap);
+		return -1;
+	}
+	map = iomap.free;
+	if(map == nil){
+		print("ioalloc: out of maps");
+		unlock(&iomap);
+		return port;
+	}
+	// take a map off the free list and link it in before *l
+	iomap.free = map->next;
+	map->next = *l;
+	map->start = port;
+	map->end = port + size;
+	map->reserved = 1;
+	strncpy(map->tag, tag, sizeof(map->tag));
+	map->tag[sizeof(map->tag)-1] = 0;
+	*l = map;
+
+	// bump the directory version so readers notice the change
+	archdir[0].qid.vers++;
+
+	unlock(&iomap);
+	return map->start;
+}
+
+//
+//	alloc some io port space and remember who it was
+//	alloced to.  if port < 0, find a free region.
+//
+//	Returns the first port of the range.  Returns -1 if an
+//	explicit range extends beyond 64KB or clashes with an
+//	existing range, or if the search for a free region runs
+//	off the end of the map list.  An explicit range that
+//	exactly matches a range set aside by ioreserve claims
+//	that reservation.  If no free IOMap is left, a message is
+//	printed and the port is returned without being recorded.
+//
+int
+ioalloc(int port, int size, int align, char *tag)
+{
+	IOMap *map, **l;
+	int i;
+
+	lock(&iomap);
+	if(port < 0){
+		// find a free port above 0x400 and below 0x1000
+		// NOTE(review): same search as ioreserve; with
+		// align > 0 the candidate is never moved past
+		// map->end.  Confirm it cannot overlap a range.
+		port = 0x400;
+		for(l = &iomap.map; *l; l = &(*l)->next){
+			map = *l;
+			if (map->start < 0x400)
+				continue;
+			i = map->start - port;
+			if(i > size)
+				break;
+			if(align > 0)
+				port = ((port+align-1)/align)*align;
+			else
+				port = map->end;
+		}
+		if(*l == nil){
+			unlock(&iomap);
+			return -1;
+		}
+	} else {
+		// Only 64KB I/O space on the x86.
+		if((port+size) > 0x10000){
+			unlock(&iomap);
+			return -1;
+		}
+		// see if the space clashes with previously allocated ports
+		for(l = &iomap.map; *l; l = &(*l)->next){
+			map = *l;
+			if(map->end <= port)
+				continue;
+			if(map->reserved && map->start == port && map->end == port + size) {
+				map->reserved = 0;
+				unlock(&iomap);
+				return map->start;
+			}
+			if(map->start >= port+size)
+				break;
+			unlock(&iomap);
+			return -1;
+		}
+	}
+	map = iomap.free;
+	if(map == nil){
+		print("ioalloc: out of maps");
+		unlock(&iomap);
+		return port;
+	}
+	iomap.free = map->next;
+	map->next = *l;
+	map->start = port;
+	map->end = port + size;
+	// a map recycled through iofree may still carry the flag of
+	// an old reservation; this range is allocated, not reserved.
+	map->reserved = 0;
+	strncpy(map->tag, tag, sizeof(map->tag));
+	map->tag[sizeof(map->tag)-1] = 0;
+	*l = map;
+
+	archdir[0].qid.vers++;
+
+	unlock(&iomap);
+	return map->start;
+}
+
+/*
+ * Release the range that starts at port, if there is one, putting
+ * its IOMap back on the free list.  The search stops at the first
+ * range starting above port.  The directory version is bumped in
+ * either case.
+ */
+void
+iofree(int port)
+{
+	IOMap *map, **l;
+
+	lock(&iomap);
+	l = &iomap.map;
+	while((map = *l) != nil && map->start <= port){
+		if(map->start == port){
+			/* unlink and recycle */
+			*l = map->next;
+			map->next = iomap.free;
+			iomap.free = map;
+			break;
+		}
+		l = &map->next;
+	}
+	archdir[0].qid.vers++;
+	unlock(&iomap);
+}
+
+/*
+ * Return 1 if no range in the map overlaps the ports start..end-1,
+ * 0 otherwise.  The map is walked without taking the iomap lock.
+ */
+int
+iounused(int start, int end)
+{
+	IOMap *map;
+
+	for(map = iomap.map; map != nil; map = map->next){
+		/* start falls inside this range */
+		if(start >= map->start && start < map->end)
+			return 0;
+		/* this range begins inside start..end-1 */
+		if(start <= map->start && end > map->start)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Raise Eperm unless user access to ports start..end-1 is
+ * acceptable: the standard vga registers are always allowed, any
+ * other range only if nothing in it has been allocated.
+ */
+static void
+checkport(int start, int end)
+{
+	/* standard vga regs are OK */
+	if((start >= 0x2b0 && end <= 0x2df+1)
+	|| (start >= 0x3c0 && end <= 0x3da+1))
+		return;
+
+	if(!iounused(start, end))
+		error(Eperm);
+}
+
+/* Attach to the arch device, '#P'. */
+static Chan*
+archattach(char* spec)
+{
+	Chan *c;
+
+	c = devattach('P', spec);
+	return c;
+}
+
+/* Walk within #P over the archdir entries currently in use. */
+Walkqid*
+archwalk(Chan* c, Chan *nc, char** name, int nname)
+{
+	Walkqid *wq;
+
+	wq = devwalk(c, nc, name, nname, archdir, narchdir, devgen);
+	return wq;
+}
+
+/* Stat a #P file using the generic devstat. */
+static long
+archstat(Chan* c, uchar* dp, long n)
+{
+	long nstat;
+
+	nstat = devstat(c, dp, n, archdir, narchdir, devgen);
+	return nstat;
+}
+
+/* Open a #P file using the generic devopen. */
+static Chan*
+archopen(Chan* c, int omode)
+{
+	Chan *oc;
+
+	oc = devopen(c, omode, archdir, narchdir, devgen);
+	return oc;
+}
+
+/* Closing a #P file requires no work. */
+static void
+archclose(Chan*)
+{
+}
+
+enum
+{
+	/* archread treats the ioalloc file as fixed-size lines of this many bytes */
+	Linelen= 31,
+};
+
+/*
+ * Read from a #P file.
+ *	Qdir:     directory listing.
+ *	Qiob/Qiow/Qiol: read n bytes from the I/O ports starting at
+ *	          offset, one byte, word or long at a time, after
+ *	          checkport has approved the range.
+ *	Qioalloc: the port allocation map, one Linelen-byte line per
+ *	          range; offset and n are rounded down to whole lines.
+ *	others:   the read function registered with addarchfile, or
+ *	          Eperm if there is none.
+ */
+static long
+archread(Chan *c, void *a, long n, vlong offset)
+{
+	char *buf, *p;
+	int port;
+	ushort *sp;
+	ulong *lp;
+	IOMap *map;
+	Rdwrfn *fn;
+
+	switch((ulong)c->qid.path){
+
+	case Qdir:
+		return devdirread(c, a, n, archdir, narchdir, devgen);
+
+	case Qiob:
+		port = offset;
+		checkport(offset, offset+n);
+		for(p = a; port < offset+n; port++)
+			*p++ = inb(port);
+		return n;
+
+	case Qiow:
+		if(n & 1)
+			error(Ebadarg);
+		checkport(offset, offset+n);
+		sp = a;
+		for(port = offset; port < offset+n; port += 2)
+			*sp++ = ins(port);
+		return n;
+
+	case Qiol:
+		if(n & 3)
+			error(Ebadarg);
+		checkport(offset, offset+n);
+		lp = a;
+		for(port = offset; port < offset+n; port += 4)
+			*lp++ = inl(port);
+		return n;
+
+	case Qioalloc:
+		break;
+
+	default:
+		if(c->qid.path < narchdir && (fn = readfn[c->qid.path]))
+			return fn(c, a, n, offset);
+		error(Eperm);
+		break;
+	}
+
+	/*
+	 * One byte more than requested: every line is formatted
+	 * with a terminating NUL, so the last line needs room for
+	 * it when n is a multiple of Linelen.
+	 */
+	if((buf = malloc(n+1)) == nil)
+		error(Enomem);
+	p = buf;
+	n = n/Linelen;
+	offset = offset/Linelen;
+
+	switch((ulong)c->qid.path){
+	case Qioalloc:
+		lock(&iomap);
+		for(map = iomap.map; n > 0 && map != nil; map = map->next){
+			if(offset-- > 0)
+				continue;
+			/* never let a line spill beyond its Linelen slot */
+			snprint(p, Linelen+1, "%#8lux %#8lux %-12.12s\n", map->start, map->end-1, map->tag);
+			p += Linelen;
+			n--;
+		}
+		unlock(&iomap);
+		break;
+	case Qmapram:
+/* not implemented; the code under NOTYET is an unfinished sketch */
+#ifdef NOTYET
+		for(mp = rmapram.map; mp->size; mp++){
+			/*
+			 * Up to MemMinMiB is already set up.
+			 */
+			if(mp->addr < MemMinMiB*MiB){
+				if(mp->addr+mp->size <= MemMinMiB*MiB)
+					continue;
+				pa = MemMinMiB*MiB;
+				size = mp->size - MemMinMiB*MiB-mp->addr;
+			}
+			else{
+				pa = mp->addr;
+				size = mp->size;
+			}
+#endif
+		/* error does not return: release the buffer first */
+		free(buf);
+		error("Not yet");
+
+		break;
+	}
+
+	n = p - buf;
+	memmove(a, buf, n);
+	free(buf);
+
+	return n;
+}
+
+/*
+ * Write to a #P file.
+ *	Qiob/Qiow/Qiol: write n bytes to the I/O ports starting at
+ *	          offset, one byte, word or long at a time, after
+ *	          checkport has approved the range.
+ *	others:   the write function registered with addarchfile, or
+ *	          Eperm if there is none.
+ */
+static long
+archwrite(Chan *c, void *a, long n, vlong offset)
+{
+	char *p;
+	int port;
+	ushort *sp;
+	ulong *lp;
+	Rdwrfn *fn;
+
+	switch((ulong)c->qid.path){
+
+	case Qiob:
+		p = a;
+		checkport(offset, offset+n);
+		port = offset;
+		while(port < offset+n){
+			outb(port, *p++);
+			port++;
+		}
+		return n;
+
+	case Qiow:
+		if(n & 1)
+			error(Ebadarg);
+		checkport(offset, offset+n);
+		sp = a;
+		port = offset;
+		while(port < offset+n){
+			outs(port, *sp++);
+			port += 2;
+		}
+		return n;
+
+	case Qiol:
+		if(n & 3)
+			error(Ebadarg);
+		checkport(offset, offset+n);
+		lp = a;
+		port = offset;
+		while(port < offset+n){
+			outl(port, *lp++);
+			port += 4;
+		}
+		return n;
+
+	default:
+		if(c->qid.path < narchdir && (fn = writefn[c->qid.path]))
+			return fn(c, a, n, offset);
+		error(Eperm);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Device table entry for the arch device, '#P'.  Operations
+ * without an arch-specific version use the generic dev* routines.
+ */
+Dev archdevtab = {
+	'P',
+	"arch",
+
+	devreset,
+	devinit,
+	devshutdown,
+	archattach,
+	archwalk,
+	archstat,
+	archopen,
+	devcreate,
+	archclose,
+	archread,
+	devbread,
+	archwrite,
+	devbwrite,
+	devremove,
+	devwstat,
+};
+
+/*
+ * Does nothing.
+ */
+void
+nop(void)
+{
+}
+
+/* coherence is called through this pointer; it starts out as mfence */
+void (*coherence)(void) = mfence;
+
+/*
+ * Read handler of the "cputype" file: a line with "AMD64" and the
+ * cpu speed in MHz, followed by up to four lines holding the four
+ * words of each saved m->cpuinfo entry.
+ */
+static long
+cputyperead(Chan*, void *a, long n, vlong off)
+{
+	char buf[512], *cur, *lim;
+	int i, nleaf;
+
+	lim = buf+sizeof buf;
+	cur = seprint(buf, lim, "%s %ud\n", "AMD64", m->cpumhz);
+
+	nleaf = m->ncpuinfoe - m->ncpuinfos;
+	if(nleaf > 4)
+		nleaf = 4;
+	i = 0;
+	while(i < nleaf){
+		cur = seprint(cur, lim, "%#8.8ux %#8.8ux %#8.8ux %#8.8ux\n",
+			m->cpuinfo[i][0], m->cpuinfo[i][1],
+			m->cpuinfo[i][2], m->cpuinfo[i][3]);
+		i++;
+	}
+	return readstr(off, a, n, buf);
+}
+
+/* Add the read-only "cputype" file to #P. */
+void
+archinit(void)
+{
+	addarchfile("cputype", 0444, cputyperead, nil);
+}
+
+/*
+ * Reset the machine through the reset control register at I/O
+ * port 0xcf9, then spin forever.
+ */
+void
+archreset(void)
+{
+	int i;
+
+	/*
+	 * And sometimes there is no keyboard...
+	 *
+	 * The reset register (0xcf9) is usually in one of the bridge
+	 * chips. The actual location and sequence could be extracted from
+	 * ACPI but why bother, this is the end of the line anyway.
+	 * (Disabled here: print("Takes a licking and keeps on ticking...\n");)
+	 */
+	i = inb(0xcf9);					/* ICHx reset control */
+	i &= 0x06;					/* keep only bits 1 and 2 */
+	outb(0xcf9, i|0x02);				/* SYS_RST */
+	millidelay(1);
+	outb(0xcf9, i|0x06);				/* RST_CPU transition */
+
+	/* wait here for the reset to take effect */
+	for(;;)
+		;
+}
+
+/*
+ *  return the current value of the timer (the time stamp
+ *  counter) and, if hz is not nil, its speed (m->cpuhz) in *hz.
+ */
+uvlong
+fastticks(uvlong* hz)
+{
+	if(hz == nil)
+		return rdtsc();
+	*hz = m->cpuhz;
+	return rdtsc();
+}
+
+/* Return the time stamp counter converted by fastticks2us. */
+ulong
+µs(void)
+{
+	uvlong ticks;
+
+	ticks = rdtsc();
+	return fastticks2us(ticks);
+}
+
+/*
+ *  set next timer interrupt; x is passed unchanged to
+ *  apictimerset.
+ */
+void
+timerset(uvlong x)
+{
+	extern void apictimerset(uvlong);
+
+	apictimerset(x);
+}
+
+/* Store the current time stamp counter value in *t. */
+void
+cycles(uvlong* t)
+{
+	*t = rdtsc();
+}
+
+/*
+ * Busy-wait for the given number of milliseconds (at least one),
+ * timed with the time stamp counter and m->cpumhz.
+ */
+void
+delay(int millisecs)
+{
+	u64int now, deadline;
+
+	if(millisecs <= 0)
+		millisecs = 1;
+	now = rdtsc();
+	deadline = now + m->cpumhz*1000ull*millisecs;
+	while(now < deadline)
+		now = rdtsc();
+}
+
+/*
+ *  performance measurement ticks.  must be low overhead.
+ *  doesn't have to count over a second.
+ *
+ *  the 64-bit cycle count is truncated to ulong on return.
+ */
+ulong
+perfticks(void)
+{
+	uvlong x;
+
+	/* the havetsc test is disabled: the counter is always read */
+//	if(m->havetsc)
+		cycles(&x);
+//	else
+//		x = 0;
+	return x;
+}

+ 2160 - 0
sys/src/9/k10/ether82563.c

@@ -0,0 +1,2160 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+ /*
+ * Intel 8256[367], 8257[1-9], 82573[ev], 
+ * 82575eb, 82576, 82577, 82579, 8258[03]
+ *	Gigabit Ethernet PCI-Express Controllers
+ * Coraid EtherDrive® hba
+ * This rewrite has only been tested on 82579
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+
+#include "etherif.h"
+
+/*
+ * note: the 82575, 82576 and 82580 are operated using registers aliased
+ * to the 82563-style architecture.  many features seen in the 82598
+ * are also seen in the 82575 part.
+ */
+
+/*
+ * Register offsets, in bytes from the start of the controller's
+ * register window (csr32r/csr32w divide by 4 to index ctlr->nic).
+ * The entries after Statistics (Gorcl .. Nstatistics) are instead
+ * word indices into the statistics area, hence the /4.
+ */
+enum {
+	/* General */
+
+	Ctrl		= 0x0000,	/* Device Control */
+	Status		= 0x0008,	/* Device Status */
+	Eec		= 0x0010,	/* EEPROM/Flash Control/Data */
+	Eerd		= 0x0014,	/* EEPROM Read */
+	Ctrlext		= 0x0018,	/* Extended Device Control */
+	Fla		= 0x001C,	/* Flash Access */
+	Mdic		= 0x0020,	/* MDI Control */
+	Seresctl	= 0x0024,	/* Serdes ana */
+	Fcal		= 0x0028,	/* Flow Control Address Low */
+	Fcah		= 0x002C,	/* Flow Control Address High */
+	Fct		= 0x0030,	/* Flow Control Type */
+	Kumctrlsta	= 0x0034,	/* Kumeran Control and Status Register */
+	Vet		= 0x0038,	/* VLAN EtherType */
+	Fcttv		= 0x0170,	/* Flow Control Transmit Timer Value */
+	Txcw		= 0x0178,	/* Transmit Configuration Word */
+	Rxcw		= 0x0180,	/* Receive Configuration Word */
+	Ledctl		= 0x0E00,	/* LED control */
+	Pba		= 0x1000,	/* Packet Buffer Allocation */
+	Pbs		= 0x1008,	/* Packet Buffer Size */
+
+	/* Interrupt */
+
+	Icr		= 0x00C0,	/* Interrupt Cause Read */
+	Itr		= 0x00C4,	/* Interrupt Throttling Rate */
+	Ics		= 0x00C8,	/* Interrupt Cause Set */
+	Ims		= 0x00D0,	/* Interrupt Mask Set/Read */
+	Imc		= 0x00D8,	/* Interrupt mask Clear */
+	Iam		= 0x00E0,	/* Interrupt acknowledge Auto Mask */
+	Eitr		= 0x1680,	/* Extended itr; 82575/6 80 only */
+
+	/* Receive */
+
+	Rctl		= 0x0100,	/* Control */
+	Ert		= 0x2008,	/* Early Receive Threshold (573[EVL], 82578 only) */
+	Fcrtl		= 0x2160,	/* Flow Control RX Threshold Low */
+	Fcrth		= 0x2168,	/* Flow Control Rx Threshold High */
+	Psrctl		= 0x2170,	/* Packet Split Receive Control */
+	Drxmxod		= 0x2540,	/* dma max outstanding bytes (82575) */
+	Rdbal		= 0x2800,	/* Rdesc Base Address Low Queue 0 */
+	Rdbah		= 0x2804,	/* Rdesc Base Address High Queue 0 */
+	Rdlen		= 0x2808,	/* Descriptor Length Queue 0 */
+	Srrctl		= 0x280C,	/* split and replication rx control (82575) */
+	Rdh		= 0x2810,	/* Descriptor Head Queue 0 */
+	Rdt		= 0x2818,	/* Descriptor Tail Queue 0 */
+	Rdtr		= 0x2820,	/* Descriptor Timer Ring */
+	Rxdctl		= 0x2828,	/* Descriptor Control */
+	Radv		= 0x282C,	/* Interrupt Absolute Delay Timer */
+	Rdbal1		= 0x2900,	/* Rdesc Base Address Low Queue 1 */
+	Rdbah1		= 0x2904,	/* Rdesc Base Address High Queue 1 (was 0x2804, aliasing Rdbah) */
+	Rdlen1		= 0x2908,	/* Descriptor Length Queue 1 */
+	Rdh1		= 0x2910,	/* Descriptor Head Queue 1 */
+	Rdt1		= 0x2918,	/* Descriptor Tail Queue 1 */
+	Rxdctl1		= 0x2928,	/* Descriptor Control Queue 1 */
+	Rsrpd		= 0x2C00,	/* Small Packet Detect */
+	Raid		= 0x2C08,	/* ACK interrupt delay */
+	Cpuvec		= 0x2C10,	/* CPU Vector */
+	Rxcsum		= 0x5000,	/* Checksum Control */
+	Rmpl		= 0x5004,	/* rx maximum packet length (82575) */
+	Rfctl		= 0x5008,	/* Filter Control */
+	Mta		= 0x5200,	/* Multicast Table Array */
+	Ral		= 0x5400,	/* Receive Address Low */
+	Rah		= 0x5404,	/* Receive Address High */
+	Vfta		= 0x5600,	/* VLAN Filter Table Array */
+	Mrqc		= 0x5818,	/* Multiple Receive Queues Command */
+	Rssim		= 0x5864,	/* RSS Interrupt Mask */
+	Rssir		= 0x5868,	/* RSS Interrupt Request */
+	Reta		= 0x5c00,	/* Redirection Table */
+	Rssrk		= 0x5c80,	/* RSS Random Key */
+
+	/* Transmit */
+
+	Tctl		= 0x0400,	/* Transmit Control */
+	Tipg		= 0x0410,	/* Transmit IPG */
+	Tkabgtxd	= 0x3004,	/* glci afe band gap transmit ref data, or something */
+	Tdbal		= 0x3800,	/* Tdesc Base Address Low */
+	Tdbah		= 0x3804,	/* Tdesc Base Address High */
+	Tdlen		= 0x3808,	/* Descriptor Length */
+	Tdh		= 0x3810,	/* Descriptor Head */
+	Tdt		= 0x3818,	/* Descriptor Tail */
+	Tidv		= 0x3820,	/* Interrupt Delay Value */
+	Txdctl		= 0x3828,	/* Descriptor Control */
+	Tadv		= 0x382C,	/* Interrupt Absolute Delay Timer */
+	Tarc0		= 0x3840,	/* Arbitration Counter Queue 0 */
+	Tdbal1		= 0x3900,	/* Descriptor Base Low Queue 1 */
+	Tdbah1		= 0x3904,	/* Descriptor Base High Queue 1 */
+	Tdlen1		= 0x3908,	/* Descriptor Length Queue 1 */
+	Tdh1		= 0x3910,	/* Descriptor Head Queue 1 */
+	Tdt1		= 0x3918,	/* Descriptor Tail Queue 1 */
+	Txdctl1		= 0x3928,	/* Descriptor Control 1 */
+	Tarc1		= 0x3940,	/* Arbitration Counter Queue 1 */
+
+	/* Statistics */
+
+	Statistics	= 0x4000,	/* Start of Statistics Area */
+	Gorcl		= 0x88/4,	/* Good Octets Received Count */
+	Gotcl		= 0x90/4,	/* Good Octets Transmitted Count */
+	Torl		= 0xC0/4,	/* Total Octets Received */
+	Totl		= 0xC8/4,	/* Total Octets Transmitted */
+	Nstatistics	= 0x124/4,
+};
+
+enum {					/* Ctrl */
+	GIOmd		= 1<<2,		/* GIO master disable */
+	Lrst		= 1<<3,		/* link reset */
+	Slu		= 1<<6,		/* Set Link Up */
+	SspeedMASK	= 3<<8,		/* Speed Selection */
+	SspeedSHIFT	= 8,
+	Sspeed10	= 0x00000000,	/* 10Mb/s */
+	Sspeed100	= 0x00000100,	/* 100Mb/s */
+	Sspeed1000	= 0x00000200,	/* 1000Mb/s */
+	Frcspd		= 1<<11,	/* Force Speed */
+	Frcdplx		= 1<<12,	/* Force Duplex */
+	SwdpinsloMASK	= 0x003C0000,	/* Software Defined Pins - lo nibble */
+	SwdpinsloSHIFT	= 18,
+	SwdpioloMASK	= 0x03C00000,	/* Software Defined Pins - I or O */
+	SwdpioloSHIFT	= 22,
+	Devrst		= 1<<26,	/* Device Reset */
+	Rfce		= 1<<27,	/* Receive Flow Control Enable */
+	Tfce		= 1<<28,	/* Transmit Flow Control Enable */
+	Vme		= 1<<30,	/* VLAN Mode Enable */
+	Phyrst		= 1<<31,	/* Phy Reset */
+};
+
+enum {					/* Status */
+	Lu		= 1<<1,		/* Link Up */
+	Lanid		= 3<<2,		/* mask for Lan ID. */
+	Txoff		= 1<<4,		/* Transmission Paused */
+	Tbimode		= 1<<5,		/* TBI Mode Indication */
+	Phyra		= 1<<10,	/* PHY Reset Asserted */
+	GIOme		= 1<<19,	/* GIO Master Enable Status */
+};
+
+enum {					/* Eerd */
+	EEstart		= 1<<0,		/* Start Read */
+	EEdone		= 1<<1,		/* Read done */
+};
+
+enum {					/* Ctrlext */
+	Asdchk		= 1<<12,	/* ASD Check */
+	Eerst		= 1<<13,	/* EEPROM Reset */
+	Spdbyps		= 1<<15,	/* Speed Select Bypass */
+	Linkmode	= 3<<23,	/* linkmode */
+	Serdes		= 3<<23,	/* " serdes */
+};
+
+enum {					/* EEPROM content offsets */
+	Ea		= 0x00,		/* Ethernet Address */
+	Cf		= 0x03,		/* Compatibility Field */
+	Icw1		= 0x0A,		/* Initialization Control Word 1 */
+	Sid		= 0x0B,		/* Subsystem ID */
+	Svid		= 0x0C,		/* Subsystem Vendor ID */
+	Did		= 0x0D,		/* Device ID */
+	Vid		= 0x0E,		/* Vendor ID */
+	Icw2		= 0x0F,		/* Initialization Control Word 2 */
+};
+
+enum {					/* Mdic */
+	MDIdMASK	= 0x0000FFFF,	/* Data */
+	MDIdSHIFT	= 0,
+	MDIrMASK	= 0x001F0000,	/* PHY Register Address */
+	MDIrSHIFT	= 16,
+	MDIpMASK	= 0x03E00000,	/* PHY Address */
+	MDIpSHIFT	= 21,
+	MDIwop		= 0x04000000,	/* Write Operation */
+	MDIrop		= 0x08000000,	/* Read Operation */
+	MDIready	= 0x10000000,	/* End of Transaction */
+	MDIie		= 0x20000000,	/* Interrupt Enable */
+	MDIe		= 0x40000000,	/* Error */
+};
+
+enum {					/* phy interface registers */
+	Phyctl		= 0,		/* phy ctl */
+	Physsr		= 17,		/* phy secondary status */
+	Phyier		= 18,		/* 82573 phy interrupt enable */
+	Phyisr		= 19,		/* 82563 phy interrupt status */
+	Phylhr		= 19,		/* 8257[12] link health */
+
+	Phyprst		= 193<<8 | 17,	/* 8256[34] phy port reset */
+	Phypage		= 22,		/* 8256[34] page register */
+	Phystat		= 26,		/* 82580 phy status */
+	Phyapage	= 29,
+	Rtlink		= 1<<10,	/* realtime link status */
+	Phyan		= 1<<11,	/* phy has auto-negotiated */
+
+	/* Phyctl bits */
+	Ran		= 1<<9,		/* restart auto-negotiation */
+	Ean		= 1<<12,	/* enable auto-negotiation */
+
+	/* Phyprst bits */
+	Prst		= 1<<0,	/* reset the port */
+
+	/* 82573 Phyier bits */
+	Lscie		= 1<<10,	/* link status changed ie */
+	Ancie		= 1<<11,	/* auto-negotiation complete ie */
+	Spdie		= 1<<14,	/* speed changed ie */
+	Panie		= 1<<15,	/* phy auto-negotiation error ie */
+
+	/* Phylhr/Phyisr bits */
+	Anf		= 1<<6,		/* lhr: auto-negotiation fault */
+	Ane		= 1<<15,	/* isr: auto-negotiation error */
+
+	/* 82580 Phystat bits */
+	Ans	= 1<<14 | 1<<15,	/* 82580 auto-negotiation status */
+	Link	= 1<<6,		/* 82580 Link */
+
+	/* Rxcw builtin serdes */
+	Anc		= 1<<31,
+	Rxsynch		= 1<<30,
+	Rxcfg		= 1<<29,
+	Rxcfgch		= 1<<28,
+	Rxcfgbad	= 1<<27,
+	Rxnc		= 1<<26,
+
+	/* Txcw */
+	Txane		= 1<<31,
+	Txcfg		= 1<<30,
+};
+
+enum {					/* fiber (pcs) interface */
+	Pcsctl	= 0x4208,		/* pcs control */
+	Pcsstat	= 0x420c,		/* pcs status */
+
+	/* Pcsctl bits */
+	Pan	= 1<<16,		/* auto-negotiate */
+	Prestart	= 1<<17,		/* restart an (self clearing) */
+
+	/* Pcsstat bits */
+	Linkok	= 1<<0,		/* link is okay */
+	Andone	= 1<<16,		/* an phase is done see below for success */
+	Anbad	= 1<<19 | 1<<20,	/* Anerror | Anremfault */
+};
+
+enum {					/* Icr, Ics, Ims, Imc */
+	Txdw		= 0x00000001,	/* Transmit Descriptor Written Back */
+	Txqe		= 0x00000002,	/* Transmit Queue Empty */
+	Lsc		= 0x00000004,	/* Link Status Change */
+	Rxseq		= 0x00000008,	/* Receive Sequence Error */
+	Rxdmt0		= 0x00000010,	/* Rdesc Minimum Threshold Reached */
+	Rxo		= 0x00000040,	/* Receiver Overrun */
+	Rxt0		= 0x00000080,	/* Receiver Timer Interrupt; !82575/6/80 only */
+	Rxdw		= 0x00000080,	/* Rdesc write back; 82575/6/80 only */
+	Mdac		= 0x00000200,	/* MDIO Access Completed */
+	Rxcfgsets		= 0x00000400,	/* Receiving /C/ ordered sets */
+	Gpi0		= 0x00000800,	/* General Purpose Interrupts */
+	Gpi1		= 0x00001000,
+	Gpi2		= 0x00002000,
+	Gpi3		= 0x00004000,
+	Ack		= 0x00020000,	/* Receive ACK frame */
+};
+
+enum {					/* Txcw */
+	TxcwFd		= 0x00000020,	/* Full Duplex */
+	TxcwHd		= 0x00000040,	/* Half Duplex */
+	TxcwPauseMASK	= 0x00000180,	/* Pause */
+	TxcwPauseSHIFT	= 7,
+	TxcwPs		= 1<<TxcwPauseSHIFT,	/* Pause Supported */
+	TxcwAs		= 2<<TxcwPauseSHIFT,	/* Asymmetric FC desired */
+	TxcwRfiMASK	= 0x00003000,	/* Remote Fault Indication */
+	TxcwRfiSHIFT	= 12,
+	TxcwNpr		= 0x00008000,	/* Next Page Request */
+	TxcwConfig	= 0x40000000,	/* Transmit Config Control */
+	TxcwAne		= 0x80000000,	/* Auto-Negotiation Enable */
+};
+
+enum {					/* Rctl */
+	Rrst		= 0x00000001,	/* Receiver Software Reset */
+	Ren		= 0x00000002,	/* Receiver Enable */
+	Sbp		= 0x00000004,	/* Store Bad Packets */
+	Upe		= 0x00000008,	/* Unicast Promiscuous Enable */
+	Mpe		= 0x00000010,	/* Multicast Promiscuous Enable */
+	Lpe		= 0x00000020,	/* Long Packet Reception Enable */
+	LbmMASK		= 0x000000C0,	/* Loopback Mode */
+	LbmOFF		= 0x00000000,	/* No Loopback */
+	LbmTBI		= 0x00000040,	/* TBI Loopback */
+	LbmMII		= 0x00000080,	/* GMII/MII Loopback */
+	LbmXCVR		= 0x000000C0,	/* Transceiver Loopback */
+	RdtmsHALF	= 0x00000000,	/* Threshold is 1/2 Rdlen */
+	RdtmsQUARTER	= 0x00000100,	/* Threshold is 1/4 Rdlen */
+	RdtmsEIGHTH	= 0x00000200,	/* Threshold is 1/8 Rdlen */
+	RdtmsMASK	= 0x00000300,	/* Rdesc Minimum Threshold Size */
+	MoMASK		= 0x00003000,	/* Multicast Offset */
+	Bam		= 0x00008000,	/* Broadcast Accept Mode */
+	BsizeMASK	= 0x00030000,	/* Receive Buffer Size */
+	Bsize16384	= 0x00010000,	/* Bsex = 1 */
+	Bsize8192	= 0x00020000, 	/* Bsex = 1 */
+	Bsize2048	= 0x00000000,
+	Bsize1024	= 0x00010000,
+	Bsize512	= 0x00020000,
+	Bsize256	= 0x00030000,
+	BsizeFlex	= 0x08000000,	/* Flexible Bsize in 1KB increments */
+	Vfe		= 0x00040000,	/* VLAN Filter Enable */
+	Cfien		= 0x00080000,	/* Canonical Form Indicator Enable */
+	Cfi		= 0x00100000,	/* Canonical Form Indicator value */
+	Dpf		= 0x00400000,	/* Discard Pause Frames */
+	Pmcf		= 0x00800000,	/* Pass MAC Control Frames */
+	Bsex		= 0x02000000,	/* Buffer Size Extension */
+	Secrc		= 0x04000000,	/* Strip CRC from incoming packet */
+};
+
+enum {					/* Srrctl */
+	Dropen		= 1<<31,
+};
+
+enum {					/* Tctl */
+	Trst		= 0x00000001,	/* Transmitter Software Reset */
+	Ten		= 0x00000002,	/* Transmit Enable */
+	Psp		= 0x00000008,	/* Pad Short Packets */
+	Mulr		= 0x10000000,	/* Allow multiple concurrent requests */
+	CtMASK		= 0x00000FF0,	/* Collision Threshold */
+	CtSHIFT		= 4,
+	ColdMASK	= 0x003FF000,	/* Collision Distance */
+	ColdSHIFT	= 12,
+	Swxoff		= 0x00400000,	/* Software XOFF Transmission */
+	Pbe		= 0x00800000,	/* Packet Burst Enable */
+	Rtlc		= 0x01000000,	/* Re-transmit on Late Collision */
+	Nrtu		= 0x02000000,	/* No Re-transmit on Underrun */
+};
+
+enum {					/* [RT]xdctl */
+	PthreshMASK	= 0x0000003F,	/* Prefetch Threshold */
+	PthreshSHIFT	= 0,
+	HthreshMASK	= 0x00003F00,	/* Host Threshold */
+	HthreshSHIFT	= 8,
+	WthreshMASK	= 0x003F0000,	/* Writeback Threshold */
+	WthreshSHIFT	= 16,
+	Gran		= 0x01000000,	/* Granularity; not 82575 */
+	Qenable		= 0x02000000,	/* Queue Enable (82575) */
+};
+
+enum {					/* Rxcsum */
+	PcssMASK	= 0x00FF,	/* Packet Checksum Start */
+	PcssSHIFT	= 0,
+	Ipofl		= 0x0100,	/* IP Checksum Off-load Enable */
+	Tuofl		= 0x0200,	/* TCP/UDP Checksum Off-load Enable */
+};
+
+enum {					/* Receive Delay Timer Ring */
+	DelayMASK	= 0xFFFF,	/* delay timer in 1.024µs increments */
+	DelaySHIFT	= 0,
+	Fpd		= 0x80000000,	/* Flush partial Descriptor Block */
+};
+
+typedef struct Rd {			/* Receive Descriptor */
+	u32int	addr[2];
+	u16int	length;
+	u16int	checksum;
+	u8int	status;
+	u8int	errors;
+	u16int	special;
+} Rd;
+
+enum {					/* Rd status */
+	Rdd		= 0x01,		/* Descriptor Done */
+	Reop		= 0x02,		/* End of Packet */
+	Ixsm		= 0x04,		/* Ignore Checksum Indication */
+	Vp		= 0x08,		/* Packet is 802.1Q (matched VET) */
+	Tcpcs		= 0x20,		/* TCP Checksum Calculated on Packet */
+	Ipcs		= 0x40,		/* IP Checksum Calculated on Packet */
+	Pif		= 0x80,		/* Passed in-exact filter */
+};
+
+enum {					/* Rd errors */
+	Ce		= 0x01,		/* CRC Error or Alignment Error */
+	Se		= 0x02,		/* Symbol Error */
+	Seq		= 0x04,		/* Sequence Error */
+	Cxe		= 0x10,		/* Carrier Extension Error */
+	Tcpe		= 0x20,		/* TCP/UDP Checksum Error */
+	Ipe		= 0x40,		/* IP Checksum Error */
+	Rxe		= 0x80,		/* RX Data Error */
+};
+
+typedef struct {			/* Transmit Descriptor */
+	u32int	addr[2];		/* Data */
+	u32int	control;
+	u32int	status;
+} Td;
+
+enum {					/* Tdesc control */
+	LenMASK		= 0x000FFFFF,	/* Data/Packet Length Field */
+	LenSHIFT	= 0,
+	DtypeCD		= 0x00000000,	/* Data Type 'Context Descriptor' */
+	DtypeDD		= 0x00100000,	/* Data Type 'Data Descriptor' */
+	PtypeTCP	= 0x01000000,	/* TCP/UDP Packet Type (CD) */
+	Teop		= 0x01000000,	/* End of Packet (DD) */
+	PtypeIP		= 0x02000000,	/* IP Packet Type (CD) */
+	Ifcs		= 0x02000000,	/* Insert FCS (DD) */
+	Tse		= 0x04000000,	/* TCP Segmentation Enable */
+	Rs		= 0x08000000,	/* Report Status */
+	Rps		= 0x10000000,	/* Report Status Sent */
+	Dext		= 0x20000000,	/* Descriptor Extension */
+	Vle		= 0x40000000,	/* VLAN Packet Enable */
+	Ide		= 0x80000000,	/* Interrupt Delay Enable */
+};
+
+enum {					/* Tdesc status */
+	Tdd		= 0x0001,	/* Descriptor Done */
+	Ec		= 0x0002,	/* Excess Collisions */
+	Lc		= 0x0004,	/* Late Collision */
+	Tu		= 0x0008,	/* Transmit Underrun */
+	CssMASK		= 0xFF00,	/* Checksum Start Field */
+	CssSHIFT	= 8,
+};
+
+typedef struct {
+	u16int	*reg;
+	u32int	*reg32;
+	int	sz;
+} Flash;
+
+
+enum {
+	/* 16 and 32-bit flash registers for ich flash parts */
+	Bfpr	= 0x00/4,		/* flash base 0:12; lim 16:28 */
+	Fsts	= 0x04/2,		/* flash status;  Hsfsts */
+	Fctl	= 0x06/2,		/* flash control; Hsfctl */
+	Faddr	= 0x08/4,		/* flash address to r/w */
+	Fdata	= 0x10/4,		/* data @ address */
+
+	/* status register */
+	Fdone	= 1<<0,			/* flash cycle done */
+	Fcerr	= 1<<1,			/* cycle error; write 1 to clear */
+	Ael	= 1<<2,			/* direct access error log; 1 to clear */
+	Scip	= 1<<5,			/* spi cycle in progress */
+	Fvalid	= 1<<14,		/* flash descriptor valid */
+
+	/* control register */
+	Fgo	= 1<<0,			/* start cycle */
+	Flcycle	= 1<<1,			/* two bits: r=0; w=2 */
+	Fdbc	= 1<<8,			/* bytes to read; 5 bits */
+};
+
+enum {
+	Nrd		= 256,		/* power of two */
+	Ntd		= 128,		/* power of two */
+	Nrb		= 512,	/* private receive buffers per Ctlr  */
+};
+
+/*
+ * caveat emptor: 82577/78 have been entered speculatively.
+ * awaiting datasheet from intel.
+ */
+enum {
+	Iany = -1,
+	i82563,
+	i82566,
+	i82567,
+	i82567m,
+	i82571,
+	i82572,
+	i82573,
+	i82574,
+	i82575,
+	i82576,
+	i82577,
+	i82577m,	
+	i82578,
+	i82578m,
+	i82579,
+	i82580,
+	i82583,
+	Nctlrtype,
+};
+
+enum {
+	Fload	= 1<<0,
+	Fert	= 1<<1,
+	F75	= 1<<2,
+	Fpba	= 1<<3,
+	Fflashea	= 1<<4,
+};
+
+typedef struct Ctlrtype Ctlrtype;
+struct Ctlrtype {
+	int	type;
+	int	mtu;
+	int	flag;
+	char	*name;
+};
+
+static Ctlrtype cttab[Nctlrtype] = {
+	i82563,		9014,	Fpba,		"i82563",
+	i82566,		1514,	Fload,		"i82566",
+	i82567,		9234,	Fload,		"i82567",
+	i82567m,		1514,	0,		"i82567m",
+	i82571,		9234,	Fpba,		"i82571",
+	i82572,		9234,	Fpba,		"i82572",
+	i82573,		8192,	Fert,		"i82573",		/* terrible perf above 8k */
+	i82574,		9018,	0,		"i82574",
+	i82575,		9728,	F75|Fflashea,	"i82575",
+	i82576,		9728,	F75,		"i82576",
+	i82577,		4096,	Fload|Fert,	"i82577",
+	i82577m,		1514,	Fload|Fert,	"i82577",
+	i82578,		4096,	Fload|Fert,	"i82578",
+	i82578m,		1514,	Fload|Fert,	"i82578",
+	i82579,		9018,	Fload|Fert,	"i82579",
+	i82580,		9728,	F75,		"i82580",
+	i82583,		1514,	0,		"i82583",
+};
+
+
+typedef struct Ctlr Ctlr;
+struct Ctlr {
+	uintmem	port;
+	Pcidev	*pcidev;
+	Ctlr	*next;
+	Ether	*edev;
+	int	active;
+	int	type;
+	u16int	eeprom[0x40];
+
+	QLock	alock;			/* attach */
+	int	attached;
+	int	nrd;
+	int	ntd;
+	int	nrb;			/* how many this Ctlr has in the pool */
+	uint	rbsz;
+
+	int	*nic;
+	Lock	imlock;
+	int	im;			/* interrupt mask */
+
+	Rendez	lrendez;
+	int	lim;
+
+	QLock	slock;
+	uint	statistics[Nstatistics];
+	uint	lsleep;
+	uint	lintr;
+	uint	rsleep;
+	uint	rintr;
+	uint	txdw;
+	uint	tintr;
+	uint	ixsm;
+	uint	ipcs;
+	uint	tcpcs;
+	uint	speeds[4];
+	uint	phyerrata;
+
+	uchar	ra[Eaddrlen];		/* receive address */
+	u32int	mta[128];		/* multicast table array */
+
+	Rendez	rrendez;
+	int	rim;
+	int	rdfree;
+	Rd	*rdba;			/* receive descriptor base address */
+	Block	**rb;			/* receive buffers */
+	int	rdh;			/* receive descriptor head */
+	int	rdt;			/* receive descriptor tail */
+	int	rdtr;			/* receive delay timer ring value */
+	int	radv;			/* receive interrupt absolute delay timer */
+
+	Rendez	trendez;
+	QLock	tlock;
+	int	tbusy;
+	Td	*tdba;			/* transmit descriptor base address */
+	Block	**tb;			/* transmit buffers */
+	int	tdh;			/* transmit descriptor head */
+	int	tdt;			/* transmit descriptor tail */
+
+	int	fcrtl;
+	int	fcrth;
+
+	uint	pba;			/* packet buffer allocation */
+};
+
+#define csr32r(c, r)	(*((c)->nic+((r)/4)))
+#define csr32w(c, r, v)	(*((c)->nic+((r)/4)) = (v))
+
+static Ctlr* i82563ctlrhead;
+static Ctlr* i82563ctlrtail;
+
+static Lock i82563rblock;		/* free receive Blocks */
+static Block* i82563rbpool;
+
+
+static char *statistics[Nstatistics] = {
+	"CRC Error",
+	"Alignment Error",
+	"Symbol Error",
+	"RX Error",
+	"Missed Packets",
+	"Single Collision",
+	"Excessive Collisions",
+	"Multiple Collision",
+	"Late Collisions",
+	nil,
+	"Collision",
+	"Transmit Underrun",
+	"Defer",
+	"Transmit - No CRS",
+	"Sequence Error",
+	"Carrier Extension Error",
+	"Receive Error Length",
+	nil,
+	"XON Received",
+	"XON Transmitted",
+	"XOFF Received",
+	"XOFF Transmitted",
+	"FC Received Unsupported",
+	"Packets Received (64 Bytes)",
+	"Packets Received (65-127 Bytes)",
+	"Packets Received (128-255 Bytes)",
+	"Packets Received (256-511 Bytes)",
+	"Packets Received (512-1023 Bytes)",
+	"Packets Received (1024-mtu Bytes)",
+	"Good Packets Received",
+	"Broadcast Packets Received",
+	"Multicast Packets Received",
+	"Good Packets Transmitted",
+	nil,
+	"Good Octets Received",
+	nil,
+	"Good Octets Transmitted",
+	nil,
+	nil,
+	nil,
+	"Receive No Buffers",
+	"Receive Undersize",
+	"Receive Fragment",
+	"Receive Oversize",
+	"Receive Jabber",
+	"Management Packets Rx",
+	"Management Packets Drop",
+	"Management Packets Tx",
+	"Total Octets Received",
+	nil,
+	"Total Octets Transmitted",
+	nil,
+	"Total Packets Received",
+	"Total Packets Transmitted",
+	"Packets Transmitted (64 Bytes)",
+	"Packets Transmitted (65-127 Bytes)",
+	"Packets Transmitted (128-255 Bytes)",
+	"Packets Transmitted (256-511 Bytes)",
+	"Packets Transmitted (512-1023 Bytes)",
+	"Packets Transmitted (1024-mtu Bytes)",
+	"Multicast Packets Transmitted",
+	"Broadcast Packets Transmitted",
+	"TCP Segmentation Context Transmitted",
+	"TCP Segmentation Context Fail",
+	"Interrupt Assertion",
+	"Interrupt Rx Pkt Timer",
+	"Interrupt Rx Abs Timer",
+	"Interrupt Tx Pkt Timer",
+	"Interrupt Tx Abs Timer",
+	"Interrupt Tx Queue Empty",
+	"Interrupt Tx Desc Low",
+	"Interrupt Rx Min",
+	"Interrupt Rx Overrun",
+};
+
+/*
+ * Printable name of a controller: "any" when its type is Iany,
+ * otherwise the name recorded in cttab for that type.
+ */
+static char*
+cname(Ctlr* c)
+{
+	return c->type == Iany? "any": cttab[c->type].name;
+}
+
+/*
+ * ifstat read handler.  Under ctlr->slock, reads every statistics
+ * register, folds the readings into the running totals kept in
+ * ctlr->statistics[], and formats the non-zero totals plus assorted
+ * driver counters and register snapshots into a READSTR buffer.
+ * Returns the byte count produced by readstr for (offset, n);
+ * raises Enomem if the buffer cannot be allocated.
+ *
+ * Each line for a hardware counter shows "name: total latest".
+ * Gorcl, Gotcl, Torl and Totl are 64-bit counters split across two
+ * consecutive registers; the pair is combined and the high word's
+ * slot is skipped.
+ *
+ * The register reading is held in an unsigned r: csr32r yields an
+ * int, and a signed low word with bit 31 set would otherwise be
+ * sign-extended when widened to uvlong, corrupting the 64-bit totals.
+ */
+static long
+i82563ifstat(Ether *edev, void *a, long n, ulong offset)
+{
+	Ctlr *ctlr;
+	char *s, *p, *e, *stat;
+	int i;
+	uint r;
+	uvlong tuvl, ruvl;
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->slock);
+	p = s = malloc(READSTR);
+	if(p == nil) {
+		qunlock(&ctlr->slock);
+		error(Enomem);
+	}
+	e = p + READSTR;
+
+	for(i = 0; i < Nstatistics; i++){
+		/* every register is read, even those with no name */
+		r = csr32r(ctlr, Statistics + i*4);
+		if((stat = statistics[i]) == nil)
+			continue;
+		switch(i){
+		case Gorcl:
+		case Gotcl:
+		case Torl:
+		case Totl:
+			/* low word zero-extends; high word comes from the next register */
+			ruvl = r;
+			ruvl += (uvlong)csr32r(ctlr, Statistics+(i+1)*4) << 32;
+			tuvl = ruvl;
+			tuvl += ctlr->statistics[i];
+			tuvl += (uvlong)ctlr->statistics[i+1] << 32;
+			if(tuvl == 0)
+				continue;
+			ctlr->statistics[i] = tuvl;
+			ctlr->statistics[i+1] = tuvl >> 32;
+			p = seprint(p, e, "%s: %llud %llud\n", stat, tuvl, ruvl);
+			i++;	/* skip the high-word slot just consumed */
+			break;
+
+		default:
+			ctlr->statistics[i] += r;
+			if(ctlr->statistics[i] == 0)
+				continue;
+			p = seprint(p, e, "%s: %ud %ud\n", stat,
+				ctlr->statistics[i], r);
+			break;
+		}
+	}
+
+	p = seprint(p, e, "lintr: %ud %ud\n", ctlr->lintr, ctlr->lsleep);
+	p = seprint(p, e, "rintr: %ud %ud\n", ctlr->rintr, ctlr->rsleep);
+	p = seprint(p, e, "tintr: %ud %ud\n", ctlr->tintr, ctlr->txdw);
+	p = seprint(p, e, "ixcs: %ud %ud %ud\n", ctlr->ixsm, ctlr->ipcs, ctlr->tcpcs);
+	p = seprint(p, e, "rdtr: %ud\n", ctlr->rdtr);
+	p = seprint(p, e, "radv: %ud\n", ctlr->radv);
+	p = seprint(p, e, "ctrl: %.8ux\n", csr32r(ctlr, Ctrl));
+	p = seprint(p, e, "ctrlext: %.8ux\n", csr32r(ctlr, Ctrlext));
+	p = seprint(p, e, "status: %.8ux\n", csr32r(ctlr, Status));
+	p = seprint(p, e, "txcw: %.8ux\n", csr32r(ctlr, Txcw));
+	p = seprint(p, e, "txdctl: %.8ux\n", csr32r(ctlr, Txdctl));
+	p = seprint(p, e, "pba: %.8ux\n", ctlr->pba);
+
+	p = seprint(p, e, "speeds: 10:%ud 100:%ud 1000:%ud ?:%ud\n",
+		ctlr->speeds[0], ctlr->speeds[1], ctlr->speeds[2], ctlr->speeds[3]);
+	p = seprint(p, e, "type: %s\n", cname(ctlr));
+
+//	p = seprint(p, e, "eeprom:");
+//	for(i = 0; i < 0x40; i++){
+//		if(i && ((i & 7) == 0))
+//			p = seprint(p, e, "\n       ");
+//		p = seprint(p, e, " %4.4ux", ctlr->eeprom[i]);
+//	}
+//	p = seprint(p, e, "\n");
+
+	USED(p);
+	n = readstr(offset, a, n, s);
+	free(s);
+	qunlock(&ctlr->slock);
+
+	return n;
+}
+
+/*
+ * Turn promiscuous reception on or off.  arg is the Ether.
+ * Rctl is rewritten with the MoMASK field cleared and the
+ * Upe and Mpe bits both set (on) or both cleared (off).
+ */
+static void
+i82563promiscuous(void* arg, int on)
+{
+	Ctlr *ctlr;
+	int v;
+
+	ctlr = ((Ether*)arg)->ctlr;
+
+	v = csr32r(ctlr, Rctl) & ~MoMASK;
+	if(!on)
+		v &= ~(Upe|Mpe);
+	else
+		v |= Upe|Mpe;
+	csr32w(ctlr, Rctl, v);
+}
+
+/*
+ * Add a multicast address to the hardware filter.  arg is the Ether.
+ * The table word is chosen by addr[5]>>1 (reduced to 5 bits on the
+ * i82566 and i82567) and the bit within it by the low bit of addr[5]
+ * together with the top nibble of addr[4].
+ *
+ * Several ether addresses can hash to the same filter bit, so a bit
+ * is never cleared here: with on == 0 the cached word is simply
+ * written back unchanged.  Clearing safely would need a record of
+ * every multicast address in use, so that the table could be zeroed
+ * and rebuilt from the addresses still wanted.
+ */
+static void
+i82563multicast(void* arg, uchar* addr, int on)
+{
+	Ctlr *ctlr;
+	int word, shift;
+
+	ctlr = ((Ether*)arg)->ctlr;
+
+	word = addr[5]>>1;
+	if(ctlr->type == i82566 || ctlr->type == i82567)
+		word &= 31;
+	shift = ((addr[5] & 1)<<4)|(addr[4]>>4);
+
+	if(on)
+		ctlr->mta[word] |= 1<<shift;
+
+	csr32w(ctlr, Mta+word*4, ctlr->mta[word]);
+}
+
+/*
+ * Take one Block off the head of the shared receive-buffer pool,
+ * under i82563rblock.  Returns nil when the pool is empty.
+ * The returned Block has its next pointer cleared.
+ */
+static Block*
+i82563rballoc(void)
+{
+	Block *head;
+
+	ilock(&i82563rblock);
+	head = i82563rbpool;
+	if(head != nil){
+		i82563rbpool = head->next;
+		head->next = nil;
+	}
+	iunlock(&i82563rblock);
+
+	return head;
+}
+
+/*
+ * Return a Block to the shared receive-buffer pool.  Its read and
+ * write pointers are reset to base rounded up to PGSZ and the
+ * checksum flags are cleared before it is pushed on the pool head
+ * under i82563rblock.
+ */
+static void
+i82563rbfree(Block* b)
+{
+	uchar *start;
+
+	start = (uchar*)ROUNDUP((uintptr)b->base, PGSZ);
+	b->wp = start;
+	b->rp = start;
+	b->flag &= ~(Bipck | Budpck | Btcpck | Bpktck);
+
+	ilock(&i82563rblock);
+	b->next = i82563rbpool;
+	i82563rbpool = b;
+	iunlock(&i82563rblock);
+}
+
+static void
+i82563im(Ctlr* ctlr, int im)
+{
+	ilock(&ctlr->imlock);		/* ctlr->im and the Ims write are updated together under imlock */
+	ctlr->im |= im;			/* add the requested bits to the software copy of the mask */
+	csr32w(ctlr, Ims, ctlr->im);	/* write the whole accumulated mask to Ims */
+	iunlock(&ctlr->imlock);
+}
+
+static void
+i82563txinit(Ctlr* ctlr)
+{
+	int i, r;
+	Block *bp;
+	/*
+	 * Program the transmit side: Tctl, inter-packet gap, descriptor
+	 * ring base/length/head/tail, interrupt delay timers and Txdctl
+	 * thresholds.  Any Blocks still held in tb[] are freed and every
+	 * descriptor is zeroed.  Ten is set in Tctl before returning.
+	 */
+	if(cttab[ctlr->type].flag & F75)
+		csr32w(ctlr, Tctl, 0x0F<<CtSHIFT | Psp);	/* F75 parts: no collision distance, no Mulr */
+	else
+		csr32w(ctlr, Tctl, 0x0F<<CtSHIFT | Psp | 66<<ColdSHIFT | Mulr);
+	csr32w(ctlr, Tipg, 6<<20 | 8<<10 | 8);		/* yb sez: 0x702008 */
+	csr32w(ctlr, Tdbal, PCIWADDR(ctlr->tdba));
+	csr32w(ctlr, Tdbah, 0);				/* only the low 32 address bits are programmed */
+	csr32w(ctlr, Tdlen, ctlr->ntd * sizeof(Td));
+	ctlr->tdh = PREV(0, ctlr->ntd);			/* software head; i82563cleanup examines Next(tdh) first */
+	csr32w(ctlr, Tdh, 0);
+	ctlr->tdt = 0;
+	csr32w(ctlr, Tdt, 0);
+	for(i = 0; i < ctlr->ntd; i++){
+		if((bp = ctlr->tb[i]) != nil){		/* drop any Block still attached to this slot */
+			ctlr->tb[i] = nil;	
+			freeb(bp);
+		}
+		memset(&ctlr->tdba[i], 0, sizeof(Td));
+	}
+	csr32w(ctlr, Tidv, 128);
+	csr32w(ctlr, Tadv, 64);
+	r = csr32r(ctlr, Tctl);
+	r |= Ten;					/* enable the transmitter */
+	csr32w(ctlr, Tctl, r);
+	r = csr32r(ctlr, Txdctl);
+	r &= ~(WthreshMASK|PthreshMASK);
+	r |= 4<<WthreshSHIFT | 4<<PthreshSHIFT;		/* writeback and prefetch thresholds both 4 */
+	if(cttab[ctlr->type].flag & F75)
+		r |= Qenable;
+	csr32w(ctlr, Txdctl, r);
+}
+
+#define Next(x, m)	(((x)+1) & (m))	/* ring index after x; m is the ring size minus one (callers pass ntd-1 or nrd-1) */
+
+/*
+ * Reclaim transmit descriptors the hardware has marked done (Tdd).
+ * Starting after c->tdh, each completed slot has its Block freed
+ * and its status cleared; a completed slot with no Block attached
+ * is reported with iprint.  Stores and returns the new c->tdh,
+ * the index of the last slot reclaimed.
+ */
+static int
+i82563cleanup(Ctlr *c)
+{
+	Block *done;
+	int head, mask, nxt;
+
+	mask = c->ntd-1;
+	head = c->tdh;
+	for(;;){
+		nxt = Next(head, mask);
+		if((c->tdba[nxt].status & Tdd) == 0)
+			break;
+		head = nxt;
+		done = c->tb[head];
+		if(done == nil)
+			iprint("82563 tx underrun!\n");
+		else{
+			c->tb[head] = nil;
+			freeb(done);
+		}
+		c->tdba[head].status = 0;
+	}
+
+	c->tdh = head;
+	return head;
+}
+
+/*
+ * Predicate on a Ctlr passed as void*: true when the Txdw bit
+ * is not set in the software interrupt mask c->im.
+ */
+static int
+notrim(void *v)
+{
+	return (((Ctlr*)v)->im & Txdw) == 0;
+}
+
+/*
+ * Move packets from the output queue onto the transmit ring.
+ * Under ctlr->tlock: completed descriptors are reclaimed first,
+ * then Blocks are taken from edev->oq until either the queue is
+ * empty or the ring is full.  A full ring bumps ctlr->txdw and
+ * enables the Txdw interrupt.  The hardware tail register is
+ * written only if at least one descriptor was queued.
+ */
+static void
+i82563transmit(Ether* edev)
+{
+	Ctlr *ctlr;
+	Block *pkt;
+	Td *desc;
+	int head, tail, mask;
+
+	ctlr = edev->ctlr;
+
+	qlock(&ctlr->tlock);
+
+	/* free whatever the hardware has finished with */
+	head = i82563cleanup(ctlr);
+
+	/* then refill the ring from the output queue */
+	mask = ctlr->ntd-1;
+	tail = ctlr->tdt;
+	for(;;){
+		if(Next(tail, mask) == head){
+			/* ring full: ask to be told when descriptors are written back */
+			ctlr->txdw++;
+			i82563im(ctlr, Txdw);
+			break;
+		}
+		pkt = qget(edev->oq);
+		if(pkt == nil)
+			break;
+		desc = &ctlr->tdba[tail];
+		desc->addr[0] = PCIWADDR(pkt->rp);
+		desc->control = Ide|Rs|Ifcs|Teop|BLEN(pkt);
+		ctlr->tb[tail] = pkt;
+		tail = Next(tail, mask);
+	}
+
+	if(tail != ctlr->tdt){
+		ctlr->tdt = tail;
+		csr32w(ctlr, Tdt, tail);
+	}
+	qunlock(&ctlr->tlock);
+}
+
+/*
+ * Attach fresh Blocks to free receive descriptors, from ctlr->rdt
+ * up to (but not including) the slot before ctlr->rdh, then publish
+ * the new tail to the hardware through Rdt.
+ *
+ * Stops early if a slot unexpectedly still holds a Block (reported
+ * as an rx overrun; the message previously said "tx", which was
+ * misleading since this is the receive ring) or if the shared
+ * buffer pool is empty.
+ */
+static void
+i82563replenish(Ctlr* ctlr)
+{
+	Rd *rd;
+	int rdt, m;
+	Block *bp;
+
+	rdt = ctlr->rdt;
+	m = ctlr->nrd-1;
+	while(Next(rdt, m) != ctlr->rdh){
+		rd = &ctlr->rdba[rdt];
+		if(ctlr->rb[rdt] != nil){
+			iprint("82563: rx overrun\n");
+			break;
+		}
+		bp = i82563rballoc();
+		if(bp == nil){
+			vlong now;
+			static vlong lasttime;
+
+			/*
+			 * don't flood the console: the message is printed
+			 * only when the previous exhaustion was more than
+			 * 2000ms ago; lasttime is updated on every failure.
+			 */
+			now = tk2ms(sys->ticks);
+			if (now - lasttime > 2000)
+				iprint("#l%d: 82563: all %d rx buffers in use\n",
+					ctlr->edev->ctlrno, ctlr->nrb);
+			lasttime = now;
+			break;
+		}
+		ctlr->rb[rdt] = bp;
+		rd->addr[0] = PCIWADDR(bp->rp);
+//		rd->addr[1] = 0;
+		rd->status = 0;
+		ctlr->rdfree++;
+		rdt = Next(rdt, m);
+	}
+	ctlr->rdt = rdt;
+	csr32w(ctlr, Rdt, rdt);
+}
+
+static void
+i82563rxinit(Ctlr* ctlr)
+{
+	Block *bp;
+	int i, r, rctl;
+	/*
+	 * Program the receive side: buffer sizing (Rctl, plus Srrctl and
+	 * Rmpl on F75 parts with buffers over 2048 bytes), the descriptor
+	 * ring base/length/head/tail, the interrupt delay timers, and the
+	 * Rxdctl thresholds.  Blocks still held in rb[] are freed and the
+	 * ring is refilled with i82563replenish.  The final Rctl value is
+	 * written without Ren; i82563rproc sets Ren after calling this.
+	 */
+	i = ctlr->rbsz / 1024;		/* i = rbsz in KB, rounded up */
+	if(ctlr->rbsz % 1024)
+		i++;
+
+	if(ctlr->rbsz <= 2048 || (cttab[ctlr->type].flag & F75)){
+		if(ctlr->rbsz > 2048){	/* only reachable for F75 parts */
+			if(ctlr->type != i82575)
+				i |= (ctlr->nrd/2>>4)<<20;		/* RdmsHalf */
+			csr32w(ctlr, Srrctl, i | Dropen);
+			csr32w(ctlr, Rmpl, ctlr->rbsz);
+//			csr32w(ctlr, Drxmxod, 0x7ff);
+		}
+		rctl = Dpf|Bsize2048|Bam|RdtmsHALF;
+	}else if(ctlr->rbsz <= 8192){
+		rctl = Lpe|Dpf|Bsize8192|Bsex|Bam|RdtmsHALF|Secrc;
+	}else{
+		rctl = Lpe|Dpf|BsizeFlex*i|Bam|RdtmsHALF|Secrc;	/* flexible size: i KB */
+	}
+
+	if(ctlr->type == i82575 || ctlr->type == i82576){
+		/*
+		 * Setting Qenable in Rxdctl does not
+		 * appear to stick unless Ren is on.
+		 */
+		csr32w(ctlr, Rctl, Ren|rctl);
+		r = csr32r(ctlr, Rxdctl);
+		r |= Qenable;
+		csr32w(ctlr, Rxdctl, r);
+	}
+	csr32w(ctlr, Rctl, rctl);	/* Ren is not included here */
+
+	if(cttab[ctlr->type].flag & Fert)
+		csr32w(ctlr, Ert, 1024/8);
+
+	if(ctlr->type == i82566 || ctlr->type == i82567)
+		csr32w(ctlr, Pbs, 16);
+
+	csr32w(ctlr, Rdbal, PCIWADDR(ctlr->rdba));
+	csr32w(ctlr, Rdbah, 0);		/* only the low 32 address bits are programmed */
+	csr32w(ctlr, Rdlen, ctlr->nrd * sizeof(Rd));
+	ctlr->rdh = 0;
+	csr32w(ctlr, Rdh, 0);
+	ctlr->rdt = 0;
+	csr32w(ctlr, Rdt, 0);
+	/* keep interrupt moderation, our network is just crazy */
+	ctlr->rdtr = 25;		/* µs */
+	ctlr->radv = 500;		/* µs */
+	csr32w(ctlr, Rdtr, ctlr->rdtr);
+	csr32w(ctlr, Radv, ctlr->radv);
+
+	for(i = 0; i < ctlr->nrd; i++)	/* release Blocks left on the ring */
+		if((bp = ctlr->rb[i]) != nil){
+			ctlr->rb[i] = nil;
+			freeb(bp);
+		}
+
+	i82563replenish(ctlr);
+
+	if(cttab[ctlr->type].flag & F75)
+		csr32w(ctlr, Rxdctl, 1<<WthreshSHIFT | 8<<PthreshSHIFT | 1<<HthreshSHIFT | Qenable);
+	else
+		csr32w(ctlr, Rxdctl, 2<<WthreshSHIFT | 2<<PthreshSHIFT);
+
+	/*
+	 * Don't enable checksum offload.  In practice, it interferes with
+	 * tftp booting on at least in the 82575.
+	 */
+//	csr32w(ctlr, Rxcsum, Tuofl | Ipofl | ETHERHDRSIZE<<PcssSHIFT);
+	csr32w(ctlr, Rxcsum, 0);
+}
+
+/*
+ * Sleep condition for the receive loop: true once ctlr->rim
+ * is non-zero.
+ */
+static int
+i82563rim(void* ctlr)
+{
+	Ctlr *c;
+
+	c = ctlr;
+	return c->rim != 0;
+}
+
+static void
+i82563rproc(void* arg)
+{
+	Rd *rd;
+	Block *bp;
+	Ctlr *ctlr;
+	int r, m, rdh, rim, im;
+	Ether *edev;
+	/*
+	 * Receive loop; never returns.  arg is the Ether.
+	 * Initialises and enables the receiver, then repeatedly enables
+	 * the receive interrupt bits, sleeps until ctlr->rim is non-zero
+	 * (it is set outside this function, presumably by the interrupt
+	 * handler; confirm there), and drains every descriptor marked Rdd.
+	 */
+	edev = arg;
+	ctlr = edev->ctlr;
+
+	i82563rxinit(ctlr);
+	r = csr32r(ctlr, Rctl);
+	r |= Ren;			/* rxinit left the receiver disabled */
+	csr32w(ctlr, Rctl, r);
+	if(cttab[ctlr->type].flag & F75){
+		r = csr32r(ctlr, Rxdctl);
+		r |= Qenable;
+		csr32w(ctlr, Rxdctl, r);
+	}
+	m = ctlr->nrd-1;		/* ring index mask for Next */
+
+	im = Rxt0|Rxo|Rxdmt0|Rxseq|Ack;
+	for(;;){
+		i82563im(ctlr, im);
+		ctlr->rsleep++;
+//		coherence();
+		sleep(&ctlr->rrendez, i82563rim, ctlr);
+
+		rdh = ctlr->rdh;
+		for(;;){
+			rd = &ctlr->rdba[rdh];
+			rim = ctlr->rim;	/* snapshot, then clear, the pending causes */
+			ctlr->rim = 0;
+			if(!(rd->status & Rdd))
+				break;
+
+			/*
+			 * Accept eop packets with no errors.
+			 * With no errors and the Ixsm bit clear,
+			 * the descriptor status Tcpcs and Ipcs bits give
+			 * an indication of whether the checksums were
+			 * calculated and valid.
+			 */
+			bp = ctlr->rb[rdh];
+			if((rd->status & Reop) && rd->errors == 0){
+				bp->wp += rd->length;
+				/* bp->lim = bp->wp;	lie like a dog.  avoid packblock. */
+				if(!(rd->status & Ixsm)){
+					ctlr->ixsm++;
+					if(rd->status & Ipcs){
+						/*
+						 * IP checksum calculated
+						 * (and valid as errors == 0).
+						 */
+						ctlr->ipcs++;
+						bp->flag |= Bipck;
+					}
+					if(rd->status & Tcpcs){
+						/*
+						 * TCP/UDP checksum calculated
+						 * (and valid as errors == 0).
+						 */
+						ctlr->tcpcs++;
+						bp->flag |= Btcpck|Budpck;
+					}
+					bp->checksum = rd->checksum;
+					bp->flag |= Bpktck;
+				}
+				etheriq(edev, bp, 1);
+			} else {
+				/* not end-of-packet, or errored: report errors and drop */
+				if (rd->status & Reop && rd->errors)
+					print("%s: input packet error %#ux\n",
+						cname(ctlr), rd->errors);
+				freeb(bp);
+			}
+			ctlr->rb[rdh] = nil;
+			rd->status = 0;
+			ctlr->rdfree--;
+			ctlr->rdh = rdh = Next(rdh, m);
+			/* refill once 32 or more slots lack a buffer, or on Rxdmt0 */
+			if(ctlr->nrd-ctlr->rdfree >= 32 || (rim & Rxdmt0))
+				i82563replenish(ctlr);
+		}
+	}
+}
+
+/*
+ * sleep() condition for the link kprocs: true once
+ * i82563interrupt has recorded a link status change in ->lim.
+ */
+static int
+i82563lim(void* ctrl)
+{
+	Ctlr *c;
+
+	c = ctrl;
+	if(c->lim == 0)
+		return 0;
+	return 1;
+}
+/*
+ * Value stored in Ether.mbps for each speed index computed by the
+ * link kprocs; they select index 3 (0) when there is no link.
+ */
+static int speedtab[] = {
+	10, 100, 1000, 0
+};
+
+/*
+ * Read phy register reg of phy phyno through the Mdic register.
+ * Polls up to 64 times, 1µs apart, for ready or error.
+ * Returns the low 16 bits of Mdic on success, ~0 on failure.
+ */
+static uint
+phyread(Ctlr *c, int phyno, int reg)
+{
+	uint mdic, tries;
+
+	csr32w(c, Mdic, MDIrop | phyno<<MDIpSHIFT | reg<<MDIrSHIFT);
+	mdic = 0;
+	tries = 0;
+	while(tries < 64){
+		mdic = csr32r(c, Mdic);
+		if((mdic & (MDIe|MDIready)) != 0)
+			break;
+		microdelay(1);
+		tries++;
+	}
+	/* success means ready is set and error is clear */
+	if((mdic & (MDIe|MDIready)) == MDIready)
+		return mdic & 0xffff;
+	print("%s: phy %d wedged %.8ux\n", cname(c), phyno, mdic);
+	return ~0;
+}
+
+/*
+ * Write val to phy register reg of phy phyno through Mdic,
+ * without any page selection.  Polls up to 64 times, 1µs apart.
+ * Returns 0 on success, ~0 on error or timeout.
+ */
+static uint
+phywrite0(Ctlr *c, int phyno, int reg, u16int val)
+{
+	uint mdic, tries;
+
+	csr32w(c, Mdic, MDIwop | phyno<<MDIpSHIFT | reg<<MDIrSHIFT | val);
+	mdic = 0;
+	tries = 0;
+	while(tries < 64){
+		mdic = csr32r(c, Mdic);
+		if((mdic & (MDIe|MDIready)) != 0)
+			break;
+		microdelay(1);
+		tries++;
+	}
+	if((mdic & (MDIe|MDIready)) == MDIready)
+		return 0;
+	return ~0;
+}
+
+/*
+ * Select phy page p before accessing register r.
+ * Only the i82563 has paged registers here: registers 16-28
+ * (except 22) go through Phypage, 30 and 31 through Phyapage,
+ * and all others need no page write.  On every other controller
+ * type only page 0 is accepted.
+ * Returns 0 on success (or nothing to do), ~0 on failure.
+ */
+static uint
+setpage(Ctlr *c, uint phyno, uint p, uint r)
+{
+	uint pagereg;
+
+	if(c->type != i82563){
+		if(p == 0)
+			return 0;
+		return ~0;
+	}
+	if(r == 30 || r == 31)
+		pagereg = Phyapage;
+	else if(r >= 16 && r <= 28 && r != 22)
+		pagereg = Phypage;
+	else
+		return 0;
+	return phywrite0(c, phyno, pagereg, p);
+}
+
+/*
+ * Write v to a possibly paged phy register: the page is taken
+ * from reg>>8 and the register number from the low byte.
+ * Panics if the page cannot be selected; otherwise returns the
+ * result of phywrite0.
+ */
+static uint
+phywrite(Ctlr *c, uint phyno, uint reg, u16int v)
+{
+	uint page, off;
+
+	page = reg>>8;
+	off = reg & 0xff;
+	if(setpage(c, phyno, page, off) == ~0)
+		panic("%s: bad phy reg %.4ux", cname(c), reg);
+	return phywrite0(c, phyno, off, v);
+}
+
+/*
+ * Called from phylproc for the i82563 after e->mbps is updated.
+ * The first time the speed is seen as 0, try a phy port reset;
+ * c->phyerrata stops the reset being repeated until a non-zero
+ * speed has been seen again.
+ */
+static void
+phyerrata(Ether *e, Ctlr *c)
+{
+	if(e->mbps != 0){
+		c->phyerrata = 0;
+		return;
+	}
+	if(c->phyerrata != 0)
+		return;
+	c->phyerrata++;
+	phywrite(c, 1, Phyprst, Prst);	/* try a port reset */
+	print("%s: phy port reset\n", cname(c));
+}
+
+/*
+ * watch for changes of link state
+ *
+ * Link kproc for controllers whose phy is reached through Mdic.
+ * v is the Ether* handed to kproc by i82563attach.  Each pass
+ * reads the phy status register, derives the speed index and
+ * link bit, restarts autonegotiation when the type-specific
+ * check asks for it, publishes e->link and e->mbps, then re-arms
+ * the Lsc interrupt and sleeps until i82563interrupt posts it.
+ */
+static void
+phylproc(void *v)
+{
+	uint a, i, phy, r, phyno, phystat, link;
+	Ctlr *c;
+	Ether *e;
+
+	e = v;
+	c = e->ctlr;
+	link = Rtlink;
+
+	/* i82573: enable the phy's own interrupt sources, if readable */
+	if(c->type == i82573 && (phy = phyread(c, 1, Phyier)) != ~0)
+		phywrite(c, 1, Phyier, phy | Lscie | Ancie | Spdie | Panie);
+
+	phyno = 1;
+	if(c->type == i82579)
+		phyno = 2;
+
+	/* i82579 and i82580 use a different status register and link bit */
+	phystat = Physsr;
+	if(c->type == i82579 || c->type == i82580){
+		phystat = Phystat;
+		link = Link;
+	}
+
+	for(;;){
+		phy = phyread(c, phyno, phystat);
+		if(phy == ~0)
+			goto next;
+		/* speed index lives in different bits of the two status registers */
+		if(c->type == i82579 || c->type == i82580)
+			i = (phy>>8) & 3;
+		else
+			i = (phy>>14) & 3;
+		/* a != 0 requests an autonegotiation restart below */
+		switch(c->type){
+		default:
+			a = 0;
+			break;
+		case i82579:
+		case i82580:
+			a = phy & Ans;
+			break;
+		case i82563:
+		case i82578:
+		case i82578m:
+		case i82583:
+			a = phyread(c, phyno, Phyisr) & Ane;
+			break;
+		case i82571:
+		case i82572:
+		case i82575:
+		case i82576:
+			a = phyread(c, phyno, Phylhr) & Anf;
+			i = (i-1) & 3;
+			break;
+		}
+		if(a){
+			/*
+			 * phyread returns ~0 on failure; writing that
+			 * back would set every bit of the phy control
+			 * register, so skip the restart in that case.
+			 */
+			r = phyread(c, phyno, Phyctl);
+			if(r != ~0)
+				phywrite(c, phyno, Phyctl, r | Ran | Ean);
+		}
+		e->link = (phy & link) != 0;
+		if(e->link == 0)
+			i = 3;		/* speedtab[3] is 0: no link */
+		c->speeds[i]++;
+		e->mbps = speedtab[i];
+		if(c->type == i82563)
+			phyerrata(e, c);
+next:
+		/* clear the posted cause, unmask Lsc and wait for the next change */
+		c->lim = 0;
+		i82563im(c, Lsc);
+		c->lsleep++;
+		sleep(&c->lrendez, i82563lim, c);
+	}
+}
+
+/*
+ * watch for changes of link state, pcs version
+ *
+ * Link kproc that reads link state from the Pcsstat register.
+ * When the link is down and Anbad is set, autonegotiation is
+ * restarted through Pcsctl.
+ */
+static void
+pcslproc(void *v)
+{
+	uint spd, stat;
+	Ctlr *c;
+	Ether *e;
+
+	e = v;
+	c = e->ctlr;
+
+	for(;;){
+		stat = csr32r(c, Pcsstat);
+		e->link = stat & Linkok;
+		if(e->link)
+			spd = (stat & 6) >> 1;
+		else{
+			spd = 3;	/* speedtab[3] is 0: no link */
+			if(stat & Anbad)
+				csr32w(c, Pcsctl, csr32r(c, Pcsctl) | Pan | Prestart);
+		}
+		c->speeds[spd]++;
+		e->mbps = speedtab[spd];
+
+		/* clear the posted cause, unmask Lsc and wait for the next change */
+		c->lim = 0;
+		i82563im(c, Lsc);
+		c->lsleep++;
+		sleep(&c->lrendez, i82563lim, c);
+	}
+}
+
+/*
+ * watch for changes of link state, serdes version
+ *
+ * Link kproc that takes link state from bit 31 of Rxcw.
+ * Txcw is read as well but its value is not used.
+ * A link that is up is always reported as speedtab[2].
+ */
+static void
+serdeslproc(void *v)
+{
+	uint spd, txcw, rxcw;
+	Ctlr *c;
+	Ether *e;
+
+	e = v;
+	c = e->ctlr;
+
+	for(;;){
+		rxcw = csr32r(c, Rxcw);
+		txcw = csr32r(c, Txcw);
+		USED(txcw);
+		/* alternative left disabled by the author: (csr32r(c, Status) & Lu) != 0 */
+		e->link = (rxcw & 1<<31) != 0;
+		if(e->link)
+			spd = 2;
+		else
+			spd = 3;
+		c->speeds[spd]++;
+		e->mbps = speedtab[spd];
+
+		/* clear the posted cause, unmask Lsc and wait for the next change */
+		c->lim = 0;
+		i82563im(c, Lsc);
+		c->lsleep++;
+		sleep(&c->lrendez, i82563lim, c);
+	}
+}
+
+/*
+ * Transmit kproc: each time it is woken on trendez (the interrupt
+ * handler does so on Txdw) it calls i82563transmit.
+ * v is the Ether* handed to kproc by i82563attach.
+ */
+static void
+i82563tproc(void *v)
+{
+	Ether *edev;
+	Ctlr *ctlr;
+
+	edev = v;
+	ctlr = edev->ctlr;
+	for(;;){
+		sleep(&ctlr->trendez, return0, 0);
+		i82563transmit(edev);
+	}
+}
+
+/*
+ * Attach entry point (installed as edev->attach by pnp).
+ * Under ctlr->alock, and only on the first call: allocate the
+ * descriptor rings and block arrays, prime the receive block
+ * pool, start the link, receive and transmit kprocs and
+ * initialise the transmitter.  On error everything allocated
+ * here is released and the error is re-raised.
+ */
+static void
+i82563attach(Ether* edev)
+{
+	char name[KNAMELEN];
+	Block *bp;
+	Ctlr *ctlr;
+
+	ctlr = edev->ctlr;
+	qlock(&ctlr->alock);
+	/* already attached: nothing to do */
+	if(ctlr->attached){
+		qunlock(&ctlr->alock);
+		return;
+	}
+
+	ctlr->nrd = Nrd;
+	ctlr->ntd = Ntd;
+
+	if(waserror()){
+		/*
+		 * Drain the receive blocks counted in nrb; clearing
+		 * bp->free presumably makes freeb release the block
+		 * instead of returning it to the driver pool -
+		 * confirm against i82563rbfree.
+		 */
+		while(ctlr->nrb > 0){
+			bp = i82563rballoc();
+			bp->free = nil;
+			freeb(bp);
+			ctlr->nrb--;
+		}
+		free(ctlr->tb);
+		ctlr->tb = nil;
+		free(ctlr->rb);
+		ctlr->rb = nil;
+		free(ctlr->tdba);
+		ctlr->tdba = nil;
+		free(ctlr->rdba);
+		ctlr->rdba = nil;
+		qunlock(&ctlr->alock);
+		nexterror();
+	}
+
+	/* descriptor rings are 128-byte aligned */
+	if((ctlr->rdba = mallocalign(ctlr->nrd*sizeof(Rd), 128, 0, 0)) == nil ||
+	   (ctlr->tdba = mallocalign(ctlr->ntd*sizeof(Td), 128, 0, 0)) == nil ||
+	   (ctlr->rb = malloc(ctlr->nrd*sizeof(Block*))) == nil ||
+	   (ctlr->tb = malloc(ctlr->ntd*sizeof(Block*))) == nil)
+		error(Enomem);
+
+	/* allocate up to Nrb receive blocks and hand each to i82563rbfree via freeb */
+	for(ctlr->nrb = 0; ctlr->nrb < Nrb; ctlr->nrb++){
+		if((bp = allocb(ctlr->rbsz + PGSZ)) == nil)
+			break;
+		bp->free = i82563rbfree;
+		freeb(bp);
+	}
+
+	ctlr->attached = 1;
+
+	/* pick the link kproc matching how the link is reported */
+	snprint(name, sizeof name, "#l%dl", edev->ctlrno);
+	if((csr32r(ctlr, Ctrlext) & Linkmode) == Serdes)
+		kproc(name, pcslproc, edev);		/* phy based serdes */
+	else if(csr32r(ctlr, Status) & Tbimode)
+		kproc(name, serdeslproc, edev);		/* mac based serdes */
+	else if(ctlr->type == i82579 || ctlr->type == i82580)
+		kproc(name, phylproc, edev);
+
+	snprint(name, sizeof name, "#l%dr", edev->ctlrno);
+	kproc(name, i82563rproc, edev);
+
+	snprint(name, sizeof name, "#l%dt", edev->ctlrno);
+	kproc(name, i82563tproc, edev);
+
+	i82563txinit(ctlr);
+
+	qunlock(&ctlr->alock);
+	poperror();
+}
+
+/*
+ * Interrupt handler; arg is the Ether*.
+ * Under imlock: mask all interrupts, then read Icr until no
+ * enabled cause remains.  Each cause seen is recorded for and/or
+ * wakes the matching kproc and is removed from the interrupt
+ * mask; the link and receive kprocs unmask their causes again
+ * through i82563im before sleeping.
+ */
+static void
+i82563interrupt(Ureg*, void* arg)
+{
+	Ctlr *ctlr;
+	Ether *edev;
+	int icr, im;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+
+	ilock(&ctlr->imlock);
+	csr32w(ctlr, Imc, ~0);
+	im = ctlr->im;
+
+	while(icr = csr32r(ctlr, Icr) & ctlr->im){
+		/* link status change: post to the link kproc */
+		if(icr & Lsc){
+			im &= ~Lsc;
+			ctlr->lim = icr & Lsc;
+			wakeup(&ctlr->lrendez);
+			ctlr->lintr++;
+		}
+		/* receive causes: post to the receive kproc */
+		if(icr & (Rxt0|Rxo|Rxdmt0|Rxseq|Ack)){
+			ctlr->rim = icr & (Rxt0|Rxo|Rxdmt0|Rxseq|Ack);
+			im &= ~(Rxt0|Rxo|Rxdmt0|Rxseq|Ack);
+			wakeup(&ctlr->rrendez);
+			ctlr->rintr++;
+		}
+		/* transmit descriptor written back: wake the transmit kproc */
+		if(icr & Txdw){
+			im &= ~Txdw;
+			ctlr->tintr++;
+			wakeup(&ctlr->trendez);
+		}
+	}
+
+	/* remember the reduced mask and enable only what is left */
+	ctlr->im = im;
+	csr32w(ctlr, Ims, im);
+	iunlock(&ctlr->imlock);
+}
+
+/*
+ * assume misrouted interrupts and check all controllers
+ *
+ * The controller list holds every device found by i82563pci,
+ * but ctlr->edev is only set by pnp() once a controller has been
+ * set up and bound to an Ether.  Skip controllers that were never
+ * bound: i82563interrupt dereferences its argument.
+ */
+static void
+i82575interrupt(Ureg*, void *)
+{
+	Ctlr *ctlr;
+
+	for(ctlr = i82563ctlrhead; ctlr != nil; ctlr = ctlr->next){
+		if(ctlr->edev == nil)
+			continue;
+		i82563interrupt(nil, ctlr->edev);
+	}
+}
+
+/*
+ * Quiesce and reset the controller.
+ * Returns 0 on success, -1 if the device reset, the EEPROM reset
+ * or the draining of pending interrupt causes does not complete
+ * within about a second each.
+ */
+static int
+i82563detach(Ctlr* ctlr)
+{
+	int r, timeo;
+
+	/*
+	 * Perform a device reset to get the chip back to the
+	 * power-on state, followed by an EEPROM reset to read
+	 * the defaults for some internal registers.
+	 */
+	csr32w(ctlr, Imc, ~0);
+	csr32w(ctlr, Rctl, 0);
+	csr32w(ctlr, Tctl, csr32r(ctlr, Tctl) & ~Ten);
+
+	delay(10);
+
+	/* these types get a phy reset along with the device reset */
+	r = csr32r(ctlr, Ctrl);
+	if(ctlr->type == i82566 || ctlr->type == i82567 || ctlr->type == i82579)
+		r |= Phyrst;
+	csr32w(ctlr, Ctrl, Devrst | r);
+	delay(1);
+	/* wait up to 1000 polls, 1ms apart, for the reset bits to clear */
+	for(timeo = 0;; timeo++){
+		if((csr32r(ctlr, Ctrl) & (Devrst|Phyrst)) == 0)
+			break;
+		if(timeo >= 1000)
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Ctrl) & (Devrst|Phyrst))
+		return -1;
+
+	/* EEPROM reset, same timeout scheme */
+	r = csr32r(ctlr, Ctrlext);
+	csr32w(ctlr, Ctrlext, r|Eerst);
+	delay(1);
+	for(timeo = 0; timeo < 1000; timeo++){
+		if(!(csr32r(ctlr, Ctrlext) & Eerst))
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Ctrlext) & Eerst)
+		return -1;
+
+	/* mask everything again and wait for Icr to read clear, ignoring Rxcfg */
+	csr32w(ctlr, Imc, ~0);
+	delay(1);
+	for(timeo = 0; timeo < 1000; timeo++){
+		if((csr32r(ctlr, Icr) & ~Rxcfg) == 0)
+			break;
+		delay(1);
+	}
+	if(csr32r(ctlr, Icr) & ~Rxcfg)
+		return -1;
+	/* balance rx/tx packet buffer; survives reset */
+	if(ctlr->rbsz > 8192 && cttab[ctlr->type].flag & Fpba){
+		/* new value is the average of the two 16-bit halves of Pba */
+		ctlr->pba = csr32r(ctlr, Pba);
+		r = ctlr->pba >> 16;
+		r += ctlr->pba & 0xffff;
+		r >>= 1;
+		csr32w(ctlr, Pba, r);
+	}else if(ctlr->type == i82573 && ctlr->rbsz > 1514)
+		csr32w(ctlr, Pba, 14);
+	ctlr->pba = csr32r(ctlr, Pba);
+
+	/* finish by setting Slu in Ctrl */
+	r = csr32r(ctlr, Ctrl);
+	csr32w(ctlr, Ctrl, Slu|r);
+	return 0;
+}
+
+/*
+ * Shutdown entry point (installed as edev->shutdown by pnp):
+ * resets the controller; the result of i82563detach is ignored.
+ */
+static void
+i82563shutdown(Ether* ether)
+{
+	i82563detach(ether->ctlr);
+}
+
+/*
+ * Read one 16-bit EEPROM word at word address adr through the
+ * Eerd register.  Spins, with no timeout, until EEdone is set;
+ * the data is taken from the upper 16 bits of Eerd.
+ */
+static u16int
+eeread(Ctlr *ctlr, int adr)
+{
+	csr32w(ctlr, Eerd, EEstart | adr << 2);
+	for(;;)
+		if((csr32r(ctlr, Eerd) & EEdone) != 0)
+			break;
+	return csr32r(ctlr, Eerd) >> 16;
+}
+
+/*
+ * Copy the first 0x40 EEPROM words into ctlr->eeprom and return
+ * their 16-bit sum; i82563reset checks it against 0 and 0xBABA.
+ */
+static int
+eeload(Ctlr *ctlr)
+{
+	u16int cksum;
+	int word, a;
+
+	cksum = 0;
+	a = 0;
+	while(a < 0x40){
+		word = eeread(ctlr, a);
+		ctlr->eeprom[a] = word;
+		cksum += word;
+		a++;
+	}
+	return cksum;
+}
+
+/*
+ * Prepare the flash interface for a new cycle.
+ * Fails if Fvalid is clear in the status register; otherwise
+ * writes Fcerr|Ael back to the status register and waits up to
+ * 10ms for Scip to clear.
+ * Returns 0 when the interface is idle, -1 otherwise.
+ */
+static int
+fcycle(Ctlr *, Flash *f)
+{
+	u16int sts;
+	int n;
+
+	sts = f->reg[Fsts];
+	if((sts & Fvalid) == 0)
+		return -1;
+	f->reg[Fsts] |= Fcerr | Ael;
+	/* the first test uses the status read before the write above */
+	for(n = 10; n > 0; n--){
+		if((sts & Scip) == 0)
+			return 0;
+		delay(1);
+		sts = f->reg[Fsts];
+	}
+	return -1;
+}
+
+/*
+ * Read 16 bits from flash at linear address ladr.
+ * Returns the value (0..0xffff), or -1 if the interface is not
+ * ready or the cycle ends with Fcerr or Ael set.
+ * Spins, with no timeout, until Fdone is set.
+ */
+static int
+fread(Ctlr *c, Flash *f, int ladr)
+{
+	u16int s;
+
+	delay(1);
+	if(fcycle(c, f) == -1)
+		return -1;
+	f->reg[Fsts] |= Fdone;
+	f->reg32[Faddr] = ladr;
+
+	/* setup flash control register */
+	s = f->reg[Fctl];
+	s &= ~(0x1f << 8);
+	s |= (2-1) << 8;		/* 2 bytes */
+	s &= ~(2*Flcycle);		/* read */
+	f->reg[Fctl] = s | Fgo;
+
+	while((f->reg[Fsts] & Fdone) == 0)
+		;
+	if(f->reg[Fsts] & (Fcerr|Ael))
+		return -1;
+	return f->reg32[Fdata] & 0xffff;
+}
+
+/*
+ * Load the 0x40-word EEPROM image from flash into c->eeprom.
+ * The flash registers are mapped from the second PCI memory
+ * region for the duration of the call.
+ * Returns the 16-bit sum of the words read, or -1 if the
+ * registers cannot be mapped or any flash read fails.  The
+ * mapping is released on every path once it has been made.
+ */
+static int
+fload(Ctlr *c)
+{
+	u32int data, io, r, adr;
+	u16int sum;
+	Flash f;
+
+	io = c->pcidev->mem[1].bar & ~0x0f;
+	f.reg = vmap(io, c->pcidev->mem[1].size);
+	if(f.reg == nil)
+		return -1;
+	f.reg32 = (void*)f.reg;
+	f.sz = f.reg32[Bfpr];
+	/* base sector number, bumped by one bank when Eec bit 22 is set */
+	r = f.sz & 0x1fff;
+	if(csr32r(c, Eec) & 1<<22){
+		if(c->type == i82579)
+			r += 16;		/* sector size: 64k */
+		else
+			r  += 1;		/* sector size: 4k */
+	}
+	r <<= 12;
+	sum = 0;
+	for (adr = 0; adr < 0x40; adr++) {
+		data = fread(c, &f, r + adr*2);
+		if(data == -1){
+			/* don't leak the mapping on a failed read */
+			vunmap(f.reg, c->pcidev->mem[1].size);
+			return -1;
+		}
+		c->eeprom[adr] = data;
+		sum += data;
+	}
+	vunmap(f.reg, c->pcidev->mem[1].size);
+	return sum;
+}
+
+/*
+ * Fill in the ethernet address ra if the caller left it all
+ * zero.  For controllers flagged Fflashea the address is first
+ * taken from the Rah/Ral registers; if it is still zero it is
+ * built from the EEPROM words at Ea, with the Lanid field of the
+ * Status register added to the last byte.
+ */
+static void
+defaultea(Ctlr *ctlr, uchar *ra)
+{
+	uint n, lanid;
+	uvlong u;
+	static uchar nilea[Eaddrlen];
+
+	if(memcmp(ra, nilea, Eaddrlen) != 0)
+		return;			/* caller supplied an address */
+	if(cttab[ctlr->type].flag & Fflashea){
+		/* intel mb bug */
+		u = (uvlong)csr32r(ctlr, Rah)<<32u | (u32int)csr32r(ctlr, Ral);
+		n = 0;
+		while(n < Eaddrlen){
+			ra[n] = u >> 8*n;
+			n++;
+		}
+	}
+	if(memcmp(ra, nilea, Eaddrlen) != 0)
+		return;			/* registers held an address */
+
+	/* each EEPROM word holds two address bytes, low byte first */
+	n = 0;
+	while(n < Eaddrlen/2){
+		ra[2*n] = ctlr->eeprom[Ea+n];
+		ra[2*n+1] = ctlr->eeprom[Ea+n] >> 8;
+		n++;
+	}
+	lanid = (csr32r(ctlr, Status) & Lanid) >> 2;
+	ra[5] += lanid;				/* ea ctlr[n] = ea ctlr[0]+n */
+}
+
+/*
+ * Reset the controller and bring it to a known state: load and
+ * checksum the EEPROM image (from flash for Fload controllers),
+ * program the station address, clear the remaining receive
+ * address slots and the multicast table, and set the flow
+ * control registers.
+ * Returns 0 on success, -1 if the reset or the checksum fails.
+ */
+static int
+i82563reset(Ctlr *ctlr)
+{
+	uchar *ra;
+	int i, r;
+
+	if(i82563detach(ctlr))
+		return -1;
+	if(cttab[ctlr->type].flag & Fload)
+		r = fload(ctlr);
+	else
+		r = eeload(ctlr);
+	/* both 0 and 0xBABA are accepted as a good sum; fload's -1 fails here too */
+	if(r != 0 && r != 0xBABA){
+		print("%s: bad EEPROM checksum - %#.4ux\n",
+			cname(ctlr), r);
+		return -1;
+	}
+
+	/* station address goes in slot 0; bit 31 of Rah is set with it */
+	ra = ctlr->ra;
+	defaultea(ctlr, ra);
+	r = ctlr->ra[3]<<24 | ctlr->ra[2]<<16 | ctlr->ra[1]<<8 | ctlr->ra[0];
+	csr32w(ctlr, Ral, r);
+	r = 0x80000000 | ctlr->ra[5]<<8 | ctlr->ra[4];
+	csr32w(ctlr, Rah, r);
+	/* clear the other 15 receive address slots */
+	for(i = 1; i < 16; i++){
+		csr32w(ctlr, Ral+i*8, 0);
+		csr32w(ctlr, Rah+i*8, 0);
+	}
+	/* clear the multicast table, both the copy and the 128 registers */
+	memset(ctlr->mta, 0, sizeof(ctlr->mta));
+	for(i = 0; i < 128; i++)
+		csr32w(ctlr, Mta + i*4, 0);
+	/*
+	 * Does autonegotiation affect this manual setting?
+	 * The correct values here should depend on the PBA value
+	 * and maximum frame length, no?
+	 * ctlr->fcrt[lh] are never set, so default to 0.
+	 */
+	csr32w(ctlr, Fcal, 0x00C28001);
+	csr32w(ctlr, Fcah, 0x0100);
+	if(ctlr->type != i82579)
+		csr32w(ctlr, Fct, 0x8808);
+	csr32w(ctlr, Fcttv, 0x0100);
+
+	ctlr->fcrtl = ctlr->fcrth = 0;
+	// ctlr->fcrtl = 0x00002000;
+	// ctlr->fcrth = 0x00004000;
+	csr32w(ctlr, Fcrtl, ctlr->fcrtl);
+	csr32w(ctlr, Fcrth, ctlr->fcrth);
+	if(cttab[ctlr->type].flag & F75)
+		csr32w(ctlr, Eitr, 128<<2);		/* 128 ¼ microsecond intervals */
+	return 0;
+}
+
+/* indices of the control messages accepted by i82563ctl */
+enum {
+	CMrdtr,
+	CMradv,
+	CMpause,
+	CMan,
+};
+
+/*
+ * Command table handed to lookupcmd by i82563ctl: index,
+ * command word and a count (presumably the number of fields
+ * including the command word - confirm against Cmdtab).
+ */
+static Cmdtab i82563ctlmsg[] = {
+	CMrdtr,	"rdtr",	2,
+	CMradv,	"radv",	2,
+	CMpause, "pause", 1,
+	CMan,	"an",	1,
+};
+
+/*
+ * Control-message entry point (installed as edev->ctl by pnp).
+ *	rdtr n	set ctlr->rdtr and the Rdtr register (n <= 0xffff)
+ *	radv n	set ctlr->radv and the Radv register (n <= 0xffff)
+ *	pause	toggle bits 27 and 28 of Ctrl
+ *	an	set Lrst and Phyrst in Ctrl
+ * Returns n; raises an error for a missing controller, an
+ * unknown command or a bad argument.
+ */
+static long
+i82563ctl(Ether *edev, void *buf, long n)
+{
+	char *end;
+	u32int val;
+	Ctlr *ctlr;
+	Cmdbuf *cb;
+	Cmdtab *ct;
+
+	ctlr = edev->ctlr;
+	if(ctlr == nil)
+		error(Enonexist);
+
+	cb = parsecmd(buf, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+
+	ct = lookupcmd(cb, i82563ctlmsg, nelem(i82563ctlmsg));
+	switch(ct->index){
+	case CMrdtr:
+	case CMradv:
+		/* both take one numeric argument with the same limit */
+		val = strtoul(cb->f[1], &end, 0);
+		if(*end != 0 || val > 0xffff)
+			error(Ebadarg);
+		if(ct->index == CMrdtr){
+			ctlr->rdtr = val;
+			csr32w(ctlr, Rdtr, val);
+		}else{
+			ctlr->radv = val;
+			csr32w(ctlr, Radv, val);
+		}
+		break;
+	case CMpause:
+		csr32w(ctlr, Ctrl, csr32r(ctlr, Ctrl) ^ (1<<27 | 1<<28));
+		break;
+	case CMan:
+		csr32w(ctlr, Ctrl, csr32r(ctlr, Ctrl) | Lrst | Phyrst);
+		break;
+	}
+	free(cb);
+	poperror();
+
+	return n;
+}
+
+/*
+ * Map a PCI device id to the driver's controller type.
+ * Returns -1 for device ids this driver does not handle.
+ */
+static int
+didtype(int d)
+{
+	switch(d){
+	case 0x1096:
+	case 0x10ba:		/* “gilgal” */
+	// case 0x1098:		/* serdes; not seen */
+	// case 0x10bb:		/* serdes */
+		return i82563;
+	case 0x1049:		/* mm */
+	case 0x104a:		/* dm */
+	case 0x104b:		/* dc */
+	case 0x104d:		/* v “ninevah” */
+	case 0x10bd:		/* dm-2 */
+	case 0x294c:		/* ich 9 */
+		return i82566;
+	case 0x10de:		/* lm ich10d */
+	case 0x10df:		/* lf ich10 */
+	case 0x10e5:		/* lm ich9 */
+	case 0x10f5:		/* lm ich9m; “boazman” */
+		return i82567;
+	case 0x10bf:		/* lf ich9m */
+	case 0x10cb:		/* v ich9m */
+	case 0x10cd:		/* lf ich10 */
+	case 0x10ce:		/* v ich10 */
+	case 0x10cc:		/* lm ich10 */
+		return i82567m;
+	case 0x105e:		/* eb */
+	case 0x105f:		/* eb */
+	case 0x1060:		/* eb */
+	case 0x10a4:		/* eb */
+	case 0x10a5:		/* eb  fiber */
+	case 0x10bc:		/* eb */
+	case 0x10d9:		/* eb serdes */
+	case 0x10da:		/* eb serdes “ophir” */
+		return i82571;
+	case 0x107d:		/* eb copper */
+	case 0x107e:		/* ei fiber */
+	case 0x107f:		/* ei */
+	case 0x10b9:		/* ei “rimon” */
+		return i82572;
+	case 0x108b:		/*  e “vidalia” */
+	case 0x108c:		/*  e (iamt) */
+	case 0x109a:		/*  l “tekoa” */
+		return i82573;
+	case 0x10d3:		/* l or it; “hartwell” */
+		return i82574;
+	case 0x10a7:
+	case 0x10a9:		/* fiber/serdes */
+		return i82575;
+	case 0x10c9:		/* copper */
+	case 0x10e6:		/* fiber */
+	case 0x10e7:		/* serdes; “kawela” */
+		return i82576;
+	case 0x10ea:		/* lc “calpella”; aka pch lan */
+		return i82577;
+	case 0x10eb:		/* lm “calpella” */
+		return i82577m;
+	case 0x10ef:		/* dc “piketon” */
+		return i82578;
+	case 0x1502:		/* lm */
+	case 0x1503:		/* v */
+		return i82579;
+	case 0x10f0:		/* dm “king's creek” */
+		return i82578m;
+	case 0x150e:		/* “barton hills” */
+	case 0x150f:		/* fiber */
+	case 0x1510:		/* backplane */
+	case 0x1511:		/* sfp */
+	case 0x1516:		
+		return i82580;
+	case 0x1506:		/* v */
+		return i82583;
+	}
+	return -1;
+}
+
+/*
+ * Device-id fixup applied before didtype: when the device id
+ * reads as 1 and the low half of the PciSVID config word is
+ * 0x1b52, use the upper half of that word as the device id.
+ */
+static void
+hbafixup(Pcidev *p)
+{
+	uint svid;
+
+	svid = pcicfgr32(p, PciSVID);
+	if(p->did != 1)
+		return;
+	if((svid & 0xffff) != 0x1b52)
+		return;
+	p->did = svid>>16;
+}
+
+/*
+ * Map the controller's registers, reset it and enable PCI bus
+ * mastering.  Returns 0 on success; on failure returns -1 with
+ * the registers unmapped again.
+ */
+static int
+setup(Ctlr *ctlr)
+{
+	Pcidev *pdev;
+
+	pdev = ctlr->pcidev;
+	ctlr->nic = vmap(ctlr->port, pdev->mem[0].size);
+	if(ctlr->nic != nil){
+		if(i82563reset(ctlr) == 0){
+			pcisetbme(ctlr->pcidev);
+			return 0;
+		}
+		vunmap(ctlr->nic, pdev->mem[0].size);
+		return -1;
+	}
+	print("%s: can't map %#llud\n", cname(ctlr), ctlr->port);
+	return -1;
+}
+
+/*
+ * Scan PCI for Intel (vendor 0x8086) devices this driver
+ * handles and append a Ctlr for each one to the list headed by
+ * i82563ctlrhead.  Nothing is mapped or reset here; setup does
+ * that when pnp claims a controller.
+ */
+static void
+i82563pci(void)
+{
+	int type;
+	u32int io;
+	Ctlr *ctlr;
+	Pcidev *p;
+
+	for(p = pcimatch(nil, 0x8086, 0); p != nil; p = pcimatch(p, 0x8086, 0)){
+		hbafixup(p);
+		type = didtype(p->did);
+		if(type == -1)
+			continue;
+
+		ctlr = malloc(sizeof(Ctlr));
+		if(ctlr == nil)
+			error(Enomem);
+		ctlr->type = type;
+		ctlr->pcidev = p;
+		ctlr->rbsz = cttab[type].mtu;
+		/* the low four BAR bits are dropped to get the port */
+		io = p->mem[0].bar & ~0x0F;
+		ctlr->port = io;
+
+		/* append to the controller list */
+		if(i82563ctlrhead == nil)
+			i82563ctlrhead = ctlr;
+		else
+			i82563ctlrtail->next = ctlr;
+		i82563ctlrtail = ctlr;
+	}
+}
+
+
+/*
+ * Bind edev to the first unclaimed controller of the given type
+ * (any type when type is Iany) that sets up successfully, and
+ * fill in edev's parameters and entry points.  The PCI scan runs
+ * once, on the first call.
+ * Returns 0 on success, -1 if no controller could be bound.
+ */
+static int
+pnp(Ether* edev, int type)
+{
+	Ctlr *ctlr;
+	static int done;
+
+	if(!done) {
+		i82563pci();
+		done = 1;
+	}
+
+	/*
+	 * Any adapter matches if no edev->port is supplied,
+	 * otherwise the ports must match.
+	 */
+	for(ctlr = i82563ctlrhead; ctlr != nil; ctlr = ctlr->next){
+		if(ctlr->active)
+			continue;
+		if(type != Iany && ctlr->type != type)
+			continue;
+		if(edev->port != 0 && edev->port != ctlr->port)
+			continue;
+		/* the controller stays marked active even if setup fails */
+		ctlr->active = 1;
+		memmove(ctlr->ra, edev->ea, Eaddrlen);
+		if(setup(ctlr) == 0)
+			break;
+	}
+	if(ctlr == nil)
+		return -1;
+
+	edev->ctlr = ctlr;
+	ctlr->edev = edev;			/* point back to Ether* */
+	edev->port = ctlr->port;
+	edev->irq = ctlr->pcidev->intl;
+	edev->tbdf = ctlr->pcidev->tbdf;
+	edev->mbps = 1000;
+	edev->maxmtu = ctlr->rbsz;
+	memmove(edev->ea, ctlr->ra, Eaddrlen);
+
+	/*
+	 * Linkage to the generic ethernet driver.
+	 */
+	edev->attach = i82563attach;
+	edev->transmit = i82563transmit;
+	if(ctlr->type == i82575)
+		edev->interrupt = i82575interrupt;
+	else
+		edev->interrupt = i82563interrupt;
+	edev->ifstat = i82563ifstat;
+	edev->ctl = i82563ctl;
+
+	edev->arg = edev;
+	edev->promiscuous = i82563promiscuous;
+	edev->shutdown = i82563shutdown;
+	edev->multicast = i82563multicast;
+
+	return 0;
+}
+
+/* probe for a controller of any supported type */
+static int
+anypnp(Ether *e)
+{
+	return pnp(e, Iany);
+}
+
+/* probe for an i82563 only */
+static int
+i82563pnp(Ether *e)
+{
+	return pnp(e, i82563);
+}
+
+/* probe for an i82566 only */
+static int
+i82566pnp(Ether *e)
+{
+	return pnp(e, i82566);
+}
+
+/*
+ * Probe for either 82567 variant, the "m" variant first.
+ * Returns 0 if one was bound, -1 if neither could be.
+ * The second probe runs only when the first fails, so a single
+ * Ether never claims (and marks active) two controllers.
+ */
+static int
+i82567pnp(Ether *e)
+{
+	if(pnp(e, i82567m) == 0)
+		return 0;
+	return pnp(e, i82567);
+}
+
+/* probe for an i82571 only */
+static int
+i82571pnp(Ether *e)
+{
+	return pnp(e, i82571);
+}
+
+/* probe for an i82572 only */
+static int
+i82572pnp(Ether *e)
+{
+	return pnp(e, i82572);
+}
+
+/* probe for an i82573 only */
+static int
+i82573pnp(Ether *e)
+{
+	return pnp(e, i82573);
+}
+
+/* probe for an i82574 only */
+static int
+i82574pnp(Ether *e)
+{
+	return pnp(e, i82574);
+}
+
+/* probe for an i82575 only */
+static int
+i82575pnp(Ether *e)
+{
+	return pnp(e, i82575);
+}
+
+/* probe for an i82576 only */
+static int
+i82576pnp(Ether *e)
+{
+	return pnp(e, i82576);
+}
+
+/*
+ * Probe for either 82577 variant, the "m" variant first.
+ * Returns 0 if one was bound, -1 if neither could be.
+ * The second probe runs only when the first fails, so a single
+ * Ether never claims (and marks active) two controllers.
+ */
+static int
+i82577pnp(Ether *e)
+{
+	if(pnp(e, i82577m) == 0)
+		return 0;
+	return pnp(e, i82577);
+}
+
+/*
+ * Probe for either 82578 variant, the "m" variant first.
+ * Returns 0 if one was bound, -1 if neither could be.
+ * The second probe runs only when the first fails, so a single
+ * Ether never claims (and marks active) two controllers.
+ */
+static int
+i82578pnp(Ether *e)
+{
+	if(pnp(e, i82578m) == 0)
+		return 0;
+	return pnp(e, i82578);
+}
+
+/* probe for an i82579 only */
+static int
+i82579pnp(Ether *e)
+{
+	return pnp(e, i82579);
+}
+
+/* probe for an i82580 only */
+static int
+i82580pnp(Ether *e)
+{
+	return pnp(e, i82580);
+}
+
+/* probe for an i82583 only */
+static int
+i82583pnp(Ether *e)
+{
+	return pnp(e, i82583);
+}
+
+/*
+ * Register this driver's probe routines with the generic
+ * ethernet code: one name per controller family, plus
+ * "igbepcie", which matches any supported type.
+ */
+void
+ether82563link(void)
+{
+	/*
+	 * recognise lots of model numbers for debugging
+	 * also good for forcing onboard nic(s) as ether0
+	 * try to make that unnecessary by listing lom first.
+	 */
+	addethercard("i82563", i82563pnp);
+	addethercard("i82566", i82566pnp);
+	addethercard("i82574", i82574pnp);
+	addethercard("i82576", i82576pnp);
+	addethercard("i82567", i82567pnp);
+	addethercard("i82573", i82573pnp);
+
+	addethercard("i82571", i82571pnp);
+	addethercard("i82572", i82572pnp);
+	addethercard("i82575", i82575pnp);
+	addethercard("i82577", i82577pnp);
+	addethercard("i82578", i82578pnp);
+	addethercard("i82579", i82579pnp);
+	addethercard("i82580", i82580pnp);
+	addethercard("i82583", i82583pnp);
+	addethercard("igbepcie", anypnp);
+}

+ 895 - 0
sys/src/9/k10/etherbcm.c

@@ -0,0 +1,895 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Broadcom BCM57xx
+ * Not implemented:
+ *  proper fatal error handling
+ *  multiple rings
+ *  checksum offloading
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+
+#include "etherif.h"
+#include "../port/ethermii.h"
+
+/* dprint: print only while the file-scope debug flag is non-zero */
+#define dprint(...)	do{ if(debug)print(__VA_ARGS__); }while(0)
+/* Rbsz: receive buffer size, an Etherpkt plus 4 bytes passed through ROUNDUP(..., 4) */
+#define Rbsz		ROUNDUP(sizeof(Etherpkt)+4, 4)
+
+/* per-controller driver state */
+typedef struct Ctlr Ctlr;
+struct Ctlr {
+	Lock	txlock, imlock;	/* txlock: send ring; imlock: held across bcminterrupt */
+	Ether	*ether;
+	Ctlr	*next;
+	Pcidev	*pdev;
+	u32int	*nic, *status;	/* nic: registers, see csr32; status: status block words */
+
+	u32int	*recvret, *recvprod, *sendr;	/* rings: 8 words per receive entry, 4 per send entry */
+	ulong	port;
+	uint	recvreti, recvprodi, sendri, sendcleani;	/* driver-side ring indices */
+	Block	**sends;	/* block queued in each send ring slot */
+	Block	**rxs;		/* block posted in each receive producer slot */
+	int	active, duplex;
+	int	type;		/* index into cttab */
+
+	/* counters reported by bcmifstat */
+	uint	nobuf;
+	uint	partial;
+	uint	rxerr;
+	uint	qfull;
+	uint	dmaerr;
+};
+
+/*
+ * Ring sizes, register offsets and bit definitions.
+ * Offsets are used with csr32() (some also with the pcicfg
+ * accessors); bit and mask names generally follow the register
+ * they apply to.
+ */
+enum {
+	/* configurable constants */
+	/* ring indices wrap with & (Len-1), so these must be powers of two */
+	RxRetRingLen 		= 0x200,
+	RxProdRingLen 		= 0x200,
+	SendRingLen 		= 0x200,
+
+	/* bits common to the *Mode registers */
+	Reset 			= 1<<0,
+	Enable 			= 1<<1,
+	Attn 			= 1<<2,
+
+	Pwrctlstat 		= 0x4C,
+
+	MiscHostCtl 		= 0x68,
+	TaggedStatus		= 1<<9,
+	IndirAccessEn		= 1<<7,
+	EnableClockCtl		= 1<<5,
+	PCIStateRegEn		= 1<<4,
+	WordSwap		= 1<<3,
+	ByteSwap		= 1<<2,
+	MaskPCIInt		= 1<<1,
+	ClearIntA		= 1<<0,
+
+	Fwmbox			= 0x0b50,	/* magic value exchange */
+	Fwmagic			= 0x4b657654,
+
+	Dmarwctl 		= 0x6C,
+	DMAWaterMask		= ~(7<<19),
+	DMAWaterValue		= 3<<19,
+
+	/* memory window: address and data, used by mem32r/mem32w */
+	Memwind		= 0x7C,
+	MemwindData		= 0x84,
+
+	SendRCB 		= 0x100,
+	RxRetRCB 		= 0x200,
+
+	InterruptMailbox 		= 0x204,
+
+	RxProdBDRingIdx	= 0x26c,
+	RxBDRetRingIdx		= 0x284,
+	SendBDRingHostIdx	= 0x304,
+
+	MACMode		= 0x400,
+	MACPortMask		= ~(1<<3 | 1<<2),
+	MACPortGMII		= 1<<3,
+	MACPortMII		= 1<<2,
+	MACEnable		= 1<<23 | 1<<22 | 1<<21 | 1 << 15 | 1 << 14 | 1<<12 | 1<<11,
+	MACHalfDuplex		= 1<<1,
+	
+	MACEventStatus		= 0x404,
+	MACEventEnable	= 0x408,
+	MACAddress		= 0x410,
+	RandomBackoff		= 0x438,
+	RxMTU			= 0x43C,
+	MIComm		= 0x44C,
+	MIStatus		= 0x450,
+	MIMode			= 0x454,
+	RxMACMode		= 0x468,
+	TxMACMode		= 0x45C,
+	TxMACLengths		= 0x464,
+	MACHash		= 0x470,
+	RxRules			= 0x480,
+	
+	RxRulesConf		= 0x500,
+	LowWaterMax		= 0x504,
+	LowWaterMaxMask	= ~0xFFFF,
+	LowWaterMaxValue	= 2,
+
+	SendDataInitiatorMode	= 0xC00,
+	SendInitiatorConf	= 0x0C08,
+	SendStats		= 1<<0,
+	SendInitiatorMask	= 0x0C0C,
+
+	SendDataCompletionMode = 0x1000,
+	SendBDSelectorMode	= 0x1400,
+	SendBDInitiatorMode	= 0x1800,
+	SendBDCompletionMode	= 0x1C00,
+	
+	RxListPlacementMode	= 0x2000,
+	RxListPlacement		= 0x2010,
+	RxListPlacementConf	= 0x2014,
+	RxStats			= 1<<0,
+	RxListPlacementMask	= 0x2018,
+	
+	RxDataBDInitiatorMode	= 0x2400,
+	RxBDHostAddr		= 0x2450,
+	RxBDFlags		= 0x2458,
+	RxBDNIC		= 0x245C,
+	RxDataCompletionMode	= 0x2800,
+	RxBDInitiatorMode	= 0x2C00,
+	RxBDRepl		= 0x2C18,
+	
+	RxBDCompletionMode	= 0x3000,
+	HostCoalMode		= 0x3C00,
+	HostCoalRxTicks		= 0x3C08,
+	HostCoalSendTicks	= 0x3C0C,
+	RxMaxCoalFrames	= 0x3C10,
+	SendMaxCoalFrames	= 0x3C14,
+	RxMaxCoalFramesInt	= 0x3C20,
+	SendMaxCoalFramesInt	= 0x3C24,
+	StatusBlockHostAddr	= 0x3C38,
+	FlowAttention		= 0x3C48,
+
+	MemArbiterMode		= 0x4000,
+	
+	BufferManMode		= 0x4400,
+	
+	MBUFLowWater		= 0x4414,
+	MBUFHighWater		= 0x4418,
+	
+	ReadDMAMode		= 0x4800,
+	ReadDMAStatus		= 0x4804,
+	WriteDMAMode		= 0x4C00,
+	WriteDMAStatus		= 0x4C04,
+	
+	RISCState		= 0x5004,
+	FTQReset		= 0x5C00,
+	MSIMode		= 0x6000,
+	
+	ModeControl		= 0x6800,
+	ByteWordSwap		= 1<<4 | 1<<5 | 1<<2, // | 1<<1,
+	HostStackUp		= 1<<16,
+	HostSendBDs		= 1<<17,
+	InterruptOnMAC		= 1<<26,
+	
+	MiscConf		= 0x6804,
+	CoreClockBlocksReset	= 1<<0,
+	GPHYPwrdnOverride	= 1<<26,
+	DisableGRCRstOnPpcie	= 1<<29,
+	TimerMask		= ~0xFF,
+	TimerValue		= 65<<1,
+	MiscLocalControl		= 0x6808,
+	InterruptOnAttn		= 1<<3,
+	AutoSEEPROM		= 1<<24,
+	
+	SwArbitration		= 0x7020,
+	SwArbitSet1		= 1<<1,
+	SwArbitWon1		= 1<<9,
+	Pcitlplpl			= 0x7C00,	/* "lower 1k of the pcie pl regs" ?? */
+
+	/* phy registers */
+	PhyAuxControl		= 0x18,
+	PhyIntStatus		= 0x1A,
+	PhyIntMask		= 0x1B,
+	
+	/* bits of status block word 0, tested in bcminterrupt */
+	Updated			= 1<<0,
+	LinkStateChange		= 1<<1,
+	Error			= 1<<2,
+	
+	/* descriptor flag bits, tested in bcmreceive and set in bcmtransmit */
+	PacketEnd		= 1<<2,
+	FrameError		= 1<<10,
+};
+
+/* controller types; used as indices into cttab */
+enum {
+	b5722,
+	b5751,
+	b5754,
+	b5755,
+	b5756,
+	b5782,
+	b5787,
+	b5906,
+	Nctlrtype,	/* number of types; sizes cttab */
+};
+
+/* per-type parameters, one cttab entry per controller type */
+typedef struct Ctlrtype Ctlrtype;
+struct Ctlrtype {
+	int	mtu;
+	int	flag;
+	char	*name;	/* returned by cname */
+};
+
+/* type table, indexed by Ctlr.type: mtu, flag, name */
+static Ctlrtype cttab[Nctlrtype] = {
+[b5722]	1514,	0,	"b5722",
+[b5751]	1514,	0,	"b5751",
+[b5754]	1514,	0,	"b5754",
+[b5755]	1514,	0,	"b5755",
+[b5756]	1514,	0,	"b5756",
+[b5782]	1514,	0,	"b5782",
+[b5787]	1514,	0,	"b5787",
+[b5906]	1514,	0,	"b5906",
+};
+
+/* csr32: lvalue for the 32-bit register at byte offset r in the nic mapping */
+#define csr32(c, r)	((c)->nic[(r)/4])
+
+static Ctlr *bcmhead;
+/* debug: enables dprint output; on by default */
+static int debug=1;
+
+/* name of the controller's type, from cttab */
+static char*
+cname(Ctlr *c)
+{
+	return cttab[c->type].name;
+}
+
+/*
+ * ifstat read: format the driver's counters and controller type
+ * into a temporary READSTR buffer and return the slice selected
+ * by offset and n through readstr.
+ * Raises Enomem if the buffer cannot be allocated.
+ */
+static long
+bcmifstat(Ether *edev, void *a, long n, ulong offset)
+{
+	char *s, *p, *e;
+	Ctlr *c;
+
+	c = edev->ctlr;
+	s = malloc(READSTR);
+	if(s == nil)
+		error(Enomem);
+	p = s;
+	e = p + READSTR;
+
+	p = seprint(p, e, "nobuf	%ud\n", c->nobuf);
+	p = seprint(p, e, "partial	%ud\n", c->partial);
+	p = seprint(p, e, "rxerr	%ud\n", c->rxerr);
+	p = seprint(p, e, "qfull	%ud\n", c->qfull);
+	p = seprint(p, e, "dmaerr	%ud\n", c->dmaerr);
+	p = seprint(p, e, "type: %s\n", cname(c));
+
+	USED(p);
+	n = readstr(offset, a, n, s);
+	free(s);
+
+	return n;
+}
+
+/* bits of the MIComm register, used by miiwait/miir/miiw */
+enum {
+	Phybusy		= 1<<29,
+	Phyrdfail		= 1<<28,
+	Phyrd		= 1<<27,
+	Phywr		= 1<<26,
+};
+/* held across each MIComm transaction in miir and miiw */
+Lock miilock;
+
+/*
+ * Wait for the MIComm transaction in progress to finish.
+ * Polls Phybusy up to 20 times, 15µs apart.
+ * Returns the MIComm value read once busy has cleared, or ~0
+ * (after printing a message) on timeout.
+ */
+static uint
+miiwait(Ctlr *ctlr)
+{
+	uint tries, busy;
+
+	for(tries = 0; tries < 20; tries++){
+		microdelay(10);
+		busy = csr32(ctlr, MIComm) & Phybusy;
+		microdelay(5);
+		if(busy == 0)
+			return csr32(ctlr, MIComm);
+	}
+	print("#l%d: bcm: miiwait: timeout\n", ctlr->ether->ctlrno);
+	return ~0;
+}
+
+/*
+ * Read MII register r of phy 1.
+ * Returns the 16-bit value, or -1 on timeout or when the
+ * controller reports Phyrdfail.
+ */
+static int
+miir(Ctlr *ctlr, int r)
+{
+	uint res, phyno;
+
+	phyno = 1;
+	lock(&miilock);
+	csr32(ctlr, MIComm) = r<<16 | phyno<<21 | Phyrd | Phybusy;
+	res = miiwait(ctlr);
+	unlock(&miilock);
+
+	if(res == ~0)
+		return -1;
+	if((res & Phyrdfail) == 0)
+		return res & 0xffff;
+	print("#l%d: bcm: miir: fail\n", ctlr->ether->ctlrno);
+	return -1;
+}
+
+/*
+ * Write the low 16 bits of v to MII register r of phy 1.
+ * Returns 0 on success, -1 on timeout.
+ */
+static int
+miiw(Ctlr *ctlr, int r, int v)
+{
+	uint phyno, res;
+
+	phyno = 1;
+	lock(&miilock);
+	csr32(ctlr, MIComm) = r<<16 | (v & 0xffff) | phyno<<21 | Phywr | Phybusy;
+	res = miiwait(ctlr);
+	unlock(&miilock);
+
+	if(res != ~0)
+		return 0;
+	return -1;
+}
+
+/*
+ * Re-read the link state from the phy, update edev->link,
+ * edev->mbps and ctlr->duplex, and program the MAC's duplex and
+ * port mode to match.  Called from bcminterrupt on a link state
+ * change.
+ *
+ * NOTE(review): miir returns -1 on failure, which none of the
+ * tests below distinguish from a register value; and the wait
+ * for BmsrAnc has no bound although the caller holds an ilock.
+ */
+static void
+checklink(Ether *edev)
+{
+	uint i;
+	Ctlr *ctlr;
+
+	ctlr = edev->ctlr;
+	miir(ctlr, Bmsr); /* read twice for current status as per 802.3 */
+	if(!(miir(ctlr, Bmsr) & BmsrLs)) {
+		/* no link: fall back to 1000/full for the MAC settings below */
+		edev->link = 0;
+		edev->mbps = 1000;
+		ctlr->duplex = 1;
+		dprint("bcm: no link\n");
+		goto out;
+	}
+	edev->link = 1;
+	/* spin until BmsrAnc is set */
+	while((miir(ctlr, Bmsr) & BmsrAnc) == 0)
+		;
+	/* Mssr decides 1000; otherwise Anlpar decides 100 or 10 */
+	i = miir(ctlr, Mssr);
+	if(i & (Mssr1000THD | Mssr1000TFD)) {
+		edev->mbps = 1000;
+		ctlr->duplex = (i & Mssr1000TFD) != 0;
+	} else if(i = miir(ctlr, Anlpar), i & (AnaTXHD | AnaTXFD)) {
+		edev->mbps = 100;
+		ctlr->duplex = (i & AnaTXFD) != 0;
+	} else if(i & (Ana10HD | Ana10FD)) {
+		edev->mbps = 10;
+		ctlr->duplex = (i & Ana10FD) != 0;
+	} else {
+		edev->link = 0;
+		edev->mbps = 1000;
+		ctlr->duplex = 1;
+		dprint("bcm: link partner supports neither 10/100/1000 Mbps\n");
+		goto out;
+	}
+	dprint("bcm: %d Mbps link, %s duplex\n", edev->mbps, ctlr->duplex ? "full" : "half");
+out:
+	/* mirror duplex and speed into MACMode */
+	if(ctlr->duplex)
+		csr32(ctlr, MACMode) &= ~MACHalfDuplex;
+	else
+		csr32(ctlr, MACMode) |= MACHalfDuplex;
+	if(edev->mbps >= 1000)
+		csr32(ctlr, MACMode) = (csr32(ctlr, MACMode) & MACPortMask) | MACPortGMII;
+	else
+		csr32(ctlr, MACMode) = (csr32(ctlr, MACMode) & MACPortMask) | MACPortMII;
+	csr32(ctlr, MACEventStatus) |= (1<<4) | (1<<3); /* undocumented bits (sync and config changed) */
+}
+
+/*
+ * Return the next unprocessed entry of the receive return ring,
+ * or nil when the driver's index has caught up with the low 16
+ * bits of status block word 4.  Entries are 8 words long.
+ */
+static uint*
+currentrecvret(Ctlr *ctlr)
+{
+	uint prod;
+
+	prod = ctlr->status[4] & 0xFFFF;
+	if(ctlr->recvreti != prod)
+		return &ctlr->recvret[ctlr->recvreti * 8];
+	return nil;
+}
+
+/*
+ * Advance the driver's receive return ring index by one,
+ * wrapping at RxRetRingLen, and write the new index to the
+ * RxBDRetRingIdx register.
+ */
+static void
+consumerecvret(Ctlr *ctlr)
+{
+	uint next;
+
+	next = (ctlr->recvreti + 1) & (RxRetRingLen - 1);
+	ctlr->recvreti = next;
+	csr32(ctlr, RxBDRetRingIdx) = next;
+}
+
+/*
+ * Post one fresh receive buffer on the receive producer ring.
+ * Returns 0 on success, -1 if the ring is full (the next index
+ * equals the upper half of status block word 2) or no block
+ * could be allocated.
+ */
+static int
+replenish(Ctlr *ctlr)
+{
+	uint slot, nextslot;
+	u32int *desc;
+	Block *bp;
+
+	slot = ctlr->recvprodi;
+	nextslot = (slot + 1) & (RxProdRingLen - 1);
+	if(nextslot == (ctlr->status[2] >> 16))
+		return -1;
+
+	bp = iallocb(Rbsz);
+	if(bp == nil) {
+		dprint("bcm: out of memory for receive buffers\n");
+		ctlr->nobuf++;
+		return -1;
+	}
+
+	/* 8-word descriptor: address, length, and the slot number in word 7 */
+	desc = &ctlr->recvprod[slot * 8];
+	memset(desc, 0, 32);
+	desc[0] = Pciwaddrh(bp->rp);
+	desc[1] = Pciwaddrl(bp->rp);
+	desc[2] = Rbsz;
+	desc[7] = slot;
+	ctlr->rxs[slot] = bp;
+
+	/* descriptor must be visible before the index is published */
+	coherence();
+	ctlr->recvprodi = nextslot;
+	csr32(ctlr, RxProdBDRingIdx) = nextslot;
+	return 0;
+}
+
+/*
+ * Drain the receive return ring.  For each entry, look up the
+ * block by the slot number in word 7, set its length from word 2
+ * and either pass it up with etheriq or, for partial or errored
+ * frames, count and free it.  Every entry consumed is followed
+ * by an attempt to post a replacement buffer.
+ */
+static void
+bcmreceive(Ether *edev)
+{
+	uint len;
+	u32int *pkt;
+	Ctlr *ctlr;
+	Block *bp;
+
+	ctlr = edev->ctlr;
+	while((pkt = currentrecvret(ctlr)) != nil){
+		bp = ctlr->rxs[pkt[7]];
+		len = pkt[2] & 0xFFFF;
+		bp->wp = bp->rp + len;
+		if((pkt[3] & PacketEnd) == 0){
+			dprint("bcm: partial frame received -- shouldn't happen\n");
+			ctlr->partial++;
+			freeb(bp);
+		}else if(pkt[3] & FrameError){
+			ctlr->rxerr++;
+			freeb(bp);
+		}else
+			etheriq(edev, bp, 1);
+		replenish(ctlr);
+		consumerecvret(ctlr);
+	}
+}
+
+/*
+ * Free the blocks of completed transmissions: walk the send ring
+ * from sendcleani up to the index in the upper half of status
+ * block word 4, under txlock.
+ */
+static void
+bcmtransclean(Ether *edev)
+{
+	uint i;
+	Ctlr *ctlr;
+
+	ctlr = edev->ctlr;
+	ilock(&ctlr->txlock);
+	for(;;){
+		i = ctlr->sendcleani;
+		if(i == (ctlr->status[4] >> 16))
+			break;
+		freeb(ctlr->sends[i]);
+		ctlr->sends[i] = nil;
+		ctlr->sendcleani = (i + 1) & (SendRingLen - 1);
+	}
+	iunlock(&ctlr->txlock);
+}
+
+/*
+ * Move blocks from the output queue onto the send ring, under
+ * txlock, until the queue is empty or the ring is full.  The
+ * ring-full test comes first, so qfull is counted even when
+ * there is nothing left to send.
+ */
+static void
+bcmtransmit(Ether *edev)
+{
+	uint slot, nextslot;
+	u32int *desc;
+	Ctlr *ctlr;
+	Block *bp;
+
+	ctlr = edev->ctlr;
+	ilock(&ctlr->txlock);
+	for(;;){
+		slot = ctlr->sendri;
+		nextslot = (slot + 1) & (SendRingLen - 1);
+		if(nextslot == ctlr->sendcleani) {
+			dprint("bcm: send queue full\n");
+			ctlr->qfull++;
+			break;
+		}
+		bp = qget(edev->oq);
+		if(bp == nil)
+			break;
+
+		/* 4-word descriptor: address, then length<<16 with PacketEnd */
+		desc = &ctlr->sendr[slot * 4];
+		desc[0] = Pciwaddrh(bp->rp);
+		desc[1] = Pciwaddrl(bp->rp);
+		desc[2] = (BLEN(bp) << 16) | PacketEnd;
+		desc[3] = 0;
+		ctlr->sends[slot] = bp;
+
+		/* descriptor must be visible before the index is published */
+		coherence();
+		ctlr->sendri = nextslot;
+		csr32(ctlr, SendBDRingHostIdx) = nextslot;
+	}
+	iunlock(&ctlr->txlock);
+}
+
+/*
+ * Handle the Error bit of the status block: report and clear
+ * the flow attention, DMA status and RISC state registers.
+ * MACEventStatus is cleared without being examined.
+ * Messages end in a newline like the rest of the driver's
+ * output.
+ */
+static void
+bcmerror(Ether *edev)
+{
+	Ctlr *ctlr;
+	
+	ctlr = edev->ctlr;
+	if(csr32(ctlr, FlowAttention)) {
+		/* only these bits are reported as fatal; everything is cleared */
+		if(csr32(ctlr, FlowAttention) & 0xf8ff8080)
+			print("bcm: fatal error %#.8ux\n", csr32(ctlr, FlowAttention));
+		csr32(ctlr, FlowAttention) = 0;
+	}
+	csr32(ctlr, MACEventStatus) = 0; /* worth ignoring */
+	if(csr32(ctlr, ReadDMAStatus) || csr32(ctlr, WriteDMAStatus)) {
+		dprint("bcm: DMA error\n");
+		ctlr->dmaerr++;
+		csr32(ctlr, ReadDMAStatus) = 0;
+		csr32(ctlr, WriteDMAStatus) = 0;
+	}
+	if(csr32(ctlr, RISCState)) {
+		if(csr32(ctlr, RISCState) & 0x78000403)
+			print("bcm: RISC halted %#.8ux\n", csr32(ctlr, RISCState));
+		csr32(ctlr, RISCState) = 0;
+	}
+}
+
+/*
+ * Interrupt handler; arg is the Ether*.
+ * Under imlock: write 1 to the interrupt mailbox, take the flags
+ * and tag from the status block, handle errors and link changes,
+ * then run receive, transmit cleanup and transmit, and finally
+ * write the tag back to the mailbox.
+ */
+static void
+bcminterrupt(Ureg*, void *arg)
+{
+	u32int status, tag, dummy;
+	Ether *edev;
+	Ctlr *ctlr;
+	
+	edev = arg;
+	ctlr = edev->ctlr;
+	ilock(&ctlr->imlock);
+	/* the value read from the mailbox is discarded */
+	dummy = csr32(ctlr, InterruptMailbox);
+	USED(dummy);
+	csr32(ctlr, InterruptMailbox) = 1;
+	/* status block word 0 holds the flags, word 1 the tag */
+	status = ctlr->status[0];
+	tag = ctlr->status[1];
+	ctlr->status[0] = 0;
+	if(status & Error)
+		bcmerror(edev);
+	if(status & LinkStateChange)
+		checklink(edev);
+	if(0)
+		iprint("bcm: interrupt %.8ux %.8ux\n", ctlr->status[2], ctlr->status[4]);
+	/* rings are serviced on every interrupt, whatever the flags */
+	bcmreceive(edev);
+	bcmtransclean(edev);
+	bcmtransmit(edev);
+	csr32(ctlr, InterruptMailbox) = tag << 24;
+	iunlock(&ctlr->imlock);
+}
+
+/*
+ * Write v to controller memory address r through the PCI
+ * config-space memory window.  Unlike mem32r, the window
+ * address is left pointing at r.
+ */
+static void
+mem32w(Ctlr *c, uint r, uint v)
+{
+	Pcidev *pd;
+
+	pd = c->pdev;
+	pcicfgw32(pd, Memwind, r);
+	pcicfgw32(pd, MemwindData, v);
+}
+
+/*
+ * Read controller memory address r through the PCI config-space
+ * memory window, then set the window address back to 0.
+ */
+static u32int
+mem32r(Ctlr *c, uint r)
+{
+	Pcidev *pd;
+	u32int val;
+
+	pd = c->pdev;
+	pcicfgw32(pd, Memwind, r);
+	val = pcicfgr32(pd, MemwindData);
+	pcicfgw32(pd, Memwind, 0);
+	return val;
+}
+
+/*
+ * Poll register r every 100µs until (value & m) == v.
+ * to is the timeout in µs.  Returns 0 once the condition holds,
+ * -1 when at least to µs have elapsed without it.
+ * The timeout test is >= so that a timeout that is not a
+ * multiple of 100 still expires.
+ */
+static int
+bcmµwait(Ctlr *ctlr, uint to, uint r, uint m, uint v)
+{
+	uint i;
+
+	for(i = 0;; i += 100){
+		if((csr32(ctlr, r) & m) == v)
+			return 0;
+		if(i >= to /* µs */)
+			return -1;
+		microdelay(100);
+	}
+}
+
+static int	/* reset and program the controller; returns 0 on success, -1 when a wait below times out */
+bcminit(Ether *edev)
+{
+	uint i;
+	u32int j;
+	Ctlr *ctlr;
+	
+	ctlr = edev->ctlr;
+	dprint("bcm: reset\n");
+	/* initialization procedure according to the datasheet */
+	csr32(ctlr, MiscHostCtl) |= MaskPCIInt | ClearIntA | WordSwap | IndirAccessEn;
+	csr32(ctlr, SwArbitration) |= SwArbitSet1;
+	if(bcmµwait(ctlr, 2000, SwArbitration, SwArbitWon1, SwArbitWon1) == -1){
+		print("bcm: arbiter failed to respond\n");
+		return -1;
+	}
+	csr32(ctlr, MemArbiterMode) |= Enable;
+	csr32(ctlr, MiscHostCtl) = WordSwap | IndirAccessEn | PCIStateRegEn | EnableClockCtl
+		| MaskPCIInt | ClearIntA;
+	csr32(ctlr, Memwind) = 0;
+	mem32w(ctlr, Fwmbox, Fwmagic);	/* the loop below waits for this mailbox to read back as ~Fwmagic */
+	csr32(ctlr, MiscConf) |= GPHYPwrdnOverride | DisableGRCRstOnPpcie | CoreClockBlocksReset;
+	delay(100);
+	pcicfgw32(ctlr->pdev, PciPCR, ctlr->pdev->pcr);	/* restore pci bits lost */
+	csr32(ctlr, MiscHostCtl) |= MaskPCIInt | ClearIntA;
+	csr32(ctlr, MemArbiterMode) |= Enable;
+	csr32(ctlr, MiscHostCtl) |= WordSwap | IndirAccessEn | PCIStateRegEn | EnableClockCtl | TaggedStatus;
+	csr32(ctlr, ModeControl) |= ByteWordSwap;
+	csr32(ctlr, MACMode) = (csr32(ctlr, MACMode) & MACPortMask) | MACPortGMII;
+	delay(40);
+	for(i = 0;; i += 100){
+		if(mem32r(ctlr, Fwmbox) == ~Fwmagic)
+			break;
+		if(i == 20*10000 /* µs */){
+			print("bcm: fw failed to respond %#.8ux\n", mem32r(ctlr, Fwmbox));
+			break; //return -1;
+		}
+		microdelay(100);
+	}
+	/*
+	 * there appears to be no justification for setting these bits in any driver
+	 * i can find.  nor do i have a datasheet that recommends this.  - quanstro
+	 * csr32(ctlr, Pcitlplpl) |= 1<<25 | 1<<29;
+	 */
+	memset(ctlr->status, 0, 20);	/* 20 bytes is the size bcmpci allocates for the status block */
+	csr32(ctlr, Dmarwctl) = (csr32(ctlr, Dmarwctl) & DMAWaterMask) | DMAWaterValue;
+	csr32(ctlr, ModeControl) |= HostSendBDs | HostStackUp | InterruptOnMAC;
+	csr32(ctlr, MiscConf) = (csr32(ctlr, MiscConf) & TimerMask) | TimerValue;
+	csr32(ctlr, MBUFLowWater) = 0x20;
+	csr32(ctlr, MBUFHighWater) = 0x60;
+	csr32(ctlr, LowWaterMax) = (csr32(ctlr, LowWaterMax) & LowWaterMaxMask) | LowWaterMaxValue;
+	csr32(ctlr, BufferManMode) |= Enable | Attn;
+	if(bcmµwait(ctlr, 2000, BufferManMode, Enable, Enable) == -1){
+		print("bcm: failed to enable buffers\n");
+		return -1;
+	}
+	csr32(ctlr, FTQReset) = ~0;
+	csr32(ctlr, FTQReset) = 0;
+	if(bcmµwait(ctlr, 2000, FTQReset, ~0, 0) == -1){
+		print("bcm: failed to bring ftq out of reset\n");
+		return -1;
+	}
+	csr32(ctlr, RxBDHostAddr) = Pciwaddrh(ctlr->recvprod);	/* ring addresses are written high word first, then low */
+	csr32(ctlr, RxBDHostAddr + 4) = Pciwaddrl(ctlr->recvprod);
+	csr32(ctlr, RxBDFlags) = RxProdRingLen << 16;
+	csr32(ctlr, RxBDNIC) = 0x6000;
+	csr32(ctlr, RxBDRepl) = 25;
+	csr32(ctlr, SendBDRingHostIdx) = 0;
+	csr32(ctlr, SendBDRingHostIdx+4) = 0;
+	mem32w(ctlr, SendRCB, Pciwaddrh(ctlr->sendr));
+	mem32w(ctlr, SendRCB + 4, Pciwaddrl(ctlr->sendr));
+	mem32w(ctlr, SendRCB + 8, SendRingLen << 16);
+	mem32w(ctlr, SendRCB + 12, 0x4000);
+	for(i=1; i<4; i++)	/* NOTE(review): presumably marks return rings 1-3 as unused -- confirm against the datasheet */
+		mem32w(ctlr, RxRetRCB + i * 0x10 + 8, 2);
+	mem32w(ctlr, RxRetRCB, Pciwaddrh(ctlr->recvret));
+	mem32w(ctlr, RxRetRCB + 4, Pciwaddrl(ctlr->recvret));
+	mem32w(ctlr, RxRetRCB + 8, RxRetRingLen << 16);
+	csr32(ctlr, RxProdBDRingIdx) = 0;
+	csr32(ctlr, RxProdBDRingIdx+4) = 0;
+	/* this delay is not in the datasheet, but necessary */
+	delay(1);
+	i = csr32(ctlr, MACAddress);	/* first word supplies ea[0..1], second word ea[2..5]; j sums the six bytes */
+	j = edev->ea[0] = i >> 8;
+	j += edev->ea[1] = i;
+	i = csr32(ctlr, MACAddress + 4);
+	j += edev->ea[2] = i >> 24;
+	j += edev->ea[3] = i >> 16;
+	j += edev->ea[4] = i >> 8;
+	j += edev->ea[5] = i;
+	csr32(ctlr, RandomBackoff) = j & 0x3FF;	/* backoff seed is the low 10 bits of the address byte sum */
+	csr32(ctlr, RxMTU) = Rbsz;
+	csr32(ctlr, TxMACLengths) = 0x2620;
+	csr32(ctlr, RxListPlacement) = 1<<3; /* one list */
+	csr32(ctlr, RxListPlacementMask) = 0xFFFFFF;
+	csr32(ctlr, RxListPlacementConf) |= RxStats;
+	csr32(ctlr, SendInitiatorMask) = 0xFFFFFF;
+	csr32(ctlr, SendInitiatorConf) |= SendStats;
+	csr32(ctlr, HostCoalMode) = 0;
+	if(bcmµwait(ctlr, 2000, HostCoalMode, ~0, 0) == -1){
+		print("bcm: failed to unset coalescing\n");
+		return -1;
+	}
+	csr32(ctlr, HostCoalRxTicks) = 150;
+	csr32(ctlr, HostCoalSendTicks) = 150;
+	csr32(ctlr, RxMaxCoalFrames) = 10;
+	csr32(ctlr, SendMaxCoalFrames) = 10;
+	csr32(ctlr, RxMaxCoalFramesInt) = 0;
+	csr32(ctlr, SendMaxCoalFramesInt) = 0;
+	csr32(ctlr, StatusBlockHostAddr) = Pciwaddrh(ctlr->status);
+	csr32(ctlr, StatusBlockHostAddr + 4) = Pciwaddrl(ctlr->status);
+	csr32(ctlr, HostCoalMode) |= Enable;
+	csr32(ctlr, RxBDCompletionMode) |= Enable | Attn;
+	csr32(ctlr, RxListPlacementMode) |= Enable;
+	csr32(ctlr, MACMode) |= MACEnable;
+	csr32(ctlr, MiscLocalControl) |= InterruptOnAttn | AutoSEEPROM;
+	csr32(ctlr, InterruptMailbox) = 0;
+	csr32(ctlr, WriteDMAMode) |= 0x200003fe; /* pulled out of my nose */
+	csr32(ctlr, ReadDMAMode) |= 0x3fe;
+	csr32(ctlr, RxDataCompletionMode) |= Enable | Attn;
+	csr32(ctlr, SendDataCompletionMode) |= Enable;
+	csr32(ctlr, SendBDCompletionMode) |= Enable | Attn;
+	csr32(ctlr, RxBDInitiatorMode) |= Enable | Attn;
+	csr32(ctlr, RxDataBDInitiatorMode) |= Enable | (1<<4);
+	csr32(ctlr, SendDataInitiatorMode) |= Enable;
+	csr32(ctlr, SendBDInitiatorMode) |= Enable | Attn;
+	csr32(ctlr, SendBDSelectorMode) |= Enable | Attn;
+	ctlr->recvprodi = 0;
+	while(replenish(ctlr) >= 0)	/* keep calling replenish until it returns a negative value */
+		;
+	csr32(ctlr, TxMACMode) |= Enable;
+	csr32(ctlr, RxMACMode) |= Enable;
+	csr32(ctlr, Pwrctlstat) &= ~3;
+	csr32(ctlr, MIStatus) |= 1<<0;
+	csr32(ctlr, MACEventEnable) = 0;
+	csr32(ctlr, MACEventStatus) |= (1<<12);
+	csr32(ctlr, MIMode) = 0xC0000;		/* set base mii clock */
+	microdelay(40);
+
+	if(0){	/* phy reset is compiled out; the reason is given in the comment below */
+		/* bug (ours): can't reset phy without dropping into 100mbit mode */
+		miiw(ctlr, Bmcr, BmcrR);
+		for(i = 0;; i += 100){
+			if((miir(ctlr, Bmcr) & BmcrR) == 0)
+				break;
+			if(i == 10000 /* µs */){
+				print("bcm: phy reset failure\n");
+				return -1;
+			}
+			microdelay(100);
+		}
+	}
+	miiw(ctlr, Bmcr, BmcrAne | BmcrRan);
+
+	miiw(ctlr, PhyAuxControl, 2);
+	miir(ctlr, PhyIntStatus);	/* read twice, results discarded; NOTE(review): presumably clears latched phy interrupts -- confirm */
+	miir(ctlr, PhyIntStatus);
+	miiw(ctlr, PhyIntMask, ~(1<<1));
+	csr32(ctlr, MACEventEnable) |= 1<<12;
+	for(i = 0; i < 4; i++)	/* set every bit in all four MACHash words */
+		csr32(ctlr, MACHash + 4*i) = ~0;
+	for(i = 0; i < 8; i++)	/* zero the first eight RxRules entries */
+		csr32(ctlr, RxRules + 8 * i) = 0;
+	csr32(ctlr, RxRulesConf) = 1 << 3;
+	csr32(ctlr, MSIMode) |= Enable;
+	csr32(ctlr, MiscHostCtl) &= ~(MaskPCIInt | ClearIntA);	/* drop the interrupt mask that was set at the top */
+	dprint("bcm: reset: fin\n");
+	return 0;
+}
+
+/*
+ * Map a PCI device to the driver's chip type.
+ * Returns -1 when the vendor id is not 0x14e4 or the
+ * device id is not one of those listed.
+ */
+static int
+didtype(Pcidev *p)
+{
+	if(p->vid == 0x14e4)
+		switch(p->did){
+		case 0x165a:		/* 5722 gbe */
+			return b5722;
+		case 0x1670:		/* ?? */
+		case 0x1677:		/* 5751 gbe */
+		case 0x167d:		/* 5751m gbe */
+		case 0x167e:		/* 5751f fast */
+			return b5751;
+		case 0x1672:		/* 5754m */
+		case 0x167a:		/* 5754 gbe */
+			return b5754;
+		case 0x1673:		/* 5755m gbe */
+		case 0x167b:		/* 5755 gbe */
+			return b5755;
+		case 0x1674:		/* 5756me gbe */
+			return b5756;
+		case 0x1696:		/* 5782 gbe; steve */
+			return b5782;
+		case 0x1693:		/* 5787m gbe */
+		case 0x169b:		/* 5787 gbe */
+			return b5787;
+		case 0x1712:		/* 5906 fast */
+		case 0x1713:		/* 5906m fast */
+			return b5906;
+		}
+	return -1;
+}
+
+/*
+ * Scan the PCI bus for supported controllers (class 2, subclass 0,
+ * device id known to didtype) and append a Ctlr for each one to the
+ * list headed by bcmhead.  For every match the device registers are
+ * mapped and the status block, descriptor rings and the per-slot
+ * arrays are allocated.  A device whose mapping or allocations fail
+ * is skipped and everything obtained for it is released.
+ */
+static void
+bcmpci(void)
+{
+	int type;
+	void *mem;
+	Ctlr *ctlr, **xx;
+	Pcidev *p;
+
+	xx = &bcmhead;
+	for(p = nil; p = pcimatch(p, 0, 0); ) {
+		if(p->ccrb != 2 || p->ccru != 0 || (type = didtype(p)) == -1)
+			continue;
+		pcisetbme(p);
+		pcisetpms(p, 0);
+		ctlr = malloc(sizeof(Ctlr));
+		if(ctlr == nil)
+			continue;
+		ctlr->type = type;
+		ctlr->port = p->mem[0].bar & ~0x0F;
+		mem = vmap(ctlr->port, p->mem[0].size);
+		if(mem == nil) {
+			print("bcm: can't map %#p\n", (uvlong)ctlr->port);
+			free(ctlr);
+			continue;
+		}
+		ctlr->pdev = p;
+		ctlr->nic = mem;
+		ctlr->status = mallocalign(20, 16, 0, 0);
+		ctlr->recvprod = mallocalign(32 * RxProdRingLen, 16, 0, 0);
+		ctlr->recvret = mallocalign(32 * RxRetRingLen, 16, 0, 0);
+		ctlr->sendr = mallocalign(16 * SendRingLen, 16, 0, 0);
+		ctlr->sends = malloc(sizeof *ctlr->sends * SendRingLen);
+		/* one slot per receive producer descriptor, sized from rxs itself */
+		ctlr->rxs = malloc(sizeof *ctlr->rxs * RxProdRingLen);
+		/*
+		 * bcminit hands these buffers to the hardware and clears the
+		 * status block without checking them, so refuse the device
+		 * here if any allocation failed.
+		 */
+		if(ctlr->status == nil || ctlr->recvprod == nil || ctlr->recvret == nil
+		|| ctlr->sendr == nil || ctlr->sends == nil || ctlr->rxs == nil){
+			print("bcm: no memory for rings\n");
+			free(ctlr->status);
+			free(ctlr->recvprod);
+			free(ctlr->recvret);
+			free(ctlr->sendr);
+			free(ctlr->sends);
+			free(ctlr->rxs);
+			vunmap(mem, p->mem[0].size);
+			free(ctlr);
+			continue;
+		}
+		*xx = ctlr;
+		xx = &ctlr->next;
+	}
+}
+
+/*
+ * Switch promiscuous reception on or off by setting or
+ * clearing bit 8 of the RxMACMode register.
+ */
+static void
+bcmpromiscuous(void* arg, int on)
+{
+	Ether *edev;
+	Ctlr *ctlr;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+	if(!on){
+		csr32(ctlr, RxMACMode) &= ~(1<<8);
+		return;
+	}
+	csr32(ctlr, RxMACMode) |= 1<<8;
+}
+
+static void	/* multicast hook installed by bcmpnp; deliberately does nothing */
+bcmmulticast(void*, uchar*, int)
+{	/* NOTE(review): bcminit sets every MACHash bit, so presumably all multicast is already accepted -- confirm */
+}
+
+/*
+ * Probe routine registered with addethercard.
+ * The PCI bus is scanned once, on the first call.  Each attempt
+ * claims the first inactive controller whose port matches
+ * edev->port (a port of 0 matches any), fills in edev and runs
+ * bcminit.  If bcminit fails the search is repeated; the failed
+ * controller stays marked active and so is not picked again.
+ * Returns 0 on success, -1 when no controller is left to try.
+ */
+static int
+bcmpnp(Ether* edev)
+{
+	Ctlr *c;
+	static int scanned;
+
+	if(!scanned){
+		bcmpci();
+		scanned = 1;
+	}
+
+	for(;;){
+		for(c = bcmhead; c != nil; c = c->next)
+			if(!c->active && (edev->port == 0 || edev->port == c->port))
+				break;
+		if(c == nil)
+			return -1;
+		c->active = 1;
+
+		c->ether = edev;
+		edev->ctlr = c;
+		edev->port = c->port;
+		edev->irq = c->pdev->intl;
+		edev->tbdf = c->pdev->tbdf;
+		edev->interrupt = bcminterrupt;
+		edev->ifstat = bcmifstat;
+		edev->transmit = bcmtransmit;
+		edev->multicast = bcmmulticast;
+		edev->promiscuous = bcmpromiscuous;
+		edev->arg = edev;
+		edev->mbps = 1000;
+
+		if(bcminit(edev) != -1)
+			return 0;
+	}
+}
+
+void	/* link-time hook: registers bcmpnp as the probe routine for "bcm57xx" */
+etherbcmlink(void)
+{
+	addethercard("bcm57xx", bcmpnp);
+}

+ 60 - 0
sys/src/9/k10/etherif.h

@@ -0,0 +1,60 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+enum
+{
+	Eaddrlen	= 6,		/* bytes in an ethernet address; sizes Ether.ea and Etherpkt.d/s */
+	ETHERMINTU	= 60,		/* minimum transmit size */
+	ETHERMAXTU	= 1514,		/* maximum transmit size */
+	ETHERHDRSIZE	= 14,		/* size of an ethernet header */
+
+	MaxEther	= 48,		/* NOTE(review): presumably the limit on configured interfaces -- confirm in devether.c */
+	Ntypes		= 8,		/* length of Ether.scan[] */
+};
+
+typedef struct Ether Ether;
+struct Ether {
+	ISAConf;			/* hardware info; unnamed member, so its fields are reached directly through Ether */
+
+	int	ctlrno;
+	int	tbdf;			/* type+busno+devno+funcno */
+	uchar	ea[Eaddrlen];		/* ethernet address */
+
+	void	(*attach)(Ether*);	/* filled in by reset routine */
+	void	(*detach)(Ether*);
+	void	(*transmit)(Ether*);
+	void	(*interrupt)(Ureg*, void*);
+	long	(*ifstat)(Ether*, void*, long, ulong);
+	long 	(*ctl)(Ether*, void*, long); /* custom ctl messages */
+	void	(*power)(Ether*, int);	/* power on/off */
+	void	(*shutdown)(Ether*);	/* shutdown hardware before reboot */
+	void	*ctlr;			/* driver-private controller state */
+
+	int	scan[Ntypes];		/* base station scanning interval */
+	int	nscan;			/* number of base station scanners */
+
+	Netif;				/* unnamed member, so its fields are reached directly through Ether */
+};
+
+typedef struct Etherpkt Etherpkt;
+struct Etherpkt	/* 6+6+2+1500 = 1514 bytes, the same as ETHERMAXTU */
+{
+	uchar	d[Eaddrlen];	/* presumably the destination address -- confirm against users */
+	uchar	s[Eaddrlen];	/* presumably the source address -- confirm against users */
+	uchar	type[2];	/* d, s and type together are 14 bytes, the same as ETHERHDRSIZE */
+	uchar	data[1500];
+};
+
+extern Block* etheriq(Ether*, Block*, int);
+extern void addethercard(char*, int(*)(Ether*));
+extern ulong ethercrc(uchar*, int);
+extern int parseether(uchar*, char*);
+
+#define NEXT(x, l)	(((x)+1)%(l))	/* index after x in a ring of l entries, wrapping to 0 */
+#define PREV(x, l)	(((x) == 0) ? (l)-1: (x)-1)	/* index before x in a ring of l entries, wrapping to l-1; x is evaluated twice */

+ 261 - 0
sys/src/9/k10/fns.h

@@ -0,0 +1,261 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "../port/portfns.h"
+void	intrac(Proc*);
+void	acinit(void);
+int	acpiinit(void);
+void	actrapenable(int, char* (*)(Ureg*, void*), void*, char*);
+void	apicipi(int);
+void	apicpri(int);
+void	acsysret(void);
+void	actouser(void);
+void		runacore(void);
+void	aamloop(int);
+Dirtab*	addarchfile(char*, int, long(*)(Chan*,void*,long,vlong), long(*)(Chan*,void*,long,vlong));
+void	acmmuswitch(void);
+void	acmodeset(int);
+void	archfmtinstall(void);
+void	archidle(void);
+int	archmmu(void);
+int	asmfree(uintmem, uintmem, int);
+uvlong	asmalloc(uintmem, uintmem, int, int);
+void	asminit(void);
+void	asmmapinit(uintmem, uintmem, int);
+extern void asmmodinit(u32int, u32int, char*);
+void	noerrorsleft(void);
+void	archinit(void);
+void	archreset(void);
+vlong	archhz(void);
+void	cgaconsputs(char*, int);
+void	cgainit(void);
+void	cgapost(int);
+void	checkpa(char*, uintmem);
+#define	clearmmucache()				/* x86 doesn't have one */
+void	(*coherence)(void);
+int	corecolor(int);
+u32int	cpuid(u32int, u32int, u32int[4]);
+int	dbgprint(char*, ...);
+int	decref(Ref*);
+void	delay(int);
+void	dumpmmu(Proc*);
+void	dumpmmuwalk(u64int pa);
+void	dumpptepg(int lvl,uintptr pa);
+#define	evenaddr(x)				/* x86 doesn't care */
+int	fpudevprocio(Proc*, void*, long, uintptr, int);
+void	fpuinit(void);
+void	fpunoted(void);
+void	fpunotify(Ureg*);
+void	fpuprocrestore(Proc*);
+void	fpuprocsave(Proc*);
+void	fpusysprocsetup(Proc*);
+void	fpusysrfork(Ureg*);
+void	fpusysrforkchild(Proc*, Proc*);
+Mach*	getac(Proc*, int);
+char*	getconf(char*);
+void	halt(void);
+void	hardhalt(void);
+int	i8042auxcmd(int);
+int	i8042auxcmds(uchar*, int);
+void	i8042auxenable(void (*)(int, int));
+void	i8042reset(void);
+Uart*	i8250console(char*);
+void*	i8250alloc(int, int, int);
+vlong	i8254hz(u32int[2][4]);
+void	idlehands(void);
+void	acidthandlers(void);
+void	idthandlers(void);
+int	inb(int);
+int	incref(Ref*);
+void	insb(int, void*, int);
+ushort	ins(int);
+void	inss(int, void*, int);
+ulong	inl(int);
+void	insl(int, void*, int);
+int	intrdisable(void*);
+void*	intrenable(int, void (*)(Ureg*, void*), void*, int, char*);
+void	invlpg(uintptr);
+void	iofree(int);
+void	ioinit(void);
+int	iounused(int, int);
+int	ioalloc(int, int, int, char*);
+int	ioreserve(int, int, int, char*);
+int	iprint(char*, ...);
+int	isaconfig(char*, int, ISAConf*);
+void	kbdenable(void);
+void	kbdinit(void);
+void	kexit(Ureg*);
+#define	kmapinval()
+void	lfence(void);
+void	links(void);
+void	machinit(void);
+void	mach0init(void);
+void	mapraminit(uvlong, uvlong);
+void	mapupainit(uvlong, ulong);
+void	meminit(void);
+void	mfence(void);
+void	mmuflushtlb(u64int);
+void	mmuinit(void);
+uintptr	mmukmap(uintptr, uintptr, usize);
+int	mmukmapsync(uvlong);
+uintmem	mmuphysaddr(uintptr);
+int	mmuwalk(PTE*, uintptr, int, PTE**, PTE (*)(usize));
+int	multiboot(u32int, u32int, int);
+void	ndnr(void);
+uchar	nvramread(int);
+void	nvramwrite(int, uchar);
+void	optionsinit(char*);
+void	outb(int, int);
+void	outsb(int, void*, int);
+void	outs(int, ushort);
+void	outss(int, void*, int);
+void	outl(int, ulong);
+void	outsl(int, void*, int);
+int	pcicap(Pcidev*, int);
+int	pcicfgr8(Pcidev*, int);
+int	pcicfgr16(Pcidev*, int);
+int	pcicfgr32(Pcidev*, int);
+void	pcicfgw8(Pcidev*, int, int);
+void	pcicfgw16(Pcidev*, int, int);
+void	pcicfgw32(Pcidev*, int, int);
+void	pciclrbme(Pcidev*);
+void	pciclrmwi(Pcidev*);
+int	pcigetpms(Pcidev*);
+void	pcihinv(Pcidev*);
+Pcidev*	pcimatch(Pcidev*, int, int);
+Pcidev*	pcimatchtbdf(int);
+void	pcireset(void);
+void	pcisetbme(Pcidev*);
+void	pcisetmwi(Pcidev*);
+int	pcisetpms(Pcidev*, int);
+int	pickcore(int, int);
+void	printcpufreq(void);
+void	putac(Mach*);
+void	runapcore(int);
+int	screenprint(char*, ...);			/* debugging */
+void	sfence(void);
+void	spldone(void);
+u64int	splhi(void);
+u64int	spllo(void);
+void	splx(u64int);
+void	splxpc(u64int);
+void	stopac(void);
+void	syncclock(void);
+void	syscall(int scallnr, Ureg* ureg);
+void*	sysexecregs(uintptr, ulong, ulong);
+uintptr	sysexecstack(uintptr, int);
+void	sysprocsetup(Proc*);
+void	tssrsp0(u64int);
+void	trapenable(int, void (*)(Ureg*, void*), void*, char*);
+void	trapinit(void);
+void	trap(Ureg*);
+void	umeminit(void);
+void	upafree(uintptr, usize);
+uintptr	upamalloc(uintptr, usize, usize);
+void	upareserve(uintptr, usize);
+int	userureg(Ureg*);
+void*	vmap(uintptr, usize);
+void	vsvminit(int, int);
+void	vunmap(void*, usize);
+
+extern u64int cr0get(void);
+extern void cr0put(u64int);
+extern u64int cr2get(void);
+extern u64int cr3get(void);
+extern void cr3put(u64int);
+extern u64int cr4get(void);
+extern void cr4put(u64int);
+extern void gdtget(void*);
+extern void gdtput(int, u64int, u16int);
+extern void idtput(int, u64int);
+extern u64int rdmsr(u32int);
+extern u64int rdtsc(void);
+extern void trput(u64int);
+extern void wrmsr(u32int, u64int);
+
+extern int islo(void);
+extern void spldone(void);
+extern Mpl splhi(void);
+extern Mpl spllo(void);
+extern void splx(Mpl);
+
+int	cas32(void*, u32int, u32int);
+int	cas64(void*, u64int, u64int);
+int	tas32(void*);
+u64int	fas64(u64int*, u64int);
+
+#define CASU(p, e, n)	cas64((p), (u64int)(e), (u64int)(n))	/* CASU, CASV and CASP all expand to cas64 with e and n cast to u64int */
+#define CASV(p, e, n)	cas64((p), (u64int)(e), (u64int)(n))
+#define CASP(p, e, n)	cas64((p), (u64int)(e), (u64int)(n))
+#define CASW(p, e, n)	cas32((p), (e), (n))	/* 32-bit variant: operands are passed through uncast */
+#define TAS(addr)	tas32((addr))
+#define	FASP(p, v)	((void*)fas64((u64int*)(p), (u64int)(v)))	/* pointer-typed wrapper round fas64; result is cast back to void* */
+
+void	touser(uintptr);
+void	syscallentry(void);
+void	acsyscallentry(void);
+void	syscallreturn(void);
+void	sysrforkret(void);
+
+#define	waserror()	(up->nerrlab++, setlabel(&up->errlab[up->nerrlab-1]))
+
+#define	dcflush(a, b)
+
+#define PTR2UINT(p)	((uintptr)(p))
+#define UINT2PTR(i)	((void*)(i))
+
+void*	KADDR(uintptr);
+uintmem	PADDR(void*);
+
+#define BIOSSEG(a)	KADDR(((uint)(a))<<4)
+
+/*
+ * apic.c
+ */
+extern int apiceoi(int);
+extern void apicipi(int);
+extern void apicinit(int, uintmem, int);
+extern int apicisr(int);
+extern int apiconline(void);
+extern void apicpri(int);
+extern void apicsipi(int, uintmem);
+
+extern void ioapicinit(int, uintmem);
+extern void ioapicintrinit(int, int, int, int, u32int);
+extern void ioapiconline(void);
+
+/*
+ * archk10.c
+ */
+extern void millidelay(int);
+extern void k10mwait(void*);
+
+/*
+ * i8259.c
+ */
+extern int i8259init(int);
+extern int i8259irqdisable(int);
+extern int i8259irqenable(int);
+extern int i8259isr(int);
+
+/*
+ * mp.c
+ */
+extern void mpsinit(int);
+
+/*
+ * sipi.c
+ */
+extern void sipi(void);
+
+/*
+ * debug
+ */
+void HERE(void);
+void DONE(void);

+ 549 - 0
sys/src/9/k10/fpu.c

@@ -0,0 +1,549 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * SIMD Floating Point.
+ * Assembler support to get at the individual instructions
+ * is in l64fpu.s.
+ * There are opportunities to be lazier about saving and
+ * restoring the state and allocating the storage needed.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "amd64.h"
+#include "ureg.h"
+
+enum {						/* FCW, FSW and MXCSR */
+	I		= 0x00000001,		/* Invalid-Operation */
+	D		= 0x00000002,		/* Denormalized-Operand */
+	Z		= 0x00000004,		/* Zero-Divide */
+	O		= 0x00000008,		/* Overflow */
+	U		= 0x00000010,		/* Underflow */
+	P		= 0x00000020,		/* Precision */
+};
+
+enum {						/* FCW */
+	PCs		= 0x00000000,		/* Precision Control -Single */
+	PCd		= 0x00000200,		/* -Double */
+	PCde		= 0x00000300,		/* -Double Extended */
+	RCn		= 0x00000000,		/* Rounding Control -Nearest */
+	RCd		= 0x00000400,		/* -Down */
+	RCu		= 0x00000800,		/* -Up */
+	RCz		= 0x00000C00,		/* -Toward Zero */
+};
+
+enum {						/* FSW */
+	Sff		= 0x00000040,		/* Stack Fault Flag */
+	Es		= 0x00000080,		/* Error Summary Status */
+	C0		= 0x00000100,		/* CF - Condition Code Bits */
+	C1		= 0x00000200,		/* O/U# */
+	C2		= 0x00000400,		/* PF */
+	C3		= 0x00004000,		/* ZF */
+	B		= 0x00008000,		/* Busy */
+};
+
+enum {						/* MXCSR */
+	Daz		= 0x00000040,		/* Denormals are Zeros */
+	Im		= 0x00000080,		/* I Mask */
+	Dm		= 0x00000100,		/* D Mask */
+	Zm		= 0x00000200,		/* Z Mask */
+	Om		= 0x00000400,		/* O Mask */
+	Um		= 0x00000800,		/* U Mask */
+	Pm		= 0x00001000,		/* P Mask */
+	Rn		= 0x00000000,		/* Round to Nearest */
+	Rd		= 0x00002000,		/* Round Down */
+	Ru		= 0x00004000,		/* Round Up */
+	Rz		= 0x00006000,		/* Round toward Zero */
+	Fz		= 0x00008000,		/* Flush to Zero for Um */
+};
+
+enum {						/* PFPU.state */
+	Init		= 0,			/* The FPU has not been used */
+	Busy		= 1,			/* The FPU is being used */
+	Idle		= 2,			/* The FPU has been used */
+
+	Hold		= 4,			/* flag or'ed into the state by fpunotify, cleared by fpunoted */
+};
+
+extern void _clts(void);
+extern void _fldcw(u16int);
+extern void _fnclex(void);
+extern void _fninit(void);
+extern void _fxrstor(Fxsave*);
+extern void _fxsave(Fxsave*);
+extern void _fwait(void);
+extern void _ldmxcsr(u32int);
+extern void _stts(void);
+
+/*
+ * Called from procdevtab.read and procdevtab.write to
+ * allow user process access to the FPU registers.
+ * This is the only FPU routine which is called directly
+ * from the port code; it would be nice to have dynamic
+ * creation of entries in the device file trees...
+ *
+ * Copies up to n bytes between a and the saved FPU image of
+ * proc, starting offset bytes into the image; a non-zero
+ * write copies into the image, zero copies out of it.
+ * Returns the number of bytes copied, or 0 when offset is at
+ * or past the end of the image or proc has no save area.
+ */
+int
+fpudevprocio(Proc* proc, void* a, long n, uintptr offset, int write)
+{
+	uchar *image;
+
+	if(offset >= sizeof(Fxsave))
+		return 0;
+	image = proc->fpusave;
+	if(image == nil)
+		return 0;
+
+	/* clip the transfer at the end of the save area */
+	if(offset+n > sizeof(Fxsave))
+		n = sizeof(Fxsave) - offset;
+
+	if(write)
+		memmove(image+offset, a, n);
+	else
+		memmove(a, image+offset, n);
+
+	return n;
+}
+
+void
+fpunotify(Ureg*)
+{
+	/*
+	 * Runs when a note is about to be delivered to a user
+	 * process, usually at the end of a system call.
+	 * Note handlers may not use the FPU: live state is saved
+	 * and the FPU disabled, then Hold is set so the Device
+	 * Not Available handler can refuse any use.
+	 */
+	if(up->fpustate != Busy){
+		up->fpustate |= Hold;
+		return;
+	}
+	_fxsave(up->fpusave);
+	_stts();
+	up->fpustate = Idle|Hold;
+}
+
+void
+fpunoted(void)
+{
+	/*
+	 * Reached from sysnoted() through the machine-dependent
+	 * noted() routine.
+	 * Drops the Hold flag that fpunotify() set.
+	 */
+	up->fpustate = up->fpustate & ~Hold;
+}
+
+void
+fpusysrfork(Ureg*)
+{
+	/*
+	 * Runs early in the non-interruptible path of sysrfork(),
+	 * via the machine-dependent syscall() routine.
+	 * Live FPU state is written to the save area so that it
+	 * can simply be copied to the child later on.
+	 */
+	if(up->fpustate == Busy){
+		_fxsave(up->fpusave);
+		_stts();
+		up->fpustate = Idle;
+	}
+}
+
+void
+fpusysrforkchild(Proc* child, Proc* parent)
+{
+	/*
+	 * Called later in sysrfork() via the machine-dependent
+	 * sysrforkchild() routine.
+	 * Copy the parent FPU state to the child.
+	 *
+	 * The child's save pointer must be the 16-byte aligned
+	 * address inside the child's own fxsave buffer; deriving it
+	 * from up->fxsave would make the child share the save area
+	 * of the process doing the fork.
+	 */
+	child->fpustate = parent->fpustate;
+	child->fpusave = (void*)((PTR2UINT(child->fxsave) + 15) & ~15);
+	if(child->fpustate == Init)
+		return;
+
+	memmove(child->fpusave, parent->fpusave, sizeof(Fxsave));
+}
+
+void
+fpuprocsave(Proc* p)
+{
+	/*
+	 * Reached from sched() and sleep() through the
+	 * machine-dependent procsave() routine, just before
+	 * entering the scheduler.
+	 * A process that is not using the FPU needs nothing.
+	 */
+	if(p->fpustate != Busy)
+		return;
+
+	if(p->state != Moribund){
+		/*
+		 * Save the FPU state without handling pending
+		 * unmasked exceptions, then disable the FPU.
+		 * Postnote() can't be called here as sleep() already
+		 * has up->rlock, so pending exceptions are dealt with
+		 * when the process next runs and takes a Device Not
+		 * Available fault to activate the FPU.
+		 */
+		_fxsave(p->fpusave);
+		_stts();
+		p->fpustate = Idle;
+		return;
+	}
+
+	/*
+	 * The process is dead: clear and disable the FPU and
+	 * reset the state for whoever gets this proc struct next.
+	 */
+	_clts();
+	_fnclex();
+	_stts();
+	p->fpustate = Init;
+}
+
+void
+fpuprocrestore(Proc* p)
+{
+	/*
+	 * The process has been rescheduled and is about to run.
+	 * Nothing to do here right now: the argument is only marked
+	 * as used.  If the process tries to use the FPU again it
+	 * will cause a Device Not Available exception and the state
+	 * will then be restored (the Idle case in xfpunm).
+	 */
+	USED(p);
+}
+
+void
+fpusysprocsetup(Proc* p)
+{
+	/*
+	 * Reached from sysexec() through sysprocsetup() to
+	 * prepare the FPU for the new process image: pending
+	 * exceptions are cleared, the FPU is disabled and the
+	 * state goes back to Init.  Nothing is done when the
+	 * state is already Init.
+	 */
+	if(p->fpustate == Init)
+		return;
+
+	_clts();
+	_fnclex();
+	_stts();
+	p->fpustate = Init;
+}
+
+void
+acfpusysprocsetup(Proc *p)
+{
+	if(p->fpustate != Init)
+		return;
+
+	/*
+	 * The FPU is initialized in the TC but we must initialize
+	 * it in the AC.  The state is moved off Init first so that
+	 * fpusysprocsetup() does not skip its reset sequence.
+	 */
+	p->fpustate = Idle;
+	fpusysprocsetup(p);
+}
+
+/*
+ * Build the note text for an unmasked x87 exception recorded in
+ * the saved FPU image of the current process.  The text is
+ * formatted into up->genbuf, which is returned.
+ */
+static char*
+fpunote(void)
+{
+	ushort fsw;
+	Fxsave *fpusave;
+	char *m;
+
+	/*
+	 * The Sff bit is sticky, meaning it should be explicitly
+	 * cleared or there's no way to tell if the exception was an
+	 * invalid operation or a stack fault.
+	 */
+	fpusave = up->fpusave;
+	fsw = (fpusave->fsw & ~fpusave->fcw) & (Sff|P|U|O|Z|D|I);
+	if(fsw & I){
+		if(fsw & Sff){
+			/*
+			 * C1 is not among the bits kept in fsw above, so
+			 * it must be tested in the saved status word;
+			 * testing fsw would always report an underflow.
+			 */
+			if(fpusave->fsw & C1)
+				m = "Stack Overflow";
+			else
+				m = "Stack Underflow";
+		}
+		else
+			m = "Invalid Operation";
+	}
+	else if(fsw & D)
+		m = "Denormal Operand";
+	else if(fsw & Z)
+		m = "Divide-By-Zero";
+	else if(fsw & O)
+		m = "Numeric Overflow";
+	else if(fsw & U)
+		m = "Numeric Underflow";
+	else if(fsw & P)
+		m = "Precision";
+	else
+		m =  "Unknown";
+
+	snprint(up->genbuf, sizeof(up->genbuf),
+		"sys: fp: %s Exception ipo=%#llux fsw=%#ux",
+		m, fpusave->rip, fsw);
+	return up->genbuf;
+}
+
+/*
+ * Common body of the #XF handlers.  Saves the FPU state, disables
+ * the FPU and returns, in up->genbuf, the note text describing the
+ * first unmasked SIMD exception flagged in the saved MXCSR.
+ * Panics if the fault was taken at a kernel address.
+ */
+char*
+xfpuxf(Ureg* ureg, void*)
+{
+	u32int mxcsr;
+	Fxsave *fpusave;
+	char *m;
+
+	/*
+	 * #XF - SIMD Floating Point Exception (Vector 18).
+	 */
+
+	/*
+	 * Save FPU state to check out the error.
+	 */
+	fpusave = up->fpusave;
+	_fxsave(fpusave);
+	_stts();
+	up->fpustate = Idle;
+
+	/* this is the #XF handler, so say so in the panic */
+	if(ureg->ip & KZERO)
+		panic("#XF: ip=%#p", ureg->ip);
+
+	/*
+	 * Work out the note for the user process.
+	 * Each exception counts only when its flag bit is set
+	 * and the matching mask bit is clear.  The caller decides
+	 * whether to post the note or hand the text on.
+	 */
+	mxcsr = fpusave->mxcsr;
+	if((mxcsr & (Im|I)) == I)
+		m = "Invalid Operation";
+	else if((mxcsr & (Dm|D)) == D)
+		m = "Denormal Operand";
+	else if((mxcsr & (Zm|Z)) == Z)
+		m = "Divide-By-Zero";
+	else if((mxcsr & (Om|O)) == O)
+		m = "Numeric Overflow";
+	else if((mxcsr & (Um|U)) == U)
+		m = "Numeric Underflow";
+	else if((mxcsr & (Pm|P)) == P)
+		m = "Precision";
+	else
+		m =  "Unknown";
+
+	snprint(up->genbuf, sizeof(up->genbuf),
+		"sys: fp: %s Exception mxcsr=%#ux", m, mxcsr);
+	return up->genbuf;
+}
+
+/*
+ * #XF trap handler: post whatever note text xfpuxf produces
+ * to the current process.
+ */
+void
+fpuxf(Ureg *ureg, void *p)
+{
+	char *msg;
+
+	if((msg = xfpuxf(ureg, p)) != nil)
+		postnote(up, 1, msg, NDebug);
+}
+
+char*	/* #XF handler registered with actrapenable: returns the note text instead of posting it */
+acfpuxf(Ureg *ureg, void *p)
+{
+	return xfpuxf(ureg, p);
+}
+
+/*
+ * Common body of the #MF handlers.  Saves the FPU state, disables
+ * the FPU and returns the note text built by fpunote().
+ * Panics if the fault was taken at a kernel address.
+ */
+static char*
+xfpumf(Ureg* ureg, void*)
+{
+	Fxsave *image;
+
+	/*
+	 * #MF - x87 Floating Point Exception Pending (Vector 16).
+	 * The state is saved first so the error can be examined.
+	 */
+	image = up->fpusave;
+	_fxsave(image);
+	_stts();
+	up->fpustate = Idle;
+
+	if((ureg->ip & KZERO) != 0)
+		panic("#MF: ip=%#p rip=%#p", ureg->ip, image->rip);
+
+	/*
+	 * Hand back the note for the user process; fpumf posts
+	 * it, acfpumf returns it to its caller.
+	 */
+	return fpunote();
+}
+
+/*
+ * #MF trap handler: post whatever note text xfpumf produces
+ * to the current process.
+ */
+void
+fpumf(Ureg *ureg, void *p)
+{
+	char *msg;
+
+	if((msg = xfpumf(ureg, p)) != nil)
+		postnote(up, 1, msg, NDebug);
+}
+
+char*	/* #MF handler registered with actrapenable: returns the note text instead of posting it */
+acfpumf(Ureg *ureg, void *p)
+{
+	return xfpumf(ureg, p);
+}
+
+static char*	/* common body of the #NM handlers: returns note text, or nil when the FPU was enabled */
+xfpunm(Ureg* ureg, void*)
+{
+	Fxsave *fpusave;
+
+	/*
+	 * #NM - Device Not Available (Vector 7).
+	 */
+	if(up == nil)
+		panic("#NM: fpu in kernel: ip %#p\n", ureg->ip);
+
+	/*
+	 * Someone tried to use the FPU in a note handler.
+	 * That's a no-no.
+	 */
+	if(up->fpustate & Hold)
+		return "sys: floating point in note handler";
+
+	if(ureg->ip & KZERO)
+		panic("#NM: proc %d %s state %d ip %#p\n",
+			up->pid, up->text, up->fpustate, ureg->ip);
+
+	switch(up->fpustate){
+	case Busy:	/* Busy, like any unknown state, should never fault here */
+	default:
+		panic("#NM: state %d ip %#p\n", up->fpustate, ureg->ip);
+		break;
+	case Init:
+		/*
+		 * A process tries to use the FPU for the
+		 * first time and generates a 'device not available'
+		 * exception.
+		 * Turn the FPU on and initialise it for use.
+		 * Set the precision and mask the exceptions
+		 * we don't care about from the generic Mach value.
+		 */
+		_clts();
+		_fninit();
+		_fwait();
+		_fldcw(m->fcw);
+		_ldmxcsr(m->mxcsr);
+		up->fpusave = (void*)((PTR2UINT(up->fxsave) + 15) & ~15);	/* 16-byte aligned address inside up->fxsave */
+		up->fpustate = Busy;
+		break;
+	case Idle:
+		/*
+		 * Before restoring the state, check for any pending
+		 * exceptions, there's no way to restore the state without
+		 * generating an unmasked exception.
+		 */
+		fpusave = up->fpusave;
+		if((fpusave->fsw & ~fpusave->fcw) & (Sff|P|U|O|Z|D|I))
+			return fpunote();
+
+		/*
+		 * Sff is sticky.
+		 * NOTE(review): Sff is declared with the FSW bits, yet
+		 * it is cleared in the saved fcw here rather than in
+		 * fsw -- confirm which word was intended.
+		 */
+		fpusave->fcw &= ~Sff;
+		_clts();
+		_fxrstor(fpusave);
+		up->fpustate = Busy;
+		break;
+	}
+	return nil;
+}
+
+/*
+ * #NM trap handler: if xfpunm hands back note text, post it
+ * to the current process.
+ */
+void
+fpunm(Ureg *ureg, void *p)
+{
+	char *msg;
+
+	if((msg = xfpunm(ureg, p)) != nil)
+		postnote(up, 1, msg, NDebug);
+}
+
+char*	/* #NM handler registered with actrapenable: returns the note text (or nil) instead of posting it */
+acfpunm(Ureg *ureg, void *p)
+{
+	return xfpunm(ureg, p);
+}
+
+void	/* per-processor FPU setup; the trap handlers are installed only when m->machno is 0 */
+fpuinit(void)
+{
+	u64int r;
+	Fxsave *fxsave;
+	uchar buf[sizeof(Fxsave)+15];	/* 15 spare bytes so a 16-byte aligned Fxsave fits inside */
+
+	/*
+	 * It's assumed there is an integrated FPU, so Em is cleared;
+	 */
+	r = cr0get();
+	r &= ~(Ts|Em);
+	r |= Ne|Mp;
+	cr0put(r);
+
+	r = cr4get();
+	r |= Osxmmexcpt|Osfxsr;
+	cr4put(r);
+
+	_fninit();
+	fxsave = (Fxsave*)((PTR2UINT(buf) + 15) & ~15);
+	memset(fxsave, 0, sizeof(Fxsave));	/* zero first so an mxcsrmask still 0 after _fxsave can be detected */
+	_fxsave(fxsave);
+	m->fcw = RCn|PCd|P|U|D;	/* control word that xfpunm loads on a process's first FPU use */
+	if(fxsave->mxcsrmask == 0)
+		m->mxcsrmask = 0x0000FFBF;	/* fallback mask used when the saved image reports none */
+	else
+		m->mxcsrmask = fxsave->mxcsrmask;
+	m->mxcsr = (Rn|Pm|Um|Dm) & m->mxcsrmask;
+	_stts();
+
+	if(m->machno != 0)
+		return;
+
+	/*
+	 * Set up the exception handlers.
+	 */
+	trapenable(IdtNM, fpunm, 0, "#NM");
+	trapenable(IdtMF, fpumf, 0, "#MF");
+	trapenable(IdtXF, fpuxf, 0, "#XF");
+
+	/* Same thing, for the AC */
+	actrapenable(IdtNM, acfpunm, 0, "#NM");
+	actrapenable(IdtMF, acfpumf, 0, "#MF");
+	actrapenable(IdtXF, acfpuxf, 0, "#XF");
+}

+ 475 - 0
sys/src/9/k10/fpu.c.old

@@ -0,0 +1,475 @@
+/*
+ * SIMD Floating Point.
+ * Assembler support to get at the individual instructions
+ * is in l64fpu.s.
+ * There are opportunities to be lazier about saving and
+ * restoring the state and allocating the storage needed.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "amd64.h"
+#include "ureg.h"
+
+/*
+ * Exception bits; the same six positions are used in the x87
+ * control word, the x87 status word and MXCSR.
+ * In this file a bit set in the saved control word suppresses
+ * the report of the same bit in the status word (see the
+ * fsw & ~fcw tests).
+ */
+enum {						/* FCW, FSW and MXCSR */
+	I		= 0x00000001,		/* Invalid-Operation */
+	D		= 0x00000002,		/* Denormalized-Operand */
+	Z		= 0x00000004,		/* Zero-Divide */
+	O		= 0x00000008,		/* Overflow */
+	U		= 0x00000010,		/* Underflow */
+	P		= 0x00000020,		/* Precision */
+};
+
+/* x87 control word: precision and rounding control fields */
+enum {						/* FCW */
+	PCs		= 0x00000000,		/* Precision Control -Single */
+	PCd		= 0x00000200,		/* -Double */
+	PCde		= 0x00000300,		/* -Double Extended */
+	RCn		= 0x00000000,		/* Rounding Control -Nearest */
+	RCd		= 0x00000400,		/* -Down */
+	RCu		= 0x00000800,		/* -Up */
+	RCz		= 0x00000C00,		/* -Toward Zero */
+};
+
+/*
+ * x87 status word bits above the six exception flags.
+ * The flag names in the C0/C2/C3 comments are the EFLAGS bits
+ * the condition codes correspond to.
+ */
+enum {						/* FSW */
+	Sff		= 0x00000040,		/* Stack Fault Flag */
+	Es		= 0x00000080,		/* Error Summary Status */
+	C0		= 0x00000100,		/* CF - Condition Code Bits */
+	C1		= 0x00000200,		/* O/U# */
+	C2		= 0x00000400,		/* PF */
+	C3		= 0x00004000,		/* ZF */
+	B		= 0x00008000,		/* Busy */
+};
+
+/*
+ * MXCSR: each exception flag (I..P above) has a mask bit
+ * seven positions higher (Im..Pm).
+ */
+enum {						/* MXCSR */
+	Daz		= 0x00000040,		/* Denormals are Zeros */
+	Im		= 0x00000080,		/* I Mask */
+	Dm		= 0x00000100,		/* D Mask */
+	Zm		= 0x00000200,		/* Z Mask */
+	Om		= 0x00000400,		/* O Mask */
+	Um		= 0x00000800,		/* U Mask */
+	Pm		= 0x00001000,		/* P Mask */
+	Rn		= 0x00000000,		/* Round to Nearest */
+	Rd		= 0x00002000,		/* Round Down */
+	Ru		= 0x00004000,		/* Round Up */
+	Rz		= 0x00006000,		/* Round toward Zero */
+	Fz		= 0x00008000,		/* Flush to Zero for Um */
+};
+
+/*
+ * Values kept in a process's fpustate.
+ * Init, Busy and Idle are the states proper; Hold is a flag
+ * OR'ed on top of the state by fpunotify() and removed again
+ * by fpunoted().
+ */
+enum {						/* PFPU.state */
+	Init		= 0,			/* The FPU has not been used */
+	Busy		= 1,			/* The FPU is being used */
+	Idle		= 2,			/* The FPU has been used */
+
+	Hold		= 4,			/* Handling an FPU note */
+};
+
+/*
+ * One routine per instruction; per the note at the top of
+ * this file the assembler support lives in l64fpu.s.
+ */
+extern void _clts(void);
+extern void _fldcw(u16int);
+extern void _fnclex(void);
+extern void _fninit(void);
+extern void _fxrstor(Fxsave*);
+extern void _fxsave(Fxsave*);
+extern void _fwait(void);
+extern void _ldmxcsr(u32int);
+extern void _stts(void);
+
+int
+fpudevprocio(Proc* proc, void* a, long n, uintptr offset, int write)
+{
+	uchar *save;
+
+	/*
+	 * Entry point for procdevtab.read and procdevtab.write:
+	 * gives a user process access to the saved FPU registers
+	 * of proc. This is the only FPU routine called directly
+	 * from the port code; it would be nice to have dynamic
+	 * creation of entries in the device file trees...
+	 *
+	 * Copies at most n bytes between a and the save area,
+	 * starting offset bytes in; returns the count copied,
+	 * or 0 if offset is beyond the area or nothing is saved.
+	 */
+	if(offset >= sizeof(Fxsave))
+		return 0;
+	save = proc->fpusave;
+	if(save == nil)
+		return 0;
+
+	/* never run past the end of the save area */
+	if(offset+n > sizeof(Fxsave))
+		n = sizeof(Fxsave) - offset;
+
+	if(write)
+		memmove(save+offset, a, n);
+	else
+		memmove(a, save+offset, n);
+
+	return n;
+}
+
+void
+fpunotify(Ureg*)
+{
+	/*
+	 * Called when a note is about to be delivered to a
+	 * user process, usually at the end of a system call.
+	 * Note handlers are not allowed to use the FPU so
+	 * the state is marked (after saving if necessary) and
+	 * checked in the Device Not Available handler.
+	 */
+	if(up->fpustate == Busy){
+		/* live registers: save them and go Idle first */
+		_fxsave(up->fpusave);
+		_stts();
+		up->fpustate = Idle;
+	}
+	/* tested by fpunm(); cleared again by fpunoted() */
+	up->fpustate |= Hold;
+}
+
+void
+fpunoted(void)
+{
+	/*
+	 * Reached from sysnoted() by way of the machine-dependent
+	 * noted() routine: the note handler is done, so drop the
+	 * Hold flag that fpunotify() put on the state.
+	 */
+	up->fpustate = up->fpustate & ~Hold;
+}
+
+void
+fpusysrfork(Ureg*)
+{
+	/*
+	 * Reached early in the non-interruptible path of
+	 * sysrfork() by way of the machine-dependent syscall()
+	 * routine. If the FPU is live, park its contents in the
+	 * save area so they can simply be copied to the child
+	 * later; otherwise there is nothing to do.
+	 */
+	if(up->fpustate == Busy){
+		_fxsave(up->fpusave);
+		_stts();
+		up->fpustate = Idle;
+	}
+}
+
+void
+fpusysrforkchild(Proc* child, Proc* parent)
+{
+	/*
+	 * Called later in sysrfork() via the machine-dependent
+	 * sysrforkchild() routine.
+	 * Copy the parent FPU state to the child.
+	 *
+	 * The child's save pointer must be the 16-byte aligned
+	 * address inside the child's own fxsave buffer. Deriving
+	 * it from up->fxsave would leave the child pointing at
+	 * the save area of whichever process is running this code,
+	 * so the copy below would not reach the child and the two
+	 * processes would share one area from then on.
+	 */
+	child->fpustate = parent->fpustate;
+	child->fpusave = (void*)((PTR2UINT(child->fxsave) + 15) & ~15);
+
+	/* nothing has been saved yet if the FPU was never used */
+	if(child->fpustate == Init)
+		return;
+
+	memmove(child->fpusave, parent->fpusave, sizeof(Fxsave));
+}
+
+void
+fpuprocsave(Proc* p)
+{
+	/*
+	 * Called from sched() and sleep() via the machine-dependent
+	 * procsave() routine.
+	 * About to go in to the scheduler.
+	 * If the process wasn't using the FPU
+	 * there's nothing to do.
+	 */
+	if(p->fpustate != Busy)
+		return;
+
+	/*
+	 * The process is dead so clear and disable the FPU
+	 * and set the state for whoever gets this proc struct
+	 * next.
+	 */
+	if(p->state == Moribund){
+		_clts();
+		_fnclex();
+		_stts();
+		p->fpustate = Init;
+		return;
+	}
+
+	/*
+	 * Save the FPU state without handling pending
+	 * unmasked exceptions and disable. Postnote() can't
+	 * be called here as sleep() already has up->rlock,
+	 * so the handling of pending exceptions is delayed
+	 * until the process runs again and generates a
+	 * Device Not Available exception fault to activate
+	 * the FPU.
+	 */
+	_fxsave(p->fpusave);
+	_stts();
+	/* Idle: fpunm() checks the saved status before restoring */
+	p->fpustate = Idle;
+}
+
+void
+fpuprocrestore(Proc* p)
+{
+	/*
+	 * The process has been rescheduled and is about to run.
+	 * Nothing to do here right now. If the process tries to use
+	 * the FPU again it will cause a Device Not Available
+	 * exception and the state will then be restored.
+	 */
+	USED(p);	/* the argument is deliberately unused */
+}
+
+void
+fpusysprocsetup(Proc* p)
+{
+	/*
+	 * Reached from sysexec() by way of sysprocsetup() to give
+	 * the new program image a fresh FPU: unless the state is
+	 * already Init, clear any pending exceptions, disable the
+	 * FPU and mark it as never used.
+	 */
+	if(p->fpustate == Init)
+		return;
+
+	_clts();
+	_fnclex();
+	_stts();
+	p->fpustate = Init;
+}
+
+/*
+ * Turn the pending x87 exception recorded in up->fpusave into
+ * a "sys: fp: ..." note and post it to the current process.
+ * Only unmasked exceptions (status bits not set in the saved
+ * control word) are considered; the first match in the order
+ * I, D, Z, O, U, P names the note.
+ */
+static void
+fpupostnote(void)
+{
+	ushort fsw;
+	Fxsave *fpusave;
+	char *m, n[ERRMAX];
+
+	/*
+	 * The Sff bit is sticky, meaning it should be explicitly
+	 * cleared or there's no way to tell if the exception was an
+	 * invalid operation or a stack fault.
+	 */
+	fpusave = up->fpusave;
+	fsw = (fpusave->fsw & ~fpusave->fcw) & (Sff|P|U|O|Z|D|I);
+	if(fsw & I){
+		if(fsw & Sff){
+			/*
+			 * C1 tells overflow from underflow, but it is
+			 * not one of the bits kept in fsw above, so it
+			 * has to be read from the saved status word.
+			 */
+			if(fpusave->fsw & C1)
+				m = "Stack Overflow";
+			else
+				m = "Stack Underflow";
+		}
+		else
+			m = "Invalid Operation";
+	}
+	else if(fsw & D)
+		m = "Denormal Operand";
+	else if(fsw & Z)
+		m = "Divide-By-Zero";
+	else if(fsw & O)
+		m = "Numeric Overflow";
+	else if(fsw & U)
+		m = "Numeric Underflow";
+	else if(fsw & P)
+		m = "Precision";
+	else
+		m = "Unknown";
+
+	snprint(n, sizeof(n), "sys: fp: %s Exception ipo=%#llux fsw=%#ux",
+		m, fpusave->rip, fsw);
+	postnote(up, 1, n, NDebug);
+}
+
+void
+fpuxf(Ureg* ureg, void*)
+{
+	u32int mxcsr;
+	Fxsave *fpusave;
+	char *m, n[ERRMAX];
+
+	/*
+	 * #XF - SIMD Floating Point Exception (Vector 18).
+	 */
+
+	/*
+	 * Save FPU state to check out the error.
+	 */
+	fpusave = up->fpusave;
+	_fxsave(fpusave);
+	_stts();
+	up->fpustate = Idle;
+
+	/* name this trap in the panic, not #MF */
+	if(ureg->ip & KZERO)
+		panic("#XF: ip=%#p", ureg->ip);
+
+	/*
+	 * Notify the user process.
+	 * The path here is similar to the x87 path described
+	 * in fpumf but without the fpupostnote() call.
+	 * An exception counts when its flag is set in MXCSR
+	 * and the corresponding mask bit is clear; the first
+	 * match in the order I, D, Z, O, U, P names the note.
+	 */
+	mxcsr = fpusave->mxcsr;
+	if((mxcsr & (Im|I)) == I)
+		m = "Invalid Operation";
+	else if((mxcsr & (Dm|D)) == D)
+		m = "Denormal Operand";
+	else if((mxcsr & (Zm|Z)) == Z)
+		m = "Divide-By-Zero";
+	else if((mxcsr & (Om|O)) == O)
+		m = "Numeric Overflow";
+	else if((mxcsr & (Um|U)) == U)
+		m = "Numeric Underflow";
+	else if((mxcsr & (Pm|P)) == P)
+		m = "Precision";
+	else
+		m = "Unknown";
+
+	snprint(n, sizeof(n), "sys: fp: %s Exception mxcsr=%#ux", m, mxcsr);
+	postnote(up, 1, n, NDebug);
+}
+
+void
+fpumf(Ureg* ureg, void*)
+{
+	Fxsave *fpusave;
+
+	/*
+	 * #MF - x87 Floating Point Exception Pending (Vector 16).
+	 */
+
+	/*
+	 * Save FPU state to check out the error.
+	 */
+	fpusave = up->fpusave;
+	_fxsave(fpusave);
+	_stts();
+	up->fpustate = Idle;
+
+	/*
+	 * NOTE(review): presumably this catches a fault raised
+	 * from a kernel address; KZERO is defined elsewhere.
+	 */
+	if(ureg->ip & KZERO)
+		panic("#MF: ip=%#p rip=%#p", ureg->ip, fpusave->rip);
+
+	/*
+	 * Notify the user process.
+	 * The path here is
+	 *	call trap->fpumf->fpupostnote->postnote
+	 *	return ->fpupostnote->fpumf->trap
+	 *	call notify->fpunotify
+	 *	return ->notify
+	 * then either
+	 *	call pexit
+	 * or
+	 *	return ->trap
+	 *	return ->user note handler
+	 */
+	fpupostnote();
+}
+
+void
+fpunm(Ureg* ureg, void*)
+{
+	Fxsave *fpusave;
+
+	/*
+	 * #NM - Device Not Available (Vector 7).
+	 */
+	if(up == nil)
+		panic("#NM: fpu in kernel: ip %#p\n", ureg->ip);
+
+	/*
+	 * Someone tried to use the FPU in a note handler.
+	 * That's a no-no.
+	 */
+	if(up->fpustate & Hold){
+		postnote(up, 1, "sys: floating point in note handler", NDebug);
+		return;
+	}
+	if(ureg->ip & KZERO)
+		panic("#NM: proc %d %s state %d ip %#p\n",
+			up->pid, up->text, up->fpustate, ureg->ip);
+
+	switch(up->fpustate){
+	case Busy:
+	default:
+		/* Busy (or anything unexpected) should never trap here */
+		panic("#NM: state %d ip %#p\n", up->fpustate, ureg->ip);
+		break;
+	case Init:
+		/*
+		 * A process tries to use the FPU for the
+		 * first time and generates a 'device not available'
+		 * exception.
+		 * Turn the FPU on and initialise it for use.
+		 * Set the precision and mask the exceptions
+		 * we don't care about from the generic Mach value.
+		 */
+		_clts();
+		_fninit();
+		_fwait();
+		_fldcw(m->fcw);
+		_ldmxcsr(m->mxcsr);
+		/* 16-byte aligned address inside the process's fxsave buffer */
+		up->fpusave = (void*)((PTR2UINT(up->fxsave) + 15) & ~15);
+		up->fpustate = Busy;
+		break;
+	case Idle:
+		/*
+		 * Before restoring the state, check for any pending
+		 * exceptions, there's no way to restore the state without
+		 * generating an unmasked exception.
+		 */
+		fpusave = up->fpusave;
+		if((fpusave->fsw & ~fpusave->fcw) & (Sff|P|U|O|Z|D|I)){
+			/* post the note; the state stays Idle, nothing is restored */
+			fpupostnote();
+			break;
+		}
+
+		/*
+		 * Sff is sticky.
+		 * NOTE(review): Sff is defined as a status word bit,
+		 * but the line below clears that bit position in the
+		 * saved control word (fcw), not in fsw - confirm intent.
+		 */
+		fpusave->fcw &= ~Sff;
+		_clts();
+		_fxrstor(fpusave);
+		up->fpustate = Busy;
+		break;
+	}
+}
+
+/*
+ * Per-processor FPU/SSE initialisation.
+ * Adjusts CR0 and CR4, records the default x87 control word and
+ * MXCSR value in m, then sets TS again (_stts) before returning.
+ * Only machno 0 goes on to install the trap handlers.
+ */
+void
+fpuinit(void)
+{
+	u64int r;
+	Fxsave *fxsave;
+	uchar buf[sizeof(Fxsave)+15];	/* +15 leaves room to round up to 16 bytes */
+
+	/*
+	 * It's assumed there is an integrated FPU, so Em is cleared;
+	 */
+	r = cr0get();
+	r &= ~(Ts|Em);
+	r |= Ne|Mp;
+	cr0put(r);
+
+	r = cr4get();
+	r |= Osxmmexcpt|Osfxsr;
+	cr4put(r);
+
+	/*
+	 * Take one FXSAVE image into a 16-byte aligned scratch area
+	 * on the stack, only to read the MXCSR mask out of it.
+	 */
+	_fninit();
+	fxsave = (Fxsave*)((PTR2UINT(buf) + 15) & ~15);
+	memset(fxsave, 0, sizeof(Fxsave));
+	_fxsave(fxsave);
+	m->fcw = RCn|PCd|P|U|D;
+	/*
+	 * A zero mask in the image means none was reported;
+	 * fall back to the fixed value 0x0000FFBF
+	 * (NOTE(review): the architectural default - confirm
+	 * against the processor manual).
+	 */
+	if(fxsave->mxcsrmask == 0)
+		m->mxcsrmask = 0x0000FFBF;
+	else
+		m->mxcsrmask = fxsave->mxcsrmask;
+	m->mxcsr = (Rn|Pm|Um|Dm) & m->mxcsrmask;
+	_stts();
+
+	/* the handlers below are global; install them once */
+	if(m->machno != 0)
+		return;
+
+	/*
+	 * Set up the exception handlers.
+	 */
+	trapenable(IdtNM, fpunm, 0, "#NM");
+	trapenable(IdtMF, fpumf, 0, "#MF");
+	trapenable(IdtXF, fpuxf, 0, "#XF");
+}

+ 176 - 0
sys/src/9/k10/i8254.c

@@ -0,0 +1,176 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * 8254 Programmable Interval Timer and compatibles.
+ */
+/*
+ * Timer1/Timer2 are base I/O ports; the Counter and Control
+ * values are offsets added to a base (e.g. Timer1+Control).
+ */
+enum {					/* I/O ports */
+	Timer1		= 0x40,
+	Timer2		= 0x48,		/* Counter0 is watchdog (EISA) */
+
+	Counter0	= 0,		/* Counter 0 Access Port */
+	Counter1	= 1,		/* Counter 1 Access Port */
+	Counter2	= 2,		/* Counter 2 Access Port */
+	Control		= 3,		/* Timer Control Word */
+};
+
+/* fields OR'ed together to form a byte for the Control port */
+enum {					/* Control */
+	Bcd		= 0x01,		/* Binary/BCD countdown select */
+
+	Mode0		= 0x00,		/* [3:1] interrupt on terminal count */
+	Mode1		= 0x02,		/* hardware re-triggerable one-shot */
+	Mode2		= 0x04,		/* rate generator */
+	Mode3		= 0x06,		/* square-wave generator */
+	Mode4		= 0x08,		/* software triggered strobe */
+	Mode5		= 0x0A,		/* hardware triggered strobe */
+
+	Clc		= 0x00,		/* [5:4] Counter Latch Command */
+	RWlsb		= 0x10,		/* R/W LSB */
+	RWmsb		= 0x20,		/* R/W MSB */
+	RW16		= 0x30,		/* R/W LSB then MSB */
+	Cs0		= 0x00,		/* [7:6] Counter 0 Select */
+	Cs1		= 0x40,		/* Counter 1 Select */
+	Cs2		= 0x80,		/* Counter 2 Select */
+
+	Rbc		= 0xC0,		/* Read-Back Command */
+	RbCnt0		= 0x02,		/* Select Counter 0 */
+	RbCnt1		= 0x04,		/* Select Counter 1 */
+	RbCnt2		= 0x08,		/* Select Counter 2 */
+	RbS		= 0x20,		/* Read-Back Status */
+	RbC		= 0x10,		/* Read-Back Count */
+	RbCS		= 0x00,		/* Read-Back Count and Status */
+
+	RbNULL		= 0x40,		/* NULL-Count Flag */
+	RbOUT		= 0x80,		/* OUT-pin */
+};
+
+/*
+ * Osc is the counter input frequency; Hz is the rate
+ * i8254hz() programs, chosen so that Osc/Hz is exact.
+ */
+enum {
+	Osc		= 1193182,	/* 14.318180MHz/12 */
+	Hz		= 82,		/* 2*41*14551 = 1193182 */
+};
+
+/*
+ * Program counter 0 of the timer at base port `port' to run
+ * at hz, then poll (bounded) until the new count is in use.
+ */
+static void
+i8254set(int port, int hz)
+{
+	int counter, timeo;
+
+	/*
+	 * Initialise Counter0 to be the system clock if necessary,
+	 * it's normally connected to IRQ0 on an interrupt controller.
+	 * Use a periodic square wave (Mode3).
+	 */
+	counter = Osc/hz;
+	outb(port+Control, Cs0|RW16|Mode3);
+	/* RW16: low byte first, then high byte */
+	outb(port+Counter0, counter);
+	outb(port+Counter0, counter>>8);
+
+	/*
+	 * Wait until the counting register has been loaded
+	 * into the counting element.
+	 * Gives up silently after 100000 polls.
+	 */
+	for(timeo = 0; timeo < 100000; timeo++){
+		outb(port+Control, Rbc|RbS|RbCnt0);
+		if(!(inb(port+Counter0) & RbNULL))
+			break;
+	}
+}
+
+/*
+ * Estimate the processor clock frequency by timing a delay
+ * loop against counter 0 of the 8254 and the timestamp counter.
+ * info holds two sets of four cpuid words; info[0][1..3] is
+ * compared with the vendor string and info[1][0] supplies the
+ * family bits.
+ * Returns the estimate, or 0 if the vendor/family is not one
+ * of those listed below or no timer progress could be measured.
+ */
+vlong
+i8254hz(u32int info[2][4])
+{
+	u32int ax;
+	u64int a, b;
+	int aamcycles, incr, loops, x, y;
+
+	/*
+	 * Use the cpuid family info to get the
+	 * cycles for the AAM instruction.
+	 * Beware: this can be called VERY early before
+	 * some of the other device state is set.
+	 */
+	ax = info[1][0] & 0x00000f00;
+	if(memcmp(&info[0][1], "GenuntelineI", 12) == 0){
+		switch(ax){
+		default:
+			return 0;
+		case 0x00000600:
+		case 0x00000f00:
+			aamcycles = 16;
+			break;
+		}
+	}
+	else if(memcmp(&info[0][1], "AuthcAMDenti", 12) == 0){
+		switch(ax){
+		default:
+			return 0;
+		case 0x00000600:
+		case 0x00000f00:
+			aamcycles = 11;
+			break;
+		}
+	}
+	else
+		return 0;
+
+	i8254set(Timer1, Hz);
+
+	/*
+	 * Find biggest loop that doesn't wrap.
+	 */
+	SET(a, b);
+	incr = 16000000/(aamcycles*Hz*2);
+	x = 2000;
+	for(loops = incr; loops < 64*1024; loops += incr) {
+		/*
+		 * Measure time for the loop
+		 *
+		 *		MOVL	loops,CX
+		 *	aaml1:
+		 *		AAM
+		 *		LOOP	aaml1
+		 *
+		 * The time for the loop should be independent of external
+		 * cache and memory system since it fits in the execution
+		 * prefetch buffer.
+		 * The AAM instruction is not available in 64-bit mode.
+		 */
+		outb(Timer1+Control, Cs0|Clc);
+
+		a = rdtsc();
+		x = inb(Timer1+Counter0);
+		x |= inb(Timer1+Counter0)<<8;
+		aamloop(loops);
+		outb(Timer1+Control, Cs0|Clc);
+		b = rdtsc();
+
+		y = inb(Timer1+Counter0);
+		y |= inb(Timer1+Counter0)<<8;
+		x -= y;
+
+		/* the counter reloaded during the loop */
+		if(x < 0)
+			x += Osc/Hz;
+
+		/* long enough: more than a third of a timer period */
+		if(x > Osc/(3*Hz))
+			break;
+	}
+
+	/*
+	 * If the two latched counts were equal the counter made
+	 * no measurable progress; report "unknown" like the
+	 * unrecognised-processor paths above rather than dividing
+	 * by zero below.
+	 */
+	if(x <= 0)
+		return 0;
+
+	/*
+	 * Figure out clock frequency.
+	 */
+	b = (b-a)<<1;
+	b *= Osc;
+
+	return b/x;
+}

+ 238 - 0
sys/src/9/k10/i8259.c

@@ -0,0 +1,238 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "io.h"
+
+/*
+ * 8259 Interrupt Controller and compatibles.
+ */
+/*
+ * Cntrl1/Cntrl2 are base I/O ports; the Icw/Ocw values are
+ * offsets added to a base. Elcr1/Elcr2 are absolute ports.
+ */
+enum {					/* I/O ports */
+	Cntrl1		= 0x20,
+	Cntrl2		= 0xa0,
+
+	Icw1		= 0,		/* Initialisation Command Word 1 */
+	Icw2		= 1,
+	Icw3		= 1,
+	Icw4		= 1,
+
+	Ocw1		= 1,		/* Operational Control Word 1 */
+	Ocw2		= 0,
+	Ocw3		= 0,
+
+	Imr		= Ocw1,		/* Interrupt Mask Register */
+	Isr		= Ocw3,		/* In-Service Register */
+	Irr		= Ocw3,		/* Interrupt Request Register */
+
+	Elcr1		= 0x4d0,	/* Edge/Level Control Register */
+	Elcr2		= 0x4d1,
+};
+
+enum {					/* Icw1 */
+	Ic4		= 0x01,		/* there will be an Icw4 */
+	Icw1sel		= 0x10,		/* Icw/Ocw select */
+};
+
+enum {					/* Icw3 */
+	Cascaded	= 0x04,		/* Cntrl1 - Cascaded Mode Enable */
+	SlaveIRQ2	= 0x02,		/* Cntrl2 - Slave Identification Code */
+};
+
+enum {					/* Icw4 */
+	Microprocessor	= 0x01,		/* 80x86-based system */
+};
+
+enum {					/* Ocw2 */
+	Ocw2sel		= 0x00,		/* Ocw2 select */
+	Eoi		= 0x20,		/* Non-specific EOI command */
+};
+
+enum {					/* Ocw3 */
+	Irrread		= 0x02,		/* Read IRQ register */
+	Isrread		= 0x03,		/* Read IS register */
+	Ocw3sel		= 0x08,		/* Ocw3 select */
+};
+
+/*
+ * Shadow state for both controllers, guarded by i8259lock:
+ * bits 0-7 belong to Cntrl1 and bits 8-15 to Cntrl2.
+ */
+static Lock i8259lock;
+static int i8259mask = ~0;		/* mask of disabled interrupts */
+static int i8259elcr;			/* mask of level interrupts */
+
+/*
+ * Initialise both 8259s so that IRQ0-15 are delivered on
+ * vectors vectorbase..vectorbase+15, with everything masked
+ * except the cascade input, and probe for the Edge/Level
+ * Control register.
+ * Returns vectorbase rounded down to a multiple of 8.
+ */
+int
+i8259init(int vectorbase)
+{
+	int elcr;
+
+	vectorbase &= ~0x07;
+
+	ilock(&i8259lock);
+
+	/*
+	 * Boilerplate to initialise the pair of 8259 controllers,
+	 * see one of the Intel bridge datasheets for details,
+	 * e.g. 82371AB (PIIX4). The default settings are 80x86 mode,
+	 * edge-sensitive detection, normal EOI, non-buffered and
+	 * cascade mode. Cntrl1 is connected as the master and Cntrl2
+	 * as the slave; IRQ2 is used to cascade the two controllers.
+	 */
+	outb(Cntrl1+Icw1, Icw1sel|Ic4);
+	outb(Cntrl1+Icw2, vectorbase);
+	outb(Cntrl1+Icw3, Cascaded);
+	outb(Cntrl1+Icw4, Microprocessor);
+
+	outb(Cntrl2+Icw1, Icw1sel|Ic4);
+	outb(Cntrl2+Icw2, vectorbase+8);
+	outb(Cntrl2+Icw3, SlaveIRQ2);
+	outb(Cntrl2+Icw4, Microprocessor);
+
+	/*
+	 * Set the interrupt masks, allowing interrupts
+	 * to pass from Cntrl2 to Cntrl1 on IRQ2.
+	 */
+	i8259mask &= ~(1<<2);
+	outb(Cntrl2+Imr, (i8259mask>>8) & 0xff);
+	outb(Cntrl1+Imr, i8259mask & 0xff);
+
+	outb(Cntrl1+Ocw2, Ocw2sel|Eoi);
+	outb(Cntrl2+Ocw2, Ocw2sel|Eoi);
+
+	/*
+	 * Set Ocw3 to return the ISR when read for i8259isr()
+	 * (after initialisation status read is set to return the IRR).
+	 * Read IRR first to possibly deassert an outstanding
+	 * interrupt.
+	 */
+	inb(Cntrl1+Irr);
+	outb(Cntrl1+Ocw3, Ocw3sel|Isrread);
+	inb(Cntrl2+Irr);
+	outb(Cntrl2+Ocw3, Ocw3sel|Isrread);
+
+	/*
+	 * Check for Edge/Level Control register.
+	 * This check may not work for all chipsets.
+	 * First try a non-intrusive test - the bits for
+	 * IRQs 13, 8, 2, 1 and 0 must be edge (0). If
+	 * that's OK try a R/W test.
+	 * (0x2107 is exactly those five bits.)
+	 */
+	elcr = (inb(Elcr2)<<8)|inb(Elcr1);
+	if(!(elcr & 0x2107)){
+		outb(Elcr1, 0);
+		if(inb(Elcr1) == 0){
+			outb(Elcr1, 0x20);
+			if(inb(Elcr1) == 0x20)
+				i8259elcr = elcr;
+			/* put back the value found before the R/W test */
+			outb(Elcr1, elcr & 0xff);
+		}
+	}
+	iunlock(&i8259lock);
+
+	return vectorbase;
+}
+
+int
+i8259isr(int vno)
+{
+	int irq, inservice;
+
+	/* only the sixteen vectors starting at IdtPIC are ours */
+	if(vno >= IdtPIC && vno <= IdtPIC+15)
+		irq = vno-IdtPIC;
+	else
+		return 0;
+
+	/*
+	 * Read the in-service bits and acknowledge the interrupt,
+	 * master first and then, for the upper eight IRQs, the
+	 * slave as well. The result is non-zero if the interrupt
+	 * acknowledged was the one asked about (this could be
+	 * better but it's not really used).
+	 */
+	ilock(&i8259lock);
+	inservice = inb(Cntrl1+Isr);
+	outb(Cntrl1+Ocw2, Ocw2sel|Eoi);
+	if(irq >= 8){
+		inservice = inservice | (inb(Cntrl2+Isr)<<8);
+		outb(Cntrl2+Ocw2, Ocw2sel|Eoi);
+	}
+	iunlock(&i8259lock);
+
+	return inservice & (1<<irq);
+}
+
+#ifdef notdef
+
+int
+i8259irqenable(Vctl* v)
+{
+	int irq, irqbit;
+
+	/*
+	 * Given an IRQ, enable the corresponding interrupt in the i8259
+	 * and return the vector to be used. The i8259 is set to use a fixed
+	 * range of vectors starting at IdtPIC.
+	 * Returns -1 if the IRQ is out of range or is already
+	 * enabled but not level-triggered.
+	 */
+	irq = v->irq;
+	if(irq < 0 || irq > 15){
+		print("i8259enable: irq %d out of range\n", irq);
+		return -1;
+	}
+	irqbit = 1<<irq;
+
+	ilock(&i8259lock);
+	/* mask bit already clear: a second user; allowed only if level */
+	if(!(i8259mask & irqbit) && !(i8259elcr & irqbit)){
+		print("i8259enable: irq %d shared but not level\n", irq);
+		iunlock(&i8259lock);
+		return -1;
+	}
+	i8259mask &= ~irqbit;
+	if(irq < 8)
+		outb(Cntrl1+Imr, i8259mask & 0xff);
+	else
+		outb(Cntrl2+Imr, (i8259mask>>8) & 0xff);
+
+	/* level interrupts get i8259isr as eoi, edge ones as isr */
+	if(i8259elcr & irqbit)
+		v->eoi = i8259isr;
+	else
+		v->isr = i8259isr;
+	iunlock(&i8259lock);
+
+	v->type = "8259";
+	return IdtPIC+irq;
+}
+
+int
+i8259irqdisable(int irq)
+{
+	int bit;
+
+	/*
+	 * Mask the given IRQ in whichever 8259 owns it.
+	 * Returns 0, or -1 if irq is not in the range 0-15.
+	 * An IRQ that is already masked is left alone.
+	 */
+	if(irq < 0 || irq > 15){
+		print("i8259disable: irq %d out of range\n", irq);
+		return -1;
+	}
+	bit = 1<<irq;
+
+	ilock(&i8259lock);
+	if((i8259mask & bit) == 0){
+		i8259mask = i8259mask | bit;
+		if(irq >= 8)
+			outb(Cntrl2+Imr, (i8259mask>>8) & 0xff);
+		else
+			outb(Cntrl1+Imr, i8259mask & 0xff);
+	}
+	iunlock(&i8259lock);
+
+	return 0;
+}
+#endif /* notdef */

+ 16 - 0
sys/src/9/k10/init9.c

@@ -0,0 +1,16 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+extern void startboot(char*, char**);
+
+/*
+ * Hand control to startboot(), passing the first argument and
+ * the address of that argument.
+ * NOTE(review): presumably startboot uses the address to reach
+ * the rest of the arguments - confirm against its definition.
+ */
+void
+main(char* argv0)
+{
+	startboot(argv0, &argv0);
+}

+ 275 - 0
sys/src/9/k10/io.h

@@ -0,0 +1,275 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Vector numbers and IRQ numbers.
+ * IrqCLOCK..IrqATA1 are i8259 IRQ numbers (0-15); the Irq names
+ * defined from VectorLAPIC are vector numbers, not IRQs.
+ */
+enum {
+	VectorNMI	= 2,		/* non-maskable interrupt */
+	VectorBPT	= 3,		/* breakpoint */
+	VectorUD	= 6,		/* invalid opcode exception */
+	VectorCNA	= 7,		/* coprocessor not available */
+	Vector2F	= 8,		/* double fault */
+	VectorCSO	= 9,		/* coprocessor segment overrun */
+	VectorPF	= 14,		/* page fault */
+	Vector15	= 15,		/* reserved */
+	VectorCERR	= 16,		/* coprocessor error */
+
+	VectorPIC	= 32,		/* external i8259 interrupts */
+	IrqCLOCK	= 0,
+	IrqKBD		= 1,
+	IrqUART1	= 3,
+	IrqUART0	= 4,
+	IrqPCMCIA	= 5,
+	IrqFLOPPY	= 6,
+	IrqLPT		= 7,
+	IrqIRQ7		= 7,
+	IrqAUX		= 12,		/* PS/2 port */
+	IrqIRQ13	= 13,		/* coprocessor on 386 */
+	IrqATA0		= 14,
+	IrqATA1		= 15,
+	MaxIrqPIC	= 15,
+
+	VectorLAPIC	= VectorPIC+16,	/* local APIC interrupts */
+	IrqLINT0	= VectorLAPIC+0,
+	IrqLINT1	= VectorLAPIC+1,
+	IrqTIMER	= VectorLAPIC+2,
+	IrqERROR	= VectorLAPIC+3,
+	IrqPCINT	= VectorLAPIC+4,
+	IrqSPURIOUS	= VectorLAPIC+15,
+	MaxIrqLAPIC	= VectorLAPIC+15,
+
+	VectorSYSCALL	= 64,
+
+	VectorAPIC	= 65,		/* external APIC interrupts */
+	MaxVectorAPIC	= 255,
+};
+
+/*
+ * Vector assignments under the Idt names.
+ * IdtPIC, IdtLINT0..IdtPCINT, IdtSPURIOUS, IdtSYSCALL, IdtIOAPIC
+ * and IdtMAX have the same values as their Vector/Irq
+ * counterparts in the enum above; IdtIPI has no counterpart there.
+ */
+enum {
+	IdtPIC		= 32,			/* external i8259 interrupts */
+
+	IdtLINT0	= 48,			/* local APIC interrupts */
+	IdtLINT1	= 49,
+	IdtTIMER	= 50,
+	IdtERROR	= 51,
+	IdtPCINT	= 52,
+
+	IdtIPI		= 62,
+	IdtSPURIOUS	= 63,
+
+	IdtSYSCALL	= 64,
+
+	IdtIOAPIC	= 65,			/* external APIC interrupts */
+
+	IdtMAX		= 255,
+};
+
+/*
+ * Identifies an interrupt source; embedded in Vctl and passed
+ * to the Vctl mask function.
+ */
+typedef struct Vkey {
+	int	tbdf;			/* pci: ioapic or msi sources */
+	int	irq;			/* 8259-emulating sources */
+} Vkey;
+
+/*
+ * One registered handler for a vector; handlers sharing a
+ * vector are chained through next.
+ */
+typedef struct Vctl {
+	Vctl*	next;			/* handlers on this vector */
+
+	int	isintr;			/* interrupt or fault/trap */
+
+	/* unnamed member: its fields are used directly, e.g. v->irq */
+	Vkey;				/* source-specific key; tbdf for pci */
+	void	(*f)(Ureg*, void*);	/* handler to call */
+	void*	a;			/* argument to call it with */
+	char	name[KNAMELEN];		/* of driver */
+	char	*type;			/* controller name, e.g. "8259" */
+
+	int	(*isr)(int);		/* get isr bit for this irq */
+	int	(*eoi)(int);		/* eoi */
+	int	(*mask)(Vkey*, int);	/* interrupt enable returns masked vector */
+	int	vno;			/* vector number */
+} Vctl;
+
+/*
+ * Handler record for the AC variant of a trap; unlike Vctl.f
+ * the handler returns a string (see acfpunm in fpu.c, which
+ * passes back the note text or nil).
+ */
+typedef struct ACVctl {
+	char*	(*f)(Ureg*,void*);	/* handler to call */
+	void*	a;			/* argument to call it with */
+	int	vno;			/* vector number */
+	char	name[KNAMELEN];		/* of driver */
+} ACVctl;
+
+/*
+ * Bus types. MKBUS() places one of these in the top byte
+ * of a tbdf and BUSTYPE() extracts it again.
+ */
+enum {
+	BusCBUS		= 0,		/* Corollary CBUS */
+	BusCBUSII,			/* Corollary CBUS II */
+	BusEISA,			/* Extended ISA */
+	BusFUTURE,			/* IEEE Futurebus */
+	BusINTERN,			/* Internal bus */
+	BusISA,				/* Industry Standard Architecture */
+	BusMBI,				/* Multibus I */
+	BusMBII,			/* Multibus II */
+	BusMCA,				/* Micro Channel Architecture */
+	BusMPI,				/* MPI */
+	BusMPSA,			/* MPSA */
+	BusNUBUS,			/* Apple Macintosh NuBus */
+	BusPCI,				/* Peripheral Component Interconnect */
+	BusPCMCIA,			/* PC Memory Card International Association */
+	BusTC,				/* DEC TurboChannel */
+	BusVL,				/* VESA Local bus */
+	BusVME,				/* VMEbus */
+	BusXPRESS,			/* Express System Bus */
+};
+
+/*
+ * A tbdf packs type, bus, device and function into an int:
+ *	type	bits 24 and up
+ *	bus	bits 16-23
+ *	device	bits 11-15
+ *	function	bits 8-10
+ * BUSBDF() keeps bits 8-23 in place and clears the rest.
+ */
+#define MKBUS(t,b,d,f)	(((t)<<24)|(((b)&0xFF)<<16)|(((d)&0x1F)<<11)|(((f)&0x07)<<8))
+#define BUSFNO(tbdf)	(((tbdf)>>8)&0x07)
+#define BUSDNO(tbdf)	(((tbdf)>>11)&0x1F)
+#define BUSBNO(tbdf)	(((tbdf)>>16)&0xFF)
+#define BUSTYPE(tbdf)	((tbdf)>>24)
+#define BUSBDF(tbdf)	((tbdf)&0x00FFFF00)
+#define BUSUNKNOWN	(-1)
+
+/* NOTE(review): presumably EISA slot count and config port base - confirm */
+enum {
+	MaxEISA		= 16,
+	CfgEISA		= 0xC80,
+};
+
+/*
+ * PCI support code.
+ */
+/* offsets into PCI configuration space */
+enum {					/* type 0 and type 1 pre-defined header */
+	PciVID		= 0x00,		/* vendor ID */
+	PciDID		= 0x02,		/* device ID */
+	PciPCR		= 0x04,		/* command */
+	PciPSR		= 0x06,		/* status */
+	PciRID		= 0x08,		/* revision ID */
+	PciCCRp		= 0x09,		/* programming interface class code */
+	PciCCRu		= 0x0A,		/* sub-class code */
+	PciCCRb		= 0x0B,		/* base class code */
+	PciCLS		= 0x0C,		/* cache line size */
+	PciLTR		= 0x0D,		/* latency timer */
+	PciHDT		= 0x0E,		/* header type */
+	PciBST		= 0x0F,		/* BIST */
+
+	PciBAR0		= 0x10,		/* base address */
+	PciBAR1		= 0x14,
+
+	PciCP		= 0x34,		/* capabilities pointer */
+
+	PciINTL		= 0x3C,		/* interrupt line */
+	PciINTP		= 0x3D,		/* interrupt pin */
+};
+
+enum {					/* type 0 pre-defined header */
+	PciCIS		= 0x28,		/* cardbus CIS pointer */
+	PciSVID		= 0x2C,		/* subsystem vendor ID */
+	PciSID		= 0x2E,		/* subsystem ID */
+	PciEBAR0	= 0x30,		/* expansion ROM base address */
+	PciMGNT		= 0x3E,		/* burst period length */
+	PciMLT		= 0x3F,		/* maximum latency between bursts */
+};
+
+/*
+ * NOTE(review): PciEBAR1 has the same value as PciPUBR; the
+ * type 1 header is usually documented with the expansion ROM
+ * base address at 0x38 - confirm before relying on PciEBAR1.
+ */
+enum {					/* type 1 pre-defined header */
+	PciPBN		= 0x18,		/* primary bus number */
+	PciSBN		= 0x19,		/* secondary bus number */
+	PciUBN		= 0x1A,		/* subordinate bus number */
+	PciSLTR		= 0x1B,		/* secondary latency timer */
+	PciIBR		= 0x1C,		/* I/O base */
+	PciILR		= 0x1D,		/* I/O limit */
+	PciSPSR		= 0x1E,		/* secondary status */
+	PciMBR		= 0x20,		/* memory base */
+	PciMLR		= 0x22,		/* memory limit */
+	PciPMBR		= 0x24,		/* prefetchable memory base */
+	PciPMLR		= 0x26,		/* prefetchable memory limit */
+	PciPUBR		= 0x28,		/* prefetchable base upper 32 bits */
+	PciPULR		= 0x2C,		/* prefetchable limit upper 32 bits */
+	PciIUBR		= 0x30,		/* I/O base upper 16 bits */
+	PciIULR		= 0x32,		/* I/O limit upper 16 bits */
+	PciEBAR1	= 0x28,		/* expansion ROM base address */
+	PciBCR		= 0x3E,		/* bridge control register */
+};
+
+enum {					/* type 2 pre-defined header */
+	PciCBExCA	= 0x10,
+	PciCBSPSR	= 0x16,
+	PciCBPBN	= 0x18,		/* primary bus number */
+	PciCBSBN	= 0x19,		/* secondary bus number */
+	PciCBUBN	= 0x1A,		/* subordinate bus number */
+	PciCBSLTR	= 0x1B,		/* secondary latency timer */
+	PciCBMBR0	= 0x1C,
+	PciCBMLR0	= 0x20,
+	PciCBMBR1	= 0x24,
+	PciCBMLR1	= 0x28,
+	PciCBIBR0	= 0x2C,		/* I/O base */
+	PciCBILR0	= 0x30,		/* I/O limit */
+	PciCBIBR1	= 0x34,		/* I/O base */
+	PciCBILR1	= 0x38,		/* I/O limit */
+	PciCBSVID	= 0x40,		/* subsystem vendor ID */
+	PciCBSID	= 0x42,		/* subsystem ID */
+	PciCBLMBAR	= 0x44,		/* legacy mode base address */
+};
+
+/* capabilities */
+enum {
+	PciCapPMG	= 0x01,		/* power management */
+	PciCapAGP	= 0x02,
+	PciCapVPD	= 0x03,		/* vital product data */
+	PciCapSID	= 0x04,		/* slot id */
+	PciCapMSI	= 0x05,
+	PciCapCHS	= 0x06,		/* compact pci hot swap */
+	PciCapPCIX	= 0x07,
+	PciCapHTC	= 0x08,		/* hypertransport irq conf */
+	PciCapVND	= 0x09,		/* vendor specific information */
+	PciCapPCIe	= 0x10,
+	PciCapMSIX	= 0x11,
+	PciCapSATA	= 0x12,
+	PciCapHSW	= 0x0c,		/* hot swap */
+};
+
+/*
+ * A device paired with a size and a BAR.
+ * NOTE(review): Pcidev is used here before the typedef below,
+ * so it must already be declared by an earlier header - confirm.
+ */
+typedef struct Pcisiz Pcisiz;
+struct Pcisiz
+{
+	Pcidev*	dev;
+	int	siz;
+	int	bar;
+};
+
+/*
+ * One PCI function. The short field names follow the
+ * configuration-space offset names (PciPCR, PciRID, ...).
+ */
+typedef struct Pcidev Pcidev;
+struct Pcidev
+{
+	int	tbdf;			/* type+bus+device+function */
+	ushort	vid;			/* vendor ID */
+	ushort	did;			/* device ID */
+
+	ushort	pcr;			/* command */
+
+	uchar	rid;			/* revision ID */
+	uchar	ccrp;			/* programming interface class code */
+	uchar	ccru;			/* sub-class code */
+	uchar	ccrb;			/* base class code */
+	uchar	cls;			/* cache line size */
+	uchar	ltr;			/* latency timer */
+
+	struct {
+		ulong	bar;		/* base address */
+		int	size;
+	} mem[6];
+
+	struct {
+		ulong	bar;
+		int	size;
+	} rom;
+	uchar	intl;			/* interrupt line */
+
+	Pcidev*	list;
+	Pcidev*	link;			/* next device on this bno */
+
+	Pcidev*	bridge;			/* down a bus */
+	struct {
+		ulong	bar;
+		int	size;
+	} ioa, mema;
+};
+
+/*
+ * Address of a kernel virtual address as used on the PCI/ISA
+ * side: PADDR(va) plus the window offset, which is 0 for both
+ * buses here.
+ */
+#define PCIWINDOW	0
+#define PCIWADDR(va)	(PADDR(va)+PCIWINDOW)
+#define ISAWINDOW	0
+#define ISAWADDR(va)	(PADDR(va)+ISAWINDOW)
+
+/* tells the compiler's format checker that %T takes an int */
+#pragma	varargck	type	"T"	int

+ 490 - 0
sys/src/9/k10/ioapic.c

@@ -0,0 +1,490 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "apic.h"
+#include "io.h"
+
+typedef struct Rbus Rbus;
+typedef struct Rdt Rdt;
+
+/*
+ * One (bus, device) -> redirection entry mapping.
+ * Entries for the same bus are chained from rdtbus[busno].
+ */
+struct Rbus {
+	Rbus	*next;			/* next mapping on the same bus */
+	int	devno;			/* key matched by ioapicintrenable */
+	Rdt	*rdt;			/* redirection entry this device uses */
+};
+
+/*
+ * Software copy of one IOAPIC redirection table entry.
+ * Allocated from rdtarray[] by ioapicintrinit and never freed.
+ */
+struct Rdt {
+	Apic	*apic;			/* IOAPIC that owns the entry */
+	int	intin;			/* entry index, as passed to rtblput */
+	u32int	lo;			/* low word; low byte holds the vector once assigned */
+
+	int	ref;				/* could map to multiple busses */
+	int	enabled;				/* times enabled */
+};
+
+/*
+ * The first group are offsets added to apic->addr to reach the
+ * memory-mapped registers; the second group are register numbers
+ * written to Ioregsel to choose what Iowin accesses.
+ * NOTE(review): the first group looks like indices in units of
+ * 32-bit words rather than byte offsets -- confirm against the
+ * declaration of Apic.addr.
+ */
+enum {						/* IOAPIC registers */
+	Ioregsel	= 0x00,			/* indirect register address */
+	Iowin		= 0x04,			/* indirect register data */
+	Ioipa		= 0x08,			/* IRQ Pin Assertion */
+	Ioeoi		= 0x10,			/* EOI */
+
+	Ioapicid	= 0x00,			/* Identification */
+	Ioapicver	= 0x01,			/* Version */
+	Ioapicarb	= 0x02,			/* Arbitration */
+	Ioabcfg		= 0x03,			/* Boot Configuration */
+	Ioredtbl	= 0x10,			/* Redirection Table */
+};
+
+static Rdt rdtarray[Nrdt];		/* pool of redirection entries */
+static int nrdtarray;			/* number of rdtarray slots in use */
+static int gsib;			/* running total of entries, handed out as each apic's gsib */
+static Rbus* rdtbus[Nbus];		/* per-bus list of device mappings */
+static Rdt* rdtvecno[IdtMAX+1];	/* vector number -> redirection entry */
+
+static Lock idtnolock;			/* protects idtno */
+static int idtno = IdtIOAPIC;		/* next vector nextvec will hand out */
+
+Apic	xioapic[Napic];			/* IOAPICs, indexed by id */
+
+/*
+ * Read redirection table entry sel of an IOAPIC through the
+ * indirect register pair: the register number goes into Ioregsel
+ * and the data is then read from Iowin.  Each entry occupies two
+ * consecutive registers; the odd (high) one is read first.
+ * The caller holds the apic lock.
+ */
+static void
+rtblget(Apic* apic, int sel, u32int* hi, u32int* lo)
+{
+	int reg;
+
+	reg = Ioredtbl + 2*sel;
+
+	apic->addr[Ioregsel] = reg+1;
+	*hi = apic->addr[Iowin];
+	apic->addr[Ioregsel] = reg;
+	*lo = apic->addr[Iowin];
+}
+
+/*
+ * Write redirection table entry sel of an IOAPIC through the
+ * indirect register pair.  The high word is written before the
+ * low word.  The caller holds the apic lock.
+ */
+static void
+rtblput(Apic* apic, int sel, u32int hi, u32int lo)
+{
+	int reg;
+
+	reg = Ioredtbl + 2*sel;
+
+	apic->addr[Ioregsel] = reg+1;
+	apic->addr[Iowin] = hi;
+	apic->addr[Ioregsel] = reg;
+	apic->addr[Iowin] = lo;
+}
+
+/*
+ * Find the redirection entry already allocated for input intin
+ * of the given IOAPIC.  Returns nil if there is none.
+ */
+Rdt*
+rdtlookup(Apic *apic, int intin)
+{
+	Rdt *r, *end;
+
+	end = rdtarray + nrdtarray;
+	for(r = rdtarray; r < end; r++)
+		if(r->apic == apic && r->intin == intin)
+			return r;
+	return nil;
+}
+
+/*
+ * Record that device devno on bus busno is wired to input intin
+ * of IOAPIC apicno, with lo as the initial low word of the
+ * redirection entry.
+ *
+ * Several (bus, device) pairs may share one input; they then share
+ * one Rdt, provided they agree on lo.  The request is silently
+ * ignored if an index is out of range, the IOAPIC is not useable,
+ * or a new Rdt is needed and rdtarray is full.
+ *
+ * Nothing is modified until every check, including the allocation
+ * of the Rbus, has succeeded.
+ */
+void
+ioapicintrinit(int busno, int apicno, int intin, int devno, u32int lo)
+{
+	Rbus *rbus;
+	Rdt *rdt;
+	Apic *apic;
+
+	if(busno < 0 || busno >= Nbus || apicno < 0 || apicno >= Napic)
+		return;
+	apic = &xioapic[apicno];
+	if(!apic->useable || intin < 0 || intin >= apic->nrdt)
+		return;
+
+	rdt = rdtlookup(apic, intin);
+	if(rdt == nil){
+		/* only a new entry needs a free slot */
+		if(nrdtarray >= Nrdt)
+			return;
+	}else if(lo != rdt->lo){
+		print("mutiple irq botch bus %d %d/%d/%d lo %d vs %d\n",
+			busno, apicno, intin, devno, lo, rdt->lo);
+		return;
+	}
+
+	rbus = malloc(sizeof *rbus);
+	if(rbus == nil){
+		print("ioapicintrinit: no memory for bus %d %d/%d/%d\n",
+			busno, apicno, intin, devno);
+		return;
+	}
+
+	if(rdt == nil){
+		rdt = &rdtarray[nrdtarray++];
+		rdt->apic = apic;
+		rdt->intin = intin;
+		rdt->lo = lo;
+	}else{
+		DBG("dup rdt %d %d %d %d %.8ux\n", busno, apicno, intin, devno, lo);
+	}
+	rdt->ref++;
+
+	/* push the mapping on the front of the bus's list */
+	rbus->rdt = rdt;
+	rbus->devno = devno;
+	rbus->next = rdtbus[busno];
+	rdtbus[busno] = rbus;
+}
+
+/*
+ * Bring up the IOAPIC with the given id whose registers live at
+ * physical address pa.  Does nothing if the id is too large, the
+ * IOAPIC was already initialised, or the registers cannot be mapped.
+ */
+void
+ioapicinit(int id, uintptr pa)
+{
+	Apic *apic;
+
+	if(id >= Napic)
+		return;
+
+	apic = &xioapic[id];
+	if(apic->useable)
+		return;
+	apic->addr = vmap(pa, 1024);
+	if(apic->addr == nil)
+		return;
+	/* good ID and mapped registers: the IOAPIC can be used */
+	apic->useable = 1;
+
+	lock(apic);
+
+	/* the version register gives the index of the last entry */
+	apic->addr[Ioregsel] = Ioapicver;
+	apic->nrdt = ((apic->addr[Iowin]>>16) & 0xff) + 1;
+
+	/* this IOAPIC's entries follow those of the ones seen so far */
+	apic->gsib = gsib;
+	gsib += apic->nrdt;
+
+	/*
+	 * According to the MultiProcessor Specification it is
+	 * the O/S that must set the APIC ID.
+	 */
+	apic->addr[Ioregsel] = Ioapicid;
+	apic->addr[Iowin] = id<<24;
+
+	unlock(apic);
+}
+
+/*
+ * Debugging aid: when DBGFLG is set, print every redirection
+ * entry of every useable IOAPIC as read back from the hardware,
+ * then the bus/device mappings recorded by ioapicintrinit.
+ */
+void
+ioapicdump(void)
+{
+	int apicno, busno, pin;
+	Rbus *rb;
+	Rdt *r;
+	Apic *apic;
+	u32int hi, lo;
+
+	if(!DBGFLG)
+		return;
+
+	/* hardware state */
+	for(apicno = 0; apicno < Napic; apicno++){
+		apic = xioapic + apicno;
+		if(apic->useable && apic->addr != 0){
+			print("ioapic %d addr %#p nrdt %d gsib %d\n",
+				apicno, apic->addr, apic->nrdt, apic->gsib);
+			for(pin = 0; pin < apic->nrdt; pin++){
+				lock(apic);
+				rtblget(apic, pin, &hi, &lo);
+				unlock(apic);
+				print(" rdt %2.2d %#8.8ux %#8.8ux\n", pin, hi, lo);
+			}
+		}
+	}
+
+	/* software tables */
+	for(busno = 0; busno < Nbus; busno++){
+		rb = rdtbus[busno];
+		if(rb == nil)
+			continue;
+		print("iointr bus %d:\n", busno);
+		while(rb != nil){
+			r = rb->rdt;
+			print(" apic %ld devno %#ux (%d %d) intin %d lo %#ux ref %d\n",
+				r->apic-xioapic, rb->devno, rb->devno>>2,
+				rb->devno & 0x03, r->intin, r->lo, r->ref);
+			rb = rb->next;
+		}
+	}
+}
+
+/*
+ * Write (hi 0, lo Im) to every redirection entry of every
+ * useable IOAPIC, then dump the result if debugging is enabled.
+ */
+void
+ioapiconline(void)
+{
+	int n, pin;
+	Apic *apic;
+
+	for(n = 0; n < Napic; n++){
+		apic = &xioapic[n];
+		if(apic->useable && apic->addr != nil){
+			for(pin = 0; pin < apic->nrdt; pin++){
+				lock(apic);
+				rtblput(apic, pin, 0, Im);
+				unlock(apic);
+			}
+		}
+	}
+	ioapicdump();
+}
+
+/*
+ * Interrupt routing policy used by ioapicintrdd:
+ * 1 selects round-robin, any other value sends every
+ * interrupt to the processor in sys->machptr[0].
+ */
+static int dfpolicy = 0;
+
+/*
+ * Choose a destination processor and store its APIC number,
+ * shifted into the top byte, in *hi; OR the bits Pm|MTf into *lo.
+ * *hi is always overwritten.  *lo is only ORed into, so the caller
+ * must have initialised it.
+ */
+static void
+ioapicintrdd(u32int* hi, u32int* lo)
+{
+	int i;
+	static int df;			/* next machptr index to try */
+	static Lock dflock;		/* protects df */
+
+	/*
+	 * Set delivery mode (lo) and destination field (hi),
+	 * according to interrupt routing policy.
+	 */
+	/*
+	 * The bulk of this code was written ~1995, when there was
+	 * one architecture and one generation of hardware, the number
+	 * of CPUs was up to 4(8) and the choices for interrupt routing
+	 * were physical, or flat logical (optionally with lowest
+	 * priority interrupt). Logical mode hasn't scaled well with
+	 * the increasing number of packages/cores/threads, so the
+	 * fall-back is to physical mode, which works across all processor
+	 * generations, both AMD and Intel, using the APIC and xAPIC.
+	 *
+	 * Interrupt routing policy can be set here.
+	 */
+	switch(dfpolicy){
+	default:				/* noise core 0 */
+		*hi = sys->machptr[0]->apicno<<24;
+		break;
+	case 1:					/* round-robin */
+		/*
+		 * Assign each interrupt to a different CPU on a round-robin
+		 * Some idea of the packages/cores/thread topology would be
+		 * useful here, e.g. to not assign interrupts to more than one
+		 * thread in a core. But, as usual, Intel make that an onerous
+		 * task.
+		 */
+		/*
+		 * The loop only exits once it finds an online mach that
+		 * passes the xlapic test, and it holds dflock throughout.
+		 * NOTE(review): the test requires xlapic[i].addr == 0,
+		 * whereas the IOAPIC code treats addr == 0 as unusable --
+		 * confirm that local APIC entries never set addr.
+		 */
+		lock(&dflock);
+		for(;;){
+			i = df++;
+			if(df >= sys->nmach+1)
+				df = 0;
+			if(sys->machptr[i] == nil || !sys->machptr[i]->online)
+				continue;
+			i = sys->machptr[i]->apicno;
+			if(xlapic[i].useable && xlapic[i].addr == 0)
+				break;
+		}
+		unlock(&dflock);
+	
+		*hi = i<<24;
+		break;
+	}
+	*lo |= Pm|MTf;
+}
+
+/*
+ * Hand out an interrupt vector number.  Successive calls step by 8,
+ * wrapping modulo IdtMAX; a wrapped value below IdtIOAPIC has
+ * IdtIOAPIC added to it.  Nothing records which vectors are in
+ * use, so numbers can repeat after a wrap.
+ */
+int
+nextvec(void)
+{
+	uint v;
+	int next;
+
+	lock(&idtnolock);
+	v = idtno;
+	next = (idtno+8) % IdtMAX;
+	if(next < IdtIOAPIC)
+		next += IdtIOAPIC;
+	idtno = next;
+	unlock(&idtnolock);
+
+	return v;
+}
+
+/*
+ * Vctl mask hook for MSI interrupts: look up the PCI device named
+ * by v->tbdf and pass mask on to pcimsimask.
+ * Returns -1 if no such device exists.
+ */
+static int
+msimask(Vkey *v, int mask)
+{
+	Pcidev *dev;
+
+	if((dev = pcimatchtbdf(v->tbdf)) == nil)
+		return -1;
+	return pcimsimask(dev, mask);
+}
+
+/*
+ * Try to deliver the interrupt of PCI device p as an MSI.
+ * Allocates a vector, builds the 64-bit value from the routing
+ * chosen by ioapicintrdd and hands it to pcimsienable.
+ * On success fills in v and returns the vector number; returns -1,
+ * leaving v untouched, if pcimsienable fails.  The vector is
+ * consumed either way.
+ */
+static int
+intrenablemsi(Vctl* v, Pcidev *p)
+{
+	uint vec, hiword, loword;
+	uvlong msg;
+
+	vec = nextvec();
+
+	loword = IPlow | TMedge | vec;
+	ioapicintrdd(&hiword, &loword);
+	if(loword & Lm)
+		loword |= MTlp;
+
+	msg = (uvlong)hiword<<32 | loword;
+	if(pcimsienable(p, msg) == -1)
+		return -1;
+
+	v->vno = vec;
+	v->type = "msi";
+	v->isr = apicisr;
+	v->eoi = apiceoi;
+	v->mask = msimask;
+
+	DBG("msiirq: %T: enabling %.16llux %s irq %d vno %d\n", p->tbdf, msg, v->name, v->irq, vec);
+	return vec;
+}
+
+/*
+ * Mask the MSI of PCI device p.  The Vctl argument is unused.
+ * Returns -1 if p is nil, otherwise whatever pcimsimask returns.
+ */
+int
+disablemsi(Vctl*, Pcidev *p)
+{
+	if(p != nil)
+		return pcimsimask(p, 1);
+	return -1;
+}
+
+/*
+ * Enable the interrupt described by v and fill in its isr, eoi,
+ * vno and type fields.
+ *
+ * Three cases are handled:
+ *  - no bus (BUSUNKNOWN) and an irq in the local APIC range:
+ *    nothing to route, the irq itself is returned;
+ *  - no bus, any other irq: treated as legacy ISA;
+ *  - PCI: MSI is tried first, falling back to the IOAPIC entry
+ *    recorded for the device's interrupt pin.
+ *
+ * Returns the vector number, or -1 if no redirection entry can be
+ * found.  Panics on an unknown bus type, a PCI tbdf with no
+ * matching device, or a PCI device with no interrupt pin.
+ */
+int
+ioapicintrenable(Vctl* v)
+{
+	Rbus *rbus;
+	Rdt *rdt;
+	u32int hi, lo;
+	int busno, devno, vecno;
+
+	/*
+	 * Bridge between old and unspecified new scheme,
+	 * the work in progress...
+	 */
+	if(v->tbdf == BUSUNKNOWN){
+		if(v->irq >= IrqLINT0 && v->irq <= MaxIrqLAPIC){
+			if(v->irq != IrqSPURIOUS)
+				v->isr = apiceoi;
+			v->type = "lapic";
+			return v->irq;
+		}
+		else{
+			/*
+			 * Legacy ISA.
+			 * Make a busno and devno using the
+			 * ISA bus number and the irq.
+			 */
+			extern int mpisabusno;
+
+			if(mpisabusno == -1)
+				panic("no ISA bus allocated");
+			busno = mpisabusno;
+			devno = v->irq<<2;
+		}
+	}
+	else if(BUSTYPE(v->tbdf) == BusPCI){
+		/*
+		 * PCI.
+		 * Make a devno from BUSDNO(tbdf) and pcidev->intp.
+		 */
+		Pcidev *pcidev;
+
+		busno = BUSBNO(v->tbdf);
+		if((pcidev = pcimatchtbdf(v->tbdf)) == nil)
+			panic("no PCI dev for tbdf %#8.8ux\n", v->tbdf);
+		/* prefer MSI; on failure mask it and use the IOAPIC */
+		if((vecno = intrenablemsi(v, pcidev)) != -1)
+			return vecno;
+		disablemsi(v, pcidev);
+		if((devno = pcicfgr8(pcidev, PciINTP)) == 0)
+			panic("no INTP for tbdf %#8.8ux\n", v->tbdf);
+		/* device number in the upper bits, pin (0-based) in the low two */
+		devno = BUSDNO(v->tbdf)<<2|(devno-1);
+		DBG("ioapicintrenable: tbdf %#8.8ux busno %d devno %d\n",
+			v->tbdf, busno, devno);
+	}
+	else{
+		SET(busno, devno);
+		panic("unknown tbdf %#8.8ux\n", v->tbdf);
+	}
+
+	/*
+	 * NOTE(review): busno indexes rdtbus[] without being checked
+	 * against Nbus -- confirm that BUSBNO() and mpisabusno can
+	 * never reach Nbus.
+	 */
+	rdt = nil;
+	for(rbus = rdtbus[busno]; rbus != nil; rbus = rbus->next)
+		if(rbus->devno == devno){
+			rdt = rbus->rdt;
+			break;
+		}
+	if(rdt == nil){
+		extern int mpisabusno;
+
+		/*
+		 * First crack in the smooth exterior of the new code:
+		 * some BIOS make an MPS table where the PCI devices are
+		 * just defaulted to ISA.
+		 * Rewrite this to be cleaner.
+		 */
+		if((busno = mpisabusno) == -1)
+			return -1;
+		devno = v->irq<<2;
+		for(rbus = rdtbus[busno]; rbus != nil; rbus = rbus->next)
+			if(rbus->devno == devno){
+				rdt = rbus->rdt;
+				break;
+			}
+		DBG("isa: tbdf %#8.8ux busno %d devno %d %#p\n",
+			v->tbdf, busno, devno, rdt);
+	}
+	if(rdt == nil)
+		return -1;
+
+	/*
+	 * Second crack:
+	 * what to do about devices that intrenable/intrdisable frequently?
+	 * 1) there is no ioapicdisable yet;
+	 * 2) it would be good to reuse freed vectors.
+	 * Oh bugger.
+	 */
+	/*
+	 * This is a low-frequency event so just lock
+	 * the whole IOAPIC to initialise the RDT entry
+	 * rather than putting a Lock in each entry.
+	 */
+	lock(rdt->apic);
+	/* rbus is non-nil here: both loops break with it on the match */
+	DBG("%T: %ld/%d/%d (%d)\n", v->tbdf, rdt->apic - xioapic, rbus->devno, rdt->intin, devno);
+	/* a zero low byte means no vector has been assigned yet */
+	if((rdt->lo & 0xff) == 0){
+		vecno = nextvec();
+		rdt->lo |= vecno;
+		rdtvecno[vecno] = rdt;
+	}else
+		DBG("%T: mutiple irq bus %d dev %d\n", v->tbdf, busno, devno);
+
+	rdt->enabled++;
+	/* program the entry with the mask bit cleared; rdt->lo keeps it */
+	lo = (rdt->lo & ~Im);
+	ioapicintrdd(&hi, &lo);
+	rtblput(rdt->apic, rdt->intin, hi, lo);
+	vecno = lo & 0xff;
+	unlock(rdt->apic);
+
+	DBG("busno %d devno %d hi %#8.8ux lo %#8.8ux vecno %d\n",
+		busno, devno, hi, lo, vecno);
+	v->isr = apicisr;
+	v->eoi = apiceoi;
+	v->vno = vecno;
+	v->type = "ioapic";
+
+	return vecno;
+}
+
+/*
+ * Drop one enable of the redirection entry bound to vecno.
+ * When the count reaches zero the entry is rewritten with a zero
+ * high word and the saved rdt->lo.  Returns 0; panics if vecno is
+ * out of range or has no entry.
+ *
+ * Known weaknesses, inherited from the original: rdtvecno[] is
+ * read without a lock (it is static once assigned), and nothing is
+ * done about interrupts that may still be pending.
+ */
+int
+ioapicintrdisable(int vecno)
+{
+	Rdt *rdt;
+
+	if(vecno < 0 || vecno > MaxVectorAPIC){
+		panic("ioapicintrdisable: vecno %d out of range", vecno);
+		return -1;
+	}
+	rdt = rdtvecno[vecno];
+	if(rdt == nil){
+		panic("ioapicintrdisable: vecno %d has no rdt", vecno);
+		return -1;
+	}
+
+	lock(rdt->apic);
+	if(--rdt->enabled == 0)
+		rtblput(rdt->apic, rdt->intin, 0, rdt->lo);
+	unlock(rdt->apic);
+
+	return 0;
+}

+ 24 - 0
sys/src/9/k10/iob.h

@@ -0,0 +1,24 @@
+/* 
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+
+/* io bufs */
+void	iobufinit(void);
+Block*	va2block(void*);
+void*	io2alloc(uint);
+Block*	bigalloc(void);
+Block*	sbigalloc(void);
+int	isbigblock(Block*);
+
+/* bal.c */
+physaddr	bal(usize);
+void		bfree(physaddr, usize);
+void		balinit(physaddr, usize);
+void		balfreephys(physaddr, usize);
+void	baldump(void);

+ 198 - 0
sys/src/9/k10/k8cpu

@@ -0,0 +1,198 @@
+dev +dev
+	root
+	cons
+	arch
+	env
+	pipe
+	proc
+	mnt
+	srv
+	dup
+	rtc
+	ssl
+	cap
+	kprof
+	pmc	pmcio
+	segment
+	acpi
+	zp
+	ws
+
+# add to get cec in the kernel
+#	cec
+
+	ether		netif
+	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum inferno
+
+	pci
+
+	uart
+
+uart +dev
+	uarti8250
+	uartpci		pci
+
+ip +dev
+	tcp
+	udp
+	ipifc
+	icmp
+	icmp6
+
+link +dev
+	ether8169	pci ethermii
+	ether82557	pci
+	ether82563	pci
+	etherigbe	pci ethermii
+#	etherbcm	pci ethermii
+	ethermedium
+	loopbackmedium
+	netdevmedium
+
+#	ht
+
+misc +dev
+#	cache
+	mp		apic ioapic msi pci sipi
+
+#
+#boot cpu
+#	int cpuflag = 1;
+#boot cpu boot $3
+#	int cpuflag = 1;
+#	char* bootdisk = "$3";
+#boot rootdir $3
+#	char* rootdir = "$3";
+#boot (bboot|romboot|dosboot)
+#	int cpuflag = 1;
+#	char* bootprog = $2;
+#boot boot $3
+#	char* bootdisk = "$3";
+#
+boot cpu
+	tcp
+
+rootdir
+	bootk8cpu.out boot
+	/amd64/bin/auth/factotum factotum
+	/amd64/bin/ip/ipconfig ipconfig
+	../root/nvram nvram
+
+conf
+	int cpuserver = 1;
+
+#
+#dbgflg
+#	chan		'c'
+#	apic		'A'
+#	hpet		'H'
+#	ht		'H'
+#	ioapic		'I'
+#	mp		'M'
+#	pci		'P'
+#	arch		'V'
+#
+dbgflg
+	acore		'c'
+	apic		'A'
+	arch		'V'
+	asm		'm'
+	devacpi		'C'
+	devsegment	'z'
+	devzp		'z'
+	hpet		'H'
+	ht		'H'
+	image		'p'
+	ioapic		'I'
+	main		'x'
+	memory		'm'
+	mp		'M'
+	page		'p'
+	pager		'p'
+	physalloc		'm'
+	sysproc		'E'
+	sysseg		'p'
+	syssem		'S'
+	syszio		'z'
+	tcore		'c'
+	mmu		'v'
+
+amd64 +dev
+	l32p
+	l64v
+	l64idt
+	l64acidt
+	l64cpuid
+	l64syscall
+	l64acsyscall
+	l64fpu
+	acore
+	arch
+	archk10
+	asm
+	cga
+	crap
+	fpu
+	i8254
+	i8259
+	kbd
+	main
+	map
+	memory
+	mmu
+	multiboot
+	qmalloc
+	random
+	syscall
+	tcore
+	trap
+	vsvm
+	physalloc
+
+port
+	alarm
+	allocb
+	chan
+	dev
+	devtab
+	edf
+	fault
+	image
+	latin1
+	page
+	pager
+	parse
+	pgrp
+	portclock
+	print
+	proc
+	ps
+	qio
+	qlock
+	rebootcmd
+	segment
+	sysauth
+	sysfile
+	sysproc
+	sysseg
+	syssem
+	systab
+	taslock
+#	tcklock
+	tod
+	syszio
+	syscallfmt
+
+#
+#dir
+# pc		-.I.
+#
+dir
+	386
+	ip
+	port
+
+lib
+	libc
+	libip
+	libsec

+ 193 - 0
sys/src/9/k10/k8cpufs

@@ -0,0 +1,193 @@
+dev +dev
+	root
+	cons
+	arch
+	env
+	pipe
+	proc
+	mnt
+	srv
+	dup
+	rtc
+	ssl
+	cap
+	kprof
+	pmc
+	segment
+
+# add to get cec in the kernel
+#	cec
+
+	ether		netif
+	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum inferno
+
+	uart
+
+uart +dev
+	uarti8250
+	uartpci		pci
+pmc +dev
+	pmcio
+
+ip +dev
+	tcp
+	udp
+	ipifc
+	icmp
+	icmp6
+
+link +dev
+	ether8169	pci ethermii
+	ether82557	pci
+	ether82563	pci
+	etherigbe	pci ethermii
+	ethermedium
+	loopbackmedium
+	netdevmedium
+
+#	acpi		hpet
+#	ht
+
+misc +dev
+	cache
+	mp		apic ioapic pci sipi
+
+#
+#boot cpu
+#	int cpuflag = 1;
+#boot cpu boot $3
+#	int cpuflag = 1;
+#	char* bootdisk = "$3";
+#boot rootdir $3
+#	char* rootdir = "$3";
+#boot (bboot|romboot|dosboot)
+#	int cpuflag = 1;
+#	char* bootprog = $2;
+#boot boot $3
+#	char* bootdisk = "$3";
+#
+boot cpu
+	tcp
+
+rootdir
+	boot.fs boot
+	/amd64/bin/rc rc
+	/rc/lib/rcmain
+	/amd64/bin/echo echo
+	/amd64/bin/date date
+	/amd64/bin/ls ls
+	/amd64/bin/ps ps
+	/amd64/bin/bind bind
+	/amd64/bin/cat cat
+	/amd64/bin/auth/factotum factotum
+	/amd64/bin/ip/ipconfig ipconfig
+	../root/big big
+	../root/nvram nvram
+
+conf
+	int cpuserver = 1;
+
+#
+#dbgflg
+#	chan		'c'
+#	apic		'A'
+#	acpi		'C'
+#	hpet		'H'
+#	ht		'H'
+#	ioapic		'I'
+#	mp		'M'
+#	pci		'P'
+#	arch		'V'
+#
+dbgflg
+	apic		'A'
+	acpi		'C'
+	hpet		'H'
+	ht		'H'
+	ioapic		'I'
+	mp		'M'
+	arch		'V'
+	sysproc		'E'
+	main		'x'
+	acore		'c'
+	tcore		'c'
+	syssem		'S'
+	page	'p'
+	pager	'p'
+	memory 'm'
+
+amd64 +dev
+	l32p
+	l64v
+	l64idt
+	l64acidt
+	l64syscall
+	l64acsyscall
+	l64fpu
+	cpuidamd64
+	acore
+	arch
+	archk10
+	cga
+	crap
+	fpu
+	i8254
+	i8259
+	kbd
+	main
+	map
+	memory
+	mmu
+	multiboot
+	random
+	syscall
+	tcore
+	trap
+	vsvm
+
+port
+	alarm
+	alloc		xalloc
+	allocb
+	chan
+	dev
+	devtab
+	edf
+	fault
+	image
+	latin1
+	page
+	parse
+	pgrp
+	portclock
+	print
+	proc
+	ps
+	qio
+	qlock
+	rebootcmd
+	segment
+	pager
+	sysauth
+	sysfile
+	sysproc
+	sysseg
+	systab
+	taslock
+	tod
+	syssem
+	syszio
+
+#
+#dir
+# pc		-.I.
+#
+dir
+	386
+	ip
+	port
+
+lib
+	libc
+	libip
+	libsec

+ 202 - 0
sys/src/9/k10/k8cpukexec

@@ -0,0 +1,202 @@
+dev +dev
+	root
+	cons
+	arch
+	env
+	pipe
+	proc
+	kexec
+	cmd
+	mnt
+	srv
+	dup
+	rtc
+	ssl
+	cap
+	kprof
+#	pmc	pmcio
+	segment
+	acpi
+	tube
+	zp
+
+# add to get cec in the kernel
+#	cec
+
+	ether		netif
+	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum inferno
+
+	uart
+
+uart +dev
+	uarti8250
+	uartpci		pci
+
+ip +dev
+	tcp
+	udp
+	ipifc
+	icmp
+	icmp6
+
+link +dev
+	ether8169	pci ethermii
+	ether82557	pci
+	ether82563	pci
+	etherigbe	pci ethermii
+	ethermedium
+	loopbackmedium
+	netdevmedium
+
+#	ht
+
+misc +dev
+#	cache
+	mp		apic ioapic msi pci sipi
+#	rdb
+
+#
+#boot cpu
+#	int cpuflag = 1;
+#boot cpu boot $3
+#	int cpuflag = 1;
+#	char* bootdisk = "$3";
+#boot rootdir $3
+#	char* rootdir = "$3";
+#boot (bboot|romboot|dosboot)
+#	int cpuflag = 1;
+#	char* bootprog = $2;
+#boot boot $3
+#	char* bootdisk = "$3";
+#
+boot cpu
+	tcp
+
+rootdir
+	bootk8cpu.out boot
+	/amd64/bin/auth/factotum factotum
+	/amd64/bin/ip/ipconfig ipconfig
+	../root/nvram nvram
+
+conf
+	int cpuserver = 1;
+
+#
+#dbgflg
+#	chan		'c'
+#	apic		'A'
+#	hpet		'H'
+#	ht		'H'
+#	ioapic		'I'
+#	mp		'M'
+#	pci		'P'
+#	arch		'V'
+#
+dbgflg
+	acore		'c'
+	apic		'A'
+	arch		'V'
+	asm		'm'
+	devacpi		'C'
+	devsegment	'z'
+	devtube		'T'
+	devzp		'z'
+	hpet		'H'
+	ht		'H'
+	image		'p'
+	ioapic		'I'
+	kexec		'k'
+	main		'x'
+	memory		'm'
+	mp		'M'
+	nixcall		'n'
+	page		'p'
+	pager		'p'
+	physalloc		'm'
+	sysproc		'E'
+	sysseg		'p'
+	syssem		'S'
+	syszio		'z'
+	tcore		'c'
+	mmu		'v'
+
+amd64 +dev
+	l32p
+	l64v
+	l64idt
+	l64acidt
+	l64cpuid
+	l64syscall
+	l64acsyscall
+	l64fpu
+	acore
+	arch
+	archk10
+	asm
+	cga
+	crap
+	fpu
+	i8254
+	i8259
+	kbd
+	main
+	map
+	memory
+	mmu
+	multiboot
+	qmalloc
+	random
+	syscall
+	tcore
+	trap
+	vsvm
+	physalloc
+
+port
+	alarm
+	allocb
+	chan
+	dev
+	devtab
+	edf
+	fault
+	image
+	kexec
+	latin1
+	nixcall
+	page
+	pager
+	parse
+	pgrp
+	portclock
+	print
+	proc
+	ps
+	qio
+	qlock
+	rebootcmd
+	segment
+	sysauth
+	sysfile
+	sysproc
+	sysseg
+	syssem
+	systab
+	taslock
+	tod
+	syszio
+	syscallfmt
+
+#
+#dir
+# pc		-.I.
+#
+dir
+	386
+	ip
+	port
+
+lib
+	libc
+	libip
+	libsec

+ 235 - 0
sys/src/9/k10/l32p.s

@@ -0,0 +1,235 @@
+#include "mem.h"
+#include "amd64l.h"
+
+MODE $32
+
+#define pFARJMP32(s, o)	BYTE $0xea;		/* far jump to ptr32:16 */\
+			LONG $o; WORD $s
+
+/*
+ * Enter here in 32-bit protected mode. Welcome to 1982.
+ * Make sure the GDT is set as it should be:
+ *	disable interrupts;
+ *	load the GDT with the table in _gdt32p;
+ *	load all the data segments
+ *	load the code segment via a far jump.
+ */
+/*
+ * Entry point, followed by the multiboot header and the boot-time
+ * descriptor tables, all placed in the text.
+ *
+ * The JMP is hand-assembled with a fixed displacement, so the size
+ * of the data between it and _endofheader must not change:
+ *	2 (NOPs) + 12 (multiboot) + 32 (_gdt32p) + 6 (_gdtptr32p)
+ *	+ 16 (_gdt64) + 10 (_gdtptr64p) + 10 (_gdtptr64v) = 0x58 bytes
+ * (assuming the assembler inserts no padding).
+ * CLI (1 byte) + JMP (5) + the two NOPs put the multiboot header
+ * at offset 8 from _protected.
+ */
+TEXT _protected<>(SB), 1, $-4
+	CLI
+	BYTE $0xe9; LONG $0x00000058;		/* JMP _endofheader */
+
+_startofheader:
+	BYTE	$0x90				/* NOP */
+	BYTE	$0x90				/* NOP */
+
+TEXT _multibootheader<>(SB), 1, $-4		/* must be 4-byte aligned */
+	LONG	$0x1badb002			/* magic */
+	LONG	$0x00000003			/* flags */
+	LONG	$-(0x1badb002 + 0x00000003)	/* checksum */
+
+/* descriptor table used while still in 32-bit protected mode */
+TEXT _gdt32p<>(SB), 1, $-4
+	QUAD	$0x0000000000000000		/* NULL descriptor */
+	QUAD	$0x00cf9a000000ffff		/* CS */
+	QUAD	$0x00cf92000000ffff		/* DS */
+	QUAD	$0x0020980000000000		/* Long mode CS */
+
+/* limit and physical base of _gdt32p, loaded at _endofheader */
+TEXT _gdtptr32p<>(SB), 1, $-4
+	WORD	$(4*8-1)
+	LONG	$_gdt32p<>-KZERO(SB)
+
+/* descriptor table used once in long mode */
+TEXT _gdt64<>(SB), 1, $-4
+	QUAD	$0x0000000000000000		/* NULL descriptor */
+	QUAD	$0x0020980000000000		/* CS */
+
+/* limit and physical base of _gdt64 */
+TEXT _gdtptr64p<>(SB), 1, $-4
+	WORD	$(2*8-1)
+	QUAD	$_gdt64<>-KZERO(SB)
+
+/*
+ * limit and virtual base of _gdt64, loaded by _start64v.
+ * NOTE(review): the limit covers three descriptors although
+ * _gdt64 defines two and _gdtptr64p uses 2*8-1 -- confirm intent.
+ * Changing the WORD's value does not alter the 0x58 size above.
+ */
+TEXT _gdtptr64v<>(SB), 1, $-4
+	WORD	$(3*8-1)
+	QUAD	$_gdt64<>(SB)
+
+/*
+ * Target of the JMP in _protected: load the 32-bit GDT by its
+ * physical address, reload every data segment register, then
+ * far-jump to _warp64 to reload CS.
+ */
+_endofheader:
+	MOVL	AX, BP				/* possible passed-in magic */
+
+	MOVL	$_gdtptr32p<>-KZERO(SB), AX
+	MOVL	(AX), GDTR
+
+	MOVL	$SSEL(SiDS, SsTIGDT|SsRPL0), AX
+	MOVW	AX, DS
+	MOVW	AX, ES
+	MOVW	AX, FS
+	MOVW	AX, GS
+	MOVW	AX, SS
+
+	pFARJMP32(SSEL(SiCS, SsTIGDT|SsRPL0), _warp64<>-KZERO(SB))
+
+/*
+ * Make the basic page tables for CPU0 to map 0-4MiB physical
+ * to KZERO, and include an identity map for the switch from protected
+ * to paging mode. There's an assumption here that the creation and later
+ * removal of the identity map will not interfere with the KZERO mappings;
+ * the conditions for clearing the identity map are
+ *	clear PML4 entry when (KZERO & 0x0000ff8000000000) != 0;
+ *	clear PDP entry when (KZERO & 0x0000007fc0000000) != 0;
+ *	don't clear PD entry when (KZERO & 0x000000003fe00000) == 0;
+ * the code below assumes these conditions are met.
+ *
+ * Assume a recent processor with Page Size Extensions
+ * and use two 2MiB entries.
+ */
+/*
+ * The layout is described in data.h:
+ *	_protected:	start of kernel text
+ *	- 4*KiB		unused
+ *	- 4*KiB		unused
+ *	- 4*KiB		ptrpage
+ *	- 4*KiB		syspage
+ *	- MACHSZ	m
+ *	- 4*KiB		vsvmpage for gdt, tss
+ *	- PTSZ		PT for PMAPADDR		unused - assumes in KZERO PD
+ *	- PTSZ		PD
+ *	- PTSZ		PDP
+ *	- PTSZ		PML4
+ *	- MACHSTKSZ	stack
+ */
+
+/*
+ * Macros for accessing page table entries; change the
+ * C-style array-index macros into a page table byte offset
+ */
+#define PML4O(v)	((PTLX((v), 3))<<3)
+#define PDPO(v)		((PTLX((v), 2))<<3)
+#define PDO(v)		((PTLX((v), 1))<<3)
+#define PTO(v)		((PTLX((v), 0))<<3)
+
+/*
+ * Zero the region that sits immediately below _protected (stack,
+ * page tables, vsvm, m, sys and the spare pages), then build the
+ * initial page tables in it.  Runs in 32-bit mode with paging off,
+ * so every address used here is physical.
+ * SI keeps the physical base of the region for _start64v.
+ */
+TEXT _warp64<>(SB), 1, $-4
+	MOVL	$_protected<>-(MACHSTKSZ+4*PTSZ+5*(4*KiB)+MACHSZ+KZERO)(SB), SI
+
+	MOVL	SI, DI
+	XORL	AX, AX
+	MOVL	$((MACHSTKSZ+4*PTSZ+5*(4*KiB)+MACHSZ)>>2), CX	/* count of longs */
+
+	CLD
+	REP;	STOSL				/* stack, P*, vsvm, m, sys */
+
+	MOVL	SI, AX				/* sys-KZERO */
+	ADDL	$(MACHSTKSZ), AX		/* PML4 */
+	MOVL	AX, CR3				/* load the mmu */
+	MOVL	AX, DX
+	ADDL	$(PTSZ|PteRW|PteP), DX		/* PDP at PML4 + PTSZ */
+	MOVL	DX, PML4O(0)(AX)		/* PML4E for identity map */
+	MOVL	DX, PML4O(KZERO)(AX)		/* PML4E for KZERO, PMAPADDR */
+
+	ADDL	$PTSZ, AX			/* PDP at PML4 + PTSZ */
+	ADDL	$PTSZ, DX			/* PD at PML4 + 2*PTSZ */
+	MOVL	DX, PDPO(0)(AX)			/* PDPE for identity map */
+	MOVL	DX, PDPO(KZERO)(AX)		/* PDPE for KZERO, PMAPADDR */
+
+	ADDL	$PTSZ, AX			/* PD at PML4 + 2*PTSZ */
+	MOVL	$(PtePS|PteRW|PteP), DX
+	MOVL	DX, PDO(0)(AX)			/* PDE for identity 0-[24]MiB */
+	MOVL	DX, PDO(KZERO)(AX)		/* PDE for KZERO 0-[24]MiB */
+	ADDL	$PGLSZ(1), DX
+	MOVL	DX, PDO(KZERO+PGLSZ(1))(AX)	/* PDE for KZERO [24]-[48]MiB */
+
+	MOVL	AX, DX				/* PD at PML4 + 2*PTSZ */
+	ADDL	$(PTSZ|PteRW|PteP), DX		/* PT at PML4 + 3*PTSZ */
+	MOVL	DX, PDO(PMAPADDR)(AX)		/* PDE for PMAPADDR */
+	/* no return: execution falls through into the code that follows */
+
+/*
+ * Enable and activate Long Mode. From the manual:
+ * 	make sure Page Size Extensions are off, and Page Global
+ *	Extensions and Physical Address Extensions are on in CR4;
+ *	set Long Mode Enable in the Extended Feature Enable MSR;
+ *	set Paging Enable in CR0;
+ *	make an inter-segment jump to the Long Mode code.
+ * It's all in 32-bit mode until the jump is made.
+ */
+/*
+ * Switch the processor into long mode: adjust CR4, set Lme in the
+ * Efer MSR, turn paging on in CR0, then far-jump through
+ * descriptor 3 of _gdt32p (the "Long mode CS" entry) to
+ * _identity, still at its physical address.
+ */
+TEXT _lme<>(SB), 1, $-4
+	MOVL	CR4, AX
+	ANDL	$~Pse, AX			/* Page Size */
+	ORL	$(Pge|Pae), AX			/* Page Global, Phys. Address */
+	MOVL	AX, CR4
+
+	MOVL	$Efer, CX			/* Extended Feature Enable */
+	RDMSR
+	ORL	$Lme, AX			/* Long Mode Enable */
+	WRMSR
+
+	MOVL	CR0, DX
+	ANDL	$~(Cd|Nw|Ts|Mp), DX
+	ORL	$(Pg|Wp), DX			/* Paging Enable */
+	MOVL	DX, CR0
+
+	pFARJMP32(SSEL(3, SsTIGDT|SsRPL0), _identity<>-KZERO(SB))
+
+/*
+ * Long mode. Welcome to 2003.
+ * Jump out of the identity map space;
+ * load a proper long mode GDT.
+ */
+MODE $64
+
+/*
+ * First 64-bit code, reached through the identity map.
+ * Jump indirectly to the kernel virtual address of _start64v
+ * so that execution continues in the KZERO mapping.
+ */
+TEXT _identity<>(SB), 1, $-4
+	MOVQ	$_start64v<>(SB), AX
+	JMP*	AX
+
+/*
+ * Running at kernel virtual addresses: load the long mode GDT,
+ * clear the segment registers, set sys and the stack, remove the
+ * identity mapping, set up the Mach and user registers and call
+ * main with the multiboot magic and info pointer on the stack.
+ * On entry SI still holds sys-KZERO from _warp64.
+ */
+TEXT _start64v<>(SB), 1, $-4
+	MOVQ	$_gdtptr64v<>(SB), AX
+	MOVL	(AX), GDTR
+
+	XORQ	DX, DX				/* DX stays zero from here on */
+	MOVW	DX, DS				/* not used in long mode */
+	MOVW	DX, ES				/* not used in long mode */
+	MOVW	DX, FS
+	MOVW	DX, GS
+	MOVW	DX, SS				/* not used in long mode */
+
+	MOVLQZX	SI, SI				/* sys-KZERO */
+	MOVQ	SI, AX
+	ADDQ	$KZERO, AX
+	MOVQ	AX, sys(SB)			/* sys */
+
+	ADDQ	$(MACHSTKSZ), AX		/* PML4 and top of stack */
+	MOVQ	AX, SP				/* set stack */
+
+	/*
+	 * At each level, zero the identity map entry unless KZERO
+	 * uses index 0 at that level too (DX is 0, so each CMPQ
+	 * tests whether the KZERO offset is 0).
+	 */
+_zap0pml4:
+	CMPQ	DX, $PML4O(KZERO)		/* KZERO & 0x0000ff8000000000 */
+	JEQ	_zap0pdp
+	MOVQ	DX, PML4O(0)(AX) 		/* zap identity map PML4E */
+_zap0pdp:
+	ADDQ	$PTSZ, AX			/* PDP at PML4 + PTSZ */
+	CMPQ	DX, $PDPO(KZERO)		/* KZERO & 0x0000007fc0000000 */
+	JEQ	_zap0pd
+	MOVQ	DX, PDPO(0)(AX)			/* zap identity map PDPE */
+_zap0pd:
+	ADDQ	$PTSZ, AX			/* PD at PML4 + 2*PTSZ */
+	CMPQ	DX, $PDO(KZERO)			/* KZERO & 0x000000003fe00000 */
+	JEQ	_zap0done
+	MOVQ	DX, PDO(0)(AX)			/* zap identity map PDE */
+_zap0done:
+
+	ADDQ	$(MACHSTKSZ), SI		/* PML4-KZERO */
+	MOVQ	SI, CR3				/* flush TLB */
+
+	ADDQ	$(2*PTSZ+4*KiB), AX		/* PD+PT+vsvm */
+	MOVQ	AX, RMACH			/* Mach */
+	MOVQ	DX, RUSER
+
+	PUSHQ	DX				/* clear flags */
+	POPFQ
+
+	/* NOTE(review): RARG is presumably BP, where _endofheader saved AX */
+	MOVLQZX	BX, BX				/* push multiboot args */
+	PUSHQ	BX				/* multiboot info* */
+	MOVLQZX	RARG, RARG
+	PUSHQ	RARG				/* multiboot magic */
+
+	CALL	main(SB)
+	/* if main returns, execution falls through into the code that follows */
+
+/*
+ * Never returns: enable interrupts and halt, over and over.
+ */
+TEXT ndnr(SB), 1, $-4				/* no deposit, no return */
+_dnr:
+	STI
+	HLT
+	JMP	_dnr				/* do not resuscitate */
+

+ 341 - 0
sys/src/9/k10/l64acidt.s

@@ -0,0 +1,341 @@
+/*
+ * Interrupt/exception handling.
+ */
+#include "amd64l.h"
+
+MODE $64
+
+/*
+ * Common entry for vectors where the processor pushes no error
+ * code.  The CALL in acidthandlers left its return PC on the
+ * stack; that PC points at the vector-number byte following the
+ * CALL.  After PUSHQ AX the stack has the same depth as in the
+ * error-code case, with the PC slot standing in for the error code.
+ */
+TEXT _acintrp<>(SB), 1, $-4			/* no error code pushed */
+	PUSHQ	AX				/* save AX */
+	MOVQ	8(SP), AX			/* idthandlers(SB) PC */
+	JMP	_acintrcommon
+
+/*
+ * Common entry for vectors where the processor pushes an error
+ * code, and the shared body for both entries.
+ *
+ * The vector number is fetched from the byte the handler's return
+ * PC points at and replaces that PC on the stack.  After the
+ * SUBQ the frame is, from SP upwards:
+ *	0	saved RUSER
+ *	8	saved RMACH
+ *	16	DS, ES, FS, GS (2 bytes each)
+ *	24	vector number
+ *	32	error code (or filler)
+ *	40	interrupted PC
+ *	48	interrupted CS
+ * If the interrupted CS is the kernel code selector the trap is
+ * nested and the segment save and SWAPGS are skipped.
+ */
+TEXT _acintre<>(SB), 1, $-4			/* error code pushed */
+	XCHGQ	AX, (SP)			/* AX = handler PC, (SP) = saved AX */
+_acintrcommon:
+	MOVBQZX	(AX), AX			/* AX = vector number byte */
+	XCHGQ	AX, (SP)			/* (SP) = vector, AX restored */
+
+	SUBQ	$24, SP				/* R1[45], [DEFG]S */
+	CMPW	48(SP), $SSEL(SiCS, SsTIGDT|SsRPL0)	/* old CS */
+	JEQ	_acintrnested
+
+	MOVQ	RUSER, 0(SP)
+	MOVQ	RMACH, 8(SP)
+	MOVW	DS, 16(SP)
+	MOVW	ES, 18(SP)
+	MOVW	FS, 20(SP)
+	MOVW	GS, 22(SP)
+
+	SWAPGS
+	BYTE $0x65; MOVQ 0, RMACH		/* m-> (MOVQ GS:0x0, R15) */
+	MOVQ	16(RMACH), RUSER		/* up */
+
+_acintrnested:
+	PUSHQ	R13
+	PUSHQ	R12
+	PUSHQ	R11
+	PUSHQ	R10
+	PUSHQ	R9
+	PUSHQ	R8
+	PUSHQ	BP
+	PUSHQ	DI
+	PUSHQ	SI
+	PUSHQ	DX
+	PUSHQ	CX
+	PUSHQ	BX
+	PUSHQ	AX
+
+	MOVQ	SP, RARG			/* argument: pointer to the saved registers */
+	PUSHQ	SP
+	CALL	actrap(SB)
+	/* actrap returns into the code that follows */
+
+/*
+ * Return path from actrap: undo everything the entry code pushed,
+ * in reverse order, and return from the interrupt.
+ */
+TEXT _acintrr<>(SB), 1, $-4			/* so ktrace can pop frame */
+	POPQ	AX				/* discard the SP pushed before the CALL */
+
+	POPQ	AX
+	POPQ	BX
+	POPQ	CX
+	POPQ	DX
+	POPQ	SI
+	POPQ	DI
+	POPQ	BP
+	POPQ	R8
+	POPQ	R9
+	POPQ	R10
+	POPQ	R11
+	POPQ	R12
+	POPQ	R13
+
+	/* nested trap from kernel mode: nothing was saved in the 24-byte area */
+	CMPQ	48(SP), $SSEL(SiCS, SsTIGDT|SsRPL0)
+	JEQ	_aciretnested
+
+	SWAPGS
+	MOVW	22(SP), GS
+	MOVW	20(SP), FS
+	MOVW	18(SP), ES
+	MOVW	16(SP), DS
+	MOVQ	8(SP), RMACH
+	MOVQ	0(SP), RUSER
+
+_aciretnested:
+	ADDQ	$40, SP				/* 24-byte save area + vector + error code */
+	IRETQ
+
+/*
+ * Table of 256 interrupt entry stubs, one per vector, in vector
+ * order.  Each stub is a CALL to the common entry followed by one
+ * byte holding the vector number; the CALL's return PC therefore
+ * points at that byte, which is how the common code learns which
+ * vector fired.  _acintre is used for the vectors where the
+ * processor pushes an error code, _acintrp for all others.
+ *
+ * The byte of entry N must be N: the stub for vector 0xcd used to
+ * carry 0xce, so that vector was reported to actrap as 0xce.
+ */
+TEXT acidthandlers(SB), 1, $-4
+	CALL _acintrp<>(SB); BYTE $IdtDE		/* #DE Divide-by-Zero Error */
+	CALL _acintrp<>(SB); BYTE $IdtDB		/* #DB Debug */
+	CALL _acintrp<>(SB); BYTE $IdtNMI		/* #NMI Borked */
+	CALL _acintrp<>(SB); BYTE $IdtBP		/* #BP Breakpoint */
+	CALL _acintrp<>(SB); BYTE $IdtOF		/* #OF Overflow */
+	CALL _acintrp<>(SB); BYTE $IdtBR		/* #BR Bound-Range */
+	CALL _acintrp<>(SB); BYTE $IdtUD		/* #UD Invalid-Opcode */
+	CALL _acintrp<>(SB); BYTE $IdtNM		/* #NM Device-Not-Available */
+	CALL _acintre<>(SB); BYTE $IdtDF		/* #DF Double-Fault */
+	CALL _acintrp<>(SB); BYTE $Idt09		/* reserved */
+	CALL _acintre<>(SB); BYTE $IdtTS		/* #TS Invalid-TSS */
+	CALL _acintre<>(SB); BYTE $IdtNP		/* #NP Segment-Not-Present */
+	CALL _acintre<>(SB); BYTE $IdtSS		/* #SS Stack */
+	CALL _acintre<>(SB); BYTE $IdtGP		/* #GP General-Protection */
+	CALL _acintre<>(SB); BYTE $IdtPF		/* #PF Page-Fault */
+	CALL _acintrp<>(SB); BYTE $Idt0F		/* reserved */
+	CALL _acintrp<>(SB); BYTE $IdtMF		/* #MF x87 FPE-Pending */
+	CALL _acintre<>(SB); BYTE $IdtAC		/* #AC Alignment-Check */
+	CALL _acintrp<>(SB); BYTE $IdtMC		/* #MC Machine-Check */
+	CALL _acintrp<>(SB); BYTE $IdtXF		/* #XF SIMD Floating-Point */
+	CALL _acintrp<>(SB); BYTE $0x14		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x15		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x16		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x17		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x18		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x19		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x1a		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x1b		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x1c		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x1d		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x1e		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x1f		/* reserved */
+	CALL _acintrp<>(SB); BYTE $0x20
+	CALL _acintrp<>(SB); BYTE $0x21
+	CALL _acintrp<>(SB); BYTE $0x22
+	CALL _acintrp<>(SB); BYTE $0x23
+	CALL _acintrp<>(SB); BYTE $0x24
+	CALL _acintrp<>(SB); BYTE $0x25
+	CALL _acintrp<>(SB); BYTE $0x26
+	CALL _acintrp<>(SB); BYTE $0x27
+	CALL _acintrp<>(SB); BYTE $0x28
+	CALL _acintrp<>(SB); BYTE $0x29
+	CALL _acintrp<>(SB); BYTE $0x2a
+	CALL _acintrp<>(SB); BYTE $0x2b
+	CALL _acintrp<>(SB); BYTE $0x2c
+	CALL _acintrp<>(SB); BYTE $0x2d
+	CALL _acintrp<>(SB); BYTE $0x2e
+	CALL _acintrp<>(SB); BYTE $0x2f
+	CALL _acintrp<>(SB); BYTE $0x30
+	CALL _acintrp<>(SB); BYTE $0x31
+	CALL _acintrp<>(SB); BYTE $0x32
+	CALL _acintrp<>(SB); BYTE $0x33
+	CALL _acintrp<>(SB); BYTE $0x34
+	CALL _acintrp<>(SB); BYTE $0x35
+	CALL _acintrp<>(SB); BYTE $0x36
+	CALL _acintrp<>(SB); BYTE $0x37
+	CALL _acintrp<>(SB); BYTE $0x38
+	CALL _acintrp<>(SB); BYTE $0x39
+	CALL _acintrp<>(SB); BYTE $0x3a
+	CALL _acintrp<>(SB); BYTE $0x3b
+	CALL _acintrp<>(SB); BYTE $0x3c
+	CALL _acintrp<>(SB); BYTE $0x3d
+	CALL _acintrp<>(SB); BYTE $0x3e
+	CALL _acintrp<>(SB); BYTE $0x3f
+	CALL _acintrp<>(SB); BYTE $0x40
+	CALL _acintrp<>(SB); BYTE $0x41
+	CALL _acintrp<>(SB); BYTE $0x42
+	CALL _acintrp<>(SB); BYTE $0x43
+	CALL _acintrp<>(SB); BYTE $0x44
+	CALL _acintrp<>(SB); BYTE $0x45
+	CALL _acintrp<>(SB); BYTE $0x46
+	CALL _acintrp<>(SB); BYTE $0x47
+	CALL _acintrp<>(SB); BYTE $0x48
+	CALL _acintrp<>(SB); BYTE $0x49
+	CALL _acintrp<>(SB); BYTE $0x4a
+	CALL _acintrp<>(SB); BYTE $0x4b
+	CALL _acintrp<>(SB); BYTE $0x4c
+	CALL _acintrp<>(SB); BYTE $0x4d
+	CALL _acintrp<>(SB); BYTE $0x4e
+	CALL _acintrp<>(SB); BYTE $0x4f
+	CALL _acintrp<>(SB); BYTE $0x50
+	CALL _acintrp<>(SB); BYTE $0x51
+	CALL _acintrp<>(SB); BYTE $0x52
+	CALL _acintrp<>(SB); BYTE $0x53
+	CALL _acintrp<>(SB); BYTE $0x54
+	CALL _acintrp<>(SB); BYTE $0x55
+	CALL _acintrp<>(SB); BYTE $0x56
+	CALL _acintrp<>(SB); BYTE $0x57
+	CALL _acintrp<>(SB); BYTE $0x58
+	CALL _acintrp<>(SB); BYTE $0x59
+	CALL _acintrp<>(SB); BYTE $0x5a
+	CALL _acintrp<>(SB); BYTE $0x5b
+	CALL _acintrp<>(SB); BYTE $0x5c
+	CALL _acintrp<>(SB); BYTE $0x5d
+	CALL _acintrp<>(SB); BYTE $0x5e
+	CALL _acintrp<>(SB); BYTE $0x5f
+	CALL _acintrp<>(SB); BYTE $0x60
+	CALL _acintrp<>(SB); BYTE $0x61
+	CALL _acintrp<>(SB); BYTE $0x62
+	CALL _acintrp<>(SB); BYTE $0x63
+	CALL _acintrp<>(SB); BYTE $0x64
+	CALL _acintrp<>(SB); BYTE $0x65
+	CALL _acintrp<>(SB); BYTE $0x66
+	CALL _acintrp<>(SB); BYTE $0x67
+	CALL _acintrp<>(SB); BYTE $0x68
+	CALL _acintrp<>(SB); BYTE $0x69
+	CALL _acintrp<>(SB); BYTE $0x6a
+	CALL _acintrp<>(SB); BYTE $0x6b
+	CALL _acintrp<>(SB); BYTE $0x6c
+	CALL _acintrp<>(SB); BYTE $0x6d
+	CALL _acintrp<>(SB); BYTE $0x6e
+	CALL _acintrp<>(SB); BYTE $0x6f
+	CALL _acintrp<>(SB); BYTE $0x70
+	CALL _acintrp<>(SB); BYTE $0x71
+	CALL _acintrp<>(SB); BYTE $0x72
+	CALL _acintrp<>(SB); BYTE $0x73
+	CALL _acintrp<>(SB); BYTE $0x74
+	CALL _acintrp<>(SB); BYTE $0x75
+	CALL _acintrp<>(SB); BYTE $0x76
+	CALL _acintrp<>(SB); BYTE $0x77
+	CALL _acintrp<>(SB); BYTE $0x78
+	CALL _acintrp<>(SB); BYTE $0x79
+	CALL _acintrp<>(SB); BYTE $0x7a
+	CALL _acintrp<>(SB); BYTE $0x7b
+	CALL _acintrp<>(SB); BYTE $0x7c
+	CALL _acintrp<>(SB); BYTE $0x7d
+	CALL _acintrp<>(SB); BYTE $0x7e
+	CALL _acintrp<>(SB); BYTE $0x7f
+	CALL _acintrp<>(SB); BYTE $0x80
+	CALL _acintrp<>(SB); BYTE $0x81
+	CALL _acintrp<>(SB); BYTE $0x82
+	CALL _acintrp<>(SB); BYTE $0x83
+	CALL _acintrp<>(SB); BYTE $0x84
+	CALL _acintrp<>(SB); BYTE $0x85
+	CALL _acintrp<>(SB); BYTE $0x86
+	CALL _acintrp<>(SB); BYTE $0x87
+	CALL _acintrp<>(SB); BYTE $0x88
+	CALL _acintrp<>(SB); BYTE $0x89
+	CALL _acintrp<>(SB); BYTE $0x8a
+	CALL _acintrp<>(SB); BYTE $0x8b
+	CALL _acintrp<>(SB); BYTE $0x8c
+	CALL _acintrp<>(SB); BYTE $0x8d
+	CALL _acintrp<>(SB); BYTE $0x8e
+	CALL _acintrp<>(SB); BYTE $0x8f
+	CALL _acintrp<>(SB); BYTE $0x90
+	CALL _acintrp<>(SB); BYTE $0x91
+	CALL _acintrp<>(SB); BYTE $0x92
+	CALL _acintrp<>(SB); BYTE $0x93
+	CALL _acintrp<>(SB); BYTE $0x94
+	CALL _acintrp<>(SB); BYTE $0x95
+	CALL _acintrp<>(SB); BYTE $0x96
+	CALL _acintrp<>(SB); BYTE $0x97
+	CALL _acintrp<>(SB); BYTE $0x98
+	CALL _acintrp<>(SB); BYTE $0x99
+	CALL _acintrp<>(SB); BYTE $0x9a
+	CALL _acintrp<>(SB); BYTE $0x9b
+	CALL _acintrp<>(SB); BYTE $0x9c
+	CALL _acintrp<>(SB); BYTE $0x9d
+	CALL _acintrp<>(SB); BYTE $0x9e
+	CALL _acintrp<>(SB); BYTE $0x9f
+	CALL _acintrp<>(SB); BYTE $0xa0
+	CALL _acintrp<>(SB); BYTE $0xa1
+	CALL _acintrp<>(SB); BYTE $0xa2
+	CALL _acintrp<>(SB); BYTE $0xa3
+	CALL _acintrp<>(SB); BYTE $0xa4
+	CALL _acintrp<>(SB); BYTE $0xa5
+	CALL _acintrp<>(SB); BYTE $0xa6
+	CALL _acintrp<>(SB); BYTE $0xa7
+	CALL _acintrp<>(SB); BYTE $0xa8
+	CALL _acintrp<>(SB); BYTE $0xa9
+	CALL _acintrp<>(SB); BYTE $0xaa
+	CALL _acintrp<>(SB); BYTE $0xab
+	CALL _acintrp<>(SB); BYTE $0xac
+	CALL _acintrp<>(SB); BYTE $0xad
+	CALL _acintrp<>(SB); BYTE $0xae
+	CALL _acintrp<>(SB); BYTE $0xaf
+	CALL _acintrp<>(SB); BYTE $0xb0
+	CALL _acintrp<>(SB); BYTE $0xb1
+	CALL _acintrp<>(SB); BYTE $0xb2
+	CALL _acintrp<>(SB); BYTE $0xb3
+	CALL _acintrp<>(SB); BYTE $0xb4
+	CALL _acintrp<>(SB); BYTE $0xb5
+	CALL _acintrp<>(SB); BYTE $0xb6
+	CALL _acintrp<>(SB); BYTE $0xb7
+	CALL _acintrp<>(SB); BYTE $0xb8
+	CALL _acintrp<>(SB); BYTE $0xb9
+	CALL _acintrp<>(SB); BYTE $0xba
+	CALL _acintrp<>(SB); BYTE $0xbb
+	CALL _acintrp<>(SB); BYTE $0xbc
+	CALL _acintrp<>(SB); BYTE $0xbd
+	CALL _acintrp<>(SB); BYTE $0xbe
+	CALL _acintrp<>(SB); BYTE $0xbf
+	CALL _acintrp<>(SB); BYTE $0xc0
+	CALL _acintrp<>(SB); BYTE $0xc1
+	CALL _acintrp<>(SB); BYTE $0xc2
+	CALL _acintrp<>(SB); BYTE $0xc3
+	CALL _acintrp<>(SB); BYTE $0xc4
+	CALL _acintrp<>(SB); BYTE $0xc5
+	CALL _acintrp<>(SB); BYTE $0xc6
+	CALL _acintrp<>(SB); BYTE $0xc7
+	CALL _acintrp<>(SB); BYTE $0xc8
+	CALL _acintrp<>(SB); BYTE $0xc9
+	CALL _acintrp<>(SB); BYTE $0xca
+	CALL _acintrp<>(SB); BYTE $0xcb
+	CALL _acintrp<>(SB); BYTE $0xcc
+	CALL _acintrp<>(SB); BYTE $0xcd
+	CALL _acintrp<>(SB); BYTE $0xce
+	CALL _acintrp<>(SB); BYTE $0xcf
+	CALL _acintrp<>(SB); BYTE $0xd0
+	CALL _acintrp<>(SB); BYTE $0xd1
+	CALL _acintrp<>(SB); BYTE $0xd2
+	CALL _acintrp<>(SB); BYTE $0xd3
+	CALL _acintrp<>(SB); BYTE $0xd4
+	CALL _acintrp<>(SB); BYTE $0xd5
+	CALL _acintrp<>(SB); BYTE $0xd6
+	CALL _acintrp<>(SB); BYTE $0xd7
+	CALL _acintrp<>(SB); BYTE $0xd8
+	CALL _acintrp<>(SB); BYTE $0xd9
+	CALL _acintrp<>(SB); BYTE $0xda
+	CALL _acintrp<>(SB); BYTE $0xdb
+	CALL _acintrp<>(SB); BYTE $0xdc
+	CALL _acintrp<>(SB); BYTE $0xdd
+	CALL _acintrp<>(SB); BYTE $0xde
+	CALL _acintrp<>(SB); BYTE $0xdf
+	CALL _acintrp<>(SB); BYTE $0xe0
+	CALL _acintrp<>(SB); BYTE $0xe1
+	CALL _acintrp<>(SB); BYTE $0xe2
+	CALL _acintrp<>(SB); BYTE $0xe3
+	CALL _acintrp<>(SB); BYTE $0xe4
+	CALL _acintrp<>(SB); BYTE $0xe5
+	CALL _acintrp<>(SB); BYTE $0xe6
+	CALL _acintrp<>(SB); BYTE $0xe7
+	CALL _acintrp<>(SB); BYTE $0xe8
+	CALL _acintrp<>(SB); BYTE $0xe9
+	CALL _acintrp<>(SB); BYTE $0xea
+	CALL _acintrp<>(SB); BYTE $0xeb
+	CALL _acintrp<>(SB); BYTE $0xec
+	CALL _acintrp<>(SB); BYTE $0xed
+	CALL _acintrp<>(SB); BYTE $0xee
+	CALL _acintrp<>(SB); BYTE $0xef
+	CALL _acintrp<>(SB); BYTE $0xf0
+	CALL _acintrp<>(SB); BYTE $0xf1
+	CALL _acintrp<>(SB); BYTE $0xf2
+	CALL _acintrp<>(SB); BYTE $0xf3
+	CALL _acintrp<>(SB); BYTE $0xf4
+	CALL _acintrp<>(SB); BYTE $0xf5
+	CALL _acintrp<>(SB); BYTE $0xf6
+	CALL _acintrp<>(SB); BYTE $0xf7
+	CALL _acintrp<>(SB); BYTE $0xf8
+	CALL _acintrp<>(SB); BYTE $0xf9
+	CALL _acintrp<>(SB); BYTE $0xfa
+	CALL _acintrp<>(SB); BYTE $0xfb
+	CALL _acintrp<>(SB); BYTE $0xfc
+	CALL _acintrp<>(SB); BYTE $0xfd
+	CALL _acintrp<>(SB); BYTE $0xfe
+	CALL _acintrp<>(SB); BYTE $0xff

+ 80 - 0
sys/src/9/k10/l64acsyscall.s

@@ -0,0 +1,80 @@
+#include "mem.h"
+#include "amd64l.h"
+
+MODE $64
+
+/*
+ * System call entry stub that hands off to acsyscall().
+ * (Presumably the SYSCALL target used on application cores, going by
+ * the "ac" prefix -- confirm against the code that installs it.)
+ * On entry CX holds the user return ip and R11 the user flags, as left
+ * by the SYSCALL instruction, and SP is still the user stack pointer.
+ * The user context is recorded in the frame at m->proc->dbgregs, SP is
+ * moved to the top of the m->stack area, and acsyscall() is called;
+ * it is not expected to return.
+ */
+TEXT acsyscallentry(SB), 1, $-4
+	SWAPGS					/* exchange GS base so that GS:0 addresses m */
+	BYTE $0x65; MOVQ 0, RMACH		/* m-> (MOVQ GS:0x0, R15) */
+	MOVQ	16(RMACH), RUSER		/* m->proc */
+	MOVQ	24(RUSER), R12		/* m->proc->dbgregs */
+
+	/* save sp to r13; set up kstack so we can call acsyscall */
+	MOVQ	SP, R13
+	MOVQ	24(RMACH), SP			/* m->stack */
+	ADDQ	$MACHSTKSZ, SP			/* start at the high end of the stack area */
+
+	MOVQ	$SSEL(SiUDS, SsRPL3), BX		/* old stack segment */
+	MOVQ	BX, 176(R12)				/* save ss */
+	MOVQ	R13, 168(R12)				/* old sp */
+	MOVQ	R11, 160(R12)				/* old flags */
+	MOVQ	$SSEL(SiUCS, SsRPL3), BX		/* old code segment */
+	MOVQ	BX, 152(R12)				/* save cs */
+	MOVQ	CX, 144(R12)				/* old ip */
+
+	MOVW	$SSEL(SiUDS, SsRPL3), 120(R12)	/* ds slot: the user data selector, not the live DS */
+	MOVW	ES,  122(R12)
+	MOVW	FS,  124(R12)
+	MOVW	GS,  126(R12)
+
+	MOVQ	RARG, 	0(R12)			/* system call number: up->dbgregs->ax  */
+	CALL	acsyscall(SB)
+NDNR:	JMP NDNR				/* spin here should acsyscall ever return */
+
+TEXT _acsysret(SB), 1, $-4			/* return to user mode via SYSRET using the frame at dbgregs */
+	CLI					/* no interrupts while user state is being reloaded */
+	SWAPGS
+
+	MOVQ	24(RUSER), R12			/* m->proc->dbgregs; RUSER is not reloaded here, it must still be m->proc */
+	MOVQ	0(R12), AX			/* m->proc->dbgregs->ax */
+	MOVQ	(6*8)(R12),	BP		/* m->proc->dbgregs->bp */
+	ADDQ	$(15*8), R12			/* after ax--r15, 8 bytes each */
+
+	MOVW	0(R12), DS			/* four 16-bit selectors, in the order acsyscallentry stored them */
+	MOVW	2(R12), ES
+	MOVW	4(R12), FS
+	MOVW	6(R12), GS
+
+	MOVQ	24(R12), CX			/* ip: SYSRET resumes at the address in CX */
+	MOVQ	40(R12), R11			/* flags: SYSRET reloads the flags from R11 */
+
+	MOVQ	48(R12), SP			/* sp */
+
+	BYTE $0x48; SYSRET			/* SYSRETQ */
+
+/*
+ * Return from an exec() system call that we never did,
+ * DX is ar0->p by the time we call it. See syscall()
+ *
+ * Enters user mode with SYSRET: the ip and the AX value come from
+ * the frame at m->proc->dbgregs, all four data segment registers are
+ * set to the user data selector, the flags are set to $If and the
+ * new stack pointer is taken from RARG.
+ */
+TEXT xactouser(SB), 1, $-4
+	CLI
+	BYTE $0x65; MOVQ 0, RMACH		/* m-> (MOVQ GS:0x0, R15) */
+	MOVQ	16(RMACH), RUSER		/* m->proc */
+	MOVQ	24(RUSER), R12			/* m->proc->dbgregs */
+	MOVQ	144(R12), CX			/* old ip */
+	MOVQ	0(R12), BX				/* saved ax, held in BX while AX is used as scratch */
+	SWAPGS					/* after the last GS-relative access to m */
+	MOVQ	$SSEL(SiUDS, SsRPL3), AX
+	MOVW	AX, DS
+	MOVW	AX, ES
+	MOVW	AX, FS
+	MOVW	AX, GS
+
+	MOVQ	BX, AX			/* restore AX */
+	MOVQ	$If, R11			/* flags: SYSRET reloads the flags from R11 */
+
+	MOVQ	RARG, SP			/* sp */
+
+	BYTE $0x48; SYSRET			/* SYSRETQ */

+ 26 - 0
sys/src/9/k10/l64cpuid.s

@@ -0,0 +1,26 @@
+/*
+ * The CPUID instruction is always supported on the amd64.
+ *
+ * Arguments: the function number (in RARG), a second value loaded
+ * into CX before the instruction (cx+8(FP)), and a pointer (info+16(FP))
+ * to four consecutive 32-bit words that receive AX, BX, CX and DX,
+ * in that order, as left by CPUID.
+ */
+TEXT cpuid(SB), $-4
+	MOVL	RARG, AX			/* function in AX */
+	MOVLQZX	cx+8(FP), CX			/* iterator/index/etc. */
+
+	CPUID
+
+	MOVQ	info+16(FP), BP			/* BP is free as a scratch pointer here */
+	MOVL	AX, 0(BP)
+	MOVL	BX, 4(BP)
+	MOVL	CX, 8(BP)
+	MOVL	DX, 12(BP)
+	RET
+
+/*
+ * Basic timing loop to determine CPU frequency.
+ * The AAM instruction is not available in 64-bit mode.
+ *
+ * The argument in RARG is the iteration count, zero-extended from
+ * 32 bits.  Each iteration is one XORQ plus the LOOP instruction.
+ * LOOP decrements CX before testing it, so a count of 0 does not
+ * mean zero iterations.
+ */
+TEXT aamloop(SB), 1, $-4
+	MOVLQZX	RARG, CX			/* LOOP counts in CX */
+aaml1:
+	XORQ	AX, AX				/* close enough */
+	LOOP	aaml1
+	RET

+ 46 - 0
sys/src/9/k10/l64fpu.s

@@ -0,0 +1,46 @@
+/*
+ * SIMD Floating Point.
+ * Note: for x87 instructions which have both a 'wait' and 'nowait' version,
+ * 8a only knows the 'wait' mnemonic but does NOT insert the WAIT prefix byte
+ * (i.e. they act like their FNxxx variations) so WAIT instructions must be
+ * explicitly placed in the code if necessary.
+ */
+TEXT _clts(SB), 1, $-4				/* Clear the Task-Switched flag */
+	CLTS					/* clears CR0.TS */
+	RET
+
+TEXT _fldcw(SB), 1, $-4				/* Load x87 FPU Control Word */
+	MOVQ	RARG, cw+0(FP)			/* FLDCW takes a memory operand: spill the argument first */
+	FLDCW	cw+0(FP)
+	RET
+
+TEXT _fnclex(SB), 1, $-4			/* Clear x87 exception flags */
+	FCLEX					/* no WAIT */
+	RET
+
+TEXT _fninit(SB), 1, $-4			/* Initialize x87 FPU */
+	FINIT					/* no WAIT */
+	RET
+
+TEXT _fxrstor(SB), 1, $-4			/* Restore x87/SSE state */
+	FXRSTOR64 (RARG)			/* from the save area the argument points to */
+	RET
+
+TEXT _fxsave(SB), 1, $-4			/* Save x87/SSE state */
+	FXSAVE64 (RARG)				/* into the save area the argument points to */
+	RET
+
+TEXT _fwait(SB), 1, $-4				/* Check for pending x87 exceptions */
+	WAIT					/* the explicit WAIT the other x87 routines here omit */
+	RET
+
+TEXT _ldmxcsr(SB), 1, $-4			/* Load MXCSR */
+	MOVQ	RARG, mxcsr+0(FP)		/* LDMXCSR takes a memory operand: spill the argument first */
+	LDMXCSR	mxcsr+0(FP)
+	RET
+
+TEXT _stts(SB), 1, $-4				/* Set the Task-Switched flag (counterpart of _clts) */
+	MOVQ	CR0, AX				/* read-modify-write of CR0 */
+	ORQ	$8, AX				/* Ts */
+	MOVQ	AX, CR0
+	RET

+ 344 - 0
sys/src/9/k10/l64idt.s

@@ -0,0 +1,344 @@
+/*
+ * Interrupt/exception handling.
+ */
+#include "amd64l.h"
+
+MODE $64
+
+TEXT _intrp<>(SB), 1, $-4			/* no error code pushed */
+	PUSHQ	AX				/* save AX; the CALL's return PC above it now sits where an error code would */
+	MOVQ	8(SP), AX			/* idthandlers(SB) PC */
+	JMP	_intrcommon			/* continue in the code shared with _intre */
+
+TEXT _intre<>(SB), 1, $-4			/* error code pushed */
+	XCHGQ	AX, (SP)			/* AX = idthandlers(SB) PC; old AX takes its place on the stack */
+_intrcommon:
+	MOVBQZX	(AX), AX			/* vector number: the BYTE that follows the CALL */
+	XCHGQ	AX, (SP)			/* AX restored; vector number left on the stack */
+
+	SUBQ	$24, SP				/* R1[45], [DEFG]S */
+	CMPW	48(SP), $SSEL(SiCS, SsTIGDT|SsRPL0)	/* old CS: above the 24 bytes, vector, error code and ip */
+	JEQ	_intrnested			/* came from the kernel: leave RUSER, RMACH and GS alone; the 24 bytes stay unwritten */
+
+	MOVQ	RUSER, 0(SP)
+	MOVQ	RMACH, 8(SP)
+	MOVW	DS, 16(SP)
+	MOVW	ES, 18(SP)
+	MOVW	FS, 20(SP)
+	MOVW	GS, 22(SP)
+
+	SWAPGS
+	BYTE $0x65; MOVQ 0, RMACH		/* m-> (MOVQ GS:0x0, R15) */
+	MOVQ	16(RMACH), RUSER		/* up */
+
+_intrnested:
+	PUSHQ	R13				/* remaining registers, R13 down to AX; _intrr pops them in reverse */
+	PUSHQ	R12
+	PUSHQ	R11
+	PUSHQ	R10
+	PUSHQ	R9
+	PUSHQ	R8
+	PUSHQ	BP
+	PUSHQ	DI
+	PUSHQ	SI
+	PUSHQ	DX
+	PUSHQ	CX
+	PUSHQ	BX
+	PUSHQ	AX
+
+	MOVQ	SP, RARG			/* argument for _trap: address of the saved registers */
+	PUSHQ	SP				/* extra word, discarded by the first POPQ in _intrr */
+	CALL	_trap(SB)			/* returns into _intrr, which follows immediately */
+
+TEXT _intrr<>(SB), 1, $-4			/* so ktrace can pop frame */
+	POPQ	AX				/* discard the word pushed just before CALL _trap */
+
+	POPQ	AX				/* reverse of the pushes at _intrnested */
+	POPQ	BX
+	POPQ	CX
+	POPQ	DX
+	POPQ	SI
+	POPQ	DI
+	POPQ	BP
+	POPQ	R8
+	POPQ	R9
+	POPQ	R10
+	POPQ	R11
+	POPQ	R12
+	POPQ	R13
+
+	CMPQ	48(SP), $SSEL(SiCS, SsTIGDT|SsRPL0)	/* old CS; NOTE(review): entry tests this slot with CMPW, here all 64 bits are compared -- confirm the upper bits are always zero */
+	JEQ	_iretnested			/* returning to the kernel: nothing was saved in the 24 bytes at 0(SP) */
+
+	SWAPGS
+	/*	per the architecture manual, moving 16 bits to FS can zero it. Bad ...
+	 *	not restoring it gives back the bad segment selector bug
+	 */
+	MOVW	22(SP), GS
+	MOVW	20(SP), FS
+	MOVW	18(SP), ES
+	MOVW	16(SP), DS
+	MOVQ	8(SP), RMACH
+	MOVQ	0(SP), RUSER
+
+_iretnested:
+	ADDQ	$40, SP				/* drop RUSER, RMACH, the segment word, vector number and error code */
+	IRETQ
+
+/*
+ * Interrupt entry stubs, one per vector, in vector order.
+ * Each stub is a CALL to the common entry code followed by one BYTE
+ * holding the vector number.  The entry code never returns here; it
+ * uses the return PC pushed by the CALL to fetch that byte.
+ * _intre is used for the exceptions for which the processor pushes
+ * an error code, _intrp for everything else.
+ * The BYTE must equal the stub's position in the table, otherwise
+ * the interrupt is reported under the wrong vector number.
+ */
+TEXT idthandlers(SB), 1, $-4
+	CALL _intrp<>(SB); BYTE $IdtDE		/* #DE Divide-by-Zero Error */
+	CALL _intrp<>(SB); BYTE $IdtDB		/* #DB Debug */
+	CALL _intrp<>(SB); BYTE $IdtNMI		/* #NMI Borked */
+	CALL _intrp<>(SB); BYTE $IdtBP		/* #BP Breakpoint */
+	CALL _intrp<>(SB); BYTE $IdtOF		/* #OF Overflow */
+	CALL _intrp<>(SB); BYTE $IdtBR		/* #BR Bound-Range */
+	CALL _intrp<>(SB); BYTE $IdtUD		/* #UD Invalid-Opcode */
+	CALL _intrp<>(SB); BYTE $IdtNM		/* #NM Device-Not-Available */
+	CALL _intre<>(SB); BYTE $IdtDF		/* #DF Double-Fault */
+	CALL _intrp<>(SB); BYTE $Idt09		/* reserved */
+	CALL _intre<>(SB); BYTE $IdtTS		/* #TS Invalid-TSS */
+	CALL _intre<>(SB); BYTE $IdtNP		/* #NP Segment-Not-Present */
+	CALL _intre<>(SB); BYTE $IdtSS		/* #SS Stack */
+	CALL _intre<>(SB); BYTE $IdtGP		/* #GP General-Protection */
+	CALL _intre<>(SB); BYTE $IdtPF		/* #PF Page-Fault */
+	CALL _intrp<>(SB); BYTE $Idt0F		/* reserved */
+	CALL _intrp<>(SB); BYTE $IdtMF		/* #MF x87 FPE-Pending */
+	CALL _intre<>(SB); BYTE $IdtAC		/* #AC Alignment-Check */
+	CALL _intrp<>(SB); BYTE $IdtMC		/* #MC Machine-Check */
+	CALL _intrp<>(SB); BYTE $IdtXF		/* #XF SIMD Floating-Point */
+	CALL _intrp<>(SB); BYTE $0x14		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x15		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x16		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x17		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x18		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x19		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x1a		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x1b		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x1c		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x1d		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x1e		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x1f		/* reserved */
+	CALL _intrp<>(SB); BYTE $0x20
+	CALL _intrp<>(SB); BYTE $0x21
+	CALL _intrp<>(SB); BYTE $0x22
+	CALL _intrp<>(SB); BYTE $0x23
+	CALL _intrp<>(SB); BYTE $0x24
+	CALL _intrp<>(SB); BYTE $0x25
+	CALL _intrp<>(SB); BYTE $0x26
+	CALL _intrp<>(SB); BYTE $0x27
+	CALL _intrp<>(SB); BYTE $0x28
+	CALL _intrp<>(SB); BYTE $0x29
+	CALL _intrp<>(SB); BYTE $0x2a
+	CALL _intrp<>(SB); BYTE $0x2b
+	CALL _intrp<>(SB); BYTE $0x2c
+	CALL _intrp<>(SB); BYTE $0x2d
+	CALL _intrp<>(SB); BYTE $0x2e
+	CALL _intrp<>(SB); BYTE $0x2f
+	CALL _intrp<>(SB); BYTE $0x30
+	CALL _intrp<>(SB); BYTE $0x31
+	CALL _intrp<>(SB); BYTE $0x32
+	CALL _intrp<>(SB); BYTE $0x33
+	CALL _intrp<>(SB); BYTE $0x34
+	CALL _intrp<>(SB); BYTE $0x35
+	CALL _intrp<>(SB); BYTE $0x36
+	CALL _intrp<>(SB); BYTE $0x37
+	CALL _intrp<>(SB); BYTE $0x38
+	CALL _intrp<>(SB); BYTE $0x39
+	CALL _intrp<>(SB); BYTE $0x3a
+	CALL _intrp<>(SB); BYTE $0x3b
+	CALL _intrp<>(SB); BYTE $0x3c
+	CALL _intrp<>(SB); BYTE $0x3d
+	CALL _intrp<>(SB); BYTE $0x3e
+	CALL _intrp<>(SB); BYTE $0x3f
+	CALL _intrp<>(SB); BYTE $0x40
+	CALL _intrp<>(SB); BYTE $0x41
+	CALL _intrp<>(SB); BYTE $0x42
+	CALL _intrp<>(SB); BYTE $0x43
+	CALL _intrp<>(SB); BYTE $0x44
+	CALL _intrp<>(SB); BYTE $0x45
+	CALL _intrp<>(SB); BYTE $0x46
+	CALL _intrp<>(SB); BYTE $0x47
+	CALL _intrp<>(SB); BYTE $0x48
+	CALL _intrp<>(SB); BYTE $0x49
+	CALL _intrp<>(SB); BYTE $0x4a
+	CALL _intrp<>(SB); BYTE $0x4b
+	CALL _intrp<>(SB); BYTE $0x4c
+	CALL _intrp<>(SB); BYTE $0x4d
+	CALL _intrp<>(SB); BYTE $0x4e
+	CALL _intrp<>(SB); BYTE $0x4f
+	CALL _intrp<>(SB); BYTE $0x50
+	CALL _intrp<>(SB); BYTE $0x51
+	CALL _intrp<>(SB); BYTE $0x52
+	CALL _intrp<>(SB); BYTE $0x53
+	CALL _intrp<>(SB); BYTE $0x54
+	CALL _intrp<>(SB); BYTE $0x55
+	CALL _intrp<>(SB); BYTE $0x56
+	CALL _intrp<>(SB); BYTE $0x57
+	CALL _intrp<>(SB); BYTE $0x58
+	CALL _intrp<>(SB); BYTE $0x59
+	CALL _intrp<>(SB); BYTE $0x5a
+	CALL _intrp<>(SB); BYTE $0x5b
+	CALL _intrp<>(SB); BYTE $0x5c
+	CALL _intrp<>(SB); BYTE $0x5d
+	CALL _intrp<>(SB); BYTE $0x5e
+	CALL _intrp<>(SB); BYTE $0x5f
+	CALL _intrp<>(SB); BYTE $0x60
+	CALL _intrp<>(SB); BYTE $0x61
+	CALL _intrp<>(SB); BYTE $0x62
+	CALL _intrp<>(SB); BYTE $0x63
+	CALL _intrp<>(SB); BYTE $0x64
+	CALL _intrp<>(SB); BYTE $0x65
+	CALL _intrp<>(SB); BYTE $0x66
+	CALL _intrp<>(SB); BYTE $0x67
+	CALL _intrp<>(SB); BYTE $0x68
+	CALL _intrp<>(SB); BYTE $0x69
+	CALL _intrp<>(SB); BYTE $0x6a
+	CALL _intrp<>(SB); BYTE $0x6b
+	CALL _intrp<>(SB); BYTE $0x6c
+	CALL _intrp<>(SB); BYTE $0x6d
+	CALL _intrp<>(SB); BYTE $0x6e
+	CALL _intrp<>(SB); BYTE $0x6f
+	CALL _intrp<>(SB); BYTE $0x70
+	CALL _intrp<>(SB); BYTE $0x71
+	CALL _intrp<>(SB); BYTE $0x72
+	CALL _intrp<>(SB); BYTE $0x73
+	CALL _intrp<>(SB); BYTE $0x74
+	CALL _intrp<>(SB); BYTE $0x75
+	CALL _intrp<>(SB); BYTE $0x76
+	CALL _intrp<>(SB); BYTE $0x77
+	CALL _intrp<>(SB); BYTE $0x78
+	CALL _intrp<>(SB); BYTE $0x79
+	CALL _intrp<>(SB); BYTE $0x7a
+	CALL _intrp<>(SB); BYTE $0x7b
+	CALL _intrp<>(SB); BYTE $0x7c
+	CALL _intrp<>(SB); BYTE $0x7d
+	CALL _intrp<>(SB); BYTE $0x7e
+	CALL _intrp<>(SB); BYTE $0x7f
+	CALL _intrp<>(SB); BYTE $0x80
+	CALL _intrp<>(SB); BYTE $0x81
+	CALL _intrp<>(SB); BYTE $0x82
+	CALL _intrp<>(SB); BYTE $0x83
+	CALL _intrp<>(SB); BYTE $0x84
+	CALL _intrp<>(SB); BYTE $0x85
+	CALL _intrp<>(SB); BYTE $0x86
+	CALL _intrp<>(SB); BYTE $0x87
+	CALL _intrp<>(SB); BYTE $0x88
+	CALL _intrp<>(SB); BYTE $0x89
+	CALL _intrp<>(SB); BYTE $0x8a
+	CALL _intrp<>(SB); BYTE $0x8b
+	CALL _intrp<>(SB); BYTE $0x8c
+	CALL _intrp<>(SB); BYTE $0x8d
+	CALL _intrp<>(SB); BYTE $0x8e
+	CALL _intrp<>(SB); BYTE $0x8f
+	CALL _intrp<>(SB); BYTE $0x90
+	CALL _intrp<>(SB); BYTE $0x91
+	CALL _intrp<>(SB); BYTE $0x92
+	CALL _intrp<>(SB); BYTE $0x93
+	CALL _intrp<>(SB); BYTE $0x94
+	CALL _intrp<>(SB); BYTE $0x95
+	CALL _intrp<>(SB); BYTE $0x96
+	CALL _intrp<>(SB); BYTE $0x97
+	CALL _intrp<>(SB); BYTE $0x98
+	CALL _intrp<>(SB); BYTE $0x99
+	CALL _intrp<>(SB); BYTE $0x9a
+	CALL _intrp<>(SB); BYTE $0x9b
+	CALL _intrp<>(SB); BYTE $0x9c
+	CALL _intrp<>(SB); BYTE $0x9d
+	CALL _intrp<>(SB); BYTE $0x9e
+	CALL _intrp<>(SB); BYTE $0x9f
+	CALL _intrp<>(SB); BYTE $0xa0
+	CALL _intrp<>(SB); BYTE $0xa1
+	CALL _intrp<>(SB); BYTE $0xa2
+	CALL _intrp<>(SB); BYTE $0xa3
+	CALL _intrp<>(SB); BYTE $0xa4
+	CALL _intrp<>(SB); BYTE $0xa5
+	CALL _intrp<>(SB); BYTE $0xa6
+	CALL _intrp<>(SB); BYTE $0xa7
+	CALL _intrp<>(SB); BYTE $0xa8
+	CALL _intrp<>(SB); BYTE $0xa9
+	CALL _intrp<>(SB); BYTE $0xaa
+	CALL _intrp<>(SB); BYTE $0xab
+	CALL _intrp<>(SB); BYTE $0xac
+	CALL _intrp<>(SB); BYTE $0xad
+	CALL _intrp<>(SB); BYTE $0xae
+	CALL _intrp<>(SB); BYTE $0xaf
+	CALL _intrp<>(SB); BYTE $0xb0
+	CALL _intrp<>(SB); BYTE $0xb1
+	CALL _intrp<>(SB); BYTE $0xb2
+	CALL _intrp<>(SB); BYTE $0xb3
+	CALL _intrp<>(SB); BYTE $0xb4
+	CALL _intrp<>(SB); BYTE $0xb5
+	CALL _intrp<>(SB); BYTE $0xb6
+	CALL _intrp<>(SB); BYTE $0xb7
+	CALL _intrp<>(SB); BYTE $0xb8
+	CALL _intrp<>(SB); BYTE $0xb9
+	CALL _intrp<>(SB); BYTE $0xba
+	CALL _intrp<>(SB); BYTE $0xbb
+	CALL _intrp<>(SB); BYTE $0xbc
+	CALL _intrp<>(SB); BYTE $0xbd
+	CALL _intrp<>(SB); BYTE $0xbe
+	CALL _intrp<>(SB); BYTE $0xbf
+	CALL _intrp<>(SB); BYTE $0xc0
+	CALL _intrp<>(SB); BYTE $0xc1
+	CALL _intrp<>(SB); BYTE $0xc2
+	CALL _intrp<>(SB); BYTE $0xc3
+	CALL _intrp<>(SB); BYTE $0xc4
+	CALL _intrp<>(SB); BYTE $0xc5
+	CALL _intrp<>(SB); BYTE $0xc6
+	CALL _intrp<>(SB); BYTE $0xc7
+	CALL _intrp<>(SB); BYTE $0xc8
+	CALL _intrp<>(SB); BYTE $0xc9
+	CALL _intrp<>(SB); BYTE $0xca
+	CALL _intrp<>(SB); BYTE $0xcb
+	CALL _intrp<>(SB); BYTE $0xcc
+	CALL _intrp<>(SB); BYTE $0xcd
+	CALL _intrp<>(SB); BYTE $0xce
+	CALL _intrp<>(SB); BYTE $0xcf
+	CALL _intrp<>(SB); BYTE $0xd0
+	CALL _intrp<>(SB); BYTE $0xd1
+	CALL _intrp<>(SB); BYTE $0xd2
+	CALL _intrp<>(SB); BYTE $0xd3
+	CALL _intrp<>(SB); BYTE $0xd4
+	CALL _intrp<>(SB); BYTE $0xd5
+	CALL _intrp<>(SB); BYTE $0xd6
+	CALL _intrp<>(SB); BYTE $0xd7
+	CALL _intrp<>(SB); BYTE $0xd8
+	CALL _intrp<>(SB); BYTE $0xd9
+	CALL _intrp<>(SB); BYTE $0xda
+	CALL _intrp<>(SB); BYTE $0xdb
+	CALL _intrp<>(SB); BYTE $0xdc
+	CALL _intrp<>(SB); BYTE $0xdd
+	CALL _intrp<>(SB); BYTE $0xde
+	CALL _intrp<>(SB); BYTE $0xdf
+	CALL _intrp<>(SB); BYTE $0xe0
+	CALL _intrp<>(SB); BYTE $0xe1
+	CALL _intrp<>(SB); BYTE $0xe2
+	CALL _intrp<>(SB); BYTE $0xe3
+	CALL _intrp<>(SB); BYTE $0xe4
+	CALL _intrp<>(SB); BYTE $0xe5
+	CALL _intrp<>(SB); BYTE $0xe6
+	CALL _intrp<>(SB); BYTE $0xe7
+	CALL _intrp<>(SB); BYTE $0xe8
+	CALL _intrp<>(SB); BYTE $0xe9
+	CALL _intrp<>(SB); BYTE $0xea
+	CALL _intrp<>(SB); BYTE $0xeb
+	CALL _intrp<>(SB); BYTE $0xec
+	CALL _intrp<>(SB); BYTE $0xed
+	CALL _intrp<>(SB); BYTE $0xee
+	CALL _intrp<>(SB); BYTE $0xef
+	CALL _intrp<>(SB); BYTE $0xf0
+	CALL _intrp<>(SB); BYTE $0xf1
+	CALL _intrp<>(SB); BYTE $0xf2
+	CALL _intrp<>(SB); BYTE $0xf3
+	CALL _intrp<>(SB); BYTE $0xf4
+	CALL _intrp<>(SB); BYTE $0xf5
+	CALL _intrp<>(SB); BYTE $0xf6
+	CALL _intrp<>(SB); BYTE $0xf7
+	CALL _intrp<>(SB); BYTE $0xf8
+	CALL _intrp<>(SB); BYTE $0xf9
+	CALL _intrp<>(SB); BYTE $0xfa
+	CALL _intrp<>(SB); BYTE $0xfb
+	CALL _intrp<>(SB); BYTE $0xfc
+	CALL _intrp<>(SB); BYTE $0xfd
+	CALL _intrp<>(SB); BYTE $0xfe
+	CALL _intrp<>(SB); BYTE $0xff

Some files were not shown because too many files changed in this diff