Browse Source

ipnew.. let's see if gerrit works like this :)

Change-Id: I9d8e709958b2e603c14d5b4e616d8da296640fbd
Aki Nyrhinen 8 years ago
parent
commit
dd834e398e

+ 28 - 24
sys/src/9/ip/arp.c

@@ -57,6 +57,7 @@ char *Ebadarp = "bad arp";
 #define haship(s) ((s)[IPaddrlen-1]%NHASH)
 
 int 	ReTransTimer = RETRANS_TIMER;
+
 static void 	rxmitproc(void *v);
 
 void
@@ -78,6 +79,7 @@ newarp6(Arp *arp, uint8_t *ip, Ipifc *ifc, int addrxt)
 	uint t;
 	Block *next, *xp;
 	Arpent *a, *e, *f, **l;
+	Medium *medium = ifc->medium;
 	int empty;
 
 	/* find oldest entry */
@@ -134,7 +136,7 @@ newarp6(Arp *arp, uint8_t *ip, Ipifc *ifc, int addrxt)
 	memmove(a->ip, ip, sizeof(a->ip));
 	a->utime = NOW;
 	a->ctime = 0;
-	a->type = ifc->medium;
+	a->type = medium;
 
 	a->rtime = NOW + ReTransTimer;
 	a->rxtsrem = MAX_MULTICAST_SOLICIT;
@@ -211,12 +213,11 @@ cleanarpent(Arp *arp, Arpent *a)
  *  waiting for ip->mac to be resolved.
  */
 Arpent*
-arpget(Arp *arp, Block *bp, int version, Ipifc *ifc, uint8_t *ip,
-       uint8_t *mac)
+arpget(Arp *arp, Block *bp, int version, Ipifc *ifc, uint8_t *ip, uint8_t *mac)
 {
 	int hash;
 	Arpent *a;
-	Medium *type;
+	Medium *type = ifc->medium;
 	uint8_t v6ip[IPaddrlen];
 
 	if(version == V4){
@@ -226,7 +227,6 @@ arpget(Arp *arp, Block *bp, int version, Ipifc *ifc, uint8_t *ip,
 
 	qlock(arp);
 	hash = haship(ip);
-	type = ifc->medium;
 	for(a = arp->hash[hash]; a; a = a->hash){
 		if(memcmp(ip, a->ip, sizeof(a->ip)) == 0)
 		if(type == a->type)
@@ -264,7 +264,7 @@ arpget(Arp *arp, Block *bp, int version, Ipifc *ifc, uint8_t *ip,
  * called with arp locked
  */
 void
-arprelease(Arp *arp, Arpent* arpen)
+arprelease(Arp *arp, Arpent *arpen)
 {
 	qunlock(arp);
 }
@@ -304,8 +304,7 @@ arpresolve(Arp *arp, Arpent *a, Medium *type, uint8_t *mac)
 }
 
 void
-arpenter(Fs *fs, int version, uint8_t *ip, uint8_t *mac, int n,
-	 int refresh)
+arpenter(Fs *fs, int version, uint8_t *ip, uint8_t *mac, int n, int refresh)
 {
 	Mach *m = machp();
 	Arp *arp;
@@ -417,7 +416,7 @@ arpwrite(Fs *fs, char *s, int len)
 	Arp *arp;
 	Block *bp;
 	Arpent *a, *fl, **l;
-	Medium *type;
+	Medium *medium;
 	char *f[4], buf[256];
 	uint8_t ip[IPaddrlen], mac[MAClen];
 
@@ -458,34 +457,37 @@ arpwrite(Fs *fs, char *s, int len)
 		default:
 			error(Ebadarg);
 		case 3:
-			parseip(ip, f[1]);
+			if (parseip(ip, f[1]) == -1)
+				error(Ebadip);
 			if(isv4(ip))
 				r = v4lookup(fs, ip+IPv4off, nil);
 			else
 				r = v6lookup(fs, ip, nil);
 			if(r == nil)
 				error("Destination unreachable");
-			type = r->ifc->medium;
-			n = parsemac(mac, f[2], type->maclen);
+			medium = r->ifc->medium;
+			n = parsemac(mac, f[2], medium->maclen);
 			break;
 		case 4:
-			type = ipfindmedium(f[1]);
-			if(type == nil)
+			medium = ipfindmedium(f[1]);
+			if(medium == nil)
 				error(Ebadarp);
-			parseip(ip, f[2]);
-			n = parsemac(mac, f[3], type->maclen);
+			if (parseip(ip, f[2]) == -1)
+				error(Ebadip);
+			n = parsemac(mac, f[3], medium->maclen);
 			break;
 		}
 
-		if(type->ares == nil)
+		if(medium->ares == nil)
 			error(Ebadarp);
 
-		type->ares(fs, V6, ip, mac, n, 0);
+		medium->ares(fs, V6, ip, mac, n, 0);
 	} else if(strcmp(f[0], "del") == 0){
 		if(n != 2)
 			error(Ebadarg);
 
-		parseip(ip, f[1]);
+		if (parseip(ip, f[1]) == -1)
+			error(Ebadip);
 		qlock(arp);
 
 		l = &arp->hash[haship(ip)];
@@ -531,10 +533,10 @@ enum
 char *aformat = "%-6.6s %-8.8s %-40.40I %-32.32s\n";
 
 static void
-convmac(char *p, uint8_t *mac, int n)
+convmac(char *p, char *ep, uint8_t *mac, int n)
 {
 	while(n-- > 0)
-		p += sprint(p, "%2.2ux", *mac++);
+		p = seprint(p, ep, "%2.2ux", *mac++);
 }
 
 int
@@ -560,8 +562,9 @@ arpread(Arp *arp, char *p, uint32_t offset, int len)
 		}
 		len--;
 		qlock(arp);
-		convmac(mac, a->mac, a->type->maclen);
-		n += sprint(p+n, aformat, a->type->name, arpstate[a->state], a->ip, mac);
+		convmac(mac, &mac[sizeof mac], a->mac, a->type->maclen);
+		n += snprint(p+n, Alinelen+1, aformat, a->type->name,
+			arpstate[a->state], a->ip, mac);	/* +1 for NUL */
 		qunlock(arp);
 	}
 
@@ -613,6 +616,7 @@ rxmitsols(Arp *arp)
 	if(a == nil)
 		goto dodrops;
 
+
 	qunlock(arp);	/* for icmpns */
 	if((sflag = ipv6anylocal(ifc, ipsrc)) != SRC_UNSPEC)
 		icmpns(f, ipsrc, sflag, a->ip, TARG_MULTI, ifc->mac);
@@ -651,7 +655,7 @@ dodrops:
 
 	for(; xp; xp = next){
 		next = xp->list;
-		icmphostunr(f, ifc, xp, icmp6_adr_unreach, 1);
+		icmphostunr(f, ifc, xp, Icmp6_adr_unreach, 1);
 	}
 
 	return nrxt;

+ 1 - 1
sys/src/9/ip/chandial.c

@@ -76,7 +76,7 @@ call(char *clone, char *dest, DS *ds)
 	name[n] = 0;
 	for(p = name; *p == ' '; p++)
 		;
-	sprint(name, "%lud", strtoul(p, 0, 0));
+	snprint(name, sizeof name, "%lud", strtoul(p, 0, 0));
 	p = strrchr(clone, '/');
 	*p = 0;
 	if(ds->dir)

+ 99 - 58
sys/src/9/ip/devip.c

@@ -181,6 +181,7 @@ ip1gen(Chan *c, int i, Dir *dp)
 
 static int
 ipgen(Chan *c, char* j, Dirtab* dir, int mm, int s, Dir *dp)
+
 {
 	Mach *m = machp();
 	Qid q;
@@ -193,7 +194,7 @@ ipgen(Chan *c, char* j, Dirtab* dir, int mm, int s, Dir *dp)
 	case Qtopdir:
 		if(s == DEVDOTDOT){
 			mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
-			sprint(m->externup->genbuf, "#I%ud", c->devno);
+			snprint(m->externup->genbuf, sizeof m->externup->genbuf, "#I%lud", c->dev);
 			devdir(c, q, m->externup->genbuf, 0, network, 0555, dp);
 			return 1;
 		}
@@ -216,13 +217,13 @@ ipgen(Chan *c, char* j, Dirtab* dir, int mm, int s, Dir *dp)
 	case Qprotodir:
 		if(s == DEVDOTDOT){
 			mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
-			sprint(m->externup->genbuf, "#I%ud", c->devno);
+			snprint(m->externup->genbuf, sizeof m->externup->genbuf, "#I%lud", c->dev);
 			devdir(c, q, m->externup->genbuf, 0, network, 0555, dp);
 			return 1;
 		}
 		if(s < f->p[PROTO(c->qid)]->ac) {
 			cv = f->p[PROTO(c->qid)]->conv[s];
-			sprint(m->externup->genbuf, "%d", s);
+			snprint(m->externup->genbuf, sizeof m->externup->genbuf, "%d", s);
 			mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
 			devdir(c, q, m->externup->genbuf, 0, cv->owner, 0555, dp);
 			return 1;
@@ -342,8 +343,9 @@ ipwalk(Chan* c, Chan *nc, char **name, int nname)
 	return w;
 }
 
-static int32_t
-ipstat(Chan* c, uint8_t* db, int32_t n)
+
+static int
+ipstat(Chan* c, uint8_t* db, int n)
 {
 	return devstat(c, db, n, nil, 0, ipgen);
 }
@@ -523,13 +525,13 @@ ipcreate(Chan* c, char* n, int i, int m)
 }
 
 static void
-ipremove(Chan* c)
+ipremove(Chan *c)
 {
 	error(Eperm);
 }
 
-static int32_t
-ipwstat(Chan *c, uint8_t *dp, int32_t n)
+static int
+ipwstat(Chan *c, uint8_t *dp, int n)
 {
 	Dir d;
 	Conv *cv;
@@ -631,13 +633,13 @@ ipread(Chan *ch, void *a, int32_t n, int64_t off)
 	Conv *c;
 	Proto *x;
 	char *buf, *p;
-	int32_t offset, rv;
+	int32_t rv;
 	Fs *f;
+	uint32_t offset = off;
 
 	f = ipfs[ch->devno];
 
 	p = a;
-	offset = off;
 	switch(TYPE(ch->qid)) {
 	default:
 		error(Eperm);
@@ -659,7 +661,7 @@ ipread(Chan *ch, void *a, int32_t n, int64_t off)
 		return netlogread(f, a, offset, n);
 	case Qctl:
 		buf = smalloc(16);
-		sprint(buf, "%lud", CONV(ch->qid));
+		snprint(buf, 16, "%lud", CONV(ch->qid));
 		rv = readstr(offset, p, n, buf);
 		free(buf);
 		return rv;
@@ -668,7 +670,7 @@ ipread(Chan *ch, void *a, int32_t n, int64_t off)
 		x = f->p[PROTO(ch->qid)];
 		c = x->conv[CONV(ch->qid)];
 		if(x->remote == nil) {
-			sprint(buf, "%I!%d\n", c->raddr, c->rport);
+			snprint(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
 		} else {
 			(*x->remote)(c, buf, Statelen-2);
 		}
@@ -680,7 +682,7 @@ ipread(Chan *ch, void *a, int32_t n, int64_t off)
 		x = f->p[PROTO(ch->qid)];
 		c = x->conv[CONV(ch->qid)];
 		if(x->local == nil) {
-			sprint(buf, "%I!%d\n", c->laddr, c->lport);
+			snprint(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
 		} else {
 			(*x->local)(c, buf, Statelen-2);
 		}
@@ -776,53 +778,70 @@ setluniqueport(Conv* c, int lport)
 	return nil;
 }
 
+/*
+ * is lport in use by anyone?  returns 1 if some conv of p has it, else 0.
+ */
+static int
+lportinuse(Proto *p, uint16_t lport)
+{
+	int x;
+	/* scan at most p->nc slots and stop early at the first nil slot */
+	for(x = 0; x < p->nc && p->conv[x]; x++)
+		if(p->conv[x]->lport == lport)
+			return 1;
+	return 0;
+}
 
 /*
  *  pick a local port and set it
  */
-void
+char *
 setlport(Conv* c)
 {
 	Proto *p;
-	uint16_t *pp;
-	int x, found;
+	int i, port;
 
 	p = c->p;
-	if(c->restricted)
-		pp = &p->nextrport;
-	else
-		pp = &p->nextport;
 	qlock(p);
-	for(;;(*pp)++){
+	if(c->restricted){
+		/* Restricted ports cycle between 600 and 1024. */
+		for(i=0; i<1024-600; i++){
+			if(p->nextrport >= 1024 || p->nextrport < 600)
+				p->nextrport = 600;
+			port = p->nextrport++;
+			if(!lportinuse(p, port))
+				goto chosen;
+		}
+	}else{
 		/*
-		 * Fsproto initialises p->nextport to 0 and the restricted
-		 * ports (p->nextrport) to 600.
-		 * Restricted ports must lie between 600 and 1024.
-		 * For the initial condition or if the unrestricted port number
-		 * has wrapped round, select a random port between 5000 and 1<<15
-		 * to start at.
+		 * Unrestricted ports are chosen randomly
+		 * between 2^15 and 2^16.  There are at most
+		 * 4*Nchan = 4096 ports in use at any given time,
+		 * so even in the worst case, a random probe has a
+		 * 1 - 4096/2^15 = 87% chance of success.
+		 * If 64 successive probes fail, there is a bug somewhere
+		 * (or a once in 10^58 event has happened, but that's
+		 * less likely than a venti collision).
 		 */
-		if(c->restricted){
-			if(*pp >= 1024)
-				*pp = 600;
+		for(i=0; i<64; i++){
+			port = (1<<15) + nrand(1<<15);
+			if(!lportinuse(p, port))
+				goto chosen;
 		}
-		else while(*pp < 5000)
-			*pp = nrand(1<<15);
-
-		found = 0;
-		for(x = 0; x < p->nc; x++){
-			if(p->conv[x] == nil)
-				break;
-			if(p->conv[x]->lport == *pp){
-				found = 1;
-				break;
-			}
-		}
-		if(!found)
-			break;
 	}
-	c->lport = (*pp)++;
 	qunlock(p);
+	/*
+	 * debugging: let's see if we ever get this.
+	 * if we do (and we're a cpu server), we might as well restart
+	 * since we're now unable to service new connections.
+	 */
+	panic("setlport: out of ports");
+	return "no ports available";
+
+chosen:
+	c->lport = port;
+	qunlock(p);
+	return nil;
 }
 
 /*
@@ -837,8 +856,6 @@ setladdrport(Conv* c, char* str, int announcing)
 	uint16_t lport;
 	uint8_t addr[IPaddrlen];
 
-	rv = nil;
-
 	/*
 	 *  ignore restricted part if it exists.  it's
 	 *  meaningless on local ports.
@@ -861,7 +878,8 @@ setladdrport(Conv* c, char* str, int announcing)
 		if(strcmp(str, "*") == 0)
 			ipmove(c->laddr, IPnoaddr);
 		else {
-			parseip(addr, str);
+			if(parseip(addr, str) == -1)
+				return Ebadip;
 			if(ipforme(c->p->f, addr))
 				ipmove(c->laddr, addr);
 			else
@@ -878,7 +896,7 @@ setladdrport(Conv* c, char* str, int announcing)
 
 	lport = atoi(p);
 	if(lport <= 0)
-		setlport(c);
+		rv = setlport(c);
 	else
 		rv = setluniqueport(c, lport);
 	return rv;
@@ -893,7 +911,8 @@ setraddrport(Conv* c, char* str)
 	if(p == nil)
 		return "malformed address";
 	*p++ = 0;
-	parseip(c->raddr, str);
+	if (parseip(c->raddr, str) == -1)
+		return Ebadip;
 	c->rport = atoi(p);
 	p = strchr(p, '!');
 	if(p){
@@ -919,7 +938,9 @@ Fsstdconnect(Conv *c, char *argv[], int argc)
 		if(p != nil)
 			return p;
 		setladdr(c);
-		setlport(c);
+		p = setlport(c);
+		if (p != nil)
+			return p;
 		break;
 	case 3:
 		p = setraddrport(c, argv[1]);
@@ -1145,13 +1166,15 @@ ipwrite(Chan* ch, void *v, int32_t n, int64_t off)
 			if(cb->nf == 2){
 				if(!ipismulticast(c->raddr))
 					error("addmulti for a non multicast address");
-				parseip(ia, cb->f[1]);
+				if (parseip(ia, cb->f[1]) == -1)
+					error(Ebadip);
 				ipifcaddmulti(c, c->raddr, ia);
 			} else {
-				parseip(ma, cb->f[2]);
+				if (parseip(ia, cb->f[1]) == -1 ||
+				    parseip(ma, cb->f[2]) == -1)
+					error(Ebadip);
 				if(!ipismulticast(ma))
 					error("addmulti for a non multicast address");
-				parseip(ia, cb->f[1]);
 				ipifcaddmulti(c, ma, ia);
 			}
 		} else if(strcmp(cb->f[0], "remmulti") == 0){
@@ -1159,8 +1182,15 @@ ipwrite(Chan* ch, void *v, int32_t n, int64_t off)
 				error("remmulti needs interface address");
 			if(!ipismulticast(c->raddr))
 				error("remmulti for a non multicast address");
-			parseip(ia, cb->f[1]);
+			if (parseip(ia, cb->f[1]) == -1)
+				error(Ebadip);
 			ipifcremmulti(c, c->raddr, ia);
+		} else if(strcmp(cb->f[0], "maxfragsize") == 0){
+			if(cb->nf < 2)
+				error("maxfragsize needs size");
+
+			c->maxfragsize = (int)strtol(cb->f[1], nil, 0);
+
 		} else if(x->ctl != nil) {
 			p = x->ctl(c, cb->f, cb->nf);
 			if(p != nil)
@@ -1243,7 +1273,6 @@ Fsproto(Fs *f, Proto *p)
 		panic("Fsproto");
 
 	p->x = f->np;
-	p->nextport = 0;
 	p->nextrport = 600;
 	f->p[f->np++] = p;
 
@@ -1305,8 +1334,13 @@ retry:
 		}
 	}
 	if(pp >= ep) {
+		if(p->gc)
+			print("Fsprotoclone: garbage collecting Convs\n");
 		if(p->gc != nil && (*p->gc)(p))
 			goto retry;
+		/* debugging: do we ever get here? */
+		if (cpuserver)
+			panic("Fsprotoclone: all conversations in use");
 		return nil;
 	}
 
@@ -1321,6 +1355,7 @@ retry:
 	c->lport = 0;
 	c->rport = 0;
 	c->restricted = 0;
+	c->maxfragsize = 0;
 	c->ttl = MAXTTL;
 	qreopen(c->rq);
 	qreopen(c->wq);
@@ -1370,8 +1405,7 @@ Fsrcvpcolx(Fs *f, uint8_t proto)
  *  called with protocol locked
  */
 Conv*
-Fsnewcall(Conv *c, uint8_t *raddr, uint16_t rport, uint8_t *laddr,
-	  uint16_t lport, uint8_t version)
+Fsnewcall(Conv *c, uint8_t *raddr, uint16_t rport, uint8_t *laddr, uint16_t lport, uint8_t version)
 {
 	Conv *nc;
 	Conv **l;
@@ -1382,7 +1416,14 @@ Fsnewcall(Conv *c, uint8_t *raddr, uint16_t rport, uint8_t *laddr,
 	for(l = &c->incall; *l; l = &(*l)->next)
 		i++;
 	if(i >= Maxincall) {
+		static int beenhere;
+
 		qunlock(c);
+		if (!beenhere) {
+			beenhere = 1;
+			print("Fsnewcall: incall queue full (%d) on port %d\n",
+				i, c->lport);
+		}
 		return nil;
 	}
 

+ 161 - 0
sys/src/9/ip/eipconvtest.c

@@ -0,0 +1,161 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+#include <u.h>
+#include <libc.h>
+
+enum
+{
+	Isprefix= 16,	/* flag bit ORed into prefixvals entries; the bit counts 0..8 stored there never reach it */
+};
+
+uint8_t prefixvals[256] =	/* indexed by a mask byte: leading one-bit count with Isprefix set; unlisted bytes stay 0 */
+{
+[0x00] 0 | Isprefix,
+[0x80] 1 | Isprefix,
+[0xC0] 2 | Isprefix,
+[0xE0] 3 | Isprefix,
+[0xF0] 4 | Isprefix,
+[0xF8] 5 | Isprefix,
+[0xFC] 6 | Isprefix,
+[0xFE] 7 | Isprefix,
+[0xFF] 8 | Isprefix,
+};
+
+uint8_t v4prefix[16] = {	/* eipconv compares the first 12 bytes against an address to spot a v4 address */
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 0, 0xff, 0xff,
+	0, 0, 0, 0
+};
+
+void
+hnputl(void *p, uint32_t v)	/* store v at p as four bytes, most significant byte first */
+{
+	uint8_t *a;
+	/* address the destination one byte at a time */
+	a = p;
+	a[0] = v>>24;	/* each store keeps only the low 8 bits of the shifted value */
+	a[1] = v>>16;
+	a[2] = v>>8;
+	a[3] = v;
+}
+
+int
+eipconv(va_list *arg, Fconv *f)	/* print-verb handler for E, I, i, V and M: formats one address into buf and passes it to strconv */
+{
+	char buf[8*5];	/* 40 bytes: the longest text built below is 8 groups of up to 4 hex digits, 7 colons and a NUL */
+	static char *efmt = "%.2lux%.2lux%.2lux%.2lux%.2lux%.2lux";	/* NOTE(review): l flag with promoted uint8_t arguments -- confirm widths agree on this target */
+	static char *ifmt = "%d.%d.%d.%d";
+	uint8_t *p, ip[16];
+	uint32_t *lp;
+	uint16_t s;
+	int i, j, n, eln, eli;
+	/* f->chr is the verb character; every case takes exactly one pointer from arg */
+	switch(f->chr) {
+	case 'E':		/* Ethernet address */
+		p = va_arg(*arg, uint8_t*);
+		sprint(buf, efmt, p[0], p[1], p[2], p[3], p[4], p[5]);
+		break;
+	case 'I':		/* Ip address */
+		p = va_arg(*arg, uint8_t*);
+common:	/* also reached from 'i', and from 'M' when the mask is not a clean prefix */
+		if(memcmp(p, v4prefix, 12) == 0)
+			sprint(buf, ifmt, p[12], p[13], p[14], p[15]);	/* v4 address: dotted decimal of the last 4 bytes */
+		else {
+			/* find longest elision */
+			eln = eli = -1;	/* eli: byte offset where the zero run starts, eln: its length in bytes */
+			for(i = 0; i < 16; i += 2){
+				for(j = i; j < 16; j += 2)
+					if(p[j] != 0 || p[j+1] != 0)
+						break;
+				if(j > i && j - i > eln){	/* strict > keeps the earliest of equally long runs */
+					eli = i;
+					eln = j - i;
+				}
+			}
+			/* buf is filled left to right; n counts the characters written so far */
+			/* print with possible elision */
+			n = 0;
+			for(i = 0; i < 16; i += 2){
+				if(i == eli){
+					n += sprint(buf+n, "::");
+					i += eln;	/* skip the zero run; the group after it is printed below without a colon */
+					if(i >= 16)
+						break;
+				} else if(i != 0)
+					n += sprint(buf+n, ":");
+				s = (p[i]<<8) + p[i+1];	/* 16-bit group, high byte first */
+				n += sprint(buf+n, "%ux", s);
+			}
+		}
+		break;
+	case 'i':		/* v6 address as 4 longs */
+		lp = va_arg(*arg, uint32_t*);
+		for(i = 0; i < 4; i++)
+			hnputl(ip+4*i, *lp++);	/* lay each 32-bit word out big-endian in the local ip[] copy */
+		p = ip;
+		goto common;
+	case 'V':		/* v4 ip address */
+		p = va_arg(*arg, uint8_t*);
+		sprint(buf, ifmt, p[0], p[1], p[2], p[3]);
+		break;
+	case 'M':		/* ip mask */
+		p = va_arg(*arg, uint8_t*);
+		/* a prefix mask is a run of 0xff bytes, at most one partial byte, then only zero bytes */
+		/* look for a prefix mask */
+		for(i = 0; i < 16; i++)
+			if(p[i] != 0xff)
+				break;
+		if(i < 16){
+			if((prefixvals[p[i]] & Isprefix) == 0)
+				goto common;	/* partial byte is not contiguous ones: print the mask as an address */
+			for(j = i+1; j < 16; j++)
+				if(p[j] != 0)
+					goto common;	/* non-zero byte after the partial byte: not a prefix either */
+			n = 8*i + (prefixvals[p[i]] & ~Isprefix);	/* full bytes times 8 plus the bits of the partial byte */
+		} else
+			n = 8*16;
+		/* n now holds the prefix length in bits */
+		/* got one, use /xx format */
+		sprint(buf, "/%d", n);
+		break;
+	default:
+		strcpy(buf, "(eipconv)");	/* unknown verb: emit a marker instead of an address */
+	}
+	strconv(buf, f);
+	return sizeof(uint8_t*);	/* size of one pointer; presumably the argument bytes consumed, as the Fconv interface expects -- TODO confirm */
+}
+
+uint8_t testvec[11][16] =	/* inputs for main: each row is printed once with %I and once with %M */
+{
+ { 0,0,0,0, 0,0,0,0, 0,0,0xff,0xff, 1,3,4,5, },	/* matches v4prefix in its first 12 bytes */
+ { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, },	/* all ones: the 8*16 mask case */
+ { 0xff,0xff,0x80,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, },	/* two full bytes plus 1 bit */
+ { 0xff,0xff,0xff,0xc0, 0,0,0,0, 0,0,0,0, 0,0,0,0, },	/* three full bytes plus 2 bits */
+ { 0xff,0xff,0xff,0xff, 0xe0,0,0,0, 0,0,0,0, 0,0,0,0, },	/* four full bytes plus 3 bits */
+ { 0xff,0xff,0xff,0xff, 0xff,0xf0,0,0, 0,0,0,0, 0,0,0,0, },	/* five full bytes plus 4 bits */
+ { 0xff,0xff,0xff,0xff, 0xff,0xff,0xf8,0, 0,0,0,0, 0,0,0,0, },	/* six full bytes plus 5 bits */
+ { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, },	/* same bytes as the second row */
+ { 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, },	/* all zero: one elision covers the whole address */
+ { 0,0,0,0, 0,0x11,0,0, 0,0,0,0, 0,0,0,0, },	/* zero runs on both sides of a single non-zero group */
+ { 0,0,0,0x11, 0,0,0,0, 0,0,0,0, 0,0,0,0x12, },	/* long zero run between two non-zero groups */
+};
+
+void
+main(void)	/* test driver: prints every testvec row as an address and as a mask */
+{
+	int i;
+	/* route the I and M verbs to eipconv before any print call uses them */
+	fmtinstall('I', eipconv);
+	fmtinstall('M', eipconv);
+	for(i = 0; i < 11; i++)	/* 11 is the row count of testvec */
+		print("%I\n%M\n", testvec[i], testvec[i]);
+	exits(0);
+}

+ 1122 - 0
sys/src/9/ip/esp.c

@@ -0,0 +1,1122 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * Encapsulating Security Payload for IPsec for IPv4, rfc1827.
+ * extended to IPv6.
+ * rfc2104 defines hmac computation.
+ *	currently only implements tunnel mode.
+ * TODO: verify aes algorithms;
+ *	transport mode (host-to-host)
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+#include	"ipv6.h"
+#include	"libsec.h"
+
+#define BITS2BYTES(bi) (((bi) + BI2BY - 1) / BI2BY)
+#define BYTES2BITS(by)  ((by) * BI2BY)
+
+typedef struct Algorithm Algorithm;
+typedef struct Esp4hdr Esp4hdr;
+typedef struct Esp6hdr Esp6hdr;
+typedef struct Espcb Espcb;
+typedef struct Esphdr Esphdr;
+typedef struct Esppriv Esppriv;
+typedef struct Esptail Esptail;
+typedef struct Userhdr Userhdr;
+
+enum {
+	Encrypt,	/* NOTE(review): Encrypt and Decrypt are not referenced in the visible part of this file */
+	Decrypt,
+
+	IP_ESPPROTO	= 50,	/* IP v4 and v6 protocol number */
+	Esp4hdrlen	= IP4HDR + 8,	/* IPv4 header plus the 8 bytes of espspi and espseq */
+	Esp6hdrlen	= IP6HDR + 8,	/* IPv6 header plus the same 8 bytes */
+
+	Esptaillen	= 2,	/* does not include pad or auth data */
+	Userhdrlen	= 4,	/* user-visible header size - if enabled */
+	/* the sizes below are byte counts derived from bit counts by BITS2BYTES */
+	Desblk	 = BITS2BYTES(64),
+	Des3keysz = BITS2BYTES(192),
+
+	Aesblk	 = BITS2BYTES(128),
+	Aeskeysz = BITS2BYTES(128),
+};
+
+struct Esphdr	/* ESP header; embedded as an unnamed member at the end of Esp4hdr and Esp6hdr */
+{
+	uint8_t	espspi[4];	/* Security parameter index */
+	uint8_t	espseq[4];	/* Sequence number */
+	uint8_t	payload[];	/* flexible array: the bytes following the 8 fixed header bytes */
+};
+
+/*
+ * tunnel-mode (network-to-network, etc.) layout is:
+ * new IP hdrs | ESP hdr |
+ *	 enc { orig IP hdrs | TCP/UDP hdr | user data | ESP trailer } | ESP ICV
+ *
+ * transport-mode (host-to-host) layout would be:
+ *	orig IP hdrs | ESP hdr |
+ *			enc { TCP/UDP hdr | user data | ESP trailer } | ESP ICV
+ */
+struct Esp4hdr
+{
+	/* ipv4 header */
+	uint8_t	vihl;		/* Version and header length */
+	uint8_t	tos;		/* Type of service */
+	uint8_t	length[2];	/* packet length */
+	uint8_t	id[2];		/* Identification */
+	uint8_t	frag[2];	/* Fragment information */
+	uint8_t	Unused;
+	uint8_t	espproto;	/* Protocol */
+	uint8_t	espplen[2];	/* Header plus data length */
+	uint8_t	espsrc[4];	/* Ip source */
+	uint8_t	espdst[4];	/* Ip destination */
+	/* unnamed member: espspi, espseq and payload are reached directly through an Esp4hdr pointer */
+	Esphdr;
+};
+
+/* tunnel-mode layout for IPv6: the IPV6HDR fields followed directly by the ESP header */
+struct Esp6hdr
+{
+	IPV6HDR;	/* supplies the vcf, proto, src and dst fields used by this file */
+	Esphdr;	/* unnamed member, so espspi and espseq are reached directly */
+};
+
+struct Esptail	/* two-byte ESP trailer; espkick places it after the payload and its padding */
+{
+	uint8_t	pad;	/* number of pad bytes that precede this trailer */
+	uint8_t	nexthdr;	/* copied from or to Userhdr.nexthdr when the user header is enabled */
+};
+
+/* IP-version-dependent data: lengths set by getverslens, spi and addresses set by getpktspiaddrs */
+typedef struct Versdep Versdep;
+struct Versdep
+{
+	uint32_t	version;	/* V4 or V6 */
+	uint32_t	iphdrlen;	/* IP4HDR or IP6HDR */
+	uint32_t	hdrlen;		/* iphdrlen + esp hdr len */
+	uint32_t	spi;	/* read from the packet in network byte order via nhgetl */
+	uint8_t	laddr[IPaddrlen];	/* packet destination, always held in v6 form */
+	uint8_t	raddr[IPaddrlen];	/* packet source, always held in v6 form */
+};
+
+/* header as seen by the user; present only while Espcb.header is set (ctl "header") */
+struct Userhdr
+{
+	uint8_t	nexthdr;	/* next protocol */
+	uint8_t	unused[3];	/* espiput zeroes the whole header before setting nexthdr */
+};
+
+struct Esppriv	/* per-protocol counters printed by espstats; no visible code in this chunk updates them */
+{
+	uint64_t	in;
+	uint32_t	inerrors;
+};
+
+/*
+ *  protocol specific part of Conv (reached through c->ptcl)
+ */
+struct Espcb
+{
+	int	incoming;	/* set by espconnect when the spi was requested as "*"; convlookup matches only these */
+	int	header;		/* user-level header */
+	uint32_t	spi;
+	uint32_t	seq;		/* last seq sent */
+	uint32_t	window;		/* for replay attacks */
+	/* cipher side: espkick calls cipher to encrypt, espiput to decrypt and treats 0 as failure */
+	char	*espalg;
+	void	*espstate;	/* other state for esp */
+	int	espivlen;	/* in bytes */
+	int	espblklen;
+	int	(*cipher)(Espcb*, uint8_t *buf, int len);
+	/* authentication side: espiput treats a zero return from auth as a failed check */
+	char	*ahalg;
+	void	*ahstate;	/* other state for esp */
+	int	ahlen;		/* auth data length in bytes */
+	int	ahblklen;
+	int	(*auth)(Espcb*, uint8_t *buf, int len, uint8_t *hash);
+	DigestState *ds;
+};
+
+struct Algorithm	/* one row of the espalg or ahalg table; a nil name ends the table */
+{
+	char 	*name;	/* matched by setalg against the second ctl field */
+	int	keylen;		/* in bits */
+	void	(*init)(Espcb*, char* name, uint8_t *key, unsigned keylen);
+};
+
+static	Conv* convlookup(Proto *esp, uint32_t spi);
+static	char *setalg(Espcb *ecb, char **f, int n, Algorithm *alg);
+static	void espkick(void *x);
+
+static	void nullespinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void des3espinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void aescbcespinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void aesctrespinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void desespinit(Espcb *ecb, char *name, uint8_t *k, unsigned n);
+
+static	void nullahinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void shaahinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void aesahinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+static	void md5ahinit(Espcb*, char*, uint8_t *key, unsigned keylen);
+
+static Algorithm espalg[] =	/* ciphers selectable with the "esp" ctl message; fields are name, key length in bits, init */
+{
+	"null",		0,	nullespinit,
+	"des3_cbc",	192,	des3espinit,	/* new rfc2451, des-ede3 */
+	"aes_128_cbc",	128,	aescbcespinit,	/* new rfc3602 */
+	"aes_ctr",	128,	aesctrespinit,	/* new rfc3686 */
+	"des_56_cbc",	64,	desespinit,	/* rfc2405, deprecated */
+	/* rc4 was never required, was used in original bandt */
+//	"rc4_128",	128,	rc4espinit,
+	nil,		0,	nil,
+};
+
+static Algorithm ahalg[] =	/* authenticators selectable with the "ah" ctl message; same row layout as espalg */
+{
+	"null",		0,	nullahinit,
+	"hmac_sha1_96",	128,	shaahinit,	/* rfc2404 */
+	"aes_xcbc_mac_96", 128,	aesahinit,	/* new rfc3566 */
+	"hmac_md5_96",	128,	md5ahinit,	/* rfc2403 */
+	nil,		0,	nil,
+};
+
+static char*
+espconnect(Conv *c, char **argv, int argc)	/* connect request: argv[1] is "addr!spi" or "addr!*"; returns nil or an error string */
+{
+	char *p, *pp, *e = nil;
+	uint32_t spi;
+	Espcb *ecb = (Espcb*)c->ptcl;
+	/* every path, including the error ones, ends in Fsconnected(c, e) below */
+	switch(argc) {
+	default:
+		e = "bad args to connect";
+		break;
+	case 2:
+		p = strchr(argv[1], '!');
+		if(p == nil){
+			e = "malformed address";
+			break;
+		}
+		*p++ = 0;	/* split argv[1] in place: address before the !, spi text after it */
+		if (parseip(c->raddr, argv[1]) == -1) {
+			e = Ebadip;
+			break;
+		}
+		findlocalip(c->p->f, c->laddr, c->raddr);
+		ecb->incoming = 0;
+		ecb->seq = 0;
+		if(strcmp(p, "*") == 0) {
+			qlock(c->p);	/* held while convlookup walks the conv table of the protocol */
+			for(;;) {
+				spi = nrand(1<<16) + 256;	/* random candidate, never below 256 */
+				if(convlookup(c->p, spi) == nil)
+					break;	/* no incoming conversation uses it yet */
+			}
+			qunlock(c->p);
+			ecb->spi = spi;
+			ecb->incoming = 1;
+			qhangup(c->wq, nil);	/* the incoming side hangs up its write queue */
+		} else {
+			spi = strtoul(p, &pp, 10);
+			if(pp == p) {	/* no digits parsed; raddr, laddr, incoming and seq were already overwritten above */
+				e = "malformed address";
+				break;
+			}
+			ecb->spi = spi;
+			qhangup(c->rq, nil);	/* the outgoing side hangs up its read queue */
+		}
+		nullespinit(ecb, "null", nil, 0);	/* start with the null cipher and null authenticator */
+		nullahinit(ecb, "null", nil, 0);
+	}
+	Fsconnected(c, e);
+
+	return e;
+}
+
+
+static int
+espstate(Conv *c, char *state, int n)	/* writes "Open\n" or "Closed\n" into state (at most n bytes) and returns the snprint result */
+{
+	return snprint(state, n, "%s", c->inuse?"Open\n":"Closed\n");	/* the text depends only on c->inuse */
+}
+
+static void
+espcreate(Conv *c)	/* opens the read and write queues of a conversation */
+{
+	c->rq = qopen(64*1024, Qmsg, 0, 0);	/* read queue: 64K limit, Qmsg flag, no kick routine */
+	c->wq = qopen(64*1024, Qkick, espkick, c);	/* write queue: 64K limit, espkick registered as its kick routine with c as argument */
+}
+
+static void
+espclose(Conv *c)	/* closes the queues, clears both addresses and resets the Espcb to all zero */
+{
+	Espcb *ecb;
+	/* close all three queues of the conversation */
+	qclose(c->rq);
+	qclose(c->wq);
+	qclose(c->eq);
+	ipmove(c->laddr, IPnoaddr);
+	ipmove(c->raddr, IPnoaddr);
+	/* NOTE(review): ecb->ds is zeroed below without being released here -- confirm who owns it */
+	ecb = (Espcb*)c->ptcl;
+	free(ecb->espstate);
+	free(ecb->ahstate);
+	memset(ecb, 0, sizeof(Espcb));	/* also clears spi, incoming and the cipher and auth function pointers */
+}
+
+static int
+convipvers(Conv *c)	/* returns V4 or V6 for a conversation, judged from its addresses */
+{
+	if((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
+	    memcmp(c->laddr, v4prefix, IPv4off) == 0) ||
+	    ipcmp(c->raddr, IPnoaddr) == 0)	/* V4 when both addresses start with v4prefix, or when raddr equals IPnoaddr */
+		return V4;
+	else
+		return V6;
+}
+
+static int
+pktipvers(Fs *f, Block **bpp)	/* returns V4 or V6 from the first byte of the packet, or 0 if the pullup fails */
+{
+	if (*bpp == nil || BLEN(*bpp) == 0) {	/* a pullup is attempted only for a nil or empty first block */
+		/* get enough to identify the IP version */
+		*bpp = pullupblock(*bpp, IP4HDR);
+		if(*bpp == nil) {
+			netlog(f, Logesp, "esp: short packet\n");
+			return 0;	/* the caller is left with *bpp == nil */
+		}
+	}
+	return (((Esp4hdr*)(*bpp)->rp)->vihl & 0xf0) == IP_VER4? V4: V6;	/* anything that is not IP_VER4 is reported as V6 */
+}
+
+static void
+getverslens(int version, Versdep *vp)	/* fills vp->version, vp->iphdrlen and vp->hdrlen; panics unless version is V4 or V6 */
+{
+	vp->version = version;
+	switch(vp->version) {
+	case V4:
+		vp->iphdrlen = IP4HDR;
+		vp->hdrlen   = Esp4hdrlen;
+		break;
+	case V6:
+		vp->iphdrlen = IP6HDR;
+		vp->hdrlen   = Esp6hdrlen;
+		break;
+	default:
+		panic("esp: getverslens version %d wrong", version);	/* the 0 returned by pktipvers for a short packet would land here, assuming 0 is neither V4 nor V6 */
+	}
+}
+
+static void
+getpktspiaddrs(uint8_t *pkt, Versdep *vp)	/* reads spi and both addresses from the headers at pkt into vp; vp->version must already be set */
+{
+	Esp4hdr *eh4;
+	Esp6hdr *eh6;
+	/* packet source becomes raddr and packet destination becomes laddr */
+	switch(vp->version) {
+	case V4:
+		eh4 = (Esp4hdr*)pkt;
+		v4tov6(vp->raddr, eh4->espsrc);	/* v4 addresses are widened so vp always holds v6-form addresses */
+		v4tov6(vp->laddr, eh4->espdst);
+		vp->spi = nhgetl(eh4->espspi);
+		break;
+	case V6:
+		eh6 = (Esp6hdr*)pkt;
+		ipmove(vp->raddr, eh6->src);
+		ipmove(vp->laddr, eh6->dst);
+		vp->spi = nhgetl(eh6->espspi);
+		break;
+	default:
+		panic("esp: getpktspiaddrs vp->version %ld wrong", vp->version);
+	}
+}
+
+/*
+ * kick routine of the write queue (registered in espcreate): wrap the next
+ * queued block in new IP and ESP headers plus trailer and send it via ipoput4/6.
+ */
+static void
+espkick(void *x)
+{
+	int nexthdr, payload, pad, align;
+	uint8_t *auth;
+	Block *bp;
+	Conv *c = x;
+	Esp4hdr *eh4;
+	Esp6hdr *eh6;
+	Espcb *ecb;
+	Esptail *et;
+	Userhdr *uh;
+	Versdep vers;
+	/* header lengths follow the IP version implied by the addresses of the conversation */
+	getverslens(convipvers(c), &vers);
+	bp = qget(c->wq);
+	if(bp == nil)
+		return;
+	/* c stays locked until the packet is fully built */
+	qlock(c);
+	ecb = c->ptcl;
+
+	if(ecb->header) {
+		/* make sure the message has a User header */
+		bp = pullupblock(bp, Userhdrlen);
+		if(bp == nil) {
+			qunlock(c);
+			return;
+		}
+		uh = (Userhdr*)bp->rp;
+		nexthdr = uh->nexthdr;
+		bp->rp += Userhdrlen;	/* strip the user header; its nexthdr goes into the ESP trailer below */
+	} else {
+		nexthdr = 0;	/* what should this be? */
+	}
+	/* payload counts the IV bytes as well as the user data */
+	payload = BLEN(bp) + ecb->espivlen;
+
+	/* Make space to fit ip header */
+	bp = padblock(bp, vers.hdrlen + ecb->espivlen);
+	getpktspiaddrs(bp->rp, &vers);	/* NOTE(review): the header is not filled in yet and vers.spi/raddr/laddr are not used below */
+	/* pad so that payload + pad + Esptaillen is a multiple of align (at least 4) */
+	align = 4;
+	if(ecb->espblklen > align)
+		align = ecb->espblklen;
+	if(align % ecb->ahblklen != 0)	/* NOTE(review): divides by ahblklen; presumably every ah init sets it non-zero -- confirm */
+		panic("espkick: ahblklen is important after all");
+	pad = (align-1) - (payload + Esptaillen-1)%align;
+
+	/*
+	 * Make space for tail
+	 * this is done by calling padblock with a negative size
+	 * Padblock does not change bp->wp!
+	 */
+	bp = padblock(bp, -(pad+Esptaillen+ecb->ahlen));
+	bp->wp += pad+Esptaillen+ecb->ahlen;
+	/* the trailer sits right after the payload and its padding */
+	et = (Esptail*)(bp->rp + vers.hdrlen + payload + pad);
+
+	/* fill in tail */
+	et->pad = pad;
+	et->nexthdr = nexthdr;
+
+	/* encrypt the payload */
+	ecb->cipher(ecb, bp->rp + vers.hdrlen, payload + pad + Esptaillen);	/* return value is ignored on the send side */
+	auth = bp->rp + vers.hdrlen + payload + pad + Esptaillen;	/* auth data goes directly after the trailer */
+
+	/* fill in head; construct a new IP header and an ESP header */
+	if (vers.version == V4) {
+		eh4 = (Esp4hdr *)bp->rp;
+		eh4->vihl = IP_VER4;
+		v6tov4(eh4->espsrc, c->laddr);
+		v6tov4(eh4->espdst, c->raddr);
+		eh4->espproto = IP_ESPPROTO;
+		eh4->frag[0] = 0;
+		eh4->frag[1] = 0;
+		/* spi and sequence number are stored in network byte order */
+		hnputl(eh4->espspi, ecb->spi);
+		hnputl(eh4->espseq, ++ecb->seq);	/* pre-increment: the first packet carries sequence number 1 */
+	} else {
+		eh6 = (Esp6hdr *)bp->rp;
+		eh6->vcf[0] = IP_VER6;
+		ipmove(eh6->src, c->laddr);
+		ipmove(eh6->dst, c->raddr);
+		eh6->proto = IP_ESPPROTO;
+
+		hnputl(eh6->espspi, ecb->spi);
+		hnputl(eh6->espseq, ++ecb->seq);
+	}
+	/* the hash covers the ESP header through the trailer and is written at auth */
+	/* compute secure hash */
+	ecb->auth(ecb, bp->rp + vers.iphdrlen, (vers.hdrlen - vers.iphdrlen) +
+		payload + pad + Esptaillen, auth);
+
+	qunlock(c);
+	/* print("esp: pass down: %uld\n", BLEN(bp)); */
+	if (vers.version == V4)
+		ipoput4(c->p->f, bp, 0, c->ttl, c->tos, c);
+	else
+		ipoput6(c->p->f, bp, 0, c->ttl, c->tos, c);
+}
+
+/*
+ * input side: authenticate and decrypt the IP/ESP packet in bp, strip headers,
+ * padding and trailer, and pass the result up the read queue of the matching Conv.
+ */
+void
+espiput(Proto *esp, Ipifc *ipifc, Block *bp)
+{
+	Mach *m = machp();
+	int payload, nexthdr;
+	uint8_t *auth, *espspi;
+	Conv *c;
+	Espcb *ecb;
+	Esptail *et;
+	Fs *f;
+	Userhdr *uh;
+	Versdep vers;
+	/* ipifc is not used in this function */
+	f = esp->f;
+	/* NOTE(review): pktipvers returns 0 on a short packet and getverslens panics on it (assuming 0 is neither V4 nor V6) */
+	getverslens(pktipvers(f, &bp), &vers);
+
+	bp = pullupblock(bp, vers.hdrlen + Esptaillen);
+	if(bp == nil) {
+		netlog(f, Logesp, "esp: short packet\n");
+		return;
+	}
+	getpktspiaddrs(bp->rp, &vers);
+
+	qlock(esp);
+	/* Look for a conversation structure for this port */
+	c = convlookup(esp, vers.spi);
+	if(c == nil) {
+		qunlock(esp);
+		netlog(f, Logesp, "esp: no conv %I -> %I!%lud\n", vers.raddr,
+			vers.laddr, vers.spi);
+		icmpnoconv(f, bp);
+		freeblist(bp);
+		return;
+	}
+	/* take the conversation lock before dropping the protocol lock */
+	qlock(c);
+	qunlock(esp);
+
+	ecb = c->ptcl;
+	/* too hard to do decryption/authentication on block lists */
+	if(bp->next)
+		bp = concatblock(bp);
+	/* need room for headers, IV, trailer and auth data */
+	if(BLEN(bp) < vers.hdrlen + ecb->espivlen + Esptaillen + ecb->ahlen) {
+		qunlock(c);
+		netlog(f, Logesp, "esp: short block %I -> %I!%lud\n", vers.raddr,
+			vers.laddr, vers.spi);
+		freeb(bp);
+		return;
+	}
+	/* the auth data is the last ahlen bytes of the packet */
+	auth = bp->wp - ecb->ahlen;
+	espspi = vers.version == V4?	((Esp4hdr*)bp->rp)->espspi:
+					((Esp6hdr*)bp->rp)->espspi;
+
+	/* compute secure hash and authenticate */
+	if(!ecb->auth(ecb, espspi, auth - espspi, auth)) {	/* covers everything from the spi up to the auth data */
+		qunlock(c);
+print("esp: bad auth %I -> %I!%ld\n", vers.raddr, vers.laddr, vers.spi);	/* NOTE(review): unconditional debug print; %ld here, %lud in the netlog below */
+		netlog(f, Logesp, "esp: bad auth %I -> %I!%lud\n", vers.raddr,
+			vers.laddr, vers.spi);
+		freeb(bp);
+		return;
+	}
+	/* payload here still includes the IV, padding and trailer */
+	payload = BLEN(bp) - vers.hdrlen - ecb->ahlen;
+	if(payload <= 0 || payload % 4 != 0 || payload % ecb->espblklen != 0) {
+		qunlock(c);
+		netlog(f, Logesp, "esp: bad length %I -> %I!%lud payload=%d BLEN=%lud\n",
+			vers.raddr, vers.laddr, vers.spi, payload, BLEN(bp));
+		freeb(bp);
+		return;
+	}
+
+	/* decrypt payload */
+	if(!ecb->cipher(ecb, bp->rp + vers.hdrlen, payload)) {
+		qunlock(c);
+print("esp: cipher failed %I -> %I!%ld: %s\n", vers.raddr, vers.laddr, vers.spi, m->externup->errstr);	/* NOTE(review): unconditional debug print, as above */
+		netlog(f, Logesp, "esp: cipher failed %I -> %I!%lud: %s\n",
+			vers.raddr, vers.laddr, vers.spi, m->externup->errstr);
+		freeb(bp);
+		return;
+	}
+	/* the trailer is the last Esptaillen bytes of the decrypted region */
+	payload -= Esptaillen;
+	et = (Esptail*)(bp->rp + vers.hdrlen + payload);
+	payload -= et->pad + ecb->espivlen;	/* drop the padding and the IV from the count */
+	nexthdr = et->nexthdr;
+	if(payload <= 0) {	/* also rejects a pad count larger than the data present */
+		qunlock(c);
+		netlog(f, Logesp, "esp: short packet after decrypt %I -> %I!%lud\n",
+			vers.raddr, vers.laddr, vers.spi);
+		freeb(bp);
+		return;
+	}
+
+	/* trim packet */
+	bp->rp += vers.hdrlen + ecb->espivlen; /* toss original IP & ESP hdrs */
+	bp->wp = bp->rp + payload;
+	if(ecb->header) {
+		/* assume Userhdrlen < Esp4hdrlen < Esp6hdrlen */
+		bp->rp -= Userhdrlen;	/* reuse the tail of the stripped headers as the user header */
+		uh = (Userhdr*)bp->rp;
+		memset(uh, 0, Userhdrlen);
+		uh->nexthdr = nexthdr;
+	}
+
+	/* ingress filtering here? */
+	/* a full read queue drops the packet instead of blocking */
+	if(qfull(c->rq)){
+		netlog(f, Logesp, "esp: qfull %I -> %I.%uld\n", vers.raddr,
+			vers.laddr, vers.spi);
+		freeblist(bp);
+	}else {
+//		print("esp: pass up: %uld\n", BLEN(bp));
+		qpass(c->rq, bp);	/* pass packet up the read queue */
+	}
+
+	qunlock(c);
+}
+
+char*
+espctl(Conv *c, char **f, int n)	/* handles the esp, ah, header and noheader ctl messages; returns nil or an error string */
+{
+	Espcb *ecb = c->ptcl;
+	char *e = nil;
+	/* f[0] is the request name; setalg validates the remaining fields */
+	if(strcmp(f[0], "esp") == 0)
+		e = setalg(ecb, f, n, espalg);
+	else if(strcmp(f[0], "ah") == 0)
+		e = setalg(ecb, f, n, ahalg);
+	else if(strcmp(f[0], "header") == 0)
+		ecb->header = 1;	/* espkick and espiput then expect and produce a Userhdr */
+	else if(strcmp(f[0], "noheader") == 0)
+		ecb->header = 0;
+	else
+		e = "unknown control request";
+	return e;
+}
+
+/* called from icmp(v6) for unreachable hosts, time exceeded, etc.; hangs up the matching conv with msg and frees bp */
+void
+espadvise(Proto *esp, Block *bp, char *msg)
+{
+	Conv *c;
+	Versdep vers;
+	/* NOTE(review): if pktipvers fails it returns 0 with bp nil; getverslens then panics, and bp->rp below would be a nil dereference */
+	getverslens(pktipvers(esp->f, &bp), &vers);
+	getpktspiaddrs(bp->rp, &vers);
+	/* convlookup must run with the protocol locked */
+	qlock(esp);
+	c = convlookup(esp, vers.spi);
+	if(c != nil) {
+		qhangup(c->rq, msg);
+		qhangup(c->wq, msg);
+	}
+	qunlock(esp);
+	freeblist(bp);	/* bp is freed whether or not a conversation matched */
+}
+
+/*
+ * Format the protocol's private counters (in, inerrors) into buf.
+ * Returns whatever snprint returns.
+ */
+int
+espstats(Proto *esp, char *buf, int len)
+{
+	Esppriv *priv = esp->priv;
+
+	return snprint(buf, len, "%llud %lud\n", priv->in, priv->inerrors);
+}
+
+/*
+ * Format the local address of conversation c into buf.  For an
+ * incoming conversation the spi is appended after a '!'.
+ * The conversation is locked while its fields are read.
+ */
+static int
+esplocal(Conv *c, char *buf, int len)
+{
+	int nw;
+	Espcb *cb;
+
+	cb = c->ptcl;
+	qlock(c);
+	if(!cb->incoming)
+		nw = snprint(buf, len, "%I\n", c->laddr);
+	else
+		nw = snprint(buf, len, "%I!%uld\n", c->laddr, cb->spi);
+	qunlock(c);
+	return nw;
+}
+
+/*
+ * Format the remote address of conversation c into buf.  For an
+ * outgoing (not incoming) conversation the spi is appended after a '!'.
+ * The conversation is locked while its fields are read.
+ */
+static int
+espremote(Conv *c, char *buf, int len)
+{
+	int nw;
+	Espcb *cb;
+
+	cb = c->ptcl;
+	qlock(c);
+	if(!cb->incoming)
+		nw = snprint(buf, len, "%I!%uld\n", c->raddr, cb->spi);
+	else
+		nw = snprint(buf, len, "%I\n", c->raddr);
+	qunlock(c);
+	return nw;
+}
+
+/*
+ * Walk the nil-terminated conversation table of esp and return the
+ * first incoming conversation whose spi equals spi, or nil if there
+ * is none.  No locking is done here.
+ */
+static	Conv*
+convlookup(Proto *esp, uint32_t spi)
+{
+	Conv **cp;
+	Espcb *cb;
+
+	for(cp = esp->conv; *cp != nil; cp++){
+		cb = (*cp)->ptcl;
+		if(!cb->incoming)
+			continue;
+		if(cb->spi == spi)
+			return *cp;
+	}
+	return nil;
+}
+
+/*
+ * Select and initialise an algorithm for ecb from a control request.
+ *	f[1]	algorithm name, looked up in the table alg (terminated by
+ *		an entry whose name is nil)
+ *	f[2]	key as hex digits; only read when n == 3
+ *	n	number of fields in f; must be 2 or 3
+ * The key must be exactly alg->keylen bits (rounded up to whole bytes)
+ * long.  f[2] is overwritten during parsing.
+ * Returns nil on success, otherwise an error string.
+ */
+static char *
+setalg(Espcb *ecb, char **f, int n, Algorithm *alg)
+{
+	uint8_t *key;
+	int c, nbyte, nchar;
+	uint i;
+
+	if(n < 2 || n > 3)
+		return "bad format";
+	for(; alg->name; alg++)
+		if(strcmp(f[1], alg->name) == 0)
+			break;
+	if(alg->name == nil)
+		return "unknown algorithm";
+
+	/* key length in bytes, rounding the bit count up */
+	nbyte = (alg->keylen + 7) >> 3;
+	if (n == 2)
+		nchar = 0;
+	else
+		nchar = strlen(f[2]);
+	if(nchar != 2 * nbyte)			/* TODO: maybe < is ok */
+		return "key not required length";
+	/* convert hex digits from ascii, in place */
+	for(i=0; i<nchar; i++) {
+		c = f[2][i];
+		if(c >= '0' && c <= '9')
+			f[2][i] -= '0';
+		else if(c >= 'a' && c <= 'f')
+			f[2][i] -= 'a'-10;
+		else if(c >= 'A' && c <= 'F')
+			f[2][i] -= 'A'-10;
+		else
+			return "non-hex character in key";
+	}
+	/* collapse hex digits into complete bytes in reverse order in key */
+	/* NOTE(review): the |= below assumes smalloc returns zeroed memory -- confirm */
+	key = smalloc(nbyte);
+	for(i = 0; i < nchar && i/2 < nbyte; i++) {
+		/* last digit of the string lands in the low nibble of key[0] */
+		c = f[2][nchar-i-1];
+		if(i&1)
+			c <<= 4;
+		key[i/2] |= c;
+	}
+
+	/* the init routine gets the key length in bits, not bytes */
+	alg->init(ecb, alg->name, key, alg->keylen);
+	free(key);
+	return nil;
+}
+
+
+/*
+ * null encryption
+ */
+
+/* cipher hook for null encryption: ignores its arguments and reports success */
+static int
+nullcipher(Espcb *espcb, uint8_t *c, int i)
+{
+	return 1;
+}
+
+/*
+ * Install null encryption on ecb: block length 1, no iv, and a cipher
+ * hook that does nothing.  The key and key length are ignored.
+ */
+static void
+nullespinit(Espcb *ecb, char *name, uint8_t *c, unsigned keylen)
+{
+	ecb->espalg = name;
+	ecb->espblklen = 1;
+	ecb->espivlen = 0;
+	ecb->cipher = nullcipher;
+}
+
+/* auth hook for null authentication: ignores its arguments and reports success */
+static int
+nullauth(Espcb *espcb, uint8_t *c, int i, uint8_t *d)
+{
+	return 1;
+}
+
+/*
+ * Install null authentication on ecb: block length 1, zero-length
+ * authenticator, and an auth hook that always succeeds.  The key and
+ * key length are ignored.
+ */
+static void
+nullahinit(Espcb *ecb, char *name, uint8_t *c, unsigned keylen)
+{
+	ecb->ahalg = name;
+	ecb->ahblklen = 1;
+	ecb->ahlen = 0;
+	ecb->auth = nullauth;
+}
+
+
+/*
+ * sha1
+ */
+
+/*
+ * Keyed hash of t[0..tlen) under key[0..klen) using sha1, written
+ * to hash.  Two pads are built from blocks filled with 0x36 and 0x5c
+ * with the key xored into their first klen bytes; the inner digest
+ * covers ipad followed by t, the outer digest covers opad followed by
+ * the inner digest.
+ * klen is not checked against the pad size; the caller visible in
+ * this file passes BITS2BYTES(128).
+ */
+static void
+seanq_hmac_sha1(uint8_t hash[SHA1dlen], uint8_t *t, int32_t tlen, uint8_t *key, int32_t klen)
+{
+	int i;
+	uint8_t ipad[Hmacblksz+1], opad[Hmacblksz+1], innerhash[SHA1dlen];
+	DigestState *digest;
+
+	memset(ipad, 0x36, Hmacblksz);
+	memset(opad, 0x5c, Hmacblksz);
+	ipad[Hmacblksz] = opad[Hmacblksz] = 0;
+	for(i = 0; i < klen; i++){
+		ipad[i] ^= key[i];
+		opad[i] ^= key[i];
+	}
+	/* presumably a nil output buffer starts a running digest and a non-nil one finishes it -- confirm against sha1() */
+	digest = sha1(ipad, Hmacblksz, nil, nil);
+	sha1(t, tlen, innerhash, digest);
+	digest = sha1(opad, Hmacblksz, nil, nil);
+	sha1(innerhash, SHA1dlen, hash, digest);
+}
+
+/*
+ * auth hook for sha1: compute the keyed hash of t[0..tlen) with the
+ * key stored in ecb->ahstate, compare its first ecb->ahlen bytes with
+ * auth, then overwrite auth with the computed value.
+ * Returns non-zero when the two matched.
+ */
+static int
+shaauth(Espcb *ecb, uint8_t *t, int tlen, uint8_t *auth)
+{
+	uint8_t digest[SHA1dlen];
+	int match;
+
+	memset(digest, 0, sizeof digest);
+	seanq_hmac_sha1(digest, t, tlen, (uint8_t*)ecb->ahstate, BITS2BYTES(128));
+	match = (memcmp(auth, digest, ecb->ahlen) == 0);
+	memmove(auth, digest, ecb->ahlen);
+	return match;
+}
+
+/*
+ * Install sha1 authentication on ecb.  klen is in bits and must be
+ * 128 (anything else panics); a private copy of the key is kept in
+ * ecb->ahstate.  The authenticator length is BITS2BYTES(96).
+ */
+static void
+shaahinit(Espcb *ecb, char *name, uint8_t *key, unsigned klen)
+{
+	unsigned nbytes;
+
+	if(klen != 128)
+		panic("shaahinit: bad keylen");
+	nbytes = klen / BI2BY;
+
+	ecb->ahalg = name;
+	ecb->ahblklen = 1;
+	ecb->ahlen = BITS2BYTES(96);
+	ecb->auth = shaauth;
+	ecb->ahstate = smalloc(nbytes);
+	memmove(ecb->ahstate, key, nbytes);
+}
+
+
+/*
+ * aes
+ */
+
+/*
+ * ah_aes_xcbc_mac_96, rfc3566
+ * auth hook for aes: compute hmac_aes over t[0..tlen) with the key in
+ * ecb->ahstate, compare the first ecb->ahlen bytes with auth, then
+ * overwrite auth with the computed value.  Returns non-zero on match.
+ * NOTE(review): the key length passed is BITS2BYTES(96) although
+ * aesahinit stores a 128-bit key -- confirm this is intended.
+ */
+static int
+aesahauth(Espcb *ecb, uint8_t *t, int tlen, uint8_t *auth)
+{
+	int r;
+	uint8_t hash[AESdlen];
+
+	memset(hash, 0, AESdlen);
+	/* the digest state returned by hmac_aes is kept in ecb->ds for the next call */
+	ecb->ds = hmac_aes(t, tlen, (uint8_t*)ecb->ahstate, BITS2BYTES(96), hash,
+		ecb->ds);
+	r = memcmp(auth, hash, ecb->ahlen) == 0;
+	memmove(auth, hash, ecb->ahlen);
+	return r;
+}
+
+/*
+ * Install aes authentication on ecb.  klen is in bits and must be
+ * 128 (anything else panics); a private copy of the key is kept in
+ * ecb->ahstate.  The authenticator length is BITS2BYTES(96).
+ */
+static void
+aesahinit(Espcb *ecb, char *name, uint8_t *key, unsigned klen)
+{
+	unsigned nbytes;
+
+	if(klen != 128)
+		panic("aesahinit: keylen not 128");
+	nbytes = klen / BI2BY;
+
+	ecb->ahalg = name;
+	ecb->ahblklen = 1;
+	ecb->ahlen = BITS2BYTES(96);
+	ecb->auth = aesahauth;
+	ecb->ahstate = smalloc(nbytes);
+	memmove(ecb->ahstate, key, nbytes);
+}
+
+/*
+ * cipher hook for aes in cbc mode, working in place on p[0..n).
+ * The first AESbsize bytes of p hold the iv; the rest is processed
+ * one AESbsize block at a time.
+ *	incoming: the iv is taken from the packet, each block is
+ *		decrypted and xored with the previous ciphertext block.
+ *	outgoing: the iv stored in the state is written to the packet,
+ *		each block is xored with the previous ciphertext block
+ *		and encrypted; the last ciphertext block stays in
+ *		ds->ivec.
+ * Always returns 1.
+ */
+static int
+aescbccipher(Espcb *ecb, uint8_t *p, int n)	/* 128-bit blocks */
+{
+	uint8_t tmp[AESbsize], q[AESbsize];
+	uint8_t *pp, *tp, *ip, *eip, *ep;
+	AESstate *ds = ecb->espstate;
+
+	ep = p + n;
+	if(ecb->incoming) {
+		memmove(ds->ivec, p, AESbsize);
+		p += AESbsize;
+		while(p < ep){
+			/* keep the ciphertext: it is the chaining value for the next block */
+			memmove(tmp, p, AESbsize);
+			aes_decrypt(ds->dkey, ds->rounds, p, q);
+			memmove(p, q, AESbsize);
+			tp = tmp;
+			ip = ds->ivec;
+			/* xor in the chaining value and replace it; this also steps p to the next block */
+			for(eip = ip + AESbsize; ip < eip; ){
+				*p++ ^= *ip;
+				*ip++ = *tp++;
+			}
+		}
+	} else {
+		memmove(p, ds->ivec, AESbsize);
+		for(p += AESbsize; p < ep; p += AESbsize){
+			pp = p;
+			ip = ds->ivec;
+			for(eip = ip + AESbsize; ip < eip; )
+				*pp++ ^= *ip++;
+			aes_encrypt(ds->ekey, ds->rounds, p, q);
+			/* the new ciphertext is both the output and the next chaining value */
+			memmove(ds->ivec, q, AESbsize);
+			memmove(p, q, AESbsize);
+		}
+	}
+	return 1;
+}
+
+/*
+ * Install aes cbc encryption on ecb.  n is the key length in bits;
+ * at most Aeskeysz bytes of k are used, zero padded.  The initial iv
+ * is filled from nrand.  Block and iv lengths are both Aesblk.
+ */
+static void
+aescbcespinit(Espcb *ecb, char *name, uint8_t *k, unsigned n)
+{
+	uint8_t keybuf[Aeskeysz], iv[Aeskeysz];
+	unsigned nkey;
+	int j;
+
+	nkey = BITS2BYTES(n);
+	if(nkey > Aeskeysz)
+		nkey = Aeskeysz;
+	memset(keybuf, 0, sizeof keybuf);
+	memmove(keybuf, k, nkey);
+	j = 0;
+	while(j < Aeskeysz)
+		iv[j++] = nrand(256);
+
+	ecb->espalg = name;
+	ecb->espblklen = Aesblk;
+	ecb->espivlen = Aesblk;
+	ecb->cipher = aescbccipher;
+	ecb->espstate = smalloc(sizeof(AESstate));
+	setupAESstate(ecb->espstate, keybuf, nkey /* keybytes */, iv);
+}
+
+/*
+ * cipher hook installed by aesctrespinit, working in place on
+ * p[0..n).  The first AESbsize bytes of p hold the iv.
+ * NOTE(review): despite the name, the statements below are the same
+ * chained decrypt/xor and xor/encrypt steps as aescbccipher; no
+ * counter is visible here -- confirm whether ctr mode was intended.
+ * Always returns 1.
+ */
+static int
+aesctrcipher(Espcb *ecb, uint8_t *p, int n)	/* 128-bit blocks */
+{
+	uint8_t tmp[AESbsize], q[AESbsize];
+	uint8_t *pp, *tp, *ip, *eip, *ep;
+	AESstate *ds = ecb->espstate;
+
+	ep = p + n;
+	if(ecb->incoming) {
+		memmove(ds->ivec, p, AESbsize);
+		p += AESbsize;
+		while(p < ep){
+			/* keep the ciphertext: it is the chaining value for the next block */
+			memmove(tmp, p, AESbsize);
+			aes_decrypt(ds->dkey, ds->rounds, p, q);
+			memmove(p, q, AESbsize);
+			tp = tmp;
+			ip = ds->ivec;
+			/* xor in the chaining value and replace it; this also steps p to the next block */
+			for(eip = ip + AESbsize; ip < eip; ){
+				*p++ ^= *ip;
+				*ip++ = *tp++;
+			}
+		}
+	} else {
+		memmove(p, ds->ivec, AESbsize);
+		for(p += AESbsize; p < ep; p += AESbsize){
+			pp = p;
+			ip = ds->ivec;
+			for(eip = ip + AESbsize; ip < eip; )
+				*pp++ ^= *ip++;
+			aes_encrypt(ds->ekey, ds->rounds, p, q);
+			memmove(ds->ivec, q, AESbsize);
+			memmove(p, q, AESbsize);
+		}
+	}
+	return 1;
+}
+
+/*
+ * Install the "aes ctr" cipher hook on ecb.  n is the key length in
+ * bits; at most Aeskeysz bytes of k are used, zero padded.  The
+ * initial iv is filled from nrand.  Block and iv lengths are both
+ * Aesblk.
+ */
+static void
+aesctrespinit(Espcb *ecb, char *name, uint8_t *k, unsigned n)
+{
+	/*
+	 * the key buffer must hold Aeskeysz bytes, because that is the
+	 * limit n is clamped to before the copy below (as in aescbcespinit)
+	 */
+	uint8_t key[Aeskeysz], ivec[Aesblk];
+	int i;
+
+	n = BITS2BYTES(n);
+	if(n > Aeskeysz)
+		n = Aeskeysz;
+	memset(key, 0, sizeof(key));
+	memmove(key, k, n);
+	for(i = 0; i < Aesblk; i++)
+		ivec[i] = nrand(256);
+	ecb->espalg = name;
+	ecb->espblklen = Aesblk;
+	ecb->espivlen = Aesblk;
+	ecb->cipher = aesctrcipher;
+	ecb->espstate = smalloc(sizeof(AESstate));
+	setupAESstate(ecb->espstate, key, n /* keybytes */, ivec);
+}
+
+
+/*
+ * md5
+ */
+
+/*
+ * Keyed hash of t[0..tlen) under key[0..klen) using md5, written
+ * to hash.  Two pads are built from blocks filled with 0x36 and 0x5c
+ * with the key xored into their first klen bytes; the inner digest
+ * covers ipad followed by t, the outer digest covers opad followed by
+ * the inner digest.
+ * klen is not checked against the pad size; the caller visible in
+ * this file passes BITS2BYTES(128).
+ */
+static void
+seanq_hmac_md5(uint8_t hash[MD5dlen], uint8_t *t, int32_t tlen, uint8_t *key, int32_t klen)
+{
+	int i;
+	uint8_t ipad[Hmacblksz+1], opad[Hmacblksz+1], innerhash[MD5dlen];
+	DigestState *digest;
+
+	memset(ipad, 0x36, Hmacblksz);
+	memset(opad, 0x5c, Hmacblksz);
+	ipad[Hmacblksz] = opad[Hmacblksz] = 0;
+	for(i = 0; i < klen; i++){
+		ipad[i] ^= key[i];
+		opad[i] ^= key[i];
+	}
+	/* presumably a nil output buffer starts a running digest and a non-nil one finishes it -- confirm against md5() */
+	digest = md5(ipad, Hmacblksz, nil, nil);
+	md5(t, tlen, innerhash, digest);
+	digest = md5(opad, Hmacblksz, nil, nil);
+	md5(innerhash, MD5dlen, hash, digest);
+}
+
+/*
+ * auth hook for md5: compute the keyed hash of t[0..tlen) with the
+ * key stored in ecb->ahstate, compare its first ecb->ahlen bytes with
+ * auth, then overwrite auth with the computed value.
+ * Returns non-zero when the two matched.
+ */
+static int
+md5auth(Espcb *ecb, uint8_t *t, int tlen, uint8_t *auth)
+{
+	uint8_t digest[MD5dlen];
+	int match;
+
+	memset(digest, 0, sizeof digest);
+	seanq_hmac_md5(digest, t, tlen, (uint8_t*)ecb->ahstate, BITS2BYTES(128));
+	match = (memcmp(auth, digest, ecb->ahlen) == 0);
+	memmove(auth, digest, ecb->ahlen);
+	return match;
+}
+
+/*
+ * Install md5 authentication on ecb.  klen is in bits and must be
+ * 128 (anything else panics); a private copy of the key is kept in
+ * ecb->ahstate.  The authenticator length is BITS2BYTES(96).
+ */
+static void
+md5ahinit(Espcb *ecb, char *name, uint8_t *key, unsigned klen)
+{
+	unsigned nbytes;
+
+	if(klen != 128)
+		panic("md5ahinit: bad keylen");
+	nbytes = BITS2BYTES(klen);
+
+	ecb->ahalg = name;
+	ecb->ahblklen = 1;
+	ecb->ahlen = BITS2BYTES(96);
+	ecb->auth = md5auth;
+	ecb->ahstate = smalloc(nbytes);
+	memmove(ecb->ahstate, key, nbytes);
+}
+
+
+/*
+ * des, single and triple
+ */
+
+/*
+ * cipher hook for single des in cbc mode, in place on p[0..n).
+ * The first Desblk bytes of p carry the iv: read from the packet
+ * when incoming, written from the state when outgoing.  The rest
+ * is handed to the des cbc routines.  Always returns 1.
+ */
+static int
+descipher(Espcb *ecb, uint8_t *p, int n)
+{
+	DESstate *st;
+	uint8_t *body;
+	int bodylen;
+
+	st = ecb->espstate;
+	body = p + Desblk;
+	bodylen = n - Desblk;
+	if(!ecb->incoming){
+		memmove(p, st->ivec, Desblk);
+		desCBCencrypt(body, bodylen, st);
+		return 1;
+	}
+	memmove(st->ivec, p, Desblk);
+	desCBCdecrypt(body, bodylen, st);
+	return 1;
+}
+
+/*
+ * cipher hook for triple des in cbc mode, in place on p[0..n).
+ * The first Desblk bytes of p carry the iv: read from the packet
+ * when incoming, written from the state when outgoing.  The rest
+ * is handed to the des3 cbc routines.  Always returns 1.
+ */
+static int
+des3cipher(Espcb *ecb, uint8_t *p, int n)
+{
+	DES3state *st;
+	uint8_t *body;
+	int bodylen;
+
+	st = ecb->espstate;
+	body = p + Desblk;
+	bodylen = n - Desblk;
+	if(!ecb->incoming){
+		memmove(p, st->ivec, Desblk);
+		des3CBCencrypt(body, bodylen, st);
+		return 1;
+	}
+	memmove(st->ivec, p, Desblk);
+	des3CBCdecrypt(body, bodylen, st);
+	return 1;
+}
+
+/*
+ * Install single des encryption on ecb.  n is the key length in
+ * bits; at most Desblk bytes of k are used, zero padded.  The
+ * initial iv is filled from nrand.  Block and iv lengths are Desblk.
+ */
+static void
+desespinit(Espcb *ecb, char *name, uint8_t *k, unsigned n)
+{
+	uint8_t keybuf[Desblk], iv[Desblk];
+	unsigned nkey;
+	int j;
+
+	nkey = BITS2BYTES(n);
+	if(nkey > Desblk)
+		nkey = Desblk;
+	memset(keybuf, 0, sizeof keybuf);
+	memmove(keybuf, k, nkey);
+	j = 0;
+	while(j < Desblk)
+		iv[j++] = nrand(256);
+
+	ecb->espalg = name;
+	ecb->espblklen = Desblk;
+	ecb->espivlen = Desblk;
+	ecb->cipher = descipher;
+	ecb->espstate = smalloc(sizeof(DESstate));
+	setupDESstate(ecb->espstate, keybuf, iv);
+}
+
+/*
+ * Install triple des encryption on ecb.  n is the key length in
+ * bits; at most Des3keysz bytes of k are used, zero padded.  The
+ * initial iv is filled from nrand.  Block and iv lengths are Desblk.
+ */
+static void
+des3espinit(Espcb *ecb, char *name, uint8_t *k, unsigned n)
+{
+	/* three Desblk-sized subkeys, laid out contiguously and filled by one memmove */
+	uint8_t key[3][Desblk], ivec[Desblk];
+	int i;
+
+	n = BITS2BYTES(n);
+	if(n > Des3keysz)
+		n = Des3keysz;
+	memset(key, 0, sizeof(key));
+	memmove(key, k, n);
+	for(i = 0; i < Desblk; i++)
+		ivec[i] = nrand(256);
+	ecb->espalg = name;
+	ecb->espblklen = Desblk;
+	ecb->espivlen = Desblk;
+
+	ecb->cipher = des3cipher;
+	ecb->espstate = smalloc(sizeof(DES3state));
+	setupDES3state(ecb->espstate, key, ivec);
+}
+
+
+/*
+ * interfacing to devip
+ */
+/*
+ * Allocate the esp protocol descriptor, fill in its hooks and
+ * parameters, and hand it to Fsproto for the stack fs.
+ */
+void
+espinit(Fs *fs)
+{
+	Proto *p;
+
+	p = smalloc(sizeof(Proto));
+	p->priv = smalloc(sizeof(Esppriv));
+
+	/* identity and sizing */
+	p->name = "esp";
+	p->ipproto = IP_ESPPROTO;
+	p->nc = Nchans;
+	p->ptclsize = sizeof(Espcb);
+
+	/* conversation life cycle */
+	p->create = espcreate;
+	p->connect = espconnect;
+	p->announce = nil;
+	p->close = espclose;
+	p->ctl = espctl;
+
+	/* packet input and icmp advice */
+	p->rcv = espiput;
+	p->advise = espadvise;
+
+	/* status files */
+	p->state = espstate;
+	p->stats = espstats;
+	p->local = esplocal;
+	p->remote = espremote;
+
+	Fsproto(fs, p);
+}

+ 11 - 32
sys/src/9/ip/ethermedium.c

@@ -14,6 +14,7 @@
 #include "fns.h"
 #include "../port/error.h"
 
+#include "../port/netif.h"
 #include "ip.h"
 #include "ipv6.h"
 
@@ -38,8 +39,7 @@ static void	etherread4(void *a);
 static void	etherread6(void *a);
 static void	etherbind(Ipifc *ifc, int argc, char **argv);
 static void	etherunbind(Ipifc *ifc);
-static void	etherbwrite(Ipifc *ifc, Block *bp, int version,
-			       uint8_t *ip);
+static void	etherbwrite(Ipifc *ifc, Block *bp, int version, uint8_t *ip);
 static void	etheraddmulti(Ipifc *ifc, uint8_t *a, uint8_t *ia);
 static void	etherremmulti(Ipifc *ifc, uint8_t *a, uint8_t *ia);
 static Block*	multicastarp(Fs *f, Arpent *a, Medium*, uint8_t *mac);
@@ -67,23 +67,6 @@ Medium ethermedium =
 .pref2addr=	etherpref2addr,
 };
 
-Medium fbemedium =
-{
-.name=		"fbe",
-.hsize=		14,
-.mintu=		60,
-.maxtu=		4000,
-.maclen=	6,
-.bind=		etherbind,
-.unbind=	etherunbind,
-.bwrite=	etherbwrite,
-.addmulti=	etheraddmulti,
-.remmulti=	etherremmulti,
-.ares=		arpenter,
-.areg=		sendgarp,
-.pref2addr=	etherpref2addr,
-};
-
 Medium gbemedium =
 {
 .name=		"gbe",
@@ -120,9 +103,6 @@ struct Etherrock
  */
 enum
 {
-	ETARP		= 0x0806,
-	ETIP4		= 0x0800,
-	ETIP6		= 0x86DD,
 	ARPREQUEST	= 1,
 	ARPREPLY	= 2,
 };
@@ -184,12 +164,12 @@ etherbind(Ipifc *ifc, int argc, char **argv)
 	}
 
 	/*
-	 *  open ipv4 converstation
+	 *  open ipv4 conversation
 	 *
 	 *  the dial will fail if the type is already open on
 	 *  this device.
 	 */
-	snprint(addr, sizeof(addr), "%s!0x800", argv[2]);
+	snprint(addr, sizeof(addr), "%s!0x800", argv[2]);	/* ETIP4 */
 	mchan4 = chandial(addr, nil, dir, &cchan4);
 
 	/*
@@ -228,7 +208,7 @@ etherbind(Ipifc *ifc, int argc, char **argv)
 	/*
  	 *  open arp conversation
 	 */
-	snprint(addr, sizeof(addr), "%s!0x806", argv[2]);
+	snprint(addr, sizeof(addr), "%s!0x806", argv[2]);	/* ETARP */
 	achan = chandial(addr, nil, nil, nil);
 
 	/*
@@ -237,7 +217,7 @@ etherbind(Ipifc *ifc, int argc, char **argv)
 	 *  the dial will fail if the type is already open on
 	 *  this device.
 	 */
-	snprint(addr, sizeof(addr), "%s!0x86DD", argv[2]);
+	snprint(addr, sizeof(addr), "%s!0x86DD", argv[2]);	/* ETIP6 */
 	mchan6 = chandial(addr, nil, dir, &cchan6);
 
 	/*
@@ -272,11 +252,11 @@ etherunbind(Ipifc *ifc)
 	Etherrock *er = ifc->arg;
 
 	if(er->read4p)
-		postnote(er->read4p, 1, "unbind", NUser);
+		postnote(er->read4p, 1, "unbind", 0);
 	if(er->read6p)
-		postnote(er->read6p, 1, "unbind", NUser);
+		postnote(er->read6p, 1, "unbind", 0);
 	if(er->arpp)
-		postnote(er->arpp, 1, "unbind", NUser);
+		postnote(er->arpp, 1, "unbind", 0);
 
 	/* wait for readers to die */
 	while(er->arpp != 0 || er->read4p != 0 || er->read6p != 0)
@@ -445,7 +425,7 @@ etheraddmulti(Ipifc *ifc, uint8_t *a, uint8_t *b)
 	int version;
 
 	version = multicastea(mac, a);
-	sprint(buf, "addmulti %E", mac);
+	snprint(buf, sizeof buf, "addmulti %E", mac);
 	switch(version){
 	case V4:
 		er->cchan4->dev->write(er->cchan4, buf, strlen(buf), 0);
@@ -467,7 +447,7 @@ etherremmulti(Ipifc *ifc, uint8_t *a, uint8_t *b)
 	int version;
 
 	version = multicastea(mac, a);
-	sprint(buf, "remmulti %E", mac);
+	snprint(buf, sizeof buf, "remmulti %E", mac);
 	switch(version){
 	case V4:
 		er->cchan4->dev->write(er->cchan4, buf, strlen(buf), 0);
@@ -781,7 +761,6 @@ void
 ethermediumlink(void)
 {
 	addipmedium(&ethermedium);
-	addipmedium(&fbemedium);
 	addipmedium(&gbemedium);
 }
 

+ 765 - 73
sys/src/9/ip/gre.c

@@ -19,8 +19,7 @@
 
 #include "ip.h"
 
-enum
-{
+enum {
 	GRE_IPONLY	= 12,		/* size of ip header */
 	GRE_IPPLUSGRE	= 12,		/* minimum size of GRE header */
 	IP_GREPROTO	= 47,
@@ -28,17 +27,40 @@ enum
 	GRErxms		= 200,
 	GREtickms	= 100,
 	GREmaxxmit	= 10,
+
+	K		= 1024,
+	GREqlen		= 256 * K,
+
+	GRE_cksum	= 0x8000,
+	GRE_routing	= 0x4000,
+	GRE_key		= 0x2000,
+	GRE_seq		= 0x1000,
+
+	Nring		= 1 << 10,	/* power of two, please */
+	Ringmask	= Nring - 1,
+
+	GREctlraw	= 0,
+	GREctlcooked,
+	GREctlretunnel,
+	GREctlreport,
+	GREctldlsuspend,
+	GREctlulsuspend,
+	GREctldlresume,
+	GREctlulresume,
+	GREctlforward,
+	GREctlulkey,
+	Ncmds,
 };
 
-typedef struct GREhdr
-{
+typedef struct GREhdr GREhdr;
+struct GREhdr{
 	/* ip header */
 	uint8_t	vihl;		/* Version and header length */
 	uint8_t	tos;		/* Type of service */
 	uint8_t	len[2];		/* packet length (including headers) */
 	uint8_t	id[2];		/* Identification */
 	uint8_t	frag[2];	/* Fragment information */
-	uint8_t	Unused;
+	uint8_t	ttl;
 	uint8_t	proto;		/* Protocol */
 	uint8_t	cksum[2];	/* checksum */
 	uint8_t	src[4];		/* Ip source */
@@ -47,21 +69,115 @@ typedef struct GREhdr
 	/* gre header */
 	uint8_t	flags[2];
 	uint8_t	eproto[2];	/* encapsulation protocol */
-} GREhdr;
+};
 
 typedef struct GREpriv GREpriv;
-struct GREpriv
-{
-	int		raw;			/* Raw GRE mode */
-
+struct GREpriv{
 	/* non-MIB stats */
-	uint32_t		csumerr;		/* checksum errors */
-	uint32_t		lenerr;			/* short packet */
+	uint32_t	lenerr;			/* short packet */
+};
+
+typedef struct Bring	Bring;
+struct Bring{
+	Block	*ring[Nring];
+	int32_t	produced;
+	int32_t	consumed;
+};
+
+typedef struct GREconv	GREconv;
+struct GREconv{
+	int	raw;
+
+	/* Retunnelling information.  v4 only */
+	uint8_t	north[4];			/* HA */
+	uint8_t	south[4];			/* Base station */
+	uint8_t	hoa[4];				/* Home address */
+	uint8_t	coa[4];				/* Careof address */
+	uint32_t	seq;				/* Current sequence # */
+	int	dlsusp;				/* Downlink suspended? */
+	int	ulsusp;				/* Uplink suspended? */
+	uint32_t	ulkey;				/* GRE key */
+
+	QLock	lock;				/* Lock for rings */
+	Bring	dlpending;			/* Ring of pending packets */
+	Bring	dlbuffered;			/* Received while suspended */
+	Bring	ulbuffered;			/* Received while suspended */
+};
+
+typedef struct Metablock Metablock;
+struct Metablock{
+	uint8_t	*rp;
+	uint32_t	seq;
+};
+
+static char *grectlcooked(Conv *, int, char **);
+static char *grectldlresume(Conv *, int, char **);
+static char *grectldlsuspend(Conv *, int, char **);
+static char *grectlforward(Conv *, int, char **);
+static char *grectlraw(Conv *, int, char **);
+static char *grectlreport(Conv *, int, char **);
+static char *grectlretunnel(Conv *, int, char **);
+static char *grectlulkey(Conv *, int, char **);
+static char *grectlulresume(Conv *, int, char **);
+static char *grectlulsuspend(Conv *, int, char **);
+
+static struct{
+	char	*cmd;
+	int	argc;
+	char	*(*f)(Conv *, int, char **);
+} grectls[Ncmds] = {
+[GREctlraw]	=	{	"raw",		1,	grectlraw,	},
+[GREctlcooked]	=	{	"cooked",	1,	grectlcooked,	},
+[GREctlretunnel]=	{	"retunnel",	5,	grectlretunnel,	},
+[GREctlreport]	=	{	"report",	2,	grectlreport,	},
+[GREctldlsuspend]=	{	"dlsuspend",	1,	grectldlsuspend,},
+[GREctlulsuspend]=	{	"ulsuspend",	1,	grectlulsuspend,},
+[GREctldlresume]=	{	"dlresume",	1,	grectldlresume,	},
+[GREctlulresume]=	{	"ulresume",	1,	grectlulresume,	},
+[GREctlforward]	=	{	"forward",	2,	grectlforward,	},
+[GREctlulkey]	=	{	"ulkey",	2,	grectlulkey,	},
 };
 
+static uint8_t nulladdr[4];
+static char *sessend = "session end";
+
 static void grekick(void *x, Block *bp);
+//static char *gresetup(Conv *, char *, char *, char *);
 
-static char*
+uint32_t grepdin, grepdout, grebdin, grebdout;
+uint32_t grepuin, grepuout, grebuin, grebuout;
+
+/*
+ * Remove and return the oldest block in ring r, or nil when the
+ * ring is empty.  The vacated slot is cleared.
+ */
+static Block *
+getring(Bring *r)
+{
+	Block **slot, *bp;
+
+	if(r->produced == r->consumed)
+		return nil;
+
+	slot = &r->ring[r->consumed & Ringmask];
+	bp = *slot;
+	*slot = nil;
+	r->consumed++;
+	return bp;
+}
+
+/*
+ * Append bp to ring r.  When the ring already holds Nring blocks
+ * the block occupying the target slot is freed and counted as
+ * consumed before bp takes its place.
+ */
+static void
+addring(Bring *r, Block *bp)
+{
+	Block **slot;
+
+	slot = &r->ring[r->produced & Ringmask];
+	if(r->produced - r->consumed > Ringmask){
+		/* Full! */
+		assert(*slot);
+		freeb(*slot);
+		r->consumed++;
+	}
+	*slot = bp;
+	r->produced++;
+}
+
+static char *
 greconnect(Conv *c, char **argv, int argc)
 {
 	Proto *p;
@@ -101,45 +217,93 @@ greconnect(Conv *c, char **argv, int argc)
 static void
 grecreate(Conv *c)
 {
-	c->rq = qopen(64*1024, Qmsg, 0, c);
+	c->rq = qopen(GREqlen, Qmsg, 0, c);
 	c->wq = qbypass(grekick, c);
 }
 
 static int
 grestate(Conv *c, char *state, int n)
 {
-	USED(c);
-	return snprint(state, n, "%s\n", "Datagram");
+	GREconv *grec;
+	char *ep, *p;
+
+	grec = c->ptcl;
+	p    = state;
+	ep   = p + n;
+	p    = seprint(p, ep, "%s%s%s%shoa %V north %V south %V seq %ulx "
+	 "pending %uld  %uld buffered dl %uld %uld ul %uld %uld ulkey %.8ulx\n",
+			c->inuse? "Open ": "Closed ",
+			grec->raw? "raw ": "",
+			grec->dlsusp? "DL suspended ": "",
+			grec->ulsusp? "UL suspended ": "",
+			grec->hoa, grec->north, grec->south, grec->seq,
+			grec->dlpending.consumed, grec->dlpending.produced,
+			grec->dlbuffered.consumed, grec->dlbuffered.produced,
+			grec->ulbuffered.consumed, grec->ulbuffered.produced,
+			grec->ulkey);
+	return p - state;
 }
 
 static char*
 greannounce(Conv* conv, char** c, int i)
 {
-	return "pktifc does not support announce";
+	return "gre does not support announce";
 }
 
 static void
 greclose(Conv *c)
 {
-	qclose(c->rq);
-	qclose(c->wq);
-	qclose(c->eq);
+	GREconv *grec;
+	Block *bp;
+
+	grec = c->ptcl;
+
+	/* Make sure we don't forward any more packets */
+	memset(grec->hoa, 0, sizeof grec->hoa);
+	memset(grec->north, 0, sizeof grec->north);
+	memset(grec->south, 0, sizeof grec->south);
+
+	qlock(&grec->lock);
+	while((bp = getring(&grec->dlpending)) != nil)
+		freeb(bp);
+
+	while((bp = getring(&grec->dlbuffered)) != nil)
+		freeb(bp);
+
+	while((bp = getring(&grec->ulbuffered)) != nil)
+		freeb(bp);
+
+	grec->dlpending.produced = grec->dlpending.consumed = 0;
+	grec->dlbuffered.produced = grec->dlbuffered.consumed = 0;
+	grec->ulbuffered.produced = grec->ulbuffered.consumed = 0;
+	qunlock(&grec->lock);
+
+	grec->raw = 0;
+	grec->seq = 0;
+	grec->dlsusp = grec->ulsusp = 1;
+
+	qhangup(c->rq, sessend);
+	qhangup(c->wq, sessend);
+	qhangup(c->eq, sessend);
 	ipmove(c->laddr, IPnoaddr);
 	ipmove(c->raddr, IPnoaddr);
-	c->lport = 0;
-	c->rport = 0;
+	c->lport = c->rport = 0;
 }
 
 static void
 grekick(void *x, Block *bp)
 {
-	Conv *c = x;
-	GREhdr *ghp;
+	Conv *c;
+	GREconv *grec;
+	GREhdr *gre;
 	uint8_t laddr[IPaddrlen], raddr[IPaddrlen];
 
 	if(bp == nil)
 		return;
 
+	c    = x;
+	grec = c->ptcl;
+
 	/* Make space to fit ip header (gre header already there) */
 	bp = padblock(bp, GRE_IPONLY);
 	if(bp == nil)
@@ -150,75 +314,351 @@ grekick(void *x, Block *bp)
 	if(bp == nil)
 		return;
 
-	ghp = (GREhdr *)(bp->rp);
-	ghp->vihl = IP_VER4;
+	gre = (GREhdr *)bp->rp;
+	gre->vihl = IP_VER4;
 
-	if(!((GREpriv*)c->p->priv)->raw){
-		v4tov6(raddr, ghp->dst);
+	if(grec->raw == 0){
+		v4tov6(raddr, gre->dst);
 		if(ipcmp(raddr, v4prefix) == 0)
-			memmove(ghp->dst, c->raddr + IPv4off, IPv4addrlen);
-		v4tov6(laddr, ghp->src);
+			memmove(gre->dst, c->raddr + IPv4off, IPv4addrlen);
+		v4tov6(laddr, gre->src);
 		if(ipcmp(laddr, v4prefix) == 0){
 			if(ipcmp(c->laddr, IPnoaddr) == 0)
-				findlocalip(c->p->f, c->laddr, raddr); /* pick interface closest to dest */
-			memmove(ghp->src, c->laddr + IPv4off, IPv4addrlen);
+				/* pick interface closest to dest */
+				findlocalip(c->p->f, c->laddr, raddr);
+			memmove(gre->src, c->laddr + IPv4off, sizeof gre->src);
 		}
-		hnputs(ghp->eproto, c->rport);
+		hnputs(gre->eproto, c->rport);
 	}
 
-	ghp->proto = IP_GREPROTO;
-	ghp->frag[0] = 0;
-	ghp->frag[1] = 0;
+	gre->proto = IP_GREPROTO;
+	gre->frag[0] = gre->frag[1] = 0;
 
+	grepdout++;
+	grebdout += BLEN(bp);
 	ipoput4(c->p->f, bp, 0, c->ttl, c->tos, nil);
 }
 
 static void
-greiput(Proto *gre, Ipifc* i, Block *bp)
+gredownlink(Conv *c, Block *bp)
 {
-	int len;
-	GREhdr *ghp;
-	Conv *c, **p;
-	uint16_t eproto;
+	Metablock *m;
+	GREconv *grec;
+	GREhdr *gre;
+	int hdrlen, suspended, extra;
+	uint16_t flags;
+	uint32_t seq;
+
+	gre = (GREhdr *)bp->rp;
+	if(gre->ttl == 1){
+		freeb(bp);
+		return;
+	}
+
+	/*
+	 * We've received a packet with a GRE header and we need to
+	 * re-adjust the packet header to strip all unwanted parts
+	 * but leave room for only a sequence number.
+	 */
+	grec   = c->ptcl;
+	flags  = nhgets(gre->flags);
+	hdrlen = 0;
+	if(flags & GRE_cksum)
+		hdrlen += 2;
+	if(flags & GRE_routing){
+		print("%V routing info present.  Discarding packet", gre->src);
+		freeb(bp);
+		return;
+	}
+	if(flags & (GRE_cksum|GRE_routing))
+		hdrlen += 2;			/* Offset field */
+	if(flags & GRE_key)
+		hdrlen += 4;
+	if(flags & GRE_seq)
+		hdrlen += 4;
+
+	/*
+	 * The outgoing packet only has the sequence number set.  Make room
+	 * for the sequence number.
+	 */
+	if(hdrlen != sizeof(uint32_t)){
+		extra = hdrlen - sizeof(uint32_t);
+		if(extra < 0 && bp->rp - bp->base < -extra){
+			print("gredownlink: cannot add sequence number\n");
+			freeb(bp);
+			return;
+		}
+		memmove(bp->rp + extra, bp->rp, sizeof(GREhdr));
+		bp->rp += extra;
+		assert(BLEN(bp) >= sizeof(GREhdr) + sizeof(uint32_t));
+		gre = (GREhdr *)bp->rp;
+	}
+	seq = grec->seq++;
+	hnputs(gre->flags, GRE_seq);
+	hnputl(bp->rp + sizeof(GREhdr), seq);
+
+	/*
+	 * Keep rp and seq at the base.  ipoput4 consumes rp for
+	 * refragmentation.
+	 */
+	assert(bp->rp - bp->base >= sizeof(Metablock));
+	m = (Metablock *)bp->base;
+	m->rp  = bp->rp;
+	m->seq = seq;
+
+	/*
+	 * Here we make a decision what we're doing with the packet.  We're
+	 * doing this w/o holding a lock which means that later on in the
+	 * process we may discover we've done the wrong thing.  I don't want
+	 * to call ipoput with the lock held.
+	 */
+restart:
+	suspended = grec->dlsusp;
+	if(suspended){
+		if(!canqlock(&grec->lock)){
+			/*
+			 * just give up.  too bad, we lose a packet.  this
+			 * is just too hard and my brain already hurts.
+			 */
+			freeb(bp);
+			return;
+		}
+
+		if(!grec->dlsusp){
+			/*
+			 * suspend race.  We thought we were suspended, but
+			 * we really weren't.
+			 */
+			qunlock(&grec->lock);
+			goto restart;
+		}
+
+		/* Undo the incorrect ref count addition */
+		addring(&grec->dlbuffered, bp);
+		qunlock(&grec->lock);
+		return;
+	}
+
+	/*
+	 * When we get here, we're not suspended.  Proceed to send the
+	 * packet.
+	 */
+	memmove(gre->src, grec->coa, sizeof gre->dst);
+	memmove(gre->dst, grec->south, sizeof gre->dst);
+
+	/*
+	 * Make sure the packet does not go away.
+	 */
+	//_xinc(&bp->ref);
+	ainc(&bp->ref);
+	assert(bp->ref == 2);
+
+	ipoput4(c->p->f, bp, 0, gre->ttl - 1, gre->tos, nil);
+	grepdout++;
+	grebdout += BLEN(bp);
+
+	/*
+	 * Now make sure we didn't do the wrong thing.
+	 */
+	if(!canqlock(&grec->lock)){
+		freeb(bp);		/* The packet just goes away */
+		return;
+	}
+
+	/* We did the right thing */
+	addring(&grec->dlpending, bp);
+	qunlock(&grec->lock);
+}
+
+/*
+ * Forward an uplink packet (one whose inner source is the home
+ * address) towards grec->north.  The outer source becomes grec->coa
+ * and, when grec->ulkey is set, a GRE key field is inserted or
+ * overwritten.  While the uplink is suspended the packet is parked
+ * in the ulbuffered ring instead of being sent.
+ * bp is always consumed: sent, buffered or freed.
+ */
+static void
+greuplink(Conv *c, Block *bp)
+{
+	GREconv *grec;
+	GREhdr *gre;
+	uint16_t flags;
+
+	gre = (GREhdr *)bp->rp;
+	if(gre->ttl == 1){
+		/* the caller does not free bp, so drop it here as gredownlink does */
+		freeb(bp);
+		return;
+	}
+
+	grec = c->ptcl;
+	memmove(gre->src, grec->coa, sizeof gre->src);
+	memmove(gre->dst, grec->north, sizeof gre->dst);
+
+	/*
+	 * Add a key, if needed.
+	 */
+	if(grec->ulkey){
+		flags = nhgets(gre->flags);
+		if(flags & (GRE_cksum|GRE_routing)){
+			print("%V routing info present.  Discarding packet\n",
+				gre->src);
+			freeb(bp);
+			return;
+		}
+
+		if((flags & GRE_key) == 0){
+			/* Make room for the key */
+			if(bp->rp - bp->base < sizeof(uint32_t)){
+				print("%V can't add key\n", gre->src);
+				freeb(bp);
+				return;
+			}
+
+			/* slide the headers down 4 bytes; the gap after them holds the key */
+			bp->rp -= 4;
+			memmove(bp->rp, bp->rp + 4, sizeof(GREhdr));
+
+			gre = (GREhdr *)bp->rp;
+			hnputs(gre->flags, flags | GRE_key);
+		}
+
+		/* Add the key */
+		hnputl(bp->rp + sizeof(GREhdr), grec->ulkey);
+	}
+
+	/* never block here; lose the packet rather than wait for the ring lock */
+	if(!canqlock(&grec->lock)){
+		freeb(bp);
+		return;
+	}
+
+	if(grec->ulsusp)
+		addring(&grec->ulbuffered, bp);
+	else{
+		/*
+		 * account for the packet before sending it: bp must not
+		 * be touched once it has been handed to ipoput4.
+		 */
+		grepuout++;
+		grebuout += BLEN(bp);
+		ipoput4(c->p->f, bp, 0, gre->ttl - 1, gre->tos, nil);
+	}
+	qunlock(&grec->lock);
+}
+
+static void
+greiput(Proto *proto, Ipifc *ipifc, Block *bp)
+{
+	int len, hdrlen;
+	uint16_t eproto, flags;
 	uint8_t raddr[IPaddrlen];
+	Conv *c, **p;
+	GREconv *grec;
+	GREhdr *gre;
 	GREpriv *gpriv;
+	Ip4hdr *ip;
 
-	gpriv = gre->priv;
-	ghp = (GREhdr*)(bp->rp);
+	/*
+	 * We don't want to deal with block lists.  Ever.  The problem is
+	 * that when the block is forwarded, devether.c puts the block into
+	 * a queue that also uses ->next.  Just do not use ->next here!
+	 */
+	if(bp->next){
+		len = blocklen(bp);
+		bp  = pullupblock(bp, len);
+		assert(BLEN(bp) == len && bp->next == nil);
+	}
+
+	gre = (GREhdr *)bp->rp;
+	if(BLEN(bp) < sizeof(GREhdr) || gre->proto != IP_GREPROTO){
+		freeb(bp);
+		return;
+	}
+
+	v4tov6(raddr, gre->src);
+	eproto = nhgets(gre->eproto);
+	flags  = nhgets(gre->flags);
+	hdrlen = sizeof(GREhdr);
+
+	if(flags & GRE_cksum)
+		hdrlen += 2;
+	if(flags & GRE_routing){
+		print("%I routing info present.  Discarding packet\n", raddr);
+		freeb(bp);
+		return;
+	}
+	if(flags & (GRE_cksum|GRE_routing))
+		hdrlen += 2;			/* Offset field */
+	if(flags & GRE_key)
+		hdrlen += 4;
+	if(flags & GRE_seq)
+		hdrlen += 4;
+
+	if(BLEN(bp) - hdrlen < sizeof(Ip4hdr)){
+		print("greretunnel: packet too short (s=%V d=%V)\n",
+			gre->src, gre->dst);
+		freeb(bp);
+		return;
+	}
+	ip = (Ip4hdr *)(bp->rp + hdrlen);
+
+	qlock(proto);
+	/*
+	 * Look for a conversation structure for this port and address, or
+	 * match the retunnel part, or match on the raw flag.
+	 */
+	for(p = proto->conv; *p; p++) {
+		c = *p;
+
+		if(c->inuse == 0)
+			continue;
 
-	v4tov6(raddr, ghp->src);
-	eproto = nhgets(ghp->eproto);
-	qlock(gre);
+		/*
+		 * Do not stop this session - blocking here
+		 * implies that etherread is blocked.
+		 */
+		grec = c->ptcl;
+		if(memcmp(ip->dst, grec->hoa, sizeof ip->dst) == 0){
+			grepdin++;
+			grebdin += BLEN(bp);
+			gredownlink(c, bp);
+			qunlock(proto);
+			return;
+		}
+
+		if(memcmp(ip->src, grec->hoa, sizeof ip->src) == 0){
+			grepuin++;
+			grebuin += BLEN(bp);
+			greuplink(c, bp);
+			qunlock(proto);
+			return;
+		}
+	}
 
-	/* Look for a conversation structure for this port and address */
-	c = nil;
-	for(p = gre->conv; *p; p++) {
+	/*
+	 * when we get here, none of the forwarding tunnels matched.  now
+	 * try to match on raw and conversational sessions.
+	 */
+	for(c = nil, p = proto->conv; *p; p++) {
 		c = *p;
+
 		if(c->inuse == 0)
 			continue;
+
+		/*
+		 * Do not stop this session - blocking here
+		 * implies that etherread is blocked.
+		 */
+		grec = c->ptcl;
 		if(c->rport == eproto &&
-			(gpriv->raw || ipcmp(c->raddr, raddr) == 0))
+		    (grec->raw || ipcmp(c->raddr, raddr) == 0))
 			break;
 	}
 
-	if(*p == nil) {
-		qunlock(gre);
-		freeblist(bp);
+	qunlock(proto);
+
+	if(*p == nil){
+		freeb(bp);
 		return;
 	}
 
-	qunlock(gre);
-
 	/*
 	 * Trim the packet down to data size
 	 */
-	len = nhgets(ghp->len) - GRE_IPONLY;
+	len = nhgets(gre->len) - GRE_IPONLY;
 	if(len < GRE_IPPLUSGRE){
-		freeblist(bp);
+		freeb(bp);
 		return;
 	}
+
 	bp = trimblock(bp, GRE_IPONLY, len);
 	if(bp == nil){
+		gpriv = proto->priv;
 		gpriv->lenerr++;
 		return;
 	}
@@ -226,8 +666,8 @@ greiput(Proto *gre, Ipifc* i, Block *bp)
 	/*
 	 *  Can't delimit packet so pull it all into one block.
 	 */
-	if(qlen(c->rq) > 64*1024)
-		freeblist(bp);
+	if(qlen(c->rq) > GREqlen)
+		freeb(bp);
 	else{
 		bp = concatblock(bp);
 		if(bp == 0)
@@ -242,27 +682,279 @@ grestats(Proto *gre, char *buf, int len)
 	GREpriv *gpriv;
 
 	gpriv = gre->priv;
+	return snprint(buf, len,
+		"gre: %lud %lud %lud %lud %lud %lud %lud %lud, lenerrs %lud\n",
+		grepdin, grepdout, grepuin, grepuout,
+		grebdin, grebdout, grebuin, grebuout, gpriv->lenerr);
+}
+
+/*
+ * Ctl handler: mark conversation c as raw by setting raw in its
+ * GREconv (c->ptcl).  i and argv are not used.  Always returns nil.
+ */
+static char *
+grectlraw(Conv *c, int i, char **argv)
+{
+	GREconv *grec;
 
-	return snprint(buf, len, "gre: len %lud\n", gpriv->lenerr);
+	grec = c->ptcl;
+	grec->raw = 1;
+	return nil;
 }
 
-char*
-grectl(Conv *c, char **f, int n)
+/*
+ * Ctl handler: clear the raw flag in conversation c's GREconv
+ * (c->ptcl).  i and argv are not used.  Always returns nil.
+ */
+static char *
+grectlcooked(Conv *c, int i, char **argv)
 {
-	GREpriv *gpriv;
+	GREconv *grec;
+
+	grec = c->ptcl;
+	grec->raw = 0;
+	return nil;
+}
+
+/*
+ * Ctl handler that configures the tunnel addresses of conversation c.
+ * argv[1] through argv[4] are parsed with v4parseip and stored as the
+ * hoa, north, south and coa addresses, in that order.  Fails if an hoa
+ * is already set or if the new hoa parses to the null address.  On
+ * success ulsusp is set and dlsusp is cleared.
+ */
+static char *
+grectlretunnel(Conv *c, int i, char **argv)
+{
+	uint8_t v4[4];
+	GREconv *gc = c->ptcl;
+
+	if(memcmp(gc->hoa, nulladdr, sizeof gc->hoa) != 0)
+		return "tunnel already set up";
+
+	/* the hoa is only committed once it is known to be non-null */
+	v4parseip(v4, argv[1]);
+	if(memcmp(v4, nulladdr, sizeof v4) == 0)
+		return "bad hoa";
+	memmove(gc->hoa, v4, sizeof gc->hoa);
+
+	v4parseip(v4, argv[2]);
+	memmove(gc->north, v4, sizeof gc->north);
+
+	v4parseip(v4, argv[3]);
+	memmove(gc->south, v4, sizeof gc->south);
+
+	v4parseip(v4, argv[4]);
+	memmove(gc->coa, v4, sizeof gc->coa);
+
+	gc->dlsusp = 0;
+	gc->ulsusp = 1;
+	return nil;
+}
+
+/*
+ * Ctl handler: argv[1] is a sequence number.  Starting at the head of
+ * the dlpending ring, free every packet whose Metablock sequence number
+ * lies before it (signed 32-bit difference, so wraparound is tolerated)
+ * and stop at the first one that does not, or when the ring is empty.
+ * Always returns nil.
+ */
+static char *
+grectlreport(Conv *c, int i, char **argv)
+{
+	uint32_t seqno;
+	Block *pkt;
+	Bring *ring;
+	GREconv *gc;
+	Metablock *meta;
+
+	gc = c->ptcl;
+	seqno = strtoul(argv[1], nil, 0);
+	ring = &gc->dlpending;
+
+	qlock(&gc->lock);
+	for(;;){
+		if(ring->produced - ring->consumed <= 0)
+			break;
+
+		pkt = ring->ring[ring->consumed & Ringmask];
+		assert(pkt && pkt->rp - pkt->base >= sizeof(Metablock));
+
+		/* the Metablock sits at the very start of the block's buffer */
+		meta = (Metablock *)pkt->base;
+		if((int32_t)(seqno - meta->seq) <= 0)
+			break;
+
+		ring->ring[ring->consumed & Ringmask] = nil;
+		ring->consumed++;
+		freeb(pkt);
+	}
+	qunlock(&gc->lock);
+	return nil;
+}
+
+/*
+ * Ctl handler: set the dlsusp flag of conversation c.
+ * Returns "already suspended" if it was already set, nil otherwise.
+ */
+static char *
+grectldlsuspend(Conv *c, int i, char **argv)
+{
+	GREconv *gc = c->ptcl;
+
+	if(!gc->dlsusp){
+		gc->dlsusp = 1;
+		return nil;
+	}
+	return "already suspended";
+}
+
+/*
+ * Ctl handler: set the ulsusp flag of conversation c.
+ * Returns "already suspended" if it was already set, nil otherwise.
+ * i and argv are not used.
+ */
+static char *
+grectlulsuspend(Conv *c, int i, char **argv)
+{
+	GREconv *grec;
 
-	gpriv = c->p->priv;
-	if(n == 1){
-		if(strcmp(f[0], "raw") == 0){
-			gpriv->raw = 1;
-			return nil;
+	grec = c->ptcl;
+	if(grec->ulsusp)
+		return "already suspended";
+
+	grec->ulsusp = 1;
+	return nil;
+}
+
+/*
+ * Ctl handler: resume a suspended downlink.  Every packet waiting on
+ * the dlbuffered ring is sent with ipoput4 and then appended to the
+ * dlpending ring; once the ring is drained dlsusp is cleared.
+ * Returns "not suspended" if dlsusp was not set, nil otherwise.
+ */
+static char *
+grectldlresume(Conv *c, int i, char **argv)
+{
+	GREconv *grec;
+	GREhdr *gre;
+	Block *bp;
+
+	grec = c->ptcl;
+
+	qlock(&grec->lock);
+	if(!grec->dlsusp){
+		qunlock(&grec->lock);
+		return "not suspended";
+	}
+
+	while((bp = getring(&grec->dlbuffered)) != nil){
+		gre = (GREhdr *)bp->rp;
+		/* grec->lock is dropped for the duration of the ipoput4 call */
+		qunlock(&grec->lock);
+
+		/*
+		 * Make sure the packet does not go away: take a second
+		 * reference before handing it to ipoput4, so that it can
+		 * still be queued on dlpending below.
+		 */
+		//_xinc(&bp->ref);
+		ainc(&bp->ref);
+		assert(bp->ref == 2);
+
+		ipoput4(c->p->f, bp, 0, gre->ttl - 1, gre->tos, nil);
+
+		qlock(&grec->lock);
+		addring(&grec->dlpending, bp);
+	}
+	grec->dlsusp = 0;
+	qunlock(&grec->lock);
+	return nil;
+}
+
+/*
+ * Ctl handler: send every packet waiting on the ulbuffered ring with
+ * ipoput4, then clear ulsusp.  grec->lock is released around each
+ * ipoput4 call and retaken before the ring is touched again.
+ * Always returns nil.
+ */
+static char *
+grectlulresume(Conv *c, int i, char **argv)
+{
+	Block *pkt;
+	GREhdr *hdr;
+	GREconv *gc = c->ptcl;
+
+	qlock(&gc->lock);
+	for(;;){
+		pkt = getring(&gc->ulbuffered);
+		if(pkt == nil)
+			break;
+		hdr = (GREhdr *)pkt->rp;
+
+		qunlock(&gc->lock);
+		ipoput4(c->p->f, pkt, 0, hdr->ttl - 1, hdr->tos, nil);
+		qlock(&gc->lock);
+	}
+	gc->ulsusp = 0;
+	qunlock(&gc->lock);
+	return nil;
+}
+
+/*
+ * Ctl handler: point the tunnel at a new address and flush everything
+ * that is queued towards it.  argv[1] is parsed into south and copied
+ * to north.  If the downlink is suspended, both suspend flags are
+ * cleared and the dlpending, dlbuffered and ulbuffered rings are drained
+ * in that order: each packet gets src = coa and dst = south and is sent
+ * with ipoput4, with grec->lock released around the call.
+ * Returns "not suspended" if dlsusp was not set, nil otherwise.
+ */
+static char *
+grectlforward(Conv *c, int i, char **argv)
+{
+	int len;
+	Block *bp, *nbp;
+	GREconv *grec;
+	GREhdr *gre;
+	Metablock *m;
+
+	grec = c->ptcl;
+
+	/*
+	 * NOTE(review): south/north are overwritten before the suspended
+	 * check and before grec->lock is taken, so they change even when
+	 * "not suspended" is returned -- confirm this is intended.
+	 */
+	v4parseip(grec->south, argv[1]);
+	memmove(grec->north, grec->south, sizeof grec->north);
+
+	qlock(&grec->lock);
+	if(!grec->dlsusp){
+		qunlock(&grec->lock);
+		return "not suspended";
+	}
+	grec->dlsusp = 0;
+	grec->ulsusp = 0;
+
+	/* packets already sent once: restart from the rp saved in the Metablock */
+	while((bp = getring(&grec->dlpending)) != nil){
+
+		assert(bp->rp - bp->base >= sizeof(Metablock));
+		m = (Metablock *)bp->base;
+		assert(m->rp >= bp->base && m->rp < bp->lim);
+
+		/*
+		 * If the packet is still held inside the IP transmit
+		 * system, make a copy of the packet first.
+		 */
+		if(bp->ref > 1){
+			len = bp->wp - m->rp;
+			nbp = allocb(len);
+			memmove(nbp->wp, m->rp, len);
+			nbp->wp += len;
+			freeb(bp);
+			bp  = nbp;
 		}
-		else if(strcmp(f[0], "cooked") == 0){
-			gpriv->raw = 0;
-			return nil;
+		else{
+			/* Patch up rp */
+			bp->rp = m->rp;
 		}
+
+		gre = (GREhdr *)bp->rp;
+		/* NOTE(review): sizeof gre->dst is used for the src copy here and below */
+		memmove(gre->src, grec->coa, sizeof gre->dst);
+		memmove(gre->dst, grec->south, sizeof gre->dst);
+
+		qunlock(&grec->lock);
+		ipoput4(c->p->f, bp, 0, gre->ttl - 1, gre->tos, nil);
+		qlock(&grec->lock);
+	}
+
+	/* downlink packets that were buffered while suspended */
+	while((bp = getring(&grec->dlbuffered)) != nil){
+		gre = (GREhdr *)bp->rp;
+		memmove(gre->src, grec->coa, sizeof gre->dst);
+		memmove(gre->dst, grec->south, sizeof gre->dst);
+
+		qunlock(&grec->lock);
+		ipoput4(c->p->f, bp, 0, gre->ttl - 1, gre->tos, nil);
+		qlock(&grec->lock);
 	}
-	return "unknown control request";
+
+	/* uplink packets that were buffered while suspended */
+	while((bp = getring(&grec->ulbuffered)) != nil){
+		gre = (GREhdr *)bp->rp;
+
+		memmove(gre->src, grec->coa, sizeof gre->dst);
+		memmove(gre->dst, grec->south, sizeof gre->dst);
+
+		qunlock(&grec->lock);
+		ipoput4(c->p->f, bp, 0, gre->ttl - 1, gre->tos, nil);
+		qlock(&grec->lock);
+	}
+	qunlock(&grec->lock);
+	return nil;
+}
+
+/*
+ * Ctl handler: store argv[1], converted with strtoul (base chosen by
+ * its prefix), as the ulkey of conversation c.  Always returns nil.
+ */
+static char *
+grectlulkey(Conv *c, int i, char **argv)
+{
+	GREconv *gc = c->ptcl;
+
+	gc->ulkey = strtoul(argv[1], nil, 0);
+	return nil;
+}
+
+/*
+ * Dispatch a ctl request for conversation c.  f[0] names the command
+ * and n counts the fields in f, the command included.  The first
+ * grectls entry whose cmd equals f[0] is used; an entry with a
+ * non-zero argc requires n to equal it.  Returns the handler's result
+ * or an error string.
+ */
+char *
+grectl(Conv *c, char **f, int n)
+{
+	int idx;
+
+	if(n < 1)
+		return "too few arguments";
+
+	for(idx = 0; idx < Ncmds; idx++){
+		if(strcmp(f[0], grectls[idx].cmd) != 0)
+			continue;
+
+		/* argc == 0 in the table means "any number of fields" */
+		if(grectls[idx].argc != 0 && grectls[idx].argc != n)
+			return "incorrect number of arguments";
+		return grectls[idx].f(c, n, f);
+	}
+	return "no such command";
 }
 
 void
@@ -284,7 +976,7 @@ greinit(Fs *fs)
 	gre->stats = grestats;
 	gre->ipproto = IP_GREPROTO;
 	gre->nc = 64;
-	gre->ptclsize = 0;
+	gre->ptclsize = sizeof(GREconv);
 
 	Fsproto(fs, gre);
 }

+ 13 - 13
sys/src/9/ip/icmp.c

@@ -47,8 +47,8 @@ enum {			/* Packet Types */
 	TimestampReply	= 14,
 	InfoRequest	= 15,
 	InfoReply	= 16,
-	AddrMaskRequest	= 17,
-	AddrMaskReply	= 18,
+	AddrMaskRequest = 17,
+	AddrMaskReply   = 18,
 
 	Maxtype		= 18,
 };
@@ -72,7 +72,7 @@ char *icmpnames[Maxtype+1] =
 [InfoRequest]		"InfoRequest",
 [InfoReply]		"InfoReply",
 [AddrMaskRequest]	"AddrMaskRequest",
-[AddrMaskReply]		"AddrMaskReply",
+[AddrMaskReply  ]	"AddrMaskReply  ",
 };
 
 enum {
@@ -311,7 +311,7 @@ mkechoreply(Block *bp)
 	q->vihl = IP_VER4;
 	memmove(ip, q->src, sizeof(q->dst));
 	memmove(q->src, q->dst, sizeof(q->src));
-	memmove(q->dst, ip, sizeof(q->dst));
+	memmove(q->dst, ip,  sizeof(q->dst));
 	q->type = EchoReply;
 	memset(q->cksum, 0, sizeof(q->cksum));
 	hnputs(q->cksum, ptclcsum(bp, ICMP_IPSIZE, blocklen(bp) - ICMP_IPSIZE));
@@ -330,7 +330,7 @@ static char *unreachcode[] =
 };
 
 static void
-icmpiput(Proto *icmp, Ipifc* i, Block *bp)
+icmpiput(Proto *icmp, Ipifc *ipifc, Block *bp)
 {
 	int	n, iplen;
 	Icmp	*p;
@@ -345,7 +345,9 @@ icmpiput(Proto *icmp, Ipifc* i, Block *bp)
 	ipriv->stats[InMsgs]++;
 
 	p = (Icmp *)bp->rp;
-	netlog(icmp->f, Logicmp, "icmpiput %d %d\n", p->type, p->code);
+	netlog(icmp->f, Logicmp, "icmpiput %s (%d) %d\n",
+		(p->type < nelem(icmpnames)? icmpnames[p->type]: ""),
+		p->type, p->code);
 	n = blocklen(bp);
 	if(n < ICMP_IPSIZE+ICMP_HDRSIZE){
 		ipriv->stats[InErrors]++;
@@ -354,18 +356,16 @@ icmpiput(Proto *icmp, Ipifc* i, Block *bp)
 		goto raise;
 	}
 	iplen = nhgets(p->length);
-	if(iplen > n || (iplen % 1)){
+	if(iplen > n){
 		ipriv->stats[LenErrs]++;
 		ipriv->stats[InErrors]++;
-		netlog(icmp->f, Logicmp, "icmp length error n %d iplen %d\n",
-			n, iplen);
+		netlog(icmp->f, Logicmp, "icmp length %d\n", iplen);
 		goto raise;
 	}
 	if(ptclcsum(bp, ICMP_IPSIZE, iplen - ICMP_IPSIZE)){
 		ipriv->stats[InErrors]++;
 		ipriv->stats[CsumErrs]++;
-		netlog(icmp->f, Logicmp, "icmp checksum error n %d iplen %d\n",
-			n, iplen);
+		netlog(icmp->f, Logicmp, "icmp checksum error\n");
 		goto raise;
 	}
 	if(p->type <= Maxtype)
@@ -375,7 +375,7 @@ icmpiput(Proto *icmp, Ipifc* i, Block *bp)
 	case EchoRequest:
 		if (iplen < n)
 			bp = trimblock(bp, 0, iplen);
-		r = mkechoreply(bp);
+		r = mkechoreply(concatblock(bp));
 		ipriv->out[EchoReply]++;
 		ipoput4(icmp->f, r, 0, MAXTTL, DFLTTOS, nil);
 		break;
@@ -402,7 +402,7 @@ icmpiput(Proto *icmp, Ipifc* i, Block *bp)
 		break;
 	case TimeExceed:
 		if(p->code == 0){
-			sprint(m2, "ttl exceeded at %V", p->src);
+			snprint(m2, sizeof m2, "ttl exceeded at %V", p->src);
 
 			bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE;
 			if(blocklen(bp) < MinAdvise){

+ 177 - 196
sys/src/9/ip/icmp6.c

@@ -76,39 +76,47 @@ enum {
 	Maxtype6	= 137,
 };
 
-typedef struct ICMPpkt ICMPpkt;
+/* on-the-wire packet formats */
 typedef struct IPICMP IPICMP;
 typedef struct Ndpkt Ndpkt;
 typedef struct NdiscC NdiscC;
 
-struct ICMPpkt {
-	uint8_t	type;
-	uint8_t	code;
-	uint8_t	cksum[2];
-	uint8_t	icmpid[2];
-	uint8_t	seq[2];
-};
+/* we do this to avoid possible struct padding  */
+#define ICMPHDR \
+	IPV6HDR; \
+	uint8_t	type; \
+	uint8_t	code; \
+	uint8_t	cksum[2]; \
+	uint8_t	icmpid[2]; \
+	uint8_t	seq[2]
 
 struct IPICMP {
-	Ip6hdr;
-	ICMPpkt;
+	ICMPHDR;
+	uint8_t	payload[];
 };
 
-struct NdiscC
-{
-	IPICMP;
+#define IPICMPSZ offsetof(IPICMP, payload[0])
+
+struct NdiscC {
+	ICMPHDR;
 	uint8_t	target[IPaddrlen];
+	uint8_t	payload[];
 };
 
-struct Ndpkt
-{
-	NdiscC;
+#define NDISCSZ offsetof(NdiscC, payload[0])
+
+struct Ndpkt {
+	ICMPHDR;
+	uint8_t	target[IPaddrlen];
 	uint8_t	otype;
 	uint8_t	olen;		/* length in units of 8 octets(incl type, code),
 				 * 1 for IEEE 802 addresses */
 	uint8_t	lnaddr[6];	/* link-layer address */
+	uint8_t	payload[];
 };
 
+#define NDPKTSZ offsetof(Ndpkt, payload[0])
+
 typedef struct Icmppriv6
 {
 	uint32_t	stats[Nstats6];
@@ -121,7 +129,7 @@ typedef struct Icmppriv6
 typedef struct Icmpcb6
 {
 	QLock;
-	unsigned char	headers;
+	uint8_t	headers;
 } Icmpcb6;
 
 char *icmpnames6[Maxtype6+1] =
@@ -168,12 +176,14 @@ static char *statnames6[Nstats6] =
 
 static char *unreachcode[] =
 {
-[icmp6_no_route]	"no route to destination",
-[icmp6_ad_prohib]	"comm with destination administratively prohibited",
-[icmp6_unassigned]	"icmp unreachable: unassigned error code (2)",
-[icmp6_adr_unreach]	"address unreachable",
-[icmp6_port_unreach]	"port unreachable",
-[icmp6_unkn_code]	"icmp unreachable: unknown code",
+[Icmp6_no_route]	"no route to destination",
+[Icmp6_ad_prohib]	"comm with destination administratively prohibited",
+[Icmp6_out_src_scope]	"beyond scope of source address",
+[Icmp6_adr_unreach]	"address unreachable",
+[Icmp6_port_unreach]	"port unreachable",
+[Icmp6_gress_src_fail]	"source address failed ingress/egress policy",
+[Icmp6_rej_route]	"reject route to destination",
+[Icmp6_unknown]		"icmp unreachable: unknown code",
 };
 
 static void icmpkick6(void *x, Block *bp);
@@ -191,7 +201,7 @@ set_cksum(Block *bp)
 	IPICMP *p = (IPICMP *)(bp->rp);
 
 	hnputl(p->vcf, 0);  	/* borrow IP header as pseudoheader */
-	hnputs(p->ploadlen, blocklen(bp)-IPV6HDR_LEN);
+	hnputs(p->ploadlen, blocklen(bp) - IP6HDR);
 	p->proto = 0;
 	p->ttl = ICMPv6;	/* ttl gets set later */
 	hnputs(p->cksum, 0);
@@ -253,10 +263,10 @@ icmpkick6(void *x, Block *bp)
 		bp->rp += IPaddrlen;
 		ipmove(raddr, bp->rp);
 		bp->rp += IPaddrlen;
-		bp = padblock(bp, sizeof(Ip6hdr));
+		bp = padblock(bp, IP6HDR);
 	}
 
-	if(blocklen(bp) < sizeof(IPICMP)){
+	if(blocklen(bp) < IPICMPSZ){
 		freeblist(bp);
 		return;
 	}
@@ -343,15 +353,14 @@ mkechoreply6(Block *bp, Ipifc *ifc)
  * 	and tuni == TARG_UNI => neighbor reachability.
  */
 extern void
-icmpns(Fs *f, uint8_t* src, int suni, uint8_t* targ, int tuni,
-       uint8_t* mac)
+icmpns(Fs *f, uint8_t* src, int suni, uint8_t* targ, int tuni, uint8_t* mac)
 {
 	Block *nbp;
 	Ndpkt *np;
 	Proto *icmp = f->t2p[ICMPv6];
 	Icmppriv6 *ipriv = icmp->priv;
 
-	nbp = newIPICMP(sizeof(Ndpkt));
+	nbp = newIPICMP(NDPKTSZ);
 	np = (Ndpkt*) nbp->rp;
 
 	if(suni == SRC_UNSPEC)
@@ -372,7 +381,7 @@ icmpns(Fs *f, uint8_t* src, int suni, uint8_t* targ, int tuni,
 		np->olen = 1;		/* 1+1+6 = 8 = 1 8-octet */
 		memmove(np->lnaddr, mac, sizeof(np->lnaddr));
 	} else
-		nbp->wp -= sizeof(Ndpkt) - sizeof(NdiscC);
+		nbp->wp -= NDPKTSZ - NDISCSZ;
 
 	set_cksum(nbp);
 	np = (Ndpkt*)nbp->rp;
@@ -387,15 +396,14 @@ icmpns(Fs *f, uint8_t* src, int suni, uint8_t* targ, int tuni,
  * sends out an ICMPv6 neighbor advertisement. pktflags == RSO flags.
  */
 extern void
-icmpna(Fs *f, uint8_t* src, uint8_t* dst, uint8_t* targ, uint8_t* mac,
-       uint8_t flags)
+icmpna(Fs *f, uint8_t* src, uint8_t* dst, uint8_t* targ, uint8_t* mac, uint8_t flags)
 {
 	Block *nbp;
 	Ndpkt *np;
 	Proto *icmp = f->t2p[ICMPv6];
 	Icmppriv6 *ipriv = icmp->priv;
 
-	nbp = newIPICMP(sizeof(Ndpkt));
+	nbp = newIPICMP(NDPKTSZ);
 	np = (Ndpkt*)nbp->rp;
 
 	memmove(np->src, src, IPaddrlen);
@@ -419,43 +427,41 @@ icmpna(Fs *f, uint8_t* src, uint8_t* dst, uint8_t* targ, uint8_t* mac,
 	ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
 }
 
+/* if free is true, freeblist(bp) before return. */
 extern void
 icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free)
 {
-	int osz = BLEN(bp);
-	int sz = MIN(sizeof(IPICMP) + osz, v6MINTU);
+	int osz, sz;
 	Block *nbp;
 	IPICMP *np;
+	Icmppriv6 *ipriv;
 	Ip6hdr *p;
-	Proto *icmp = f->t2p[ICMPv6];
-	Icmppriv6 *ipriv = icmp->priv;
+	Proto *icmp;
 
+	osz = BLEN(bp);
+	sz = MIN(IPICMPSZ + osz, v6MINTU);
+	icmp = f->t2p[ICMPv6];
+	ipriv = icmp->priv;
 	p = (Ip6hdr *)bp->rp;
-
 	if(isv6mcast(p->src))
-		goto clean;
-
+		goto freebl;
 	nbp = newIPICMP(sz);
 	np = (IPICMP *)nbp->rp;
 
 	rlock(ifc);
-	if(ipv6anylocal(ifc, np->src))
-		netlog(f, Logicmp, "send icmphostunr -> s%I d%I\n",
-			p->src, p->dst);
-	else {
-		netlog(f, Logicmp, "icmphostunr fail -> s%I d%I\n",
+	if(!ipv6anylocal(ifc, np->src)){
+		netlog(f, Logicmp, "icmphostunr fail -> src %I dst %I\n",
 			p->src, p->dst);
+		runlock(ifc);
 		freeblist(nbp);
-		if(free)
-			goto clean;
-		else
-			return;
+		goto freebl;
 	}
 
+	netlog(f, Logicmp, "send icmphostunr -> src %I dst %I\n", p->src, p->dst);
 	memmove(np->dst, p->src, IPaddrlen);
 	np->type = UnreachableV6;
 	np->code = code;
-	memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP));
+	memmove(nbp->rp + IPICMPSZ, bp->rp, sz - IPICMPSZ);
 	set_cksum(nbp);
 	np->ttl = HOP_LIMIT;
 	np->vcf[0] = 0x06 << 4;
@@ -463,21 +469,19 @@ icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free)
 
 	if(free)
 		ipiput6(f, ifc, nbp);
-	else {
+	else
 		ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil);
-		return;
-	}
-
-clean:
 	runlock(ifc);
-	freeblist(bp);
+freebl:
+	if(free)
+		freeblist(bp);
 }
 
 extern void
 icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp)
 {
 	int osz = BLEN(bp);
-	int sz = MIN(sizeof(IPICMP) + osz, v6MINTU);
+	int sz = MIN(IPICMPSZ + osz, v6MINTU);
 	Block *nbp;
 	IPICMP *np;
 	Ip6hdr *p;
@@ -485,18 +489,16 @@ icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp)
 	Icmppriv6 *ipriv = icmp->priv;
 
 	p = (Ip6hdr *)bp->rp;
-
 	if(isv6mcast(p->src))
 		return;
 
 	nbp = newIPICMP(sz);
 	np = (IPICMP *) nbp->rp;
-
 	if(ipv6anylocal(ifc, np->src))
-		netlog(f, Logicmp, "send icmpttlexceeded6 -> s%I d%I\n",
+		netlog(f, Logicmp, "send icmpttlexceeded6 -> src %I dst %I\n",
 			p->src, p->dst);
 	else {
-		netlog(f, Logicmp, "icmpttlexceeded6 fail -> s%I d%I\n",
+		netlog(f, Logicmp, "icmpttlexceeded6 fail -> src %I dst %I\n",
 			p->src, p->dst);
 		return;
 	}
@@ -504,7 +506,7 @@ icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp)
 	memmove(np->dst, p->src, IPaddrlen);
 	np->type = TimeExceedV6;
 	np->code = 0;
-	memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP));
+	memmove(nbp->rp + IPICMPSZ, bp->rp, sz - IPICMPSZ);
 	set_cksum(nbp);
 	np->ttl = HOP_LIMIT;
 	np->vcf[0] = 0x06 << 4;
@@ -516,7 +518,7 @@ extern void
 icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp)
 {
 	int osz = BLEN(bp);
-	int sz = MIN(sizeof(IPICMP) + osz, v6MINTU);
+	int sz = MIN(IPICMPSZ + osz, v6MINTU);
 	Block *nbp;
 	IPICMP *np;
 	Ip6hdr *p;
@@ -524,18 +526,16 @@ icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp)
 	Icmppriv6 *ipriv = icmp->priv;
 
 	p = (Ip6hdr *)bp->rp;
-
 	if(isv6mcast(p->src))
 		return;
 
 	nbp = newIPICMP(sz);
 	np = (IPICMP *)nbp->rp;
-
 	if(ipv6anylocal(ifc, np->src))
-		netlog(f, Logicmp, "send icmppkttoobig6 -> s%I d%I\n",
+		netlog(f, Logicmp, "send icmppkttoobig6 -> src %I dst %I\n",
 			p->src, p->dst);
 	else {
-		netlog(f, Logicmp, "icmppkttoobig6 fail -> s%I d%I\n",
+		netlog(f, Logicmp, "icmppkttoobig6 fail -> src %I dst %I\n",
 			p->src, p->dst);
 		return;
 	}
@@ -544,7 +544,7 @@ icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp)
 	np->type = PacketTooBigV6;
 	np->code = 0;
 	hnputl(np->icmpid, ifc->maxtu - ifc->medium->hsize);
-	memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP));
+	memmove(nbp->rp + IPICMPSZ, bp->rp, sz - IPICMPSZ);
 	set_cksum(nbp);
 	np->ttl = HOP_LIMIT;
 	np->vcf[0] = 0x06 << 4;
@@ -556,24 +556,25 @@ icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp)
  * RFC 2461, pages 39-40, pages 57-58.
  */
 static int
-valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv)
+valid(Proto *icmp, Ipifc *ipifc, Block *bp, Icmppriv6 *ipriv)
 {
-	int sz, osz, unsp, n, ttl, iplen;
-	int pktsz = BLEN(bp);
-	uint8_t *packet = bp->rp;
-	IPICMP *p = (IPICMP *) packet;
+	int sz, osz, unsp, n, ttl, iplen, pktsz;
+	uint8_t *packet;
+	IPICMP *p;
 	Ndpkt *np;
 
-	USED(ifc);
 	n = blocklen(bp);
-	if(n < sizeof(IPICMP)) {
+	if(n < IPICMPSZ) {
 		ipriv->stats[HlenErrs6]++;
 		netlog(icmp->f, Logicmp, "icmp hlen %d\n", n);
 		goto err;
 	}
 
+	packet = bp->rp;
+	p = (IPICMP *)packet;
+	pktsz = BLEN(bp);
 	iplen = nhgets(p->ploadlen);
-	if(iplen > n-IPV6HDR_LEN || (iplen % 1)) {
+	if(iplen > n - IP6HDR) {
 		ipriv->stats[LenErrs6]++;
 		netlog(icmp->f, Logicmp, "icmp length %d\n", iplen);
 		goto err;
@@ -589,7 +590,7 @@ valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv)
 	ttl = p->ttl;
 	p->ttl = p->proto;
 	p->proto = 0;
-	if(ptclcsum(bp, 0, iplen + IPV6HDR_LEN)) {
+	if(ptclcsum(bp, 0, iplen + IP6HDR)) {
 		ipriv->stats[CsumErrs6]++;
 		netlog(icmp->f, Logicmp, "icmp checksum error\n");
 		goto err;
@@ -598,91 +599,78 @@ valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv)
 	p->ttl = ttl;
 
 	/* additional tests for some pkt types */
-	if (p->type == NbrSolicit   || p->type == NbrAdvert ||
-	    p->type == RouterAdvert || p->type == RouterSolicit ||
-	    p->type == RedirectV6) {
-		if(p->ttl != HOP_LIMIT) {
-			ipriv->stats[HoplimErrs6]++;
+	if (p->type != NbrSolicit   && p->type != NbrAdvert &&
+	    p->type != RouterAdvert && p->type != RouterSolicit &&
+	    p->type != RedirectV6)
+		return 1;	/* TODO: unknown, presumed valid; why? */
+	if(p->ttl != HOP_LIMIT) {
+		ipriv->stats[HoplimErrs6]++;
+		goto err;
+	}
+	if(p->code != 0) {
+		ipriv->stats[IcmpCodeErrs6]++;
+		goto err;
+	}
+
+	switch (p->type) {
+	case NbrSolicit:
+	case NbrAdvert:
+		np = (Ndpkt*) p;
+		if(isv6mcast(np->target)) {
+			ipriv->stats[TargetErrs6]++;
 			goto err;
 		}
-		if(p->code != 0) {
-			ipriv->stats[IcmpCodeErrs6]++;
+		if(optexsts(np) && np->olen == 0) {
+			ipriv->stats[OptlenErrs6]++;
 			goto err;
 		}
-
-		switch (p->type) {
-		case NbrSolicit:
-		case NbrAdvert:
-			np = (Ndpkt*) p;
-			if(isv6mcast(np->target)) {
-				ipriv->stats[TargetErrs6]++;
+		if (p->type == NbrSolicit && ipcmp(np->src, v6Unspecified) == 0)
+			if(!issmcast(np->dst) || optexsts(np)) {
+				ipriv->stats[AddrmxpErrs6]++;
 				goto err;
 			}
-			if(optexsts(np) && np->olen == 0) {
+		if(p->type == NbrAdvert && isv6mcast(np->dst) &&
+		    nhgets(np->icmpid) & Sflag){
+			ipriv->stats[AddrmxpErrs6]++;
+			goto err;
+		}
+		break;
+	case RouterAdvert:
+		if(pktsz - IP6HDR < 16) {
+			ipriv->stats[HlenErrs6]++;
+			goto err;
+		}
+		if(!islinklocal(p->src)) {
+			ipriv->stats[RouterAddrErrs6]++;
+			goto err;
+		}
+		for (sz = IPICMPSZ + 8; sz+1 < pktsz; sz += 8*osz) {
+			osz = packet[sz+1];
+			if(osz <= 0) {
 				ipriv->stats[OptlenErrs6]++;
 				goto err;
 			}
-
-			if (p->type == NbrSolicit &&
-			    ipcmp(np->src, v6Unspecified) == 0)
-				if(!issmcast(np->dst) || optexsts(np)) {
-					ipriv->stats[AddrmxpErrs6]++;
-					goto err;
-				}
-
-			if(p->type == NbrAdvert)
-				if(isv6mcast(np->dst) &&
-				    (nhgets(np->icmpid) & Sflag)){
-					ipriv->stats[AddrmxpErrs6]++;
-					goto err;
-				}
-			break;
-
-		case RouterAdvert:
-			if(pktsz - sizeof(Ip6hdr) < 16) {
-				ipriv->stats[HlenErrs6]++;
-				goto err;
-			}
-			if(!islinklocal(p->src)) {
-				ipriv->stats[RouterAddrErrs6]++;
-				goto err;
-			}
-			sz = sizeof(IPICMP) + 8;
-			while (sz+1 < pktsz) {
-				osz = packet[sz+1];
-				if(osz <= 0) {
-					ipriv->stats[OptlenErrs6]++;
-					goto err;
-				}
-				sz += 8*osz;
-			}
-			break;
-
-		case RouterSolicit:
-			if(pktsz - sizeof(Ip6hdr) < 8) {
-				ipriv->stats[HlenErrs6]++;
+		}
+		break;
+	case RouterSolicit:
+		if(pktsz - IP6HDR < 8) {
+			ipriv->stats[HlenErrs6]++;
+			goto err;
+		}
+		unsp = (ipcmp(p->src, v6Unspecified) == 0);
+		for (sz = IPICMPSZ + 8; sz+1 < pktsz; sz += 8*osz) {
+			osz = packet[sz+1];
+			if(osz <= 0 || (unsp && packet[sz] == SRC_LLADDR)) {
+				ipriv->stats[OptlenErrs6]++;
 				goto err;
 			}
-			unsp = (ipcmp(p->src, v6Unspecified) == 0);
-			sz = sizeof(IPICMP) + 8;
-			while (sz+1 < pktsz) {
-				osz = packet[sz+1];
-				if(osz <= 0 ||
-				    (unsp && packet[sz] == SRC_LLADDR)) {
-					ipriv->stats[OptlenErrs6]++;
-					goto err;
-				}
-				sz += 8*osz;
-			}
-			break;
-
-		case RedirectV6:
-			/* to be filled in */
-			break;
-
-		default:
-			goto err;
 		}
+		break;
+	case RedirectV6:
+		/* TODO: fill in */
+		break;
+	default:
+		goto err;
 	}
 	return 1;
 err:
@@ -713,42 +701,46 @@ targettype(Fs *f, Ipifc *ifc, uint8_t *target)
 	return 0;
 }
 
+/* bp needs to be freed with freeblist or passed on. */
 static void
 icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp)
 {
-	int refresh = 1;
+	int type;
 	char *msg, m2[128];
 	uint8_t pktflags;
-	uint8_t *packet = bp->rp;
+	uint8_t *packet, *src;
 	uint8_t lsrc[IPaddrlen];
 	Block *r;
-	IPICMP *p = (IPICMP *)packet;
-	Icmppriv6 *ipriv = icmp->priv;
+	IPICMP *p;
+	Icmppriv6 *ipriv;
 	Iplifc *lifc;
 	Ndpkt* np;
 	Proto *pr;
 
-	if(!valid(icmp, ipifc, bp, ipriv) || p->type > Maxtype6)
+	packet = bp->rp;
+	p = (IPICMP *)packet;
+	type = p->type;
+	ipriv = icmp->priv;
+	if(!valid(icmp, ipifc, bp, ipriv) || type > Maxtype6)
 		goto raise;
 
-	ipriv->in[p->type]++;
-
-	switch(p->type) {
+	ipriv->in[type]++;
+	switch(type) {
 	case EchoRequestV6:
+		bp = concatblock(bp);
 		r = mkechoreply6(bp, ipifc);
 		if(r == nil)
 			goto raise;
 		ipriv->out[EchoReply]++;
 		ipoput6(icmp->f, r, 0, MAXTTL, DFLTTOS, nil);
 		break;
-
 	case UnreachableV6:
-		if(p->code > 4)
-			msg = unreachcode[icmp6_unkn_code];
+		if(p->code >= nelem(unreachcode))
+			msg = unreachcode[Icmp6_unknown];
 		else
 			msg = unreachcode[p->code];
 
-		bp->rp += sizeof(IPICMP);
+		bp->rp += IPICMPSZ;
 		if(blocklen(bp) < 8){
 			ipriv->stats[LenErrs6]++;
 			goto raise;
@@ -760,15 +752,13 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp)
 			return;
 		}
 
-		bp->rp -= sizeof(IPICMP);
+		bp->rp -= IPICMPSZ;
 		goticmpkt6(icmp, bp, 0);
 		break;
-
 	case TimeExceedV6:
 		if(p->code == 0){
-			sprint(m2, "ttl exceeded at %I", p->src);
-
-			bp->rp += sizeof(IPICMP);
+			snprint(m2, sizeof m2, "ttl exceeded at %I", p->src);
+			bp->rp += IPICMPSZ;
 			if(blocklen(bp) < 8){
 				ipriv->stats[LenErrs6]++;
 				goto raise;
@@ -779,12 +769,10 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp)
 				(*pr->advise)(pr, bp, m2);
 				return;
 			}
-			bp->rp -= sizeof(IPICMP);
+			bp->rp -= IPICMPSZ;
 		}
-
 		goticmpkt6(icmp, bp, 0);
 		break;
-
 	case RouterAdvert:
 	case RouterSolicit:
 		/* using lsrc as a temp, munge hdr for goticmp6 */
@@ -793,43 +781,39 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp)
 			memmove(p->src, p->dst, IPaddrlen);
 			memmove(p->dst, lsrc, IPaddrlen);
 		}
-		goticmpkt6(icmp, bp, p->type);
+		goticmpkt6(icmp, bp, type);
 		break;
-
 	case NbrSolicit:
-		np = (Ndpkt*) p;
+		np = (Ndpkt*)p;			/* within bp */
 		pktflags = 0;
 		switch (targettype(icmp->f, ipifc, np->target)) {
 		case Tunirany:
 			pktflags |= Oflag;
 			/* fall through */
-
 		case Tuniproxy:
 			if(ipcmp(np->src, v6Unspecified) != 0) {
 				arpenter(icmp->f, V6, np->src, np->lnaddr,
 					8*np->olen-2, 0);
 				pktflags |= Sflag;
 			}
-			if(ipv6local(ipifc, lsrc))
-				icmpna(icmp->f, lsrc,
-					(ipcmp(np->src, v6Unspecified) == 0?
-						v6allnodesL: np->src),
-					np->target, ipifc->mac, pktflags);
-			else
-				freeblist(bp);
+			if(ipv6local(ipifc, lsrc)) {
+				src = np->src;
+				if(ipcmp(src, v6Unspecified) == 0)
+					src = v6allnodesL;
+				icmpna(icmp->f, lsrc, src, np->target,
+					ipifc->mac, pktflags);
+			}
 			break;
-
 		case Tunitent:
-			/* not clear what needs to be done. send up
-			 * an icmp mesg saying don't use this address? */
-		default:
-			freeblist(bp);
+			/*
+			 * not clear what needs to be done.  send up
+			 * an icmp mesg saying `don't use this address'?
+			 */
+			break;
 		}
+		freeblist(bp);
 		break;
-
 	case NbrAdvert:
-		np = (Ndpkt*) p;
-
 		/*
 		 * if the target address matches one of the local interface
 		 * addresses and the local interface address has tentative bit
@@ -837,20 +821,19 @@ icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp)
 		 * detection part of ipconfig can discover duplication through
 		 * the arp table.
 		 */
+		np = (Ndpkt*)p;			/* within bp */
 		lifc = iplocalonifc(ipifc, np->target);
-		if(lifc && lifc->tentative)
-			refresh = 0;
 		arpenter(icmp->f, V6, np->target, np->lnaddr, 8*np->olen-2,
-			refresh);
+			lifc && lifc->tentative);
 		freeblist(bp);
 		break;
-
 	case PacketTooBigV6:
 	default:
 		goticmpkt6(icmp, bp, 0);
 		break;
 	}
 	return;
+
 raise:
 	freeblist(bp);
 }
@@ -871,14 +854,12 @@ icmpstats6(Proto *icmp6, char *buf, int len)
 		if(icmpnames6[i])
 			p = seprint(p, e, "%s: %lud %lud\n", icmpnames6[i],
 				priv->in[i], priv->out[i]);
-/*		else
+		else if (0)
 			p = seprint(p, e, "%d: %lud %lud\n", i, priv->in[i],
 				priv->out[i]);
- */
 	return p - buf;
 }
 
-
 /* import from icmp.c */
 extern int	icmpstate(Conv *c, char *state, int n);
 extern char*	icmpannounce(Conv *c, char **argv, int argc);

+ 308 - 0
sys/src/9/ip/igmp.c

@@ -0,0 +1,308 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * igmp - internet group management protocol
+ * unfinished.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+
+/* sizes, protocol number, message types and timing used by this file */
+enum
+{
+	IGMP_IPHDRSIZE	= 20,		/* size of ip header */
+	IGMP_HDRSIZE	= 8,		/* size of IGMP header */
+	IP_IGMPPROTO	= 2,		/* stored in the proto field and in igmp.ipproto */
+
+	IGMPquery	= 1,		/* message types: low nibble of vertype */
+	IGMPreport	= 2,
+
+	MSPTICK		= 100,		/* tsleep interval of igmpproc; presumably ms per tick */
+	MAXTIMEOUT	= 10000/MSPTICK,	/* at most 10 secs for a response */
+};
+
+/*
+ * Layout this file overlays on a packet buffer: an IP header followed
+ * by the IGMP header.  All members are byte arrays, so the compiler has
+ * no reason to insert padding.
+ *
+ * NOTE(review): src, dst and group are sized with IPaddrlen, while
+ * IGMP_IPHDRSIZE (20) and IGMP_HDRSIZE (8) only add up if those fields
+ * are 4 bytes each.  If IPaddrlen is the 16-byte address length, the
+ * member offsets do not match the sizes used for checksumming and
+ * length checks -- confirm before relying on this struct.
+ */
+typedef struct IGMPpkt IGMPpkt;
+struct IGMPpkt
+{
+	/* ip header */
+	uint8_t	vihl;		/* Version and header length */
+	uint8_t	tos;		/* Type of service */
+	uint8_t	len[2];		/* packet length (including headers) */
+	uint8_t	id[2];		/* Identification */
+	uint8_t	frag[2];	/* Fragment information */
+	uint8_t	Unused;		/* the byte between frag and proto; not set in this file */
+	uint8_t	proto;		/* Protocol */
+	uint8_t	cksum[2];	/* checksum of ip portion */
+	uint8_t	src[IPaddrlen];		/* Ip source */
+	uint8_t	dst[IPaddrlen];		/* Ip destination */
+
+	/* igmp header */
+	uint8_t	vertype;	/* version and type */
+	uint8_t	unused;
+	uint8_t	igmpcksum[2];		/* checksum of igmp portion */
+	uint8_t	group[IPaddrlen];	/* multicast group */
+
+	uint8_t	payload[];	/* marks the end of the headers; see IGMPPKTSZ */
+};
+
+/* size of the headers above, excluding any trailing struct padding */
+#define IGMPPKTSZ offsetof(IGMPpkt, payload[0])
+
+/*
+ *  lists for group reports
+ */
+/* one pending set of membership reports for a single medium */
+typedef struct IGMPrep IGMPrep;
+struct IGMPrep
+{
+	IGMPrep		*next;		/* next entry on igmpalloc.reports */
+	Medium		*m;		/* medium the reports are for */
+	int		ticks;		/* incremented by igmpproc on each scan */
+	Multicast	*multi;		/* groups still to be reported */
+};
+
+/* state shared between igmpiput and igmpproc */
+typedef struct IGMP IGMP;
+struct IGMP
+{
+	Lock;			/* taken around every access to reports */
+	Rendez	r;		/* igmpproc sleeps here until reports is non-empty */
+	IGMPrep	*reports;	/* list of pending report sets */
+};
+
+IGMP igmpalloc;
+
+	Proto	igmp;
+extern	Fs	fs;
+
+/* packet counters reported by igmpstats */
+static struct Stats
+{
+	uint32_t 	inqueries;	/* queries received (igmpiput) */
+	uint32_t	outqueries;	/* never incremented in this file */
+	uint32_t	inreports;	/* reports received (igmpiput) */
+	uint32_t	outreports;	/* reports sent (igmpsendreport) */
+} stats;
+
+/*
+ * Build and send an IGMP version 1 membership report for the group
+ * addr, with m's address as source and Ipallsys as destination, and a
+ * TTL of 1.  Returns without sending if no block can be allocated.
+ *
+ * NOTE(review): netlog and ipoput4 are called here without the leading
+ * Fs argument that other callers in this tree pass; this file is marked
+ * unfinished, so those calls are left as they were.
+ */
+void
+igmpsendreport(Medium *m, uint8_t *addr)
+{
+	IGMPpkt *p;
+	Block *bp;
+
+	bp = allocb(sizeof(IGMPpkt));
+	if(bp == nil)
+		return;
+	p = (IGMPpkt*)bp->wp;
+	bp->wp += IGMPPKTSZ;
+	/*
+	 * Clear the headers before filling them in; setting vihl ahead
+	 * of the memset would leave it zeroed.
+	 */
+	memset(bp->rp, 0, IGMPPKTSZ);
+	p->vihl = IP_VER4;
+	hnputl(p->src, Mediumgetaddr(m));
+	hnputl(p->dst, Ipallsys);
+	p->vertype = (1<<4) | IGMPreport;
+	p->proto = IP_IGMPPROTO;
+	memmove(p->group, addr, IPaddrlen);
+	/* the checksum field is still zero here, as the checksum requires */
+	hnputs(p->igmpcksum, ptclcsum(bp, IGMP_IPHDRSIZE, IGMP_HDRSIZE));
+	netlog(Logigmp, "igmpreport %I\n", p->group);
+	stats.outreports++;
+	ipoput4(bp, 0, 1, DFLTTOS, nil);	/* TTL of 1 */
+}
+
+/*
+ * sleep() condition for igmpproc: non-zero once igmpalloc.reports
+ * holds at least one entry.  The argument is ignored.
+ */
+static int
+isreport(void *a)
+{
+	USED(a);
+	if(igmpalloc.reports == nil)
+		return 0;
+	return 1;
+}
+
+
+/*
+ * Kernel process body that sends the delayed membership reports queued
+ * on igmpalloc.reports.  It sleeps until the list is non-empty, then
+ * repeatedly scans it: each scan sends at most one report whose timeout
+ * has been reached, and a scan that finds nothing due is followed by an
+ * MSPTICK tsleep.  When the list is empty it goes back to sleep.
+ * The argument is unused.
+ */
+void
+igmpproc(void *a)
+{
+	IGMPrep *rp, **lrp;
+	Multicast *mp, **lmp;
+	uint8_t ip[IPaddrlen];
+
+	USED(a);
+
+	for(;;){
+		sleep(&igmpalloc.r, isreport, 0);
+		for(;;){
+			lock(&igmpalloc);
+
+			/* leaves the loop with the lock held; released below */
+			if(igmpalloc.reports == nil)
+				break;
+
+			/* look for a single report */
+			lrp = &igmpalloc.reports;
+			mp = nil;
+			for(rp = *lrp; rp; rp = *lrp){
+				rp->ticks++;
+				lmp = &rp->multi;
+				/* unlink the first group whose timeout has been reached */
+				for(mp = *lmp; mp; mp = *lmp){
+					if(rp->ticks >= mp->timeout){
+						*lmp = mp->next;
+						break;
+					}
+					lmp = &mp->next;
+				}
+				if(mp != nil)
+					break;
+
+				/* nothing due here: keep the set, or drop it if it is empty */
+				if(rp->multi != nil){
+					lrp = &rp->next;
+					continue;
+				} else {
+					*lrp = rp->next;
+					free(rp);
+				}
+			}
+			unlock(&igmpalloc);
+
+			if(mp){
+				/* do a single report and try again */
+				hnputl(ip, mp->addr);
+				igmpsendreport(rp->m, ip);
+				free(mp);
+				continue;
+			}
+
+			tsleep(&up->sleep, return0, 0, MSPTICK);
+		}
+		/* pairs with the lock taken just before the break above */
+		unlock(&igmpalloc);
+	}
+
+}
+
+/*
+ * Receive an IGMP packet that arrived on medium m.
+ *
+ * The packet is ignored unless it is at least
+ * IGMP_IPHDRSIZE+IGMP_HDRSIZE bytes long, has version 1 in the high
+ * nibble of vertype, and its IGMP checksum verifies.
+ *
+ * IGMPquery: unless m already has a report set queued, copy m's
+ * multicast memberships, give each a random timeout below MAXTIMEOUT
+ * ticks, queue the set on igmpalloc.reports and wake igmpproc.
+ * IGMPreport: another host has reported the group, so drop the matching
+ * entry from m's pending set.
+ *
+ * bp is freed on every path.
+ */
+void
+igmpiput(Medium *m, Ipifc *, Block *bp)
+{
+	int n;
+	IGMPpkt *ghp;
+	Ipaddr group;
+	IGMPrep *rp, **lrp;
+	Multicast *mp, *next, **lmp;
+
+	ghp = (IGMPpkt*)(bp->rp);
+
+	/* validate the length before any header field is read */
+	n = blocklen(bp);
+	if(n < IGMP_IPHDRSIZE+IGMP_HDRSIZE){
+		netlog(Logigmp, "igmpiput: bad len\n");
+		goto error;
+	}
+	netlog(Logigmp, "igmpiput: %d %I\n", ghp->vertype, ghp->group);
+	if((ghp->vertype>>4) != 1){
+		netlog(Logigmp, "igmpiput: bad igmp type\n");
+		goto error;
+	}
+	if(ptclcsum(bp, IGMP_IPHDRSIZE, IGMP_HDRSIZE)){
+		netlog(Logigmp, "igmpiput: checksum error %I\n", ghp->src);
+		goto error;
+	}
+
+	group = nhgetl(ghp->group);
+
+	lock(&igmpalloc);
+	switch(ghp->vertype & 0xf){
+	case IGMPquery:
+		/*
+		 *  start reporting groups that we're a member of.
+		 */
+		stats.inqueries++;
+		for(rp = igmpalloc.reports; rp; rp = rp->next)
+			if(rp->m == m)
+				break;
+		if(rp != nil)
+			break;	/* already reporting */
+
+		mp = Mediumcopymulti(m);
+		if(mp == nil)
+			break;
+
+		rp = malloc(sizeof(*rp));
+		if(rp == nil){
+			/* no report set to own the copied list: free it */
+			for(; mp != nil; mp = next){
+				next = mp->next;
+				free(mp);
+			}
+			break;
+		}
+
+		rp->m = m;
+		rp->multi = mp;
+		rp->ticks = 0;
+		for(; mp; mp = mp->next)
+			mp->timeout = nrand(MAXTIMEOUT);
+		rp->next = igmpalloc.reports;
+		igmpalloc.reports = rp;
+
+		wakeup(&igmpalloc.r);
+
+		break;
+	case IGMPreport:
+		/*
+		 *  find report list for this medium
+		 */
+		stats.inreports++;
+		lrp = &igmpalloc.reports;
+		for(rp = *lrp; rp; rp = *lrp){
+			if(rp->m == m)
+				break;
+			lrp = &rp->next;
+		}
+		if(rp == nil)
+			break;
+
+		/*
+		 *  if someone else has reported a group,
+		 *  we don't have to.
+		 */
+		lmp = &rp->multi;
+		for(mp = *lmp; mp; mp = *lmp){
+			if(mp->addr == group){
+				*lmp = mp->next;
+				free(mp);
+				break;
+			}
+			lmp = &mp->next;
+		}
+
+		break;
+	}
+	unlock(&igmpalloc);
+
+error:
+	freeb(bp);
+}
+
+/*
+ * Format the IGMP counters into buf (at most len bytes) and return
+ * snprint's result.  The counters are uint32_t, so they are printed
+ * with the unsigned %ud verb; %d would show large counts as negative.
+ */
+int
+igmpstats(char *buf, int len)
+{
+	return snprint(buf, len, "\trcvd %ud %ud\n\tsent %ud %ud\n",
+		stats.inqueries, stats.inreports,
+		stats.outqueries, stats.outreports);
+}
+
+/*
+ * Fill in the igmp Proto (receive and stats hooks only, no
+ * conversations), install igmpsendreport as igmpreportfn, start the
+ * igmpproc kernel process and register the protocol with fs.
+ *
+ * NOTE(review): igmpiput takes a Medium* first and igmpstats takes
+ * (char*, int), whereas other rcv/stats hooks in this tree take a
+ * Proto* first (e.g. icmpiput, grestats) -- confirm these assignments
+ * once the file is finished.
+ */
+void
+igmpinit(Fs *fs)
+{
+	igmp.name = "igmp";
+	igmp.connect = nil;
+	igmp.announce = nil;
+	igmp.ctl = nil;
+	igmp.state = nil;
+	igmp.close = nil;
+	igmp.rcv = igmpiput;
+	igmp.stats = igmpstats;
+	igmp.ipproto = IP_IGMPPROTO;
+	igmp.nc = 0;		/* no conversations */
+	igmp.ptclsize = 0;	/* no per-conversation state */
+
+	igmpreportfn = igmpsendreport;
+	kproc("igmpproc", igmpproc, 0);
+
+	Fsproto(fs, &igmp);
+}

+ 1 - 14
sys/src/9/ip/inferno.c

@@ -13,7 +13,6 @@
 #include	"dat.h"
 #include	"fns.h"
 #include	"../port/error.h"
-#include	"ip.h"
 
 /*
  *  some hacks for commonality twixt inferno and plan9
@@ -26,12 +25,6 @@ commonuser(void)
 	return m->externup->user;
 }
 
-Chan*
-commonfdtochan(int fd, int mode, int a, int b)
-{
-	return fdtochan(fd, mode, a, b);
-}
-
 char*
 commonerror(void)
 {
@@ -39,14 +32,8 @@ commonerror(void)
 	return m->externup->errstr;
 }
 
-char*
-bootp(Ipifc* i)
-{
-	return "unimplmented";
-}
-
 int
-bootpread(char* c, uint32_t n, int i)
+bootpread(char *c, uint32_t u, int i)
 {
 	return	0;
 }

+ 17 - 115
sys/src/9/ip/ip.c

@@ -16,109 +16,8 @@
 
 #include	"ip.h"
 
-typedef struct Ip4hdr		Ip4hdr;
-typedef struct IP		IP;
-typedef struct Fragment4	Fragment4;
-typedef struct Fragment6	Fragment6;
-typedef struct Ipfrag		Ipfrag;
-
-enum
-{
-	IP4HDR		= 20,		/* sizeof(Ip4hdr) */
-	IP6HDR		= 40,		/* sizeof(Ip6hdr) */
-	IP_HLEN4	= 0x05,		/* Header length in words */
-	IP_DF		= 0x4000,	/* Don't fragment */
-	IP_MF		= 0x2000,	/* More fragments */
-	IP6FHDR		= 8, 		/* sizeof(Fraghdr6) */
-	IP_MAX		= 64*1024,	/* Maximum Internet packet size */
-};
-
 #define BLKIPVER(xp)	(((Ip4hdr*)((xp)->rp))->vihl&0xF0)
 
-struct Ip4hdr
-{
-	uint8_t	vihl;		/* Version and header length */
-	uint8_t	tos;		/* Type of service */
-	uint8_t	length[2];	/* packet length */
-	uint8_t	id[2];		/* ip->identification */
-	uint8_t	frag[2];	/* Fragment information */
-	uint8_t	ttl;		/* Time to live */
-	uint8_t	proto;		/* Protocol */
-	uint8_t	cksum[2];	/* Header checksum */
-	uint8_t	src[4];		/* IP source */
-	uint8_t	dst[4];		/* IP destination */
-};
-
-/* MIB II counters */
-enum
-{
-	Forwarding,
-	DefaultTTL,
-	InReceives,
-	InHdrErrors,
-	InAddrErrors,
-	ForwDatagrams,
-	InUnknownProtos,
-	InDiscards,
-	InDelivers,
-	OutRequests,
-	OutDiscards,
-	OutNoRoutes,
-	ReasmTimeout,
-	ReasmReqds,
-	ReasmOKs,
-	ReasmFails,
-	FragOKs,
-	FragFails,
-	FragCreates,
-
-	Nstats,
-};
-
-struct Fragment4
-{
-	Block*	blist;
-	Fragment4*	next;
-	uint32_t 	src;
-	uint32_t 	dst;
-	uint16_t	id;
-	uint32_t 	age;
-};
-
-struct Fragment6
-{
-	Block*	blist;
-	Fragment6*	next;
-	uint8_t 	src[IPaddrlen];
-	uint8_t 	dst[IPaddrlen];
-	uint	id;
-	uint32_t 	age;
-};
-
-struct Ipfrag
-{
-	uint16_t	foff;
-	uint16_t	flen;
-};
-
-/* an instance of IP */
-struct IP
-{
-	uint32_t		stats[Nstats];
-
-	QLock		fraglock4;
-	Fragment4*	flisthead4;
-	Fragment4*	fragfree4;
-	Ref		id4;
-
-	QLock		fraglock6;
-	Fragment6*	flisthead6;
-	Fragment6*	fragfree6;
-	Ref		id6;
-
-	int		iprouting;	/* true if we route like a gateway */
-};
-
 static char *statnames[] =
 {
 [Forwarding]	"Forwarding",
@@ -154,7 +53,6 @@ Block*		ip4reassemble(IP*, int, Block*, Ip4hdr*);
 void		ipfragfree4(IP*, Fragment4*);
 Fragment4*	ipfragallo4(IP*);
 
-
 void
 ip_init_6(Fs *f)
 {
@@ -170,14 +68,13 @@ ip_init_6(Fs *f)
 	v6p->rp.reachtime	= 0;
 	v6p->rp.rxmitra		= 0;
 	v6p->rp.ttl		= MAXTTL;
-	v6p->rp.routerlt	= 3*(v6p->rp.maxraint);
+	v6p->rp.routerlt	= 3 * v6p->rp.maxraint;
 
 	v6p->hp.rxmithost	= 1000;		/* v6 RETRANS_TIMER */
 
 	v6p->cdrouter 		= -1;
 
 	f->v6p			= v6p;
-
 }
 
 void
@@ -250,7 +147,7 @@ ipoput4(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 
 	ip->stats[OutRequests]++;
 
-	/* Number of uchars in data and ip header to write */
+	/* Number of uint8_ts in data and ip header to write */
 	len = blocklen(bp);
 
 	if(gating){
@@ -306,7 +203,10 @@ ipoput4(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 		goto raise;
 
 	/* If we dont need to fragment just send it */
-	medialen = ifc->maxtu - ifc->medium->hsize;
+	if(c && c->maxfragsize && c->maxfragsize < ifc->maxtu)
+		medialen = c->maxfragsize - ifc->medium->hsize;
+	else
+		medialen = ifc->maxtu - ifc->medium->hsize;
 	if(len <= medialen) {
 		if(!gating)
 			hnputs(eh->id, incref(&ip->id4));
@@ -318,13 +218,15 @@ ipoput4(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 		eh->cksum[0] = 0;
 		eh->cksum[1] = 0;
 		hnputs(eh->cksum, ipcsum(&eh->vihl));
+		assert(bp->next == nil);
 		ifc->medium->bwrite(ifc, bp, V4, gate);
 		runlock(ifc);
 		poperror();
 		return 0;
 	}
 
-if((eh->frag[0] & (IP_DF>>8)) && !gating) print("%V: DF set\n", eh->dst);
+	if((eh->frag[0] & (IP_DF>>8)) && !gating)
+		print("%V: DF set\n", eh->dst);
 
 	if(eh->frag[0] & (IP_DF>>8)){
 		ip->stats[FragFails]++;
@@ -426,6 +328,7 @@ ipiput4(Fs *f, Ipifc *ifc, Block *bp)
 	uint8_t *dp, v6dst[IPaddrlen];
 	IP *ip;
 	Route *r;
+	Conv conv;
 
 	if(BLKIPVER(bp) != IP_VER4) {
 		ipiput6(f, ifc, bp);
@@ -486,14 +389,13 @@ ipiput4(Fs *f, Ipifc *ifc, Block *bp)
 
 	/* route */
 	if(notforme) {
-		Conv conv;
-
 		if(!ip->iprouting){
-			freeb(bp);
+			freeblist(bp);
 			return;
 		}
 
 		/* don't forward to source's network */
+		memset(&conv, 0, sizeof conv);
 		conv.r = nil;
 		r = v4lookup(f, h->dst, &conv);
 		if(r == nil || r->ifc == ifc){
@@ -572,8 +474,8 @@ ipstats(Fs *f, char *buf, int len)
 
 	p = buf;
 	e = p+len;
-	for(i = 0; i < Nstats; i++)
-		p = seprint(p, e, "%s: %lud\n", statnames[i], ip->stats[i]);
+	for(i = 0; i < Nipstats; i++)
+		p = seprint(p, e, "%s: %llud\n", statnames[i], ip->stats[i]);
 	return p - buf;
 }
 
@@ -628,9 +530,9 @@ ip4reassemble(IP *ip, int offset, Block *bp, Ip4hdr *ih)
 		return bp;
 	}
 
-	if(bp->base+sizeof(Ipfrag) >= bp->rp){
-		bp = padblock(bp, sizeof(Ipfrag));
-		bp->rp += sizeof(Ipfrag);
+	if(bp->base+IPFRAGSZ >= bp->rp){
+		bp = padblock(bp, IPFRAGSZ);
+		bp->rp += IPFRAGSZ;
 	}
 
 	BKFG(bp)->foff = offset<<3;

+ 118 - 28
sys/src/9/ip/ip.h

@@ -1,4 +1,4 @@
-/* 
+/*
  * This file is part of the UCB release of Plan 9. It is subject to the license
  * terms in the LICENSE file found in the top-level directory of this
  * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
@@ -8,10 +8,14 @@
  */
 
 typedef struct	Conv	Conv;
+typedef struct	Fragment4 Fragment4;
+typedef struct	Fragment6 Fragment6;
 typedef struct	Fs	Fs;
 typedef union	Hwaddr	Hwaddr;
 typedef struct	IP	IP;
 typedef struct	IPaux	IPaux;
+typedef struct	Ip4hdr	Ip4hdr;
+typedef struct	Ipfrag	Ipfrag;
 typedef struct	Ipself	Ipself;
 typedef struct	Ipselftab	Ipselftab;
 typedef struct	Iplink	Iplink;
@@ -43,9 +47,9 @@ enum
 	Addrlen=	64,
 	Maxproto=	20,
 	Nhash=		64,
-	Maxincall=	128,
+	Maxincall=	64,	/* max. conn.s in listen q not accepted yet */
 	Nchans=		1024,
-	MAClen=		16,		/* longest mac address */
+	MAClen=		16,		/* longest mac address */
 
 	MAXTTL=		255,
 	DFLTTOS=	0,
@@ -60,6 +64,11 @@ enum
 	V6=		6,
 	IP_VER4= 	0x40,
 	IP_VER6=	0x60,
+	IP_HLEN4=	5,		/* v4: Header length in words */
+	IP_DF=		0x4000,		/* v4: Don't fragment */
+	IP_MF=		0x2000,		/* v4: More fragments */
+	IP4HDR=		20,		/* sizeof(Ip4hdr) */
+	IP_MAX=		64*1024,	/* Max. Internet packet size, v4 & v6 */
 
 	/* 2^Lroot trees in the root table */
 	Lroot=		10,
@@ -76,6 +85,95 @@ enum
 	Connected=	4,
 };
 
+/* MIB II counters */
+enum
+{
+	Forwarding,
+	DefaultTTL,
+	InReceives,
+	InHdrErrors,
+	InAddrErrors,
+	ForwDatagrams,
+	InUnknownProtos,
+	InDiscards,
+	InDelivers,
+	OutRequests,
+	OutDiscards,
+	OutNoRoutes,
+	ReasmTimeout,
+	ReasmReqds,
+	ReasmOKs,
+	ReasmFails,
+	FragOKs,
+	FragFails,
+	FragCreates,
+
+	Nipstats,
+};
+
+struct Fragment4
+{
+	Block*	blist;
+	Fragment4*	next;
+	uint32_t 	src;
+	uint32_t 	dst;
+	uint16_t	id;
+	uint32_t 	age;
+};
+
+struct Fragment6
+{
+	Block*	blist;
+	Fragment6*	next;
+	uint8_t 	src[IPaddrlen];
+	uint8_t 	dst[IPaddrlen];
+	uint	id;
+	uint32_t 	age;
+};
+
+struct Ipfrag
+{
+	uint16_t	foff;
+	uint16_t	flen;
+
+	uint8_t	payload[];
+};
+
+#define IPFRAGSZ offsetof(Ipfrag, payload[0])
+
+/* an instance of IP */
+struct IP
+{
+	uint64_t		stats[Nipstats];
+
+	QLock		fraglock4;
+	Fragment4*	flisthead4;
+	Fragment4*	fragfree4;
+	Ref		id4;
+
+	QLock		fraglock6;
+	Fragment6*	flisthead6;
+	Fragment6*	fragfree6;
+	Ref		id6;
+
+	int		iprouting;	/* true if we route like a gateway */
+};
+
+/* on the wire packet header */
+struct Ip4hdr
+{
+	uint8_t	vihl;		/* Version and header length */
+	uint8_t	tos;		/* Type of service */
+	uint8_t	length[2];	/* packet length */
+	uint8_t	id[2];		/* ip->identification */
+	uint8_t	frag[2];	/* Fragment information */
+	uint8_t	ttl;      	/* Time to live */
+	uint8_t	proto;		/* Protocol */
+	uint8_t	cksum[2];	/* Header checksum */
+	uint8_t	src[4];		/* IP source */
+	uint8_t	dst[4];		/* IP destination */
+};
+
 /*
  *  one per conversation directory
  */
@@ -103,6 +201,8 @@ struct Conv
 	int	length;
 	int	state;
 
+	int	maxfragsize;		/* If set, used for fragmentation */
+
 	/* udp specific */
 	int	headers;		/* data src/dst headers in udp */
 	int	reliable;		/* true if reliable udp */
@@ -137,7 +237,7 @@ struct Medium
 	int	hsize;		/* medium header size */
 	int	mintu;		/* default min mtu */
 	int	maxtu;		/* default max mtu */
-	int	maclen;		/* mac address length */
+	int	maclen;		/* mac address length  */
 	void	(*bind)(Ipifc*, int, char**);
 	void	(*unbind)(Ipifc*);
 	void	(*bwrite)(Ipifc *ifc, Block *b, int version, uint8_t *ip);
@@ -150,8 +250,7 @@ struct Medium
 	void	(*pktin)(Fs *f, Ipifc *ifc, Block *bp);
 
 	/* routes for router boards */
-	void	(*addroute)(Ipifc *ifc, int, uint8_t*, uint8_t*,
-				uint8_t*, int);
+	void	(*addroute)(Ipifc *ifc, int, uint8_t*, uint8_t*, uint8_t*, int);
 	void	(*remroute)(Ipifc *ifc, int, uint8_t*, uint8_t*);
 	void	(*flushroutes)(Ipifc *ifc);
 
@@ -286,8 +385,7 @@ struct Ipht
 };
 void iphtadd(Ipht*, Conv*);
 void iphtrem(Ipht*, Conv*);
-Conv* iphtlook(Ipht *ht, uint8_t *sa, uint16_t sp, uint8_t *da,
-	       uint16_t dp);
+Conv* iphtlook(Ipht *ht, uint8_t *sa, uint16_t sp, uint8_t *da, uint16_t dp);
 
 /*
  *  one per multiplexed protocol
@@ -320,7 +418,6 @@ struct Proto
 	int		nc;		/* number of conversations */
 	int		ac;
 	Qid		qid;		/* qid for protocol directory */
-	uint16_t		nextport;
 	uint16_t		nextrport;
 
 	void		*priv;
@@ -354,7 +451,7 @@ struct Fs
 
 	char	ndb[1024];		/* an ndb entry for this interface */
 	int	ndbvers;
-	long	ndbmtime;
+	int32_t	ndbmtime;
 };
 
 /* one per default router known to host */
@@ -378,8 +475,7 @@ struct v6params
 
 
 int	Fsconnected(Conv*, char*);
-Conv*	Fsnewcall(Conv*, uint8_t*, uint16_t, uint8_t*, uint16_t,
-		       uint8_t);
+Conv*	Fsnewcall(Conv*, uint8_t*, uint16_t, uint8_t*, uint16_t, uint8_t);
 int	Fspcolstats(char*, int);
 int	Fsproto(Fs*, Proto*);
 int	Fsbuiltinproto(Fs*, uint8_t);
@@ -399,11 +495,9 @@ enum
 	Logip=		1<<1,
 	Logtcp=		1<<2,
 	Logfs=		1<<3,
-	Logil=		1<<4,
 	Logicmp=	1<<5,
 	Logudp=		1<<6,
 	Logcompress=	1<<7,
-	Logilmsg=	1<<8,
 	Loggre=		1<<9,
 	Logppp=		1<<10,
 	Logtcprxmt=	1<<11,
@@ -428,6 +522,8 @@ void	ifclog(Fs*, uint8_t *, int);
 void	ifclogopen(Fs*, Chan*);
 void	ifclogclose(Fs*, Chan*);
 
+#pragma varargck argpos netlog	3
+
 /*
  *  iproute.c
  */
@@ -495,10 +591,8 @@ struct Route
 		V4route v4;
 	};
 };
-extern void	v4addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask,
-			      uint8_t *gate, int type);
-extern void	v6addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask,
-			      uint8_t *gate, int type);
+extern void	v4addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate, int type);
+extern void	v6addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate, int type);
 extern void	v4delroute(Fs *f, uint8_t *a, uint8_t *mask, int dolock);
 extern void	v6delroute(Fs *f, uint8_t *a, uint8_t *mask, int dolock);
 extern Route*	v4lookup(Fs *f, uint8_t *a, Conv *c);
@@ -507,8 +601,7 @@ extern int32_t	routeread(Fs *f, char*, uint32_t, int);
 extern int32_t	routewrite(Fs *f, Chan*, char*, int);
 extern void	routetype(int, char*);
 extern void	ipwalkroutes(Fs*, Routewalk*);
-extern void	convroute(Route*, uint8_t*, uint8_t*, uint8_t*, char*,
-			     int*);
+extern void	convroute(Route*, uint8_t*, uint8_t*, uint8_t*, char*, int*);
 
 /*
  *  devip.c
@@ -550,12 +643,10 @@ struct Arpent
 extern void	arpinit(Fs*);
 extern int	arpread(Arp*, char*, uint32_t, int);
 extern int	arpwrite(Fs*, char*, int);
-extern Arpent*	arpget(Arp*, Block *bp, int version, Ipifc *ifc,
-			     uint8_t *ip, uint8_t *h);
+extern Arpent*	arpget(Arp*, Block *bp, int version, Ipifc *ifc, uint8_t *ip, uint8_t *h);
 extern void	arprelease(Arp*, Arpent *a);
 extern Block*	arpresolve(Arp*, Arpent *a, Medium *type, uint8_t *mac);
-extern void	arpenter(Fs*, int version, uint8_t *ip, uint8_t *mac,
-			    int len, int norefresh);
+extern void	arpenter(Fs*, int version, uint8_t *ip, uint8_t *mac, int len, int norefresh);
 
 /*
  * ipaux.c
@@ -614,8 +705,7 @@ extern int	ipisbooting(void);
 extern int	ipifccheckin(Ipifc *ifc, Medium *med);
 extern void	ipifccheckout(Ipifc *ifc);
 extern int	ipifcgrab(Ipifc *ifc);
-extern void	ipifcaddroute(Fs*, int, uint8_t*, uint8_t*, uint8_t*,
-				 int);
+extern void	ipifcaddroute(Fs*, int, uint8_t*, uint8_t*, uint8_t*, int);
 extern void	ipifcremroute(Fs*, int, uint8_t*, uint8_t*);
 extern void	ipifcremmulti(Conv *c, uint8_t *ma, uint8_t *ia);
 extern void	ipifcaddmulti(Conv *c, uint8_t *ma, uint8_t *ia);
@@ -639,16 +729,16 @@ extern int	ipstats(Fs*, char*, int);
 extern uint16_t	ptclbsum(uint8_t*, int);
 extern uint16_t	ptclcsum(Block*, int, int);
 extern void	ip_init(Fs*);
+extern void	update_mtucache(uint8_t*, uint32_t);
+extern uint32_t	restrict_mtu(uint8_t*, uint32_t);
 /*
  * bootp.c
  */
-extern char*	bootp(Ipifc*);
 extern int	bootpread(char*, uint32_t, int);
 
 /*
  *  resolving inferno/plan9 differences
  */
-Chan*		commonfdtochan(int, int, int, int);
 char*		commonuser(void);
 char*		commonerror(void);
 

+ 2 - 0
sys/src/9/ip/ip.json

@@ -12,6 +12,7 @@
 	"../ip/ipaux.c",
 	"../ip/ip.c",
 	"../ip/ipifc.c",
+	"../ip/ipmux.c",
 	"../ip/iproute.c",
 	"../ip/ipv6.c",
 	"../ip/loopbackmedium.c",
@@ -20,6 +21,7 @@
 	"../ip/nullmedium.c",
 	"../ip/pktmedium.c",
 	"../ip/ptclbsum.c",
+	"../ip/rudp.c",
 	"../ip/tcp.c",
 	"../ip/udp.c"
     ]

+ 53 - 41
sys/src/9/ip/ipifc.c

@@ -24,7 +24,8 @@ enum {
 	Nself		= Maxmedia*5,
 	NHASH		= 1<<6,
 	NCACHE		= 256,
-	QMAX		= 64*1024-1,
+	QMAX		= 192*1024-1,
+	Maxv6repr	= (128/(4*4))*(4+1), /* limit of xxxx:xxxx:⋯ notation */
 };
 
 Medium *media[Maxmedia] = { 0 };
@@ -68,8 +69,7 @@ struct Ipmcast
 
 static char tifc[] = "ifc ";
 
-static void	addselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uint8_t *a,
-				int type);
+static void	addselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uint8_t *a, int type);
 static void	remselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uint8_t *a);
 static char*	ipifcjoinmulti(Ipifc *ifc, char **argv, int argc);
 static char*	ipifcleavemulti(Ipifc *ifc, char **argv, int argc);
@@ -162,7 +162,7 @@ ipifcbind(Conv *c, char **argv, int argc)
 	ifc->rp.ttl = MAXTTL;
 	ifc->rp.routerlt = 3 * ifc->rp.maxraint;
 
-	/* any ancillary structures (like routes) no longer pertain */
+	/* any ancillary structures (like routes) no longer pertain */
 	ifc->ifcid++;
 
 	/* reopen all the queues closed by a previous unbind */
@@ -329,7 +329,7 @@ ipifccreate(Conv *c)
 	Ipifc *ifc;
 
 	c->rq = qopen(QMAX, 0, 0, 0);
-	c->sq = qopen(2*QMAX, 0, 0, 0);
+	c->sq = qopen(QMAX, 0, 0, 0);
 	c->wq = qopen(QMAX, Qkick, ipifckick, c);
 	ifc = (Ipifc*)c->ptcl;
 	ifc->conv = c;
@@ -346,11 +346,11 @@ static void
 ipifcclose(Conv *c)
 {
 	Ipifc *ifc;
-	Medium *medium;
+	Medium *m;
 
 	ifc = (Ipifc*)c->ptcl;
-	medium = ifc->medium;
-	if(medium != nil && medium->unbindonclose)
+	m = ifc->medium;
+	if(m && m->unbindonclose)
 		ipifcunbind(ifc);
 }
 
@@ -403,19 +403,21 @@ ipifcadd(Ipifc *ifc, char **argv, int argc, int tentative, Iplifc *lifcp)
 			ifc->maxtu = mtu;
 		/* fall through */
 	case 4:
-		parseip(ip, argv[1]);
+		if (parseip(ip, argv[1]) == -1 || parseip(rem, argv[3]) == -1)
+			return Ebadip;
 		parseipmask(mask, argv[2]);
-		parseip(rem, argv[3]);
 		maskip(rem, mask, net);
 		break;
 	case 3:
-		parseip(ip, argv[1]);
+		if (parseip(ip, argv[1]) == -1)
+			return Ebadip;
 		parseipmask(mask, argv[2]);
 		maskip(ip, mask, rem);
 		maskip(rem, mask, net);
 		break;
 	case 2:
-		parseip(ip, argv[1]);
+		if (parseip(ip, argv[1]) == -1)
+			return Ebadip;
 		memmove(mask, defmask(ip), IPaddrlen);
 		maskip(ip, mask, rem);
 		maskip(rem, mask, net);
@@ -603,12 +605,14 @@ ipifcrem(Ipifc *ifc, char **argv, int argc)
 	if(argc < 3)
 		return Ebadarg;
 
-	parseip(ip, argv[1]);
+	if (parseip(ip, argv[1]) == -1)
+		return Ebadip;
 	parseipmask(mask, argv[2]);
 	if(argc < 4)
 		maskip(ip, mask, rem);
 	else
-		parseip(rem, argv[3]);
+		if (parseip(rem, argv[3]) == -1)
+			return Ebadip;
 
 	wlock(ifc);
 
@@ -634,10 +638,9 @@ ipifcrem(Ipifc *ifc, char **argv, int argc)
  * TRIP linecards
  */
 void
-ipifcaddroute(Fs *f, int vers, uint8_t *addr, uint8_t *mask, uint8_t *gate,
-	      int type)
+ipifcaddroute(Fs *f, int vers, uint8_t *addr, uint8_t *mask, uint8_t *gate, int type)
 {
-	Medium *medium;
+	Medium *m;
 	Conv **cp, **e;
 	Ipifc *ifc;
 
@@ -645,9 +648,9 @@ ipifcaddroute(Fs *f, int vers, uint8_t *addr, uint8_t *mask, uint8_t *gate,
 	for(cp = f->ipifc->conv; cp < e; cp++){
 		if(*cp != nil) {
 			ifc = (Ipifc*)(*cp)->ptcl;
-			medium = ifc->medium;
-			if(medium != nil && medium->addroute != nil)
-				medium->addroute(ifc, vers, addr, mask, gate, type);
+			m = ifc->medium;
+			if(m && m->addroute)
+				m->addroute(ifc, vers, addr, mask, gate, type);
 		}
 	}
 }
@@ -655,7 +658,7 @@ ipifcaddroute(Fs *f, int vers, uint8_t *addr, uint8_t *mask, uint8_t *gate,
 void
 ipifcremroute(Fs *f, int vers, uint8_t *addr, uint8_t *mask)
 {
-	Medium *medium;
+	Medium *m;
 	Conv **cp, **e;
 	Ipifc *ifc;
 
@@ -663,9 +666,9 @@ ipifcremroute(Fs *f, int vers, uint8_t *addr, uint8_t *mask)
 	for(cp = f->ipifc->conv; cp < e; cp++){
 		if(*cp != nil) {
 			ifc = (Ipifc*)(*cp)->ptcl;
-			medium = ifc->medium;
-			if(medium != nil && medium->remroute != nil)
-				medium->remroute(ifc, vers, addr, mask);
+			m = ifc->medium;
+			if(m && m->remroute)
+				m->remroute(ifc, vers, addr, mask);
 		}
 	}
 }
@@ -1161,7 +1164,9 @@ enum {
 int
 v6addrtype(uint8_t *addr)
 {
-	if(islinklocal(addr) ||
+	if(isv4(addr) || ipcmp(addr, IPnoaddr) == 0)
+		return unknownv6;
+	else if(islinklocal(addr) ||
 	    isv6mcast(addr) && (addr[1] & 0xF) <= Link_local_scop)
 		return linklocalv6;
 	else
@@ -1538,7 +1543,7 @@ ipifcregisterproxy(Fs *f, Ipifc *ifc, uint8_t *ip)
 	Conv **cp, **e;
 	Ipifc *nifc;
 	Iplifc *lifc;
-	Medium *medium;
+	Medium *m;
 	uint8_t net[IPaddrlen];
 
 	/* register the address on any network that will proxy for us */
@@ -1549,8 +1554,8 @@ ipifcregisterproxy(Fs *f, Ipifc *ifc, uint8_t *ip)
 			if(*cp == nil || (nifc = (Ipifc*)(*cp)->ptcl) == ifc)
 				continue;
 			rlock(nifc);
-			medium = nifc->medium;
-			if(medium == nil || medium->addmulti == nil) {
+			m = nifc->medium;
+			if(m == nil || m->addmulti == nil) {
 				runlock(nifc);
 				continue;
 			}
@@ -1561,7 +1566,7 @@ ipifcregisterproxy(Fs *f, Ipifc *ifc, uint8_t *ip)
 					ipv62smcast(net, ip);
 					addselfcache(f, nifc, lifc, net, Rmulti);
 					arpenter(f, V6, ip, nifc->mac, 6, 0);
-					// (*medium->addmulti)(nifc, net, ip);
+					// (*m->addmulti)(nifc, net, ip);
 					break;
 				}
 			}
@@ -1573,15 +1578,15 @@ ipifcregisterproxy(Fs *f, Ipifc *ifc, uint8_t *ip)
 			if(*cp == nil || (nifc = (Ipifc*)(*cp)->ptcl) == ifc)
 				continue;
 			rlock(nifc);
-			medium = nifc->medium;
-			if(medium == nil || medium->areg == nil){
+			m = nifc->medium;
+			if(m == nil || m->areg == nil){
 				runlock(nifc);
 				continue;
 			}
 			for(lifc = nifc->lifc; lifc; lifc = lifc->next){
 				maskip(ip, lifc->mask, net);
 				if(ipcmp(net, lifc->remote) == 0){
-					(*medium->areg)(nifc, ip);
+					(*m->areg)(nifc, ip);
 					break;
 				}
 			}
@@ -1591,7 +1596,6 @@ ipifcregisterproxy(Fs *f, Ipifc *ifc, uint8_t *ip)
 }
 
 
-#if 0
 /* added for new v6 mesg types */
 static void
 adddefroute6(Fs *f, uint8_t *gate, int force)
@@ -1609,7 +1613,7 @@ adddefroute6(Fs *f, uint8_t *gate, int force)
 	v6delroute(f, v6Unspecified, v6Unspecified, 1);
 	v6addroute(f, "ra", v6Unspecified, v6Unspecified, gate, 0);
 }
-#endif
+
 enum {
 	Ngates = 3,
 };
@@ -1619,7 +1623,7 @@ ipifcadd6(Ipifc *ifc, char**argv, int argc)
 {
 	int plen = 64;
 	int32_t origint = NOW / 1000, preflt = ~0L, validlt = ~0L;
-	char addr[40], preflen[6];
+	char addr[Maxv6repr], preflen[6];
 	char *params[3];
 	uint8_t autoflag = 1, onlink = 1;
 	uint8_t prefix[IPaddrlen];
@@ -1647,9 +1651,17 @@ ipifcadd6(Ipifc *ifc, char**argv, int argc)
 		return Ebadarg;
 	}
 
-	if (parseip(prefix, argv[1]) != 6 || validlt < preflt || plen < 0 ||
-	    plen > 64 || islinklocal(prefix))
-		return Ebadarg;
+	if (parseip(prefix, argv[1]) != 6)
+		return "bad ipv6 address";
+	if (validlt < preflt)
+		return "valid ipv6 lifetime less than preferred lifetime";
+	if (plen < 0)
+		return "negative ipv6 prefix length";
+	/* i think that this length limit is bogus - geoff */
+//	if (plen > 64)
+//		return "ipv6 prefix length greater than 64;
+	if (islinklocal(prefix))
+		return "ipv6 prefix is link-local";
 
 	lifc = smalloc(sizeof(Iplifc));
 	lifc->onlink = (onlink != 0);
@@ -1660,10 +1672,10 @@ ipifcadd6(Ipifc *ifc, char**argv, int argc)
 
 	/* issue "add" ctl msg for v6 link-local addr and prefix len */
 	if(!ifc->medium->pref2addr)
-		return Ebadarg;
+		return "no pref2addr on interface";
 	ifc->medium->pref2addr(prefix, ifc->mac);	/* mac → v6 link-local addr */
-	sprint(addr, "%I", prefix);
-	sprint(preflen, "/%d", plen);
+	snprint(addr, sizeof addr, "%I", prefix);
+	snprint(preflen, sizeof preflen, "/%d", plen);
 	params[0] = "add";
 	params[1] = addr;
 	params[2] = preflen;

+ 849 - 0
sys/src/9/ip/ipmux.c

@@ -0,0 +1,849 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ * IP packet filter
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include "ip.h"
+#include "ipv6.h"
+
+typedef struct Ipmuxrock  Ipmuxrock;
+typedef struct Ipmux      Ipmux;
+
+/*
+ *  IPv4 header layout followed by the first byte of the data, so
+ *  that field offsets (including the start of data) can be taken
+ *  from one structure.
+ */
+typedef struct Myip4hdr Myip4hdr;
+struct Myip4hdr
+{
+	uint8_t	vihl;		/* Version and header length */
+	uint8_t	tos;		/* Type of service */
+	uint8_t	length[2];	/* packet length */
+	uint8_t	id[2];		/* ip->identification */
+	uint8_t	frag[2];	/* Fragment information */
+	uint8_t	ttl;		/* Time to live */
+	uint8_t	proto;		/* Protocol */
+	uint8_t	cksum[2];	/* Header checksum */
+	uint8_t	src[4];		/* IP source */
+	uint8_t	dst[4];		/* IP destination */
+
+	uint8_t	data[1];	/* start of data */
+};
+/*
+ *  nil header pointer; never dereferenced, only used to turn a field
+ *  into its byte offset, e.g. (int64_t)ipoff->dst
+ */
+Myip4hdr *ipoff = 0;
+
+/*
+ *  Txxxx: which field a node looks at (Ipmux.type).
+ *  Cxxxx: how the comparison is carried out (Ipmux.ctype).
+ *  the two groups share one enum; numbering restarts at Cother = 0,
+ *  so T and C values overlap and must not be mixed.
+ */
+enum
+{
+	Tproto,
+	Tdata,
+	Tiph,
+	Tdst,
+	Tsrc,
+	Tifc,
+
+	Cother = 0,
+	Cbyte,		/* single byte */
+	Cmbyte,		/* single byte with mask */
+	Cshort,		/* single short */
+	Cmshort,	/* single short with mask */
+	Cint32_t,		/* single int32_t */
+	Cmint32_t,		/* single int32_t with mask */
+	Cifc,		/* single 4-byte value of type Tifc */
+	Cmifc,		/* single 4-byte value of type Tifc with mask */
+};
+
+/* printable name of each field type, indexed by Txxxx */
+char *ftname[] =
+{
+[Tproto]	"proto",
+[Tdata]		"data",
+[Tiph]	 	"iph",
+[Tdst]		"dst",
+[Tsrc]		"src",
+[Tifc]		"ifc",
+};
+
+/*
+ *  a node in the decision tree
+ */
+struct Ipmux
+{
+	Ipmux	*yes;		/* next node when this comparison matches */
+	Ipmux	*no;		/* next node when it does not */
+	uint8_t	type;		/* type of field(Txxxx) */
+	uint8_t	ctype;		/* type of comparison(Cxxxx) */
+	uint8_t	len;		/* length in bytes of item to compare */
+	uint8_t	n;		/* number of items val points to */
+	short	off;		/* offset of comparison */
+	short	eoff;		/* end offset of comparison */
+	uint8_t	skiphdr;	/* should offset start after ipheader */
+	uint8_t	*val;		/* n items of len bytes each */
+	uint8_t	*mask;		/* len bytes applied to the compared field */
+	uint8_t	*e;		/* val+n*len*/
+
+	int	ref;		/* so we can garbage collect */
+	Conv	*conv;		/* conversation owning the filter, set by ipmuxconnect */
+};
+
+/*
+ *  someplace to hold per conversation data
+ */
+struct Ipmuxrock
+{
+	Ipmux	*chain;		/* filter chain saved by ipmuxconnect, freed by ipmuxclose */
+};
+
+static int	ipmuxsprint(Ipmux*, int, char*, int);
+static void	ipmuxkick(void *x);
+
+/*
+ *  return a pointer to the first character of p that is neither a
+ *  blank nor a tab (possibly the terminating NUL).
+ */
+static char*
+skipwhite(char *p)
+{
+	for(; *p == ' ' || *p == '\t'; p++)
+		;
+	return p;
+}
+
+/*
+ *  split p at the first occurrence of c: the separator is overwritten
+ *  with a NUL and a pointer to the text after it (leading blanks and
+ *  tabs skipped) is returned.  returns nil if c does not occur or if
+ *  nothing but white space follows it.
+ */
+static char*
+follows(char *p, char c)
+{
+	char *rest;
+
+	rest = strchr(p, c);
+	if(rest == nil)
+		return nil;
+	*rest = 0;
+	rest = skipwhite(rest+1);
+	return *rest != 0 ? rest : nil;
+}
+
+/*
+ *  parse the operand (left-hand side) of one filter relation starting
+ *  at *pp and return a newly allocated node recording the field type,
+ *  its offset and its length.  val and mask are left nil for the
+ *  caller to fill in.  returns nil on a syntax error.
+ *
+ *  accepted operands: dst, src, ifc, proto, data[off] data[off:end],
+ *  iph[off] iph[off:end].
+ *
+ *  *pp itself is never advanced.
+ */
+static Ipmux*
+parseop(char **pp)
+{
+	char *p = *pp;
+	int type, off, end, len;
+	Ipmux *f;
+
+	p = skipwhite(p);
+	if(strncmp(p, "dst", 3) == 0){
+		type = Tdst;
+		/* ipoff is nil, so the field address is its offset in the header */
+		off = (int64_t)(ipoff->dst);
+		len = IPv4addrlen;
+		p += 3;
+	}
+	else if(strncmp(p, "src", 3) == 0){
+		type = Tsrc;
+		off = (int64_t)(ipoff->src);
+		len = IPv4addrlen;
+		p += 3;
+	}
+	else if(strncmp(p, "ifc", 3) == 0){
+		type = Tifc;
+		/* NOTE(review): negative offset; presumably the matcher special-cases Tifc - confirm */
+		off = -IPv4addrlen;
+		len = IPv4addrlen;
+		p += 3;
+	}
+	else if(strncmp(p, "proto", 5) == 0){
+		type = Tproto;
+		off = (int64_t)&(ipoff->proto);
+		len = 1;
+		p += 5;
+	}
+	else if(strncmp(p, "data", 4) == 0 || strncmp(p, "iph", 3) == 0){
+		if(strncmp(p, "data", 4) == 0) {
+			type = Tdata;
+			p += 4;
+		}
+		else {
+			type = Tiph;
+			p += 3;
+		}
+		/* expect [off] or [off:end], both inclusive byte indices */
+		p = skipwhite(p);
+		if(*p != '[')
+			return nil;
+		p++;
+		off = strtoul(p, &p, 0);
+		if(off < 0 || off > (64-IP4HDR))
+			return nil;
+		p = skipwhite(p);
+		if(*p != ':')
+			end = off;
+		else {
+			p++;
+			p = skipwhite(p);
+			end = strtoul(p, &p, 0);
+			/*
+			 * NOTE(review): end has no upper bound here, while the
+			 * length is later stored in the uint8_t f->len.
+			 */
+			if(end < off)
+				return nil;
+			p = skipwhite(p);
+		}
+		if(*p != ']')
+			return nil;
+		p++;
+		len = end - off + 1;
+	}
+	else
+		return nil;
+
+	f = smalloc(sizeof(*f));
+	f->type = type;
+	f->len = len;
+	f->off = off;
+	f->val = nil;
+	f->mask = nil;
+	f->n = 1;
+	f->ref = 1;
+	/* only data[] offsets are counted from the end of the ip header */
+	if(type == Tdata)
+		f->skiphdr = 1;
+	else
+		f->skiphdr = 0;
+
+	return f;
+}
+
+/*
+ *  value of one hexadecimal digit (either case); any other
+ *  character counts as 0.
+ */
+static int
+htoi(char x)
+{
+	if('0' <= x && x <= '9')
+		return x - '0';
+	if('a' <= x && x <= 'f')
+		return x - 'a' + 10;
+	if('A' <= x && x <= 'F')
+		return x - 'A' + 10;
+	return 0;
+}
+
+/*
+ *  value of the byte written as the two hex digits p[0] p[1];
+ *  p[0] is the high nibble.
+ */
+static int
+hextoi(char *p)
+{
+	int hi, lo;
+
+	hi = htoi(p[0]);
+	lo = htoi(p[1]);
+	return (hi<<4) | lo;
+}
+
+/*
+ *  convert the hex string p into at most len bytes stored at v, two
+ *  digits per byte.  conversion stops at the end of the string or
+ *  after len bytes, whichever comes first; bytes of v that are not
+ *  reached are left untouched.
+ *
+ *  a trailing single digit is taken as the high nibble of a final
+ *  byte (low nibble 0).
+ */
+static void
+parseval(uint8_t *v, char *p, int len)
+{
+	while(*p && len-- > 0){
+		*v++ = hextoi(p);
+		/*
+		 *  odd number of digits: p[1] is the terminating NUL, so
+		 *  stepping by two would run past the end of the string.
+		 */
+		if(p[1] == 0)
+			break;
+		p += 2;
+	}
+}
+
+/*
+ *  parse one filter relation of the form
+ *	operand=val|val|...
+ *	operand&mask=val|val|...
+ *  into a newly allocated node.  p is modified in place.  returns nil
+ *  (with everything allocated here freed) on a syntax error.
+ *
+ *  the mask is looked for only in the text before the '=', because
+ *  follows() has already cut the string there.
+ */
+static Ipmux*
+parsemux(char *p)
+{
+	int n, nomask;
+	Ipmux *f;
+	char *val;
+	char *mask;
+	char *vals[20];
+	uint8_t *v;
+
+	/* parse operand */
+	f = parseop(&p);
+	if(f == nil)
+		return nil;
+
+	/* find value */
+	val = follows(p, '=');
+	if(val == nil)
+		goto parseerror;
+
+	/* parse mask */
+	mask = follows(p, '&');
+	if(mask != nil){
+		switch(f->type){
+		case Tsrc:
+		case Tdst:
+		case Tifc:
+			f->mask = smalloc(f->len);
+			v4parseip(f->mask, mask);
+			break;
+		case Tdata:
+		case Tiph:
+			f->mask = smalloc(f->len);
+			parseval(f->mask, mask, f->len);
+			break;
+		default:
+			/* proto takes no mask */
+			goto parseerror;
+		}
+		nomask = 0;
+	} else {
+		/* no mask given: compare every bit */
+		nomask = 1;
+		f->mask = smalloc(f->len);
+		memset(f->mask, 0xff, f->len);
+	}
+
+	/* parse vals */
+	f->n = getfields(val, vals, sizeof(vals)/sizeof(char*), 1, "|");
+	if(f->n == 0)
+		goto parseerror;
+	f->val = smalloc(f->n*f->len);
+	v = f->val;
+	for(n = 0; n < f->n; n++){
+		switch(f->type){
+		case Tsrc:
+		case Tdst:
+		case Tifc:
+			v4parseip(v, vals[n]);
+			break;
+		case Tproto:
+		case Tdata:
+		case Tiph:
+			parseval(v, vals[n], f->len);
+			break;
+		}
+		v += f->len;
+	}
+
+	f->eoff = f->off + f->len;
+	f->e = f->val + f->n*f->len;
+	/* a single value of 1, 2 or 4 bytes gets a specialised comparison type */
+	f->ctype = Cother;
+	if(f->n == 1){
+		switch(f->len){
+		case 1:
+			f->ctype = nomask ? Cbyte : Cmbyte;
+			break;
+		case 2:
+			f->ctype = nomask ? Cshort : Cmshort;
+			break;
+		case 4:
+			if(f->type == Tifc)
+				f->ctype = nomask ? Cifc : Cmifc;
+			else
+				f->ctype = nomask ? Cint32_t : Cmint32_t;
+			break;
+		}
+	}
+	return f;
+
+parseerror:
+	if(f->mask)
+		free(f->mask);
+	if(f->val)
+		free(f->val);
+	free(f);
+	return nil;
+}
+
+/*
+ *  Compare relative ordering of two ipmuxs.  This doesn't compare the
+ *  values, just the fields being looked at.
+ *
+ *  returns:	<0 if a is a more specific match
+ *		 0 if a and b are matching on the same fields
+ *		>0 if b is a more specific match
+ */
+static int
+ipmuxcmp(Ipmux *a, Ipmux *b)
+{
+	int n;
+
+	/* compare types, lesser ones are more important */
+	n = a->type - b->type;
+	if(n != 0)
+		return n;
+
+	/*
+	 * compare offsets, call earlier ones more specific.
+	 * nodes with skiphdr set have the offset of the data field
+	 * added so both offsets are measured from the same origin.
+	 */
+	n = (a->off+((int)a->skiphdr)*(int64_t)ipoff->data) -
+		(b->off+((int)b->skiphdr)*(int64_t)ipoff->data);
+	if(n != 0)
+		return n;
+
+	/* compare match lengths, longer ones are more specific */
+	n = b->len - a->len;
+	if(n != 0)
+		return n;
+
+	/*
+	 *  if we get here we have two entries matching
+	 *  the same bytes of the record.  Now check
+	 *  the mask for equality.  Longer masks are
+	 *  more specific.
+	 */
+	if(a->mask != nil && b->mask == nil)
+		return -1;
+	if(a->mask == nil && b->mask != nil)
+		return 1;
+	if(a->mask != nil && b->mask != nil){
+		/* lengths are equal here, so a->len covers both masks */
+		n = memcmp(b->mask, a->mask, a->len);
+		if(n != 0)
+			return n;
+	}
+	return 0;
+}
+
+/*
+ *  Compare the values of two ipmuxs.  We're assuming that ipmuxcmp
+ *  returned 0 comparing them.
+ */
+/*
+ *  order two nodes by their value lists: first by total size in
+ *  bytes (a larger list in a sorts first), then bytewise.
+ */
+static int
+ipmuxvalcmp(Ipmux *a, Ipmux *b)
+{
+	int asize, bsize;
+
+	asize = a->len*a->n;
+	bsize = b->len*b->n;
+	if(bsize != asize)
+		return bsize - asize;
+	return memcmp(a->val, b->val, asize);
+}
+
+/*
+ *  add onto an existing ipmux chain in the canonical comparison
+ *  order
+ */
+/*
+ *  insert f into the yes-linked list at *l, in front of the first
+ *  element that ipmuxcmp orders after it (or at the end).
+ */
+static void
+ipmuxchain(Ipmux **l, Ipmux *f)
+{
+	Ipmux *cur;
+
+	while((cur = *l) != nil && ipmuxcmp(f, cur) >= 0)
+		l = &cur->yes;
+	f->yes = cur;
+	*l = f;
+}
+
+/*
+ *  copy a tree
+ */
+/*
+ *  returns a recursive copy of the tree rooted at f (nil for nil).
+ *  each node gets its own val array; every other field, including
+ *  the mask pointer, ref and conv, is copied as is, so mask storage
+ *  is shared between the original and the copy.
+ */
+static Ipmux*
+ipmuxcopy(Ipmux *f)
+{
+	Ipmux *nf;
+
+	if(f == nil)
+		return nil;
+	nf = smalloc(sizeof *nf);
+	*nf = *f;
+	nf->no = ipmuxcopy(f->no);
+	nf->yes = ipmuxcopy(f->yes);
+	nf->val = smalloc(f->n*f->len);
+	nf->e = nf->val + f->len*f->n;
+	memmove(nf->val, f->val, f->n*f->len);
+	return nf;
+}
+
+/*
+ *  free a single node and its value array.  f must not be nil.
+ *  the mask and the yes/no links are not touched.
+ */
+static void
+ipmuxfree(Ipmux *f)
+{
+	uint8_t *vals;
+
+	vals = f->val;
+	if(vals != nil)
+		free(vals);
+	free(f);
+}
+
+/*
+ *  free the whole tree rooted at f: f itself and, recursively,
+ *  everything reachable through its no and yes links.  f may be nil.
+ *
+ *  the links must be followed recursively; freeing only the direct
+ *  children would leak the rest of any chain or subtree deeper than
+ *  two levels.
+ */
+static void
+ipmuxtreefree(Ipmux *f)
+{
+	if(f == nil)
+		return;
+	ipmuxtreefree(f->no);
+	ipmuxtreefree(f->yes);
+	ipmuxfree(f);
+}
+
+/*
+ *  merge two trees
+ */
+/*
+ *  merge tree b into tree a and return the root of the result.
+ *  both arguments are consumed: their nodes end up in the result
+ *  or are freed.
+ */
+static Ipmux*
+ipmuxmerge(Ipmux *a, Ipmux *b)
+{
+	int n;
+	Ipmux *f;
+
+	if(a == nil)
+		return b;
+	if(b == nil)
+		return a;
+	n = ipmuxcmp(a, b);
+	if(n < 0){
+		/* a tests an earlier field: b must be tried on both outcomes of a */
+		f = ipmuxcopy(b);
+		a->yes = ipmuxmerge(a->yes, b);
+		a->no = ipmuxmerge(a->no, f);
+		return a;
+	}
+	if(n > 0){
+		/* symmetric case: b becomes the root and a goes below it */
+		f = ipmuxcopy(a);
+		b->yes = ipmuxmerge(b->yes, a);
+		b->no = ipmuxmerge(b->no, f);
+		return b;
+	}
+	if(ipmuxvalcmp(a, b) == 0){
+		/* same test: share the node, count the extra user, merge the subtrees */
+		a->yes = ipmuxmerge(a->yes, b->yes);
+		a->no = ipmuxmerge(a->no, b->no);
+		a->ref++;
+		ipmuxfree(b);
+		return a;
+	}
+	/* same fields but different values: b is tried when a fails */
+	a->no = ipmuxmerge(a->no, b);
+	return a;
+}
+
+/*
+ *  remove a chain from a demux tree.  This is like merging except that
+ *  we remove instead of insert.
+ *
+ *  returns 0 once the chain has been removed and a negative value if
+ *  it could not be found on some path.
+ */
+static int
+ipmuxremove(Ipmux **l, Ipmux *f)
+{
+	int n, rv;
+	Ipmux *ft;
+
+	if(f == nil)
+		return 0;		/* we've removed it all */
+	if(*l == nil)
+		return -1;
+
+	ft = *l;
+	n = ipmuxcmp(ft, f);
+	if(n < 0){
+		/* *l is matching an earlier field, descend both paths */
+		rv = ipmuxremove(&ft->yes, f);
+		rv += ipmuxremove(&ft->no, f);
+		return rv;
+	}
+	if(n > 0){
+		/* f represents an earlier field than *l, this should be impossible */
+		return -1;
+	}
+
+	/* if we get here f and *l are comparing the same fields */
+	if(ipmuxvalcmp(ft, f) != 0){
+		/* different values mean mutually exclusive */
+		return ipmuxremove(&ft->no, f);
+	}
+
+	/* we found a match */
+	if(--(ft->ref) == 0){
+		/*
+		 *  a dead node implies the whole yes side is also dead.
+		 *  since our chain is constrained to be on that side,
+		 *  we're done.
+		 */
+		ipmuxtreefree(ft->yes);
+		*l = ft->no;
+		ipmuxfree(ft);
+		return 0;
+	}
+
+	/*
+	 *  free the rest of the chain.  it is constrained to match the
+	 *  yes side.
+	 */
+	return ipmuxremove(&ft->yes, f->yes);
+}
+
+/*
+ *  connection request is a semi separated list of filters
+ *  e.g. proto=17;data[0:4]=11aa22bb;ifc&255.255.255.0=135.104.9.2
+ *
+ *  there's no protection against overlapping specs.
+ */
+/*
+ *  connect ctl message: argv[1] holds the filter list.  each filter
+ *  is parsed and linked into a chain; the chain is kept in the
+ *  conversation and a copy of it is merged into the protocol's demux
+ *  tree.  returns nil on success, Ebadarg on a malformed request.
+ */
+static char*
+ipmuxconnect(Conv *c, char **argv, int argc)
+{
+	int i, n;
+	char *field[10];
+	Ipmux *mux, *chain;
+	Ipmuxrock *r;
+	Fs *f;
+
+	f = c->p->f;
+
+	if(argc != 2)
+		return Ebadarg;
+
+	n = getfields(argv[1], field, nelem(field), 1, ";");
+	if(n <= 0)
+		return Ebadarg;
+
+	chain = nil;
+	mux = nil;
+	for(i = 0; i < n; i++){
+		mux = parsemux(field[i]);
+		if(mux == nil){
+			ipmuxtreefree(chain);
+			return Ebadarg;
+		}
+		ipmuxchain(&chain, mux);
+	}
+	if(chain == nil)
+		return Ebadarg;
+	/*
+	 * mux is the node parsed last, which is not necessarily the
+	 * last node of the sorted chain.
+	 */
+	mux->conv = c;
+
+	/* save a copy of the chain so we can later remove it */
+	mux = ipmuxcopy(chain);
+	r = (Ipmuxrock*)(c->ptcl);
+	r->chain = chain;
+
+	/* add the chain to the protocol demultiplexor tree */
+	wlock(f);
+	f->ipmux->priv = ipmuxmerge(f->ipmux->priv, mux);
+	wunlock(f);
+
+	Fsconnected(c, nil);
+	return nil;
+}
+
+/*
+ *  format this conversation's filter chain into state (at most n
+ *  bytes) with ipmuxsprint and return what ipmuxsprint returns.
+ */
+static int
+ipmuxstate(Conv *c, char *state, int n)
+{
+	Ipmuxrock *rock = (Ipmuxrock*)(c->ptcl);
+
+	return ipmuxsprint(rock->chain, 0, state, n);
+}
+
+/*
+ *  set up a new conversation: open its read and write queues
+ *  (the write queue is opened with ipmuxkick and the conversation
+ *  as its argument) and start with no filter chain.
+ */
+static void
+ipmuxcreate(Conv *c)
+{
+	Ipmuxrock *rock = (Ipmuxrock*)(c->ptcl);
+
+	c->rq = qopen(64*1024, Qmsg, 0, c);
+	c->wq = qopen(64*1024, Qkick, ipmuxkick, c);
+	rock->chain = nil;
+}
+
+/*
+ *  announce is not implemented for ipmux; every call returns the
+ *  same error string and ignores its arguments.
+ */
+static char*
+ipmuxannounce(Conv *conv, char **argv, int i)
+{
+	char *err = "ipmux does not support announce";
+
+	return err;
+}
+
+/*
+ *  tear down a conversation: close its queues, reset its addresses
+ *  and ports, remove its filter chain from the demultiplexor tree
+ *  under the write lock, then free the saved chain.
+ */
+static void
+ipmuxclose(Conv *c)
+{
+	Fs *fs;
+	Ipmuxrock *rock;
+
+	fs = c->p->f;
+	rock = (Ipmuxrock*)(c->ptcl);
+
+	/* shut down the conversation's queues */
+	qclose(c->rq);
+	qclose(c->wq);
+	qclose(c->eq);
+
+	/* forget addresses and ports */
+	ipmove(c->laddr, IPnoaddr);
+	ipmove(c->raddr, IPnoaddr);
+	c->lport = 0;
+	c->rport = 0;
+
+	/* take the chain out of the tree; the result of ipmuxremove is not checked */
+	wlock(fs);
+	ipmuxremove((struct Ipmux **)&(c->p->priv), rock->chain);
+	wunlock(fs);
+
+	/* the saved chain is no longer needed */
+	ipmuxtreefree(rock->chain);
+	rock->chain = nil;
+}
+
+/*
+ *  takes a fully formed ip packet and just passes it down
+ *  the stack
+ */
+static void
+ipmuxkick(void *x)
+{
+	Conv *c;
+	Block *bp;
+	Myip4hdr *ih4;
+
+	c = x;
+
+	/* take one block off the write queue; nothing to do if it is empty */
+	bp = qget(c->wq);
+	if(bp == nil)
+		return;
+
+	/* the version nibble of the first byte selects the output routine */
+	ih4 = (Myip4hdr*)(bp->rp);
+	if((ih4->vihl & 0xF0) == IP_VER6)
+		ipoput6(c->p->f, bp, 0, ((Ip6hdr*)ih4)->ttl, 0, nil);
+	else
+		ipoput4(c->p->f, bp, 0, ih4->ttl, ih4->tos, nil);
+}
+
+/*
+ *  input routine: run an incoming packet through the filter tree.
+ *  if some matching node along the walk carries a conversation, the
+ *  packet is prefixed with the interface's local address and passed
+ *  to that conversation's read queue.  otherwise the packet goes to
+ *  the handler found in f->t2p for its protocol field, or is freed
+ *  when there is none.
+ */
+static void
+ipmuxiput(Proto *p, Ipifc *ifc, Block *bp)
+{
+	int blen, hdrlen, hit;
+	Fs *f = p->f;
+	uint8_t *pkt, *fld, *mask, *val, *vend, *allend;
+	Conv *c;
+	Ipmux *mux;
+	Myip4hdr *ip;
+	Ip6hdr *ip6;
+
+	/* header length in bytes, taken from the low nibble of vihl */
+	ip = (Myip4hdr*)bp->rp;
+	hdrlen = (ip->vihl&0x0F)<<2;
+
+	c = nil;
+	if(p->priv != nil){
+		pkt = bp->rp;
+		blen = BLEN(bp);
+
+		/* walk the filter tree under the read lock */
+		rlock(f);
+		mux = f->ipmux->priv;
+		while(mux != nil){
+			/* a field ending beyond the block cannot match */
+			if(mux->eoff > blen){
+				mux = mux->no;
+				continue;
+			}
+
+			fld = pkt + mux->off + ((int)mux->skiphdr)*hdrlen;
+			hit = 0;
+			switch(mux->ctype){
+			case Cbyte:
+				hit = *mux->val == *fld;
+				break;
+			case Cmbyte:
+				hit = (*fld & *mux->mask) == *mux->val;
+				break;
+			case Cshort:
+				hit = *((uint16_t*)mux->val) == *(uint16_t*)fld;
+				break;
+			case Cmshort:
+				hit = (*(uint16_t*)fld & (*((uint16_t*)mux->mask))) == *((uint16_t*)mux->val);
+				break;
+			case Cint32_t:
+				hit = *((uint32_t*)mux->val) == *(uint32_t*)fld;
+				break;
+			case Cmint32_t:
+				hit = (*(uint32_t*)fld & (*((uint32_t*)mux->mask))) == *((uint32_t*)mux->val);
+				break;
+			case Cifc:
+				/* compare against the interface's local v4 address, not the packet */
+				hit = *((uint32_t*)mux->val) == *(uint32_t*)(ifc->lifc->local + IPv4off);
+				break;
+			case Cmifc:
+				hit = (*(uint32_t*)(ifc->lifc->local + IPv4off) & (*((uint32_t*)mux->mask))) == *((uint32_t*)mux->val);
+				break;
+			default:
+				/*
+				 *  general case: mux->val up to mux->e holds
+				 *  candidate values of mux->len bytes each; the
+				 *  masked field must equal one of them.
+				 *  NOTE(review): the field pointer here omits the
+				 *  skiphdr*hdrlen adjustment used above - confirm
+				 *  that is intended.
+				 */
+				val = mux->val;
+				for(allend = mux->e; val < allend; val = vend){
+					mask = mux->mask;
+					fld = pkt + mux->off;
+					for(vend = val + mux->len; val < vend; val++){
+						if((*fld++ & *mask++) != *val)
+							break;
+					}
+					if(val == vend){
+						hit = 1;
+						break;
+					}
+				}
+				break;
+			}
+
+			if(!hit){
+				mux = mux->no;
+				continue;
+			}
+
+			/* matched: remember the conversation, if any, and keep refining */
+			if(mux->conv != nil)
+				c = mux->conv;
+			mux = mux->yes;
+		}
+		runlock(f);
+	}
+
+	if(c != nil){
+		/* tack on interface address */
+		bp = padblock(bp, IPaddrlen);
+		ipmove(bp->rp, ifc->lifc->local);
+		bp = concatblock(bp);
+		if(bp != nil && qpass(c->rq, bp) < 0)
+			print("ipmuxiput: qpass failed\n");
+		return;
+	}
+
+	/* doesn't match any filter, hand it to the specific protocol handler */
+	ip = (Myip4hdr*)bp->rp;
+	if((ip->vihl & 0xF0) == IP_VER4)
+		p = f->t2p[ip->proto];
+	else{
+		ip6 = (Ip6hdr*)bp->rp;
+		p = f->t2p[ip6->proto];
+	}
+	if(p != nil && p->rcv != nil)
+		(*p->rcv)(p, ifc, bp);
+	else
+		freeblist(bp);
+}
+
+/*
+ *  print the filter tree rooted at mux into buf (at most len bytes)
+ *  and return the sum of the snprint results.
+ *
+ *  each node is one line: level spaces of indentation, then
+ *  h[first:last]&mask=value|value|...  where first and last are the
+ *  byte offsets of the field, followed by the no subtree and then
+ *  the yes subtree one level deeper.  a nil node prints as an
+ *  indented empty line.
+ */
+static int
+ipmuxsprint(Ipmux *mux, int level, char *buf, int len)
+{
+	int i, j, n, first, last;
+	uint8_t *v;
+
+	n = 0;
+	for(i = 0; i < level; i++)
+		n += snprint(buf+n, len-n, " ");
+	if(mux == nil){
+		n += snprint(buf+n, len-n, "\n");
+		return n;
+	}
+
+	/*
+	 *  %d takes an int, so compute the offsets as ints rather than
+	 *  passing the 64-bit expression the ipoff->data cast produces.
+	 *  fields marked skiphdr are displaced by the offset of data.
+	 */
+	first = mux->off + ((int)mux->skiphdr)*(int)(int64_t)ipoff->data;
+	last = first + mux->len - 1;
+	n += snprint(buf+n, len-n, "h[%d:%d]&", first, last);
+
+	/* mask bytes in hex */
+	for(i = 0; i < mux->len; i++)
+		n += snprint(buf+n, len - n, "%2.2ux", mux->mask[i]);
+	n += snprint(buf+n, len-n, "=");
+
+	/* each of the mux->n values in hex, each followed by '|' */
+	v = mux->val;
+	for(j = 0; j < mux->n; j++){
+		for(i = 0; i < mux->len; i++)
+			n += snprint(buf+n, len - n, "%2.2ux", *v++);
+		n += snprint(buf+n, len-n, "|");
+	}
+	n += snprint(buf+n, len-n, "\n");
+
+	/* children are printed one level deeper, no side first */
+	level++;
+	n += ipmuxsprint(mux->no, level, buf+n, len-n);
+	n += ipmuxsprint(mux->yes, level, buf+n, len-n);
+	return n;
+}
+
+/*
+ *  print the protocol's whole filter tree into buf (at most len
+ *  bytes) while holding the read lock; returns what ipmuxsprint
+ *  returns.
+ */
+static int
+ipmuxstats(Proto *p, char *buf, int len)
+{
+	int nwritten;
+	Fs *fs;
+
+	fs = p->f;
+	rlock(fs);
+	nwritten = ipmuxsprint(p->priv, 0, buf, len);
+	runlock(fs);
+	return nwritten;
+}
+
+/*
+ *  allocate the ipmux protocol, fill in its operations, record it
+ *  in f->ipmux and register it with Fsproto.
+ */
+void
+ipmuxinit(Fs *f)
+{
+	Proto *p;
+
+	p = smalloc(sizeof *p);
+
+	/* identity, sizing and initial (empty) filter tree */
+	p->name = "ipmux";
+	p->ipproto = -1;
+	p->nc = 64;
+	p->ptclsize = sizeof(Ipmuxrock);
+	p->priv = nil;
+
+	/* conversation operations */
+	p->create = ipmuxcreate;
+	p->connect = ipmuxconnect;
+	p->announce = ipmuxannounce;
+	p->close = ipmuxclose;
+	p->state = ipmuxstate;
+
+	/* packet input and statistics; ctl and advise are not provided */
+	p->rcv = ipmuxiput;
+	p->stats = ipmuxstats;
+	p->ctl = nil;
+	p->advise = nil;
+
+	f->ipmux = p;			/* hack for Fsrcvpcol */
+
+	Fsproto(f, p);
+}

+ 54 - 17
sys/src/9/ip/iproute.c

@@ -21,10 +21,10 @@ static void	addnode(Fs*, Route**, Route*);
 static void	calcd(Route*);
 
 /* these are used for all instances of IP */
-Route*	v4freelist;
-Route*	v6freelist;
-RWlock	routelock;
-uint32_t	v4routegeneration, v6routegeneration;
+static Route*	v4freelist;
+static Route*	v6freelist;
+static RWlock	routelock;
+static uint32_t	v4routegeneration, v6routegeneration;
 
 static void
 freeroute(Route *r)
@@ -87,7 +87,7 @@ addqueue(Route **q, Route *r)
 }
 
 /*
- *  compare 2 v6 addresses
+ *   compare 2 v6 addresses
  */
 static int
 lcmp(uint32_t *a, uint32_t *b)
@@ -297,8 +297,7 @@ addnode(Fs *f, Route **cur, Route *new)
 #define	V4H(a)	((a&0x07ffffff)>>(32-Lroot-5))
 
 void
-v4addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate,
-	   int type)
+v4addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate, int type)
 {
 	Route *p;
 	uint32_t sa;
@@ -336,8 +335,7 @@ v4addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate,
 #define ISDFLT(a, mask, tag) ((ipcmp((a),v6Unspecified)==0) && (ipcmp((mask),v6Unspecified)==0) && (strcmp((tag), "ra")!=0))
 
 void
-v6addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate,
-	   int type)
+v6addroute(Fs *f, char *tag, uint8_t *a, uint8_t *mask, uint8_t *gate, int type)
 {
 	Route *p;
 	uint32_t sa[IPllen], ea[IPllen];
@@ -628,11 +626,10 @@ routetype(int type, char *p)
 		*p = 'p';
 }
 
-char *rformat = "%-15I %-4M %-15I %4.4s %4.4s %3s\n";
+static char *rformat = "%-15I %-4M %-15I %4.4s %4.4s %3s\n";
 
 void
-convroute(Route *r, uint8_t *addr, uint8_t *mask, uint8_t *gate, char *t,
-          int *nifc)
+convroute(Route *r, uint8_t *addr, uint8_t *mask, uint8_t *gate, char *t, int *nifc)
 {
 	int i;
 
@@ -674,7 +671,7 @@ sprintroute(Route *r, Routewalk *rw)
 	iname = "-";
 	if(nifc != -1) {
 		iname = ifbuf;
-		sprint(ifbuf, "%d", nifc);
+		snprint(ifbuf, sizeof ifbuf, "%d", nifc);
 	}
 	p = seprint(rw->p, rw->e, rformat, addr, mask, gate, t, r->tag, iname);
 	if(rw->o < 0){
@@ -771,7 +768,7 @@ delroute(Fs *f, Route *r, int dolock)
 
 /*
  *  recurse until one route is deleted
- *  returns 0 if nothing is deleted, 1 otherwise
+ *    returns 0 if nothing is deleted, 1 otherwise
  */
 int
 routeflush(Fs *f, Route *r, char *tag)
@@ -793,6 +790,31 @@ routeflush(Fs *f, Route *r, char *tag)
 	return 0;
 }
 
+/*
+ *  look up the route for ip: addresses for which isv4 is true go to
+ *  v4lookup using the bytes at IPv4off, all others go to v6lookup.
+ */
+Route *
+iproute(Fs *fs, uint8_t *ip)
+{
+	if(!isv4(ip))
+		return v6lookup(fs, ip, nil);
+	return v4lookup(fs, ip+IPv4off, nil);
+}
+
+/*
+ *  print one route with rformat: address, mask, gateway, type
+ *  string, tag and interface number ("-" when convroute reports
+ *  an interface number of -1).
+ */
+static void
+printroute(Route *r)
+{
+	int nifc;
+	char t[5], ifbuf[5];
+	char *iname;
+	uint8_t addr[IPaddrlen], mask[IPaddrlen], gate[IPaddrlen];
+
+	convroute(r, addr, mask, gate, t, &nifc);
+	if(nifc == -1)
+		iname = "-";
+	else{
+		/* bounded by ifbuf's size */
+		snprint(ifbuf, sizeof ifbuf, "%d", nifc);
+		iname = ifbuf;
+	}
+	print(rformat, addr, mask, gate, t, r->tag, iname);
+}
+
 int32_t
 routewrite(Fs *f, Chan *c, char *p, int n)
 {
@@ -804,6 +826,7 @@ routewrite(Fs *f, Chan *c, char *p, int n)
 	uint8_t mask[IPaddrlen];
 	uint8_t gate[IPaddrlen];
 	IPaux *a, *na;
+	Route *q;
 
 	cb = parsecmd(p, n);
 	if(waserror()){
@@ -828,7 +851,8 @@ routewrite(Fs *f, Chan *c, char *p, int n)
 	} else if(strcmp(cb->f[0], "remove") == 0){
 		if(cb->nf < 3)
 			error(Ebadarg);
-		parseip(addr, cb->f[1]);
+		if (parseip(addr, cb->f[1]) == -1)
+			error(Ebadip);
 		parseipmask(mask, cb->f[2]);
 		if(memcmp(addr, v4prefix, IPv4off) == 0)
 			v4delroute(f, addr+IPv4off, mask+IPv4off, 1);
@@ -837,9 +861,10 @@ routewrite(Fs *f, Chan *c, char *p, int n)
 	} else if(strcmp(cb->f[0], "add") == 0){
 		if(cb->nf < 4)
 			error(Ebadarg);
-		parseip(addr, cb->f[1]);
+		if(parseip(addr, cb->f[1]) == -1 ||
+		    parseip(gate, cb->f[3]) == -1)
+			error(Ebadip);
 		parseipmask(mask, cb->f[2]);
-		parseip(gate, cb->f[3]);
 		tag = "none";
 		if(c != nil){
 			a = c->aux;
@@ -857,6 +882,18 @@ routewrite(Fs *f, Chan *c, char *p, int n)
 		na = newipaux(a->owner, cb->f[1]);
 		c->aux = na;
 		free(a);
+	} else if(strcmp(cb->f[0], "route") == 0) {
+		if(cb->nf < 2)
+			error(Ebadarg);
+		if (parseip(addr, cb->f[1]) == -1)
+			error(Ebadip);
+
+		q = iproute(f, addr);
+		print("%I: ", addr);
+		if(q == nil)
+			print("no route\n");
+		else
+			printroute(q);
 	}
 
 	poperror();

+ 17 - 127
sys/src/9/ip/ipv6.c

@@ -19,13 +19,7 @@
 
 enum
 {
-	IP4HDR		= 20,		/* sizeof(Ip4hdr) */
-	IP6HDR		= 40,		/* sizeof(Ip6hdr) */
-	IP_HLEN4	= 0x05,		/* Header length in words */
-	IP_DF		= 0x4000,	/* Don't fragment */
-	IP_MF		= 0x2000,	/* More fragments */
 	IP6FHDR		= 8, 		/* sizeof(Fraghdr6) */
-	IP_MAX		= 32*1024,	/* Maximum Internet packet size */
 };
 
 #define IPV6CLASS(hdr)	(((hdr)->vcf[0]&0x0F)<<2 | ((hdr)->vcf[1]&0xF0)>>2)
@@ -35,11 +29,6 @@ enum
  */
 #define BKFG(xp)	((Ipfrag*)((xp)->base))
 
-typedef struct	IP	IP;
-typedef struct	Fragment4	Fragment4;
-typedef struct	Fragment6	Fragment6;
-typedef struct	Ipfrag	Ipfrag;
-
 Block*		ip6reassemble(IP*, int, Block*, Ip6hdr*);
 Fragment6*	ipfragallo6(IP*);
 void		ipfragfree6(IP*, Fragment6*);
@@ -47,101 +36,6 @@ Block*		procopts(Block *bp);
 static Block*	procxtns(IP *ip, Block *bp, int doreasm);
 int		unfraglen(Block *bp, uint8_t *nexthdr, int setfh);
 
-/* MIB II counters */
-enum
-{
-	Forwarding,
-	DefaultTTL,
-	InReceives,
-	InHdrErrors,
-	InAddrErrors,
-	ForwDatagrams,
-	InUnknownProtos,
-	InDiscards,
-	InDelivers,
-	OutRequests,
-	OutDiscards,
-	OutNoRoutes,
-	ReasmTimeout,
-	ReasmReqds,
-	ReasmOKs,
-	ReasmFails,
-	FragOKs,
-	FragFails,
-	FragCreates,
-
-	Nstats,
-};
-
-#if 0
-static char *statnames[] =
-{
-[Forwarding]	"Forwarding",
-[DefaultTTL]	"DefaultTTL",
-[InReceives]	"InReceives",
-[InHdrErrors]	"InHdrErrors",
-[InAddrErrors]	"InAddrErrors",
-[ForwDatagrams]	"ForwDatagrams",
-[InUnknownProtos]	"InUnknownProtos",
-[InDiscards]	"InDiscards",
-[InDelivers]	"InDelivers",
-[OutRequests]	"OutRequests",
-[OutDiscards]	"OutDiscards",
-[OutNoRoutes]	"OutNoRoutes",
-[ReasmTimeout]	"ReasmTimeout",
-[ReasmReqds]	"ReasmReqds",
-[ReasmOKs]	"ReasmOKs",
-[ReasmFails]	"ReasmFails",
-[FragOKs]	"FragOKs",
-[FragFails]	"FragFails",
-[FragCreates]	"FragCreates",
-};
-#endif
-
-struct Fragment4
-{
-	Block*	blist;
-	Fragment4*	next;
-	uint32_t 	src;
-	uint32_t 	dst;
-	uint16_t	id;
-	uint32_t 	age;
-};
-
-struct Fragment6
-{
-	Block*	blist;
-	Fragment6*	next;
-	uint8_t 	src[IPaddrlen];
-	uint8_t 	dst[IPaddrlen];
-	uint	id;
-	uint32_t 	age;
-};
-
-struct Ipfrag
-{
-	uint16_t	foff;
-	uint16_t	flen;
-};
-
-/* an instance of IP */
-struct IP
-{
-	uint32_t		stats[Nstats];
-
-	QLock		fraglock4;
-	Fragment4*	flisthead4;
-	Fragment4*	fragfree4;
-	Ref		id4;
-
-	QLock		fraglock6;
-	Fragment6*	flisthead6;
-	Fragment6*	fragfree6;
-	Ref		id6;
-
-	int		iprouting;	/* true if we route like a gateway */
-};
-
 int
 ipoput6(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 {
@@ -163,12 +57,13 @@ ipoput6(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 
 	ip->stats[OutRequests]++;
 
-	/* Number of uchars in data and ip header to write */
+	/* Number of uint8_ts in data and ip header to write */
 	len = blocklen(bp);
 
 	tentative = iptentative(f, eh->src);
 	if(tentative){
-		netlog(f, Logip, "reject tx of packet with tentative src address\n");
+		netlog(f, Logip, "reject tx of packet with tentative src address %I\n",
+			eh->src);
 		goto free;
 	}
 
@@ -179,12 +74,11 @@ ipoput6(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 			netlog(f, Logip, "short gated packet\n");
 			goto free;
 		}
-		if(chunk + IPV6HDR_LEN < len)
-			len = chunk + IPV6HDR_LEN;
+		if(chunk + IP6HDR < len)
+			len = chunk + IP6HDR;
 	}
 
 	if(len >= IP_MAX){
-//		print("len > IP_MAX, free\n");
 		ip->stats[OutDiscards]++;
 		netlog(f, Logip, "exceeded ip max size %I\n", eh->dst);
 		goto free;
@@ -233,7 +127,7 @@ ipoput6(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c)
 	/* If we dont need to fragment just send it */
 	medialen = ifc->maxtu - ifc->medium->hsize;
 	if(len <= medialen) {
-		hnputs(eh->ploadlen, len-IPV6HDR_LEN);
+		hnputs(eh->ploadlen, len - IP6HDR);
 		ifc->medium->bwrite(ifc, bp, V6, gate);
 		runlock(ifc);
 		poperror();
@@ -375,7 +269,7 @@ ipiput6(Fs *f, Ipifc *ifc, Block *bp)
 	tentative = iptentative(f, v6dst);
 
 	if(tentative && h->proto != ICMPv6) {
-		print("tentative addr, drop\n");
+		print("ipv6 non-icmp tentative addr %I, drop\n", v6dst);
 		freeblist(bp);
 		return;
 	}
@@ -391,7 +285,7 @@ ipiput6(Fs *f, Ipifc *ifc, Block *bp)
 	/* route */
 	if(notforme) {
 		if(!ip->iprouting){
-			freeb(bp);
+			freeblist(bp);
 			return;
 		}
 
@@ -545,16 +439,12 @@ unfraglen(Block *bp, uint8_t *nexthdr, int setfh)
 	ufl = IP6HDR;
 	p += ufl;
 
-	for(;;) {
-		if(*nexthdr == HBH || *nexthdr == RH) {
-			*nexthdr = *p;
-			hs = ((int)*(p+1) + 1) * 8;
-			ufl += hs;
-			q = p;
-			p += hs;
-		}
-		else
-			break;
+	while (*nexthdr == HBH || *nexthdr == RH) {
+		*nexthdr = *p;
+		hs = ((int)*(p+1) + 1) * 8;
+		ufl += hs;
+		q = p;
+		p += hs;
 	}
 
 	if(*nexthdr == FH)
@@ -623,9 +513,9 @@ ip6reassemble(IP* ip, int uflen, Block* bp, Ip6hdr* ih)
 		return bp;
 	}
 
-	if(bp->base+sizeof(Ipfrag) >= bp->rp){
-		bp = padblock(bp, sizeof(Ipfrag));
-		bp->rp += sizeof(Ipfrag);
+	if(bp->base+IPFRAGSZ >= bp->rp){
+		bp = padblock(bp, IPFRAGSZ);
+		bp->rp += IPFRAGSZ;
 	}
 
 	BKFG(bp)->foff = offset;

+ 47 - 50
sys/src/9/ip/ipv6.h

@@ -1,4 +1,4 @@
-/* 
+/*
  * This file is part of the UCB release of Plan 9. It is subject to the license
  * terms in the LICENSE file found in the top-level directory of this
  * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
@@ -10,13 +10,15 @@
 /*
  * Internet Protocol Version 6
  *
- * rfc2460 defines the protocol.
- * rfc4291 defines the address prefices.
+ * rfc2460 defines the protocol, rfc2461 neighbour discovery, and
+ * rfc2462 address autoconfiguration.  rfc4443 defines ICMP; was rfc2463.
+ * rfc4291 defines the address architecture (including prefices), was rfc3513.
+ * rfc4007 defines the scoped address architecture.
  *
  * global unicast is anything but unspecified (::), loopback (::1),
  * multicast (ff00::/8), and link-local unicast (fe80::/10).
  *
- * site-local (fec0::/10) is now deprecated by rfc3879.
+ * site-local (fec0::/10) is now deprecated, originally by rfc3879.
  *
  * Unique Local IPv6 Unicast Addresses are defined by rfc4193.
  * prefix is fc00::/7, scope is global, routing is limited to roughly a site.
@@ -27,10 +29,6 @@
 #define optexsts(np)	(nhgets((np)->ploadlen) > 24)
 #define issmcast(addr)	(memcmp((addr), v6solicitednode, 13) == 0)
 
-#ifndef MIN
-#define MIN(a, b) ((a) <= (b)? (a): (b))
-#endif
-
 enum {				/* Header Types */
 	HBH		= 0,	/* hop-by-hop multicast routing protocol */
 	ICMP		= 1,
@@ -72,28 +70,38 @@ enum {
 	/* various prefix lengths */
 	SOLN_PREF_LEN	= 13,
 
-	/* icmpv6 unreach codes */
-	icmp6_no_route		= 0,
-	icmp6_ad_prohib		= 1,
-	icmp6_unassigned	= 2,
-	icmp6_adr_unreach	= 3,
-	icmp6_port_unreach	= 4,
-	icmp6_unkn_code		= 5,
+	/* icmpv6 unreachability codes */
+	Icmp6_no_route		= 0,
+	Icmp6_ad_prohib		= 1,
+	Icmp6_out_src_scope	= 2,
+	Icmp6_adr_unreach	= 3,
+	Icmp6_port_unreach	= 4,
+	Icmp6_gress_src_fail	= 5,
+	Icmp6_rej_route		= 6,
+	Icmp6_unknown		= 7,  /* our own invention for internal use */
 
 	/* various flags & constants */
 	v6MINTU		= 1280,
 	HOP_LIMIT	= 255,
-	ETHERHDR_LEN	= 14,
-	IPV6HDR_LEN	= 40,
-	IPV4HDR_LEN	= 20,
+	IP6HDR		= 40,		/* sizeof(Ip6hdr) = 8 + 2*16 */
 
 	/* option types */
 
+	/* neighbour discovery */
 	SRC_LLADDR	= 1,
 	TARGET_LLADDR	= 2,
 	PREFIX_INFO	= 3,
 	REDIR_HEADER	= 4,
 	MTU_OPTION	= 5,
+	/* new since rfc2461; see iana.org/assignments/icmpv6-parameters */
+	V6nd_home	= 8,
+	V6nd_srcaddrs	= 9,		/* rfc3122 */
+	V6nd_ip		= 17,
+	/* /lib/rfc/drafts/draft-jeong-dnsop-ipv6-dns-discovery-12.txt */
+	V6nd_rdns	= 25,
+	/* plan 9 extensions */
+	V6nd_9fs	= 250,
+	V6nd_9auth	= 251,
 
 	SRC_UNSPEC	= 0,
 	SRC_UNI		= 1,
@@ -104,26 +112,9 @@ enum {
 	Tuniproxy	= 2,
 	Tunirany	= 3,
 
-	/* Router constants (all times in milliseconds) */
-	MAX_INIT_RTR_ADVERT_INTVL = 16000,
-	MAX_INIT_RTR_ADVERTS	= 3,
-	MAX_FINAL_RTR_ADVERTS	= 3,
-	MIN_DELAY_BETWEEN_RAS	= 3000,
-	MAX_RA_DELAY_TIME	= 500,
-
-	/* Host constants */
-	MAX_RTR_SOLICIT_DELAY	= 1000,
-	RTR_SOLICIT_INTVL	= 4000,
-	MAX_RTR_SOLICITS	= 3,
-
 	/* Node constants */
 	MAX_MULTICAST_SOLICIT	= 3,
-	MAX_UNICAST_SOLICIT	= 3,
-	MAX_ANYCAST_DELAY_TIME	= 1000,
-	MAX_NEIGHBOR_ADVERT	= 3,
-	REACHABLE_TIME		= 30000,
 	RETRANS_TIMER		= 1000,
-	DELAY_FIRST_PROBE_TIME	= 5000,
 };
 
 typedef struct Ip6hdr	Ip6hdr;
@@ -131,21 +122,32 @@ typedef struct Opthdr	Opthdr;
 typedef struct Routinghdr Routinghdr;
 typedef struct Fraghdr6	Fraghdr6;
 
+/* we do this in case there's padding at the end of Ip6hdr */
+#define IPV6HDR \
+	uint8_t	vcf[4];		/* version:4, traffic class:8, flow label:20 */\
+	uint8_t	ploadlen[2];	/* payload length: packet length - 40 */ \
+	uint8_t	proto;		/* next header type */ \
+	uint8_t	ttl;		/* hop limit */ \
+	uint8_t	src[IPaddrlen]; \
+	uint8_t	dst[IPaddrlen]
+
 struct	Ip6hdr {
-	uint8_t	vcf[4];		/* version:4, traffic class:8, flow label:20 */
-	uint8_t	ploadlen[2];	/* payload length: packet length - 40 */
-	uint8_t	proto;		/* next header type */
-	uint8_t	ttl;		/* hop limit */
-	uint8_t	src[IPaddrlen];
-	uint8_t	dst[IPaddrlen];
+	IPV6HDR;
+	uint8_t	payload[];
 };
 
-struct	Opthdr {
+struct	Opthdr {		/* unused */
 	uint8_t	nexthdr;
 	uint8_t	len;
 };
 
-struct	Routinghdr {
+/*
+ * Beware routing header type 0 (loose source routing); see
+ * http://www.secdev.org/conf/IPv6_RH_security-csw07.pdf.
+ * Type 1 is unused.  Type 2 is for MIPv6 (mobile IPv6) filtering
+ * against type 0 header.
+ */
+struct	Routinghdr {		/* unused */
 	uint8_t	nexthdr;
 	uint8_t	len;
 	uint8_t	rtetype;
@@ -165,7 +167,6 @@ extern uint8_t v6allroutersN[IPaddrlen];
 extern uint8_t v6allroutersL[IPaddrlen];
 extern uint8_t v6allnodesNmask[IPaddrlen];
 extern uint8_t v6allnodesLmask[IPaddrlen];
-extern uint8_t v6allroutersS[IPaddrlen];
 extern uint8_t v6solicitednode[IPaddrlen];
 extern uint8_t v6solicitednodemask[IPaddrlen];
 extern uint8_t v6Unspecified[IPaddrlen];
@@ -173,12 +174,10 @@ extern uint8_t v6loopback[IPaddrlen];
 extern uint8_t v6loopbackmask[IPaddrlen];
 extern uint8_t v6linklocal[IPaddrlen];
 extern uint8_t v6linklocalmask[IPaddrlen];
-extern uint8_t v6glunicast[IPaddrlen];
 extern uint8_t v6multicast[IPaddrlen];
 extern uint8_t v6multicastmask[IPaddrlen];
 
 extern int v6llpreflen;
-extern int v6lbpreflen;
 extern int v6mcpreflen;
 extern int v6snpreflen;
 extern int v6aNpreflen;
@@ -187,10 +186,8 @@ extern int v6aLpreflen;
 extern int ReTransTimer;
 
 void ipv62smcast(uint8_t *, uint8_t *);
-void icmpns(Fs *f, uint8_t* src, int suni, uint8_t* targ, int tuni,
-	    uint8_t* mac);
-void icmpna(Fs *f, uint8_t* src, uint8_t* dst, uint8_t* targ, uint8_t* mac,
-	    uint8_t flags);
+void icmpns(Fs *f, uint8_t* src, int suni, uint8_t* targ, int tuni, uint8_t* mac);
+void icmpna(Fs *f, uint8_t* src, uint8_t* dst, uint8_t* targ, uint8_t* mac, uint8_t flags);
 void icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp);
 void icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp);
 void icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free);

+ 4 - 4
sys/src/9/ip/loopbackmedium.c

@@ -32,7 +32,7 @@ struct LB
 static void loopbackread(void *a);
 
 static void
-loopbackbind(Ipifc *ifc, int i, char** c)
+loopbackbind(Ipifc *ifc, int i, char**argv)
 {
 	LB *lb;
 
@@ -40,7 +40,7 @@ loopbackbind(Ipifc *ifc, int i, char** c)
 	lb->f = ifc->conv->p->f;
 	lb->q = qopen(1024*1024, Qmsg, nil, nil);
 	ifc->arg = lb;
-	ifc->mbps = 10001;
+	ifc->mbps = 1000;
 
 	kproc("loopbackread", loopbackread, ifc);
 
@@ -53,7 +53,7 @@ loopbackunbind(Ipifc *ifc)
 	LB *lb = ifc->arg;
 
 	if(lb->readp)
-		postnote(lb->readp, 1, "unbind", NUser);
+		postnote(lb->readp, 1, "unbind", 0);
 
 	/* wait for reader to die */
 	while(lb->readp != 0)
@@ -65,7 +65,7 @@ loopbackunbind(Ipifc *ifc)
 }
 
 static void
-loopbackbwrite(Ipifc *ifc, Block *bp, int i, uint8_t* m)
+loopbackbwrite(Ipifc *ifc, Block *bp, int i, uint8_t *c)
 {
 	LB *lb;
 

+ 3 - 4
sys/src/9/ip/netdevmedium.c

@@ -18,8 +18,7 @@
 
 static void	netdevbind(Ipifc *ifc, int argc, char **argv);
 static void	netdevunbind(Ipifc *ifc);
-static void	netdevbwrite(Ipifc *ifc, Block *bp, int version,
-				uint8_t *ip);
+static void	netdevbwrite(Ipifc *ifc, Block *bp, int version, uint8_t *ip);
 static void	netdevread(void *a);
 
 typedef struct	Netdevrock Netdevrock;
@@ -77,7 +76,7 @@ netdevunbind(Ipifc *ifc)
 	Netdevrock *er = ifc->arg;
 
 	if(er->readp != nil)
-		postnote(er->readp, 1, "unbind", NUser);
+		postnote(er->readp, 1, "unbind", 0);
 
 	/* wait for readers to die */
 	while(er->readp != nil)
@@ -93,7 +92,7 @@ netdevunbind(Ipifc *ifc)
  *  called by ipoput with a single block to write
  */
 static void
-netdevbwrite(Ipifc *ifc, Block *bp, int i, uint8_t* n)
+netdevbwrite(Ipifc *ifc, Block *bp, int i, uint8_t *c)
 {
 	Netdevrock *er = ifc->arg;
 

+ 8 - 7
sys/src/9/ip/netlog.c

@@ -16,7 +16,7 @@
 #include	"../ip/ip.h"
 
 enum {
-	Nlog		= 4*1024,
+	Nlog		= 16*1024,
 };
 
 /*
@@ -31,7 +31,7 @@ struct Netlog {
 	int	len;
 
 	int	logmask;			/* mask of things to debug */
-	unsigned char	iponly[IPaddrlen];		/* ip address to print debugging for */
+	uint8_t	iponly[IPaddrlen];		/* ip address to print debugging for */
 	int	iponlyset;
 
 	QLock;
@@ -49,11 +49,9 @@ static Netlogflag flags[] =
 	{ "ip",		Logip, },
 	{ "fs",		Logfs, },
 	{ "tcp",	Logtcp, },
-	{ "il",		Logil, },
 	{ "icmp",	Logicmp, },
 	{ "udp",	Logudp, },
 	{ "compress",	Logcompress, },
-	{ "ilmsg",	Logil|Logilmsg, },
 	{ "gre",	Loggre, },
 	{ "tcpwin",	Logtcp|Logtcpwin, },
 	{ "tcprxmt",	Logtcp|Logtcprxmt, },
@@ -97,6 +95,8 @@ netlogopen(Fs *f)
 	if(f->alog->opens == 0){
 		if(f->alog->buf == nil)
 			f->alog->buf = malloc(Nlog);
+		if(f->alog->buf == nil)
+			error(Enomem);
 		f->alog->rptr = f->alog->buf;
 		f->alog->end = f->alog->buf + Nlog;
 	}
@@ -132,7 +132,7 @@ netlogready(void *a)
 }
 
 int32_t
-netlogread(Fs *f, void *a, uint32_t mm, int32_t n)
+netlogread(Fs *f, void *a, uint32_t u, int32_t n)
 {
 	Mach *m = machp();
 	int i, d;
@@ -215,10 +215,11 @@ netlogctl(Fs *f, char* s, int n)
 		else
 			f->alog->iponlyset = 1;
 		free(cb);
+		poperror();
 		return;
 
 	default:
-		cmderror(cb, "unknown ip control message");
+		cmderror(cb, "unknown netlog control message");
 	}
 
 	for(i = 1; i < cb->nf; i++){
@@ -240,7 +241,7 @@ netlogctl(Fs *f, char* s, int n)
 void
 netlog(Fs *f, int mask, char *fmt, ...)
 {
-	char buf[128], *t, *fp;
+	char buf[256], *t, *fp;
 	int i, n;
 	va_list arg;
 

+ 3 - 3
sys/src/9/ip/nullmedium.c

@@ -17,18 +17,18 @@
 #include "ip.h"
 
 static void
-nullbind(Ipifc* i, int n, char** c)
+nullbind(Ipifc *ipifc, int i, char **argv)
 {
 	error("cannot bind null device");
 }
 
 static void
-nullunbind(Ipifc* i)
+nullunbind(Ipifc *ipifc)
 {
 }
 
 static void
-nullbwrite(Ipifc* i, Block* b, int n, uint8_t* m)
+nullbwrite(Ipifc *ipifc, Block *block, int i, uint8_t *c)
 {
 	error("nullbwrite");
 }

+ 4 - 4
sys/src/9/ip/pktmedium.c

@@ -33,7 +33,6 @@ Medium pktmedium =
 .unbind=	pktunbind,
 .bwrite=	pktbwrite,
 .pktin=		pktin,
-.unbindonclose=	1,
 };
 
 /*
@@ -41,15 +40,16 @@ Medium pktmedium =
  *  called with ifc wlock'd
  */
 static void
-pktbind(Ipifc* i, int n, char** c)
+pktbind(Ipifc *ipifc, int argc, char **argv)
 {
+	USED(argc); USED(argv);
 }
 
 /*
  *  called with ifc wlock'd
  */
 static void
-pktunbind(Ipifc* i)
+pktunbind(Ipifc *ipifc)
 {
 }
 
@@ -57,7 +57,7 @@ pktunbind(Ipifc* i)
  *  called by ipoput with a single packet to write
  */
 static void
-pktbwrite(Ipifc *ifc, Block *bp, int i, uint8_t* m)
+pktbwrite(Ipifc *ifc, Block *bp, int i, uint8_t *c)
 {
 	/* enqueue onto the conversation's rq */
 	bp = concatblock(bp);

+ 1 - 1
sys/src/9/ip/ptclbsum.c

@@ -15,7 +15,7 @@
 #include	"../port/error.h"
 #include	"ip.h"
 
-static	int16_t	endian	= 1;
+static	short	endian	= 1;
 static	uint8_t*	aendian	= (uint8_t*)&endian;
 #define	LITTLE	*aendian
 

+ 1067 - 0
sys/src/9/ip/rudp.c

@@ -0,0 +1,1067 @@
+/*
+ * This file is part of the UCB release of Plan 9. It is subject to the license
+ * terms in the LICENSE file found in the top-level directory of this
+ * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
+ * part of the UCB release of Plan 9, including this file, may be copied,
+ * modified, propagated, or distributed except according to the terms contained
+ * in the LICENSE file.
+ */
+
+/*
+ *  Reliable User Datagram Protocol, currently only for IPv4.
+ *  This protocol is compatible with UDP's packet format.
+ *  It could be done over UDP if need be.
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"ip.h"
+
+#define DEBUG	0
+#define DPRINT if(DEBUG)print
+
+/*
+ *  Sequence number arithmetic on 32-bit counters that wrap around.
+ *
+ *  SEQDIFF(a,b)	distance from b up to a, allowing for a having wrapped past b
+ *  INSEQ(a,start,end)	true when a lies in (start, end]; the window may wrap
+ *  UNACKED(r)	packets sent on r that have not been acked yet
+ *  NEXTSEQ(a)	successor of a; 0 is skipped because a sequence number
+ *		of 0 in the rudp header means "no packet"
+ *
+ *  NOTE: UNACKED does not parenthesize its argument, so pass a plain
+ *  identifier, not an expression.
+ */
+#define SEQDIFF(a,b) ( (a)>=(b)?\
+			(a)-(b):\
+			0xffffffffUL-((b)-(a)) )
+#define INSEQ(a,start,end) ( (start)<=(end)?\
+				((a)>(start)&&(a)<=(end)):\
+				((a)>(start)||(a)<=(end)) )
+#define UNACKED(r) SEQDIFF(r->sndseq, r->ackrcvd)
+#define NEXTSEQ(a) ( (a)+1 == 0 ? 1 : (a)+1 )
+
+/*
+ *  Header sizes (bytes) and protocol tunables.
+ */
+enum
+{
+	UDP_PHDRSIZE	= 12,	/* pseudo header */
+//	UDP_HDRSIZE	= 20,	/* pseudo header + udp header */
+	UDP_RHDRSIZE	= 36,	/* pseudo header + udp header + rudp header */
+	UDP_IPHDR	= 8,	/* ip header (the part ahead of the pseudo header) */
+	IP_UDPPROTO	= 254,	/* ip protocol number registered by rudpinit */
+	UDP_USEAD7	= 52,	/* size of new ipv6 headers struct: 3 addresses + 2 ports */
+
+	Rudprxms	= 200,	/* retransmit once timeout exceeds Rudprxms*xmits */
+	Rudptickms	= 50,	/* period of the relackproc loop */
+	Rudpmaxxmit	= 10,	/* relrexmit hangs up once xmits exceeds this */
+	Maxunacked	= 100,	/* rudpkick blocks the writer above this many unacked packets */
+};
+
+#define Hangupgen	0xffffffff	/* used only in hangup messages */
+
+/*
+ *  View of a packet without the rudp extension: the first 8 bytes of the
+ *  ip header, the udp pseudo header, then the udp header itself.
+ *  All multi-byte fields are byte arrays accessed with nhgets/hnputs.
+ */
+typedef struct Udphdr Udphdr;
+struct Udphdr
+{
+	/* ip header */
+	uint8_t	vihl;		/* Version and header length */
+	uint8_t	tos;		/* Type of service */
+	uint8_t	length[2];	/* packet length */
+	uint8_t	id[2];		/* Identification */
+	uint8_t	frag[2];	/* Fragment information */
+
+	/* pseudo header starts here */
+	uint8_t	Unused;
+	uint8_t	udpproto;	/* Protocol */
+	uint8_t	udpplen[2];	/* Header plus data length */
+	uint8_t	udpsrc[4];	/* Ip source */
+	uint8_t	udpdst[4];	/* Ip destination */
+
+	/* udp header */
+	uint8_t	udpsport[2];	/* Source port */
+	uint8_t	udpdport[2];	/* Destination port */
+	uint8_t	udplen[2];	/* data length */
+	uint8_t	udpcksum[2];	/* Checksum */
+};
+
+/*
+ *  Same layout as Udphdr followed by the 16-byte rudp header that carries
+ *  the sequence/generation of this packet and of the packet being acked.
+ */
+typedef struct Rudphdr Rudphdr;
+struct Rudphdr
+{
+	/* ip header */
+	uint8_t	vihl;		/* Version and header length */
+	uint8_t	tos;		/* Type of service */
+	uint8_t	length[2];	/* packet length */
+	uint8_t	id[2];		/* Identification */
+	uint8_t	frag[2];	/* Fragment information */
+
+	/* pseudo header starts here */
+	uint8_t	Unused;
+	uint8_t	udpproto;	/* Protocol */
+	uint8_t	udpplen[2];	/* Header plus data length */
+	uint8_t	udpsrc[4];	/* Ip source */
+	uint8_t	udpdst[4];	/* Ip destination */
+
+	/* udp header */
+	uint8_t	udpsport[2];	/* Source port */
+	uint8_t	udpdport[2];	/* Destination port */
+	uint8_t	udplen[2];	/* data length (includes rudp header) */
+	uint8_t	udpcksum[2];	/* Checksum */
+
+	/* rudp header */
+	uint8_t	relseq[4];	/* id of this packet (or 0) */
+	uint8_t	relsgen[4];	/* generation/time stamp */
+	uint8_t	relack[4];	/* packet being acked (or 0) */
+	uint8_t	relagen[4];	/* generation/time stamp */
+};
+
+
+/*
+ *  one state structure per destination
+ *
+ *  Reference counted: relstate() gives a new entry one reference for
+ *  being on the Rudpcb list and gives every caller one more; relput()
+ *  drops a reference and frees the entry when the count reaches zero.
+ */
+typedef struct Reliable Reliable;
+struct Reliable
+{
+	Ref;
+
+	Reliable *next;		/* next entry on the Rudpcb.r list */
+
+	uint8_t	addr[IPaddrlen];	/* always V6 when put here */
+	uint16_t	port;
+
+	Block	*unacked;	/* unacked msg list */
+	Block	*unackedtail;	/*  and its tail */
+
+	int	timeout;	/* time since first unacked msg sent */
+	int	xmits;		/* number of times first unacked msg sent */
+
+	uint32_t	sndseq;		/* next packet to be sent */
+	uint32_t	sndgen;		/*  and its generation */
+
+	uint32_t	rcvseq;		/* last packet received */
+	uint32_t	rcvgen;		/*  and its generation */
+
+	uint32_t	acksent;	/* last ack sent */
+	uint32_t	ackrcvd;	/* last msg for which ack was rcvd */
+
+	/* flow control */
+	QLock	lock;		/* held by rudpkick around its flow-control sleep */
+	Rendez	vous;		/* rudpkick sleeps here; reliput and relhangup wake it */
+	int	blocked;	/* set while rudpkick is sleeping on vous */
+};
+
+
+
+/* MIB II counters */
+typedef struct Rudpstats Rudpstats;
+struct Rudpstats
+{
+	uint32_t	rudpInDatagrams;	/* packets handed to rudpiput */
+	uint32_t	rudpNoPorts;		/* packets with no matching conversation */
+	uint32_t	rudpInErrors;		/* packets dropped for a bad checksum */
+	uint32_t	rudpOutDatagrams;	/* data packets sent by rudpkick */
+};
+
+/*
+ *  Per-protocol private state, hung off Proto.priv by rudpinit.
+ */
+typedef struct Rudppriv Rudppriv;
+struct Rudppriv
+{
+	Ipht	ht;			/* conversation hash table searched by rudpiput */
+
+	/* MIB counters */
+	Rudpstats	ustats;
+
+	/* non-MIB stats */
+	uint32_t	csumerr;		/* checksum errors */
+	uint32_t	lenerr;			/* short packet */
+	uint32_t	rxmits;			/* # of retransmissions */
+	uint32_t	orders;			/* # of out of order pkts */
+
+	/* keeping track of the ack kproc */
+	int	ackprocstarted;		/* nonzero once relackproc has been started */
+	QLock	apl;			/* serializes starting the kproc */
+};
+
+
+static uint32_t generation = 0;
+//static Rendez rend;
+
+/*
+ *  protocol specific part of Conv
+ *
+ *  The embedded QLock is held by callers of relstate, relforget and
+ *  reliput, i.e. whenever the list r is walked or changed.
+ */
+typedef struct Rudpcb Rudpcb;
+struct Rudpcb
+{
+	QLock;
+	uint8_t	headers;	/* 0, or 7 after a "headers" ctl request */
+	uint8_t	randdrop;	/* percentage of outgoing packets doipoput drops */
+	Reliable *r;		/* list of per-destination state */
+};
+
+/*
+ * local functions
+ */
+void	relsendack(Conv*, Reliable*, int);
+int	reliput(Conv*, Block*, uint8_t*, uint16_t);
+Reliable *relstate(Rudpcb*, uint8_t*, uint16_t, char*);
+void	relput(Reliable*);
+void	relforget(Conv *, uint8_t*, int, int);
+void	relackproc(void *);
+void	relackq(Reliable *, Block*);
+void	relhangup(Conv *, Reliable*);
+void	relrexmit(Conv *, Reliable*);
+void	relput(Reliable*);
+void	rudpkick(void *x);
+
+/*
+ *  Start the retransmit/ack kproc for this protocol instance the first
+ *  time it is needed.  The flag is tested once without the lock as a fast
+ *  path and again under apl so only one kproc is ever created.
+ */
+static void
+rudpstartackproc(Proto *rudp)
+{
+	char name[KNAMELEN];
+	Rudppriv *priv = rudp->priv;
+
+	if(priv->ackprocstarted != 0)
+		return;
+
+	qlock(&priv->apl);
+	if(priv->ackprocstarted == 0){
+		snprint(name, sizeof name, "#I%drudpack", rudp->f->dev);
+		kproc(name, relackproc, rudp);
+		priv->ackprocstarted = 1;
+	}
+	qunlock(&priv->apl);
+}
+
+/*
+ *  connect ctl handler: make sure the ack kproc is running, let
+ *  Fsstdconnect parse the arguments and report the outcome through
+ *  Fsconnected.  The conversation is entered into the protocol hash
+ *  table only when the connect succeeded, as rudpannounce does.
+ *  Returns nil on success, otherwise the error string from Fsstdconnect.
+ */
+static char*
+rudpconnect(Conv *c, char **argv, int argc)
+{
+	char *e;
+	Rudppriv *upriv;
+
+	upriv = c->p->priv;
+	rudpstartackproc(c->p);
+	e = Fsstdconnect(c, argv, argc);
+	Fsconnected(c, e);
+	if(e != nil)
+		return e;	/* don't hash a conversation that failed to connect */
+	iphtadd(&upriv->ht, c);
+
+	return nil;
+}
+
+
+/*
+ *  Format the conversation state into state[0..n): "Open" or "Closed"
+ *  followed by one " addr/unacked" entry per known destination and a
+ *  newline.  Returns the sum of the snprint results.
+ */
+static int
+rudpstate(Conv *c, char *state, int n)
+{
+	Rudpcb *cb = (Rudpcb*)c->ptcl;
+	Reliable *rel;
+	int len;
+
+	len = snprint(state, n, "%s", c->inuse?"Open":"Closed");
+
+	qlock(cb);
+	rel = cb->r;
+	while(rel != nil){
+		len += snprint(state+len, n-len, " %I/%ld", rel->addr, UNACKED(rel));
+		rel = rel->next;
+	}
+	len += snprint(state+len, n-len, "\n");
+	qunlock(cb);
+
+	return len;
+}
+
+/*
+ *  announce ctl handler: make sure the ack kproc is running and let
+ *  Fsstdannounce parse the arguments.  On success the conversation is
+ *  marked connected and entered into the protocol hash table.
+ *  Returns nil on success, otherwise the error string from Fsstdannounce.
+ */
+static char*
+rudpannounce(Conv *c, char** argv, int argc)
+{
+	Rudppriv *priv = c->p->priv;
+	char *err;
+
+	rudpstartackproc(c->p);
+	err = Fsstdannounce(c, argv, argc);
+	if(err == nil){
+		Fsconnected(c, nil);
+		iphtadd(&priv->ht, c);
+	}
+	return err;
+}
+
+/*
+ *  Allocate the read and write queues of a new conversation.  Writes to
+ *  wq call rudpkick with the conversation as argument.
+ */
+static void
+rudpcreate(Conv *c)
+{
+	enum { Rudpqsize = 64*1024 };	/* limit for both queues */
+
+	c->rq = qopen(Rudpqsize, Qmsg, 0, 0);
+	c->wq = qopen(Rudpqsize, Qkick, rudpkick, c);
+}
+
+/*
+ *  Close a conversation: remove it from the protocol hash table, flush
+ *  pending acks, close the queues, clear addresses and ports, then hang
+ *  up and release every per-destination state record.
+ */
+static void
+rudpclose(Conv *c)
+{
+	Rudpcb *ucb;
+	Reliable *r, *nr;
+	Rudppriv *upriv;
+
+	upriv = c->p->priv;
+	iphtrem(&upriv->ht, c);
+
+	/* force out any delayed acks */
+	ucb = (Rudpcb*)c->ptcl;
+	qlock(ucb);
+	for(r = ucb->r; r; r = r->next){
+		if(r->acksent != r->rcvseq)
+			relsendack(c, r, 0);
+	}
+	qunlock(ucb);
+
+	qclose(c->rq);
+	qclose(c->wq);
+	qclose(c->eq);
+	ipmove(c->laddr, IPnoaddr);
+	ipmove(c->raddr, IPnoaddr);
+	c->lport = 0;
+	c->rport = 0;
+
+	/* NOTE(review): these two fields are reset without holding ucb */
+	ucb->headers = 0;
+	ucb->randdrop = 0;
+	qlock(ucb);
+	for(r = ucb->r; r; r = nr){
+		if(r->acksent != r->rcvseq)
+			relsendack(c, r, 0);
+		nr = r->next;	/* fetch before relput, which may free r */
+		relhangup(c, r);
+		relput(r);	/* drop the list's reference */
+	}
+	ucb->r = 0;
+
+	qunlock(ucb);
+}
+
+/*
+ *  Output wrapper used for every packet this protocol sends.  When the
+ *  conversation has a non-zero randdrop percentage the packet is freed
+ *  instead of sent with that probability; otherwise it goes to ipoput4.
+ */
+static void
+doipoput(Conv *c, Fs *f, Block *bp, int x, int ttl, int tos)
+{
+	Rudpcb *cb = (Rudpcb*)c->ptcl;
+
+	if(cb->randdrop == 0 || nrand(100) >= cb->randdrop){
+		ipoput4(f, bp, x, ttl, tos, nil);
+		return;
+	}
+	freeblist(bp);
+}
+
+/*
+ *  sleep() condition used by rudpkick: non-zero once the number of
+ *  unacked packets for the Reliable passed in v is at most Maxunacked.
+ */
+int
+flow(void *v)
+{
+	Reliable *rel;
+
+	rel = v;
+	if(UNACKED(rel) > Maxunacked)
+		return 0;
+	return 1;
+}
+
+/*
+ *  Write-queue kick routine (installed by rudpcreate).  Takes one block
+ *  off c->wq, prepends the ip/udp/rudp headers, queues a copy for possible
+ *  retransmission and hands the packet to ip.  Afterwards the caller is
+ *  put to sleep while more than Maxunacked packets are outstanding to
+ *  this destination.
+ *
+ *  With headers == 7 the remote address and port are taken from the
+ *  UDP_USEAD7 bytes at the front of the user's data; otherwise the
+ *  conversation's own addresses are used.
+ */
+void
+rudpkick(void *x)
+{
+	Mach *m = machp();
+	Conv *c = x;
+	Udphdr *uh;
+	uint16_t rport;
+	uint8_t laddr[IPaddrlen], raddr[IPaddrlen];
+	Block *bp;
+	Rudpcb *ucb;
+	Rudphdr *rh;
+	Reliable *r;
+	int dlen, ptcllen;
+	Rudppriv *upriv;
+	Fs *f;
+
+	upriv = c->p->priv;
+	f = c->p->f;
+
+	netlog(c->p->f, Logrudp, "rudp: kick\n");
+	bp = qget(c->wq);
+	if(bp == nil)
+		return;
+
+	ucb = (Rudpcb*)c->ptcl;
+	switch(ucb->headers) {
+	case 7:
+		/* get user specified addresses */
+		bp = pullupblock(bp, UDP_USEAD7);
+		if(bp == nil)
+			return;
+		ipmove(raddr, bp->rp);
+		bp->rp += IPaddrlen;
+		ipmove(laddr, bp->rp);
+		bp->rp += IPaddrlen;
+		/* pick interface closest to dest */
+		if(ipforme(f, laddr) != Runi)
+			findlocalip(f, laddr, raddr);
+		bp->rp += IPaddrlen;		/* Ignore ifc address */
+		rport = nhgets(bp->rp);
+		bp->rp += 2+2;			/* Ignore local port */
+		break;
+	default:
+		ipmove(raddr, c->raddr);
+		ipmove(laddr, c->laddr);
+		rport = c->rport;
+		break;
+	}
+
+	dlen = blocklen(bp);
+
+	/* Make space to fit rudp & ip header */
+	bp = padblock(bp, UDP_IPHDR+UDP_RHDRSIZE);
+	if(bp == nil)
+		return;
+
+	uh = (Udphdr *)(bp->rp);
+	uh->vihl = IP_VER4;
+
+	rh = (Rudphdr*)uh;
+
+	/* udp + rudp headers plus data; the pseudo header is not counted */
+	ptcllen = dlen + (UDP_RHDRSIZE-UDP_PHDRSIZE);
+	uh->Unused = 0;
+	uh->udpproto = IP_UDPPROTO;
+	uh->frag[0] = 0;
+	uh->frag[1] = 0;
+	hnputs(uh->udpplen, ptcllen);
+	switch(ucb->headers){
+	case 7:
+		v6tov4(uh->udpdst, raddr);
+		hnputs(uh->udpdport, rport);
+		v6tov4(uh->udpsrc, laddr);
+		break;
+	default:
+		v6tov4(uh->udpdst, c->raddr);
+		hnputs(uh->udpdport, c->rport);
+		if(ipcmp(c->laddr, IPnoaddr) == 0)
+			findlocalip(f, c->laddr, c->raddr);
+		v6tov4(uh->udpsrc, c->laddr);
+		break;
+	}
+	hnputs(uh->udpsport, c->lport);
+	hnputs(uh->udplen, ptcllen);
+	uh->udpcksum[0] = 0;
+	uh->udpcksum[1] = 0;
+
+	qlock(ucb);
+	r = relstate(ucb, raddr, rport, "kick");	/* takes a reference on r */
+	r->sndseq = NEXTSEQ(r->sndseq);
+	hnputl(rh->relseq, r->sndseq);
+	hnputl(rh->relsgen, r->sndgen);
+
+	hnputl(rh->relack, r->rcvseq);  /* ACK last rcvd packet */
+	hnputl(rh->relagen, r->rcvgen);
+
+	/* the data packet carries the ack, so no separate one is needed */
+	if(r->rcvseq != r->acksent)
+		r->acksent = r->rcvseq;
+
+	hnputs(uh->udpcksum, ptclcsum(bp, UDP_IPHDR, dlen+UDP_RHDRSIZE));
+
+	relackq(r, bp);
+	qunlock(ucb);
+
+	upriv->ustats.rudpOutDatagrams++;
+
+	DPRINT("sent: %lud/%lud, %lud/%lud\n",
+		r->sndseq, r->sndgen, r->rcvseq, r->rcvgen);
+
+	doipoput(c, f, bp, 0, c->ttl, c->tos);
+
+	if(waserror()) {
+		/*
+		 *  release the lock before dropping our reference:
+		 *  relput may free r, and r->lock lives inside it.
+		 */
+		qunlock(&r->lock);
+		relput(r);
+		nexterror();
+	}
+
+	/* flow control of sorts */
+	qlock(&r->lock);
+	if(UNACKED(r) > Maxunacked){
+		r->blocked = 1;
+		sleep(&r->vous, flow, r);
+		r->blocked = 0;
+	}
+
+	qunlock(&r->lock);
+	relput(r);
+	poperror();
+}
+
+/*
+ *  Protocol input routine.  Verifies the checksum, looks up the
+ *  conversation, runs the reliability state machine (reliput) and, when
+ *  the packet carries new in-order data, strips the headers and passes
+ *  the data up on c->rq.  With headers == 7 the data is prefixed with the
+ *  remote, local and interface addresses and the two ports.
+ *
+ *  Every exit taken after ucb has been locked releases it again.
+ */
+void
+rudpiput(Proto *rudp, Ipifc *ifc, Block *bp)
+{
+	int len, olen, ottl;
+	Udphdr *uh;
+	Conv *c;
+	Rudpcb *ucb;
+	uint8_t raddr[IPaddrlen], laddr[IPaddrlen];
+	uint16_t rport, lport;
+	Rudppriv *upriv;
+	Fs *f;
+	uint8_t *p;
+
+	upriv = rudp->priv;
+	f = rudp->f;
+
+	upriv->ustats.rudpInDatagrams++;
+
+	uh = (Udphdr*)(bp->rp);
+
+	/* Put back pseudo header for checksum
+	 * (remember old values for icmpnoconv())
+	 */
+	ottl = uh->Unused;
+	uh->Unused = 0;
+	len = nhgets(uh->udplen);
+	olen = nhgets(uh->udpplen);
+	hnputs(uh->udpplen, len);
+
+	v4tov6(raddr, uh->udpsrc);
+	v4tov6(laddr, uh->udpdst);
+	lport = nhgets(uh->udpdport);
+	rport = nhgets(uh->udpsport);
+
+	/* a zero checksum field means the sender did not compute one */
+	if(nhgets(uh->udpcksum)) {
+		if(ptclcsum(bp, UDP_IPHDR, len+UDP_PHDRSIZE)) {
+			upriv->ustats.rudpInErrors++;
+			upriv->csumerr++;
+			netlog(f, Logrudp, "rudp: checksum error %I\n", raddr);
+			DPRINT("rudp: checksum error %I\n", raddr);
+			freeblist(bp);
+			return;
+		}
+	}
+
+	qlock(rudp);
+
+	c = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
+	if(c == nil){
+		/* no conversation found */
+		upriv->ustats.rudpNoPorts++;
+		qunlock(rudp);
+		netlog(f, Logrudp, "rudp: no conv %I!%d -> %I!%d\n", raddr, rport,
+			laddr, lport);
+		/* restore the ip header fields overwritten above */
+		uh->Unused = ottl;
+		hnputs(uh->udpplen, olen);
+		icmpnoconv(f, bp);
+		freeblist(bp);
+		return;
+	}
+	ucb = (Rudpcb*)c->ptcl;
+	qlock(ucb);
+	qunlock(rudp);
+
+	if(reliput(c, bp, raddr, rport) < 0){
+		qunlock(ucb);
+		freeblist(bp);	/* bp may be a chain, as on the other drop paths */
+		return;
+	}
+
+	/*
+	 * Trim the packet down to data size
+	 */
+
+	len -= (UDP_RHDRSIZE-UDP_PHDRSIZE);
+	bp = trimblock(bp, UDP_IPHDR+UDP_RHDRSIZE, len);
+	if(bp == nil) {
+		netlog(f, Logrudp, "rudp: len err %I.%d -> %I.%d\n",
+			raddr, rport, laddr, lport);
+		DPRINT("rudp: len err %I.%d -> %I.%d\n",
+			raddr, rport, laddr, lport);
+		upriv->lenerr++;
+		qunlock(ucb);	/* don't leave the conversation locked */
+		return;
+	}
+
+	netlog(f, Logrudpmsg, "rudp: %I.%d -> %I.%d l %d\n",
+		raddr, rport, laddr, lport, len);
+
+	switch(ucb->headers){
+	case 7:
+		/* pass the src address */
+		bp = padblock(bp, UDP_USEAD7);
+		p = bp->rp;
+		ipmove(p, raddr); p += IPaddrlen;
+		ipmove(p, laddr); p += IPaddrlen;
+		ipmove(p, ifc->lifc->local); p += IPaddrlen;
+		hnputs(p, rport); p += 2;
+		hnputs(p, lport);
+		break;
+	default:
+		/* connection oriented rudp */
+		if(ipcmp(c->raddr, IPnoaddr) == 0){
+			/* save the src address in the conversation */
+		 	ipmove(c->raddr, raddr);
+			c->rport = rport;
+
+			/* reply with the same ip address (if not broadcast) */
+			if(ipforme(f, laddr) == Runi)
+				ipmove(c->laddr, laddr);
+			else
+				v4tov6(c->laddr, ifc->lifc->local);
+		}
+		break;
+	}
+	if(bp->next)
+		bp = concatblock(bp);
+
+	if(qfull(c->rq)) {
+		netlog(f, Logrudp, "rudp: qfull %I.%d -> %I.%d\n", raddr, rport,
+			laddr, lport);
+		freeblist(bp);
+	}
+	else
+		qpass(c->rq, bp);
+
+	qunlock(ucb);
+}
+
+static char *rudpunknown = "unknown rudp ctl request";
+
+/*
+ *  Handle a ctl request that the generic code did not recognise.
+ *	headers			switch to the 7-style address header on reads/writes
+ *	hangup addr port	forget the state kept for that destination
+ *	randdrop [percent]	drop that share of outgoing packets (default 10)
+ *  Returns nil on success or a static error string.
+ */
+char*
+rudpctl(Conv *c, char **f, int n)
+{
+	uint8_t addr[IPaddrlen];
+	Rudpcb *cb;
+	int val;
+
+	cb = (Rudpcb*)c->ptcl;
+	if(n < 1)
+		return rudpunknown;
+
+	if(strcmp(f[0], "headers") == 0){
+		cb->headers = 7;		/* new headers format */
+		return nil;
+	}
+
+	if(strcmp(f[0], "hangup") == 0){
+		if(n < 3)
+			return "bad syntax";
+		if(parseip(addr, f[1]) == -1)
+			return Ebadip;
+		val = atoi(f[2]);
+		qlock(cb);
+		relforget(c, addr, val, 1);
+		qunlock(cb);
+		return nil;
+	}
+
+	if(strcmp(f[0], "randdrop") == 0){
+		val = (n > 1) ? atoi(f[1]) : 10;	/* default is 10% */
+		if(val < 0 || val > 100)
+			return "illegal rudp drop rate";
+		cb->randdrop = val;
+		return nil;
+	}
+
+	return rudpunknown;
+}
+
+/*
+ *  Deliver an advisory message about a packet we sent: the header in bp
+ *  is our own outgoing header, so its destination is the conversation's
+ *  remote side and its source the local side.  The first matching
+ *  conversation has both queues hung up with msg.  bp is always freed.
+ */
+void
+rudpadvise(Proto *rudp, Block *bp, char *msg)
+{
+	uint8_t src[IPaddrlen], dst[IPaddrlen];
+	uint16_t sport, dport;
+	Udphdr *hdr;
+	Conv **cp, *cv;
+
+	hdr = (Udphdr*)(bp->rp);
+
+	v4tov6(dst, hdr->udpdst);
+	v4tov6(src, hdr->udpsrc);
+	sport = nhgets(hdr->udpsport);
+	dport = nhgets(hdr->udpdport);
+
+	/* Look for a connection */
+	for(cp = rudp->conv; *cp; cp++) {
+		cv = *cp;
+		if(cv->rport != dport || cv->lport != sport)
+			continue;
+		if(ipcmp(cv->raddr, dst) != 0 || ipcmp(cv->laddr, src) != 0)
+			continue;
+		qhangup(cv->rq, msg);
+		qhangup(cv->wq, msg);
+		break;
+	}
+	freeblist(bp);
+}
+
+/*
+ *  Print the protocol counters on one line: InDatagrams, NoPorts,
+ *  InErrors, OutDatagrams, retransmissions and out-of-order packets.
+ *  Returns the snprint result.
+ */
+int
+rudpstats(Proto *rudp, char *buf, int len)
+{
+	Rudppriv *priv = rudp->priv;
+	Rudpstats *mib = &priv->ustats;
+
+	return snprint(buf, len, "%lud %lud %lud %lud %lud %lud\n",
+		mib->rudpInDatagrams,
+		mib->rudpNoPorts,
+		mib->rudpInErrors,
+		mib->rudpOutDatagrams,
+		priv->rxmits,
+		priv->orders);
+}
+
+/*
+ *  Allocate the rudp protocol descriptor and its private state, fill in
+ *  the method table and register the protocol with the stack instance fs.
+ */
+void
+rudpinit(Fs *fs)
+{
+	Proto *p;
+
+	p = smalloc(sizeof *p);
+	p->priv = smalloc(sizeof(Rudppriv));
+
+	/* identity and sizing */
+	p->name = "rudp";
+	p->ipproto = IP_UDPPROTO;
+	p->nc = 32;
+	p->ptclsize = sizeof(Rudpcb);
+
+	/* conversation operations */
+	p->connect = rudpconnect;
+	p->announce = rudpannounce;
+	p->ctl = rudpctl;
+	p->state = rudpstate;
+	p->create = rudpcreate;
+	p->close = rudpclose;
+
+	/* packet and statistics hooks */
+	p->rcv = rudpiput;
+	p->advise = rudpadvise;
+	p->stats = rudpstats;
+
+	Fsproto(fs, p);
+}
+
+/*********************************************/
+/* Here starts the reliable helper functions */
+/*********************************************/
+/*
+ *  Append a private copy of bp to r's list of unacked packets so it can
+ *  be retransmitted later.  When the list was empty the retransmit timer
+ *  and transmit count are restarted for the new head.
+ */
+void
+relackq(Reliable *r, Block *bp)
+{
+	Block *copy;
+
+	copy = copyblock(bp, blocklen(bp));
+	copy->list = nil;
+
+	if(r->unacked == nil){
+		/* restart timer */
+		r->timeout = 0;
+		r->xmits = 1;
+		r->unacked = copy;
+	} else
+		r->unackedtail->list = copy;
+
+	r->unackedtail = copy;
+}
+
+/*
+ *  Kernel process body, one per protocol instance; never returns.
+ *  Every Rudptickms it walks all conversations and, for each destination,
+ *  retransmits the oldest unacked packet once its timer has expired and
+ *  sends any ack that is still owed.
+ */
+void
+relackproc(void *a)
+{
+	Mach *m = machp();
+	Proto *rudp = a;
+	Conv **cp, *cv;
+	Rudpcb *cb;
+	Reliable *rel;
+
+	for(;;){
+		tsleep(&m->externup->sleep, return0, 0, Rudptickms);
+
+		for(cp = rudp->conv; *cp; cp++) {
+			cv = *cp;
+			cb = (Rudpcb*)cv->ptcl;
+			qlock(cb);
+
+			for(rel = cb->r; rel != nil; rel = rel->next) {
+				if(rel->unacked != nil){
+					rel->timeout += Rudptickms;
+					/* back off linearly with the number of transmissions */
+					if(rel->timeout > Rudprxms*rel->xmits)
+						relrexmit(cv, rel);
+				}
+				if(rel->acksent != rel->rcvseq)
+					relsendack(cv, rel, 0);
+			}
+			qunlock(cb);
+		}
+	}
+}
+
+/*
+ *  get the state record for a conversation
+ *
+ *  Looks up the record for addr/port on ucb's list and creates one with
+ *  a fresh send generation when none exists.  The caller receives its own
+ *  reference and must release it with relput().  from is only used in
+ *  the debug print.  Callers in this file hold ucb locked.
+ */
+Reliable*
+relstate(Rudpcb *ucb, uint8_t *addr, uint16_t port, char *from)
+{
+	Reliable *r, **l;
+
+	/* l trails r so a new record can be linked at the end of the list */
+	l = &ucb->r;
+	for(r = *l; r; r = *l){
+		if(memcmp(addr, r->addr, IPaddrlen) == 0 &&
+		    port == r->port)
+			break;
+		l = &r->next;
+	}
+
+	/* no state for this addr/port, create some */
+	if(r == nil){
+		/* seed the global generation counter on first use */
+		while(generation == 0)
+			generation = rand();
+
+		DPRINT("from %s new state %lud for %I!%ud\n",
+		        from, generation, addr, port);
+
+		r = smalloc(sizeof(Reliable));
+		memmove(r->addr, addr, IPaddrlen);
+		r->port = port;
+		r->unacked = 0;
+		/* Hangupgen is reserved for hangup messages */
+		if(generation == Hangupgen)
+			generation++;
+		r->sndgen = generation++;
+		r->sndseq = 0;
+		r->ackrcvd = 0;
+		r->rcvgen = 0;
+		r->rcvseq = 0;
+		r->acksent = 0;
+		r->xmits = 0;
+		r->timeout = 0;
+		r->ref = 0;
+		incref(r);	/* one reference for being in the list */
+
+		*l = r;
+	}
+
+	incref(r);	/* and one for the caller */
+	return r;
+}
+
+/*
+ *  Drop one reference to r and free it when that was the last one.
+ */
+void
+relput(Reliable *r)
+{
+	if(decref(r) != 0)
+		return;
+	free(r);
+}
+
+/*
+ *  forget a Reliable state
+ *
+ *  Unlinks the record for ip/port from the conversation's list, hangs it
+ *  up and drops the list's reference.  When originator is set a hangup
+ *  message is sent to the other side first.  Does nothing when no record
+ *  matches.  Callers in this file hold ucb locked.
+ */
+void
+relforget(Conv *c, uint8_t *ip, int port, int originator)
+{
+	Rudpcb *cb = (Rudpcb*)c->ptcl;
+	Reliable *rel, **link;
+
+	for(link = &cb->r; (rel = *link) != nil; link = &rel->next){
+		if(ipcmp(ip, rel->addr) != 0 || port != rel->port)
+			continue;
+
+		*link = rel->next;
+		if(originator)
+			relsendack(c, rel, 1);
+		relhangup(c, rel);
+		relput(rel);	/* remove from the list */
+		break;
+	}
+}
+
+/*
+ *  process a rcvd reliable packet. return -1 if not to be passed to user process,
+ *  0 otherwise.
+ *
+ *  called with ucb locked.
+ *
+ *  bp is only inspected here; freeing it stays with the caller.
+ */
+int
+reliput(Conv *c, Block *bp, uint8_t *addr, uint16_t port)
+{
+	Block *nbp;
+	Rudpcb *ucb;
+	Rudppriv *upriv;
+	Udphdr *uh;
+	Reliable *r;
+	Rudphdr *rh;
+	uint32_t seq, ack, sgen, agen, ackreal;
+	int rv = -1;
+
+	/* get fields */
+	uh = (Udphdr*)(bp->rp);
+	rh = (Rudphdr*)uh;
+	seq = nhgetl(rh->relseq);
+	sgen = nhgetl(rh->relsgen);
+	ack = nhgetl(rh->relack);
+	agen = nhgetl(rh->relagen);
+
+	upriv = c->p->priv;
+	ucb = (Rudpcb*)c->ptcl;
+	r = relstate(ucb, addr, port, "input");	/* our reference is dropped at out: */
+
+	DPRINT("rcvd %lud/%lud, %lud/%lud, r->sndgen = %lud\n",
+		seq, sgen, ack, agen, r->sndgen);
+
+	/* if acking an incorrect generation, ignore */
+	if(ack && agen != r->sndgen)
+		goto out;
+
+	/* Look for a hangup */
+	if(sgen == Hangupgen) {
+		if(agen == r->sndgen)
+			relforget(c, addr, port, 0);
+		goto out;
+	}
+
+	/* make sure we're not talking to a new remote side */
+	if(r->rcvgen != sgen){
+		/* a new generation must start at the beginning of its sequence */
+		if(seq != 0 && seq != 1)
+			goto out;
+
+		/* new connection */
+		if(r->rcvgen != 0){
+			DPRINT("new con r->rcvgen = %lud, sgen = %lud\n", r->rcvgen, sgen);
+			relhangup(c, r);
+		}
+		r->rcvgen = sgen;
+	}
+
+	/* dequeue acked packets */
+	if(ack && agen == r->sndgen){
+		ackreal = 0;
+		/* acks are cumulative: free queued copies up to and including ack */
+		while(r->unacked != nil && INSEQ(ack, r->ackrcvd, r->sndseq)){
+			nbp = r->unacked;
+			r->unacked = nbp->list;
+			DPRINT("%lud/%lud acked, r->sndgen = %lud\n",
+			       ack, agen, r->sndgen);
+			freeb(nbp);
+			r->ackrcvd = NEXTSEQ(r->ackrcvd);
+			ackreal = 1;
+		}
+
+		/* flow control */
+		if(UNACKED(r) < Maxunacked/8 && r->blocked)
+			wakeup(&r->vous);
+
+		/*
+		 *  retransmit next packet if the acked packet
+		 *  was transmitted more than once
+		 */
+		if(ackreal && r->unacked != nil){
+			r->timeout = 0;
+			if(r->xmits > 1){
+				r->xmits = 1;
+				relrexmit(c, r);
+			}
+		}
+
+	}
+
+	/* no message or input queue full */
+	if(seq == 0 || qfull(c->rq))
+		goto out;
+
+	/* refuse out of order delivery */
+	if(seq != NEXTSEQ(r->rcvseq)){
+		relsendack(c, r, 0);	/* tell him we got it already */
+		upriv->orders++;
+		DPRINT("out of sequence %lud not %lud\n", seq, NEXTSEQ(r->rcvseq));
+		goto out;
+	}
+	r->rcvseq = seq;
+
+	rv = 0;
+out:
+	relput(r);
+	return rv;
+}
+
+/*
+ *  Send a data-less packet to r's destination acknowledging the last
+ *  packet received from it.  With hangup set the packet carries the
+ *  reserved generation Hangupgen instead of our send generation, which
+ *  tells the other side to forget its state.
+ *
+ *  acksent is brought up to date with != rather than <, as rudpkick
+ *  does: after rcvseq wraps around it is numerically smaller than
+ *  acksent, and a < test would leave an ack outstanding forever.
+ */
+void
+relsendack(Conv *c, Reliable *r, int hangup)
+{
+	Udphdr *uh;
+	Block *bp;
+	Rudphdr *rh;
+	int ptcllen;
+	Fs *f;
+
+	bp = allocb(UDP_IPHDR + UDP_RHDRSIZE);
+	if(bp == nil)
+		return;
+	bp->wp += UDP_IPHDR + UDP_RHDRSIZE;
+	f = c->p->f;
+	uh = (Udphdr *)(bp->rp);
+	uh->vihl = IP_VER4;
+	rh = (Rudphdr*)uh;
+
+	/* udp + rudp headers only, no data */
+	ptcllen = (UDP_RHDRSIZE-UDP_PHDRSIZE);
+	uh->Unused = 0;
+	uh->udpproto = IP_UDPPROTO;
+	uh->frag[0] = 0;
+	uh->frag[1] = 0;
+	hnputs(uh->udpplen, ptcllen);
+
+	v6tov4(uh->udpdst, r->addr);
+	hnputs(uh->udpdport, r->port);
+	hnputs(uh->udpsport, c->lport);
+	if(ipcmp(c->laddr, IPnoaddr) == 0)
+		findlocalip(f, c->laddr, c->raddr);
+	v6tov4(uh->udpsrc, c->laddr);
+	hnputs(uh->udplen, ptcllen);
+
+	if(hangup)
+		hnputl(rh->relsgen, Hangupgen);
+	else
+		hnputl(rh->relsgen, r->sndgen);
+	hnputl(rh->relseq, 0);		/* 0: this packet carries no message */
+	hnputl(rh->relagen, r->rcvgen);
+	hnputl(rh->relack, r->rcvseq);
+
+	if(r->acksent != r->rcvseq)
+		r->acksent = r->rcvseq;
+
+	uh->udpcksum[0] = 0;
+	uh->udpcksum[1] = 0;
+	hnputs(uh->udpcksum, ptclcsum(bp, UDP_IPHDR, UDP_RHDRSIZE));
+
+	DPRINT("sendack: %lud/%lud, %lud/%lud\n", 0L, r->sndgen, r->rcvseq, r->rcvgen);
+	doipoput(c, f, bp, 0, c->ttl, c->tos);
+}
+
+
+/*
+ *  called with ucb locked (and c locked if user initiated close)
+ *
+ *  Report "hangup addr!port" on the conversation's event queue, throw
+ *  away everything still waiting for an ack, reset the record to a fresh
+ *  send generation and wake a writer blocked in flow control.
+ */
+void
+relhangup(Conv *c, Reliable *r)
+{
+	char msg[ERRMAX];
+	Block *b;
+	int len;
+
+	len = snprint(msg, sizeof(msg), "hangup %I!%d", r->addr, r->port);
+	qproduce(c->eq, msg, len);
+
+	/*
+	 *  dump any unacked outgoing messages
+	 */
+	while((b = r->unacked) != nil){
+		r->unacked = b->list;
+		b->list = nil;
+		freeb(b);
+	}
+
+	/* receive side starts over */
+	r->rcvgen = 0;
+	r->rcvseq = 0;
+	r->acksent = 0;
+
+	/* send side gets a new generation; Hangupgen is reserved */
+	if(generation == Hangupgen)
+		generation++;
+	r->sndgen = generation++;
+	r->sndseq = 0;
+	r->ackrcvd = 0;
+	r->xmits = 0;
+	r->timeout = 0;
+
+	wakeup(&r->vous);
+}
+
+/*
+ *  called with ucb locked
+ *
+ *  Retransmit the oldest unacked packet and restart its timer.  Once the
+ *  transmit count has passed Rudpmaxxmit the destination is hung up
+ *  instead.
+ */
+void
+relrexmit(Conv *c, Reliable *r)
+{
+	Rudppriv *priv;
+	Block *copy;
+	int sofar;
+
+	r->timeout = 0;
+
+	sofar = r->xmits;
+	r->xmits = sofar + 1;
+	if(sofar > Rudpmaxxmit){
+		relhangup(c, r);
+		return;
+	}
+
+	priv = c->p->priv;
+	priv->rxmits++;
+
+	/* send a copy; the original stays queued until it is acked */
+	copy = copyblock(r->unacked, blocklen(r->unacked));
+	DPRINT("rxmit r->ackrvcd+1 = %lud\n", r->ackrcvd+1);
+	doipoput(c, c->p->f, copy, 0, c->ttl, c->tos);
+}

File diff suppressed because it is too large
+ 288 - 195
sys/src/9/ip/tcp.c


+ 9 - 41
sys/src/9/ip/udp.c

@@ -33,7 +33,6 @@ enum
 
 	IP_UDPPROTO	= 17,
 	UDP_USEAD7	= 52,
-	UDP_USEAD6	= 36,
 
 	Udprxms		= 200,
 	Udptickms	= 100,
@@ -82,10 +81,10 @@ struct Udp6hdr {
 typedef struct Udpstats Udpstats;
 struct Udpstats
 {
-	uint32_t	udpInDatagrams;
+	uint64_t	udpInDatagrams;
 	uint32_t	udpNoPorts;
 	uint32_t	udpInErrors;
-	uint32_t	udpOutDatagrams;
+	uint64_t	udpOutDatagrams;
 };
 
 typedef struct Udppriv Udppriv;
@@ -111,7 +110,7 @@ typedef struct Udpcb Udpcb;
 struct Udpcb
 {
 	QLock;
-	unsigned char	headers;
+	uint8_t	headers;
 };
 
 static char*
@@ -204,7 +203,7 @@ udpkick(void *x, Block *bp)
 	upriv = c->p->priv;
 	f = c->p->f;
 
-	netlog(c->p->f, Logudp, "udp: kick\n");
+//	netlog(c->p->f, Logudp, "udp: kick\n");	/* frequent and uninteresting */
 	if(bp == nil)
 		return;
 
@@ -226,21 +225,6 @@ udpkick(void *x, Block *bp)
 		rport = nhgets(bp->rp);
 		bp->rp += 2+2;			/* Ignore local port */
 		break;
-	case 6:					/* OBS */
-		/* get user specified addresses */
-		bp = pullupblock(bp, UDP_USEAD6);
-		if(bp == nil)
-			return;
-		ipmove(raddr, bp->rp);
-		bp->rp += IPaddrlen;
-		ipmove(laddr, bp->rp);
-		bp->rp += IPaddrlen;
-		/* pick interface closest to dest */
-		if(ipforme(f, laddr) != Runi)
-			findlocalip(f, laddr, raddr);
-		rport = nhgets(bp->rp);
-		bp->rp += 2+2;			/* Ignore local port */
-		break;
 	default:
 		rport = 0;
 		break;
@@ -430,7 +414,7 @@ udpiput(Proto *udp, Ipifc *ifc, Block *bp)
 
 	c = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
 	if(c == nil){
-		/* no converstation found */
+		/* no conversation found */
 		upriv->ustats.udpNoPorts++;
 		qunlock(udp);
 		netlog(f, Logudp, "udp: no conv %I!%d -> %I!%d\n", raddr, rport,
@@ -441,7 +425,7 @@ udpiput(Proto *udp, Ipifc *ifc, Block *bp)
 			icmpnoconv(f, bp);
 			break;
 		case V6:
-			icmphostunr(f, ifc, bp, icmp6_port_unreach, 0);
+			icmphostunr(f, ifc, bp, Icmp6_port_unreach, 0);
 			break;
 		default:
 			panic("udpiput2: version %d", version);
@@ -518,15 +502,6 @@ udpiput(Proto *udp, Ipifc *ifc, Block *bp)
 		hnputs(p, rport); p += 2;
 		hnputs(p, lport);
 		break;
-	case 6:					/* OBS */
-		/* pass the src address */
-		bp = padblock(bp, UDP_USEAD6);
-		p = bp->rp;
-		ipmove(p, raddr); p += IPaddrlen;
-		ipmove(p, ipforme(f, laddr)==Runi ? laddr : ifc->lifc->local); p += IPaddrlen;
-		hnputs(p, rport); p += 2;
-		hnputs(p, lport);
-		break;
 	}
 
 	if(bp->next)
@@ -548,19 +523,11 @@ udpiput(Proto *udp, Ipifc *ifc, Block *bp)
 char*
 udpctl(Conv *c, char **f, int n)
 {
-	Mach *m = machp();
 	Udpcb *ucb;
 
 	ucb = (Udpcb*)c->ptcl;
 	if(n == 1){
-		if(strcmp(f[0], "oldheaders") == 0){	/* OBS */
-			ucb->headers = 6;
-			if (m->externup)
-				print("program %s wrote `oldheaders' to udp "
-					"ctl file; fix or recompile it\n",
-					m->externup->text);
-			return nil;
-		} else if(strcmp(f[0], "headers") == 0){
+		if(strcmp(f[0], "headers") == 0){
 			ucb->headers = 7;	/* new headers format */
 			return nil;
 		}
@@ -629,7 +596,8 @@ udpstats(Proto *udp, char *buf, int len)
 	Udppriv *upriv;
 
 	upriv = udp->priv;
-	return snprint(buf, len, "InDatagrams: %lud\nNoPorts: %lud\nInErrors: %lud\nOutDatagrams: %lud\n",
+	return snprint(buf, len, "InDatagrams: %llud\nNoPorts: %lud\n"
+		"InErrors: %lud\nOutDatagrams: %llud\n",
 		upriv->ustats.udpInDatagrams,
 		upriv->ustats.udpNoPorts,
 		upriv->ustats.udpInErrors,

+ 3 - 19
sys/src/9/k10/etherif.h

@@ -7,13 +7,7 @@
  * in the LICENSE file.
  */
 
-enum
-{
-	Eaddrlen	= 6,
-	ETHERMINTU	= 60,		/* minimum transmit size */
-	ETHERMAXTU	= 1514,		/* maximum transmit size */
-	ETHERHDRSIZE	= 14,		/* size of an ethernet header */
-
+enum {
 	MaxEther	= 48,
 	Ntypes		= 8,
 };
@@ -24,7 +18,7 @@ struct Ether {
 
 	int	ctlrno;
 	int	tbdf;			/* type+busno+devno+funcno */
-	unsigned char	ea[Eaddrlen];
+	uint8_t	ea[Eaddrlen];
 
 	void	(*attach)(Ether*);	/* filled in by reset routine */
 	void	(*detach)(Ether*);
@@ -36,21 +30,11 @@ struct Ether {
 	void	(*shutdown)(Ether*);	/* shutdown hardware before reboot */
 	void	*ctlr;
 
-	int	scan[Ntypes];		/* base station scanning interval */
-	int	nscan;			/* number of base station scanners */
+	Queue*	oq;
 
 	Netif;
 };
 
-typedef struct Etherpkt Etherpkt;
-struct Etherpkt
-{
-	unsigned char	d[Eaddrlen];
-	unsigned char	s[Eaddrlen];
-	unsigned char	type[2];
-	unsigned char	data[1500];
-};
-
 extern Block* etheriq(Ether*, Block*, int);
 extern void addethercard(char*, int(*)(Ether*));
 extern uint32_t ethercrc(unsigned char*, int);

+ 30 - 7
sys/src/9/port/netif.h

@@ -7,6 +7,7 @@
  * in the LICENSE file.
  */
 
+typedef struct Etherpkt	Etherpkt;
 typedef struct Netaddr	Netaddr;
 typedef struct Netfile	Netfile;
 typedef struct Netif	Netif;
@@ -51,7 +52,7 @@ struct Netfile
 	int	scan;			/* base station scanning interval */
 	int	bridge;			/* bridge mode */
 	int	headersonly;		/* headers only - no data */
-	unsigned char	maddr[8];		/* bitmask of multicast addresses requested */
+	uint8_t	maddr[8];		/* bitmask of multicast addresses requested */
 	int	nmaddr;			/* number of multicast addresses */
 
 	Queue*	iq;			/* input */
@@ -64,7 +65,7 @@ struct Netaddr
 {
 	Netaddr	*next;			/* allocation chain */
 	Netaddr	*hnext;
-	unsigned char	addr[Nmaxaddr];
+	uint8_t	addr[Nmaxaddr];
 	int	ref;
 };
 
@@ -88,8 +89,8 @@ struct Netif
 	int	minmtu;
 	int 	maxmtu;
 	int	mtu;
-	unsigned char	addr[Nmaxaddr];
-	unsigned char	bcast[Nmaxaddr];
+	uint8_t	addr[Nmaxaddr];
+	uint8_t	bcast[Nmaxaddr];
 	Netaddr	*maddr;			/* known multicast addresses */
 	int	nmaddr;			/* number of known multicast addresses */
 	Netaddr *mhash[Nmhash];		/* hash table of multicast addresses */
@@ -97,8 +98,6 @@ struct Netif
 	int	_scan;			/* number of base station scanners */
 	int	all;			/* number of -1 multiplexors */
 
-	Queue*	oq;			/* output */
-
 	/* statistics */
 	uint64_t	misses;
 	uint64_t	inpackets;
@@ -113,7 +112,7 @@ struct Netif
 	/* routines for touching the hardware */
 	void	*arg;
 	void	(*promiscuous)(void*, int);
-	void	(*multicast)(void*, unsigned char*, int);
+	void	(*multicast)(void*, uint8_t*, int);
 	int	(*hwmtu)(void*, int);	/* get/set mtu */
 	void	(*scanbs)(void*, uint);	/* scan for base stations */
 };
@@ -128,3 +127,27 @@ int32_t	netifwrite(Netif*, Chan*, void*, int32_t);
 int32_t	netifwstat(Netif*, Chan*, unsigned char*, int32_t);
 int32_t	netifstat(Netif*, Chan*, unsigned char*, int32_t);
 int	activemulti(Netif*, unsigned char*, int);
+
+/*
+ *  Ethernet specific
+ */
+enum
+{
+	Eaddrlen=	6,
+	ETHERMINTU =	60,		/* minimum transmit size */
+	ETHERMAXTU =	1514,		/* maximum transmit size */
+	ETHERHDRSIZE =	14,		/* size of an ethernet header */
+
+	/* ethernet packet types */
+	ETARP		= 0x0806,
+	ETIP4		= 0x0800,
+	ETIP6		= 0x86DD,
+};
+
+struct Etherpkt
+{
+	uint8_t	d[Eaddrlen];
+	uint8_t	s[Eaddrlen];
+	uint8_t	type[2];
+	uint8_t	data[1500];
+};

+ 2 - 0
sys/src/9/port/portdat.h

@@ -195,6 +195,7 @@ enum
 
 struct Block
 {
+	int32_t	ref;
 	Block*	next;
 	Block*	list;
 	unsigned char*	rp;			/* first unconsumed byte */
@@ -204,6 +205,7 @@ struct Block
 	void	(*free)(Block*);
 	uint16_t	flag;
 	uint16_t	checksum;		/* IP checksum of complete packet (minus media header) */
+	uint32_t	magic;
 };
 #define BLEN(s)	((s)->wp - (s)->rp)
 #define BALLOC(s) ((s)->lim - (s)->base)

Some files were not shown because too many files changed in this diff