Browse Source

Make #9 a bastardized device which responds only to read and write for
spec not empty.

Signed-off-by: golubovsky <golubovsky@gmail.com>

golubovsky 7 years ago
parent
commit
dcdfbe0b85
2 changed files with 285 additions and 103 deletions
  1. 284 102
      sys/src/9/port/dev9p.c
  2. 1 1
      sys/src/9/port/virtio_lib.c

+ 284 - 102
sys/src/9/port/dev9p.c

@@ -41,11 +41,9 @@ char* strdup(char *s);
 
 #define BUFSZ (8192 + IOHDRSZ)
 
-// Shared with devmnt
+// Raise this error when any of the non-implemented functions is called.
 
-extern char Enoversion[];
-extern int alloctag(void);
-extern void freetag(int t);
+#define Edonotcall(f) "Function " #f " should not be called for this device"
 
 extern Dev v9pdevtab;
 
@@ -55,13 +53,85 @@ static uint32_t nv9p;
 
 static Vqctl **v9ps;
 
+// Flag of one-time initialization
+
+static int initdone;
+
+// A structure to hold a pair of buffer references. Virtio-9p requires buffers to be submitted
+// in pairs, request and response at once. We expect that devmnt issues a write request first, and then
+// a read request. The write request returns immediately, but its buffer is only held until the read request
+// arrives. Then both buffers are sent to the virtqueue, and the result is returned to the caller.
+
+struct holdbuf
+{
+	void *rdbuf;				// read buffer (response)
+	int32_t rdlen;				// read length (allocated)
+	int32_t rfree;				// if true then read buffer was malloc'd and needs to be freed
+	void *wrbuf;				// write buffer (request)
+	int32_t wrlen;				// write length (supplied)
+	int32_t wfree;				// if true then write buffer was malloc'd and needs to be freed
+	Proc *proc;					// holding process
+	int descr;					// virtqueue descriptor index for the request
+};
+
+// PID cache structure. We need a fast way to find the buffer currently held by a calling process.
+// We use the low 4 bits of the PID as a hash, so up to 16 processes can be cached at once.
+// If the cache lookup fails, a linear search is used. The cache is direct-mapped rather than LRU:
+// a newly entering process simply overwrites whatever entry occupies its slot.
+
+#define PIDCSIZE 16
+#define PIDCMASK 0x0F
+
+struct pidcache
+{
+	int pid;					// actual PID
+	struct holdbuf *hb;			// hold buffer structure pointer
+};
+
+// Per-mount (virtqueue) structure. It holds an array of hold buffer structures of the same
+// length as the number of descriptors in the queue.
+
 static struct v9pmnt
 {
-	char *tag;
-	char *version;
-	uint32_t msize;
+	char *tag;					// mount tag (from host)
+	char *version;				// 9p version (from host)
+	uint32_t msize;				// message size (need this to pad short buffers otherwise QEMU errors out)
+	Virtq *vq;					// associated virtqueue
+	struct holdbuf *hbufs;		// hold buffers
+	struct pidcache pidch[PIDCSIZE];	// PID cache
+	Lock pclock;				// PID cache lock
+	int pcuse;					// cache usage counter (entering processes)
+	int pchit;					// cache hits counter
+	int pcmiss;					// cache misses counter
 } *mounts;
 
+// Find a hold buffer structure by PID. Nil is returned if no process found.
+// First lookup in the cache, then linearly over the whole array.
+
+static struct holdbuf *
+hbbypid(int tidx, int pid)
+{
+	struct holdbuf *ret = nil;
+	lock(&mounts[tidx].pclock);
+	if(mounts[tidx].pidch[pid & PIDCMASK].pid == pid) {
+		ret = mounts[tidx].pidch[pid & PIDCMASK].hb;
+		if(ret->proc->pid == pid) {	// != is unlikely mb corruption but we have to get around
+			mounts[tidx].pchit++;
+			unlock(&mounts[tidx].pclock);
+			return ret;
+		}
+	}
+	unlock(&mounts[tidx].pclock);
+	mounts[tidx].pcmiss++;
+	for(int i = 0; i < PIDCSIZE; i++) {
+		if(mounts[tidx].hbufs[i].proc->pid == pid) {
+			ret = &mounts[tidx].hbufs[i];
+			return ret;
+		}
+	}
+	return nil;
+}
+
 // Find a mount tag index, return -1 if none found.
 
 static int 
@@ -72,9 +142,65 @@ findtag(char *tag)
 		if(mounts[i].tag && (!strcmp(mounts[i].tag, tag)))
 			return i;
 	}
+print("tag not found: %s\n", tag);
 	return -1;
 }
 
+// Emulate bindmount for each mount tag. We attach to devmnt directly, without
+// the user calling mount. The logic of devmnt properly sequences write and read
+// requests, but this cannot be expected from any other client. So 9p virtqueues
+// are not generally accessible, they operate under the hood.
+
+static void
+domountvq(int tidx)
+{
+	Proc *up = externup();
+	struct {
+		Chan    *chan;
+		Chan    *authchan;
+		char    *spec;
+		int     flags;
+	} bogus;
+	int dc = 'M';
+	Dev *dev = devtabget(dc, 0);
+	if(dev == nil)
+		error("no #M device found");
+	bogus.spec = mounts[tidx].tag;
+	bogus.flags = MCACHE;
+	bogus.authchan = nil;
+	bogus.chan = newchan();
+	bogus.chan->dev = &v9pdevtab;
+	bogus.chan->path = newpath(bogus.spec);
+	Chan *c0 = dev->attach((char *)&bogus);
+print("1\n");
+	if(waserror()) {
+		cclose(c0);
+		nexterror();
+	}
+print("2\n");
+	Chan *c1 = namec(strdup("#9"), Amount, 0, 0);
+print("3\n");
+	if(waserror()) {
+		cclose(c1);
+		nexterror();
+	}
+print("4\n");
+	cmount(&c0, c1, MBEFORE, bogus.spec); 
+print("5\n");
+}
+
+static void 
+mntvq(void)
+{
+	if(initdone)
+		return;
+	initdone = 1;
+	for(int i = 0; i < nv9p; i++) {
+		print("auto mount %d\n", i);
+		domountvq(i);
+	}
+}
+
 static void
 v9pinit(void)
 {
@@ -108,8 +234,11 @@ v9pinit(void)
 		mounts[i].tag = mallocz(vcfg.tag_len + 1, 1);
 		readvdevcfg(v9ps[i], mounts[i].tag, vcfg.tag_len, rc);
 		print("tag %s\n", mounts[i].tag);
+		mounts[i].vq = v9ps[i]->vqs[0];
+		mounts[i].hbufs = mallocz(mounts[i].vq->vr.num, 1);
 		finalinitvdev(v9ps[i]);
 	}
+	initdone = 0;
 }
 
 // General virtio request. It takes 2 buffers, one for input and other for output.
@@ -118,15 +247,14 @@ v9pinit(void)
 // indirect mode because this is the only way that work properly with QEMU 9p.
 
 static int32_t
-do_request(int tidx, void *inbuf, int32_t inlen, void *outbuf, int32_t outlen)
+do_request(int gdescr, int tidx, void *inbuf, int32_t inlen, void *outbuf, int32_t outlen)
 {
 	uint16_t descr[1];
 	Virtq *vq = v9ps[tidx]->vqs[0];
-	int rc = getdescr(vq, 1, descr);
-	if(rc < 1) {
-		error("Insufficient number of descriptors in virtqueue");
-		return -1;
+	if(vq == nil) {
+		error("No virtqueue (nil address)");
 	}
+	descr[0] = gdescr;
 	struct vring_desc req[2] = {
 		{
 			.addr = PADDR(outbuf),
@@ -150,113 +278,167 @@ do_request(int tidx, void *inbuf, int32_t inlen, void *outbuf, int32_t outlen)
 	return 0;
 }
 
-typedef int64_t(*post_t)(Fcall *, int, void *);
-
-// Common code to send/receive a fcall. It takes a post-processing function
-// to evaluate the return from fcall. This code takes care of buffer allocation,
-// message conversion, checks for Rerror result and errors out automatically,
-// checks for the result code matching the request code (request + 1), and errors out if not matching,
-// then passes control to the post processing function, finally releases all the buffers
-// it allocated.
+// We expect only 9p messages to be written, and only for a non-empty chan path (mount tag).
+// Some messages need massaging (like Tversion because QEMU does not support vanilla 9P2000
+// and we have to cheat here about the protocol version). In such cases some additional logic
+// applies based on the extracted message type.
 
-static int64_t
-do_fcall(Fcall *pf, int tidx, int32_t msize, void *data, post_t post)
+static int32_t
+v9pwrite(Chan *c, void *va, int32_t n, int64_t offset)
 {
-	uint8_t *msg, *rsp;
-	usize k;
-	uint8_t rtype = pf->type;
-	msg = mallocz(msize, 1);
-	rsp = mallocz(msize, 1);
-	if(msg == nil || rsp == nil)
-		exhausted("do_fcall buffer memory");
-	k = convS2M(pf, msg, msize);
-	if(k == 0) {
-		free(msg);
-		free(rsp);
-		error("do_fcall bad conversion on send");
-	}
-	int rc = do_request(tidx, rsp, msize, msg, msize);
-	free(msg);
-	if(rc < 0) {
-		free(rsp);
-		error("do_fcall virtio request error");
+print("write %s %d\n", chanpath(c), n);
+	Proc *up = externup();
+	int tidx = findtag(chanpath(c));
+	if(tidx < 0 || tidx >= nv9p)
+		error(Enonexist);
+	uint8_t *msg = va;
+	int mtype = GBIT8(msg + 4);
+	void *nva;
+	int lnva;
+	int alloc;
+print("write type %d\n", mtype);
+	switch(mtype)
+	{
+	case Tversion:
+			alloc = 1;
+			Fcall f = {
+				.type = mtype,
+				.tag = GBIT16(msg + 5),
+				.msize = GBIT32(msg + 7),
+				.version = VERSION9PU
+			};
+			lnva = IOHDRSZ + strlen(f.version) + 20;
+			nva = mallocz(lnva, 1);
+			convS2M(&f, nva, lnva);
+		break;
+	default:
+		if(n >= mounts[tidx].msize) {
+			nva = va;
+			lnva = n;
+			alloc = 0;
+		} else {
+			lnva = mounts[tidx].msize;
+			nva = mallocz(lnva, 1);
+			alloc = 1;
+			memmove(nva, va, n);
+		}
 	}
-	k = convM2S(rsp, msize, pf);
-	if(pf->type != rtype + 1) {
-		free(rsp);
-		error((pf->type == Rerror)?pf->ename:"do_fcall inconsistent return type");
+	uint16_t descr[1];
+	struct v9pmnt *pm = mounts + tidx;
+	int rc = getdescr(pm->vq, 1, descr);
+	if(rc < 1) {
+		if(alloc)
+			free(nva);
+		error("not enough virtqueue descriptors");
 	}
-	int64_t rc2 = (*post)(pf, tidx, data);
-	free(rsp);
-	return rc2;
+	lock(&pm->pclock);
+	pm->hbufs[descr[0]].descr = descr[0];
+	pm->hbufs[descr[0]].proc = up;
+	pm->hbufs[descr[0]].wfree = alloc;
+	pm->hbufs[descr[0]].wrbuf = nva;
+	pm->hbufs[descr[0]].wrlen = lnva;
+	pm->hbufs[descr[0]].rdbuf = nil;
+	pm->hbufs[descr[0]].rdlen = 0;
+	pm->hbufs[descr[0]].rfree = 0;
+	pm->pidch[up->pid & PIDCMASK].hb = &pm->hbufs[descr[0]];
+	pm->pidch[up->pid & PIDCMASK].pid = up->pid;
+	unlock(&pm->pclock);
+	return n;
 }
 
-// Send a version message over the given virtqueue.
+// We expect only 9p messages to be received, and only for a non-empty chan path (mount tag).
+// Some messages need massaging (like Rversion because QEMU does not support vanilla 9P2000
+// and we have to cheat here about the protocol version). In such cases some additional logic
+// applies based on the extracted message type. The function checks for a held write buffer;
+// the absence of one is an error. The length returned is the message length taken from the
+// first 4 bytes of the (possibly rewritten) response.
 
-static int64_t
-post_version(Fcall *pf, int tidx, void *data)
+static int32_t
+v9pread(Chan *c, void *va, int32_t n, int64_t offset)
 {
-	if(pf->msize > MAXRPC) {
-		error("server tries to increase msize in fversion");
-		return -1;
-	}
-	if(pf->msize<256 || pf->msize>1024*1024) {
-		error("nonsense value of msize in fversion");
-		return -1;
+print("read %s %d\n", chanpath(c), n);
+	Proc *up = externup();
+	int tidx = findtag(chanpath(c));
+	if(tidx < 0 || tidx >= nv9p)
+		error(Enonexist);
+	struct holdbuf *hb = hbbypid(tidx, up->pid);
+	if(hb == nil)
+		error("read request without previously held write request");
+	hb->rdbuf = va;
+	hb->rdlen = n;
+	do_request(hb->descr, tidx, hb->rdbuf, hb->rdlen, hb->wrbuf, hb->wrlen);
+	if(hb->wfree)
+		free(hb->wrbuf);
+	uint8_t *msg = va;
+	int mtype = GBIT8(msg + 4);
+print("read type %d\n", mtype);
+	uint32_t mlen = GBIT32(msg);
+	Fcall f;
+	switch(mtype)
+	{
+	case Rerror:
+		convM2S(msg, n, &f);
+		error(f.ename);
+		break;
+	case Rversion:
+		convM2S(msg, n, &f);
+		mounts[tidx].version = strdup(f.version);
+		mounts[tidx].msize = f.msize;
+		f.version = VERSION9P;
+		convS2M(&f, va, n);
+		mlen = GBIT32(msg);
+		break;
+	default:
+		;
 	}
-	mounts[tidx].msize = pf->msize;
-	mounts[tidx].version = strdup(pf->version);
-	return 0;
+	return mlen;
 }
 
 static int
 v9pversion(int tidx)
 {
-	Fcall f = {
-		.type = Tversion,
-		.tag = NOTAG,
-		.msize = MAXRPC,
-		.version = VERSION9PU
-	};
-	return do_fcall(&f, tidx, BUFSZ, nil, post_version);
+	error(Edonotcall(version));
+	return 0;
 }
 
-static int64_t
-post_attach(Fcall *pf, int tidx, Chan *ch)
+// First attach of #9 will force mounting of all defined tags with devmnt. It cannot be done in devinit
+// because error() called in a kernel process without user context causes double fault and kernel crash.
+// So, to have the host shares actually mounted one has to issue something like ls '#9'.
+
+static Chan*
+v9pattach(char *spec)
 {
-	ch->qid = pf->qid;
-	return 0;
+	mntvq();
+	error(Edonotcall(attach));
+	return nil;
 }
 
 static Chan*
-v9pattach(char *spec)
+v9popen(Chan *c, int omode)
 {
-print("v9pattach %s\n", spec);
-	int tidx = findtag(spec);
-	if(tidx < 0) {
-		error(Enonexist);
-		return nil;
-	}
-	if(!mounts[tidx].version) {
-		int rc = v9pversion(tidx);
-		if(rc < 0) {
-			error(Enoversion);
-			return nil;
-		}
-	}
-	Chan *ch = devattach(v9pdevtab.dc, mounts[tidx].tag);
-	Fcall r = {
-		.type = Tattach,
-		.tag = alloctag(),
-		.fid = ch->fid,
-		.afid = NOFID,
-		.uname = "",
-		.aname = ""
-	};
-	do_fcall(&r, tidx, mounts[tidx].msize, (void *)ch, (post_t)post_attach);
-	return ch;
+	error(Edonotcall(open));
+	return nil;
+}
+
+static Walkqid*
+v9pwalk(Chan* c, Chan *nc, char** name, int nname)
+{
+	error(Edonotcall(walk));
+	return nil;
 }
 
+static int32_t
+v9pstat(Chan* c, uint8_t* dp, int32_t n)
+{
+	error(Edonotcall(stat));
+	return 0;
+}
+
+static void
+v9pclose(Chan* c)
+{
+	error(Edonotcall(close));
+}
 
 Dev v9pdevtab = {
 	.dc = '9',
@@ -266,14 +448,14 @@ Dev v9pdevtab = {
 	.init = v9pinit,
 	.shutdown = devshutdown,
 	.attach = v9pattach,
-//	.walk = v9pwalk,
-//	.stat = v9pstat,
-//	.open = v9popen,
+	.walk = v9pwalk,
+	.stat = v9pstat,
+	.open = v9popen,
 	.create = devcreate,
-//	.close = v9pclose,
-//	.read = v9pread,
+	.close = v9pclose,
+	.read = v9pread,
 	.bread = devbread,
-//	.write = v9pwrite,
+	.write = v9pwrite,
 	.bwrite = devbwrite,
 	.remove = devremove,
 	.wstat = devwstat,

+ 1 - 1
sys/src/9/port/virtio_lib.c

@@ -196,7 +196,7 @@ queuedescr(Virtq *q, int n, uint16_t *descr)
 {
 	Proc *up = externup();
 	int head = descr[0];
-	uint16_t mask = q->vr.num - 1;				// q->num is power of 2 so mask has all bits set
+	uint16_t mask = q->vr.num - 1;			// q->num is power of 2 so mask has all bits set
 	Rock rock;								// the sleep-wakeup semaphore on the process stack
 	rock.done = 0;
 	rock.sleep = &up->sleep;