dev9p.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650
  1. /*
  2. * This file is part of the Harvey operating system. It is subject to the
  3. * license terms of the GNU GPL v2 in LICENSE.gpl found in the top-level
  4. * directory of this distribution and at http://www.gnu.org/licenses/gpl-2.0.txt
  5. *
  6. * No part of Harvey operating system, including this file, may be copied,
  7. * modified, propagated, or distributed except according to the terms
  8. * contained in the LICENSE.gpl file.
  9. */
  10. // dev9p.c ('#9'): a virtio9p protocol translation driver to use QEMU's built-in 9p.
  11. #include "u.h"
  12. #include "../port/lib.h"
  13. #include "mem.h"
  14. #include "dat.h"
  15. #include "fns.h"
  16. #include "io.h"
  17. #include "../port/error.h"
  18. #include "virtio_ring.h"
  19. #include "virtio_config.h"
  20. #include "virtio_9p.h"
  21. #include "virtio_pci.h"
  22. #include "virtio_lib.h"
  23. char *strdup(char *s);
  24. // Functions from devmnt.c
  25. i32 mntrdwr(int, Chan *, void *, i32, i64);
  26. void mntdirfix(u8 *dirbuf, Chan *c);
  27. extern char Esbadstat[];
  28. // We use this version string to communicate with QEMU virtio-9P.
  29. #define VERSION9PU "9P2000.u"
  30. // Same as in devmnt.c
  31. #define MAXRPC (IOHDRSZ + 128 * 1024)
  32. // Buffer size
  33. #define BUFSZ (8192 + IOHDRSZ)
  34. // Raise this error when any of the non-implemented functions is called.
  35. #define Edonotcall(f) "Function " #f " should not be called for this device"
  36. // This device's dev methods table
  37. extern Dev v9pdevtab;
  38. // A phantom device's dev methods table
  39. extern Dev phdevtab;
  40. // The 'M' device's dev methods table
  41. static Dev *mdevtab;
  42. // Array of defined 9p mounts and their number
  43. static u32 nv9p;
  44. static Vqctl **v9ps;
  45. // Flag of one-time initailzation
  46. static int initdone;
  47. // A structure to hold a pair of buffer references. Virtio-9p requires buffers to be submitted
  48. // in pairs for request and response at once. We expect that devmnt issues a write request first, and then
  49. // read request. Write request will return immediately, but the buffer will be only held until read request
  50. // comes. Then both buffers are sent to virtqueue, and the result is returned to the caller.
  51. struct holdbuf {
  52. void *rdbuf; // read buffer (response)
  53. i32 rdlen; // read length (allocated)
  54. i32 rfree; // if true then read buffer was malloc'd and needs to be freed
  55. void *wrbuf; // write buffer (request)
  56. i32 wrlen; // write length (supplied)
  57. i32 wfree; // if true then write buffer was malloc'd and needs to be freed
  58. Proc *proc; // holding process
  59. int descr; // virtqueue descriptor index for the request
  60. };
  // PID cache. It gives a fast path from a calling process to the buffer it currently holds.
  // The low 4 bits of the PID select one of 16 slots; a process entering phwrite simply
  // overwrites whatever was in its slot. When the slot does not match, the caller falls
  // back to a linear search.
  #define PIDCSIZE 16
  #define PIDCMASK 0x0F
  struct pidcache {
      // actual PID
      int pid;
      // hold buffer structure pointer
      struct holdbuf *hb;
  };
  71. // Per-mount (virtqueue) structure. It holds an array of hold buffer structures of the same
  72. // length as the number of descriptors in the queue.
  73. static struct v9pmnt {
  74. char *tag; // mount tag (from host)
  75. char *version; // 9p version (from host)
  76. u32 msize; // message size (need this to pad short buffers otherwise QEMU errors out)
  77. Virtq *vq; // associated virtqueue
  78. struct holdbuf *hbufs; // hold buffers
  79. struct pidcache pidch[PIDCSIZE]; // PID cache
  80. Lock pclock; // PID cache lock
  81. int pcuse; // cache usage counter (entering processes)
  82. int pchit; // cache hits counter
  83. int pcmiss; // cache misses counter
  84. u32 mounted; // true if mounted
  85. } * mounts;
  86. // Find a hold buffer structure by PID. Nil is returned if no process found.
  87. // First lookup in the cache, then linearly over the whole array.
  88. static struct holdbuf *
  89. hbbypid(int tidx, int pid)
  90. {
  91. struct holdbuf *ret = nil;
  92. lock(&mounts[tidx].pclock);
  93. if(mounts[tidx].pidch[pid & PIDCMASK].pid == pid){
  94. ret = mounts[tidx].pidch[pid & PIDCMASK].hb;
  95. if(ret->proc->pid == pid) { // != is unlikely mb corruption but we have to get around
  96. mounts[tidx].pchit++;
  97. unlock(&mounts[tidx].pclock);
  98. return ret;
  99. }
  100. }
  101. unlock(&mounts[tidx].pclock);
  102. mounts[tidx].pcmiss++;
  103. for(int i = 0; i < PIDCSIZE; i++){
  104. if(mounts[tidx].hbufs[i].proc->pid == pid){
  105. ret = &mounts[tidx].hbufs[i];
  106. return ret;
  107. }
  108. }
  109. return nil;
  110. }
  111. // Find a mount tag index, return -1 if none found.
  112. static int
  113. findtag(char *tag)
  114. {
  115. for(int i = 0; i < nv9p; i++){
  116. if(mounts[i].tag && (!strcmp(mounts[i].tag, tag)))
  117. return i;
  118. }
  119. return -1;
  120. }
  121. static void
  122. v9pinit(void)
  123. {
  124. u32 nvdev;
  125. print("virtio-9p initializing\n");
  126. mdevtab = devtabget('M', 1);
  127. if(mdevtab == nil){
  128. print("no #M device found, cannot initialize virtio-9p");
  129. return;
  130. }
  131. nvdev = getvdevnum();
  132. if(nvdev <= 0)
  133. return;
  134. v9ps = mallocz(nvdev * sizeof(Vqctl *), 1);
  135. if(v9ps == nil){
  136. print("no memory to allocate v9p\n");
  137. return;
  138. }
  139. nv9p = 0;
  140. nv9p = getvdevsbypciid(PCI_DEVICE_ID_VIRTIO_9P, v9ps, nvdev);
  141. if(nv9p <= 0)
  142. return;
  143. mounts = mallocz(sizeof(struct v9pmnt) * nv9p, 1);
  144. if(mounts == nil){
  145. print("no memory to allocate v9p\n");
  146. return;
  147. }
  148. print("virtio 9p mounts found: %d\n", nv9p);
  149. for(int i = 0; i < nv9p; i++){
  150. struct virtio_9p_config vcfg;
  151. int rc = readvdevcfg(v9ps[i], &vcfg, sizeof(vcfg), 0);
  152. if(rc < 0)
  153. continue;
  154. print("config area size %d tag_len %d\n", rc, vcfg.tag_len);
  155. mounts[i].tag = mallocz(vcfg.tag_len + 1, 1);
  156. readvdevcfg(v9ps[i], mounts[i].tag, vcfg.tag_len, rc);
  157. print("tag %s\n", mounts[i].tag);
  158. mounts[i].vq = v9ps[i]->vqs[0];
  159. mounts[i].hbufs = mallocz(mounts[i].vq->vr.num, 1);
  160. finalinitvdev(v9ps[i]);
  161. }
  162. initdone = 0;
  163. }
  164. // General virtio request. It takes 2 buffers, one for input and other for output.
  165. // Both buffers should be mappable to physical addresses (that is, malloced from the
  166. // system heap, not in any application's address space). The request will be made in the
  167. // indirect mode because this is the only way that work properly with QEMU 9p.
  168. static i32
  169. do_request(int gdescr, int tidx, void *inbuf, i32 inlen, void *outbuf,
  170. i32 outlen)
  171. {
  172. u16 descr[1];
  173. Virtq *vq = v9ps[tidx]->vqs[0];
  174. if(vq == nil){
  175. error("No virtqueue (nil address)");
  176. }
  177. descr[0] = gdescr;
  178. struct vring_desc req[2] = {
  179. {.addr = PADDR(outbuf),
  180. .len = outlen,
  181. .flags = VRING_DESC_F_NEXT,
  182. .next = 1},
  183. {.addr = PADDR(inbuf),
  184. .len = inlen,
  185. .flags = VRING_DESC_F_WRITE,
  186. .next = 0}};
  187. q2descr(vq, descr[0])->addr = PADDR(&req);
  188. q2descr(vq, descr[0])->len = sizeof(req);
  189. q2descr(vq, descr[0])->flags = VRING_DESC_F_INDIRECT;
  190. q2descr(vq, descr[0])->next = 0;
  191. queuedescr(vq, 1, descr);
  192. reldescr(vq, 1, descr);
  193. return 0;
  194. }
  195. // We expect only 9p messages be written, and only for a non-empty chan path (mount tag).
  196. // Some messages need massaging (like Tversion because QEMU does not support vanilla 9P2000
  197. // and we have to cheat here about the protocol version). In such case some additional logic
  198. // applies based on the extracted message type.
  199. static i32
  200. phwrite(Chan *c, void *va, i32 n, i64 offset)
  201. {
  202. Proc *up = externup();
  203. int tidx = findtag(chanpath(c));
  204. if(tidx < 0 || tidx >= nv9p)
  205. error(Enonexist);
  206. u8 *msg = va;
  207. int mtype = GBIT8(msg + 4);
  208. void *nva;
  209. int lnva;
  210. int alloc;
  211. switch(mtype){
  212. case Tversion:
  213. alloc = 1;
  214. Fcall f = {
  215. .type = mtype,
  216. .tag = GBIT16(msg + 5),
  217. .msize = GBIT32(msg + 7),
  218. .version = VERSION9PU};
  219. lnva = IOHDRSZ + strlen(f.version) + 20;
  220. nva = mallocz(lnva, 1);
  221. convS2M(&f, nva, lnva);
  222. break;
  223. default:
  224. if(n >= mounts[tidx].msize){
  225. nva = va;
  226. lnva = n;
  227. alloc = 0;
  228. } else {
  229. lnva = mounts[tidx].msize;
  230. nva = mallocz(lnva, 1);
  231. alloc = 1;
  232. memmove(nva, va, n);
  233. }
  234. }
  235. u16 descr[1];
  236. struct v9pmnt *pm = mounts + tidx;
  237. int rc = getdescr(pm->vq, 1, descr);
  238. if(rc < 1){
  239. if(alloc)
  240. free(nva);
  241. error("not enough virtqueue descriptors");
  242. }
  243. lock(&pm->pclock);
  244. pm->hbufs[descr[0]].descr = descr[0];
  245. pm->hbufs[descr[0]].proc = up;
  246. pm->hbufs[descr[0]].wfree = alloc;
  247. pm->hbufs[descr[0]].wrbuf = nva;
  248. pm->hbufs[descr[0]].wrlen = lnva;
  249. pm->hbufs[descr[0]].rdbuf = nil;
  250. pm->hbufs[descr[0]].rdlen = 0;
  251. pm->hbufs[descr[0]].rfree = 0;
  252. pm->pidch[up->pid & PIDCMASK].hb = &pm->hbufs[descr[0]];
  253. pm->pidch[up->pid & PIDCMASK].pid = up->pid;
  254. pm->pcuse++;
  255. unlock(&pm->pclock);
  256. return n;
  257. }
  258. // Override the devmnt's read method. It is necessary to fix the incorrectly packed
  259. // stat structures when reading from a directory.
  260. static i32
  261. v9pread(Chan *c, void *buf, i32 n, i64 off)
  262. {
  263. u8 *p, *e;
  264. int nc, cache, isdir;
  265. usize dirlen;
  266. isdir = 0;
  267. cache = c->flag & CCACHE;
  268. if(c->qid.type & QTDIR){
  269. cache = 0;
  270. isdir = 1;
  271. }
  272. p = buf;
  273. if(cache){
  274. nc = mfcread(c, buf, n, off);
  275. if(nc > 0){
  276. n -= nc;
  277. if(n == 0)
  278. return nc;
  279. p += nc;
  280. off += nc;
  281. }
  282. n = mntrdwr(Tread, c, p, n, off);
  283. mfcupdate(c, p, n, off);
  284. return n + nc;
  285. }
  286. n = mntrdwr(Tread, c, buf, n, off);
  287. if(isdir){
  288. u8 *nbuf = malloc(n);
  289. if(nbuf == nil)
  290. error(Enomem);
  291. u8 *xnbuf = nbuf;
  292. for(e = &p[n]; p + BIT16SZ < e; p += dirlen){
  293. dirlen = BIT16SZ + GBIT16(p);
  294. if(p + dirlen > e)
  295. break;
  296. u8 *pn = p + 41;
  297. u32 lstrs = 0;
  298. for(int i = 0; i < 4; i++){
  299. int ns = GBIT16(pn);
  300. lstrs += ns + 1;
  301. pn += ns + BIT16SZ;
  302. }
  303. {
  304. char strs[lstrs];
  305. Dir d;
  306. convM2D(p, dirlen, &d, strs);
  307. d.uid = eve;
  308. d.gid = eve;
  309. d.muid = eve;
  310. u32 dms = convD2M(&d, xnbuf, dirlen);
  311. validstat(xnbuf, dms);
  312. mntdirfix(xnbuf, c);
  313. xnbuf = xnbuf + dms;
  314. }
  315. }
  316. if(p != e)
  317. error(Esbadstat);
  318. memmove(buf, nbuf, (xnbuf - nbuf));
  319. n = xnbuf - nbuf;
  320. }
  321. return n;
  322. }
  323. // We expect only 9p messages to be received.
  324. // Some messages need massaging (like Rversion because QEMU does not support vanilla 9P2000
  325. // and we have to cheat here about the protocol version). In such case some additional logic
  326. // applies based on the extracted message type. The function checks for a held write buffer,
  327. // absence of such is an error. The length returned may length extracted from the first
  328. // 4 bytes of the message in some cases.
  329. static i32
  330. phread(Chan *c, void *va, i32 n, i64 offset)
  331. {
  332. Proc *up = externup();
  333. int tidx = findtag(chanpath(c));
  334. if(tidx < 0 || tidx >= nv9p)
  335. error(Enonexist);
  336. struct holdbuf *hb = hbbypid(tidx, up->pid);
  337. if(hb == nil)
  338. error("read request without previously held write request");
  339. hb->rdbuf = va;
  340. hb->rdlen = n;
  341. do_request(hb->descr, tidx, hb->rdbuf, hb->rdlen, hb->wrbuf, hb->wrlen);
  342. if(hb->wfree)
  343. free(hb->wrbuf);
  344. u8 *msg = va;
  345. int mtype = GBIT8(msg + 4);
  346. u32 mlen = GBIT32(msg);
  347. Fcall f;
  348. switch(mtype){
  349. case Rerror:
  350. convM2S(msg, n, &f);
  351. error(f.ename);
  352. break;
  353. case Rlerror:
  354. convM2S(msg, n, &f);
  355. error(errno2str(f.errno));
  356. break;
  357. case Rversion:
  358. convM2S(msg, n, &f);
  359. mounts[tidx].version = strdup(f.version);
  360. mounts[tidx].msize = f.msize;
  361. f.version = VERSION9P;
  362. convS2M(&f, va, n);
  363. mlen = GBIT32(msg);
  364. break;
  365. case Rstat:
  366. mlen = GBIT16(msg);
  367. u32 nbuf = GBIT16(msg + 9);
  368. u8 *buf = msg + 9;
  369. Dir d;
  370. u8 *pn = buf + 41;
  371. u32 lstrs = 0;
  372. for(int i = 0; i < 4; i++){
  373. int ns = GBIT16(pn);
  374. lstrs += ns + 1;
  375. pn += ns + BIT16SZ;
  376. }
  377. {
  378. char strs[lstrs];
  379. convM2D(buf, nbuf, &d, strs);
  380. d.uid = eve;
  381. d.gid = eve;
  382. d.muid = eve;
  383. u32 dms = convD2M(&d, buf, nbuf);
  384. PBIT16(msg + 7, dms);
  385. mlen = 9 + dms;
  386. PBIT32(msg, mlen);
  387. }
  388. default:;
  389. }
  390. return mlen;
  391. }
  392. // Use a command like "mount [-c] -d '#9' /dev/null /mount/point tag".
  393. // It is "tag" that matters: it should be same as one of the mount tags
  394. // provided by the host. The server file name may be any existing file name.
  395. // It will not be used, cf. "mount none" in Linux. Use "-d '#9'" to use
  396. // proper mount device methods.
  397. static Chan *
  398. v9pattach(char *spec)
  399. {
  400. struct bogus {
  401. Chan *chan;
  402. Chan *authchan;
  403. char *spec;
  404. int flags;
  405. } bogus;
  406. bogus = *((struct bogus *)spec);
  407. int tidx = findtag(bogus.spec);
  408. if(tidx < 0)
  409. error("tag does not exist");
  410. bogus.authchan = nil;
  411. Chan *c = bogus.chan;
  412. c->dev = &phdevtab;
  413. c->path = newpath(bogus.spec);
  414. Chan *mc = mdevtab->attach((char *)&bogus);
  415. mc->dev = &v9pdevtab;
  416. mounts[tidx].mounted = 1;
  417. return mc;
  418. }
  419. static Chan *
  420. v9popen(Chan *c, int omode)
  421. {
  422. return mdevtab->open(c, omode);
  423. }
  424. static Walkqid *
  425. v9pwalk(Chan *c, Chan *nc, char **name, int nname)
  426. {
  427. return mdevtab->walk(c, nc, name, nname);
  428. }
  429. static i32
  430. v9pstat(Chan *c, u8 *dp, i32 n)
  431. {
  432. return mdevtab->stat(c, dp, n);
  433. }
  434. static void
  435. v9pclose(Chan *c)
  436. {
  437. int tidx = findtag(chanpath(c));
  438. if(tidx >= 0 && tidx < nv9p)
  439. mounts[tidx].mounted = 0;
  440. mdevtab->close(c);
  441. }
  442. static void
  443. v9pcreate(Chan *c, char *name, int omode, int perm)
  444. {
  445. mdevtab->create(c, name, omode, perm);
  446. }
  447. static void
  448. v9premove(Chan *c)
  449. {
  450. mdevtab->remove(c);
  451. }
  452. static i32
  453. v9pwstat(Chan *c, u8 *dp, i32 n)
  454. {
  455. return mdevtab->wstat(c, dp, n);
  456. }
  457. static i32
  458. v9pwrite(Chan *c, void *va, i32 n, i64 offset)
  459. {
  460. return mdevtab->write(c, va, n, offset);
  461. }
  462. // Phantom device. It is used only for read/write operations. It is not registered in the
  463. // global table or devices, and is not addressable in any other way. It is only needed to
  464. // pass the reference to the read/write methods to the mount driver.
  465. static Chan *
  466. phattach(char *spec)
  467. {
  468. error(Edonotcall(__FUNCTION__));
  469. return nil;
  470. }
  471. static Walkqid *
  472. phwalk(Chan *c, Chan *nc, char **name, int nname)
  473. {
  474. error(Edonotcall(__FUNCTION__));
  475. return nil;
  476. }
  477. static i32
  478. phstat(Chan *c, u8 *dp, i32 n)
  479. {
  480. error(Edonotcall(__FUNCTION__));
  481. return -1;
  482. }
  483. static i32
  484. phwstat(Chan *c, u8 *dp, i32 n)
  485. {
  486. error(Edonotcall(__FUNCTION__));
  487. return -1;
  488. }
  489. static Chan *
  490. phopen(Chan *c, int omode)
  491. {
  492. error(Edonotcall(__FUNCTION__));
  493. return nil;
  494. }
  495. static void
  496. phclose(Chan *c)
  497. {
  498. error(Edonotcall(__FUNCTION__));
  499. }
  500. static void
  501. phcreate(Chan *c, char *name, int omode, int perm)
  502. {
  503. error(Edonotcall(__FUNCTION__));
  504. }
  505. static void
  506. phremove(Chan *c)
  507. {
  508. error(Edonotcall(__FUNCTION__));
  509. }
  510. // Read mount tags information as tag:version:msize:pcuse:pchit:pcmiss for mounted tags, and
  511. // tag:- for non-mounted.
  512. i32
  513. mtagsread(Chan *c, void *buf, i32 n, i64 off)
  514. {
  515. Proc *up = externup();
  516. int i;
  517. char *alloc, *e, *p;
  518. alloc = malloc(READSTR);
  519. if(alloc == nil)
  520. error(Enomem);
  521. p = alloc;
  522. e = p + READSTR;
  523. for(i = 0; i < nv9p; i++){
  524. p = mounts[i].mounted ? seprint(p, e, "%s:%s:%d:%d:%d\n", mounts[i].tag, mounts[i].version, mounts[i].msize, mounts[i].pcuse, mounts[i].pchit, mounts[i].pcmiss) : seprint(p, e, "%s:-\n", mounts[i].tag);
  525. }
  526. if(waserror()){
  527. free(alloc);
  528. nexterror();
  529. }
  530. n = readstr(off, buf, n, alloc);
  531. free(alloc);
  532. poperror();
  533. return n;
  534. }
  535. Dev phdevtab = {
  536. .dc = 2151, /* 1/9 */
  537. .name = "9phantom",
  538. .reset = devreset,
  539. .init = devinit,
  540. .shutdown = devshutdown,
  541. .attach = phattach,
  542. .walk = phwalk,
  543. .stat = phstat,
  544. .open = phopen,
  545. .create = phcreate,
  546. .close = phclose,
  547. .read = phread,
  548. .bread = devbread,
  549. .write = phwrite,
  550. .bwrite = devbwrite,
  551. .remove = phremove,
  552. .wstat = phwstat,
  553. };
  554. Dev v9pdevtab = {
  555. .dc = '9',
  556. .name = "9p",
  557. .reset = devreset,
  558. .init = v9pinit,
  559. .shutdown = devshutdown,
  560. .attach = v9pattach,
  561. .walk = v9pwalk,
  562. .stat = v9pstat,
  563. .open = v9popen,
  564. .create = v9pcreate,
  565. .close = v9pclose,
  566. .read = v9pread,
  567. .bread = devbread,
  568. .write = v9pwrite,
  569. .bwrite = devbwrite,
  570. .remove = v9premove,
  571. .wstat = v9pwstat,
  572. };