syszio.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include "u.h"
  10. #include "../port/lib.h"
  11. #include "mem.h"
  12. #include "dat.h"
  13. #include "fns.h"
  14. #include "../port/error.h"
  15. /*
  16. * Experiment on zero-copy
  17. *
  18. * Each address in a Zio slot implies a reference
  19. * counter for that buffer. Provided the address,
  20. * we must be able to get to the counter.
  21. * We can use shared segments with fixed message sizes per
  22. * segment, so we can do arithmetic to locate the counter.
  23. * We could also use per-page reference counters, and perhaps
  24. * accept any user pointer.
  25. * If the kernel supplies the buffers, it must allocate them
  26. * from a place available for the user, perhaps a heap segment
  27. * or something like that.
  28. */
  29. enum
  30. {
  31. Maxatomic = 64*KiB
  32. };
  33. typedef struct ZMap ZMap;
  34. typedef struct Map Map;
  35. struct Map {
  36. Map* next;
  37. int free;
  38. uintptr_t addr;
  39. uint64_t size;
  40. };
  41. struct ZMap {
  42. Map* map;
  43. Lock Lock;
  44. };
  45. static int inited;
  46. static void zmapfree(ZMap* rmap, uintptr_t addr);
  47. static uintptr_t zmapalloc(ZMap* rmap, usize size);
  48. static void
  49. zioinit(void)
  50. {
  51. if(inited)
  52. return;
  53. inited++;
  54. fmtinstall('Z', ziofmt);
  55. }
  56. int
  57. ziofmt(Fmt *f)
  58. {
  59. Kzio *io;
  60. io = va_arg(f->args, Kzio*);
  61. return fmtprint(f, "%#p[%#lx]", io->Zio.data, io->Zio.size);
  62. }
  63. static void
  64. dumpzmap(ZMap *map)
  65. {
  66. Map *mp;
  67. for(mp = map->map; mp != nil; mp = mp->next)
  68. print("\tmap %#llx[%#llx] %c\n", mp->addr, mp->size,
  69. mp->free ? 'f' : 'a');
  70. }
  71. /*
  72. * No locks!
  73. */
  74. void
  75. dumpzseg(Segment *s)
  76. {
  77. Zseg *zs;
  78. ZMap *map;
  79. int i;
  80. if(DBGFLG == 0)
  81. return;
  82. zs = &s->zseg;
  83. print("zseg %#llx type %#x map %#p naddr %d end %d\n",
  84. s->base, s->type, zs->map, zs->naddr, zs->end);
  85. if(zs->addr != nil)
  86. for(i = 0; i < zs->end; i++)
  87. print("\taddr %#llx\n", zs->addr[i]);
  88. map = zs->map;
  89. if(map == nil)
  90. return;
  91. dumpzmap(map);
  92. }
  93. /*
  94. * Called from putseg, when the segment is being destroyed.
  95. */
  96. void
  97. freezseg(Segment *s)
  98. {
  99. Zseg *zs;
  100. ZMap *zp;
  101. Map *mp;
  102. DBG("freezseg: ");
  103. dumpzseg(s);
  104. zs = &s->zseg;
  105. zp = zs->map;
  106. if(zp == nil)
  107. return;
  108. while(zp->map != nil){
  109. mp = zp->map;
  110. zp->map = mp->next;
  111. free(mp);
  112. }
  113. free(zp);
  114. }
  115. /*
  116. * Grow the pool of addresses in s's zseg, s is qlocked
  117. */
  118. void
  119. zgrow(Segment *s)
  120. {
  121. enum{Incr = 32};
  122. Zseg *zs;
  123. zioinit();
  124. zs = &s->zseg;
  125. zs->naddr += Incr;
  126. zs->addr = realloc(zs->addr, zs->naddr*sizeof(uintptr_t));
  127. if(zs->addr == nil)
  128. panic("zgrow: no memory");
  129. }
  130. /*
  131. * Find an address in s's zseg; s is qlocked
  132. */
  133. uintptr_t
  134. zgetaddr(Segment *s)
  135. {
  136. Zseg *zs;
  137. uintptr_t va;
  138. zs = &s->zseg;
  139. if(zs->end == 0)
  140. return 0ULL;
  141. va = zs->addr[0];
  142. zs->end--;
  143. if(zs->end > 0)
  144. zs->addr[0] = zs->addr[zs->end];
  145. DBG("zgetaddr: %#llx\n", va);
  146. dumpzseg(s);
  147. return va;
  148. }
  149. /*
  150. * add an address to s's zseg; s is qlocked.
  151. * wakeup any reader if it's waiting.
  152. */
  153. int
  154. zputaddr(Segment *s, uintptr_t va)
  155. {
  156. Zseg *zs;
  157. zs = &s->zseg;
  158. if((s->type&SG_ZIO) == 0)
  159. return -1;
  160. if((s->type&SG_KZIO) != 0){
  161. DBG("zputaddr: zmapfree %#llx\n", va);
  162. zmapfree(s->zseg.map, va);
  163. dumpzseg(s);
  164. return 0;
  165. }
  166. if(zs->end == zs->naddr)
  167. zgrow(s);
  168. zs->addr[zs->end++] = va;
  169. if(zs->end == 1)
  170. wakeup(&zs->rr); /* in case anyone was waiting */
  171. DBG("zputaddr %#llx\n", va);
  172. dumpzseg(s);
  173. return 0;
  174. }
  175. void*
  176. alloczio(Segment *s, int32_t len)
  177. {
  178. Zseg *zs;
  179. uintptr_t va;
  180. zs = &s->zseg;
  181. va = zmapalloc(zs->map, len);
  182. if(va == 0ULL)
  183. error("kernel zero copy segment exhausted");
  184. return UINT2PTR(va);
  185. }
  186. /*
  187. * Locate the kernel segment for zero copy here,
  188. * return it unlocked with a reference added.
  189. */
  190. Segment*
  191. getzkseg(void)
  192. {
  193. Proc *up = externup();
  194. Segment *s;
  195. int i;
  196. qlock(&up->seglock);
  197. for(i = 0; i < NSEG; i++){
  198. s = up->seg[i];
  199. if(s != nil && (s->type&SG_KZIO) != 0){
  200. incref(&s->r);
  201. qunlock(&up->seglock);
  202. DBG("getzkseg: %#p\n", s);
  203. return s;
  204. }
  205. }
  206. qunlock(&up->seglock);
  207. DBG("getzkseg: nil\n");
  208. return nil;
  209. }
  210. /*
  211. * This is the counterpart of devzread in some sense,
  212. * it reads in the traditional way from io[].
  213. */
  214. int32_t
  215. readzio(Kzio *io, int nio, void *a, int32_t count)
  216. {
  217. int32_t tot, nr;
  218. char *p;
  219. p = a;
  220. tot = 0;
  221. while(nio-- > 0){
  222. if(tot < count){
  223. nr = io->Zio.size;
  224. if(tot + nr > count)
  225. nr = count - tot;
  226. DBG("readzio: copy %#p %Z\n", p+tot, io);
  227. memmove(p+tot, io->Zio.data, nr);
  228. tot += nr;
  229. }
  230. qlock(&io->seg->lk);
  231. zputaddr(io->seg, PTR2UINT(io->Zio.data));
  232. qunlock(&io->seg->lk);
  233. putseg(io->seg);
  234. io->seg = nil;
  235. io++;
  236. }
  237. return tot;
  238. }
  239. int
  240. devzread(Chan *c, Kzio io[], int nio, usize tot, int64_t offset)
  241. {
  242. Proc *up = externup();
  243. Segment *s;
  244. DBG("devzread %#p[%d]\n", io, nio);
  245. s = getzkseg();
  246. if(s == nil)
  247. error("no kernel segment for zero-copy");
  248. if(tot > Maxatomic)
  249. tot = Maxatomic;
  250. io[0].Zio.data = alloczio(s, tot);
  251. io[0].seg = s;
  252. if(waserror()){
  253. zputaddr(s, PTR2UINT(io[0].Zio.data));
  254. putseg(s);
  255. nexterror();
  256. }
  257. io[0].Zio.size = c->dev->read(c, io[0].Zio.data, tot, offset);
  258. poperror();
  259. return 1;
  260. }
  261. int
  262. devzwrite(Chan *c, Kzio io[], int nio, int64_t offset)
  263. {
  264. Proc *up = externup();
  265. int i, j;
  266. int32_t tot;
  267. Block *bp;
  268. DBG("devzwrite %#p[%d]\n", io, nio);
  269. tot = 0;
  270. for(i = 0; i < nio; i++)
  271. tot += io[i].Zio.size;
  272. bp = nil;
  273. if(waserror()){
  274. if(bp != nil)
  275. freeb(bp);
  276. nexterror();
  277. }
  278. if(nio == 1)
  279. tot = c->dev->write(c, io[0].Zio.data, io[0].Zio.size, offset);
  280. else{
  281. bp = allocb(tot);
  282. if(bp == nil)
  283. error(Enomem);
  284. for(i = 0; i < nio; i++){
  285. DBG("devzwrite: copy %#p %Z\n", bp->wp, &io[i]);
  286. memmove(bp->wp, io[i].Zio.data, io[i].Zio.size);
  287. bp->wp += io[i].Zio.size;
  288. qlock(&io[i].seg->lk);
  289. if(zputaddr(io[i].seg, PTR2UINT(io[i].Zio.data)) < 0)
  290. panic("devzwrite: not a shared data segment");
  291. qunlock(&io[i].seg->lk);
  292. }
  293. tot = c->dev->bwrite(c, bp, offset);
  294. }
  295. j = 0;
  296. for(i = 0; i < nio; i++){
  297. io[i].Zio.data = nil; /* safety */
  298. io[i].seg = nil;
  299. putseg(io[i].seg);
  300. if(tot > 0)
  301. if(tot >= io[i].Zio.size)
  302. tot -= io[i].Zio.size;
  303. else
  304. io[i].Zio.size = tot;
  305. else{
  306. j = i;
  307. io[i].Zio.size = 0;
  308. }
  309. io[i].Zio.data = nil; /* safety */
  310. putseg(io[i].seg);
  311. io[i].seg = nil;
  312. }
  313. nio = j;
  314. poperror();
  315. return nio;
  316. }
  317. static void
  318. kernzio(Kzio *io)
  319. {
  320. Segment *s;
  321. void *data;
  322. Kzio uio;
  323. s = getzkseg();
  324. if(s == nil)
  325. error("can't use zero copy in this segment");
  326. uio = *io;
  327. data = alloczio(s, io->Zio.size);
  328. memmove(data, io->Zio.data, io->Zio.size);
  329. io->Zio.data = data;
  330. DBG("kernzio: copy %Z %Z\n", io, &uio);
  331. putseg(io->seg);
  332. io->seg = s;
  333. }
  334. /*
  335. * Zero copy I/O.
  336. * I/O is performed using an array of Zio structures.
  337. * Each one points to a shared buffer address indicating a length.
  338. * Each entry indicating a length and using nil as the address
  339. * is asking the system to allocate memory as needed (mread only).
  340. */
  341. static int
  342. ziorw(int fd, Zio *io, int nio, usize count, int64_t offset, int iswrite)
  343. {
  344. Proc *up = externup();
  345. int i, n, isprw;
  346. Kzio *kio, skio[16];
  347. Chan *c;
  348. usize tot;
  349. if(nio <= 0 || nio > 512)
  350. error("wrong io[] size");
  351. zioinit();
  352. kio = nil;
  353. io = validaddr(io, sizeof io[0] * nio, 1);
  354. DBG("ziorw %d io%#p[%d] %lu %lld\n", fd, io, nio, count, offset);
  355. if(DBGFLG)
  356. for(i = 0; i < nio; i++)
  357. print("\tio%#p[%d] = %Z %s\n",
  358. io, i, (Kzio*)&io[i], iswrite?"w":"r");
  359. if(iswrite)
  360. c = fdtochan(fd, OWRITE, 1, 1);
  361. else
  362. c = fdtochan(fd, OREAD, 1, 1);
  363. isprw = offset != -1LL;
  364. if(isprw)
  365. offset = c->offset;
  366. if(waserror()){
  367. cclose(c);
  368. if(kio != nil){
  369. for(i = 0; i < nio; i++)
  370. if(kio[i].seg != nil)
  371. putseg(kio[i].seg);
  372. if(kio != skio)
  373. free(kio);
  374. }
  375. nexterror();
  376. }
  377. if(nio < nelem(skio))
  378. kio = skio;
  379. else
  380. kio = smalloc(sizeof kio[0] * nio);
  381. for(i = 0; i < nio; i++){
  382. kio[i].Zio = io[i];
  383. if(iswrite){
  384. kio[i].seg = seg(up, PTR2UINT(io[i].data), 1);
  385. if(kio[i].seg == nil)
  386. error("invalid address in zio");
  387. incref(&kio[i].seg->r);
  388. qunlock(&kio[i].seg->lk);
  389. validaddr(kio[i].Zio.data, kio[i].Zio.size, 1);
  390. if((kio[i].seg->type&SG_ZIO) == 0){
  391. /*
  392. * It's not a segment where we can report
  393. * addresses to anyone once they are free.
  394. * So, allocate space in the kernel
  395. * and copy the user data there.
  396. */
  397. kernzio(&kio[i]);
  398. }
  399. assert(kio[i].seg->type&SG_ZIO);
  400. }else{
  401. kio[i].Zio.data = nil;
  402. kio[i].seg = nil;
  403. }
  404. }
  405. if(c->dev->zread == nil){
  406. DBG("installing devzread for %s\n", c->dev->name);
  407. c->dev->zread = devzread;
  408. }
  409. if(c->dev->zwrite == nil){
  410. DBG("installing devzwrite for %s\n", c->dev->name);
  411. c->dev->zwrite = devzwrite;
  412. }
  413. if(iswrite)
  414. n = c->dev->zwrite(c, kio, nio, offset);
  415. else
  416. n = c->dev->zread(c, kio, nio, count, offset);
  417. tot = 0;
  418. for(i = 0; i < n; i++){
  419. io[i] = kio[i].Zio;
  420. tot += kio[i].Zio.size;
  421. }
  422. if(!isprw){
  423. /* unlike in syswrite, we update offsets at the end */
  424. lock(&c->r.l);
  425. c->devoffset += tot;
  426. c->offset += tot;
  427. unlock(&c->r.l);
  428. }
  429. poperror();
  430. cclose(c);
  431. if(kio != skio)
  432. free(kio);
  433. return n;
  434. }
  435. void
  436. sysziopread(Ar0 *ar0, ...)
  437. {
  438. int fd, nio;
  439. int32_t count;
  440. int64_t offset;
  441. Zio *io;
  442. va_list list;
  443. va_start(list, ar0);
  444. /*
  445. * int zpread(int fd, Zio *io[], int nio, usize count, int64_t offset);
  446. */
  447. fd = va_arg(list, int);
  448. io = va_arg(list, Zio*);
  449. nio = va_arg(list, int);
  450. count = va_arg(list, usize);
  451. offset = va_arg(list, int64_t);
  452. va_end(list);
  453. ar0->i = ziorw(fd, io, nio, count, offset, 0);
  454. }
  455. void
  456. sysziopwrite(Ar0 *ar0, ...)
  457. {
  458. int fd, nio;
  459. int64_t offset;
  460. Zio *io;
  461. va_list list;
  462. va_start(list, ar0);
  463. /*
  464. * int zpwrite(int fd, Zio *io[], int nio, int64_t offset);
  465. */
  466. fd = va_arg(list, int);
  467. io = va_arg(list, Zio*);
  468. nio = va_arg(list, int);
  469. offset = va_arg(list, int64_t);
  470. va_end(list);
  471. ar0->i = ziorw(fd, io, nio, 0, offset, 1);
  472. }
  473. void
  474. sysziofree(Ar0 *ar0, ...)
  475. {
  476. Proc *up = externup();
  477. Zio *io;
  478. int nio, i;
  479. Segment *s;
  480. va_list list;
  481. va_start(list, ar0);
  482. /*
  483. * zfree(Zio io[], int nio);
  484. */
  485. io = va_arg(list, Zio*);
  486. nio = va_arg(list, int);
  487. va_end(list);
  488. io = validaddr(io, sizeof io[0] * nio, 1);
  489. for(i = 0; i < nio; i++){
  490. s = seg(up, PTR2UINT(io[i].data), 1);
  491. if(s == nil)
  492. error("invalid address in zio");
  493. if((s->type&SG_ZIO) == 0){
  494. qunlock(&s->lk);
  495. error("segment is not a zero-copy segment");
  496. }
  497. zputaddr(s, PTR2UINT(io[i].data));
  498. qunlock(&s->lk);
  499. io[i].data = nil;
  500. io[i].size = 0;
  501. }
  502. }
  503. /*
  504. * This must go, but for now, we use Zmaps
  505. * to allocate messages within the shared kernel segment.
  506. * This is a simple first fist with a single fragment list.
  507. */
  508. void
  509. newzmap(Segment *s)
  510. {
  511. ZMap *zp;
  512. Map *mp;
  513. zioinit();
  514. if((s->type&SG_KZIO) == 0)
  515. panic("newzmap but not SG_KZIO");
  516. if(s->zseg.map != nil)
  517. panic("newzmap: already allocated");
  518. zp = smalloc(sizeof(ZMap));
  519. s->zseg.map = zp;
  520. mp = smalloc(sizeof(Map));
  521. mp->free = 1;
  522. mp->addr = s->base;
  523. mp->size = s->top - s->base;
  524. zp->map = mp;
  525. if(DBGFLG > 1){
  526. DBG("newzmap:\n");
  527. dumpzmap(zp);
  528. }
  529. }
  530. static void
  531. zmapfree(ZMap* rmap, uintptr_t addr)
  532. {
  533. Proc *up = externup();
  534. Map *mp, *prev, *next;
  535. lock(&rmap->Lock);
  536. if(waserror()){
  537. unlock(&rmap->Lock);
  538. nexterror();
  539. }
  540. prev = nil;
  541. for(mp = rmap->map; mp != nil; mp = mp->next){
  542. if(mp->addr <= addr)
  543. break;
  544. prev = mp;
  545. }
  546. if(mp == nil)
  547. panic("zmapfree: no map");
  548. if(mp->free == 1)
  549. panic("zmapfree: already free");
  550. if(prev != nil && prev->free && prev->addr + prev->size == addr){
  551. prev->size += mp->size;
  552. prev->next = mp->next;
  553. free(mp);
  554. mp = prev;
  555. }
  556. next = mp->next;
  557. if(next != nil && next->free && mp->addr + mp->size == next->addr){
  558. mp->size += next->size;
  559. mp->next = next->next;
  560. mp->free = 1;
  561. free(next);
  562. }
  563. poperror();
  564. unlock(&rmap->Lock);
  565. if(DBGFLG > 1){
  566. DBG("zmapfree %#llx:\n", addr);
  567. dumpzmap(rmap);
  568. }
  569. }
  570. static uintptr_t
  571. zmapalloc(ZMap* rmap, usize size)
  572. {
  573. Proc *up = externup();
  574. Map *mp, *nmp;
  575. lock(&rmap->Lock);
  576. if(waserror()){
  577. unlock(&rmap->Lock);
  578. nexterror();
  579. }
  580. for(mp = rmap->map; mp->free == 0 || mp->size < size; mp = mp->next)
  581. ;
  582. if(mp == nil){
  583. poperror();
  584. unlock(&rmap->Lock);
  585. return 0ULL;
  586. }
  587. if(mp->free == 0)
  588. panic("zmapalloc: not free");
  589. if(mp->size > size){
  590. nmp = smalloc(sizeof *nmp);
  591. *nmp = *mp;
  592. nmp->addr += size;
  593. nmp->size -= size;
  594. nmp->free = 1;
  595. mp->size = size;
  596. mp->next = nmp;
  597. }
  598. mp->free = 0;
  599. poperror();
  600. unlock(&rmap->Lock);
  601. if(DBGFLG > 1){
  602. DBG("zmapalloc %#llx:\n", mp->addr);
  603. dumpzmap(rmap);
  604. }
  605. return mp->addr;
  606. }