syszio.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include "u.h"
  10. #include "../port/lib.h"
  11. #include "mem.h"
  12. #include "dat.h"
  13. #include "fns.h"
  14. #include "../port/error.h"
  15. /*
  16. * Experiment on zero-copy
  17. *
  18. * Each address in a Zio slot implies a reference
  19. * counter for that buffer. Provided the address,
  20. * we must be able to get to the counter.
  21. * We can use shared segments with fixed message sizes per
  22. * segment, so we can do arithmetic to locate the counter.
  23. * We could also use per-page reference counters, and perhaps
  24. * accept any user pointer.
  25. * If the kernel supplies the buffers, it must allocate them
  26. * from a place available for the user, perhaps a heap segment
  27. * or something like that.
  28. */
  29. enum
  30. {
  31. Maxatomic = 64*KiB
  32. };
  33. typedef struct ZMap ZMap;
  34. typedef struct Map Map;
  35. struct Map {
  36. Map* next;
  37. int free;
  38. uintptr addr;
  39. uint64_t size;
  40. };
  41. struct ZMap {
  42. Map* map;
  43. Lock;
  44. };
  45. static int inited;
  46. static void zmapfree(ZMap* rmap, uintptr addr);
  47. static uintptr zmapalloc(ZMap* rmap, usize size);
  48. static void
  49. zioinit(void)
  50. {
  51. if(inited)
  52. return;
  53. inited++;
  54. fmtinstall('Z', ziofmt);
  55. }
  56. int
  57. ziofmt(Fmt *f)
  58. {
  59. Kzio *io;
  60. io = va_arg(f->args, Kzio*);
  61. return fmtprint(f, "%#p[%#ulx]", io->data, io->size);
  62. }
  63. static void
  64. dumpzmap(ZMap *map)
  65. {
  66. Map *mp;
  67. for(mp = map->map; mp != nil; mp = mp->next)
  68. print("\tmap %#ullx[%#ullx] %c\n", mp->addr, mp->size,
  69. mp->free ? 'f' : 'a');
  70. }
  71. /*
  72. * No locks!
  73. */
  74. void
  75. dumpzseg(Segment *s)
  76. {
  77. Zseg *zs;
  78. ZMap *map;
  79. int i;
  80. if(DBGFLG == 0)
  81. return;
  82. zs = &s->zseg;
  83. print("zseg %#ullx type %#ux map %#p naddr %d end %d\n",
  84. s->base, s->type, zs->map, zs->naddr, zs->end);
  85. if(zs->addr != nil)
  86. for(i = 0; i < zs->end; i++)
  87. print("\taddr %#ullx\n", zs->addr[i]);
  88. map = zs->map;
  89. if(map == nil)
  90. return;
  91. dumpzmap(map);
  92. }
  93. /*
  94. * Called from putseg, when the segment is being destroyed.
  95. */
  96. void
  97. freezseg(Segment *s)
  98. {
  99. Zseg *zs;
  100. ZMap *zp;
  101. Map *mp;
  102. DBG("freezseg: ");
  103. dumpzseg(s);
  104. zs = &s->zseg;
  105. zp = zs->map;
  106. if(zp == nil)
  107. return;
  108. while(zp->map != nil){
  109. mp = zp->map;
  110. zp->map = mp->next;
  111. free(mp);
  112. }
  113. free(zp);
  114. }
  115. /*
  116. * Grow the pool of addresses in s's zseg, s is qlocked
  117. */
  118. void
  119. zgrow(Segment *s)
  120. {
  121. enum{Incr = 32};
  122. Zseg *zs;
  123. zioinit();
  124. zs = &s->zseg;
  125. zs->naddr += Incr;
  126. zs->addr = realloc(zs->addr, zs->naddr*sizeof(uintptr));
  127. if(zs->addr == nil)
  128. panic("zgrow: no memory");
  129. }
  130. /*
  131. * Find an address in s's zseg; s is qlocked
  132. */
  133. uintptr
  134. zgetaddr(Segment *s)
  135. {
  136. Zseg *zs;
  137. uintptr va;
  138. zs = &s->zseg;
  139. if(zs->end == 0)
  140. return 0ULL;
  141. va = zs->addr[0];
  142. zs->end--;
  143. if(zs->end > 0)
  144. zs->addr[0] = zs->addr[zs->end];
  145. DBG("zgetaddr: %#ullx\n", va);
  146. dumpzseg(s);
  147. return va;
  148. }
  149. /*
  150. * add an address to s's zseg; s is qlocked.
  151. * wakeup any reader if it's waiting.
  152. */
  153. int
  154. zputaddr(Segment *s, uintptr va)
  155. {
  156. Zseg *zs;
  157. zs = &s->zseg;
  158. if((s->type&SG_ZIO) == 0)
  159. return -1;
  160. if((s->type&SG_KZIO) != 0){
  161. DBG("zputaddr: zmapfree %#ullx\n", va);
  162. zmapfree(s->zseg.map, va);
  163. dumpzseg(s);
  164. return 0;
  165. }
  166. if(zs->end == zs->naddr)
  167. zgrow(s);
  168. zs->addr[zs->end++] = va;
  169. if(zs->end == 1)
  170. wakeup(&zs->rr); /* in case anyone was waiting */
  171. DBG("zputaddr %#ullx\n", va);
  172. dumpzseg(s);
  173. return 0;
  174. }
  175. void*
  176. alloczio(Segment *s, int32_t len)
  177. {
  178. Zseg *zs;
  179. uintptr va;
  180. zs = &s->zseg;
  181. va = zmapalloc(zs->map, len);
  182. if(va == 0ULL)
  183. error("kernel zero copy segment exhausted");
  184. return UINT2PTR(va);
  185. }
  186. /*
  187. * Locate the kernel segment for zero copy here,
  188. * return it unlocked with a reference added.
  189. */
  190. Segment*
  191. getzkseg(void)
  192. {
  193. Segment *s;
  194. int i;
  195. qlock(&up->seglock);
  196. for(i = 0; i < NSEG; i++){
  197. s = up->seg[i];
  198. if(s != nil && (s->type&SG_KZIO) != 0){
  199. incref(s);
  200. qunlock(&up->seglock);
  201. DBG("getzkseg: %#p\n", s);
  202. return s;
  203. }
  204. }
  205. qunlock(&up->seglock);
  206. DBG("getzkseg: nil\n");
  207. return nil;
  208. }
  209. /*
  210. * This is the counterpart of devzread in some sense,
  211. * it reads in the traditional way from io[].
  212. */
  213. int32_t
  214. readzio(Kzio *io, int nio, void *a, int32_t count)
  215. {
  216. int32_t tot, nr;
  217. char *p;
  218. p = a;
  219. tot = 0;
  220. while(nio-- > 0){
  221. if(tot < count){
  222. nr = io->size;
  223. if(tot + nr > count)
  224. nr = count - tot;
  225. DBG("readzio: copy %#p %Z\n", p+tot, io);
  226. memmove(p+tot, io->data, nr);
  227. tot += nr;
  228. }
  229. qlock(&io->seg->lk);
  230. zputaddr(io->seg, PTR2UINT(io->data));
  231. qunlock(&io->seg->lk);
  232. putseg(io->seg);
  233. io->seg = nil;
  234. io++;
  235. }
  236. return tot;
  237. }
  238. int
  239. devzread(Chan *c, Kzio io[], int nio, usize tot, int64_t offset)
  240. {
  241. Segment *s;
  242. DBG("devzread %#p[%d]\n", io, nio);
  243. s = getzkseg();
  244. if(s == nil)
  245. error("no kernel segment for zero-copy");
  246. if(tot > Maxatomic)
  247. tot = Maxatomic;
  248. io[0].data = alloczio(s, tot);
  249. io[0].seg = s;
  250. if(waserror()){
  251. zputaddr(s, PTR2UINT(io[0].data));
  252. putseg(s);
  253. nexterror();
  254. }
  255. io[0].size = c->dev->read(c, io[0].data, tot, offset);
  256. poperror();
  257. return 1;
  258. }
  259. int
  260. devzwrite(Chan *c, Kzio io[], int nio, int64_t offset)
  261. {
  262. int i, j;
  263. int32_t tot;
  264. Block *bp;
  265. DBG("devzwrite %#p[%d]\n", io, nio);
  266. tot = 0;
  267. for(i = 0; i < nio; i++)
  268. tot += io[i].size;
  269. bp = nil;
  270. if(waserror()){
  271. if(bp != nil)
  272. freeb(bp);
  273. nexterror();
  274. }
  275. if(nio == 1)
  276. tot = c->dev->write(c, io[0].data, io[0].size, offset);
  277. else{
  278. bp = allocb(tot);
  279. if(bp == nil)
  280. error(Enomem);
  281. for(i = 0; i < nio; i++){
  282. DBG("devzwrite: copy %#p %Z\n", bp->wp, &io[i]);
  283. memmove(bp->wp, io[i].data, io[i].size);
  284. bp->wp += io[i].size;
  285. qlock(&io[i].seg->lk);
  286. if(zputaddr(io[i].seg, PTR2UINT(io[i].data)) < 0)
  287. panic("devzwrite: not a shared data segment");
  288. qunlock(&io[i].seg->lk);
  289. }
  290. tot = c->dev->bwrite(c, bp, offset);
  291. }
  292. j = 0;
  293. for(i = 0; i < nio; i++){
  294. io[i].data = nil; /* safety */
  295. io[i].seg = nil;
  296. putseg(io[i].seg);
  297. if(tot > 0)
  298. if(tot >= io[i].size)
  299. tot -= io[i].size;
  300. else
  301. io[i].size = tot;
  302. else{
  303. j = i;
  304. io[i].size = 0;
  305. }
  306. io[i].data = nil; /* safety */
  307. putseg(io[i].seg);
  308. io[i].seg = nil;
  309. }
  310. nio = j;
  311. poperror();
  312. return nio;
  313. }
  314. static void
  315. kernzio(Kzio *io)
  316. {
  317. Segment *s;
  318. void *data;
  319. Kzio uio;
  320. s = getzkseg();
  321. if(s == nil)
  322. error("can't use zero copy in this segment");
  323. uio = *io;
  324. data = alloczio(s, io->size);
  325. memmove(data, io->data, io->size);
  326. io->data = data;
  327. DBG("kernzio: copy %Z %Z\n", io, &uio);
  328. putseg(io->seg);
  329. io->seg = s;
  330. }
  331. /*
  332. * Zero copy I/O.
  333. * I/O is performed using an array of Zio structures.
  334. * Each one points to a shared buffer address indicating a length.
  335. * Each entry indicating a length and using nil as the address
  336. * is asking the system to allocate memory as needed (mread only).
  337. */
  338. static int
  339. ziorw(int fd, Zio *io, int nio, usize count, int64_t offset, int iswrite)
  340. {
  341. int i, n, isprw;
  342. Kzio *kio, skio[16];
  343. Chan *c;
  344. usize tot;
  345. if(nio <= 0 || nio > 512)
  346. error("wrong io[] size");
  347. zioinit();
  348. kio = nil;
  349. io = validaddr(io, sizeof io[0] * nio, 1);
  350. DBG("ziorw %d io%#p[%d] %uld %lld\n", fd, io, nio, count, offset);
  351. if(DBGFLG)
  352. for(i = 0; i < nio; i++)
  353. print("\tio%#p[%d] = %Z %s\n",
  354. io, i, (Kzio*)&io[i], iswrite?"w":"r");
  355. if(iswrite)
  356. c = fdtochan(fd, OWRITE, 1, 1);
  357. else
  358. c = fdtochan(fd, OREAD, 1, 1);
  359. isprw = offset != -1LL;
  360. if(isprw)
  361. offset = c->offset;
  362. if(waserror()){
  363. cclose(c);
  364. if(kio != nil){
  365. for(i = 0; i < nio; i++)
  366. if(kio[i].seg != nil)
  367. putseg(kio[i].seg);
  368. if(kio != skio)
  369. free(kio);
  370. }
  371. nexterror();
  372. }
  373. if(nio < nelem(skio))
  374. kio = skio;
  375. else
  376. kio = smalloc(sizeof kio[0] * nio);
  377. for(i = 0; i < nio; i++){
  378. kio[i].Zio = io[i];
  379. if(iswrite){
  380. kio[i].seg = seg(up, PTR2UINT(io[i].data), 1);
  381. if(kio[i].seg == nil)
  382. error("invalid address in zio");
  383. incref(kio[i].seg);
  384. qunlock(&kio[i].seg->lk);
  385. validaddr(kio[i].data, kio[i].size, 1);
  386. if((kio[i].seg->type&SG_ZIO) == 0){
  387. /*
  388. * It's not a segment where we can report
  389. * addresses to anyone once they are free.
  390. * So, allocate space in the kernel
  391. * and copy the user data there.
  392. */
  393. kernzio(&kio[i]);
  394. }
  395. assert(kio[i].seg->type&SG_ZIO);
  396. }else{
  397. kio[i].data = nil;
  398. kio[i].seg = nil;
  399. }
  400. }
  401. if(c->dev->zread == nil){
  402. DBG("installing devzread for %s\n", c->dev->name);
  403. c->dev->zread = devzread;
  404. }
  405. if(c->dev->zwrite == nil){
  406. DBG("installing devzwrite for %s\n", c->dev->name);
  407. c->dev->zwrite = devzwrite;
  408. }
  409. if(iswrite)
  410. n = c->dev->zwrite(c, kio, nio, offset);
  411. else
  412. n = c->dev->zread(c, kio, nio, count, offset);
  413. tot = 0;
  414. for(i = 0; i < n; i++){
  415. io[i] = kio[i].Zio;
  416. tot += kio[i].size;
  417. }
  418. if(!isprw){
  419. /* unlike in syswrite, we update offsets at the end */
  420. lock(c);
  421. c->devoffset += tot;
  422. c->offset += tot;
  423. unlock(c);
  424. }
  425. poperror();
  426. cclose(c);
  427. if(kio != skio)
  428. free(kio);
  429. return n;
  430. }
  431. void
  432. sysziopread(Ar0 *ar0, va_list list)
  433. {
  434. int fd, nio;
  435. int32_t count;
  436. int64_t offset;
  437. Zio *io;
  438. /*
  439. * int zpread(int fd, Zio *io[], int nio, usize count, vlong offset);
  440. */
  441. fd = va_arg(list, int);
  442. io = va_arg(list, Zio*);
  443. nio = va_arg(list, int);
  444. count = va_arg(list, usize);
  445. offset = va_arg(list, int64_t);
  446. ar0->i = ziorw(fd, io, nio, count, offset, 0);
  447. }
  448. void
  449. sysziopwrite(Ar0 *ar0, va_list list)
  450. {
  451. int fd, nio;
  452. int64_t offset;
  453. Zio *io;
  454. /*
  455. * int zpwrite(int fd, Zio *io[], int nio, vlong offset);
  456. */
  457. fd = va_arg(list, int);
  458. io = va_arg(list, Zio*);
  459. nio = va_arg(list, int);
  460. offset = va_arg(list, int64_t);
  461. ar0->i = ziorw(fd, io, nio, 0, offset, 1);
  462. }
  463. void
  464. sysziofree(Ar0 *, va_list list)
  465. {
  466. Zio *io;
  467. int nio, i;
  468. Segment *s;
  469. /*
  470. * zfree(Zio io[], int nio);
  471. */
  472. io = va_arg(list, Zio*);
  473. nio = va_arg(list, int);
  474. io = validaddr(io, sizeof io[0] * nio, 1);
  475. for(i = 0; i < nio; i++){
  476. s = seg(up, PTR2UINT(io[i].data), 1);
  477. if(s == nil)
  478. error("invalid address in zio");
  479. if((s->type&SG_ZIO) == 0){
  480. qunlock(&s->lk);
  481. error("segment is not a zero-copy segment");
  482. }
  483. zputaddr(s, PTR2UINT(io[i].data));
  484. qunlock(&s->lk);
  485. io[i].data = nil;
  486. io[i].size = 0;
  487. }
  488. }
  489. /*
  490. * This must go, but for now, we use Zmaps
  491. * to allocate messages within the shared kernel segment.
  492. * This is a simple first fist with a single fragment list.
  493. */
  494. void
  495. newzmap(Segment *s)
  496. {
  497. ZMap *zp;
  498. Map *mp;
  499. zioinit();
  500. if((s->type&SG_KZIO) == 0)
  501. panic("newzmap but not SG_KZIO");
  502. if(s->zseg.map != nil)
  503. panic("newzmap: already allocated");
  504. zp = smalloc(sizeof(ZMap));
  505. s->zseg.map = zp;
  506. mp = smalloc(sizeof(Map));
  507. mp->free = 1;
  508. mp->addr = s->base;
  509. mp->size = s->top - s->base;
  510. zp->map = mp;
  511. if(DBGFLG > 1){
  512. DBG("newzmap:\n");
  513. dumpzmap(zp);
  514. }
  515. }
  516. static void
  517. zmapfree(ZMap* rmap, uintptr addr)
  518. {
  519. Map *mp, *prev, *next;
  520. lock(rmap);
  521. if(waserror()){
  522. unlock(rmap);
  523. nexterror();
  524. }
  525. prev = nil;
  526. for(mp = rmap->map; mp != nil; mp = mp->next){
  527. if(mp->addr <= addr)
  528. break;
  529. prev = mp;
  530. }
  531. if(mp == nil)
  532. panic("zmapfree: no map");
  533. if(mp->free == 1)
  534. panic("zmapfree: already free");
  535. if(prev != nil && prev->free && prev->addr + prev->size == addr){
  536. prev->size += mp->size;
  537. prev->next = mp->next;
  538. free(mp);
  539. mp = prev;
  540. }
  541. next = mp->next;
  542. if(next != nil && next->free && mp->addr + mp->size == next->addr){
  543. mp->size += next->size;
  544. mp->next = next->next;
  545. mp->free = 1;
  546. free(next);
  547. }
  548. poperror();
  549. unlock(rmap);
  550. if(DBGFLG > 1){
  551. DBG("zmapfree %#ullx:\n", addr);
  552. dumpzmap(rmap);
  553. }
  554. }
  555. static uintptr
  556. zmapalloc(ZMap* rmap, usize size)
  557. {
  558. Map *mp, *nmp;
  559. lock(rmap);
  560. if(waserror()){
  561. unlock(rmap);
  562. nexterror();
  563. }
  564. for(mp = rmap->map; mp->free == 0 || mp->size < size; mp = mp->next)
  565. ;
  566. if(mp == nil){
  567. poperror();
  568. unlock(rmap);
  569. return 0ULL;
  570. }
  571. if(mp->free == 0)
  572. panic("zmapalloc: not free");
  573. if(mp->size > size){
  574. nmp = smalloc(sizeof *nmp);
  575. *nmp = *mp;
  576. nmp->addr += size;
  577. nmp->size -= size;
  578. nmp->free = 1;
  579. mp->size = size;
  580. mp->next = nmp;
  581. }
  582. mp->free = 0;
  583. poperror();
  584. unlock(rmap);
  585. if(DBGFLG > 1){
  586. DBG("zmapalloc %#ullx:\n", mp->addr);
  587. dumpzmap(rmap);
  588. }
  589. return mp->addr;
  590. }