/* dcache.c */
/*
 * Disk cache.
 *
 * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
 * Getdblock has a mode parameter that determines i/o and access to a block:
 * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
 * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
 * It is *not* marked dirty -- once changes have been made, they should be noted
 * by using dirtydblock() before putdblock().
 *
 * There is a global cache lock as well as a lock on each block.
 * Within a thread, the cache lock can be acquired while holding a block lock,
 * but not vice versa; and a block cannot be locked if you already hold the lock
 * on another block.
 *
 * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
 * For example, the DirtyArena blocks are all written to disk before any of the
 * DirtyArenaCib blocks.
 *
 * This code used to be in charge of flushing the dirty index blocks out to
 * disk, but updating the index turned out to benefit from extra care.
 * Now cached index blocks are never marked dirty. The index.c code takes
 * care of updating them behind our back, and uses _getdblock to update any
 * cached copies of the blocks as it changes them on disk.
 */
  26. #include "stdinc.h"
  27. #include "dat.h"
  28. #include "fns.h"
typedef struct DCache DCache;

enum
{
	HashLog		= 9,			/* log2 of the hash-table size */
	HashSize	= 1<<HashLog,		/* number of hash buckets */
	HashMask	= HashSize - 1,		/* mask producing a bucket index */
};
/*
 * The global disk cache: a fixed pool of uniformly-sized blocks,
 * indexed by a hash table on address and scavenged via a heap
 * ordered by second-to-last use time.
 */
struct DCache
{
	QLock		lock;		/* global cache lock (see file header for ordering) */
	RWLock		dirtylock;	/* must be held to inspect or set b->dirty */
	Rendez		full;		/* wait here when no block can be recycled; uses lock */
	Round		round;		/* drives the periodic flush proc */
	DBlock		*free;		/* list of available lumps */
	u32int		now;		/* ticks for usage timestamps */
	int		size;		/* max. size of any block; allocated to each block */
	DBlock		**heads;	/* hash table for finding address */
	int		nheap;		/* number of available victims */
	DBlock		**heap;		/* heap for locating victims */
	int		nblocks;	/* number of blocks allocated */
	DBlock		*blocks;	/* array of block descriptors */
	DBlock		**write;	/* array of block pointers to be written */
	u8int		*mem;		/* memory for all block descriptors */
	int		ndirty;		/* number of dirty blocks */
	int		maxdirty;	/* max. number of dirty blocks */
};
typedef struct Ra Ra;
/*
 * A (partition, address) pair.
 * NOTE(review): Ra is not referenced anywhere in this file —
 * presumably left over from (or reserved for) read-ahead support;
 * confirm against the rest of the tree before removing.
 */
struct Ra
{
	Part	*part;
	u64int	addr;
};

static DCache	dcache;		/* the single global disk cache instance */

static int	downheap(int i, DBlock *b);
static int	upheap(int i, DBlock *b);
static DBlock	*bumpdblock(void);
static void	delheap(DBlock *db);
static void	fixheap(int i, DBlock *b);
static void	flushproc(void*);
static void	writeproc(void*);
/*
 * Initialize the global disk cache to use roughly mem bytes of
 * block memory: every block gets maxblocksize bytes, so the cache
 * holds mem/maxblocksize blocks. All blocks start on the free list.
 * Also starts the flush proc and the delayed-kick proc for the
 * flush round.
 */
void
initdcache(u32int mem)
{
	DBlock *b, *last;
	u32int nblocks, blocksize;
	int i;
	u8int *p;

	if(mem < maxblocksize * 2)
		sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
	if(maxblocksize == 0)
		sysfatal("no max. block size given for disk cache");
	blocksize = maxblocksize;
	nblocks = mem / blocksize;
	dcache.full.l = &dcache.lock;	/* the rendezvous sleeps on the global cache lock */
	dcache.nblocks = nblocks;
	/* allow at most 2/3 of the cache to be dirty before forcing a flush */
	dcache.maxdirty = (nblocks * 2) / 3;
	trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
		nblocks, blocksize, dcache.maxdirty);
	dcache.size = blocksize;
	dcache.heads = MKNZ(DBlock*, HashSize);
	dcache.heap = MKNZ(DBlock*, nblocks);
	dcache.blocks = MKNZ(DBlock, nblocks);
	dcache.write = MKNZ(DBlock*, nblocks);
	/* over-allocate so the data area can be aligned to blocksize below */
	dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);

	last = nil;
	/* round up to the next blocksize boundary */
	p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
	for(i = 0; i < nblocks; i++){
		b = &dcache.blocks[i];
		b->data = &p[i * blocksize];
		b->heap = TWID32;	/* TWID32 means "not on the victim heap" */
		b->writedonechan = chancreate(sizeof(void*), 1);
		b->next = last;		/* thread every block onto the free list */
		last = b;
	}
	dcache.free = last;
	dcache.nheap = 0;
	setstat(StatDcacheSize, nblocks);
	/* round period 120*1000 — presumably milliseconds; see initround */
	initround(&dcache.round, "dcache", 120*1000);

	vtproc(flushproc, nil);
	vtproc(delaykickroundproc, &dcache.round);
}
  110. static u32int
  111. pbhash(u64int addr)
  112. {
  113. u32int h;
  114. #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
  115. h = (addr >> 32) ^ addr;
  116. return hashit(h);
  117. }
  118. DBlock*
  119. getdblock(Part *part, u64int addr, int mode)
  120. {
  121. DBlock *b;
  122. b = _getdblock(part, addr, mode, 1);
  123. if(mode == OREAD || mode == ORDWR)
  124. addstat(StatDcacheRead, 1);
  125. if(mode == OWRITE || mode == ORDWR)
  126. addstat(StatDcacheWrite, 1);
  127. return b;
  128. }
/*
 * Look up (part, addr) in the cache and return a referenced,
 * locked block: rlocked for OREAD, wlocked for ORDWR/OWRITE.
 * If load is zero, only a block already in the cache is returned
 * (nil on a miss). On a miss with load set, a victim block is
 * recycled and its data is read from disk -- or just zeroed when
 * mode is OWRITE, since the caller will overwrite it.
 * Returns nil on error (with seterr) or on a miss with load == 0.
 */
DBlock*
_getdblock(Part *part, u64int addr, int mode, int load)
{
	DBlock *b;
	u32int h, size, ms;

	ms = 0;
	trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
	size = part->blocksize;
	if(size > dcache.size){
		seterr(EAdmin, "block size %d too big for cache with size %d",
			size, dcache.size);
		if(load)
			addstat(StatDcacheLookup, 1);
		return nil;
	}
	h = pbhash(addr);

	/*
	 * look for the block in the cache
	 */
	qlock(&dcache.lock);
again:
	for(b = dcache.heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr){
			if(load)
				addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
			goto found;
		}
	}

	/*
	 * missed: locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	if(!load){
		qunlock(&dcache.lock);
		return nil;
	}

	/*
	 * Only start timer here, on cache miss - calling msec() on plain cache hits
	 * makes cache hits system-call bound.
	 */
	ms = msec();
	addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);

	b = bumpdblock();
	if(b == nil){
		/* every block is referenced or dirty: wait for a release, then rescan */
		trace(TraceBlock, "all disk cache blocks in use");
		addstat(StatDcacheStall, 1);
		rsleep(&dcache.full);
		addstat(StatDcacheStall, -1);
		goto again;
	}
	assert(!b->dirty);

	/*
	 * the new block has no last use, so assume it happens sometime in the middle
	 ZZZ this is not reasonable
	 */
	b->used = (b->used2 + dcache.now) / 2;

	/*
	 * rechain the block on the correct hash chain
	 */
	b->next = dcache.heads[h];
	dcache.heads[h] = b;
	if(b->next != nil)
		b->next->prev = b;
	b->prev = nil;

	b->addr = addr;
	b->part = part;
	b->size = 0;	/* nothing in b->data is valid yet; filled in below */

found:
	b->ref++;
	b->used2 = b->used;
	b->used = dcache.now++;
	if(b->heap != TWID32)
		fixheap(b->heap, b);	/* usage time changed: restore heap order */

	/* lazily start a per-partition write proc the first time the part is written */
	if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
		trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
		part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
		vtproc(writeproc, part);
	}

	qunlock(&dcache.lock);

	trace(TraceBlock, "getdblock lock");
	addstat(StatDblockStall, 1);
	if(mode == OREAD)
		rlock(&b->lock);
	else
		wlock(&b->lock);
	addstat(StatDblockStall, -1);
	trace(TraceBlock, "getdblock locked");

	if(b->size != size){
		/* need the write lock to fill in b->data */
		if(mode == OREAD){
			addstat(StatDblockStall, 1);
			runlock(&b->lock);
			wlock(&b->lock);
			addstat(StatDblockStall, -1);
		}
		if(b->size < size){
			if(mode == OWRITE)
				/* caller will overwrite the block: just zero the tail */
				memset(&b->data[b->size], 0, size - b->size);
			else{
				trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
				diskaccess(0);
				if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
					b->mode = ORDWR;	/* so putdblock wunlocks */
					putdblock(b);
					return nil;
				}
				trace(TraceBlock, "getdblock readpartdone");
				addstat(StatApartRead, 1);
				addstat(StatApartReadBytes, size-b->size);
			}
		}
		b->size = size;
		/* downgrade back to the read lock the caller asked for */
		if(mode == OREAD){
			addstat(StatDblockStall, 1);
			wunlock(&b->lock);
			rlock(&b->lock);
			addstat(StatDblockStall, -1);
		}
	}

	b->mode = mode;	/* remember how it is locked, for putdblock */
	trace(TraceBlock, "getdblock exit");
	if(ms)
		addstat(StatDcacheLookupTime, msec() - ms);
	return b;
}
/*
 * Release a block obtained from getdblock: drop the block lock
 * (read or write, according to b->mode), decrement the reference
 * count, and -- if the block is now unused and clean -- put it
 * back on the victim heap and wake anyone waiting for a block.
 * A nil argument is a no-op.
 */
void
putdblock(DBlock *b)
{
	if(b == nil)
		return;

	trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);

	if(b->mode == OREAD)
		runlock(&b->lock);
	else
		wunlock(&b->lock);

	qlock(&dcache.lock);
	if(--b->ref == 0 && !b->dirty){
		if(b->heap == TWID32)
			upheap(dcache.nheap++, b);
		rwakeupall(&dcache.full);
	}
	qunlock(&dcache.lock);
}
/*
 * Mark b dirty with the given dirty tag (e.g. DirtyArena).
 * The caller must hold b's write lock (mode ORDWR or OWRITE).
 * A block carries at most one dirty tag at a time: redirtying
 * with a different tag is an assertion failure. When the dirty
 * count reaches maxdirty the flush round is kicked immediately;
 * otherwise the delayed flush is merely re-armed.
 */
void
dirtydblock(DBlock *b, int dirty)
{
	int odirty;

	trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
		b->part->name, b->addr, dirty, getcallerpc(&b));
	assert(b->ref != 0);
	assert(b->mode==ORDWR || b->mode==OWRITE);

	odirty = b->dirty;
	if(b->dirty)
		assert(b->dirty == dirty);	/* only one dirty tag at a time */
	else
		b->dirty = dirty;

	qlock(&dcache.lock);
	if(!odirty){
		/* block went clean -> dirty: update the global count */
		dcache.ndirty++;
		setstat(StatDcacheDirty, dcache.ndirty);
		if(dcache.ndirty >= dcache.maxdirty)
			kickround(&dcache.round, 0);
		else
			delaykickround(&dcache.round);
	}
	qunlock(&dcache.lock);
}
  294. static void
  295. unchain(DBlock *b)
  296. {
  297. ulong h;
  298. /*
  299. * unchain the block
  300. */
  301. if(b->prev == nil){
  302. h = pbhash(b->addr);
  303. if(dcache.heads[h] != b)
  304. sysfatal("bad hash chains in disk cache");
  305. dcache.heads[h] = b->next;
  306. }else
  307. b->prev->next = b->next;
  308. if(b->next != nil)
  309. b->next->prev = b->prev;
  310. }
/*
 * remove some block from use and update the free list and counters
 */
static DBlock*
bumpdblock(void)
{
	DBlock *b;

	trace(TraceBlock, "bumpdblock enter");
	/* fast path: take a never-used block off the free list */
	b = dcache.free;
	if(b != nil){
		dcache.free = b->next;
		return b;
	}

	if(dcache.ndirty >= dcache.maxdirty)
		kickdcache();

	/*
	 * remove blocks until we find one that is unused
	 * referenced blocks are left in the heap even though
	 * they can't be scavenged; this is simply a speed optimization
	 */
	for(;;){
		if(dcache.nheap == 0){
			/* no victims at all: ask for a flush; caller sleeps and retries */
			kickdcache();
			trace(TraceBlock, "bumpdblock gotnothing");
			return nil;
		}
		b = dcache.heap[0];	/* block with the oldest second-to-last use */
		delheap(b);
		if(!b->ref && !b->dirty)
			break;
	}

	trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
	unchain(b);
	return b;
}
  346. void
  347. emptydcache(void)
  348. {
  349. DBlock *b;
  350. qlock(&dcache.lock);
  351. while(dcache.nheap > 0){
  352. b = dcache.heap[0];
  353. delheap(b);
  354. if(!b->ref && !b->dirty){
  355. unchain(b);
  356. b->next = dcache.free;
  357. dcache.free = b;
  358. }
  359. }
  360. qunlock(&dcache.lock);
  361. }
  362. /*
  363. * delete an arbitrary block from the heap
  364. */
  365. static void
  366. delheap(DBlock *db)
  367. {
  368. if(db->heap == TWID32)
  369. return;
  370. fixheap(db->heap, dcache.heap[--dcache.nheap]);
  371. db->heap = TWID32;
  372. }
  373. /*
  374. * push an element up or down to it's correct new location
  375. */
  376. static void
  377. fixheap(int i, DBlock *b)
  378. {
  379. if(upheap(i, b) == i)
  380. downheap(i, b);
  381. }
  382. static int
  383. upheap(int i, DBlock *b)
  384. {
  385. DBlock *bb;
  386. u32int now;
  387. int p;
  388. now = dcache.now;
  389. for(; i != 0; i = p){
  390. p = (i - 1) >> 1;
  391. bb = dcache.heap[p];
  392. if(b->used2 - now >= bb->used2 - now)
  393. break;
  394. dcache.heap[i] = bb;
  395. bb->heap = i;
  396. }
  397. dcache.heap[i] = b;
  398. b->heap = i;
  399. return i;
  400. }
  401. static int
  402. downheap(int i, DBlock *b)
  403. {
  404. DBlock *bb;
  405. u32int now;
  406. int k;
  407. now = dcache.now;
  408. for(; ; i = k){
  409. k = (i << 1) + 1;
  410. if(k >= dcache.nheap)
  411. break;
  412. if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
  413. k++;
  414. bb = dcache.heap[k];
  415. if(b->used2 - now <= bb->used2 - now)
  416. break;
  417. dcache.heap[i] = bb;
  418. bb->heap = i;
  419. }
  420. dcache.heap[i] = b;
  421. b->heap = i;
  422. return i;
  423. }
  424. static void
  425. findblock(DBlock *bb)
  426. {
  427. DBlock *b, *last;
  428. int h;
  429. last = nil;
  430. h = pbhash(bb->addr);
  431. for(b = dcache.heads[h]; b != nil; b = b->next){
  432. if(last != b->prev)
  433. sysfatal("bad prev link");
  434. if(b == bb)
  435. return;
  436. last = b;
  437. }
  438. sysfatal("block missing from hash table");
  439. }
/*
 * Consistency check over the whole cache: verifies heap ordering
 * and back-pointers, block data placement, hash-chain membership,
 * free-list sanity, and the invariant
 *	nheap + nfree + refed == nblocks.
 * Dies via sysfatal on any violation.
 */
void
checkdcache(void)
{
	DBlock *b;
	u32int size, now;
	int i, k, refed, nfree;

	qlock(&dcache.lock);
	size = dcache.size;
	now = dcache.now;
	for(i = 0; i < dcache.nheap; i++){
		if(dcache.heap[i]->heap != i)
			sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
		/* parent must not be younger than child; ages compared mod 2^32 */
		if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k = (i << 1) + 1;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k++;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < dcache.nblocks; i++){
		b = &dcache.blocks[i];
		/*
		 * NOTE(review): initdcache rounds the data area up to a
		 * blocksize boundary before assigning b->data, so this
		 * check appears to assume dcache.mem is already aligned
		 * -- confirm against the allocator's guarantees.
		 */
		if(b->data != &dcache.mem[i * size])
			sysfatal("dc: mis-blocked at %d", i);
		if(b->ref && b->heap == TWID32)
			refed++;	/* in use and not a victim candidate */
		if(b->addr)
			findblock(b);
		if(b->heap != TWID32
		&& dcache.heap[b->heap] != b)
			sysfatal("dc: spurious heap value");
	}

	nfree = 0;
	for(b = dcache.free; b != nil; b = b->next){
		if(b->addr != 0 || b->heap != TWID32)
			sysfatal("dc: bad free list");
		nfree++;
	}

	if(dcache.nheap + nfree + refed != dcache.nblocks)
		sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
	qunlock(&dcache.lock);
}
/*
 * Force a flush of all dirty blocks.
 * Second kickround argument presumably means "wait for the round
 * to complete" (kickdcache passes 0) -- see the Round code.
 */
void
flushdcache(void)
{
	trace(TraceProc, "flushdcache enter");
	kickround(&dcache.round, 1);
	trace(TraceProc, "flushdcache exit");
}
/*
 * Nudge the flush proc to start a round, without waiting for it.
 */
void
kickdcache(void)
{
	kickround(&dcache.round, 0);
}
/*
 * Write out the leading run of blocks in [b, eb) whose dirty tag
 * equals dirty: send each block to its partition's write proc
 * (so writes to different partitions proceed in parallel), wait
 * for every write to be acknowledged, then flush each partition
 * that was touched. Returns the number of blocks handled.
 */
static int
parallelwrites(DBlock **b, DBlock **eb, int dirty)
{
	DBlock **p, **q;
	Part *part;

	/* issue all the writes for this dirty tag */
	for(p=b; p<eb && (*p)->dirty == dirty; p++){
		assert(b<=p && p<eb);
		sendp((*p)->part->writechan, *p);
	}
	q = p;
	/* wait for each write proc to signal completion */
	for(p=b; p<q; p++){
		assert(b<=p && p<eb);
		recvp((*p)->writedonechan);
	}

	/*
	 * Flush the partitions that have been written to.
	 * Blocks are sorted by partition, so each partition
	 * appears as one contiguous run.
	 */
	part = nil;
	for(p=b; p<q; p++){
		if(part != (*p)->part){
			part = (*p)->part;
			flushpart(part);	/* what if it fails? */
		}
	}

	return p-b;
}
  522. /*
  523. * Sort first by dirty flag, then by partition, then by address in partition.
  524. */
  525. static int
  526. writeblockcmp(const void *va, const void *vb)
  527. {
  528. DBlock *a, *b;
  529. a = *(DBlock**)va;
  530. b = *(DBlock**)vb;
  531. if(a->dirty != b->dirty)
  532. return a->dirty - b->dirty;
  533. if(a->part != b->part){
  534. if(a->part < b->part)
  535. return -1;
  536. if(a->part > b->part)
  537. return 1;
  538. }
  539. if(a->addr < b->addr)
  540. return -1;
  541. return 1;
  542. }
/*
 * The flush proc: on every kick of dcache.round, collect all dirty
 * blocks, sort them by (dirty tag, partition, address), and write
 * them out one dirty tag at a time -- every block with tag j
 * completes before any block with tag j+1 is issued. Afterwards,
 * update the dirty accounting and return cleaned blocks to the
 * victim heap. Runs forever; started by initdcache.
 */
static void
flushproc(void *v)
{
	int i, j, n;
	ulong t0;
	DBlock *b, **write;

	USED(v);
	threadsetname("flushproc");
	for(;;){
		waitforkick(&dcache.round);

		trace(TraceWork, "start");
		t0 = nsec()/1000;
		trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);

		/* snapshot all currently-dirty blocks into the write array */
		write = dcache.write;
		n = 0;
		for(i=0; i<dcache.nblocks; i++){
			b = &dcache.blocks[i];
			if(b->dirty)
				write[n++] = b;
		}

		qsort(write, n, sizeof(write[0]), writeblockcmp);

		/* Write each stage of blocks out. */
		trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
		i = 0;
		for(j=1; j<DirtyMax; j++){
			trace(TraceProc, "writeblocks.%d t=%lud",
				j, (ulong)(nsec()/1000)-t0);
			i += parallelwrites(write+i, write+n, j);
		}
		if(i != n){
			/* every dirty tag should lie in [1, DirtyMax) */
			fprint(2, "in flushproc i=%d n=%d\n", i, n);
			for(i=0; i<n; i++)
				fprint(2, "\tblock %d: dirty=%d\n",
					i, write[i]->dirty);
			abort();
		}

		/*
		 * b->dirty is protected by b->lock while ndirty is protected
		 * by dcache.lock, so the --ndirty below is the delayed one
		 * from clearing b->dirty in the write proc. It may happen
		 * that some other proc has come along and redirtied b since
		 * the write. That's okay, it just means that ndirty may be
		 * one too high until we catch up and do the decrement.
		 */
		trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
		qlock(&dcache.lock);
		for(i=0; i<n; i++){
			b = write[i];
			--dcache.ndirty;
			/* block is now clean and unreferenced: make it a victim again */
			if(b->ref == 0 && b->heap == TWID32){
				upheap(dcache.nheap++, b);
				rwakeupall(&dcache.full);
			}
		}
		setstat(StatDcacheDirty, dcache.ndirty);
		qunlock(&dcache.lock);
		addstat(StatDcacheFlush, 1);
		trace(TraceWork, "finish");
	}
}
/*
 * Per-partition writer: receives dirty blocks on p->writechan,
 * writes each one to disk under the block's write lock, clears
 * its dirty tag, and acknowledges on the block's writedonechan.
 * Started lazily by _getdblock the first time a partition is
 * opened for writing. Runs forever.
 */
static void
writeproc(void *v)
{
	DBlock *b;
	Part *p;

	p = v;
	threadsetname("writeproc:%s", p->name);
	for(;;){
		b = recvp(p->writechan);
		trace(TraceWork, "start");
		assert(b->part == p);
		trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
		wlock(&b->lock);
		trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
		diskaccess(0);
		/* on write error: log and continue; the dirty tag is cleared regardless */
		if(writepart(p, b->addr, b->data, b->size) < 0)
			fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
				argv0, p->name, b->addr);
		addstat(StatApartWrite, 1);
		addstat(StatApartWriteBytes, b->size);
		b->dirty = 0;
		wunlock(&b->lock);
		trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
		trace(TraceWork, "finish");
		sendp(b->writedonechan, b);
	}
}