/* cache.c */

#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	VtLock	*dirtylk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,	/* BtData | 0 */
	VtPointerType0,	/* BtData | 1 */
	VtPointerType1,	/* BtData | 2 */
	VtPointerType2,	/* BtData | 3 */
	VtPointerType3,	/* BtData | 4 */
	VtPointerType4,	/* BtData | 5 */
	VtPointerType5,	/* BtData | 6 */
	VtPointerType6,	/* BtData | 7 */
	VtDirType,	/* BtDir | 0 */
	VtPointerType0,	/* BtDir | 1 */
	VtPointerType1,	/* BtDir | 2 */
	VtPointerType2,	/* BtDir | 3 */
	VtPointerType3,	/* BtDir | 4 */
	VtPointerType4,	/* BtDir | 5 */
	VtPointerType5,	/* BtDir | 6 */
	VtPointerType6,	/* BtDir | 7 */
};
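
/*
 * Reading the table above: a tree block at depth n > 0 above data
 * maps to VtPointerType(n-1), so for example vtType[BtData|3] is
 * VtPointerType2 and vtType[BtDir|1] is VtPointerType0.
 * (Illustrative note, not part of the original source.)
 */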

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}
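
/*
 * Minimal usage sketch (illustrative only; disk and z are assumed
 * to have been set up elsewhere, and 1000 is an arbitrary count):
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	b = cacheLocal(c, PartData, addr, OReadOnly);
 *	...use b->data...
 *	blockPut(b);
 *	cacheFree(c);
 */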

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}

/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}

	if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);

	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;
	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
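/*
 * (If waitlock is 0, the caller must pass a non-nil lockfailure:
 * when vtCanLock fails, *lockfailure is set to 1 and nil is
 * returned, so the caller can distinguish "not cached" from
 * "cached but locked".)
 */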
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}
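
	/*
	 * Venti blocks have no disk address, so the low four bytes
	 * of the score, read little-endian, serve as the hash key here.
	 */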
	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
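/*
 * Allocation policy, as implemented below: scan the label blocks
 * round-robin starting just after fl->last, wrapping around at most
 * twice; accept a block whose label is BsFree, or one that is closed
 * and whose close epoch is at or below epochLow (so no remaining
 * snapshot can still reference it).
 */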
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				vtSetError("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
						argv0);
				fl->last = 0;
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	c = b->c;	/* assign before use: the debug print below reads c->nheap */

	if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;
	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *	  BaActive vs. BaCopied, but that's handled
	 *	  by direct calls to _blockSetLabel.
	 */
	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->dirtylk);
	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
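/*
 * (dmap, sized c->ndmap in cacheAlloc, is used below as a scratch
 * bitmap over prior-list indices: a set bit means a dependency on
 * that index has already been satisfied, so any older dependency
 * on the same index can be dropped.)
 */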
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)	/* index -1 has no bitmap slot */
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
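/*
 * A worked sequence restating the rules above (illustrative only):
 * to replace child b under parent p, blockCopy allocates bb
 * (cacheAllocBlock already made bb depend on its label block lbb),
 * the caller updates p's pointer and records "p after bb" with
 * blockDependency, and calls blockRemoveLink so that b is marked
 * closed only after p reaches disk.  The resulting disk ordering
 * is: lbb, then bb, then p, then lb.
 */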
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	p = blistAlloc(b);
	if(p == nil){
		/*
		 * We were out of blists so blistAlloc wrote b to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */
			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		vtLock(c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		vtUnlock(c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}
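
/*
 * The heap routines below order blocks by b->used - c->now using
 * unsigned subtraction, comparing ages relative to the current tick
 * rather than raw timestamps; presumably this keeps the ordering
 * correct even after the tick counter c->now wraps around.
 */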
static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;

	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
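/*
 * (Queue bookkeeping, as used below: c->baddr[c->br..c->be) is the
 * current batch of sorted dirty-block addresses; br advances as
 * entries are tried, and entries that couldn't be written yet are
 * requeued at c->bw for the next pass.)
 */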
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	/*
	 * Lock c->dirtylk so that more blocks aren't being dirtied
	 * while we try to write out what's already here.
	 * Otherwise we might not ever finish!
	 */
	vtLock(c->dirtylk);
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
			// consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
			//	c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
		// consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}