fixarenas.c

  1. /*
  2. * Check and fix an arena partition.
  3. *
  4. * This is a lot grittier than the rest of Venti because
  5. * it can't just give up if a byte here or there is wrong.
  6. *
  7. * The rule here (hopefully followed!) is that block corruption
  8. * only ever has a local effect -- there are no blocks that you
  9. * can wipe out that will cause large portions of
  10. * uncorrupted data blocks to be useless.
  11. */
  12. #include "stdinc.h"
  13. #include "dat.h"
  14. #include "fns.h"
  15. #include "whack.h"
  16. #define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1))
  17. #pragma varargck type "z" uvlong
  18. #pragma varargck type "z" vlong
  19. #pragma varargck type "t" uint
  20. enum
  21. {
  22. K = 1024,
  23. M = 1024*1024,
  24. G = 1024*1024*1024,
  25. Block = 4096,
  26. };
  27. int debugsha1;
  28. int verbose;
  29. Part *part;
  30. char *file;
  31. char *basename;
  32. char *dumpbase;
  33. int fix;
  34. int badreads;
  35. int unseal;
  36. uchar zero[MaxDiskBlock];
  37. Arena lastarena;
  38. ArenaPart ap;
  39. uvlong arenasize;
  40. int nbadread;
  41. int nbad;
  42. uvlong partend;
  43. void checkarena(vlong, int);
  44. void
  45. usage(void)
  46. {
  47. fprint(2, "usage: fixarenas [-fUv] [-a arenasize] [-b blocksize] [-n name] [-x dumpbase] file [ranges]\n");
  48. threadexitsall(0);
  49. }
  50. /*
  51. * Format number in simplest way that is okay with unittoull.
  52. */
  53. static int
  54. zfmt(Fmt *fmt)
  55. {
  56. vlong x;
  57. x = va_arg(fmt->args, vlong);
  58. if(x == 0)
  59. return fmtstrcpy(fmt, "0");
  60. if(x%G == 0)
  61. return fmtprint(fmt, "%lldG", x/G);
  62. if(x%M == 0)
  63. return fmtprint(fmt, "%lldM", x/M);
  64. if(x%K == 0)
  65. return fmtprint(fmt, "%lldK", x/K);
  66. return fmtprint(fmt, "%lld", x);
  67. }
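/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the %z rule above picks the largest unit (G, M, K) that divides the
 * value exactly, so the printed form can be fed back to a unit-suffix parser
 * such as unittoull. Portable C, compile separately to experiment.

#include <stdio.h>

static void
zprint(long long x)
{
	long long K = 1024, M = K*1024, G = M*1024;

	if(x != 0 && x%G == 0)
		printf("%lldG\n", x/G);
	else if(x != 0 && x%M == 0)
		printf("%lldM\n", x/M);
	else if(x != 0 && x%K == 0)
		printf("%lldK\n", x/K);
	else
		printf("%lld\n", x);
}

int
main(void)
{
	zprint(0);			// 0
	zprint(8192);			// 8K
	zprint(512*1024*1024);		// 512M
	zprint(512*1024*1024 + 1);	// 536870913
	return 0;
}
*/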
  68. /*
  69. * Format time like ctime without newline.
  70. */
  71. static int
  72. tfmt(Fmt *fmt)
  73. {
  74. uint t;
  75. char buf[30];
  76. t = va_arg(fmt->args, uint);
  77. strcpy(buf, ctime(t));
  78. buf[28] = 0;
  79. return fmtstrcpy(fmt, buf);
  80. }
  81. /*
  82. * Coalesce messages about unreadable sectors into larger ranges.
  83. * bad(nil, 0, 0) flushes the buffer.
  84. */
  85. static void
  86. bad(char *msg, vlong o, int len)
  87. {
  88. static vlong lb0, lb1;
  89. static char *lmsg;
  90. if(msg == nil)
  91. msg = lmsg;
  92. if(o == -1){
  93. lmsg = nil;
  94. lb0 = 0;
  95. lb1 = 0;
  96. return;
  97. }
  98. if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
  99. if(lb0 != lb1)
  100. print("%s %#llux+%#llux (%,lld+%,lld)\n",
  101. lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
  102. lb0 = o;
  103. }
  104. lmsg = msg;
  105. lb1 = o+len;
  106. }
  107. /*
  108. * Read in the len bytes of data at the offset. If we can't, for whatever reason,
  109. * fill it with garbage but print an error.
  110. */
  111. static uchar*
  112. readdisk(uchar *buf, vlong offset, int len)
  113. {
  114. int i, j, k, n;
  115. if(offset >= partend){
  116. memset(buf, 0xFB, len);
  117. return buf;
  118. }
  119. if(offset+len > partend){
  120. memset(buf, 0xFB, len);
  121. len = partend - offset;
  122. }
  123. if(readpart(part, offset, buf, len) >= 0)
  124. return buf;
  125. /*
  126. * The read failed. Clear the buffer to nonsense, and
  127. * then try reading in smaller pieces. If that fails,
  128. * read in even smaller pieces. And so on down to sectors.
  129. */
  130. memset(buf, 0xFD, len);
  131. for(i=0; i<len; i+=64*K){
  132. n = 64*K;
  133. if(i+n > len)
  134. n = len-i;
  135. if(readpart(part, offset+i, buf+i, n) >= 0)
  136. continue;
  137. for(j=i; j<len && j<i+64*K; j+=4*K){
  138. n = 4*K;
  139. if(j+n > len)
  140. n = len-j;
  141. if(readpart(part, offset+j, buf+j, n) >= 0)
  142. continue;
  143. for(k=j; k<len && k<j+4*K; k+=512){
  144. if(readpart(part, offset+k, buf+k, 512) >= 0)
  145. continue;
  146. bad("disk read failed at", offset+k, 512);
  147. badreads++;
  148. }
  149. }
  150. }
  151. bad(nil, 0, 0);
  152. return buf;
  153. }
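/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: readdisk's retry ladder in isolation. When a large read fails it is
 * retried in 64K pieces, failed 64K pieces in 4K pieces, and failed 4K pieces
 * sector by sector, so a single bad sector costs only 512 bytes of garbage.
 * readsim below is a made-up stand-in for readpart that fails on one 4K
 * stretch; compile separately to run.

#include <stdio.h>
#include <string.h>

enum { K = 1024, Sector = 512, BadLo = 100*K, BadHi = 104*K };

static int
readsim(long long offset, unsigned char *buf, int len)
{
	if(offset < BadHi && offset+len > BadLo)
		return -1;		// pretend this range is unreadable
	memset(buf, 0xAA, len);		// pretend this is good data
	return len;
}

static void
readretry(unsigned char *buf, long long offset, int len)
{
	int i, j, k, n;

	if(readsim(offset, buf, len) >= 0)
		return;
	memset(buf, 0xFD, len);
	for(i=0; i<len; i+=64*K){
		n = 64*K;
		if(i+n > len)
			n = len-i;
		if(readsim(offset+i, buf+i, n) >= 0)
			continue;
		for(j=i; j<len && j<i+64*K; j+=4*K){
			n = 4*K;
			if(j+n > len)
				n = len-j;
			if(readsim(offset+j, buf+j, n) >= 0)
				continue;
			for(k=j; k<len && k<j+4*K; k+=Sector)
				if(readsim(offset+k, buf+k, Sector) < 0)
					printf("unreadable sector at %lld\n", offset+k);
		}
	}
}

int
main(void)
{
	static unsigned char buf[256*K];

	readretry(buf, 0, sizeof buf);	// reports the 8 sectors in [100K, 104K)
	return 0;
}
*/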
  154. /*
  155. * Buffer to support running SHA1 hash of the disk.
  156. */
  157. typedef struct Shabuf Shabuf;
  158. struct Shabuf
  159. {
  160. int fd; /* if >0, the hashed bytes are also written here (sbdebug) */
  161. vlong offset; /* next disk offset to be hashed */
  162. DigestState state;
  163. int rollback; /* keep checkpoints so sbrollback can rewind */
  164. vlong r0; /* disk offset where hashing started */
  165. DigestState *hist; /* checkpointed states, one per 4M of input */
  166. int nhist;
  167. };
  168. void
  169. sbdebug(Shabuf *sb, char *file)
  170. {
  171. int fd;
  172. if(sb->fd > 0){
  173. close(sb->fd);
  174. sb->fd = 0;
  175. }
  176. if((fd = create(file, OWRITE, 0666)) < 0)
  177. return;
  178. if(fd == 0){
  179. fd = dup(fd, -1);
  180. close(0);
  181. }
  182. sb->fd = fd;
  183. }
  184. void
  185. sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
  186. {
  187. int n, x;
  188. vlong o;
  189. if(sb->rollback && !sb->hist){
  190. sb->r0 = offset;
  191. sb->nhist = 1;
  192. sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
  193. memset(sb->hist, 0, sizeof sb->hist[0]);
  194. }
  195. if(sb->r0 == 0)
  196. sb->r0 = offset;
  197. if(sb->offset < offset || sb->offset >= offset+len){
  198. if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
  199. p, offset, len, sb->offset);
  200. return;
  201. }
  202. x = sb->offset - offset;
  203. if(0) print("sbupdate %p %#llux+%d skip %d\n",
  204. sb, offset, len, x);
  205. if(x){
  206. p += x;
  207. offset += x;
  208. len -= x;
  209. }
  210. assert(sb->offset == offset);
  211. if(sb->fd > 0)
  212. pwrite(sb->fd, p, len, offset - sb->r0);
  213. if(!sb->rollback){
  214. sha1(p, len, nil, &sb->state);
  215. sb->offset += len;
  216. return;
  217. }
  218. /* save state every 4M so we can roll back quickly */
  219. o = offset - sb->r0;
  220. while(len > 0){
  221. n = 4*M - o%(4*M);
  222. if(n > len)
  223. n = len;
  224. sha1(p, n, nil, &sb->state);
  225. sb->offset += n;
  226. o += n;
  227. p += n;
  228. len -= n;
  229. if(o%(4*M) == 0){
  230. x = o/(4*M);
  231. if(x >= sb->nhist){
  232. if(x != sb->nhist)
  233. print("oops! x=%d nhist=%d\n", x, sb->nhist);
  234. sb->nhist += 32;
  235. sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
  236. }
  237. sb->hist[x] = sb->state;
  238. }
  239. }
  240. }
  241. void
  242. sbdiskhash(Shabuf *sb, vlong eoffset)
  243. {
  244. static uchar dbuf[4*M];
  245. int n;
  246. while(sb->offset < eoffset){
  247. n = sizeof dbuf;
  248. if(sb->offset+n > eoffset)
  249. n = eoffset - sb->offset;
  250. readdisk(dbuf, sb->offset, n);
  251. sbupdate(sb, dbuf, sb->offset, n);
  252. }
  253. }
  254. void
  255. sbrollback(Shabuf *sb, vlong offset)
  256. {
  257. int x;
  258. vlong o;
  259. Dir d;
  260. if(!sb->rollback || !sb->r0){
  261. print("cannot rollback sha\n");
  262. return;
  263. }
  264. if(offset >= sb->offset)
  265. return;
  266. o = offset - sb->r0;
  267. x = o/(4*M);
  268. if(x >= sb->nhist){
  269. print("cannot rollback sha\n");
  270. return;
  271. }
  272. sb->state = sb->hist[x];
  273. sb->offset = sb->r0 + x*4*M;
  274. assert(sb->offset <= offset);
  275. if(sb->fd > 0){
  276. nulldir(&d);
  277. d.length = sb->offset - sb->r0;
  278. dirfwstat(sb->fd, &d);
  279. }
  280. }
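/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the rollback trick above, with a tiny FNV-1a running state standing in
 * for SHA-1 so the sketch needs no crypto library. A snapshot of the digest
 * state is saved at every chunk boundary (4M in the real code, 8 bytes here),
 * so rewinding to an earlier offset only re-hashes from the nearest
 * checkpoint. Compile separately to run.

#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t h; } State;
enum { Chunk = 8, Len = 64 };

static void
update(State *s, unsigned char *p, int n)
{
	while(n-- > 0){
		s->h ^= *p++;
		s->h *= 1099511628211ULL;	// FNV-1a prime
	}
}

int
main(void)
{
	unsigned char data[Len];
	State state, hist[Len/Chunk + 1];
	int i, off;

	for(i=0; i<Len; i++)
		data[i] = i;
	state.h = 14695981039346656037ULL;	// FNV-1a offset basis
	hist[0] = state;
	for(off=0; off<Len; off+=Chunk){	// hash forward, checkpointing
		update(&state, data+off, Chunk);
		hist[off/Chunk + 1] = state;
	}
	state = hist[24/Chunk];			// roll back to offset 24
	update(&state, data+24, Len-24);	// re-hash only the tail
	printf("%d\n", state.h == hist[Len/Chunk].h);	// 1: same digest as the straight run
	return 0;
}
*/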
  281. void
  282. sbscore(Shabuf *sb, uchar *score)
  283. {
  284. if(sb->hist){
  285. free(sb->hist);
  286. sb->hist = nil;
  287. }
  288. sha1(nil, 0, score, &sb->state);
  289. }
  290. /*
  291. * If we're fixing arenas, then editing this memory edits the disk!
  292. * It will be written back out as new data is paged in.
  293. */
  294. uchar buf[4*M];
  295. uchar sbuf[4*M];
  296. vlong bufoffset;
  297. int buflen;
  298. static void pageout(void);
  299. static uchar*
  300. pagein(vlong offset, int len)
  301. {
  302. pageout();
  303. if(offset >= partend){
  304. memset(buf, 0xFB, sizeof buf);
  305. return buf;
  306. }
  307. if(offset+len > partend){
  308. memset(buf, 0xFB, sizeof buf);
  309. len = partend - offset;
  310. }
  311. bufoffset = offset;
  312. buflen = len;
  313. readdisk(buf, offset, len);
  314. memmove(sbuf, buf, len);
  315. return buf;
  316. }
  317. static void
  318. pageout(void)
  319. {
  320. if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
  321. buflen = 0;
  322. return;
  323. }
  324. if(writepart(part, bufoffset, buf, buflen) < 0)
  325. print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
  326. bufoffset, buflen, bufoffset, buflen);
  327. buflen = 0;
  328. }
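/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: pagein/pageout above form a one-window write-back cache. A pristine
 * shadow copy is kept beside the working buffer, and at flush time the window
 * is written out only if the two differ (the real function also requires -f),
 * so a read-only pass never writes and a fixing pass rewrites only edited
 * blocks. The "disk" below is a made-up in-memory array; compile separately.

#include <stdio.h>
#include <string.h>

static unsigned char disk[4096];	// stand-in for the partition
static int nwrites;

static void readblock(long long off, unsigned char *p, int len)  { memcpy(p, disk+off, len); }
static void writeblock(long long off, unsigned char *p, int len) { memcpy(disk+off, p, len); nwrites++; }

static unsigned char win[256], shadow[256];
static long long winoff;
static int winlen;

static void
flush(void)
{
	if(winlen && memcmp(win, shadow, winlen) != 0)	// write back only if edited
		writeblock(winoff, win, winlen);
	winlen = 0;
}

static unsigned char*
window(long long off, int len)
{
	flush();
	readblock(off, win, len);
	memcpy(shadow, win, len);
	winoff = off;
	winlen = len;
	return win;
}

int
main(void)
{
	window(0, 256);			// untouched window: no write on flush
	flush();
	window(256, 256)[10] = 0xFF;	// edited window
	flush();
	printf("writes issued: %d\n", nwrites);	// 1
	return 0;
}
*/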
  329. static void
  330. zerorange(vlong offset, int len)
  331. {
  332. int i;
  333. vlong ooff;
  334. int olen;
  335. enum { MinBlock = 4*K, MaxBlock = 8*K };
  336. if(0)
  337. if(bufoffset <= offset && offset+len <= bufoffset+buflen){
  338. memset(buf+(offset-bufoffset), 0, len);
  339. return;
  340. }
  341. ooff = bufoffset;
  342. olen = buflen;
  343. i = offset%MinBlock;
  344. if(i+len < MaxBlock){
  345. pagein(offset-i, (i+len+MinBlock-1)&~(MinBlock-1));
  346. memset(buf+i, 0, len);
  347. }else{
  348. pagein(offset-i, MaxBlock);
  349. memset(buf+i, 0, MaxBlock-i);
  350. offset += MaxBlock-i;
  351. len -= MaxBlock-i;
  352. while(len >= MaxBlock){
  353. pagein(offset, MaxBlock);
  354. memset(buf, 0, MaxBlock);
  355. offset += MaxBlock;
  356. len -= MaxBlock;
  357. }
  358. pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
  359. memset(buf, 0, len);
  360. }
  361. pagein(ooff, olen);
  362. }
  363. /*
  364. * read/write integers
  365. *
  366. static void
  367. p16(uchar *p, u16int u)
  368. {
  369. p[0] = (u>>8) & 0xFF;
  370. p[1] = u & 0xFF;
  371. }
  372. */
  373. static u16int
  374. u16(uchar *p)
  375. {
  376. return (p[0]<<8)|p[1];
  377. }
  378. static void
  379. p32(uchar *p, u32int u)
  380. {
  381. p[0] = (u>>24) & 0xFF;
  382. p[1] = (u>>16) & 0xFF;
  383. p[2] = (u>>8) & 0xFF;
  384. p[3] = u & 0xFF;
  385. }
  386. static u32int
  387. u32(uchar *p)
  388. {
  389. return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
  390. }
  391. /*
  392. static void
  393. p64(uchar *p, u64int u)
  394. {
  395. p32(p, u>>32);
  396. p32(p+4, u);
  397. }
  398. */
  399. static u64int
  400. u64(uchar *p)
  401. {
  402. return ((u64int)u32(p)<<32) | u32(p+4);
  403. }
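/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the on-disk fields are big-endian, which is what u16/u32/u64 and p32
 * above decode and encode. A portable round trip; compile separately to run.

#include <stdio.h>
#include <stdint.h>

static void
put32(unsigned char *p, uint32_t u)
{
	p[0] = u>>24; p[1] = u>>16; p[2] = u>>8; p[3] = u;
}

static uint32_t
get32(unsigned char *p)
{
	return ((uint32_t)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
}

static uint64_t
get64(unsigned char *p)
{
	return ((uint64_t)get32(p)<<32) | get32(p+4);
}

int
main(void)
{
	unsigned char b[8];

	put32(b, 0xCAFEBABE);
	put32(b+4, 0x12345678);
	printf("%x %llx\n", get32(b), (unsigned long long)get64(b));	// cafebabe cafebabe12345678
	return 0;
}
*/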
  404. static int
  405. vlongcmp(const void *va, const void *vb)
  406. {
  407. vlong a, b;
  408. a = *(vlong*)va;
  409. b = *(vlong*)vb;
  410. if(a < b)
  411. return -1;
  412. if(a > b)
  413. return 1;
  414. return 0;
  415. }
  416. /* D and S are in draw.h */
  417. #define D VD
  418. #define S VS
  419. enum
  420. {
  421. D = 0x10000,
  422. Z = 0x20000,
  423. S = 0x30000,
  424. T = 0x40000,
  425. N = 0xFFFF
  426. };
  427. typedef struct Info Info;
  428. struct Info
  429. {
  430. int len;
  431. char *name;
  432. };
  433. Info partinfo[] = {
  434. 4, "magic",
  435. D|4, "version",
  436. Z|4, "blocksize",
  437. 4, "arenabase",
  438. 0
  439. };
  440. Info headinfo4[] = {
  441. 4, "magic",
  442. D|4, "version",
  443. S|ANameSize, "name",
  444. Z|4, "blocksize",
  445. Z|8, "size",
  446. 0
  447. };
  448. Info headinfo5[] = {
  449. 4, "magic",
  450. D|4, "version",
  451. S|ANameSize, "name",
  452. Z|4, "blocksize",
  453. Z|8, "size",
  454. 4, "clumpmagic",
  455. 0
  456. };
  457. Info tailinfo4[] = {
  458. 4, "magic",
  459. D|4, "version",
  460. S|ANameSize, "name",
  461. D|4, "clumps",
  462. D|4, "cclumps",
  463. T|4, "ctime",
  464. T|4, "wtime",
  465. D|8, "used",
  466. D|8, "uncsize",
  467. 1, "sealed",
  468. 0
  469. };
  470. Info tailinfo4a[] = {
  471. /* tailinfo 4 */
  472. 4, "magic",
  473. D|4, "version",
  474. S|ANameSize, "name",
  475. D|4, "clumps",
  476. D|4, "cclumps",
  477. T|4, "ctime",
  478. T|4, "wtime",
  479. D|8, "used",
  480. D|8, "uncsize",
  481. 1, "sealed",
  482. /* mem stats */
  483. 1, "extension",
  484. D|4, "mem.clumps",
  485. D|4, "mem.cclumps",
  486. D|8, "mem.used",
  487. D|8, "mem.uncsize",
  488. 1, "mem.sealed",
  489. 0
  490. };
  491. Info tailinfo5[] = {
  492. 4, "magic",
  493. D|4, "version",
  494. S|ANameSize, "name",
  495. D|4, "clumps",
  496. D|4, "cclumps",
  497. T|4, "ctime",
  498. T|4, "wtime",
  499. 4, "clumpmagic",
  500. D|8, "used",
  501. D|8, "uncsize",
  502. 1, "sealed",
  503. 0
  504. };
  505. Info tailinfo5a[] = {
  506. /* tailinfo 5 */
  507. 4, "magic",
  508. D|4, "version",
  509. S|ANameSize, "name",
  510. D|4, "clumps",
  511. D|4, "cclumps",
  512. T|4, "ctime",
  513. T|4, "wtime",
  514. 4, "clumpmagic",
  515. D|8, "used",
  516. D|8, "uncsize",
  517. 1, "sealed",
  518. /* mem stats */
  519. 1, "extension",
  520. D|4, "mem.clumps",
  521. D|4, "mem.cclumps",
  522. D|8, "mem.used",
  523. D|8, "mem.uncsize",
  524. 1, "mem.sealed",
  525. 0
  526. };
  527. void
  528. showdiffs(uchar *want, uchar *have, int len, Info *info)
  529. {
  530. int n;
  531. while(len > 0 && (n=info->len&N) > 0){
  532. if(memcmp(have, want, n) != 0){
  533. switch(info->len){
  534. case 1:
  535. print("\t%s: correct=%d disk=%d\n",
  536. info->name, *want, *have);
  537. break;
  538. case 4:
  539. print("\t%s: correct=%#ux disk=%#ux\n",
  540. info->name, u32(want), u32(have));
  541. break;
  542. case D|4:
  543. print("\t%s: correct=%,ud disk=%,ud\n",
  544. info->name, u32(want), u32(have));
  545. break;
  546. case T|4:
  547. print("\t%s: correct=%t\n\t\tdisk=%t\n",
  548. info->name, u32(want), u32(have));
  549. break;
  550. case Z|4:
  551. print("\t%s: correct=%z disk=%z\n",
  552. info->name, (uvlong)u32(want), (uvlong)u32(have));
  553. break;
  554. case D|8:
  555. print("\t%s: correct=%,lld disk=%,lld\n",
  556. info->name, u64(want), u64(have));
  557. break;
  558. case Z|8:
  559. print("\t%s: correct=%z disk=%z\n",
  560. info->name, u64(want), u64(have));
  561. break;
  562. case S|ANameSize:
  563. print("\t%s: correct=%s disk=%.*s\n",
  564. info->name, (char*)want,
  565. utfnlen((char*)have, ANameSize-1),
  566. (char*)have);
  567. break;
  568. default:
  569. print("\t%s: correct=%.*H disk=%.*H\n",
  570. info->name, n, want, n, have);
  571. break;
  572. }
  573. }
  574. have += n;
  575. want += n;
  576. len -= n;
  577. info++;
  578. }
  579. if(len > 0 && memcmp(have, want, len) != 0){
  580. if(memcmp(want, zero, len) != 0)
  581. print("!!\textra want data in showdiffs (bug in fixarenas)\n");
  582. else
  583. print("\tnon-zero data on disk after structure\n");
  584. if(verbose > 1){
  585. print("want: %.*H\n", len, want);
  586. print("have: %.*H\n", len, have);
  587. }
  588. }
  589. }
  590. /*
  591. * Does part begin with an arena?
  592. */
  593. int
  594. isonearena(void)
  595. {
  596. return u32(pagein(0, Block)) == ArenaHeadMagic;
  597. }
  598. static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, };
  599. /*
  600. * Poke around on the disk to guess what the ArenaPart numbers are.
  601. */
  602. void
  603. guessgeometry(void)
  604. {
  605. int i, j, n, bestn, ndiff, nhead, ntail;
  606. uchar *p, *ep, *sp;
  607. u64int diff[100], head[20], tail[20];
  608. u64int offset, bestdiff;
  609. ap.version = ArenaPartVersion;
  610. if(arenasize == 0 || ap.blocksize == 0){
  611. /*
  612. * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
  613. * Instead, look for the individual arena headers and tails, which there
  614. * are many of, and once we've seen enough, infer the spacing.
  615. *
  616. * Of course, nothing in the file format requires that arenas be evenly
  617. * spaced, but fmtarenas always does that for us.
  618. */
  619. nhead = 0;
  620. ntail = 0;
  621. for(offset=PartBlank; offset<partend; offset+=4*M){
  622. p = pagein(offset, 4*M);
  623. for(sp=p, ep=p+4*M; p<ep; p+=K){
  624. if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
  625. if(verbose)
  626. print("arena head at %#llx\n", offset+(p-sp));
  627. head[nhead++] = offset+(p-sp);
  628. }
  629. if(u32(p) == ArenaMagic && ntail < nelem(tail)){
  630. tail[ntail++] = offset+(p-sp);
  631. if(verbose)
  632. print("arena tail at %#llx\n", offset+(p-sp));
  633. }
  634. }
  635. if(nhead == nelem(head) && ntail == nelem(tail))
  636. break;
  637. }
  638. if(nhead < 3 && ntail < 3)
  639. sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
  640. /*
  641. * Arena size is likely the most common
  642. * inter-head or inter-tail spacing.
  643. */
  644. ndiff = 0;
  645. for(i=1; i<nhead; i++)
  646. diff[ndiff++] = head[i] - head[i-1];
  647. for(i=1; i<ntail; i++)
  648. diff[ndiff++] = tail[i] - tail[i-1];
  649. qsort(diff, ndiff, sizeof diff[0], vlongcmp);
  650. bestn = 0;
  651. bestdiff = 0;
  652. for(i=1, n=1; i<=ndiff; i++, n++){
  653. if(i==ndiff || diff[i] != diff[i-1]){
  654. if(n > bestn){
  655. bestn = n;
  656. bestdiff = diff[i-1];
  657. }
  658. n = 0;
  659. }
  660. }
  661. print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
  662. if(arenasize != 0 && arenasize != bestdiff)
  663. print("using user-specified size %z instead\n", arenasize);
  664. else
  665. arenasize = bestdiff;
  666. /*
  667. * The arena tail for an arena is arenasize-blocksize from the head.
  668. */
  669. ndiff = 0;
  670. for(i=j=0; i<nhead && j<ntail; ){
  671. if(tail[j] < head[i]){
  672. j++;
  673. continue;
  674. }
  675. if(tail[j] < head[i]+arenasize){
  676. diff[ndiff++] = head[i]+arenasize - tail[j];
  677. j++;
  678. continue;
  679. }
  680. i++;
  681. }
  682. if(ndiff < 3)
  683. sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
  684. qsort(diff, ndiff, sizeof diff[0], vlongcmp);
  685. bestn = 0;
  686. bestdiff = 0;
  687. for(i=1, n=1; i<=ndiff; i++, n++){
  688. if(i==ndiff || diff[i] != diff[i-1]){
  689. if(n > bestn){
  690. bestn = n;
  691. bestdiff = diff[i-1];
  692. }
  693. n = 0;
  694. }
  695. }
  696. print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
  697. if(ap.blocksize != 0 && ap.blocksize != bestdiff)
  698. print("using user-specified size %z instead\n", (vlong)ap.blocksize);
  699. else
  700. ap.blocksize = bestdiff;
  701. if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
  702. sysfatal("block size not a power of two");
  703. if(ap.blocksize > MaxDiskBlock)
  704. sysfatal("block size too big (max=%d)", MaxDiskBlock);
  705. /*
  706. * Use head/tail information to deduce arena base.
  707. */
  708. ndiff = 0;
  709. for(i=0; i<nhead; i++)
  710. diff[ndiff++] = head[i]%arenasize;
  711. for(i=0; i<ntail; i++)
  712. diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
  713. qsort(diff, ndiff, sizeof diff[0], vlongcmp);
  714. bestn = 0;
  715. bestdiff = 0;
  716. for(i=1, n=1; i<=ndiff; i++, n++){
  717. if(i==ndiff || diff[i] != diff[i-1]){
  718. if(n > bestn){
  719. bestn = n;
  720. bestdiff = diff[i-1];
  721. }
  722. n = 0;
  723. }
  724. }
  725. ap.arenabase = bestdiff;
  726. }
  727. ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
  728. /*
  729. * XXX pick up table, check arenabase.
  730. * XXX pick up table, record base name.
  731. */
  732. /*
  733. * Somewhat standard computation.
  734. * Fmtarenas used to use 64k tab, now uses 512k tab.
  735. */
  736. if(ap.arenabase == 0){
  737. print("trying standard arena bases...\n");
  738. for(i=0; i<nelem(tabsizes); i++){
  739. ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
  740. p = pagein(ap.arenabase, Block);
  741. if(u32(p) == ArenaHeadMagic)
  742. break;
  743. }
  744. }
  745. p = pagein(ap.arenabase, Block);
  746. print("arena base likely %z%s\n", (vlong)ap.arenabase,
  747. u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
  748. ap.tabsize = ap.arenabase - ap.tabbase;
  749. }
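/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the three bestn/bestdiff loops above all compute the same thing, the
 * most common value in a small list, by sorting it and finding the longest run
 * of equal entries. Isolated here with hypothetical header spacings; compile
 * separately to run.

#include <stdio.h>
#include <stdlib.h>

static int
cmp(const void *a, const void *b)
{
	long long x = *(const long long*)a, y = *(const long long*)b;

	return (x > y) - (x < y);
}

static long long
mode(long long *v, int n)
{
	long long best;
	int i, run, bestrun;

	qsort(v, n, sizeof v[0], cmp);
	best = 0;
	bestrun = 0;
	for(i=1, run=1; i<=n; i++, run++){
		if(i==n || v[i] != v[i-1]){
			if(run > bestrun){
				bestrun = run;
				best = v[i-1];
			}
			run = 0;
		}
	}
	return best;
}

int
main(void)
{
	long long gap[] = { 512LL<<20, 512LL<<20, 768LL<<20, 512LL<<20, 512LL<<20 };

	printf("likely arena size: %lldM\n", mode(gap, 5)>>20);	// 512M
	return 0;
}
*/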
  750. /*
  751. * Check the arena partition blocks and then the arenas listed in range.
  752. */
  753. void
  754. checkarenas(char *range)
  755. {
  756. char *s, *t;
  757. int i, lo, hi, narena;
  758. uchar dbuf[HeadSize];
  759. uchar *p;
  760. guessgeometry();
  761. partend -= partend%ap.blocksize;
  762. memset(dbuf, 0, sizeof dbuf);
  763. packarenapart(&ap, dbuf);
  764. p = pagein(PartBlank, Block);
  765. if(memcmp(p, dbuf, HeadSize) != 0){
  766. print("on-disk arena part superblock incorrect\n");
  767. showdiffs(dbuf, p, HeadSize, partinfo);
  768. }
  769. memmove(p, dbuf, HeadSize);
  770. narena = (partend-ap.arenabase + arenasize-1)/arenasize;
  771. if(range == nil){
  772. for(i=0; i<narena; i++)
  773. checkarena(ap.arenabase+(vlong)i*arenasize, i);
  774. }else if(strcmp(range, "none") == 0){
  775. /* nothing */
  776. }else{
  777. /* parse, e.g., -4,8-9,10- */
  778. for(s=range; *s; s=t){
  779. t = strchr(s, ',');
  780. if(t)
  781. *t++ = 0;
  782. else
  783. t = s+strlen(s);
  784. if(*s == '-')
  785. lo = 0;
  786. else
  787. lo = strtol(s, &s, 0);
  788. hi = lo;
  789. if(*s == '-'){
  790. s++;
  791. if(*s == 0)
  792. hi = narena-1;
  793. else
  794. hi = strtol(s, &s, 0);
  795. }
  796. if(*s != 0){
  797. print("bad arena range: %s\n", s);
  798. continue;
  799. }
  800. for(i=lo; i<=hi; i++)
  801. checkarena(ap.arenabase+(vlong)i*arenasize, i);
  802. }
  803. }
  804. }
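/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the range argument parsed above accepts comma-separated items of the
 * form "n", "lo-hi", "-hi" and "lo-". The same parsing logic, lifted out so it
 * can be exercised on its own; compile separately to run.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
walkranges(char *range, int narena)
{
	char *s, *t;
	int i, lo, hi;

	for(s=range; *s; s=t){
		t = strchr(s, ',');
		if(t)
			*t++ = 0;
		else
			t = s+strlen(s);
		if(*s == '-')
			lo = 0;
		else
			lo = strtol(s, &s, 0);
		hi = lo;
		if(*s == '-'){
			s++;
			if(*s == 0)
				hi = narena-1;
			else
				hi = strtol(s, &s, 0);
		}
		if(*s != 0){
			printf("bad range element\n");
			continue;
		}
		for(i=lo; i<=hi; i++)
			printf("arena %d\n", i);
	}
}

int
main(void)
{
	char r[] = "-2,5,8-";	// must be writable: the parser cuts it at the commas

	walkranges(r, 10);	// arenas 0 1 2, then 5, then 8 9
	return 0;
}
*/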
  805. /*
  806. * Is there a clump here at p?
  807. */
  808. static int
  809. isclump(uchar *p, Clump *cl, u32int *pmagic)
  810. {
  811. int n;
  812. u32int magic;
  813. uchar score[VtScoreSize], *bp;
  814. Unwhack uw;
  815. uchar ubuf[70*1024];
  816. bp = p;
  817. magic = u32(p);
  818. if(magic == 0)
  819. return 0;
  820. p += U32Size;
  821. cl->info.type = vtfromdisktype(*p);
  822. if(cl->info.type == 0xFF)
  823. return 0;
  824. p++;
  825. cl->info.size = u16(p);
  826. p += U16Size;
  827. cl->info.uncsize = u16(p);
  828. if(cl->info.size > cl->info.uncsize)
  829. return 0;
  830. p += U16Size;
  831. scorecp(cl->info.score, p);
  832. p += VtScoreSize;
  833. cl->encoding = *p;
  834. p++;
  835. cl->creator = u32(p);
  836. p += U32Size;
  837. cl->time = u32(p);
  838. p += U32Size;
  839. switch(cl->encoding){
  840. case ClumpENone:
  841. if(cl->info.size != cl->info.uncsize)
  842. return 0;
  843. scoremem(score, p, cl->info.size);
  844. if(scorecmp(score, cl->info.score) != 0)
  845. return 0;
  846. break;
  847. case ClumpECompress:
  848. if(cl->info.size >= cl->info.uncsize)
  849. return 0;
  850. unwhackinit(&uw);
  851. n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
  852. if(n != cl->info.uncsize)
  853. return 0;
  854. scoremem(score, ubuf, cl->info.uncsize);
  855. if(scorecmp(score, cl->info.score) != 0)
  856. return 0;
  857. break;
  858. default:
  859. return 0;
  860. }
  861. p += cl->info.size;
  862. /* it all worked out in the end */
  863. *pmagic = magic;
  864. return p - bp;
  865. }
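/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the byte layout that isclump walks above, written out as a plain
 * struct. Field names here are illustrative; sizes and order follow the
 * parsing code (4-byte magic, 1-byte type, two 2-byte lengths, 20-byte SHA-1
 * score, 1-byte encoding, 4-byte creator and time, then `size` bytes of
 * possibly compressed data). Compile separately to run.

#include <stdio.h>
#include <stdint.h>

enum { ScoreSize = 20 };

struct clumphdr {
	uint32_t magic;
	uint8_t  type;
	uint16_t size;		// bytes stored on disk
	uint16_t uncsize;	// bytes after decompression; equals size when uncompressed
	uint8_t  score[ScoreSize];	// SHA-1 of the uncompressed data
	uint8_t  encoding;
	uint32_t creator;
	uint32_t time;
};

static uint16_t get16(unsigned char *p) { return (p[0]<<8) | p[1]; }
static uint32_t get32(unsigned char *p) { return ((uint32_t)p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3]; }

static int
parsehdr(unsigned char *p, struct clumphdr *h)
{
	int i;
	unsigned char *p0 = p;

	h->magic = get32(p), p += 4;
	h->type = *p++;
	h->size = get16(p), p += 2;
	h->uncsize = get16(p), p += 2;
	for(i=0; i<ScoreSize; i++)
		h->score[i] = *p++;
	h->encoding = *p++;
	h->creator = get32(p), p += 4;
	h->time = get32(p), p += 4;
	return p - p0;		// header length in bytes
}

int
main(void)
{
	unsigned char b[64] = {0};
	struct clumphdr h;

	printf("header is %d bytes\n", parsehdr(b, &h));	// 38
	return 0;
}
*/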
  866. /*
  867. * All ClumpInfos seen in this arena.
  868. * Kept in binary tree so we can look up by score.
  869. */
  870. typedef struct Cit Cit;
  871. struct Cit
  872. {
  873. int left;
  874. int right;
  875. vlong corrupt;
  876. ClumpInfo ci;
  877. };
  878. Cit *cibuf;
  879. int ciroot;
  880. int ncibuf, mcibuf;
  881. void
  882. resetcibuf(void)
  883. {
  884. ncibuf = 0;
  885. ciroot = -1;
  886. }
  887. int*
  888. ltreewalk(int *p, uchar *score)
  889. {
  890. int i;
  891. for(;;){
  892. if(*p == -1)
  893. return p;
  894. i = scorecmp(cibuf[*p].ci.score, score);
  895. if(i == 0)
  896. return p;
  897. if(i < 0)
  898. p = &cibuf[*p].right;
  899. else
  900. p = &cibuf[*p].left;
  901. }
  902. }
  903. void
  904. addcibuf(ClumpInfo *ci, vlong corrupt)
  905. {
  906. Cit *cit;
  907. if(ncibuf == mcibuf){
  908. mcibuf += 131072;
  909. cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
  910. }
  911. cit = &cibuf[ncibuf];
  912. cit->ci = *ci;
  913. cit->left = -1;
  914. cit->right = -1;
  915. cit->corrupt = corrupt;
  916. if(!corrupt)
  917. *ltreewalk(&ciroot, ci->score) = ncibuf;
  918. ncibuf++;
  919. }
  920. void
  921. addcicorrupt(vlong len)
  922. {
  923. static ClumpInfo zci;
  924. addcibuf(&zci, len);
  925. }
  926. int
  927. haveclump(uchar *score)
  928. {
  929. int i;
  930. int p;
  931. p = ciroot;
  932. for(;;){
  933. if(p == -1)
  934. return 0;
  935. i = scorecmp(cibuf[p].ci.score, score);
  936. if(i == 0)
  937. return 1;
  938. if(i < 0)
  939. p = cibuf[p].right;
  940. else
  941. p = cibuf[p].left;
  942. }
  943. }
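/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: cibuf is an array-backed binary search tree. Nodes are array slots,
 * links are ints, and ltreewalk returns a pointer to the link that either
 * names the matching node or is the empty (-1) slot where a new node belongs,
 * so insert and lookup share one walk. The same idea over short strings
 * instead of scores; compile separately to run.

#include <stdio.h>
#include <string.h>

typedef struct {
	char key[16];
	int left, right;
} Node;

static Node nodes[64];
static int nnodes;
static int root = -1;

static int*
walk(int *p, char *key)
{
	int i;

	for(;;){
		if(*p == -1)
			return p;
		i = strcmp(nodes[*p].key, key);
		if(i == 0)
			return p;
		if(i < 0)
			p = &nodes[*p].right;
		else
			p = &nodes[*p].left;
	}
}

static void
insert(char *key)
{
	int *p;

	p = walk(&root, key);
	if(*p != -1)
		return;		// already present
	strcpy(nodes[nnodes].key, key);
	nodes[nnodes].left = -1;
	nodes[nnodes].right = -1;
	*p = nnodes++;
}

int
main(void)
{
	insert("venti");
	insert("arena");
	insert("clump");
	printf("%d %d\n", *walk(&root, "clump") != -1, *walk(&root, "score") != -1);	// 1 0
	return 0;
}
*/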
  944. int
  945. matchci(ClumpInfo *ci, uchar *p)
  946. {
  947. if(ci->type != vtfromdisktype(p[0]))
  948. return 0;
  949. if(ci->size != u16(p+1))
  950. return 0;
  951. if(ci->uncsize != u16(p+3))
  952. return 0;
  953. if(scorecmp(ci->score, p+5) != 0)
  954. return 0;
  955. return 1;
  956. }
  957. int
  958. sealedarena(uchar *p, int blocksize)
  959. {
  960. int v, n;
  961. v = u32(p+4);
  962. switch(v){
  963. default:
  964. return 0;
  965. case ArenaVersion4:
  966. n = ArenaSize4;
  967. break;
  968. case ArenaVersion5:
  969. n = ArenaSize5;
  970. break;
  971. }
  972. if(p[n-1] != 1){
  973. print("arena tail says not sealed\n");
  974. return 0;
  975. }
  976. if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
  977. print("arena tail followed by non-zero data\n");
  978. return 0;
  979. }
  980. if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
  981. print("arena score zero\n");
  982. return 0;
  983. }
  984. return 1;
  985. }
  986. int
  987. okayname(char *name, int n)
  988. {
  989. char buf[20];
  990. if(nameok(name) < 0)
  991. return 0;
  992. sprint(buf, "%d", n);
  993. if(n == 0)
  994. buf[0] = 0;
  995. if(strlen(name) < strlen(buf)
  996. || strcmp(name+strlen(name)-strlen(buf), buf) != 0)
  997. return 0;
  998. return 1;
  999. }
  1000. int
  1001. clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
  1002. {
  1003. if(a->type != b->type)
  1004. return a->type - b->type;
  1005. if(a->size != b->size)
  1006. return a->size - b->size;
  1007. if(a->uncsize != b->uncsize)
  1008. return a->uncsize - b->uncsize;
  1009. return scorecmp(a->score, b->score);
  1010. }
  1011. ClumpInfo*
  1012. loadci(vlong offset, Arena *arena, int nci)
  1013. {
  1014. int i, j, per;
  1015. uchar *p, *sp;
  1016. ClumpInfo *bci, *ci;
  1017. per = arena->blocksize/ClumpInfoSize;
  1018. bci = vtmalloc(nci*sizeof bci[0]);
  1019. ci = bci;
  1020. offset += arena->size - arena->blocksize;
  1021. p = sp = nil;
  1022. for(i=0; i<nci; i+=per){
  1023. if(p == sp){
  1024. sp = pagein(offset-4*M, 4*M);
  1025. p = sp+4*M;
  1026. }
  1027. p -= arena->blocksize;
  1028. offset -= arena->blocksize;
  1029. for(j=0; j<per && i+j<nci; j++)
  1030. unpackclumpinfo(ci++, p+j*ClumpInfoSize);
  1031. }
  1032. return bci;
  1033. }
  1034. vlong
  1035. writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
  1036. {
  1037. int i, j, per;
  1038. uchar *p, *sp;
  1039. per = arena->blocksize/ClumpInfoSize;
  1040. offset += arena->size - arena->blocksize;
  1041. p = sp = nil;
  1042. for(i=0; i<nci; i+=per){
  1043. if(p == sp){
  1044. sp = pagein(offset-4*M, 4*M);
  1045. p = sp+4*M;
  1046. }
  1047. p -= arena->blocksize;
  1048. offset -= arena->blocksize;
  1049. memset(p, 0, arena->blocksize);
  1050. for(j=0; j<per && i+j<nci; j++)
  1051. packclumpinfo(ci++, p+j*ClumpInfoSize);
  1052. }
  1053. pageout();
  1054. return offset;
  1055. }
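/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: loadci and writeci above address the clump-info directory from the end
 * of the arena. Entries are packed per = blocksize/ClumpInfoSize to a block,
 * and the blocks run backwards starting just before the arena trailer. The
 * 25-byte entry size is what the fields imply (type[1] size[2] uncsize[2]
 * score[20]); the geometry in main is hypothetical. Compile separately to run.

#include <stdio.h>

enum { ClumpInfoSize = 25 };

static long long
cientry(long long arenabase, long long arenasize, int blocksize, int i)
{
	int per = blocksize / ClumpInfoSize;
	long long trailer = arenabase + arenasize - blocksize;

	return trailer - (long long)(i/per + 1)*blocksize + (long long)(i%per)*ClumpInfoSize;
}

int
main(void)
{
	long long base = 0, size = 512LL*1024*1024;
	int bs = 8192;	// per = 327 entries per directory block

	printf("entry 0 at %lld\n", cientry(base, size, bs, 0));	// first slot of the block before the trailer
	printf("entry 327 at %lld\n", cientry(base, size, bs, 327));	// first slot, one block earlier
	return 0;
}
*/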
  1056. void
  1057. loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
  1058. {
  1059. char dname[ANameSize];
  1060. static char lastbase[ANameSize];
  1061. uchar *p;
  1062. Arena oarena;
  1063. ArenaHead ohead;
  1064. /*
  1065. * Fmtarenas makes all arenas the same size
  1066. * except the last, which may be smaller.
  1067. * It uses the same block size for arenas as for
  1068. * the arena partition blocks.
  1069. */
  1070. arena->size = arenasize;
  1071. if(offset0+arena->size > partend)
  1072. arena->size = partend - offset0;
  1073. head->size = arena->size;
  1074. arena->blocksize = ap.blocksize;
  1075. head->blocksize = arena->blocksize;
  1076. /*
  1077. * Look for clump magic and name in head/tail blocks.
  1078. * All the other info we will reconstruct just in case.
  1079. */
  1080. p = pagein(offset0, arena->blocksize);
  1081. memset(&ohead, 0, sizeof ohead);
  1082. if(unpackarenahead(&ohead, p) >= 0){
  1083. head->version = ohead.version;
  1084. head->clumpmagic = ohead.clumpmagic;
  1085. if(okayname(ohead.name, anum))
  1086. strcpy(head->name, ohead.name);
  1087. }
  1088. p = pagein(offset0+arena->size-arena->blocksize,
  1089. arena->blocksize);
  1090. memset(&oarena, 0, sizeof oarena);
  1091. if(unpackarena(&oarena, p) >= 0){
  1092. arena->version = oarena.version;
  1093. arena->clumpmagic = oarena.clumpmagic;
  1094. if(okayname(oarena.name, anum))
  1095. strcpy(arena->name, oarena.name);
  1096. arena->diskstats.clumps = oarena.diskstats.clumps;
  1097. print("old arena: sealed=%d\n", oarena.diskstats.sealed);
  1098. arena->diskstats.sealed = oarena.diskstats.sealed;
  1099. }
  1100. /* Head trumps arena. */
  1101. if(head->version){
  1102. arena->version = head->version;
  1103. arena->clumpmagic = head->clumpmagic;
  1104. }
  1105. if(arena->version == 0)
  1106. arena->version = ArenaVersion5;
  1107. if(basename){
  1108. if(anum == -1)
  1109. snprint(arena->name, ANameSize, "%s", basename);
  1110. else
  1111. snprint(arena->name, ANameSize, "%s%d", basename, anum);
  1112. }else if(lastbase[0])
  1113. snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
  1114. else if(head->name[0])
  1115. strcpy(arena->name, head->name);
  1116. else if(arena->name[0] == 0)
  1117. sysfatal("cannot determine base name for arena; use -n");
  1118. strcpy(lastbase, arena->name);
  1119. sprint(dname, "%d", anum);
  1120. lastbase[strlen(lastbase)-strlen(dname)] = 0;
  1121. /* Was working in arena, now copy to head. */
  1122. head->version = arena->version;
  1123. memmove(head->name, arena->name, sizeof head->name);
  1124. head->blocksize = arena->blocksize;
  1125. head->size = arena->size;
  1126. }
  1127. void
  1128. shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
  1129. {
  1130. uchar headbuf[MaxDiskBlock];
  1131. sb->offset = offset0;
  1132. memset(headbuf, 0, sizeof headbuf);
  1133. packarenahead(head, headbuf);
  1134. sbupdate(sb, headbuf, offset0, head->blocksize);
  1135. }
  1136. u32int
  1137. newclumpmagic(int version)
  1138. {
  1139. u32int m;
  1140. if(version == ArenaVersion4)
  1141. return _ClumpMagic;
  1142. do{
  1143. m = fastrand();
  1144. }while(m==0 || m == _ClumpMagic);
  1145. return m;
  1146. }
  1147. /*
  1148. * Poke around in the arena to find the clump data
  1149. * and compute the relevant statistics.
  1150. */
  1151. void
  1152. guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
  1153. uchar *oldscore, uchar *score)
  1154. {
  1155. uchar dbuf[MaxDiskBlock];
  1156. int needtozero, clumps, nb1, nb2, minclumps;
  1157. int inbad, n, ncib, printed, sealing, smart;
  1158. u32int magic;
  1159. uchar *sp, *ep, *p;
  1160. vlong boffset, eoffset, lastclumpend, leaked;
  1161. vlong offset, toffset, totalcorrupt, v;
  1162. Clump cl;
  1163. ClumpInfo *bci, *ci, *eci, *xci;
  1164. Cit *bcit, *cit, *ecit;
  1165. Shabuf oldsha, newsha;
  1166. /*
  1167. * We expect to find an arena, with data, between offset
  1168. * and offset+arenasize. With any luck, the data starts at
  1169. * offset+ap.blocksize. The blocks have variable size and
  1170. * aren't padded at all, which doesn't give us any alignment
  1171. * constraints. The blocks are compressed or high entropy,
  1172. * but the headers are pretty low entropy (except the score):
  1173. *
  1174. * type[1] (range 0 thru 9, 13)
  1175. * size[2]
  1176. * uncsize[2] (<= size)
  1177. *
  1178. * so we can look for these. We check the scores as we go,
  1179. * so we can't make any wrong turns. If we find ourselves
  1180. * in a dead end, scan forward looking for a new start.
  1181. */
  1182. resetcibuf();
  1183. memset(head, 0, sizeof *head);
  1184. memset(arena, 0, sizeof *arena);
  1185. memset(oldscore, 0, VtScoreSize);
  1186. memset(score, 0, VtScoreSize);
  1187. memset(&oldsha, 0, sizeof oldsha);
  1188. memset(&newsha, 0, sizeof newsha);
  1189. newsha.rollback = 1;
  1190. if(0){
  1191. sbdebug(&oldsha, "old.sha");
  1192. sbdebug(&newsha, "new.sha");
  1193. }
  1194. loadarenabasics(offset0, anum, head, arena);
  1195. /* start the clump hunt */
  1196. clumps = 0;
  1197. totalcorrupt = 0;
  1198. sealing = 1;
  1199. boffset = offset0 + arena->blocksize;
  1200. offset = boffset;
  1201. eoffset = offset0+arena->size - arena->blocksize;
  1202. toffset = eoffset;
  1203. sp = pagein(offset0, 4*M);
  1204. if(arena->diskstats.sealed){
  1205. oldsha.offset = offset0;
  1206. sbupdate(&oldsha, sp, offset0, 4*M);
  1207. }
  1208. ep = sp+4*M;
  1209. p = sp + (boffset - offset0);
  1210. ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */
  1211. lastclumpend = offset;
  1212. nbad = 0;
  1213. inbad = 0;
  1214. needtozero = 0;
  1215. minclumps = 0;
  1216. while(offset < eoffset){
  1217. /*
  1218. * Shift buffer if we're running out of room.
  1219. */
  1220. if(p+70*K >= ep){
  1221. /*
  1222. * Start the post SHA1 buffer. By now we should know the
  1223. * clumpmagic and arena version, so we can create a
  1224. * correct head block to get things going.
  1225. */
  1226. if(sealing && fix && newsha.offset == 0){
  1227. newsha.offset = offset0;
  1228. if(arena->clumpmagic == 0){
  1229. if(arena->version == 0)
  1230. arena->version = ArenaVersion5;
  1231. arena->clumpmagic = newclumpmagic(arena->version);
  1232. }
  1233. head->clumpmagic = arena->clumpmagic;
  1234. shahead(&newsha, offset0, head);
  1235. }
  1236. n = 4*M-256*K;
  1237. if(sealing && fix){
  1238. sbdiskhash(&newsha, bufoffset);
  1239. sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
  1240. }
  1241. pagein(bufoffset+n, 4*M);
  1242. p -= n;
  1243. if(arena->diskstats.sealed)
  1244. sbupdate(&oldsha, buf, bufoffset, 4*M);
  1245. }
  1246. /*
  1247. * Check for a clump at p, which is at offset in the disk.
  1248. * Duplicate clumps happen in corrupted disks
  1249. * (the same pattern gets written many times in a row)
  1250. * and should never happen during regular use.
  1251. */
  1252. magic = 0;
  1253. if((n = isclump(p, &cl, &magic)) > 0){
  1254. /*
  1255. * If we were in the middle of some corrupted data,
  1256. * flush a warning about it and then add any clump
  1257. * info blocks as necessary.
  1258. */
  1259. if(inbad){
  1260. inbad = 0;
  1261. v = offset-lastclumpend;
  1262. if(needtozero){
  1263. zerorange(lastclumpend, v);
  1264. sbrollback(&newsha, lastclumpend);
  1265. print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
  1266. lastclumpend, v, v);
  1267. }
  1268. addcicorrupt(v);
  1269. totalcorrupt += v;
  1270. nb1 = (minclumps+ncib-1)/ncib;
  1271. minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
  1272. nb2 = (minclumps+ncib-1)/ncib;
  1273. eoffset -= (nb2-nb1)*arena->blocksize;
  1274. }
  1275. if(haveclump(cl.info.score))
  1276. print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
  1277. /*
  1278. * If clumps use different magic numbers, we don't care.
  1279. * We'll just use the first one we find and make the others
  1280. * follow suit.
  1281. */
  1282. if(arena->clumpmagic == 0){
  1283. print("clump type %d size %d score %V magic %x\n",
  1284. cl.info.type, cl.info.size, cl.info.score, magic);
  1285. arena->clumpmagic = magic;
  1286. if(magic == _ClumpMagic)
  1287. arena->version = ArenaVersion4;
  1288. else
  1289. arena->version = ArenaVersion5;
  1290. }
  1291. if(magic != arena->clumpmagic)
  1292. p32(p, arena->clumpmagic);
  1293. if(clumps == 0)
  1294. arena->ctime = cl.time;
  1295. /*
  1296. * Record the clump, update arena stats,
  1297. * grow clump info blocks if needed.
  1298. */
  1299. if(verbose > 1)
  1300. print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
  1301. clumps, cl.info.type, cl.info.score, offset, n, n);
  1302. addcibuf(&cl.info, 0);
  1303. if(minclumps%ncib == 0)
  1304. eoffset -= arena->blocksize;
  1305. minclumps++;
  1306. clumps++;
  1307. if(cl.encoding != ClumpENone)
  1308. arena->diskstats.cclumps++;
  1309. arena->diskstats.uncsize += cl.info.uncsize;
  1310. arena->wtime = cl.time;
  1311. /*
  1312. * Move to next clump.
  1313. */
  1314. offset += n;
  1315. p += n;
  1316. lastclumpend = offset;
  1317. }else{
  1318. /*
  1319. * Overwrite malformed clump data with zeros later.
  1320. * For now, just record whether it needs to be overwritten.
  1321. * Bad regions must be of size at least ClumpSize.
  1322. * Postponing the overwriting keeps us from writing past
  1323. * the end of the arena data (which might be directory data)
  1324. * with zeros.
  1325. */
  1326. if(!inbad){
  1327. inbad = 1;
  1328. needtozero = 0;
  1329. if(memcmp(p, zero, ClumpSize) != 0)
  1330. needtozero = 1;
  1331. p += ClumpSize;
  1332. offset += ClumpSize;
  1333. nbad++;
  1334. }else{
  1335. if(*p != 0)
  1336. needtozero = 1;
  1337. p++;
  1338. offset++;
  1339. }
  1340. }
  1341. }
  1342. pageout();
  1343. if(verbose)
  1344. print("readable clumps: %d; min. directory entries: %d\n",
  1345. clumps, minclumps);
  1346. arena->diskstats.used = lastclumpend - boffset;
  1347. leaked = eoffset - lastclumpend;
  1348. if(verbose)
  1349. print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
  1350. boffset, lastclumpend, arena->diskstats.used, leaked);
  1351. /*
  1352. * Finish the SHA1 of the old data.
  1353. */
  1354. if(arena->diskstats.sealed){
  1355. sbdiskhash(&oldsha, toffset);
  1356. readdisk(dbuf, toffset, arena->blocksize);
  1357. scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
  1358. sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
  1359. sbscore(&oldsha, oldscore);
  1360. }
  1361. /*
  1362. * If we still don't know the clump magic, the arena
  1363. * must be empty. It still needs a value, so make
  1364. * something up.
  1365. */
  1366. if(arena->version == 0)
  1367. arena->version = ArenaVersion5;
  1368. if(arena->clumpmagic == 0){
  1369. if(arena->version == ArenaVersion4)
  1370. arena->clumpmagic = _ClumpMagic;
  1371. else{
  1372. do
  1373. arena->clumpmagic = fastrand();
  1374. while(arena->clumpmagic==_ClumpMagic
  1375. ||arena->clumpmagic==0);
  1376. }
  1377. head->clumpmagic = arena->clumpmagic;
  1378. }
  1379. /*
  1380. * Guess at number of clumpinfo blocks to load.
  1381. * If we guess high, it's no big deal. If we guess low,
  1382. * we'll be forced into rewriting the whole directory.
  1383. * Still not such a big deal.
  1384. */
  1385. if(clumps == 0 || arena->diskstats.used == totalcorrupt)
  1386. goto Nocib;
  1387. if(clumps < arena->diskstats.clumps)
  1388. clumps = arena->diskstats.clumps;
  1389. if(clumps < ncibuf)
  1390. clumps = ncibuf;
  1391. clumps += totalcorrupt/
  1392. ((arena->diskstats.used - totalcorrupt)/clumps);
  1393. clumps += totalcorrupt/2000;
  1394. if(clumps < minclumps)
  1395. clumps = minclumps;
  1396. clumps += ncib-1;
  1397. clumps -= clumps%ncib;
  1398. /*
  1399. * Can't write into the actual data.
  1400. */
  1401. v = offset0 + arena->size - arena->blocksize;
  1402. v -= (clumps+ncib-1)/ncib * arena->blocksize;
  1403. if(v < lastclumpend){
  1404. v = offset0 + arena->size - arena->blocksize;
  1405. clumps = (v-lastclumpend)/arena->blocksize * ncib;
  1406. }
  1407. if(clumps < minclumps)
  1408. print("cannot happen?\n");
  1409. /*
  1410. * Check clumpinfo blocks against directory we created.
  1411. * The tricky part is handling the corrupt sections of arena.
  1412. * If possible, we remark just the affected directory entries
  1413. * rather than slide everything down.
  1414. *
  1415. * Allocate clumps+1 blocks and check that we don't need
  1416. * the last one at the end.
  1417. */
  1418. bci = loadci(offset0, arena, clumps+1);
  1419. eci = bci+clumps+1;
  1420. bcit = cibuf;
  1421. ecit = cibuf+ncibuf;
  1422. smart = 0; /* Somehow the smart code doesn't do corrupt clumps right. */
  1423. Again:
  1424. nbad = 0;
  1425. ci = bci;
  1426. for(cit=bcit; cit<ecit && ci<eci; cit++){
  1427. if(cit->corrupt){
  1428. vlong n, m;
  1429. if(smart){
  1430. /*
  1431. * If we can, just mark existing entries as corrupt.
  1432. */
  1433. n = cit->corrupt;
  1434. for(xci=ci; n>0 && xci<eci; xci++)
  1435. n -= ClumpSize+xci->size;
  1436. if(n > 0 || xci >= eci)
  1437. goto Dumb;
  1438. printed = 0;
  1439. for(; ci<xci; ci++){
  1440. if(verbose && ci->type != VtCorruptType){
  1441. if(!printed){
  1442. print("marking directory %d-%d as corrupt\n",
  1443. (int)(ci-bci), (int)(xci-bci));
  1444. printed = 1;
  1445. }
  1446. print("\ttype=%d size=%d uncsize=%d score=%V\n",
  1447. ci->type, ci->size, ci->uncsize, ci->score);
  1448. }
  1449. ci->type = VtCorruptType;
  1450. }
  1451. }else{
  1452. Dumb:
  1453. print("\trewriting clump directory\n");
  1454. /*
  1455. * Otherwise, blaze a new trail.
  1456. */
  1457. n = cit->corrupt;
  1458. while(n > 0 && ci < eci){
  1459. if(n < ClumpSize)
  1460. sysfatal("bad math in clump corrupt");
  1461. if(n <= VtMaxLumpSize+ClumpSize)
  1462. m = n;
  1463. else{
  1464. m = VtMaxLumpSize+ClumpSize;
  1465. if(n-m < ClumpSize)
  1466. m -= ClumpSize;
  1467. }
  1468. ci->type = VtCorruptType;
  1469. ci->size = m-ClumpSize;
  1470. ci->uncsize = m-ClumpSize;
  1471. memset(ci->score, 0, VtScoreSize);
  1472. ci++;
  1473. n -= m;
  1474. }
  1475. }
  1476. continue;
  1477. }
  1478. if(clumpinfocmp(&cit->ci, ci) != 0){
  1479. if(verbose && (smart || verbose>1)){
  1480. print("clumpinfo %d\n", (int)(ci-bci));
  1481. print("\twant: %d %d %d %V\n",
  1482. cit->ci.type, cit->ci.size,
  1483. cit->ci.uncsize, cit->ci.score);
  1484. print("\thave: %d %d %d %V\n",
  1485. ci->type, ci->size,
  1486. ci->uncsize, ci->score);
  1487. }
  1488. *ci = cit->ci;
  1489. nbad++;
  1490. }
  1491. ci++;
  1492. }
  1493. if(ci >= eci || cit < ecit){
  1494. print("ran out of space editing existing directory; rewriting\n");
  1495. print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
  1496. assert(smart); /* can't happen second time thru */
  1497. smart = 0;
  1498. goto Again;
  1499. }
  1500. assert(ci <= eci);
  1501. arena->diskstats.clumps = ci-bci;
  1502. eoffset = writeci(offset0, arena, bci, ci-bci);
  1503. if(sealing && fix)
  1504. sbrollback(&newsha, v);
  1505. print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
  1506. if(lastclumpend > eoffset)
  1507. print("arena directory overwrote blocks! cannot happen!\n");
  1508. free(bci);
  1509. if(smart && nbad)
  1510. print("arena directory has %d bad or missing entries\n", nbad);
  1511. Nocib:
  1512. if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
  1513. if(arena->diskstats.sealed)
  1514. print("unsealing arena\n");
  1515. sealing = 0;
  1516. memset(oldscore, 0, VtScoreSize);
  1517. }
  1518. /*
  1519. * Finish the SHA1 of the new data - only meaningful
  1520. * if we've been writing to disk (`fix').
  1521. */
  1522. arena->diskstats.sealed = sealing;
  1523. arena->memstats = arena->diskstats;
  1524. if(sealing && fix){
  1525. uchar tbuf[MaxDiskBlock];
  1526. sbdiskhash(&newsha, toffset);
  1527. memset(tbuf, 0, sizeof tbuf);
  1528. packarena(arena, tbuf);
  1529. sbupdate(&newsha, tbuf, toffset, arena->blocksize);
  1530. sbscore(&newsha, score);
  1531. }
  1532. }
  1533. void
  1534. dumparena(vlong offset, int anum, Arena *arena)
  1535. {
  1536. char buf[1000];
  1537. vlong o, e;
  1538. int fd, n;
  1539. snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
  1540. if((fd = create(buf, OWRITE, 0666)) < 0){
  1541. fprint(2, "create %s: %r\n", buf);
  1542. return;
  1543. }
  1544. e = offset+arena->size;
  1545. for(o=offset; o<e; o+=n){
  1546. n = 4*M;
  1547. if(o+n > e)
  1548. n = e-o;
  1549. if(pwrite(fd, pagein(o, n), n, o-offset) != n){
  1550. fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
  1551. return;
  1552. }
  1553. }
  1554. }
  1555. void
  1556. checkarena(vlong offset, int anum)
  1557. {
  1558. uchar dbuf[MaxDiskBlock];
  1559. uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
  1560. Arena arena, oarena;
  1561. ArenaHead head;
  1562. Info *fmt, *fmta;
  1563. int sz;
  1564. print("# arena %d: offset %#llux\n", anum, offset);
  1565. if(offset >= partend){
  1566. print("arena offset out of bounds\n");
  1567. return;
  1568. }
  1569. guessarena(offset, anum, &head, &arena, oldscore, score);
  1570. if(verbose){
  1571. print("#\tversion=%d name=%s blocksize=%d size=%z",
  1572. head.version, head.name, head.blocksize, head.size);
  1573. if(head.clumpmagic)
  1574. print(" clumpmagic=%#.8ux", head.clumpmagic);
  1575. print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
  1576. arena.diskstats.clumps, arena.diskstats.cclumps,
  1577. arena.diskstats.used, arena.diskstats.uncsize);
  1578. print("#\tctime=%t\n", arena.ctime);
  1579. print("#\twtime=%t\n", arena.wtime);
  1580. if(arena.diskstats.sealed)
  1581. print("#\tsealed score=%V\n", score);
  1582. }
  1583. if(dumpbase){
  1584. dumparena(offset, anum, &arena);
  1585. return;
  1586. }
  1587. memset(dbuf, 0, sizeof dbuf);
  1588. packarenahead(&head, dbuf);
  1589. p = pagein(offset, arena.blocksize);
  1590. if(memcmp(dbuf, p, arena.blocksize) != 0){
  1591. print("on-disk arena header incorrect\n");
  1592. showdiffs(dbuf, p, arena.blocksize,
  1593. arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
  1594. }
  1595. memmove(p, dbuf, arena.blocksize);
  1596. memset(dbuf, 0, sizeof dbuf);
  1597. packarena(&arena, dbuf);
  1598. if(arena.diskstats.sealed)
  1599. scorecp(dbuf+arena.blocksize-VtScoreSize, score);
  1600. p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
  1601. memset(&oarena, 0, sizeof oarena);
  1602. unpackarena(&oarena, p);
  1603. if(arena.version == ArenaVersion4){
  1604. sz = ArenaSize4;
  1605. fmt = tailinfo4;
  1606. fmta = tailinfo4a;
  1607. }else{
  1608. sz = ArenaSize5;
  1609. fmt = tailinfo5;
  1610. fmta = tailinfo5a;
  1611. }
  1612. if(p[sz] == 1){
  1613. fmt = fmta;
  1614. if(oarena.diskstats.sealed){
  1615. /*
  1616. * some arenas were sealed with the extension
  1617. * before we adopted the convention that if it didn't
  1618. * add new information it gets dropped.
  1619. */
  1620. _packarena(&arena, dbuf, 1);
  1621. }
  1622. }
  1623. if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
  1624. print("on-disk arena tail incorrect\n");
  1625. showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
  1626. }
  1627. if(arena.diskstats.sealed){
  1628. if(oarena.diskstats.sealed)
  1629. if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
  1630. print("on-disk arena seal score incorrect\n");
  1631. print("\tcorrect=%V\n", oldscore);
  1632. print("\t disk=%V\n", p+arena.blocksize-VtScoreSize);
  1633. }
  1634. if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
  1635. print("%ssealing arena%s: %V\n",
  1636. oarena.diskstats.sealed ? "re" : "",
  1637. scorecmp(oldscore, score) == 0 ?
  1638. "" : " after changes", score);
  1639. }
  1640. }
  1641. memmove(p, dbuf, arena.blocksize);
  1642. pageout();
  1643. }
  1644. AMapN*
  1645. buildamap(void)
  1646. {
  1647. uchar *p;
  1648. vlong o;
  1649. ArenaHead h;
  1650. AMapN *an;
  1651. AMap *m;
  1652. an = vtmallocz(sizeof *an);
  1653. for(o=ap.arenabase; o<partend; o+=arenasize){
  1654. p = pagein(o, Block);
  1655. if(unpackarenahead(&h, p) >= 0){
  1656. an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
  1657. m = &an->map[an->n++];
  1658. m->start = o;
  1659. m->stop = o+h.size;
  1660. strcpy(m->name, h.name);
  1661. }
  1662. }
  1663. return an;
  1664. }
  1665. void
  1666. checkmap(void)
  1667. {
  1668. char *s;
  1669. uchar *p;
  1670. int i, len;
  1671. AMapN *an;
  1672. Fmt fmt;
  1673. an = buildamap();
  1674. fmtstrinit(&fmt);
  1675. fmtprint(&fmt, "%ud\n", an->n);
  1676. for(i=0; i<an->n; i++)
  1677. fmtprint(&fmt, "%s\t%lld\t%lld\n",
  1678. an->map[i].name, an->map[i].start, an->map[i].stop);
  1679. s = fmtstrflush(&fmt);
  1680. len = strlen(s);
  1681. if(len > ap.tabsize){
  1682. print("arena partition map too long: need %z bytes have %z\n",
  1683. (vlong)len, (vlong)ap.tabsize);
  1684. len = ap.tabsize;
  1685. }
  1686. if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */
  1687. print("arena partition map *way* too long\n");
  1688. return;
  1689. }
  1690. p = pagein(ap.tabbase, ap.tabsize);
  1691. if(memcmp(p, s, len) != 0){
  1692. print("arena partition map incorrect; rewriting.\n");
  1693. memmove(p, s, len);
  1694. }
  1695. pageout();
  1696. }
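/*
 * A standalone sketch, kept inside a comment so it is not compiled into the
 * tool: the shape of the arena partition map text that checkmap rebuilds,
 * namely a count line followed by one name, start and stop (tab-separated) per
 * arena. The names and offsets below are made up. Compile separately to run.

#include <stdio.h>

int
main(void)
{
	printf("%d\n", 2);
	printf("%s\t%lld\t%lld\n", "arenas0", 335872LL, 536906752LL);
	printf("%s\t%lld\t%lld\n", "arenas1", 536906752LL, 1073477632LL);
	return 0;
}
*/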
  1697. int mainstacksize = 512*1024;
  1698. void
  1699. threadmain(int argc, char **argv)
  1700. {
  1701. int mode;
  1702. mode = OREAD;
  1703. readonly = 1;
  1704. ARGBEGIN{
  1705. case 'U':
  1706. unseal = 1;
  1707. break;
  1708. case 'a':
  1709. arenasize = unittoull(EARGF(usage()));
  1710. break;
  1711. case 'b':
  1712. ap.blocksize = unittoull(EARGF(usage()));
  1713. break;
  1714. case 'f':
  1715. fix = 1;
  1716. mode = ORDWR;
  1717. readonly = 0;
  1718. break;
  1719. case 'n':
  1720. basename = EARGF(usage());
  1721. break;
  1722. case 'v':
  1723. verbose++;
  1724. break;
  1725. case 'x':
  1726. dumpbase = EARGF(usage());
  1727. break;
  1728. default:
  1729. usage();
  1730. }ARGEND
  1731. if(argc != 1 && argc != 2)
  1732. usage();
  1733. file = argv[0];
  1734. ventifmtinstall();
  1735. fmtinstall('z', zfmt);
  1736. fmtinstall('t', tfmt);
  1737. quotefmtinstall();
  1738. part = initpart(file, mode|ODIRECT);
  1739. if(part == nil)
  1740. sysfatal("can't open %s: %r", file);
  1741. partend = part->size;
  1742. if(isonearena()){
  1743. checkarena(0, -1);
  1744. threadexitsall(nil);
  1745. }
  1746. checkarenas(argc > 1 ? argv[1] : nil);
  1747. checkmap();
  1748. threadexitsall(nil);
  1749. }