fixarenas.c

  1. /*
  2. * Check and fix an arena partition.
  3. *
  4. * This is a lot grittier than the rest of Venti because
  5. * it can't just give up if a byte here or there is wrong.
  6. *
  7. * The rule here (hopefully followed!) is that block corruption
  8. * only ever has a local effect -- there are no blocks that you
  9. * can wipe out that will cause large portions of
  10. * uncorrupted data blocks to be useless.
  11. */
  12. #include "stdinc.h"
  13. #include "dat.h"
  14. #include "fns.h"
  15. #include "whack.h"
  16. #define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1))
  17. #pragma varargck type "z" uvlong
  18. #pragma varargck type "z" vlong
  19. #pragma varargck type "t" uint
  20. enum
  21. {
  22. K = 1024,
  23. M = 1024*1024,
  24. G = 1024*1024*1024,
  25. Block = 4096,
  26. };
  27. int debugsha1;
  28. int verbose;
  29. Part *part;
  30. char *file;
  31. char *basename;
  32. char *dumpbase;
  33. int fix;
  34. int badreads;
  35. int unseal;
  36. uchar zero[MaxDiskBlock];
  37. Arena lastarena;
  38. ArenaPart ap;
  39. uvlong arenasize;
  40. int nbadread;
  41. int nbad;
  42. uvlong partend;
  43. void checkarena(vlong, int);
  44. void
  45. usage(void)
  46. {
  47. fprint(2, "usage: fixarenas [-fUv] [-a arenasize] [-b blocksize] [-n name] [-x dumpbase] file [ranges]\n");
  48. threadexitsall(0);
  49. }
  50. /*
  51. * Format number in simplest way that is okay with unittoull.
  52. */
  53. static int
  54. zfmt(Fmt *fmt)
  55. {
  56. vlong x;
  57. x = va_arg(fmt->args, vlong);
  58. if(x == 0)
  59. return fmtstrcpy(fmt, "0");
  60. if(x%G == 0)
  61. return fmtprint(fmt, "%lldG", x/G);
  62. if(x%M == 0)
  63. return fmtprint(fmt, "%lldM", x/M);
  64. if(x%K == 0)
  65. return fmtprint(fmt, "%lldK", x/K);
  66. return fmtprint(fmt, "%lld", x);
  67. }
  68. /*
  69. * Format time like ctime without newline.
  70. */
  71. static int
  72. tfmt(Fmt *fmt)
  73. {
  74. uint t;
  75. char buf[30];
  76. t = va_arg(fmt->args, uint);
  77. strcpy(buf, ctime(t));
  78. buf[28] = 0;
  79. return fmtstrcpy(fmt, buf);
  80. }
  81. /*
  82. * Coalesce messages about unreadable sectors into larger ranges.
  83. * bad(nil, 0, 0) flushes the buffer.
  84. */
  85. static void
  86. bad(char *msg, vlong o, int len)
  87. {
  88. static vlong lb0, lb1;
  89. static char *lmsg;
  90. if(msg == nil)
  91. msg = lmsg;
  92. if(o == -1){
  93. lmsg = nil;
  94. lb0 = 0;
  95. lb1 = 0;
  96. return;
  97. }
  98. if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
  99. if(lb0 != lb1)
  100. print("%s %#llux+%#llux (%,lld+%,lld)\n",
  101. lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
  102. lb0 = o;
  103. }
  104. lmsg = msg;
  105. lb1 = o+len;
  106. }
  107. /*
  108. * Read in the len bytes of data at the offset. If we can't for whatever reason,
  109. * fill it with garbage but print an error.
  110. */
  111. static uchar*
  112. readdisk(uchar *buf, vlong offset, int len)
  113. {
  114. int i, j, k, n;
  115. if(offset >= partend){
  116. memset(buf, 0xFB, len);
  117. return buf;
  118. }
  119. if(offset+len > partend){
  120. memset(buf, 0xFB, len);
  121. len = partend - offset;
  122. }
  123. if(readpart(part, offset, buf, len) >= 0)
  124. return buf;
  125. /*
  126. * The read failed. Clear the buffer to nonsense, and
  127. * then try reading in smaller pieces. If that fails,
  128. * read in even smaller pieces. And so on down to sectors.
  129. */
  130. memset(buf, 0xFD, len);
  131. for(i=0; i<len; i+=64*K){
  132. n = 64*K;
  133. if(i+n > len)
  134. n = len-i;
  135. if(readpart(part, offset+i, buf+i, n) >= 0)
  136. continue;
  137. for(j=i; j<len && j<i+64*K; j+=4*K){
  138. n = 4*K;
  139. if(j+n > len)
  140. n = len-j;
  141. if(readpart(part, offset+j, buf+j, n) >= 0)
  142. continue;
  143. for(k=j; k<len && k<j+4*K; k+=512){
  144. if(readpart(part, offset+k, buf+k, 512) >= 0)
  145. continue;
  146. bad("disk read failed at", k, 512);
  147. badreads++;
  148. }
  149. }
  150. }
  151. bad(nil, 0, 0);
  152. return buf;
  153. }
  154. /*
  155. * Buffer to support running SHA1 hash of the disk.
  156. */
  157. typedef struct Shabuf Shabuf;
  158. struct Shabuf
  159. {
  160. int fd;
  161. vlong offset;
  162. DigestState state;
  163. int rollback;
  164. vlong r0;
  165. DigestState *hist;
  166. int nhist;
  167. };
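/*
 * Divert a copy of all data fed to the hash into the named file,
 * for debugging.
 */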
  168. void
  169. sbdebug(Shabuf *sb, char *file)
  170. {
  171. int fd;
  172. if(sb->fd > 0){
  173. close(sb->fd);
  174. sb->fd = 0;
  175. }
  176. if((fd = create(file, OWRITE, 0666)) < 0)
  177. return;
  178. if(fd == 0){
  179. fd = dup(fd, -1);
  180. close(0);
  181. }
  182. sb->fd = fd;
  183. }
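/*
 * Absorb the len bytes at p (at the given disk offset) into the running hash,
 * skipping anything before sb->offset. With rollback set, checkpoint the
 * SHA1 state every 4M so sbrollback can back up cheaply.
 */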
  184. void
  185. sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
  186. {
  187. int n, x;
  188. vlong o;
  189. if(sb->rollback && !sb->hist){
  190. sb->r0 = offset;
  191. sb->nhist = 1;
  192. sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
  193. memset(sb->hist, 0, sizeof sb->hist[0]);
  194. }
  195. if(sb->r0 == 0)
  196. sb->r0 = offset;
  197. if(sb->offset < offset || sb->offset >= offset+len){
  198. if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
  199. p, offset, len, sb->offset);
  200. return;
  201. }
  202. x = sb->offset - offset;
  203. if(0) print("sbupdate %p %#llux+%d skip %d\n",
  204. sb, offset, len, x);
  205. if(x){
  206. p += x;
  207. offset += x;
  208. len -= x;
  209. }
  210. assert(sb->offset == offset);
  211. if(sb->fd > 0)
  212. pwrite(sb->fd, p, len, offset - sb->r0);
  213. if(!sb->rollback){
  214. sha1(p, len, nil, &sb->state);
  215. sb->offset += len;
  216. return;
  217. }
  218. /* save state every 4M so we can roll back quickly */
  219. o = offset - sb->r0;
  220. while(len > 0){
  221. n = 4*M - o%(4*M);
  222. if(n > len)
  223. n = len;
  224. sha1(p, n, nil, &sb->state);
  225. sb->offset += n;
  226. o += n;
  227. p += n;
  228. len -= n;
  229. if(o%(4*M) == 0){
  230. x = o/(4*M);
  231. if(x >= sb->nhist){
  232. if(x != sb->nhist)
  233. print("oops! x=%d nhist=%d\n", x, sb->nhist);
  234. sb->nhist += 32;
  235. sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
  236. }
  237. sb->hist[x] = sb->state;
  238. }
  239. }
  240. }
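/*
 * Advance the hash to eoffset by reading the intervening disk directly.
 */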
  241. void
  242. sbdiskhash(Shabuf *sb, vlong eoffset)
  243. {
  244. static uchar dbuf[4*M];
  245. int n;
  246. while(sb->offset < eoffset){
  247. n = sizeof dbuf;
  248. if(sb->offset+n > eoffset)
  249. n = eoffset - sb->offset;
  250. readdisk(dbuf, sb->offset, n);
  251. sbupdate(sb, dbuf, sb->offset, n);
  252. }
  253. }
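/*
 * Roll the hash state back to the last 4M checkpoint at or before offset.
 */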
  254. void
  255. sbrollback(Shabuf *sb, vlong offset)
  256. {
  257. int x;
  258. vlong o;
  259. Dir d;
  260. if(!sb->rollback || !sb->r0){
  261. print("cannot rollback sha\n");
  262. return;
  263. }
  264. if(offset >= sb->offset)
  265. return;
  266. o = offset - sb->r0;
  267. x = o/(4*M);
  268. if(x >= sb->nhist){
  269. print("cannot rollback sha\n");
  270. return;
  271. }
  272. sb->state = sb->hist[x];
  273. sb->offset = sb->r0 + x*4*M;
  274. assert(sb->offset <= offset);
  275. if(sb->fd > 0){
  276. nulldir(&d);
  277. d.length = sb->offset - sb->r0;
  278. dirfwstat(sb->fd, &d);
  279. }
  280. }
  281. void
  282. sbscore(Shabuf *sb, uchar *score)
  283. {
  284. if(sb->hist){
  285. free(sb->hist);
  286. sb->hist = nil;
  287. }
  288. sha1(nil, 0, score, &sb->state);
  289. }
  290. /*
  291. * If we're fixing arenas, then editing this memory edits the disk!
  292. * It will be written back out as new data is paged in.
  293. */
  294. uchar buf[4*M];
  295. uchar sbuf[4*M];
  296. vlong bufoffset;
  297. int buflen;
  298. static void pageout(void);
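/*
 * Page the given disk range into buf, first writing back (via pageout)
 * any changes made to the previously paged-in range.
 */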
  299. static uchar*
  300. pagein(vlong offset, int len)
  301. {
  302. pageout();
  303. if(offset >= partend){
  304. memset(buf, 0xFB, sizeof buf);
  305. return buf;
  306. }
  307. if(offset+len > partend){
  308. memset(buf, 0xFB, sizeof buf);
  309. len = partend - offset;
  310. }
  311. bufoffset = offset;
  312. buflen = len;
  313. readdisk(buf, offset, len);
  314. memmove(sbuf, buf, len);
  315. return buf;
  316. }
  317. static void
  318. pageout(void)
  319. {
  320. if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
  321. buflen = 0;
  322. return;
  323. }
  324. if(writepart(part, bufoffset, buf, buflen) < 0)
  325. print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
  326. bufoffset, buflen, bufoffset, buflen);
  327. buflen = 0;
  328. }
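/*
 * Zero a range of disk, paging in whole blocks around the edges so that
 * neighboring data survives; restores the previously paged-in range when done.
 */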
  329. static void
  330. zerorange(vlong offset, int len)
  331. {
  332. int i;
  333. vlong ooff;
  334. int olen;
  335. enum { MinBlock = 4*K, MaxBlock = 8*K };
  336. if(0)
  337. if(bufoffset <= offset && offset+len <= bufoffset+buflen){
  338. memset(buf+(offset-bufoffset), 0, len);
  339. return;
  340. }
  341. ooff = bufoffset;
  342. olen = buflen;
  343. i = offset%MinBlock;
  344. if(i+len < MaxBlock){
  345. pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
  346. memset(buf+i, 0, len);
  347. }else{
  348. pagein(offset-i, MaxBlock);
  349. memset(buf+i, 0, MaxBlock-i);
  350. offset += MaxBlock-i;
  351. len -= MaxBlock-i;
  352. while(len >= MaxBlock){
  353. pagein(offset, MaxBlock);
  354. memset(buf, 0, MaxBlock);
  355. offset += MaxBlock;
  356. len -= MaxBlock;
  357. }
  358. pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
  359. memset(buf, 0, len);
  360. }
  361. pagein(ooff, olen);
  362. }
  363. /*
  364. * read/write integers
  365. *
  366. static void
  367. p16(uchar *p, u16int u)
  368. {
  369. p[0] = (u>>8) & 0xFF;
  370. p[1] = u & 0xFF;
  371. }
  372. */
  373. static u16int
  374. u16(uchar *p)
  375. {
  376. return (p[0]<<8)|p[1];
  377. }
  378. static void
  379. p32(uchar *p, u32int u)
  380. {
  381. p[0] = (u>>24) & 0xFF;
  382. p[1] = (u>>16) & 0xFF;
  383. p[2] = (u>>8) & 0xFF;
  384. p[3] = u & 0xFF;
  385. }
  386. static u32int
  387. u32(uchar *p)
  388. {
  389. return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
  390. }
  391. /*
  392. static void
  393. p64(uchar *p, u64int u)
  394. {
  395. p32(p, u>>32);
  396. p32(p, u);
  397. }
  398. */
  399. static u64int
  400. u64(uchar *p)
  401. {
  402. return ((u64int)u32(p)<<32) | u32(p+4);
  403. }
  404. static int
  405. vlongcmp(const void *va, const void *vb)
  406. {
  407. vlong a, b;
  408. a = *(vlong*)va;
  409. b = *(vlong*)vb;
  410. if(a < b)
  411. return -1;
  412. if(a > b)
  413. return 1;
  414. return 0;
  415. }
  416. /* D and S are in draw.h */
  417. #define D VD
  418. #define S VS
  419. enum
  420. {
  421. D = 0x10000,
  422. Z = 0x20000,
  423. S = 0x30000,
  424. T = 0x40000,
  425. N = 0xFFFF
  426. };
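/*
 * Field tables for the on-disk structures, used by showdiffs.
 * The low 16 bits (N) of Info.len give the field size in bytes;
 * D, Z, S and T select decimal, size, string and time formatting.
 */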
  427. typedef struct Info Info;
  428. struct Info
  429. {
  430. int len;
  431. char *name;
  432. };
  433. Info partinfo[] = {
  434. 4, "magic",
  435. D|4, "version",
  436. Z|4, "blocksize",
  437. 4, "arenabase",
  438. 0
  439. };
  440. Info headinfo4[] = {
  441. 4, "magic",
  442. D|4, "version",
  443. S|ANameSize, "name",
  444. Z|4, "blocksize",
  445. Z|8, "size",
  446. 0
  447. };
  448. Info headinfo5[] = {
  449. 4, "magic",
  450. D|4, "version",
  451. S|ANameSize, "name",
  452. Z|4, "blocksize",
  453. Z|8, "size",
  454. 4, "clumpmagic",
  455. 0
  456. };
  457. Info tailinfo4[] = {
  458. 4, "magic",
  459. D|4, "version",
  460. S|ANameSize, "name",
  461. D|4, "clumps",
  462. D|4, "cclumps",
  463. T|4, "ctime",
  464. T|4, "wtime",
  465. D|8, "used",
  466. D|8, "uncsize",
  467. 1, "sealed",
  468. 0
  469. };
  470. Info tailinfo4a[] = {
  471. /* tailinfo 4 */
  472. 4, "magic",
  473. D|4, "version",
  474. S|ANameSize, "name",
  475. D|4, "clumps",
  476. D|4, "cclumps",
  477. T|4, "ctime",
  478. T|4, "wtime",
  479. D|8, "used",
  480. D|8, "uncsize",
  481. 1, "sealed",
  482. /* mem stats */
  483. 1, "extension",
  484. D|4, "mem.clumps",
  485. D|4, "mem.cclumps",
  486. D|8, "mem.used",
  487. D|8, "mem.uncsize",
  488. 1, "mem.sealed",
  489. 0
  490. };
  491. Info tailinfo5[] = {
  492. 4, "magic",
  493. D|4, "version",
  494. S|ANameSize, "name",
  495. D|4, "clumps",
  496. D|4, "cclumps",
  497. T|4, "ctime",
  498. T|4, "wtime",
  499. 4, "clumpmagic",
  500. D|8, "used",
  501. D|8, "uncsize",
  502. 1, "sealed",
  503. 0
  504. };
  505. Info tailinfo5a[] = {
  506. /* tailinfo 5 */
  507. 4, "magic",
  508. D|4, "version",
  509. S|ANameSize, "name",
  510. D|4, "clumps",
  511. D|4, "cclumps",
  512. T|4, "ctime",
  513. T|4, "wtime",
  514. 4, "clumpmagic",
  515. D|8, "used",
  516. D|8, "uncsize",
  517. 1, "sealed",
  518. /* mem stats */
  519. 1, "extension",
  520. D|4, "mem.clumps",
  521. D|4, "mem.cclumps",
  522. D|8, "mem.used",
  523. D|8, "mem.uncsize",
  524. 1, "mem.sealed",
  525. 0
  526. };
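/*
 * Print the fields that differ between the correct structure (want)
 * and the copy on disk (have), as described by info.
 */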
  527. void
  528. showdiffs(uchar *want, uchar *have, int len, Info *info)
  529. {
  530. int n;
  531. while(len > 0 && (n=info->len&N) > 0){
  532. if(memcmp(have, want, n) != 0){
  533. switch(info->len){
  534. case 1:
  535. print("\t%s: correct=%d disk=%d\n",
  536. info->name, *want, *have);
  537. break;
  538. case 4:
  539. print("\t%s: correct=%#ux disk=%#ux\n",
  540. info->name, u32(want), u32(have));
  541. break;
  542. case D|4:
  543. print("\t%s: correct=%,ud disk=%,ud\n",
  544. info->name, u32(want), u32(have));
  545. break;
  546. case T|4:
  547. print("\t%s: correct=%t\n\t\tdisk=%t\n",
  548. info->name, u32(want), u32(have));
  549. break;
  550. case Z|4:
  551. print("\t%s: correct=%z disk=%z\n",
  552. info->name, (uvlong)u32(want), (uvlong)u32(have));
  553. break;
  554. case D|8:
  555. print("\t%s: correct=%,lld disk=%,lld\n",
  556. info->name, u64(want), u64(have));
  557. break;
  558. case Z|8:
  559. print("\t%s: correct=%z disk=%z\n",
  560. info->name, u64(want), u64(have));
  561. break;
  562. case S|ANameSize:
  563. print("\t%s: correct=%s disk=%.*s\n",
  564. info->name, (char*)want,
  565. utfnlen((char*)have, ANameSize-1),
  566. (char*)have);
  567. break;
  568. default:
  569. print("\t%s: correct=%.*H disk=%.*H\n",
  570. info->name, n, want, n, have);
  571. break;
  572. }
  573. }
  574. have += n;
  575. want += n;
  576. len -= n;
  577. info++;
  578. }
  579. if(len > 0 && memcmp(have, want, len) != 0){
  580. if(memcmp(want, zero, len) != 0)
  581. print("!!\textra want data in showdiffs (bug in fixarenas)\n");
  582. else
  583. print("\tnon-zero data on disk after structure\n");
  584. if(verbose > 1){
  585. print("want: %.*H\n", len, want);
  586. print("have: %.*H\n", len, have);
  587. }
  588. }
  589. }
  590. /*
  591. * Does part begin with an arena?
  592. */
  593. int
  594. isonearena(void)
  595. {
  596. return u32(pagein(0, Block)) == ArenaHeadMagic;
  597. }
  598. static int tabsizes[] = { 16*1024, 64*1024, 512*1024, };
  599. /*
  600. * Poke around on the disk to guess what the ArenaPart numbers are.
  601. */
  602. void
  603. guessgeometry(void)
  604. {
  605. int i, j, n, bestn, ndiff, nhead, ntail;
  606. uchar *p, *ep, *sp;
  607. u64int diff[100], head[20], tail[20];
  608. u64int offset, bestdiff;
  609. ap.version = ArenaPartVersion;
  610. if(arenasize == 0 || ap.blocksize == 0){
  611. /*
  612. * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
  613. * Instead, look for the individual arena headers and tails, which there
  614. * are many of, and once we've seen enough, infer the spacing.
  615. *
  616. * Of course, nothing in the file format requires that arenas be evenly
  617. * spaced, but fmtarenas always does that for us.
  618. */
  619. nhead = 0;
  620. ntail = 0;
  621. for(offset=PartBlank; offset<partend; offset+=4*M){
  622. p = pagein(offset, 4*M);
  623. for(sp=p, ep=p+4*M; p<ep; p+=K){
  624. if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
  625. if(verbose)
  626. print("arena head at %#llx\n", offset+(p-sp));
  627. head[nhead++] = offset+(p-sp);
  628. }
  629. if(u32(p) == ArenaMagic && ntail < nelem(tail)){
  630. tail[ntail++] = offset+(p-sp);
  631. if(verbose)
  632. print("arena tail at %#llx\n", offset+(p-sp));
  633. }
  634. }
  635. if(nhead == nelem(head) && ntail == nelem(tail))
  636. break;
  637. }
  638. if(nhead < 3 && ntail < 3)
  639. sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
  640. /*
  641. * Arena size is likely the most common
  642. * inter-head or inter-tail spacing.
  643. */
  644. ndiff = 0;
  645. for(i=1; i<nhead; i++)
  646. diff[ndiff++] = head[i] - head[i-1];
  647. for(i=1; i<ntail; i++)
  648. diff[ndiff++] = tail[i] - tail[i-1];
  649. qsort(diff, ndiff, sizeof diff[0], vlongcmp);
  650. bestn = 0;
  651. bestdiff = 0;
  652. for(i=1, n=1; i<=ndiff; i++, n++){
  653. if(i==ndiff || diff[i] != diff[i-1]){
  654. if(n > bestn){
  655. bestn = n;
  656. bestdiff = diff[i-1];
  657. }
  658. n = 0;
  659. }
  660. }
  661. print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
  662. if(arenasize != 0 && arenasize != bestdiff)
  663. print("using user-specified size %z instead\n", arenasize);
  664. else
  665. arenasize = bestdiff;
  666. /*
  667. * The arena tail for an arena is arenasize-blocksize from the head.
  668. */
  669. ndiff = 0;
  670. for(i=j=0; i<nhead && j<ntail; ){
  671. if(tail[j] < head[i]){
  672. j++;
  673. continue;
  674. }
  675. if(tail[j] < head[i]+arenasize){
  676. diff[ndiff++] = head[i]+arenasize - tail[j];
  677. j++;
  678. continue;
  679. }
  680. i++;
  681. }
  682. if(ndiff < 3)
  683. sysfatal("too few intact arenas: %d head/tail pairs", ndiff);
  684. qsort(diff, ndiff, sizeof diff[0], vlongcmp);
  685. bestn = 0;
  686. bestdiff = 0;
  687. for(i=1, n=1; i<=ndiff; i++, n++){
  688. if(i==ndiff || diff[i] != diff[i-1]){
  689. if(n > bestn){
  690. bestn = n;
  691. bestdiff = diff[i-1];
  692. }
  693. n = 0;
  694. }
  695. }
  696. print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
  697. if(ap.blocksize != 0 && ap.blocksize != bestdiff)
  698. print("using user-specified size %z instead\n", (vlong)ap.blocksize);
  699. else
  700. ap.blocksize = bestdiff;
  701. if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
  702. sysfatal("block size not a power of two");
  703. if(ap.blocksize > MaxDiskBlock)
  704. sysfatal("block size too big (max=%d)", MaxDiskBlock);
  705. /*
  706. * Use head/tail information to deduce arena base.
  707. */
  708. ndiff = 0;
  709. for(i=0; i<nhead; i++)
  710. diff[ndiff++] = head[i]%arenasize;
  711. for(i=0; i<ntail; i++)
  712. diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
  713. qsort(diff, ndiff, sizeof diff[0], vlongcmp);
  714. bestn = 0;
  715. bestdiff = 0;
  716. for(i=1, n=1; i<=ndiff; i++, n++){
  717. if(i==ndiff || diff[i] != diff[i-1]){
  718. if(n > bestn){
  719. bestn = n;
  720. bestdiff = diff[i-1];
  721. }
  722. n = 0;
  723. }
  724. }
  725. ap.arenabase = bestdiff;
  726. }
  727. ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
  728. /*
  729. * XXX pick up table, check arenabase.
  730. * XXX pick up table, record base name.
  731. */
  732. /*
  733. * Somewhat standard computation.
  734. * Fmtarenas used to use 64k tab, now uses 512k tab.
  735. */
  736. if(ap.arenabase == 0){
  737. for(i=0; i<nelem(tabsizes); i++){
  738. ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
  739. p = pagein(ap.arenabase, Block);
  740. if(u32(p) == ArenaHeadMagic)
  741. break;
  742. }
  743. }
  744. p = pagein(ap.arenabase, Block);
  745. print("arena base likely %z%s\n", (vlong)ap.arenabase,
  746. u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
  747. ap.tabsize = ap.arenabase - ap.tabbase;
  748. }
  749. /*
  750. * Check the arena partition blocks and then the arenas listed in range.
  751. */
  752. void
  753. checkarenas(char *range)
  754. {
  755. char *s, *t;
  756. int i, lo, hi, narena;
  757. uchar dbuf[HeadSize];
  758. uchar *p;
  759. guessgeometry();
  760. partend -= partend%ap.blocksize;
  761. memset(dbuf, 0, sizeof dbuf);
  762. packarenapart(&ap, dbuf);
  763. p = pagein(PartBlank, Block);
  764. if(memcmp(p, dbuf, HeadSize) != 0){
  765. print("on-disk arena part superblock incorrect\n");
  766. showdiffs(dbuf, p, HeadSize, partinfo);
  767. }
  768. memmove(p, dbuf, HeadSize);
  769. narena = (partend-ap.arenabase + arenasize-1)/arenasize;
  770. if(range == nil){
  771. for(i=0; i<narena; i++)
  772. checkarena(ap.arenabase+(vlong)i*arenasize, i);
  773. }else if(strcmp(range, "none") == 0){
  774. /* nothing */
  775. }else{
  776. /* parse, e.g., -4,8-9,10- */
  777. for(s=range; *s; s=t){
  778. t = strchr(s, ',');
  779. if(t)
  780. *t++ = 0;
  781. else
  782. t = s+strlen(s);
  783. if(*s == '-')
  784. lo = 0;
  785. else
  786. lo = strtol(s, &s, 0);
  787. hi = lo;
  788. if(*s == '-'){
  789. s++;
  790. if(*s == 0)
  791. hi = narena-1;
  792. else
  793. hi = strtol(s, &s, 0);
  794. }
  795. if(*s != 0){
  796. print("bad arena range: %s\n", s);
  797. continue;
  798. }
  799. for(i=lo; i<=hi; i++)
  800. checkarena(ap.arenabase+(vlong)i*arenasize, i);
  801. }
  802. }
  803. }
  804. /*
  805. * Is there a clump here at p?
  806. */
  807. static int
  808. isclump(uchar *p, Clump *cl, u32int *pmagic)
  809. {
  810. int n;
  811. u32int magic;
  812. uchar score[VtScoreSize], *bp;
  813. Unwhack uw;
  814. uchar ubuf[70*1024];
  815. bp = p;
  816. magic = u32(p);
  817. if(magic == 0)
  818. return 0;
  819. p += U32Size;
  820. cl->info.type = vtfromdisktype(*p);
  821. if(cl->info.type == 0xFF)
  822. return 0;
  823. p++;
  824. cl->info.size = u16(p);
  825. p += U16Size;
  826. cl->info.uncsize = u16(p);
  827. if(cl->info.size > cl->info.uncsize)
  828. return 0;
  829. p += U16Size;
  830. scorecp(cl->info.score, p);
  831. p += VtScoreSize;
  832. cl->encoding = *p;
  833. p++;
  834. cl->creator = u32(p);
  835. p += U32Size;
  836. cl->time = u32(p);
  837. p += U32Size;
  838. switch(cl->encoding){
  839. case ClumpENone:
  840. if(cl->info.size != cl->info.uncsize)
  841. return 0;
  842. scoremem(score, p, cl->info.size);
  843. if(scorecmp(score, cl->info.score) != 0)
  844. return 0;
  845. break;
  846. case ClumpECompress:
  847. if(cl->info.size >= cl->info.uncsize)
  848. return 0;
  849. unwhackinit(&uw);
  850. n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
  851. if(n != cl->info.uncsize)
  852. return 0;
  853. scoremem(score, ubuf, cl->info.uncsize);
  854. if(scorecmp(score, cl->info.score) != 0)
  855. return 0;
  856. break;
  857. default:
  858. return 0;
  859. }
  860. p += cl->info.size;
  861. /* it all worked out in the end */
  862. *pmagic = magic;
  863. return p - bp;
  864. }
  865. /*
  866. * All ClumpInfos seen in this arena.
  867. * Kept in binary tree so we can look up by score.
  868. */
  869. typedef struct Cit Cit;
  870. struct Cit
  871. {
  872. int left;
  873. int right;
  874. vlong corrupt;
  875. ClumpInfo ci;
  876. };
  877. Cit *cibuf;
  878. int ciroot;
  879. int ncibuf, mcibuf;
  880. void
  881. resetcibuf(void)
  882. {
  883. ncibuf = 0;
  884. ciroot = -1;
  885. }
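/*
 * Walk the tree to the link where score is or should be inserted.
 */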
  886. int*
  887. ltreewalk(int *p, uchar *score)
  888. {
  889. int i;
  890. for(;;){
  891. if(*p == -1)
  892. return p;
  893. i = scorecmp(cibuf[*p].ci.score, score);
  894. if(i == 0)
  895. return p;
  896. if(i < 0)
  897. p = &cibuf[*p].right;
  898. else
  899. p = &cibuf[*p].left;
  900. }
  901. }
  902. void
  903. addcibuf(ClumpInfo *ci, vlong corrupt)
  904. {
  905. Cit *cit;
  906. if(ncibuf == mcibuf){
  907. mcibuf += 131072;
  908. cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
  909. }
  910. cit = &cibuf[ncibuf];
  911. cit->ci = *ci;
  912. cit->left = -1;
  913. cit->right = -1;
  914. cit->corrupt = corrupt;
  915. if(!corrupt)
  916. *ltreewalk(&ciroot, ci->score) = ncibuf;
  917. ncibuf++;
  918. }
  919. void
  920. addcicorrupt(vlong len)
  921. {
  922. static ClumpInfo zci;
  923. addcibuf(&zci, len);
  924. }
  925. int
  926. haveclump(uchar *score)
  927. {
  928. int i;
  929. int p;
  930. p = ciroot;
  931. for(;;){
  932. if(p == -1)
  933. return 0;
  934. i = scorecmp(cibuf[p].ci.score, score);
  935. if(i == 0)
  936. return 1;
  937. if(i < 0)
  938. p = cibuf[p].right;
  939. else
  940. p = cibuf[p].left;
  941. }
  942. }
  943. int
  944. matchci(ClumpInfo *ci, uchar *p)
  945. {
  946. if(ci->type != vtfromdisktype(p[0]))
  947. return 0;
  948. if(ci->size != u16(p+1))
  949. return 0;
  950. if(ci->uncsize != u16(p+3))
  951. return 0;
  952. if(scorecmp(ci->score, p+5) != 0)
  953. return 0;
  954. return 1;
  955. }
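/*
 * Does the tail block at p describe a plausibly sealed arena?
 */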
  956. int
  957. sealedarena(uchar *p, int blocksize)
  958. {
  959. int v, n;
  960. v = u32(p+4);
  961. switch(v){
  962. default:
  963. return 0;
  964. case ArenaVersion4:
  965. n = ArenaSize4;
  966. break;
  967. case ArenaVersion5:
  968. n = ArenaSize5;
  969. break;
  970. }
  971. if(p[n-1] != 1){
  972. print("arena tail says not sealed\n");
  973. return 0;
  974. }
  975. if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
  976. print("arena tail followed by non-zero data\n");
  977. return 0;
  978. }
  979. if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
  980. print("arena score zero\n");
  981. return 0;
  982. }
  983. return 1;
  984. }
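/*
 * Is name well formed and does it end in the arena number n?
 */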
  985. int
  986. okayname(char *name, int n)
  987. {
  988. char buf[20];
  989. if(nameok(name) < 0)
  990. return 0;
  991. sprint(buf, "%d", n);
  992. if(n == 0)
  993. buf[0] = 0;
  994. if(strlen(name) < strlen(buf)
  995. || strcmp(name+strlen(name)-strlen(buf), buf) != 0)
  996. return 0;
  997. return 1;
  998. }
  999. int
  1000. clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
  1001. {
  1002. if(a->type != b->type)
  1003. return a->type - b->type;
  1004. if(a->size != b->size)
  1005. return a->size - b->size;
  1006. if(a->uncsize != b->uncsize)
  1007. return a->uncsize - b->uncsize;
  1008. return scorecmp(a->score, b->score);
  1009. }
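/*
 * Read the last nci clump directory entries from the end of the arena,
 * walking backward one block at a time.
 */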
  1010. ClumpInfo*
  1011. loadci(vlong offset, Arena *arena, int nci)
  1012. {
  1013. int i, j, per;
  1014. uchar *p, *sp;
  1015. ClumpInfo *bci, *ci;
  1016. per = arena->blocksize/ClumpInfoSize;
  1017. bci = vtmalloc(nci*sizeof bci[0]);
  1018. ci = bci;
  1019. offset += arena->size - arena->blocksize;
  1020. p = sp = nil;
  1021. for(i=0; i<nci; i+=per){
  1022. if(p == sp){
  1023. sp = pagein(offset-4*M, 4*M);
  1024. p = sp+4*M;
  1025. }
  1026. p -= arena->blocksize;
  1027. offset -= arena->blocksize;
  1028. for(j=0; j<per && i+j<nci; j++)
  1029. unpackclumpinfo(ci++, p+j*ClumpInfoSize);
  1030. }
  1031. return bci;
  1032. }
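/*
 * Write nci clump directory entries at the end of the arena;
 * returns the offset of the first (lowest) directory block written.
 */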
  1033. vlong
  1034. writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
  1035. {
  1036. int i, j, per;
  1037. uchar *p, *sp;
  1038. per = arena->blocksize/ClumpInfoSize;
  1039. offset += arena->size - arena->blocksize;
  1040. p = sp = nil;
  1041. for(i=0; i<nci; i+=per){
  1042. if(p == sp){
  1043. sp = pagein(offset-4*M, 4*M);
  1044. p = sp+4*M;
  1045. }
  1046. p -= arena->blocksize;
  1047. offset -= arena->blocksize;
  1048. memset(p, 0, arena->blocksize);
  1049. for(j=0; j<per && i+j<nci; j++)
  1050. packclumpinfo(ci++, p+j*ClumpInfoSize);
  1051. }
  1052. pageout();
  1053. return offset;
  1054. }
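/*
 * Fill in whatever head and arena fields can be salvaged or deduced
 * without scanning the data: sizes, block size, version, clump magic, name.
 */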
  1055. void
  1056. loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
  1057. {
  1058. char dname[ANameSize];
  1059. static char lastbase[ANameSize];
  1060. uchar *p;
  1061. Arena oarena;
  1062. ArenaHead ohead;
  1063. /*
  1064. * Fmtarenas makes all arenas the same size
  1065. * except the last, which may be smaller.
  1066. * It uses the same block size for arenas as for
  1067. * the arena partition blocks.
  1068. */
  1069. arena->size = arenasize;
  1070. if(offset0+arena->size > partend)
  1071. arena->size = partend - offset0;
  1072. head->size = arena->size;
  1073. arena->blocksize = ap.blocksize;
  1074. head->blocksize = arena->blocksize;
  1075. /*
  1076. * Look for clump magic and name in head/tail blocks.
  1077. * All the other info we will reconstruct just in case.
  1078. */
  1079. p = pagein(offset0, arena->blocksize);
  1080. memset(&ohead, 0, sizeof ohead);
  1081. if(unpackarenahead(&ohead, p) >= 0){
  1082. head->version = ohead.version;
  1083. head->clumpmagic = ohead.clumpmagic;
  1084. if(okayname(ohead.name, anum))
  1085. strcpy(head->name, ohead.name);
  1086. }
  1087. p = pagein(offset0+arena->size-arena->blocksize,
  1088. arena->blocksize);
  1089. memset(&oarena, 0, sizeof oarena);
  1090. if(unpackarena(&oarena, p) >= 0){
  1091. arena->version = oarena.version;
  1092. arena->clumpmagic = oarena.clumpmagic;
  1093. if(okayname(oarena.name, anum))
  1094. strcpy(arena->name, oarena.name);
  1095. arena->diskstats.clumps = oarena.diskstats.clumps;
  1096. print("old arena: sealed=%d\n", oarena.diskstats.sealed);
  1097. arena->diskstats.sealed = oarena.diskstats.sealed;
  1098. }
  1099. /* Head trumps arena. */
  1100. if(head->version){
  1101. arena->version = head->version;
  1102. arena->clumpmagic = head->clumpmagic;
  1103. }
  1104. if(arena->version == 0)
  1105. arena->version = ArenaVersion5;
  1106. if(basename){
  1107. if(anum == -1)
  1108. snprint(arena->name, ANameSize, "%s", basename);
  1109. else
  1110. snprint(arena->name, ANameSize, "%s%d", basename, anum);
  1111. }else if(lastbase[0])
  1112. snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
  1113. else if(head->name[0])
  1114. strcpy(arena->name, head->name);
  1115. else if(arena->name[0] == 0)
  1116. sysfatal("cannot determine base name for arena; use -n");
  1117. strcpy(lastbase, arena->name);
  1118. sprint(dname, "%d", anum);
  1119. lastbase[strlen(lastbase)-strlen(dname)] = 0;
  1120. /* Was working in arena, now copy to head. */
  1121. head->version = arena->version;
  1122. memmove(head->name, arena->name, sizeof head->name);
  1123. head->blocksize = arena->blocksize;
  1124. head->size = arena->size;
  1125. }
  1126. void
  1127. shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
  1128. {
  1129. uchar headbuf[MaxDiskBlock];
  1130. sb->offset = offset0;
  1131. memset(headbuf, 0, sizeof headbuf);
  1132. packarenahead(head, headbuf);
  1133. sbupdate(sb, headbuf, offset0, head->blocksize);
  1134. }
  1135. u32int
  1136. newclumpmagic(int version)
  1137. {
  1138. u32int m;
  1139. if(version == ArenaVersion4)
  1140. return _ClumpMagic;
  1141. do{
  1142. m = fastrand();
  1143. }while(m==0 || m == _ClumpMagic);
  1144. return m;
  1145. }
  1146. /*
  1147. * Poke around in the arena to find the clump data
  1148. * and compute the relevant statistics.
  1149. */
  1150. void
  1151. guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
  1152. uchar *oldscore, uchar *score)
  1153. {
  1154. uchar dbuf[MaxDiskBlock];
  1155. int needtozero, clumps, nb1, nb2, minclumps;
  1156. int inbad, n, ncib, printed, sealing, smart;
  1157. u32int magic;
  1158. uchar *sp, *ep, *p;
  1159. vlong boffset, eoffset, lastclumpend, leaked;
  1160. vlong offset, toffset, totalcorrupt, v;
  1161. Clump cl;
  1162. ClumpInfo *bci, *ci, *eci, *xci;
  1163. Cit *bcit, *cit, *ecit;
  1164. Shabuf oldsha, newsha;
  1165. /*
  1166. * We expect to find an arena, with data, between offset
  1167. * and offset+arenasize. With any luck, the data starts at
  1168. * offset+ap.blocksize. The blocks have variable size and
  1169. * aren't padded at all, which doesn't give us any alignment
  1170. * constraints. The blocks are compressed or high entropy,
  1171. * but the headers are pretty low entropy (except the score):
  1172. *
  1173. * type[1] (range 0 thru 9, 13)
  1174. * size[2]
  1175. * uncsize[2] (<= size)
  1176. *
  1177. * so we can look for these. We check the scores as we go,
  1178. * so we can't make any wrong turns. If we find ourselves
  1179. * in a dead end, scan forward looking for a new start.
  1180. */
  1181. resetcibuf();
  1182. memset(head, 0, sizeof *head);
  1183. memset(arena, 0, sizeof *arena);
  1184. memset(oldscore, 0, VtScoreSize);
  1185. memset(score, 0, VtScoreSize);
  1186. memset(&oldsha, 0, sizeof oldsha);
  1187. memset(&newsha, 0, sizeof newsha);
  1188. newsha.rollback = 1;
  1189. if(0){
  1190. sbdebug(&oldsha, "old.sha");
  1191. sbdebug(&newsha, "new.sha");
  1192. }
  1193. loadarenabasics(offset0, anum, head, arena);
  1194. /* start the clump hunt */
  1195. clumps = 0;
  1196. totalcorrupt = 0;
  1197. sealing = 1;
  1198. boffset = offset0 + arena->blocksize;
  1199. offset = boffset;
  1200. eoffset = offset0+arena->size - arena->blocksize;
  1201. toffset = eoffset;
  1202. sp = pagein(offset0, 4*M);
  1203. if(arena->diskstats.sealed){
  1204. oldsha.offset = offset0;
  1205. sbupdate(&oldsha, sp, offset0, 4*M);
  1206. }
  1207. ep = sp+4*M;
  1208. p = sp + (boffset - offset0);
  1209. ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */
  1210. lastclumpend = offset;
  1211. nbad = 0;
  1212. inbad = 0;
  1213. needtozero = 0;
  1214. minclumps = 0;
  1215. while(offset < eoffset){
  1216. /*
  1217. * Shift buffer if we're running out of room.
  1218. */
  1219. if(p+70*K >= ep){
  1220. /*
  1221. * Start the post SHA1 buffer. By now we should know the
  1222. * clumpmagic and arena version, so we can create a
  1223. * correct head block to get things going.
  1224. */
  1225. if(sealing && fix && newsha.offset == 0){
  1226. newsha.offset = offset0;
  1227. if(arena->clumpmagic == 0){
  1228. if(arena->version == 0)
  1229. arena->version = ArenaVersion5;
  1230. arena->clumpmagic = newclumpmagic(arena->version);
  1231. }
  1232. head->clumpmagic = arena->clumpmagic;
  1233. shahead(&newsha, offset0, head);
  1234. }
  1235. n = 4*M-256*K;
  1236. if(sealing && fix){
  1237. sbdiskhash(&newsha, bufoffset);
  1238. sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
  1239. }
  1240. pagein(bufoffset+n, 4*M);
  1241. p -= n;
  1242. if(arena->diskstats.sealed)
  1243. sbupdate(&oldsha, buf, bufoffset, 4*M);
  1244. }
  1245. /*
  1246. * Check for a clump at p, which is at offset in the disk.
  1247. * Duplicate clumps happen in corrupted disks
  1248. * (the same pattern gets written many times in a row)
  1249. * and should never happen during regular use.
  1250. */
  1251. magic = 0;
  1252. if((n = isclump(p, &cl, &magic)) > 0){
  1253. /*
  1254. * If we were in the middle of some corrupted data,
  1255. * flush a warning about it and then add any clump
  1256. * info blocks as necessary.
  1257. */
  1258. if(inbad){
  1259. inbad = 0;
  1260. v = offset-lastclumpend;
  1261. if(needtozero){
  1262. zerorange(lastclumpend, v);
  1263. sbrollback(&newsha, lastclumpend);
  1264. print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
  1265. lastclumpend, v, v);
  1266. }
  1267. addcicorrupt(v);
  1268. totalcorrupt += v;
  1269. nb1 = (minclumps+ncib-1)/ncib;
  1270. minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
  1271. nb2 = (minclumps+ncib-1)/ncib;
  1272. eoffset -= (nb2-nb1)*arena->blocksize;
  1273. }
  1274. if(haveclump(cl.info.score))
  1275. print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
  1276. /*
  1277. * If clumps use different magic numbers, we don't care.
  1278. * We'll just use the first one we find and make the others
  1279. * follow suit.
  1280. */
  1281. if(arena->clumpmagic == 0){
  1282. print("clump type %d size %d score %V magic %x\n",
  1283. cl.info.type, cl.info.size, cl.info.score, magic);
  1284. arena->clumpmagic = magic;
  1285. if(magic == _ClumpMagic)
  1286. arena->version = ArenaVersion4;
  1287. else
  1288. arena->version = ArenaVersion5;
  1289. }
  1290. if(magic != arena->clumpmagic)
  1291. p32(p, arena->clumpmagic);
  1292. if(clumps == 0)
  1293. arena->ctime = cl.time;
  1294. /*
  1295. * Record the clump, update arena stats,
  1296. * grow clump info blocks if needed.
  1297. */
  1298. if(verbose > 1)
  1299. print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
  1300. clumps, cl.info.type, cl.info.score, offset, n, n);
  1301. addcibuf(&cl.info, 0);
  1302. if(minclumps%ncib == 0)
  1303. eoffset -= arena->blocksize;
  1304. minclumps++;
  1305. clumps++;
  1306. if(cl.encoding != ClumpENone)
  1307. arena->diskstats.cclumps++;
  1308. arena->diskstats.uncsize += cl.info.uncsize;
  1309. arena->wtime = cl.time;
  1310. /*
  1311. * Move to next clump.
  1312. */
  1313. offset += n;
  1314. p += n;
  1315. lastclumpend = offset;
  1316. }else{
  1317. /*
  1318. * Overwrite malformed clump data with zeros later.
  1319. * For now, just record whether it needs to be overwritten.
  1320. * Bad regions must be of size at least ClumpSize.
  1321. * Postponing the overwriting keeps us from writing past
  1322. * the end of the arena data (which might be directory data)
  1323. * with zeros.
  1324. */
  1325. if(!inbad){
  1326. inbad = 1;
  1327. needtozero = 0;
  1328. if(memcmp(p, zero, ClumpSize) != 0)
  1329. needtozero = 1;
  1330. p += ClumpSize;
  1331. offset += ClumpSize;
  1332. nbad++;
  1333. }else{
  1334. if(*p != 0)
  1335. needtozero = 1;
  1336. p++;
  1337. offset++;
  1338. }
  1339. }
  1340. }
  1341. pageout();
  1342. if(verbose)
  1343. print("readable clumps: %d; min. directory entries: %d\n",
  1344. clumps, minclumps);
  1345. arena->diskstats.used = lastclumpend - boffset;
  1346. leaked = eoffset - lastclumpend;
  1347. if(verbose)
  1348. print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
  1349. boffset, lastclumpend, arena->diskstats.used, leaked);
  1350. /*
  1351. * Finish the SHA1 of the old data.
  1352. */
  1353. if(arena->diskstats.sealed){
  1354. sbdiskhash(&oldsha, toffset);
  1355. readdisk(dbuf, toffset, arena->blocksize);
  1356. scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
  1357. sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
  1358. sbscore(&oldsha, oldscore);
  1359. }
  1360. /*
  1361. * If we still don't know the clump magic, the arena
  1362. * must be empty. It still needs a value, so make
  1363. * something up.
  1364. */
  1365. if(arena->version == 0)
  1366. arena->version = ArenaVersion5;
  1367. if(arena->clumpmagic == 0){
  1368. if(arena->version == ArenaVersion4)
  1369. arena->clumpmagic = _ClumpMagic;
  1370. else{
  1371. do
  1372. arena->clumpmagic = fastrand();
  1373. while(arena->clumpmagic==_ClumpMagic
  1374. ||arena->clumpmagic==0);
  1375. }
  1376. head->clumpmagic = arena->clumpmagic;
  1377. }
  1378. /*
  1379. * Guess at number of clumpinfo blocks to load.
  1380. * If we guess high, it's no big deal. If we guess low,
  1381. * we'll be forced into rewriting the whole directory.
  1382. * Still not such a big deal.
  1383. */
  1384. if(clumps == 0 || arena->diskstats.used == totalcorrupt)
  1385. goto Nocib;
  1386. if(clumps < arena->diskstats.clumps)
  1387. clumps = arena->diskstats.clumps;
  1388. if(clumps < ncibuf)
  1389. clumps = ncibuf;
  1390. clumps += totalcorrupt/
  1391. ((arena->diskstats.used - totalcorrupt)/clumps);
  1392. clumps += totalcorrupt/2000;
  1393. if(clumps < minclumps)
  1394. clumps = minclumps;
  1395. clumps += ncib-1;
  1396. clumps -= clumps%ncib;
  1397. /*
  1398. * Can't write into the actual data.
  1399. */
  1400. v = offset0 + arena->size - arena->blocksize;
  1401. v -= (clumps+ncib-1)/ncib * arena->blocksize;
  1402. if(v < lastclumpend){
  1403. v = offset0 + arena->size - arena->blocksize;
  1404. clumps = (v-lastclumpend)/arena->blocksize * ncib;
  1405. }
  1406. if(clumps < minclumps)
  1407. print("cannot happen?\n");
  1408. /*
  1409. * Check clumpinfo blocks against directory we created.
  1410. * The tricky part is handling the corrupt sections of arena.
  1411. * If possible, we remark just the affected directory entries
  1412. * rather than slide everything down.
  1413. *
  1414. * Allocate clumps+1 blocks and check that we don't need
  1415. * the last one at the end.
  1416. */
  1417. bci = loadci(offset0, arena, clumps+1);
  1418. eci = bci+clumps+1;
  1419. bcit = cibuf;
  1420. ecit = cibuf+ncibuf;
  1421. smart = 1;
  1422. Again:
  1423. nbad = 0;
  1424. ci = bci;
  1425. for(cit=bcit; cit<ecit && ci<eci; cit++){
  1426. if(cit->corrupt){
  1427. vlong n, m;
  1428. if(smart){
  1429. /*
  1430. * If we can, just mark existing entries as corrupt.
  1431. */
  1432. n = cit->corrupt;
  1433. for(xci=ci; n>0 && xci<eci; xci++)
  1434. n -= ClumpSize+xci->size;
  1435. if(n > 0 || xci >= eci)
  1436. goto Dumb;
  1437. printed = 0;
  1438. for(; ci<xci; ci++){
  1439. if(verbose && ci->type != VtCorruptType){
  1440. if(!printed){
  1441. print("marking directory %d-%d as corrupt\n",
  1442. (int)(ci-bci), (int)(xci-bci));
  1443. printed = 1;
  1444. }
  1445. print("\ttype=%d size=%d uncsize=%d score=%V\n",
  1446. ci->type, ci->size, ci->uncsize, ci->score);
  1447. }
  1448. ci->type = VtCorruptType;
  1449. }
  1450. }else{
  1451. Dumb:
  1452. print("\trewriting clump directory\n");
  1453. /*
  1454. * Otherwise, blaze a new trail.
  1455. */
  1456. n = cit->corrupt;
  1457. while(n > 0 && ci < eci){
  1458. if(n < ClumpSize)
  1459. sysfatal("bad math in clump corrupt");
  1460. if(n <= VtMaxLumpSize+ClumpSize)
  1461. m = n;
  1462. else{
  1463. m = VtMaxLumpSize+ClumpSize;
  1464. if(n-m < ClumpSize)
  1465. m -= ClumpSize;
  1466. }
  1467. ci->type = VtCorruptType;
  1468. ci->size = m-ClumpSize;
  1469. ci->uncsize = m-ClumpSize;
  1470. memset(ci->score, 0, VtScoreSize);
  1471. ci++;
  1472. n -= m;
  1473. }
  1474. }
  1475. continue;
  1476. }
  1477. if(clumpinfocmp(&cit->ci, ci) != 0){
  1478. if(verbose && (smart || verbose>1)){
  1479. print("clumpinfo %d\n", (int)(ci-bci));
  1480. print("\twant: %d %d %d %V\n",
  1481. cit->ci.type, cit->ci.size,
  1482. cit->ci.uncsize, cit->ci.score);
  1483. print("\thave: %d %d %d %V\n",
  1484. ci->type, ci->size,
  1485. ci->uncsize, ci->score);
  1486. }
  1487. *ci = cit->ci;
  1488. nbad++;
  1489. }
  1490. ci++;
  1491. }
  1492. if(ci >= eci || cit < ecit){
  1493. print("ran out of space editing existing directory; rewriting\n");
  1494. print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
  1495. assert(smart); /* can't happen second time thru */
  1496. smart = 0;
  1497. goto Again;
  1498. }
  1499. assert(ci <= eci);
  1500. arena->diskstats.clumps = ci-bci;
  1501. eoffset = writeci(offset0, arena, bci, ci-bci);
  1502. if(sealing && fix)
  1503. sbrollback(&newsha, v);
  1504. print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
  1505. if(lastclumpend > eoffset)
  1506. print("arena directory overwrote blocks! cannot happen!\n");
  1507. free(bci);
  1508. if(smart && nbad)
  1509. print("arena directory has %d bad or missing entries\n", nbad);
  1510. Nocib:
  1511. if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
  1512. if(arena->diskstats.sealed)
  1513. print("unsealing arena\n");
  1514. sealing = 0;
  1515. memset(oldscore, 0, VtScoreSize);
  1516. }
  1517. /*
  1518. * Finish the SHA1 of the new data - only meaningful
  1519. * if we've been writing to disk (`fix').
  1520. */
  1521. arena->diskstats.sealed = sealing;
  1522. arena->memstats = arena->diskstats;
  1523. if(sealing && fix){
  1524. uchar tbuf[MaxDiskBlock];
  1525. sbdiskhash(&newsha, toffset);
  1526. memset(tbuf, 0, sizeof tbuf);
  1527. packarena(arena, tbuf);
  1528. sbupdate(&newsha, tbuf, toffset, arena->blocksize);
  1529. sbscore(&newsha, score);
  1530. }
  1531. }
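/*
 * Copy the raw arena at offset to the file dumpbase.anum (-x option).
 */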
  1532. void
  1533. dumparena(vlong offset, int anum, Arena *arena)
  1534. {
  1535. char buf[1000];
  1536. vlong o, e;
  1537. int fd, n;
  1538. snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
  1539. if((fd = create(buf, OWRITE, 0666)) < 0){
  1540. fprint(2, "create %s: %r\n", buf);
  1541. return;
  1542. }
  1543. e = offset+arena->size;
  1544. for(o=offset; o<e; o+=n){
  1545. n = 4*M;
  1546. if(o+n > e)
  1547. n = e-o;
  1548. if(pwrite(fd, pagein(o, n), n, o-offset) != n){
  1549. fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
  1550. return;
  1551. }
  1552. }
  1553. }
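/*
 * Check a single arena: reconstruct its header, directory and tail,
 * compare them with what is on disk, and (with -f) rewrite the disk copies.
 */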
  1554. void
  1555. checkarena(vlong offset, int anum)
  1556. {
  1557. uchar dbuf[MaxDiskBlock];
  1558. uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
  1559. Arena arena, oarena;
  1560. ArenaHead head;
  1561. Info *fmt, *fmta;
  1562. int sz;
  1563. print("# arena %d: offset %#llux\n", anum, offset);
  1564. if(offset >= partend){
  1565. print("arena offset out of bounds\n");
  1566. return;
  1567. }
  1568. guessarena(offset, anum, &head, &arena, oldscore, score);
  1569. if(verbose){
  1570. print("#\tversion=%d name=%s blocksize=%d size=%z",
  1571. head.version, head.name, head.blocksize, head.size);
  1572. if(head.clumpmagic)
  1573. print(" clumpmagic=%#.8ux", head.clumpmagic);
  1574. print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
  1575. arena.diskstats.clumps, arena.diskstats.cclumps,
  1576. arena.diskstats.used, arena.diskstats.uncsize);
  1577. print("#\tctime=%t\n", arena.ctime);
  1578. print("#\twtime=%t\n", arena.wtime);
  1579. if(arena.diskstats.sealed)
  1580. print("#\tsealed score=%V\n", score);
  1581. }
  1582. if(dumpbase){
  1583. dumparena(offset, anum, &arena);
  1584. return;
  1585. }
  1586. memset(dbuf, 0, sizeof dbuf);
  1587. packarenahead(&head, dbuf);
  1588. p = pagein(offset, arena.blocksize);
  1589. if(memcmp(dbuf, p, arena.blocksize) != 0){
  1590. print("on-disk arena header incorrect\n");
  1591. showdiffs(dbuf, p, arena.blocksize,
  1592. arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
  1593. }
  1594. memmove(p, dbuf, arena.blocksize);
  1595. memset(dbuf, 0, sizeof dbuf);
  1596. packarena(&arena, dbuf);
  1597. if(arena.diskstats.sealed)
  1598. scorecp(dbuf+arena.blocksize-VtScoreSize, score);
  1599. p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
  1600. memset(&oarena, 0, sizeof oarena);
  1601. unpackarena(&oarena, p);
  1602. if(arena.version == ArenaVersion4){
  1603. sz = ArenaSize4;
  1604. fmt = tailinfo4;
  1605. fmta = tailinfo4a;
  1606. }else{
  1607. sz = ArenaSize5;
  1608. fmt = tailinfo5;
  1609. fmta = tailinfo5a;
  1610. }
  1611. if(p[sz] == 1){
  1612. fmt = fmta;
  1613. if(oarena.diskstats.sealed){
  1614. /*
  1615. * some arenas were sealed with the extension
  1616. * before we adopted the convention that if it didn't
  1617. * add new information it gets dropped.
  1618. */
  1619. _packarena(&arena, dbuf, 1);
  1620. }
  1621. }
  1622. if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
  1623. print("on-disk arena tail incorrect\n");
  1624. showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
  1625. }
  1626. if(arena.diskstats.sealed){
  1627. if(oarena.diskstats.sealed)
  1628. if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
  1629. print("on-disk arena seal score incorrect\n");
  1630. print("\tcorrect=%V\n", oldscore);
  1631. print("\t disk=%V\n", p+arena.blocksize-VtScoreSize);
  1632. }
  1633. if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
  1634. print("%ssealing arena%s: %V\n",
  1635. oarena.diskstats.sealed ? "re" : "",
  1636. scorecmp(oldscore, score) == 0 ?
  1637. "" : " after changes", score);
  1638. }
  1639. }
  1640. memmove(p, dbuf, arena.blocksize);
  1641. pageout();
  1642. }
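/*
 * Rebuild the arena map by reading the arena head at each arenasize boundary.
 */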
  1643. AMapN*
  1644. buildamap(void)
  1645. {
  1646. uchar *p;
  1647. vlong o;
  1648. ArenaHead h;
  1649. AMapN *an;
  1650. AMap *m;
  1651. an = vtmallocz(sizeof *an);
  1652. for(o=ap.arenabase; o<partend; o+=arenasize){
  1653. p = pagein(o, Block);
  1654. if(unpackarenahead(&h, p) >= 0){
  1655. an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
  1656. m = &an->map[an->n++];
  1657. m->start = o;
  1658. m->stop = o+h.size;
  1659. strcpy(m->name, h.name);
  1660. }
  1661. }
  1662. return an;
  1663. }
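/*
 * Regenerate the arena partition map and compare it with the copy on disk,
 * rewriting the disk copy (with -f) if they differ.
 */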
  1664. void
  1665. checkmap(void)
  1666. {
  1667. char *s;
  1668. uchar *p;
  1669. int i, len;
  1670. AMapN *an;
  1671. Fmt fmt;
  1672. an = buildamap();
  1673. fmtstrinit(&fmt);
  1674. fmtprint(&fmt, "%ud\n", an->n);
  1675. for(i=0; i<an->n; i++)
  1676. fmtprint(&fmt, "%s\t%lld\t%lld\n",
  1677. an->map[i].name, an->map[i].start, an->map[i].stop);
  1678. s = fmtstrflush(&fmt);
  1679. len = strlen(s);
  1680. if(len > ap.tabsize){
  1681. print("arena partition map too long: need %z bytes have %z\n",
  1682. (vlong)len, (vlong)ap.tabsize);
  1683. len = ap.tabsize;
  1684. }
  1685. if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */
  1686. print("arena partition map *way* too long\n");
  1687. return;
  1688. }
  1689. p = pagein(ap.tabbase, ap.tabsize);
  1690. if(memcmp(p, s, len) != 0){
  1691. print("arena partition map incorrect; rewriting.\n");
  1692. memmove(p, s, len);
  1693. }
  1694. pageout();
  1695. }
  1696. int mainstacksize = 512*1024;
  1697. void
  1698. threadmain(int argc, char **argv)
  1699. {
  1700. int mode;
  1701. mode = OREAD;
  1702. readonly = 1;
  1703. ARGBEGIN{
  1704. case 'U':
  1705. unseal = 1;
  1706. break;
  1707. case 'a':
  1708. arenasize = unittoull(EARGF(usage()));
  1709. break;
  1710. case 'b':
  1711. ap.blocksize = unittoull(EARGF(usage()));
  1712. break;
  1713. case 'f':
  1714. fix = 1;
  1715. mode = ORDWR;
  1716. readonly = 0;
  1717. break;
  1718. case 'n':
  1719. basename = EARGF(usage());
  1720. break;
  1721. case 'v':
  1722. verbose++;
  1723. break;
  1724. case 'x':
  1725. dumpbase = EARGF(usage());
  1726. break;
  1727. default:
  1728. usage();
  1729. }ARGEND
  1730. if(argc != 1 && argc != 2)
  1731. usage();
  1732. file = argv[0];
  1733. ventifmtinstall();
  1734. fmtinstall('z', zfmt);
  1735. fmtinstall('t', tfmt);
  1736. quotefmtinstall();
  1737. part = initpart(file, mode|ODIRECT);
  1738. if(part == nil)
  1739. sysfatal("can't open %s: %r", file);
  1740. partend = part->size;
  1741. if(isonearena()){
  1742. checkarena(0, -1);
  1743. threadexitsall(nil);
  1744. }
  1745. checkarenas(argc > 1 ? argv[1] : nil);
  1746. checkmap();
  1747. threadexitsall(nil);
  1748. }