/*
 * dat.h -- shared type, constant, and structure declarations
 * for the venti archival storage system
 */
  /*
   * forward declarations: give every structure tag a typedef name up front
   * so the definitions below (and other headers) can refer to each other
   * through pointers regardless of declaration order.
   */
  typedef struct Config       Config;
  typedef struct AMap         AMap;
  typedef struct AMapN        AMapN;
  typedef struct Arena        Arena;
  typedef struct ArenaHead    ArenaHead;
  typedef struct ArenaPart    ArenaPart;
  typedef struct CIBlock      CIBlock;
  typedef struct Clump        Clump;
  typedef struct ClumpInfo    ClumpInfo;
  typedef struct IAddr        IAddr;
  typedef struct IBucket      IBucket;
  typedef struct IEStream     IEStream;
  typedef struct IEntry       IEntry;
  typedef struct IFile        IFile;
  typedef struct ISect        ISect;
  typedef struct Index        Index;
  typedef struct Lump         Lump;
  typedef struct DBlock       DBlock;
  typedef struct Part         Part;
  typedef struct Stats        Stats;
  typedef struct ZBlock       ZBlock;

  /*
   * NOTE(review): compiler-specific directive; presumably tells the Plan 9
   * compiler that IEStream is intentionally left incomplete here -- confirm
   * against the compiler documentation.
   */
  #pragma incomplete IEStream
  /*
   * all-ones values of each unsigned width.
   * the outer cast truncates the complement back to the named type,
   * which matters for TWID8 because ~ operates on the promoted int.
   */
  #define TWID32  ((u32int)~(u32int)0)
  #define TWID64  ((u64int)~(u64int)0)
  #define TWID8   ((u8int)~(u8int)0)
  26. enum
  27. {
  28. ABlockLog = 9, /* log2(512), the quantum for reading arenas */
  29. ANameSize = 64,
  30. MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
  31. MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
  32. PartBlank = 256*1024, /* untouched section at beginning of partition */
  33. HeadSize = 512, /* size of a header after PartBlank */
  34. MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
  35. IndexBase = 1024*1024, /* initial address to use in an index */
  36. MaxIo = 64*1024, /* max size of a single read or write operation */
  37. ICacheBits = 16, /* default bits for indexing icache */
  38. ICacheDepth = 4, /* default depth of an icache hash chain */
  39. MaxAMap = 2*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
  40. /*
  41. * return codes from syncArena
  42. */
  43. SyncDataErr = 1 << 0, /* problem reading the clump data */
  44. SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
  45. SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
  46. SyncFixErr = 1 << 3, /* error writing fixed data */
  47. SyncHeader = 1 << 4, /* altered header fields */
  48. /*
  49. * error severity
  50. */
  51. EOk = 0, /* error expected in normal operation */
  52. EStrange, /* strange error that should be logged */
  53. ECorrupt, /* corrupted data found in arenas */
  54. EICorrupt, /* corrupted data found in index */
  55. EAdmin, /* should be brought to administrators' attention */
  56. ECrash, /* really bad internal error */
  57. EBug, /* a limitation which should be fixed */
  58. EInconsist, /* inconsistencies between index and arena */
  59. EMax,
  60. /*
  61. * internal disk formats for the venti archival storage system
  62. */
  63. /*
  64. * magic numbers on disk
  65. */
  66. ClumpMagic = 0xd15cb10c, /* clump header */
  67. ClumpFreeMagic = 0, /* free clump; terminates active clump log */
  68. ArenaPartMagic = 0xa9e4a5e7, /* arena partition header */
  69. ArenaMagic = 0xf2a14ead, /* arena trailer */
  70. ArenaHeadMagic = 0xd15c4ead, /* arena header */
  71. ISectMagic = 0xd15c5ec7, /* index header */
  72. ArenaPartVersion = 3,
  73. ArenaVersion = 4,
  74. IndexVersion = 1,
  75. ISectVersion = 1,
  76. /*
  77. * encodings of clumps on disk
  78. */
  79. ClumpEErr = 0, /* can't happen */
  80. ClumpENone, /* plain */
  81. ClumpECompress, /* compressed */
  82. ClumpEMax,
  83. /*
  84. * marker for corrupted data on disk
  85. */
  86. VtTypeCorrupt = VtMaxType,
  87. /*
  88. * sizes in bytes on disk
  89. */
  90. U8Size = 1,
  91. U16Size = 2,
  92. U32Size = 4,
  93. U64Size = 8,
  94. ArenaPartSize = 4 * U32Size,
  95. ArenaSize = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
  96. ArenaHeadSize = U64Size + 3 * U32Size + ANameSize,
  97. ISectSize = 7 * U32Size + 2 * ANameSize,
  98. ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
  99. ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
  100. IBucketSize = U32Size + U16Size,
  101. IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
  102. IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size,
  103. MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
  104. VentiZZZZZZZZ
  105. };
  106. /*
  107. * results of parsing and initializing a config file
  108. */
  109. struct Config
  110. {
  111. char *index; /* name of the index to initialize */
  112. int naparts; /* arena partitions initialized */
  113. ArenaPart **aparts;
  114. int nsects; /* index sections initialized */
  115. ISect **sects;
  116. u32int bcmem;
  117. u32int mem;
  118. u32int icmem;
  119. int queueWrites;
  120. char *haddr;
  121. char *vaddr;
  122. };
  123. /*
  124. * a Part is the low level interface to files or disks.
  125. * there are two main types of partitions
  126. * arena partitions, which some number of arenas, each in a sub-partition.
  127. * index partition, which only have one subpartition.
  128. */
  129. struct Part
  130. {
  131. int fd; /* rock for accessing the disk */
  132. u64int size; /* size of the partiton */
  133. u32int blockSize; /* block size for reads and writes */
  134. char *name;
  135. };
  136. /*
  137. * a cached block from the partition
  138. * yuck -- most of this is internal structure for the cache
  139. * all other routines should only use data
  140. */
  141. struct DBlock
  142. {
  143. u8int *data;
  144. Part *part; /* partition in which cached */
  145. u64int addr; /* base address on the partition */
  146. u16int size; /* amount of data available, not amount allocated; should go away */
  147. DBlock *next; /* doubly linked hash chains */
  148. DBlock *prev;
  149. u32int heap; /* index in heap table */
  150. u32int used; /* last reference times */
  151. u32int used2;
  152. u32int ref; /* reference count */
  153. VtLock *lock; /* for access to data only */
  154. };
  155. /*
  156. * a cached block from the partition
  157. * yuck -- most of this is internal structure for the cache
  158. * all other routines should only use data
  159. * double yuck -- this is mostly the same as a DBlock
  160. */
  161. struct Lump
  162. {
  163. Packet *data;
  164. Part *part; /* partition in which cached */
  165. u8int score[VtScoreSize]; /* score of packet */
  166. u8int type; /* type of packet */
  167. u16int size; /* amount of data allocated to hold packet */
  168. Lump *next; /* doubly linked hash chains */
  169. Lump *prev;
  170. u32int heap; /* index in heap table */
  171. u32int used; /* last reference times */
  172. u32int used2;
  173. u32int ref; /* reference count */
  174. VtLock *lock; /* for access to data only */
  175. };
  176. /*
  177. * mapping between names and address ranges
  178. */
  179. struct AMap
  180. {
  181. u64int start;
  182. u64int stop;
  183. char name[ANameSize];
  184. };
  185. /*
  186. * an AMap along with a length
  187. */
  188. struct AMapN
  189. {
  190. int n;
  191. AMap *map;
  192. };
  193. /*
  194. * an ArenaPart is a partition made up of Arenas
  195. * it exists because most os's don't support many partitions,
  196. * and we want to have many different Arenas
  197. */
  198. struct ArenaPart
  199. {
  200. Part *part;
  201. u64int size; /* size of underlying partition, rounded down to blocks */
  202. Arena **arenas;
  203. u32int tabBase; /* base address of arena table on disk */
  204. u32int tabSize; /* max. bytes in arena table */
  205. /*
  206. * fields stored on disk
  207. */
  208. u32int version;
  209. u32int blockSize; /* "optimal" block size for reads and writes */
  210. u32int arenaBase; /* base address of first arena */
  211. /*
  212. * stored in the arena mapping table on disk
  213. */
  214. AMap *map;
  215. int narenas;
  216. };
  217. /*
  218. * info about one block in the clump info cache
  219. */
  220. struct CIBlock
  221. {
  222. u32int block; /* blocks in the directory */
  223. int offset; /* offsets of one clump in the data */
  224. DBlock *data;
  225. };
  226. /*
  227. * an Arena is a log of Clumps, preceeded by an ArenaHeader,
  228. * and followed by a Arena, each in one disk block.
  229. * struct on disk is not always up to date, but should be self-consistent.
  230. * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found.
  231. * <struct name="Arena" type="Arena *">
  232. * <field name="name" val="s->name" type="AName"/>
  233. * <field name="version" val="s->version" type="U32int"/>
  234. * <field name="partition" val="s->part->name" type="AName"/>
  235. * <field name="blockSize" val="s->blockSize" type="U32int"/>
  236. * <field name="start" val="s->base" type="U64int"/>
  237. * <field name="stop" val="s->base+2*s->blockSize" type="U64int"/>
  238. * <field name="created" val="s->ctime" type="U32int"/>
  239. * <field name="modified" val="s->wtime" type="U32int"/>
  240. * <field name="sealed" val="s->sealed" type="Sealed"/>
  241. * <field name="score" val="s->score" type="Score"/>
  242. * <field name="clumps" val="s->clumps" type="U32int"/>
  243. * <field name="compressedClumps" val="s->cclumps" type="U32int"/>
  244. * <field name="data" val="s->uncsize" type="U64int"/>
  245. * <field name="compressedData" val="s->used - s->clumps * ClumpSize" type="U64int"/>
  246. * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
  247. * </struct>
  248. */
  249. struct Arena
  250. {
  251. VtLock *lock; /* lock for arena fields, writing to disk */
  252. Part *part; /* partition in which arena lives */
  253. int blockSize; /* size of block to read or write */
  254. u64int base; /* base address on disk */
  255. u64int size; /* total space in the arena */
  256. u64int limit; /* storage limit for clumps */
  257. u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
  258. int clumpMax; /* ClumpInfos per block */
  259. CIBlock cib; /* dirty clump directory block */
  260. /*
  261. * fields stored on disk
  262. */
  263. u32int version;
  264. char name[ANameSize]; /* text label */
  265. u32int clumps; /* number of allocated clumps */
  266. u32int cclumps; /* clumps which are compressed; informational only */
  267. u32int ctime; /* first time a block was written */
  268. u32int wtime; /* last time a block was written */
  269. u64int used; /* number of bytes currently used */
  270. u64int uncsize; /* total of all clumps's uncsize; informational only */
  271. u8int sealed; /* arena all filled up? */
  272. };
  273. /*
  274. * redundant storage of some fields at the beginning of each arena
  275. */
  276. struct ArenaHead
  277. {
  278. u32int version;
  279. char name[ANameSize];
  280. u32int blockSize;
  281. u64int size;
  282. };
  283. /*
  284. * most interesting meta information for a clump.
  285. * stored in each clump's header and in the Arena's directory,
  286. * stored in reverse order just prior to the arena trailer
  287. */
  288. struct ClumpInfo
  289. {
  290. u8int type;
  291. u16int size; /* size of disk data, not including header */
  292. u16int uncsize; /* size of uncompressed data */
  293. u8int score[VtScoreSize]; /* score of the uncompressed data only */
  294. };
  295. /*
  296. * header for an immutable clump of data
  297. */
  298. struct Clump
  299. {
  300. ClumpInfo info;
  301. u8int encoding;
  302. u32int creator; /* initial client which wrote the block */
  303. u32int time; /* creation at gmt seconds since 1/1/1970 */
  304. };
  305. /*
  306. * index of all clumps according to their score
  307. * this is just a wrapper to tie together the index sections
  308. * <struct name="Index" type="Index *">
  309. * <field name="name" val="s->name" type="AName"/>
  310. * <field name="version" val="s->version" type="U32int"/>
  311. * <field name="blockSize" val="s->blockSize" type="U32int"/>
  312. * <field name="tabSize" val="s->tabSize" type="U32int"/>
  313. * <field name="buckets" val="s->buckets" type="U32int"/>
  314. * <field name="buckDiv" val="s->div" type="U32int"/>
  315. * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
  316. * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
  317. * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
  318. * </struct>
  319. * <struct name="Amap" type="AMap *">
  320. * <field name="name" val="s->name" type="AName"/>
  321. * <field name="start" val="s->start" type="U64int"/>
  322. * <field name="stop" val="s->stop" type="U64int"/>
  323. * </struct>
  324. */
  325. struct Index
  326. {
  327. u32int div; /* divisor for mapping score to bucket */
  328. u32int buckets; /* last bucket used in disk hash table */
  329. u32int blockSize;
  330. u32int tabSize; /* max. bytes in index config */
  331. int mapAlloc; /* first arena to check when adding a lump */
  332. Arena **arenas; /* arenas in the mapping */
  333. ISect **sects; /* sections which hold the buckets */
  334. /*
  335. * fields stored in config file
  336. */
  337. u32int version;
  338. char name[ANameSize]; /* text label */
  339. int nsects;
  340. AMap *smap; /* mapping of buckets to index sections */
  341. int narenas;
  342. AMap *amap; /* mapping from index addesses to arenas */
  343. };
  344. /*
  345. * one part of the bucket storage for an index.
  346. * the index blocks are sequentially allocated
  347. * across all of the sections.
  348. */
  349. struct ISect
  350. {
  351. Part *part;
  352. int blockLog; /* log2(blockSize) */
  353. int buckMax; /* max. entries in a index bucket */
  354. u32int tabBase; /* base address of index config table on disk */
  355. u32int tabSize; /* max. bytes in index config */
  356. /*
  357. * fields stored on disk
  358. */
  359. u32int version;
  360. char name[ANameSize]; /* text label */
  361. char index[ANameSize]; /* index owning the section */
  362. u32int blockSize; /* size of hash buckets in index */
  363. u32int blockBase; /* address of start of on disk index table */
  364. u32int blocks; /* total blocks on disk; some may be unused */
  365. u32int start; /* first bucket in this section */
  366. u32int stop; /* limit of buckets in this section */
  367. };
  368. /*
  369. * externally interesting part of an IEntry
  370. */
  371. struct IAddr
  372. {
  373. u64int addr;
  374. u16int size; /* uncompressed size */
  375. u8int type; /* type of block */
  376. u8int blocks; /* arena io quanta for Clump + data */
  377. };
  378. /*
  379. * entries in the index
  380. * kept in IBuckets in the disk index table,
  381. * cached in the memory ICache.
  382. */
  383. struct IEntry
  384. {
  385. u8int score[VtScoreSize];
  386. IEntry *next; /* next in hash chain */
  387. u32int wtime; /* last write time */
  388. u16int train; /* relative train containing the most recent ref; 0 if no ref, 1 if in same car */
  389. u8int rac; /* read ahead count */
  390. IAddr ia;
  391. };
  392. /*
  393. * buckets in the on disk index table
  394. */
  395. struct IBucket
  396. {
  397. u16int n; /* number of active indices */
  398. u32int next; /* overflow bucket */
  399. u8int *data;
  400. };
  401. /*
  402. * temporary buffers used by individual threads
  403. */
  404. struct ZBlock
  405. {
  406. u32int len;
  407. u8int *data;
  408. };
  409. /*
  410. * simple input buffer for a '\0' terminated text file
  411. */
  412. struct IFile
  413. {
  414. char *name; /* name of the file */
  415. ZBlock *b; /* entire contents of file */
  416. u32int pos; /* current position in the file */
  417. };
  418. /*
  419. * statistics about the operation of the server
  420. * mainly for performance monitoring and profiling.
  421. */
  422. struct Stats
  423. {
  424. VtLock *lock;
  425. long lumpWrites; /* protocol block writes */
  426. long lumpReads; /* protocol block reads */
  427. long lumpHit; /* lump cache hit */
  428. long lumpMiss; /* lump cache miss */
  429. long clumpWrites; /* clumps to disk */
  430. vlong clumpBWrites; /* clump data bytes to disk */
  431. vlong clumpBComp; /* clump bytes compressed */
  432. long clumpReads; /* clumps from disk */
  433. vlong clumpBReads; /* clump data bytes from disk */
  434. vlong clumpBUncomp; /* clump bytes uncompressed */
  435. long ciWrites; /* clump directory to disk */
  436. long ciReads; /* clump directory from disk */
  437. long indexWrites; /* index to disk */
  438. long indexReads; /* index from disk */
  439. long indexWReads; /* for writing a new entry */
  440. long indexAReads; /* for allocating an overflow block */
  441. long diskWrites; /* total disk writes */
  442. long diskReads; /* total disk reads */
  443. vlong diskBWrites; /* total disk bytes written */
  444. vlong diskBReads; /* total disk bytes read */
  445. long pcHit; /* partition cache hit */
  446. long pcMiss; /* partition cache miss */
  447. long pcReads; /* partition cache reads from disk */
  448. vlong pcBReads; /* partition cache bytes read */
  449. long icInserts; /* stores into index cache */
  450. long icLookups; /* index cache lookups */
  451. long icHits; /* hits in the cache */
  452. long icFills; /* successful fills from index */
  453. };
  454. extern Index *mainIndex;
  455. extern u32int maxBlockSize; /* max. block size used by any partition */
  456. extern int paranoid; /* should verify hashes on disk read */
  457. extern int queueWrites; /* put all lump writes on a queue and finish later */
  458. extern int readonly; /* only allowed to read the disk data */
  459. extern Stats stats;
  460. extern u8int zeroScore[VtScoreSize];
  461. extern int chattyzero;