bflz.c 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. /*
  10. * Extraordinarily brute force Lempel & Ziv-like
  11. * compressor. The input file must fit in memory
  12. * during compression, and the output file will
  13. * be reconstructed in memory during decompression.
  14. * We search for large common sequences and use a
  15. * greedy algorithm to choose which sequence gets
  16. * compressed first.
  17. *
  18. * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
  19. *
  20. * Output format is a series of blocks followed by
  21. * a raw data section. Each block begins with a 32-bit big-endian
  22. * number. The top bit is type and the next 31 bits
  23. * are uncompressed size. Type is one of
  24. * 0 - use raw data for this length
  25. * 1 - a 32-bit offset follows
  26. * After the blocks come the raw data. (The end of the blocks can be
  27. * noted by summing block lengths until you reach the file length.)
  28. */
  29. #include <u.h>
  30. #include <libc.h>
  31. #include <bio.h>
  32. #define malloc sbrk
  33. int minrun = 16;
  34. int win = 16;
  35. uint32_t outn;
  36. int verbose;
  37. int mindist;
  38. enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
  39. enum { NOFF = 3 };
  40. Biobuf bout;
  41. uint32_t length;
  42. uchar *data;
  43. uint32_t sum32(uint32_t, void*, long);
  44. uchar *odat;
  45. int nodat;
  46. int nraw;
  47. int rawstart;
  48. int acct;
  49. int zlength;
  50. int maxchain;
  51. int maxrle[256];
  52. int nnew;
  53. typedef struct Node Node;
  54. struct Node {
  55. Node *link;
  56. uint32_t key;
  57. uint32_t offset[NOFF];
  58. };
  59. Node *nodepool;
  60. int nnodepool;
  61. Node **hash;
  62. uint nhash;
  63. uint maxlen;
  64. uint maxsame;
  65. uint replacesame = 8*1024*1024;
  66. Node *freelist, **freeend;
  67. uint nalloc;
  68. Node*
  69. allocnode(void)
  70. {
  71. int i;
  72. Node *n;
  73. if(nnodepool == 0){
  74. nnodepool = 256*1024;
  75. nodepool = malloc(sizeof(Node)*nnodepool);
  76. }
  77. if(freelist){
  78. n = freelist;
  79. freelist = n->link;
  80. return n;
  81. }
  82. assert(nnodepool > 0);
  83. nalloc++;
  84. n = &nodepool[--nnodepool];
  85. for(i=0; i<NOFF; i++)
  86. n->offset[i] = -1;
  87. return n;
  88. }
  89. void
  90. freenode(Node *n)
  91. {
  92. if(freelist == nil)
  93. freelist = n;
  94. else
  95. *freeend = n;
  96. freeend = &n->link;
  97. n->link = nil;
  98. }
  99. Node**
  100. llookup(uint32_t key)
  101. {
  102. uint c;
  103. Node **l, **top, *n;
  104. if(nhash == 0){
  105. uint x;
  106. x = length/8;
  107. for(nhash=1; nhash<x; nhash<<=1)
  108. ;
  109. hash = sbrk(sizeof(Node*)*nhash);
  110. }
  111. top = &hash[key&(nhash-1)];
  112. c = 0;
  113. for(l=top; *l; l=&(*l)->link){
  114. c++;
  115. if((*l)->key == key){
  116. /* move to front */
  117. n = *l;
  118. *l = n->link;
  119. n->link = *top;
  120. *top = n;
  121. return top;
  122. }
  123. }
  124. if(c > maxlen)
  125. maxlen = c;
  126. return l;
  127. }
  128. Node*
  129. lookup(uint32_t key)
  130. {
  131. return *llookup(key);
  132. }
  133. void
  134. insertnode(uint32_t key, uint32_t offset)
  135. {
  136. int i;
  137. Node *n, **l;
  138. l = llookup(key);
  139. if(*l == nil){
  140. if(l==&hash[key&(nhash-1)])
  141. nnew++;
  142. *l = allocnode();
  143. (*l)->key = key;
  144. }
  145. n = *l;
  146. /* add or replace last */
  147. for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
  148. ;
  149. n->offset[i] = offset;
  150. }
  151. void
  152. Bputint(Biobufhdr *b, int n)
  153. {
  154. uchar p[4];
  155. p[0] = n>>24;
  156. p[1] = n>>16;
  157. p[2] = n>>8;
  158. p[3] = n;
  159. Bwrite(b, p, 4);
  160. }
  161. void
  162. flushraw(void)
  163. {
  164. if(nraw){
  165. if(verbose)
  166. fprint(2, "Raw %d+%d\n", rawstart, nraw);
  167. zlength += 4+nraw;
  168. Bputint(&bout, (1<<31)|nraw);
  169. memmove(odat+nodat, data+rawstart, nraw);
  170. nodat += nraw;
  171. nraw = 0;
  172. }
  173. }
  174. int
  175. rawbyte(int i)
  176. {
  177. assert(acct == i);
  178. if(nraw == 0)
  179. rawstart = i;
  180. acct++;
  181. nraw++;
  182. return 1;
  183. }
  184. int
  185. refblock(int i, int len, int off)
  186. {
  187. assert(acct == i);
  188. acct += len;
  189. if(nraw)
  190. flushraw();
  191. if(verbose)
  192. fprint(2, "Copy %d+%d from %d\n", i, len, off);
  193. Bputint(&bout, len);
  194. Bputint(&bout, off);
  195. zlength += 4+4;
  196. return len;
  197. }
  198. int
  199. cmprun(uchar *a, uchar *b, int len)
  200. {
  201. int i;
  202. if(a==b)
  203. return 0;
  204. for(i=0; i<len && a[i]==b[i]; i++)
  205. ;
  206. return i;
  207. }
  208. int
  209. countrle(uchar *a)
  210. {
  211. uchar a0;
  212. uchar *p;
  213. p = a;
  214. a0 = *p;
  215. while(*++p == a0)
  216. ;
  217. return p - a;
  218. }
  219. void
  220. compress(void)
  221. {
  222. int best, i, j, o, rle, run, maxrun, maxoff;
  223. uint32_t sum;
  224. Node *n;
  225. sum = 0;
  226. for(i=0; i<win && i<length; i++)
  227. sum = (sum*256+data[i])%Prime;
  228. for(i=0; i<length-win; ){
  229. maxrun = 0;
  230. maxoff = 0;
  231. if(verbose)
  232. fprint(2, "look %.6lux\n", sum);
  233. n = lookup(sum);
  234. if(n){
  235. best = -1;
  236. for(o=0; o<NOFF; o++){
  237. if(n->offset[o] == -1)
  238. break;
  239. run = cmprun(data+i, data+n->offset[o], length-i);
  240. if(run > maxrun && n->offset[o]+mindist < i){
  241. maxrun = run;
  242. maxoff = n->offset[o];
  243. best = o;
  244. }
  245. }
  246. if(best > 0){
  247. o = n->offset[best];
  248. for(j=best; j>0; j--)
  249. n->offset[j] = n->offset[j-1];
  250. n->offset[0] = o;
  251. }
  252. }
  253. if(maxrun >= minrun)
  254. j = i+refblock(i, maxrun, maxoff);
  255. else
  256. j = i+rawbyte(i);
  257. for(; i<j; i++){
  258. /* avoid huge chains from large runs of same byte */
  259. rle = countrle(data+i);
  260. if(rle<4)
  261. insertnode(sum, i);
  262. else if(rle>maxrle[data[i]]){
  263. maxrle[data[i]] = rle;
  264. insertnode(sum, i);
  265. }
  266. sum = (sum*256+data[i+win]) % Prime;
  267. sum = (sum + data[i]*outn) % Prime;
  268. }
  269. }
  270. /* could do better here */
  271. for(; i<length; i++)
  272. rawbyte(i);
  273. flushraw();
  274. }
  275. void
  276. usage(void)
  277. {
  278. fprint(2, "usage: bflz [-n winsize] [file]\n");
  279. exits("usage");
  280. }
  281. void
  282. main(int argc, char **argv)
  283. {
  284. int fd, i, n;
  285. char buf[10485760];
  286. ARGBEGIN{
  287. case 'd':
  288. verbose = 1;
  289. break;
  290. case 's':
  291. replacesame = atoi(EARGF(usage()));
  292. break;
  293. case 'm':
  294. mindist = atoi(EARGF(usage()));
  295. break;
  296. case 'n':
  297. win = atoi(EARGF(usage()));
  298. minrun = win;
  299. break;
  300. default:
  301. usage();
  302. }ARGEND
  303. switch(argc){
  304. default:
  305. usage();
  306. case 0:
  307. fd = 0;
  308. break;
  309. case 1:
  310. if((fd = open(argv[0], OREAD)) < 0)
  311. sysfatal("open %s: %r", argv[0]);
  312. break;
  313. }
  314. while((n = readn(fd, buf, sizeof buf)) > 0){
  315. data = realloc(data, length+n);
  316. if(data == nil)
  317. sysfatal("realloc: %r");
  318. memmove(data+length, buf, n);
  319. length += n;
  320. if(n < sizeof buf)
  321. break;
  322. }
  323. odat = malloc(length);
  324. if(odat == nil)
  325. sysfatal("malloc: %r");
  326. Binit(&bout, 1, OWRITE);
  327. Bprint(&bout, "BLZ\n");
  328. Bputint(&bout, length);
  329. outn = 1;
  330. for(i=0; i<win; i++)
  331. outn = (outn * 256) % Prime;
  332. if(verbose)
  333. fprint(2, "256^%d = %.6lux\n", win, outn);
  334. outn = Prime - outn;
  335. if(verbose)
  336. fprint(2, "outn = %.6lux\n", outn);
  337. compress();
  338. Bwrite(&bout, odat, nodat);
  339. Bterm(&bout);
  340. fprint(2, "brk %p\n", sbrk(1));
  341. fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
  342. exits(nil);
  343. }