bflz.c 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. /*
  2. * Extraordinarily brute force Lempel & Ziv-like
  3. * compressor. The input file must fit in memory
  4. * during compression, and the output file will
  5. * be reconstructed in memory during decompression.
  6. * We search for large common sequences and use a
  7. * greedy algorithm to choose which sequence gets
  8. * compressed first.
  9. *
  10. * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
  11. *
  12. * Output format is a series of blocks followed by
  13. * a raw data section. Each block begins with a 32-bit big-endian
  14. * number. The top bit is type and the next 31 bits
  15. * are uncompressed size. Type is one of
  16. * 0 - use raw data for this length
  17. * 1 - a 32-bit offset follows
  18. * After the blocks come the raw data. (The end of the blocks can be
  19. * noted by summing block lengths until you reach the file length.)
  20. */
  21. #include <u.h>
  22. #include <libc.h>
  23. #include <bio.h>
  24. #define malloc sbrk
  25. int minrun = 16;
  26. int win = 16;
  27. ulong outn;
  28. int verbose;
  29. int mindist;
  30. enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
  31. enum { NOFF = 3 };
  32. Biobuf bout;
  33. ulong length;
  34. uchar *data;
  35. ulong sum32(ulong, void*, long);
  36. uchar *odat;
  37. int nodat;
  38. int nraw;
  39. int rawstart;
  40. int acct;
  41. int zlength;
  42. int maxchain;
  43. int maxrle[256];
  44. int nnew;
  45. typedef struct Node Node;
  46. struct Node {
  47. Node *link;
  48. ulong key;
  49. ulong offset[NOFF];
  50. };
  51. Node *nodepool;
  52. int nnodepool;
  53. Node **hash;
  54. uint nhash;
  55. uint maxlen;
  56. uint maxsame;
  57. uint replacesame = 8*1024*1024;
  58. Node *freelist, **freeend;
  59. uint nalloc;
  60. Node*
  61. allocnode(void)
  62. {
  63. int i;
  64. Node *n;
  65. if(nnodepool == 0){
  66. nnodepool = 256*1024;
  67. nodepool = malloc(sizeof(Node)*nnodepool);
  68. }
  69. if(freelist){
  70. n = freelist;
  71. freelist = n->link;
  72. return n;
  73. }
  74. assert(nnodepool > 0);
  75. nalloc++;
  76. n = &nodepool[--nnodepool];
  77. for(i=0; i<NOFF; i++)
  78. n->offset[i] = -1;
  79. return n;
  80. }
  81. void
  82. freenode(Node *n)
  83. {
  84. if(freelist == nil)
  85. freelist = n;
  86. else
  87. *freeend = n;
  88. freeend = &n->link;
  89. n->link = nil;
  90. }
  91. Node**
  92. llookup(ulong key)
  93. {
  94. uint c;
  95. Node **l, **top, *n;
  96. if(nhash == 0){
  97. uint x;
  98. x = length/8;
  99. for(nhash=1; nhash<x; nhash<<=1)
  100. ;
  101. hash = sbrk(sizeof(Node*)*nhash);
  102. }
  103. top = &hash[key&(nhash-1)];
  104. c = 0;
  105. for(l=top; *l; l=&(*l)->link){
  106. c++;
  107. if((*l)->key == key){
  108. /* move to front */
  109. n = *l;
  110. *l = n->link;
  111. n->link = *top;
  112. *top = n;
  113. return top;
  114. }
  115. }
  116. if(c > maxlen)
  117. maxlen = c;
  118. return l;
  119. }
  120. Node*
  121. lookup(ulong key)
  122. {
  123. return *llookup(key);
  124. }
  125. void
  126. insertnode(ulong key, ulong offset)
  127. {
  128. int i;
  129. Node *n, **l;
  130. l = llookup(key);
  131. if(*l == nil){
  132. if(l==&hash[key&(nhash-1)])
  133. nnew++;
  134. *l = allocnode();
  135. (*l)->key = key;
  136. }
  137. n = *l;
  138. /* add or replace last */
  139. for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
  140. ;
  141. n->offset[i] = offset;
  142. }
  143. void
  144. Bputint(Biobufhdr *b, int n)
  145. {
  146. uchar p[4];
  147. p[0] = n>>24;
  148. p[1] = n>>16;
  149. p[2] = n>>8;
  150. p[3] = n;
  151. Bwrite(b, p, 4);
  152. }
  153. void
  154. flushraw(void)
  155. {
  156. if(nraw){
  157. if(verbose)
  158. fprint(2, "Raw %d+%d\n", rawstart, nraw);
  159. zlength += 4+nraw;
  160. Bputint(&bout, (1<<31)|nraw);
  161. memmove(odat+nodat, data+rawstart, nraw);
  162. nodat += nraw;
  163. nraw = 0;
  164. }
  165. }
  166. int
  167. rawbyte(int i)
  168. {
  169. assert(acct == i);
  170. if(nraw == 0)
  171. rawstart = i;
  172. acct++;
  173. nraw++;
  174. return 1;
  175. }
  176. int
  177. refblock(int i, int len, int off)
  178. {
  179. assert(acct == i);
  180. acct += len;
  181. if(nraw)
  182. flushraw();
  183. if(verbose)
  184. fprint(2, "Copy %d+%d from %d\n", i, len, off);
  185. Bputint(&bout, len);
  186. Bputint(&bout, off);
  187. zlength += 4+4;
  188. return len;
  189. }
  190. int
  191. cmprun(uchar *a, uchar *b, int len)
  192. {
  193. int i;
  194. if(a==b)
  195. return 0;
  196. for(i=0; i<len && a[i]==b[i]; i++)
  197. ;
  198. return i;
  199. }
  200. int
  201. countrle(uchar *a)
  202. {
  203. int i;
  204. for(i=0; a[i]==a[0]; i++)
  205. ;
  206. return i;
  207. }
  208. void
  209. compress(void)
  210. {
  211. int best, i, j, o, rle, run, maxrun, maxoff;
  212. ulong sum;
  213. Node *n;
  214. sum = 0;
  215. for(i=0; i<win && i<length; i++)
  216. sum = (sum*256+data[i])%Prime;
  217. for(i=0; i<length-win; ){
  218. maxrun = 0;
  219. maxoff = 0;
  220. if(verbose)
  221. fprint(2, "look %.6lux\n", sum);
  222. n = lookup(sum);
  223. if(n){
  224. best = -1;
  225. for(o=0; o<NOFF; o++){
  226. if(n->offset[o] == -1)
  227. break;
  228. run = cmprun(data+i, data+n->offset[o], length-i);
  229. if(run > maxrun && n->offset[o]+mindist < i){
  230. maxrun = run;
  231. maxoff = n->offset[o];
  232. best = o;
  233. }
  234. }
  235. if(best > 0){
  236. o = n->offset[best];
  237. for(j=best; j>0; j--)
  238. n->offset[j] = n->offset[j-1];
  239. n->offset[0] = o;
  240. }
  241. }
  242. if(maxrun >= minrun)
  243. j = i+refblock(i, maxrun, maxoff);
  244. else
  245. j = i+rawbyte(i);
  246. for(; i<j; i++){
  247. /* avoid huge chains from large runs of same byte */
  248. rle = countrle(data+i);
  249. if(rle<4)
  250. insertnode(sum, i);
  251. else if(rle>maxrle[data[i]]){
  252. maxrle[data[i]] = rle;
  253. insertnode(sum, i);
  254. }
  255. sum = (sum*256+data[i+win]) % Prime;
  256. sum = (sum + data[i]*outn) % Prime;
  257. }
  258. }
  259. /* could do better here */
  260. for(; i<length; i++)
  261. rawbyte(i);
  262. flushraw();
  263. }
  264. void
  265. usage(void)
  266. {
  267. fprint(2, "usage: bflz [-n winsize] [file]\n");
  268. exits("usage");
  269. }
  270. void
  271. main(int argc, char **argv)
  272. {
  273. int fd, i, n;
  274. char buf[10485760];
  275. ARGBEGIN{
  276. case 'd':
  277. verbose = 1;
  278. break;
  279. case 's':
  280. replacesame = atoi(EARGF(usage()));
  281. break;
  282. case 'm':
  283. mindist = atoi(EARGF(usage()));
  284. break;
  285. case 'n':
  286. win = atoi(EARGF(usage()));
  287. minrun = win;
  288. break;
  289. default:
  290. usage();
  291. }ARGEND
  292. switch(argc){
  293. default:
  294. usage();
  295. case 0:
  296. fd = 0;
  297. break;
  298. case 1:
  299. if((fd = open(argv[0], OREAD)) < 0)
  300. sysfatal("open %s: %r", argv[0]);
  301. break;
  302. }
  303. while((n = readn(fd, buf, sizeof buf)) > 0){
  304. data = realloc(data, length+n);
  305. if(data == nil)
  306. sysfatal("realloc: %r");
  307. memmove(data+length, buf, n);
  308. length += n;
  309. if(n < sizeof buf)
  310. break;
  311. }
  312. odat = malloc(length);
  313. if(odat == nil)
  314. sysfatal("malloc: %r");
  315. Binit(&bout, 1, OWRITE);
  316. Bprint(&bout, "BLZ\n");
  317. Bputint(&bout, length);
  318. outn = 1;
  319. for(i=0; i<win; i++)
  320. outn = (outn * 256) % Prime;
  321. if(verbose)
  322. fprint(2, "256^%d = %.6lux\n", win, outn);
  323. outn = Prime - outn;
  324. if(verbose)
  325. fprint(2, "outn = %.6lux\n", outn);
  326. compress();
  327. Bwrite(&bout, odat, nodat);
  328. Bterm(&bout);
  329. fprint(2, "brk %p\n", sbrk(1));
  330. fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
  331. exits(nil);
  332. }