bflz.c 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. /*
  2. * Extraordinarily brute force Lempel & Ziv-like
  3. * compressor. The input file must fit in memory
  4. * during compression, and the output file will
  5. * be reconstructed in memory during decompression.
  6. * We search for large common sequences and use a
  7. * greedy algorithm to choose which sequence gets
  8. * compressed first.
  9. *
  10. * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
  11. *
  12. * Output format is a series of blocks followed by
  13. * a raw data section. Each block begins with a 32-bit big-endian
  14. * number. The top bit is type and the next 31 bits
  15. * are uncompressed size. Type is one of
  16. * 0 - use raw data for this length
  17. * 1 - a 32-bit offset follows
  18. * After the blocks come the raw data. (The end of the blocks can be
  19. * noted by summing block lengths until you reach the file length.)
  20. */
  21. #include <u.h>
  22. #include <libc.h>
  23. #include <bio.h>
  24. #define malloc sbrk
  25. int minrun = 16;
  26. int win = 16;
  27. ulong outn;
  28. int verbose;
  29. int mindist;
  30. enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
  31. enum { NOFF = 3 };
  32. Biobuf bout;
  33. ulong length;
  34. uchar *data;
  35. ulong sum32(ulong, void*, long);
  36. uchar *odat;
  37. int nodat;
  38. int nraw;
  39. int rawstart;
  40. int acct;
  41. int zlength;
  42. int maxchain;
  43. int maxrle[256];
  44. int nnew;
  45. typedef struct Node Node;
  46. struct Node {
  47. Node *link;
  48. ulong key;
  49. ulong offset[NOFF];
  50. };
  51. Node *nodepool;
  52. int nnodepool;
  53. Node **hash;
  54. uint nhash;
  55. uint maxlen;
  56. uint maxsame;
  57. uint replacesame = 8*1024*1024;
  58. Node *freelist, **freeend;
  59. uint nalloc;
  60. Node*
  61. allocnode(void)
  62. {
  63. int i;
  64. Node *n;
  65. if(nnodepool == 0){
  66. nnodepool = 256*1024;
  67. nodepool = malloc(sizeof(Node)*nnodepool);
  68. }
  69. if(freelist){
  70. n = freelist;
  71. freelist = n->link;
  72. return n;
  73. }
  74. assert(nnodepool > 0);
  75. nalloc++;
  76. n = &nodepool[--nnodepool];
  77. for(i=0; i<NOFF; i++)
  78. n->offset[i] = -1;
  79. return n;
  80. }
  81. void
  82. freenode(Node *n)
  83. {
  84. if(freelist == nil)
  85. freelist = n;
  86. else
  87. *freeend = n;
  88. freeend = &n->link;
  89. n->link = nil;
  90. }
  91. Node**
  92. llookup(ulong key)
  93. {
  94. uint c;
  95. Node **l, **top, *n;
  96. if(nhash == 0){
  97. uint x;
  98. x = length/8;
  99. for(nhash=1; nhash<x; nhash<<=1)
  100. ;
  101. hash = sbrk(sizeof(Node*)*nhash);
  102. }
  103. top = &hash[key&(nhash-1)];
  104. c = 0;
  105. for(l=top; *l; l=&(*l)->link){
  106. c++;
  107. if((*l)->key == key){
  108. /* move to front */
  109. n = *l;
  110. *l = n->link;
  111. n->link = *top;
  112. *top = n;
  113. return top;
  114. }
  115. }
  116. if(c > maxlen)
  117. maxlen = c;
  118. return l;
  119. }
  120. Node*
  121. lookup(ulong key)
  122. {
  123. return *llookup(key);
  124. }
  125. void
  126. insertnode(ulong key, ulong offset)
  127. {
  128. int i;
  129. Node *n, **l;
  130. l = llookup(key);
  131. if(*l == nil){
  132. if(l==&hash[key&(nhash-1)])
  133. nnew++;
  134. *l = allocnode();
  135. (*l)->key = key;
  136. }
  137. n = *l;
  138. /* add or replace last */
  139. for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
  140. ;
  141. n->offset[i] = offset;
  142. }
  143. void
  144. Bputint(Biobufhdr *b, int n)
  145. {
  146. uchar p[4];
  147. p[0] = n>>24;
  148. p[1] = n>>16;
  149. p[2] = n>>8;
  150. p[3] = n;
  151. Bwrite(b, p, 4);
  152. }
  153. void
  154. flushraw(void)
  155. {
  156. if(nraw){
  157. if(verbose)
  158. fprint(2, "Raw %d+%d\n", rawstart, nraw);
  159. zlength += 4+nraw;
  160. Bputint(&bout, (1<<31)|nraw);
  161. memmove(odat+nodat, data+rawstart, nraw);
  162. nodat += nraw;
  163. nraw = 0;
  164. }
  165. }
  166. int
  167. rawbyte(int i)
  168. {
  169. assert(acct == i);
  170. if(nraw == 0)
  171. rawstart = i;
  172. acct++;
  173. nraw++;
  174. return 1;
  175. }
  176. int
  177. refblock(int i, int len, int off)
  178. {
  179. assert(acct == i);
  180. acct += len;
  181. if(nraw)
  182. flushraw();
  183. if(verbose)
  184. fprint(2, "Copy %d+%d from %d\n", i, len, off);
  185. Bputint(&bout, len);
  186. Bputint(&bout, off);
  187. zlength += 4+4;
  188. return len;
  189. }
  190. int
  191. cmprun(uchar *a, uchar *b, int len)
  192. {
  193. int i;
  194. if(a==b)
  195. return 0;
  196. for(i=0; i<len && a[i]==b[i]; i++)
  197. ;
  198. return i;
  199. }
  200. int
  201. countrle(uchar *a)
  202. {
  203. uchar a0;
  204. uchar *p;
  205. p = a;
  206. a0 = *p;
  207. while(*++p == a0)
  208. ;
  209. return p - a;
  210. }
  211. void
  212. compress(void)
  213. {
  214. int best, i, j, o, rle, run, maxrun, maxoff;
  215. ulong sum;
  216. Node *n;
  217. sum = 0;
  218. for(i=0; i<win && i<length; i++)
  219. sum = (sum*256+data[i])%Prime;
  220. for(i=0; i<length-win; ){
  221. maxrun = 0;
  222. maxoff = 0;
  223. if(verbose)
  224. fprint(2, "look %.6lux\n", sum);
  225. n = lookup(sum);
  226. if(n){
  227. best = -1;
  228. for(o=0; o<NOFF; o++){
  229. if(n->offset[o] == -1)
  230. break;
  231. run = cmprun(data+i, data+n->offset[o], length-i);
  232. if(run > maxrun && n->offset[o]+mindist < i){
  233. maxrun = run;
  234. maxoff = n->offset[o];
  235. best = o;
  236. }
  237. }
  238. if(best > 0){
  239. o = n->offset[best];
  240. for(j=best; j>0; j--)
  241. n->offset[j] = n->offset[j-1];
  242. n->offset[0] = o;
  243. }
  244. }
  245. if(maxrun >= minrun)
  246. j = i+refblock(i, maxrun, maxoff);
  247. else
  248. j = i+rawbyte(i);
  249. for(; i<j; i++){
  250. /* avoid huge chains from large runs of same byte */
  251. rle = countrle(data+i);
  252. if(rle<4)
  253. insertnode(sum, i);
  254. else if(rle>maxrle[data[i]]){
  255. maxrle[data[i]] = rle;
  256. insertnode(sum, i);
  257. }
  258. sum = (sum*256+data[i+win]) % Prime;
  259. sum = (sum + data[i]*outn) % Prime;
  260. }
  261. }
  262. /* could do better here */
  263. for(; i<length; i++)
  264. rawbyte(i);
  265. flushraw();
  266. }
  267. void
  268. usage(void)
  269. {
  270. fprint(2, "usage: bflz [-n winsize] [file]\n");
  271. exits("usage");
  272. }
  273. void
  274. main(int argc, char **argv)
  275. {
  276. int fd, i, n;
  277. char buf[10485760];
  278. ARGBEGIN{
  279. case 'd':
  280. verbose = 1;
  281. break;
  282. case 's':
  283. replacesame = atoi(EARGF(usage()));
  284. break;
  285. case 'm':
  286. mindist = atoi(EARGF(usage()));
  287. break;
  288. case 'n':
  289. win = atoi(EARGF(usage()));
  290. minrun = win;
  291. break;
  292. default:
  293. usage();
  294. }ARGEND
  295. switch(argc){
  296. default:
  297. usage();
  298. case 0:
  299. fd = 0;
  300. break;
  301. case 1:
  302. if((fd = open(argv[0], OREAD)) < 0)
  303. sysfatal("open %s: %r", argv[0]);
  304. break;
  305. }
  306. while((n = readn(fd, buf, sizeof buf)) > 0){
  307. data = realloc(data, length+n);
  308. if(data == nil)
  309. sysfatal("realloc: %r");
  310. memmove(data+length, buf, n);
  311. length += n;
  312. if(n < sizeof buf)
  313. break;
  314. }
  315. odat = malloc(length);
  316. if(odat == nil)
  317. sysfatal("malloc: %r");
  318. Binit(&bout, 1, OWRITE);
  319. Bprint(&bout, "BLZ\n");
  320. Bputint(&bout, length);
  321. outn = 1;
  322. for(i=0; i<win; i++)
  323. outn = (outn * 256) % Prime;
  324. if(verbose)
  325. fprint(2, "256^%d = %.6lux\n", win, outn);
  326. outn = Prime - outn;
  327. if(verbose)
  328. fprint(2, "outn = %.6lux\n", outn);
  329. compress();
  330. Bwrite(&bout, odat, nodat);
  331. Bterm(&bout);
  332. fprint(2, "brk %p\n", sbrk(1));
  333. fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
  334. exits(nil);
  335. }