123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377 |
- /*
- * Extraordinarily brute force Lempel & Ziv-like
- * compressor. The input file must fit in memory
- * during compression, and the output file will
- * be reconstructed in memory during decompression.
- * We search for large common sequences and use a
- * greedy algorithm to choose which sequence gets
- * compressed first.
- *
- * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
- *
- * Output format is a series of blocks followed by
- * a raw data section. Each block begins with a 32-bit big-endian
- * number. The top bit is type and the next 31 bits
- * are uncompressed size. Type is one of
- * 0 - use raw data for this length
- * 1 - a 32-bit offset follows
- * After the blocks come the raw data. (The end of the blocks can be
- * noted by summing block lengths until you reach the file length.)
- */
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #define malloc sbrk
- int minrun = 16;
- int win = 16;
- ulong outn;
- int verbose;
- int mindist;
- enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
- enum { NOFF = 3 };
- Biobuf bout;
- ulong length;
- uchar *data;
- ulong sum32(ulong, void*, long);
- uchar *odat;
- int nodat;
- int nraw;
- int rawstart;
- int acct;
- int zlength;
- int maxchain;
- int maxrle[256];
- int nnew;
- typedef struct Node Node;
- struct Node {
- Node *link;
- ulong key;
- ulong offset[NOFF];
- };
- Node *nodepool;
- int nnodepool;
- Node **hash;
- uint nhash;
- uint maxlen;
- uint maxsame;
- uint replacesame = 8*1024*1024;
- Node *freelist, **freeend;
- uint nalloc;
- Node*
- allocnode(void)
- {
- int i;
- Node *n;
- if(nnodepool == 0){
- nnodepool = 256*1024;
- nodepool = malloc(sizeof(Node)*nnodepool);
- }
- if(freelist){
- n = freelist;
- freelist = n->link;
- return n;
- }
- assert(nnodepool > 0);
- nalloc++;
- n = &nodepool[--nnodepool];
- for(i=0; i<NOFF; i++)
- n->offset[i] = -1;
- return n;
- }
- void
- freenode(Node *n)
- {
- if(freelist == nil)
- freelist = n;
- else
- *freeend = n;
- freeend = &n->link;
- n->link = nil;
- }
- Node**
- llookup(ulong key)
- {
- uint c;
- Node **l, **top, *n;
-
- if(nhash == 0){
- uint x;
- x = length/8;
- for(nhash=1; nhash<x; nhash<<=1)
- ;
- hash = sbrk(sizeof(Node*)*nhash);
- }
- top = &hash[key&(nhash-1)];
- c = 0;
- for(l=top; *l; l=&(*l)->link){
- c++;
- if((*l)->key == key){
- /* move to front */
- n = *l;
- *l = n->link;
- n->link = *top;
- *top = n;
- return top;
- }
- }
- if(c > maxlen)
- maxlen = c;
- return l;
- }
- Node*
- lookup(ulong key)
- {
- return *llookup(key);
- }
- void
- insertnode(ulong key, ulong offset)
- {
- int i;
- Node *n, **l;
- l = llookup(key);
- if(*l == nil){
- if(l==&hash[key&(nhash-1)])
- nnew++;
- *l = allocnode();
- (*l)->key = key;
- }
- n = *l;
- /* add or replace last */
- for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
- ;
- n->offset[i] = offset;
- }
- void
- Bputint(Biobufhdr *b, int n)
- {
- uchar p[4];
- p[0] = n>>24;
- p[1] = n>>16;
- p[2] = n>>8;
- p[3] = n;
- Bwrite(b, p, 4);
- }
- void
- flushraw(void)
- {
- if(nraw){
- if(verbose)
- fprint(2, "Raw %d+%d\n", rawstart, nraw);
- zlength += 4+nraw;
- Bputint(&bout, (1<<31)|nraw);
- memmove(odat+nodat, data+rawstart, nraw);
- nodat += nraw;
- nraw = 0;
- }
- }
- int
- rawbyte(int i)
- {
- assert(acct == i);
- if(nraw == 0)
- rawstart = i;
- acct++;
- nraw++;
- return 1;
- }
- int
- refblock(int i, int len, int off)
- {
- assert(acct == i);
- acct += len;
- if(nraw)
- flushraw();
- if(verbose)
- fprint(2, "Copy %d+%d from %d\n", i, len, off);
- Bputint(&bout, len);
- Bputint(&bout, off);
- zlength += 4+4;
- return len;
- }
- int
- cmprun(uchar *a, uchar *b, int len)
- {
- int i;
- if(a==b)
- return 0;
- for(i=0; i<len && a[i]==b[i]; i++)
- ;
- return i;
- }
- int
- countrle(uchar *a)
- {
- uchar a0;
- uchar *p;
- p = a;
- a0 = *p;
- while(*++p == a0)
- ;
- return p - a;
- }
- void
- compress(void)
- {
- int best, i, j, o, rle, run, maxrun, maxoff;
- ulong sum;
- Node *n;
- sum = 0;
- for(i=0; i<win && i<length; i++)
- sum = (sum*256+data[i])%Prime;
- for(i=0; i<length-win; ){
- maxrun = 0;
- maxoff = 0;
- if(verbose)
- fprint(2, "look %.6lux\n", sum);
- n = lookup(sum);
- if(n){
- best = -1;
- for(o=0; o<NOFF; o++){
- if(n->offset[o] == -1)
- break;
- run = cmprun(data+i, data+n->offset[o], length-i);
- if(run > maxrun && n->offset[o]+mindist < i){
- maxrun = run;
- maxoff = n->offset[o];
- best = o;
- }
- }
- if(best > 0){
- o = n->offset[best];
- for(j=best; j>0; j--)
- n->offset[j] = n->offset[j-1];
- n->offset[0] = o;
- }
- }
-
- if(maxrun >= minrun)
- j = i+refblock(i, maxrun, maxoff);
- else
- j = i+rawbyte(i);
- for(; i<j; i++){
- /* avoid huge chains from large runs of same byte */
- rle = countrle(data+i);
- if(rle<4)
- insertnode(sum, i);
- else if(rle>maxrle[data[i]]){
- maxrle[data[i]] = rle;
- insertnode(sum, i);
- }
- sum = (sum*256+data[i+win]) % Prime;
- sum = (sum + data[i]*outn) % Prime;
- }
- }
- /* could do better here */
- for(; i<length; i++)
- rawbyte(i);
- flushraw();
- }
- void
- usage(void)
- {
- fprint(2, "usage: bflz [-n winsize] [file]\n");
- exits("usage");
- }
- void
- main(int argc, char **argv)
- {
- int fd, i, n;
- char buf[10485760];
- ARGBEGIN{
- case 'd':
- verbose = 1;
- break;
- case 's':
- replacesame = atoi(EARGF(usage()));
- break;
- case 'm':
- mindist = atoi(EARGF(usage()));
- break;
- case 'n':
- win = atoi(EARGF(usage()));
- minrun = win;
- break;
- default:
- usage();
- }ARGEND
- switch(argc){
- default:
- usage();
- case 0:
- fd = 0;
- break;
- case 1:
- if((fd = open(argv[0], OREAD)) < 0)
- sysfatal("open %s: %r", argv[0]);
- break;
- }
- while((n = readn(fd, buf, sizeof buf)) > 0){
- data = realloc(data, length+n);
- if(data == nil)
- sysfatal("realloc: %r");
- memmove(data+length, buf, n);
- length += n;
- if(n < sizeof buf)
- break;
- }
- odat = malloc(length);
- if(odat == nil)
- sysfatal("malloc: %r");
- Binit(&bout, 1, OWRITE);
- Bprint(&bout, "BLZ\n");
- Bputint(&bout, length);
- outn = 1;
- for(i=0; i<win; i++)
- outn = (outn * 256) % Prime;
- if(verbose)
- fprint(2, "256^%d = %.6lux\n", win, outn);
- outn = Prime - outn;
- if(verbose)
- fprint(2, "outn = %.6lux\n", outn);
- compress();
- Bwrite(&bout, odat, nodat);
- Bterm(&bout);
- fprint(2, "brk %p\n", sbrk(1));
- fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
- exits(nil);
- }
|