Subversion Repositories planix.SVN

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

/*
 * Extraordinarily brute force Lempel & Ziv-like
 * compressor.  The input file must fit in memory
 * during compression, and the output file will
 * be reconstructed in memory during decompression.
 * We search for large common sequences and use a
 * greedy algorithm to choose which sequence gets
 * compressed first.
 *
 * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
 *
 * Output format is a series of blocks followed by
 * a raw data section.  Each block begins with a 32-bit big-endian
 * number.  The top bit is type and the next 31 bits
 * are uncompressed size.  Type is one of
 *      0 - use raw data for this length
 *      1 - a 32-bit offset follows
 * After the blocks come the raw data.  (The end of the blocks can be
 * noted by summing block lengths until you reach the file length.)
 */

#include <u.h>
#include <libc.h>
#include <bio.h>

#define malloc sbrk

int minrun = 16;
int win = 16;
ulong outn;
int verbose;
int mindist;

enum { Prime = 16777213 };      /* smallest prime < 2^24 (so p*256+256 < 2^32) */
enum { NOFF = 3 };

Biobuf bout;
ulong length;
uchar *data;
ulong sum32(ulong, void*, long);
uchar *odat;
int nodat;
int nraw;
int rawstart;
int acct;
int zlength;
int maxchain;
int maxrle[256];
int nnew;

typedef struct Node Node;
struct Node {
        Node *link;
        ulong key;
        ulong offset[NOFF];
};

Node *nodepool;
int nnodepool;

Node **hash;
uint nhash;

uint maxlen;
uint maxsame;
uint replacesame = 8*1024*1024;

Node *freelist, **freeend;
uint nalloc;

Node*
allocnode(void)
{
        int i;
        Node *n;

        if(nnodepool == 0){
                nnodepool = 256*1024;
                nodepool = malloc(sizeof(Node)*nnodepool);
        }
        if(freelist){
                n = freelist;
                freelist = n->link;
                return n;
        }
        assert(nnodepool > 0);
        nalloc++;
        n = &nodepool[--nnodepool];
        for(i=0; i<NOFF; i++)
                n->offset[i] = -1;

        return n;
}

void
freenode(Node *n)
{
        if(freelist == nil)
                freelist = n;
        else
                *freeend = n;
        freeend = &n->link;
        n->link = nil;
}

Node**
llookup(ulong key)
{
        uint c;
        Node **l, **top, *n;
        
        if(nhash == 0){
                uint x;

                x = length/8;
                for(nhash=1; nhash<x; nhash<<=1)
                        ;
                hash = sbrk(sizeof(Node*)*nhash);
        }

        top = &hash[key&(nhash-1)];
        c = 0;
        for(l=top; *l; l=&(*l)->link){
                c++;
                if((*l)->key == key){
                        /* move to front */
                        n = *l;
                        *l = n->link;
                        n->link = *top;
                        *top = n;
                        return top;
                }
        }
        if(c > maxlen)
                maxlen = c;
        return l;
}

Node*
lookup(ulong key)
{
        return *llookup(key);
}

void
insertnode(ulong key, ulong offset)
{
        int i;
        Node *n, **l;

        l = llookup(key);
        if(*l == nil){
                if(l==&hash[key&(nhash-1)])
                        nnew++;
                *l = allocnode();
                (*l)->key = key;
        }
        n = *l;

        /* add or replace last */
        for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
                ;
        n->offset[i] = offset;
}

void
Bputint(Biobufhdr *b, int n)
{
        uchar p[4];

        p[0] = n>>24;
        p[1] = n>>16;
        p[2] = n>>8;
        p[3] = n;
        Bwrite(b, p, 4);
}

void
flushraw(void)
{
        if(nraw){
                if(verbose)
                        fprint(2, "Raw %d+%d\n", rawstart, nraw);
                zlength += 4+nraw;
                Bputint(&bout, (1<<31)|nraw);
                memmove(odat+nodat, data+rawstart, nraw);
                nodat += nraw;
                nraw = 0;
        }
}

int
rawbyte(int i)
{
        assert(acct == i);
        if(nraw == 0)
                rawstart = i;
        acct++;
        nraw++;
        return 1;
}

int
refblock(int i, int len, int off)
{
        assert(acct == i);
        acct += len;
        if(nraw)
                flushraw();
        if(verbose)
                fprint(2, "Copy %d+%d from %d\n", i, len, off);
        Bputint(&bout, len);
        Bputint(&bout, off);
        zlength += 4+4;
        return len;
}

int
cmprun(uchar *a, uchar *b, int len)
{
        int i;

        if(a==b)
                return 0;
        for(i=0; i<len && a[i]==b[i]; i++)
                ;
        return i;
}

int
countrle(uchar *a)
{
        uchar a0;
        uchar *p;

        p = a;
        a0 = *p;
        while(*++p == a0)
                ;
        return p - a;
}

void
compress(void)
{
        int best, i, j, o, rle, run, maxrun, maxoff;
        ulong sum;
        Node *n;

        sum = 0;
        for(i=0; i<win && i<length; i++)
                sum = (sum*256+data[i])%Prime;
        for(i=0; i<length-win; ){
                maxrun = 0;
                maxoff = 0;
                if(verbose)
                        fprint(2, "look %.6lux\n", sum);
                n = lookup(sum);
                if(n){
                        best = -1;
                        for(o=0; o<NOFF; o++){
                                if(n->offset[o] == -1)
                                        break;
                                run = cmprun(data+i, data+n->offset[o], length-i);
                                if(run > maxrun && n->offset[o]+mindist < i){
                                        maxrun = run;
                                        maxoff = n->offset[o];
                                        best = o;
                                }
                        }
                        if(best > 0){
                                o = n->offset[best];
                                for(j=best; j>0; j--)
                                        n->offset[j] = n->offset[j-1];
                                n->offset[0] = o;
                        }
                }
                                
                if(maxrun >= minrun)
                        j = i+refblock(i, maxrun, maxoff);
                else
                        j = i+rawbyte(i);
                for(; i<j; i++){
                        /* avoid huge chains from large runs of same byte */
                        rle = countrle(data+i);
                        if(rle<4)
                                insertnode(sum, i);
                        else if(rle>maxrle[data[i]]){
                                maxrle[data[i]] = rle;
                                insertnode(sum, i);
                        }
                        sum = (sum*256+data[i+win]) % Prime;
                        sum = (sum + data[i]*outn) % Prime;
                }
        }
        /* could do better here */
        for(; i<length; i++)
                rawbyte(i);
        flushraw();
}

void
usage(void)
{
        fprint(2, "usage: bflz [-n winsize] [file]\n");
        exits("usage");
}

void
main(int argc, char **argv)
{
        int fd, i, n;
        char buf[10485760];

        ARGBEGIN{
        case 'd':
                verbose = 1;
                break;
        case 's':
                replacesame = atoi(EARGF(usage()));
                break;
        case 'm':
                mindist = atoi(EARGF(usage()));
                break;
        case 'n':
                win = atoi(EARGF(usage()));
                minrun = win;
                break;
        default:
                usage();
        }ARGEND

        switch(argc){
        default:
                usage();
        case 0:
                fd = 0;
                break;
        case 1:
                if((fd = open(argv[0], OREAD)) < 0)
                        sysfatal("open %s: %r", argv[0]);
                break;
        }

        while((n = readn(fd, buf, sizeof buf)) > 0){
                data = realloc(data, length+n);
                if(data == nil)
                        sysfatal("realloc: %r");
                memmove(data+length, buf, n);
                length += n;
                if(n < sizeof buf)
                        break;
        }
        odat = malloc(length);
        if(odat == nil)
                sysfatal("malloc: %r");

        Binit(&bout, 1, OWRITE);
        Bprint(&bout, "BLZ\n");
        Bputint(&bout, length);
        outn = 1;
        for(i=0; i<win; i++)
                outn = (outn * 256) % Prime;

        if(verbose)
                fprint(2, "256^%d = %.6lux\n", win, outn);
        outn = Prime - outn;
        if(verbose)
                fprint(2, "outn = %.6lux\n", outn);

        compress();
        Bwrite(&bout, odat, nodat);
        Bterm(&bout);
        fprint(2, "brk %p\n", sbrk(1));
        fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
        exits(nil);     
}