Subversion Repositories planix.SVN

Rev

Blame | Last modification | View Log | RSS feed

#include <u.h>
#include <libc.h>
#include <flate.h>

typedef struct Chain    Chain;
typedef struct Chains   Chains;
typedef struct Dyncode  Dyncode;
typedef struct Huff     Huff;
typedef struct LZblock  LZblock;
typedef struct LZstate  LZstate;

enum
{
        /*
         * deflate format paramaters
         */
        DeflateUnc      = 0,                    /* uncompressed block */
        DeflateFix      = 1,                    /* fixed huffman codes */
        DeflateDyn      = 2,                    /* dynamic huffman codes */

        DeflateEob      = 256,                  /* end of block code in lit/len book */
        DeflateMaxBlock = 64*1024-1,            /* maximum size of uncompressed block */

        DeflateMaxExp   = 10,                   /* maximum expansion for a block */

        LenStart        = 257,                  /* start of length codes in litlen */
        Nlitlen         = 288,                  /* number of litlen codes */
        Noff            = 30,                   /* number of offset codes */
        Nclen           = 19,                   /* number of codelen codes */

        MaxOff          = 32*1024,
        MinMatch        = 3,                    /* shortest match possible */
        MaxMatch        = 258,                  /* longest match possible */

        /*
         * huffman code paramaters
         */
        MaxLeaf         = Nlitlen,
        MaxHuffBits     = 16,                   /* max bits in a huffman code */
        ChainMem        = 2 * (MaxHuffBits - 1) * MaxHuffBits,

        /*
         * coding of the lz parse
         */
        LenFlag         = 1 << 3,
        LenShift        = 4,                    /* leaves enough space for MinMatchMaxOff */
        MaxLitRun       = LenFlag - 1,

        /*
         * internal lz paramaters
         */
        DeflateOut      = 4096,                 /* output buffer size */
        BlockSize       = 8192,                 /* attempted input read quanta */
        DeflateBlock    = DeflateMaxBlock & ~(BlockSize - 1),
        MinMatchMaxOff  = 4096,                 /* max profitable offset for small match;
                                                 * assumes 8 bits for len, 5+10 for offset
                                                 * DONT CHANGE WITHOUT CHANGING LZPARSE CONSTANTS
                                                 */
        HistSlop        = 512,                  /* must be at lead MaxMatch */
        HistBlock       = 64*1024,
        HistSize        = HistBlock + HistSlop,

        HashLog         = 13,
        HashSize        = 1<<HashLog,

        MaxOffCode      = 256,                  /* biggest offset looked up in direct table */

        EstLitBits      = 8,
        EstLenBits      = 4,
        EstOffBits      = 5,
};

/*
 * knuth vol. 3 multiplicative hashing
 * each byte x chosen according to rules
 * 1/4 < x < 3/10, 1/3 x < < 3/7, 4/7 < x < 2/3, 7/10 < x < 3/4
 * with reasonable spread between the bytes & their complements
 *
 * the 3 byte value appears to be as almost good as the 4 byte value,
 * and might be faster on some machines
 */
/*
#define hashit(c)       (((ulong)(c) * 0x6b43a9) >> (24 - HashLog))
*/
#define hashit(c)       ((((ulong)(c) & 0xffffff) * 0x6b43a9b5) >> (32 - HashLog))

/*
 * lempel-ziv style compression state
 */
struct LZstate
{
        uchar   hist[HistSize];
        ulong   pos;                            /* current location in history buffer */
        ulong   avail;                          /* data available after pos */
        int     eof;
        ushort  hash[HashSize];                 /* hash chains */
        ushort  nexts[MaxOff];
        int     now;                            /* pos in hash chains */
        int     dot;                            /* dawn of time in history */
        int     prevlen;                        /* lazy matching state */
        int     prevoff;
        int     maxcheck;                       /* compressor tuning */

        uchar   obuf[DeflateOut];
        uchar   *out;                           /* current position in the output buffer */
        uchar   *eout;
        ulong   bits;                           /* bit shift register */
        int     nbits;
        int     rbad;                           /* got an error reading the buffer */
        int     wbad;                           /* got an error writing the buffer */
        int     (*w)(void*, void*, int);
        void    *wr;

        ulong   totr;                           /* total input size */
        ulong   totw;                           /* total output size */
        int     debug;
};

struct LZblock
{
        ushort  parse[DeflateMaxBlock / 2 + 1];
        int     lastv;                          /* value being constucted for parse */
        ulong   litlencount[Nlitlen];
        ulong   offcount[Noff];
        ushort  *eparse;                        /* limit for parse table */
        int     bytes;                          /* consumed from the input */
        int     excost;                         /* cost of encoding extra len & off bits */
};

/*
 * huffman code table
 */
struct Huff
{
        short   bits;                           /* length of the code */
        ushort  encode;                         /* the code */
};

/*
 * encoding of dynamic huffman trees
 */
struct Dyncode
{
        int     nlit;
        int     noff;
        int     nclen;
        int     ncode;
        Huff    codetab[Nclen];
        uchar   codes[Nlitlen+Noff];
        uchar   codeaux[Nlitlen+Noff];
};

static  int     deflateb(LZstate *lz, LZblock *lzb, void *rr, int (*r)(void*, void*, int));
static  int     lzcomp(LZstate*, LZblock*, uchar*, ushort*, int finish);
static  void    wrblock(LZstate*, int, ushort*, ushort*, Huff*, Huff*);
static  int     bitcost(Huff*, ulong*, int);
static  int     huffcodes(Dyncode*, Huff*, Huff*);
static  void    wrdyncode(LZstate*, Dyncode*);
static  void    lzput(LZstate*, ulong bits, int nbits);
static  void    lzflushbits(LZstate*);
static  void    lzflush(LZstate *lz);
static  void    lzwrite(LZstate *lz, void *buf, int n);

static  int     hufftabinit(Huff*, int, ulong*, int);
static  int     mkgzprecode(Huff*, ulong *, int, int);

static  int     mkprecode(Huff*, ulong *, int, int, ulong*);
static  void    nextchain(Chains*, int);
static  void    leafsort(ulong*, ushort*, int, int);

/* conversion from len to code word */
static int lencode[MaxMatch];

/*
 * conversion from off to code word
 * off <= MaxOffCode ? offcode[off] : bigoffcode[off >> 7]
*/
static int offcode[MaxOffCode];
static int bigoffcode[256];

/* litlen code words LenStart-285 extra bits */
static int litlenbase[Nlitlen-LenStart];
static int litlenextra[Nlitlen-LenStart] =
{
/* 257 */       0, 0, 0,
/* 260 */       0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
/* 270 */       2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
/* 280 */       4, 5, 5, 5, 5, 0, 0, 0
};

/* offset code word extra bits */
static int offbase[Noff];
static int offextra[] =
{
        0,  0,  0,  0,  1,  1,  2,  2,  3,  3,
        4,  4,  5,  5,  6,  6,  7,  7,  8,  8,
        9,  9,  10, 10, 11, 11, 12, 12, 13, 13,
        0,  0,
};

/* order code lengths */
static int clenorder[Nclen] =
{
        16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};

/* static huffman tables */
static  Huff    litlentab[Nlitlen];
static  Huff    offtab[Noff];
static  Huff    hofftab[Noff];

/* bit reversal for brain dead endian swap in huffman codes */
static  uchar   revtab[256];
static  ulong   nlits;
static  ulong   nmatches;

int
deflateinit(void)
{
        ulong bitcount[MaxHuffBits];
        int i, j, ci, n;

        /* byte reverse table */
        for(i=0; i<256; i++)
                for(j=0; j<8; j++)
                        if(i & (1<<j))
                                revtab[i] |= 0x80 >> j;

        /* static Litlen bit lengths */
        for(i=0; i<144; i++)
                litlentab[i].bits = 8;
        for(i=144; i<256; i++)
                litlentab[i].bits = 9;
        for(i=256; i<280; i++)
                litlentab[i].bits = 7;
        for(i=280; i<Nlitlen; i++)
                litlentab[i].bits = 8;

        memset(bitcount, 0, sizeof(bitcount));
        bitcount[8] += 144 - 0;
        bitcount[9] += 256 - 144;
        bitcount[7] += 280 - 256;
        bitcount[8] += Nlitlen - 280;

        if(!hufftabinit(litlentab, Nlitlen, bitcount, 9))
                return FlateInternal;

        /* static offset bit lengths */
        for(i = 0; i < Noff; i++)
                offtab[i].bits = 5;

        memset(bitcount, 0, sizeof(bitcount));
        bitcount[5] = Noff;

        if(!hufftabinit(offtab, Noff, bitcount, 5))
                return FlateInternal;

        bitcount[0] = 0;
        bitcount[1] = 0;
        if(!mkgzprecode(hofftab, bitcount, 2, MaxHuffBits))
                return FlateInternal;

        /* conversion tables for lens & offs to codes */
        ci = 0;
        for(i = LenStart; i < 286; i++){
                n = ci + (1 << litlenextra[i - LenStart]);
                litlenbase[i - LenStart] = ci;
                for(; ci < n; ci++)
                        lencode[ci] = i;
        }
        /* patch up special case for len MaxMatch */
        lencode[MaxMatch-MinMatch] = 285;
        litlenbase[285-LenStart] = MaxMatch-MinMatch;

        ci = 0;
        for(i = 0; i < 16; i++){
                n = ci + (1 << offextra[i]);
                offbase[i] = ci;
                for(; ci < n; ci++)
                        offcode[ci] = i;
        }

        ci = ci >> 7;
        for(; i < 30; i++){
                n = ci + (1 << (offextra[i] - 7));
                offbase[i] = ci << 7;
                for(; ci < n; ci++)
                        bigoffcode[ci] = i;
        }
        return FlateOk;
}

static void
deflatereset(LZstate *lz, int level, int debug)
{
        memset(lz->nexts, 0, sizeof lz->nexts);
        memset(lz->hash, 0, sizeof lz->hash);
        lz->totr = 0;
        lz->totw = 0;
        lz->pos = 0;
        lz->avail = 0;
        lz->out = lz->obuf;
        lz->eout = &lz->obuf[DeflateOut];
        lz->prevlen = MinMatch - 1;
        lz->prevoff = 0;
        lz->now = MaxOff + 1;
        lz->dot = lz->now;
        lz->bits = 0;
        lz->nbits = 0;
        lz->maxcheck = (1 << level);
        lz->maxcheck -= lz->maxcheck >> 2;
        if(lz->maxcheck < 2)
                lz->maxcheck = 2;
        else if(lz->maxcheck > 1024)
                lz->maxcheck = 1024;

        lz->debug = debug;
}

int
deflate(void *wr, int (*w)(void*, void*, int), void *rr, int (*r)(void*, void*, int), int level, int debug)
{
        LZstate *lz;
        LZblock *lzb;
        int ok;

        lz = malloc(sizeof *lz + sizeof *lzb);
        if(lz == nil)
                return FlateNoMem;
        lzb = (LZblock*)&lz[1];

        deflatereset(lz, level, debug);
        lz->w = w;
        lz->wr = wr;
        lz->wbad = 0;
        lz->rbad = 0;
        lz->eof = 0;
        ok = FlateOk;
        while(!lz->eof || lz->avail){
                ok = deflateb(lz, lzb, rr, r);
                if(ok != FlateOk)
                        break;
        }
        if(ok == FlateOk && lz->rbad)
                ok = FlateInputFail;
        if(ok == FlateOk && lz->wbad)
                ok = FlateOutputFail;
        free(lz);
        return ok;
}

static int
deflateb(LZstate *lz, LZblock *lzb, void *rr, int (*r)(void*, void*, int))
{
        Dyncode dyncode, hdyncode;
        Huff dlitlentab[Nlitlen], dofftab[Noff], hlitlentab[Nlitlen];
        ulong litcount[Nlitlen];
        long nunc, ndyn, nfix, nhuff;
        uchar *slop, *hslop;
        ulong ep;
        int i, n, m, mm, nslop;

        memset(lzb->litlencount, 0, sizeof lzb->litlencount);
        memset(lzb->offcount, 0, sizeof lzb->offcount);
        lzb->litlencount[DeflateEob]++;

        lzb->bytes = 0;
        lzb->eparse = lzb->parse;
        lzb->lastv = 0;
        lzb->excost = 0;

        slop = &lz->hist[lz->pos];
        n = lz->avail;
        while(n < DeflateBlock && (!lz->eof || lz->avail)){
                /*
                 * fill the buffer as much as possible,
                 * while leaving room for MaxOff history behind lz->pos,
                 * and not reading more than we can handle.
                 *
                 * make sure we read at least HistSlop bytes.
                 */
                if(!lz->eof){
                        ep = lz->pos + lz->avail;
                        if(ep >= HistBlock)
                                ep -= HistBlock;
                        m = HistBlock - MaxOff - lz->avail;
                        if(m > HistBlock - n)
                                m = HistBlock - n;
                        if(m > (HistBlock + HistSlop) - ep)
                                m = (HistBlock + HistSlop) - ep;
                        if(m & ~(BlockSize - 1))
                                m &= ~(BlockSize - 1);

                        /*
                         * be nice to the caller: stop reads that are too small.
                         * can only get here when we've already filled the buffer some
                         */
                        if(m < HistSlop){
                                if(!m || !lzb->bytes)
                                        return FlateInternal;
                                break;
                        }

                        mm = (*r)(rr, &lz->hist[ep], m);
                        if(mm > 0){
                                /*
                                 * wrap data to end if we're read it from the beginning
                                 * this way, we don't have to wrap searches.
                                 *
                                 * wrap reads past the end to the beginning.
                                 * this way, we can guarantee minimum size reads.
                                 */
                                if(ep < HistSlop)
                                        memmove(&lz->hist[ep + HistBlock], &lz->hist[ep], HistSlop - ep);
                                else if(ep + mm > HistBlock)
                                        memmove(&lz->hist[0], &lz->hist[HistBlock], ep + mm - HistBlock);

                                lz->totr += mm;
                                n += mm;
                                lz->avail += mm;
                        }else{
                                if(mm < 0)
                                        lz->rbad = 1;
                                lz->eof = 1;
                        }
                }
                ep = lz->pos + lz->avail;
                if(ep > HistSize)
                        ep = HistSize;
                if(lzb->bytes + ep - lz->pos > DeflateMaxBlock)
                        ep = lz->pos + DeflateMaxBlock - lzb->bytes;
                m = lzcomp(lz, lzb, &lz->hist[ep], lzb->eparse, lz->eof);
                lzb->bytes += m;
                lz->pos = (lz->pos + m) & (HistBlock - 1);
                lz->avail -= m;
        }
        if(lzb->lastv)
                *lzb->eparse++ = lzb->lastv;
        if(lzb->eparse > lzb->parse + nelem(lzb->parse))
                return FlateInternal;
        nunc = lzb->bytes;

        if(!mkgzprecode(dlitlentab, lzb->litlencount, Nlitlen, MaxHuffBits)
        || !mkgzprecode(dofftab, lzb->offcount, Noff, MaxHuffBits))
                return FlateInternal;

        ndyn = huffcodes(&dyncode, dlitlentab, dofftab);
        if(ndyn < 0)
                return FlateInternal;
        ndyn += bitcost(dlitlentab, lzb->litlencount, Nlitlen)
                + bitcost(dofftab, lzb->offcount, Noff)
                + lzb->excost;

        memset(litcount, 0, sizeof litcount);

        nslop = nunc;
        if(nslop > &lz->hist[HistSize] - slop)
                nslop = &lz->hist[HistSize] - slop;

        for(i = 0; i < nslop; i++)
                litcount[slop[i]]++;
        hslop = &lz->hist[HistSlop - nslop];
        for(; i < nunc; i++)
                litcount[hslop[i]]++;
        litcount[DeflateEob]++;

        if(!mkgzprecode(hlitlentab, litcount, Nlitlen, MaxHuffBits))
                return FlateInternal;
        nhuff = huffcodes(&hdyncode, hlitlentab, hofftab);
        if(nhuff < 0)
                return FlateInternal;
        nhuff += bitcost(hlitlentab, litcount, Nlitlen);

        nfix = bitcost(litlentab, lzb->litlencount, Nlitlen)
                + bitcost(offtab, lzb->offcount, Noff)
                + lzb->excost;

        lzput(lz, lz->eof && !lz->avail, 1);

        if(lz->debug){
                fprint(2, "block: bytes=%lud entries=%ld extra bits=%d\n\tuncompressed=%lud fixed=%lud dynamic=%lud huffman=%lud\n",
                        nunc, lzb->eparse - lzb->parse, lzb->excost, (nunc + 4) * 8, nfix, ndyn, nhuff);
                fprint(2, "\tnlit=%lud matches=%lud eof=%d\n", nlits, nmatches, lz->eof && !lz->avail);
        }

        if((nunc + 4) * 8 < ndyn && (nunc + 4) * 8 < nfix && (nunc + 4) * 8 < nhuff){
                lzput(lz, DeflateUnc, 2);
                lzflushbits(lz);

                lzput(lz, nunc & 0xff, 8);
                lzput(lz, (nunc >> 8) & 0xff, 8);
                lzput(lz, ~nunc & 0xff, 8);
                lzput(lz, (~nunc >> 8) & 0xff, 8);
                lzflush(lz);

                lzwrite(lz, slop, nslop);
                lzwrite(lz, &lz->hist[HistSlop], nunc - nslop);
        }else if(ndyn < nfix && ndyn < nhuff){
                lzput(lz, DeflateDyn, 2);

                wrdyncode(lz, &dyncode);
                wrblock(lz, slop - lz->hist, lzb->parse, lzb->eparse, dlitlentab, dofftab);
                lzput(lz, dlitlentab[DeflateEob].encode, dlitlentab[DeflateEob].bits);
        }else if(nhuff < nfix){
                lzput(lz, DeflateDyn, 2);

                wrdyncode(lz, &hdyncode);

                m = 0;
                for(i = nunc; i > MaxLitRun; i -= MaxLitRun)
                        lzb->parse[m++] = MaxLitRun;
                lzb->parse[m++] = i;

                wrblock(lz, slop - lz->hist, lzb->parse, lzb->parse + m, hlitlentab, hofftab);
                lzput(lz, hlitlentab[DeflateEob].encode, hlitlentab[DeflateEob].bits);
        }else{
                lzput(lz, DeflateFix, 2);

                wrblock(lz, slop - lz->hist, lzb->parse, lzb->eparse, litlentab, offtab);
                lzput(lz, litlentab[DeflateEob].encode, litlentab[DeflateEob].bits);
        }

        if(lz->eof && !lz->avail){
                lzflushbits(lz);
                lzflush(lz);
        }
        return FlateOk;
}

static void
lzwrite(LZstate *lz, void *buf, int n)
{
        int nw;

        if(n && lz->w){
                nw = (*lz->w)(lz->wr, buf, n);
                if(nw != n){
                        lz->w = nil;
                        lz->wbad = 1;
                }else
                        lz->totw += n;
        }
}

static void
lzflush(LZstate *lz)
{
        lzwrite(lz, lz->obuf, lz->out - lz->obuf);
        lz->out = lz->obuf;
}

static void
lzput(LZstate *lz, ulong bits, int nbits)
{
        bits = (bits << lz->nbits) | lz->bits;
        for(nbits += lz->nbits; nbits >= 8; nbits -= 8){
                *lz->out++ = bits;
                if(lz->out == lz->eout)
                        lzflush(lz);
                bits >>= 8;
        }
        lz->bits = bits;
        lz->nbits = nbits;
}

static void
lzflushbits(LZstate *lz)
{
        if(lz->nbits)
                lzput(lz, 0, 8 - (lz->nbits & 7));
}

/*
 * write out a block of n samples,
 * given lz encoding and counts for huffman tables
 */
static void
wrblock(LZstate *out, int litoff, ushort *soff, ushort *eoff, Huff *litlentab, Huff *offtab)
{
        ushort *off;
        int i, run, offset, lit, len, c;

        if(out->debug > 2){
                for(off = soff; off < eoff; ){
                        offset = *off++;
                        run = offset & MaxLitRun;
                        if(run){
                                for(i = 0; i < run; i++){
                                        lit = out->hist[litoff & (HistBlock - 1)];
                                        litoff++;
                                        fprint(2, "\tlit %.2ux %c\n", lit, lit);
                                }
                                if(!(offset & LenFlag))
                                        continue;
                                len = offset >> LenShift;
                                offset = *off++;
                        }else if(offset & LenFlag){
                                len = offset >> LenShift;
                                offset = *off++;
                        }else{
                                len = 0;
                                offset >>= LenShift;
                        }
                        litoff += len + MinMatch;
                        fprint(2, "\t<%d, %d>\n", offset + 1, len + MinMatch);
                }
        }

        for(off = soff; off < eoff; ){
                offset = *off++;
                run = offset & MaxLitRun;
                if(run){
                        for(i = 0; i < run; i++){
                                lit = out->hist[litoff & (HistBlock - 1)];
                                litoff++;
                                lzput(out, litlentab[lit].encode, litlentab[lit].bits);
                        }
                        if(!(offset & LenFlag))
                                continue;
                        len = offset >> LenShift;
                        offset = *off++;
                }else if(offset & LenFlag){
                        len = offset >> LenShift;
                        offset = *off++;
                }else{
                        len = 0;
                        offset >>= LenShift;
                }
                litoff += len + MinMatch;
                c = lencode[len];
                lzput(out, litlentab[c].encode, litlentab[c].bits);
                c -= LenStart;
                if(litlenextra[c])
                        lzput(out, len - litlenbase[c], litlenextra[c]);

                if(offset < MaxOffCode)
                        c = offcode[offset];
                else
                        c = bigoffcode[offset >> 7];
                lzput(out, offtab[c].encode, offtab[c].bits);
                if(offextra[c])
                        lzput(out, offset - offbase[c], offextra[c]);
        }
}

/*
 * look for the longest, closest string which matches
 * the next prefix.  the clever part here is looking for
 * a string 1 longer than the previous best match.
 *
 * follows the recommendation of limiting number of chains
 * which are checked.  this appears to be the best heuristic.
 */
static int
lzmatch(int now, int then, uchar *p, uchar *es, ushort *nexts, uchar *hist, int runlen, int check, int *m)
{
        uchar *s, *t;
        int ml, off, last;

        ml = check;
        if(runlen >= 8)
                check >>= 2;
        *m = 0;
        if(p + runlen >= es)
                return runlen;
        last = 0;
        for(; check-- > 0; then = nexts[then & (MaxOff-1)]){
                off = (ushort)(now - then);
                if(off <= last || off > MaxOff)
                        break;
                s = p + runlen;
                t = hist + (((p - hist) - off) & (HistBlock-1));
                t += runlen;
                for(; s >= p; s--){
                        if(*s != *t)
                                goto matchloop;
                        t--;
                }

                /*
                 * we have a new best match.
                 * extend it to it's maximum length
                 */
                t += runlen + 2;
                s += runlen + 2;
                for(; s < es; s++){
                        if(*s != *t)
                                break;
                        t++;
                }
                runlen = s - p;
                *m = off - 1;
                if(s == es || runlen > ml)
                        break;
matchloop:;
                last = off;
        }
        return runlen;
}

static int
lzcomp(LZstate *lz, LZblock *lzb, uchar *ep, ushort *parse, int finish)
{
        ulong cont, excost, *litlencount, *offcount;
        uchar *p, *q, *s, *es;
        ushort *nexts, *hash;
        int v, i, h, runlen, n, now, then, m, prevlen, prevoff, maxdefer;

        litlencount = lzb->litlencount;
        offcount = lzb->offcount;
        nexts = lz->nexts;
        hash = lz->hash;
        now = lz->now;

        p = &lz->hist[lz->pos];
        if(lz->prevlen != MinMatch - 1)
                p++;

        /*
         * hash in the links for any hanging link positions,
         * and calculate the hash for the current position.
         */
        n = MinMatch;
        if(n > ep - p)
                n = ep - p;
        cont = 0;
        for(i = 0; i < n - 1; i++){
                m = now - ((MinMatch-1) - i);
                if(m < lz->dot)
                        continue;
                s = lz->hist + (((p - lz->hist) - (now - m)) & (HistBlock-1));

                cont = (s[0] << 16) | (s[1] << 8) | s[2];
                h = hashit(cont);
                prevoff = 0;
                for(then = hash[h]; ; then = nexts[then & (MaxOff-1)]){
                        v = (ushort)(now - then);
                        if(v <= prevoff || v >= (MinMatch-1) - i)
                                break;
                        prevoff = v;
                }
                if(then == (ushort)m)
                        continue;
                nexts[m & (MaxOff-1)] = hash[h];
                hash[h] = m;
        }
        for(i = 0; i < n; i++)
                cont = (cont << 8) | p[i];

        /*
         * now must point to the index in the nexts array
         * corresponding to p's position in the history
         */
        prevlen = lz->prevlen;
        prevoff = lz->prevoff;
        maxdefer = lz->maxcheck >> 2;
        excost = 0;
        v = lzb->lastv;
        for(;;){
                es = p + MaxMatch;
                if(es > ep){
                        if(!finish || p >= ep)
                                break;
                        es = ep;
                }

                h = hashit(cont);
                runlen = lzmatch(now, hash[h], p, es, nexts, lz->hist, prevlen, lz->maxcheck, &m);

                /*
                 * back out of small matches too far in the past
                 */
                if(runlen == MinMatch && m >= MinMatchMaxOff){
                        runlen = MinMatch - 1;
                        m = 0;
                }

                /*
                 * record the encoding and increment counts for huffman trees
                 * if we get a match, defer selecting it until we check for
                 * a longer match at the next position.
                 */
                if(prevlen >= runlen && prevlen != MinMatch - 1){
                        /*
                         * old match at least as good; use that one
                         */
                        n = prevlen - MinMatch;
                        if(v || n){
                                *parse++ = v | LenFlag | (n << LenShift);
                                *parse++ = prevoff;
                        }else
                                *parse++ = prevoff << LenShift;
                        v = 0;

                        n = lencode[n];
                        litlencount[n]++;
                        excost += litlenextra[n - LenStart];

                        if(prevoff < MaxOffCode)
                                n = offcode[prevoff];
                        else
                                n = bigoffcode[prevoff >> 7];
                        offcount[n]++;
                        excost += offextra[n];

                        runlen = prevlen - 1;
                        prevlen = MinMatch - 1;
                        nmatches++;
                }else if(runlen == MinMatch - 1){
                        /*
                         * no match; just put out the literal
                         */
                        if(++v == MaxLitRun){
                                *parse++ = v;
                                v = 0;
                        }
                        litlencount[*p]++;
                        nlits++;
                        runlen = 1;
                }else{
                        if(prevlen != MinMatch - 1){
                                /*
                                 * longer match now. output previous literal,
                                 * update current match, and try again
                                 */
                                if(++v == MaxLitRun){
                                        *parse++ = v;
                                        v = 0;
                                }
                                litlencount[p[-1]]++;
                                nlits++;
                        }

                        prevoff = m;

                        if(runlen < maxdefer){
                                prevlen = runlen;
                                runlen = 1;
                        }else{
                                n = runlen - MinMatch;
                                if(v || n){
                                        *parse++ = v | LenFlag | (n << LenShift);
                                        *parse++ = prevoff;
                                }else
                                        *parse++ = prevoff << LenShift;
                                v = 0;

                                n = lencode[n];
                                litlencount[n]++;
                                excost += litlenextra[n - LenStart];

                                if(prevoff < MaxOffCode)
                                        n = offcode[prevoff];
                                else
                                        n = bigoffcode[prevoff >> 7];
                                offcount[n]++;
                                excost += offextra[n];

                                prevlen = MinMatch - 1;
                                nmatches++;
                        }
                }

                /*
                 * update the hash for the newly matched data
                 * this is constructed so the link for the old
                 * match in this position must be at the end of a chain,
                 * and will expire when this match is added, ie it will
                 * never be examined by the match loop.
                 * add to the hash chain only if we have the real hash data.
                 */
                for(q = p + runlen; p != q; p++){
                        if(p + MinMatch <= ep){
                                h = hashit(cont);
                                nexts[now & (MaxOff-1)] = hash[h];
                                hash[h] = now;
                                if(p + MinMatch < ep)
                                        cont = (cont << 8) | p[MinMatch];
                        }
                        now++;
                }
        }

        /*
         * we can just store away the lazy state and
         * pick it up next time.  the last block will have finish set
         * so we won't have any pending matches
         * however, we need to correct for how much we've encoded
         */
        if(prevlen != MinMatch - 1)
                p--;

        lzb->excost += excost;
        lzb->eparse = parse;
        lzb->lastv = v;

        lz->now = now;
        lz->prevlen = prevlen;
        lz->prevoff = prevoff;

        return p - &lz->hist[lz->pos];
}

/*
 * make up the dynamic code tables, and return the number of bits
 * needed to transmit them.
 */
static int
huffcodes(Dyncode *dc, Huff *littab, Huff *offtab)
{
        Huff *codetab;
        uchar *codes, *codeaux;
        ulong codecount[Nclen], excost;
        int i, n, m, v, c, nlit, noff, ncode, nclen;

        codetab = dc->codetab;
        codes = dc->codes;
        codeaux = dc->codeaux;

        /*
         * trim the sizes of the tables
         */
        for(nlit = Nlitlen; nlit > 257 && littab[nlit-1].bits == 0; nlit--)
                ;
        for(noff = Noff; noff > 1 && offtab[noff-1].bits == 0; noff--)
                ;

        /*
         * make the code-length code
         */
        for(i = 0; i < nlit; i++)
                codes[i] = littab[i].bits;
        for(i = 0; i < noff; i++)
                codes[i + nlit] = offtab[i].bits;

        /*
         * run-length compress the code-length code
         */
        excost = 0;
        c = 0;
        ncode = nlit+noff;
        for(i = 0; i < ncode; ){
                n = i + 1;
                v = codes[i];
                while(n < ncode && v == codes[n])
                        n++;
                n -= i;
                i += n;
                if(v == 0){
                        while(n >= 11){
                                m = n;
                                if(m > 138)
                                        m = 138;
                                codes[c] = 18;
                                codeaux[c++] = m - 11;
                                n -= m;
                                excost += 7;
                        }
                        if(n >= 3){
                                codes[c] = 17;
                                codeaux[c++] = n - 3;
                                n = 0;
                                excost += 3;
                        }
                }
                while(n--){
                        codes[c++] = v;
                        while(n >= 3){
                                m = n;
                                if(m > 6)
                                        m = 6;
                                codes[c] = 16;
                                codeaux[c++] = m - 3;
                                n -= m;
                                excost += 3;
                        }
                }
        }

        memset(codecount, 0, sizeof codecount);
        for(i = 0; i < c; i++)
                codecount[codes[i]]++;
        if(!mkgzprecode(codetab, codecount, Nclen, 8))
                return -1;

        for(nclen = Nclen; nclen > 4 && codetab[clenorder[nclen-1]].bits == 0; nclen--)
                ;

        dc->nlit = nlit;
        dc->noff = noff;
        dc->nclen = nclen;
        dc->ncode = c;

        return 5 + 5 + 4 + nclen * 3 + bitcost(codetab, codecount, Nclen) + excost;
}

static void
wrdyncode(LZstate *out, Dyncode *dc)
{
        Huff *codetab;
        uchar *codes, *codeaux;
        int i, v, c;

        /*
         * write out header, then code length code lengths,
         * and code lengths
         */
        lzput(out, dc->nlit-257, 5);
        lzput(out, dc->noff-1, 5);
        lzput(out, dc->nclen-4, 4);

        codetab = dc->codetab;
        for(i = 0; i < dc->nclen; i++)
                lzput(out, codetab[clenorder[i]].bits, 3);

        codes = dc->codes;
        codeaux = dc->codeaux;
        c = dc->ncode;
        for(i = 0; i < c; i++){
                v = codes[i];
                lzput(out, codetab[v].encode, codetab[v].bits);
                if(v >= 16){
                        if(v == 16)
                                lzput(out, codeaux[i], 2);
                        else if(v == 17)
                                lzput(out, codeaux[i], 3);
                        else /* v == 18 */
                                lzput(out, codeaux[i], 7);
                }
        }
}

static int
bitcost(Huff *tab, ulong *count, int n)
{
        ulong tot;
        int i;

        tot = 0;
        for(i = 0; i < n; i++)
                tot += count[i] * tab[i].bits;
        return tot;
}

static int
mkgzprecode(Huff *tab, ulong *count, int n, int maxbits)
{
        ulong bitcount[MaxHuffBits];
        int i, nbits;

        nbits = mkprecode(tab, count, n, maxbits, bitcount);
        for(i = 0; i < n; i++){
                if(tab[i].bits == -1)
                        tab[i].bits = 0;
                else if(tab[i].bits == 0){
                        if(nbits != 0 || bitcount[0] != 1)
                                return 0;
                        bitcount[1] = 1;
                        bitcount[0] = 0;
                        nbits = 1;
                        tab[i].bits = 1;
                }
        }
        if(bitcount[0] != 0)
                return 0;
        return hufftabinit(tab, n, bitcount, nbits);
}

static int
hufftabinit(Huff *tab, int n, ulong *bitcount, int nbits)
{
        ulong code, nc[MaxHuffBits];
        int i, bits;

        code = 0;
        for(bits = 1; bits <= nbits; bits++){
                code = (code + bitcount[bits-1]) << 1;
                nc[bits] = code;
        }

        for(i = 0; i < n; i++){
                bits = tab[i].bits;
                if(bits){
                        code = nc[bits]++ << (16 - bits);
                        if(code & ~0xffff)
                                return 0;
                        tab[i].encode = revtab[code >> 8] | (revtab[code & 0xff] << 8);
                }
        }
        return 1;
}


/*
 * this should be in a library
 */
struct Chain
{
        ulong   count;                          /* occurances of everything in the chain */
        ushort  leaf;                           /* leaves to the left of chain, or leaf value */
        char    col;                            /* ref count for collecting unused chains */
        char    gen;                            /* need to generate chains for next lower level */
        Chain   *up;                            /* Chain up in the lists */
};

struct Chains
{
        Chain   *lists[(MaxHuffBits - 1) * 2];
        ulong   leafcount[MaxLeaf];             /* sorted list of leaf counts */
        ushort  leafmap[MaxLeaf];               /* map to actual leaf number */
        int     nleaf;                          /* number of leaves */
        Chain   chains[ChainMem];
        Chain   *echains;
        Chain   *free;
        char    col;
        int     nlists;
};

/*
 * fast, low space overhead algorithm for max depth huffman type codes
 *
 * J. Katajainen, A. Moffat and A. Turpin, "A fast and space-economical
 * algorithm for length-limited coding," Proc. Intl. Symp. on Algorithms
 * and Computation, Cairns, Australia, Dec. 1995, Lecture Notes in Computer
 * Science, Vol 1004, J. Staples, P. Eades, N. Katoh, and A. Moffat, eds.,
 * pp 12-21, Springer Verlag, New York, 1995.
 */
static int
mkprecode(Huff *tab, ulong *count, int n, int maxbits, ulong *bitcount)
{
        Chains cs;
        Chain *c;
        int i, m, em, bits;

        /*
         * set up the sorted list of leaves
         */
        m = 0;
        for(i = 0; i < n; i++){
                tab[i].bits = -1;
                tab[i].encode = 0;
                if(count[i] != 0){
                        cs.leafcount[m] = count[i];
                        cs.leafmap[m] = i;
                        m++;
                }
        }
        if(m < 2){
                if(m != 0){
                        tab[cs.leafmap[0]].bits = 0;
                        bitcount[0] = 1;
                }else
                        bitcount[0] = 0;
                return 0;
        }
        cs.nleaf = m;
        leafsort(cs.leafcount, cs.leafmap, 0, m);

        for(i = 0; i < m; i++)
                cs.leafcount[i] = count[cs.leafmap[i]];

        /*
         * set up free list
         */
        cs.free = &cs.chains[2];
        cs.echains = &cs.chains[ChainMem];
        cs.col = 1;

        /*
         * initialize chains for each list
         */
        c = &cs.chains[0];
        c->count = cs.leafcount[0];
        c->leaf = 1;
        c->col = cs.col;
        c->up = nil;
        c->gen = 0;
        cs.chains[1] = cs.chains[0];
        cs.chains[1].leaf = 2;
        cs.chains[1].count = cs.leafcount[1];
        for(i = 0; i < maxbits-1; i++){
                cs.lists[i * 2] = &cs.chains[0];
                cs.lists[i * 2 + 1] = &cs.chains[1];
        }

        cs.nlists = 2 * (maxbits - 1);
        m = 2 * m - 2;
        for(i = 2; i < m; i++)
                nextchain(&cs, cs.nlists - 2);

        bits = 0;
        bitcount[0] = cs.nleaf;
        for(c = cs.lists[cs.nlists - 1]; c != nil; c = c->up){
                m = c->leaf;
                bitcount[bits++] -= m;
                bitcount[bits] = m;
        }
        m = 0;
        for(i = bits; i >= 0; i--)
                for(em = m + bitcount[i]; m < em; m++)
                        tab[cs.leafmap[m]].bits = i;

        return bits;
}

/*
 * calculate the next chain on the list
 * we can always toss out the old chain
 */
static void
nextchain(Chains *cs, int list)
{
        Chain *c, *oc;
        int i, nleaf, sumc;

        oc = cs->lists[list + 1];
        cs->lists[list] = oc;
        if(oc == nil)
                return;

        /*
         * make sure we have all chains needed to make sumc
         * note it is possible to generate only one of these,
         * use twice that value for sumc, and then generate
         * the second if that preliminary sumc would be chosen.
         * however, this appears to be slower on current tests
         */
        if(oc->gen){
                nextchain(cs, list - 2);
                nextchain(cs, list - 2);
                oc->gen = 0;
        }

        /*
         * pick up the chain we're going to add;
         * collect unused chains no free ones are left
         */
        for(c = cs->free; ; c++){
                if(c >= cs->echains){
                        cs->col++;
                        for(i = 0; i < cs->nlists; i++)
                                for(c = cs->lists[i]; c != nil; c = c->up)
                                        c->col = cs->col;
                        c = cs->chains;
                }
                if(c->col != cs->col)
                        break;
        }

        /*
         * pick the cheapest of
         * 1) the next package from the previous list
         * 2) the next leaf
         */
        nleaf = oc->leaf;
        sumc = 0;
        if(list > 0 && cs->lists[list-1] != nil)
                sumc = cs->lists[list-2]->count + cs->lists[list-1]->count;
        if(sumc != 0 && (nleaf >= cs->nleaf || cs->leafcount[nleaf] > sumc)){
                c->count = sumc;
                c->leaf = oc->leaf;
                c->up = cs->lists[list-1];
                c->gen = 1;
        }else if(nleaf >= cs->nleaf){
                cs->lists[list + 1] = nil;
                return;
        }else{
                c->leaf = nleaf + 1;
                c->count = cs->leafcount[nleaf];
                c->up = oc->up;
                c->gen = 0;
        }
        cs->free = c + 1;

        cs->lists[list + 1] = c;
        c->col = cs->col;
}

static int
pivot(ulong *c, int a, int n)
{
        int j, pi, pj, pk;

        j = n/6;
        pi = a + j;     /* 1/6 */
        j += j;
        pj = pi + j;    /* 1/2 */
        pk = pj + j;    /* 5/6 */
        if(c[pi] < c[pj]){
                if(c[pi] < c[pk]){
                        if(c[pj] < c[pk])
                                return pj;
                        return pk;
                }
                return pi;
        }
        if(c[pj] < c[pk]){
                if(c[pi] < c[pk])
                        return pi;
                return pk;
        }
        return pj;
}

static  void
leafsort(ulong *leafcount, ushort *leafmap, int a, int n)
{
        ulong t;
        int j, pi, pj, pn;

        while(n > 1){
                if(n > 10){
                        pi = pivot(leafcount, a, n);
                }else
                        pi = a + (n>>1);

                t = leafcount[pi];
                leafcount[pi] = leafcount[a];
                leafcount[a] = t;
                t = leafmap[pi];
                leafmap[pi] = leafmap[a];
                leafmap[a] = t;
                pi = a;
                pn = a + n;
                pj = pn;
                for(;;){
                        do
                                pi++;
                        while(pi < pn && (leafcount[pi] < leafcount[a] || leafcount[pi] == leafcount[a] && leafmap[pi] > leafmap[a]));
                        do
                                pj--;
                        while(pj > a && (leafcount[pj] > leafcount[a] || leafcount[pj] == leafcount[a] && leafmap[pj] < leafmap[a]));
                        if(pj < pi)
                                break;
                        t = leafcount[pi];
                        leafcount[pi] = leafcount[pj];
                        leafcount[pj] = t;
                        t = leafmap[pi];
                        leafmap[pi] = leafmap[pj];
                        leafmap[pj] = t;
                }
                t = leafcount[a];
                leafcount[a] = leafcount[pj];
                leafcount[pj] = t;
                t = leafmap[a];
                leafmap[a] = leafmap[pj];
                leafmap[pj] = t;
                j = pj - a;

                n = n-j-1;
                if(j >= n){
                        leafsort(leafcount, leafmap, a, j);
                        a += j+1;
                }else{
                        leafsort(leafcount, leafmap, a + (j+1), n);
                        n = j;
                }
        }
}