Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/*
2
 * Extraordinarily brute force Lempel & Ziv-like
3
 * compressor.  The input file must fit in memory
4
 * during compression, and the output file will
5
 * be reconstructed in memory during decompression.
6
 * We search for large common sequences and use a
7
 * greedy algorithm to choose which sequence gets
8
 * compressed first.
9
 *
10
 * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
11
 *
12
 * Output format is a series of blocks followed by
13
 * a raw data section.  Each block begins with a 32-bit big-endian
14
 * number.  The top bit is type and the next 31 bits
15
 * are uncompressed size.  Type is one of
16
 *	0 - use raw data for this length
17
 *	1 - a 32-bit offset follows
18
 * After the blocks come the raw data.  (The end of the blocks can be
19
 * noted by summing block lengths until you reach the file length.)
20
 */
21
 
22
#include <u.h>
23
#include <libc.h>
24
#include <bio.h>
25
 
26
#define malloc sbrk
27
 
28
int minrun = 16;
29
int win = 16;
30
ulong outn;
31
int verbose;
32
int mindist;
33
 
34
enum { Prime = 16777213 };	/* smallest prime < 2^24 (so p*256+256 < 2^32) */
35
enum { NOFF = 3 };
36
 
37
Biobuf bout;
38
ulong length;
39
uchar *data;
40
ulong sum32(ulong, void*, long);
41
uchar *odat;
42
int nodat;
43
int nraw;
44
int rawstart;
45
int acct;
46
int zlength;
47
int maxchain;
48
int maxrle[256];
49
int nnew;
50
 
51
typedef struct Node Node;
52
struct Node {
53
	Node *link;
54
	ulong key;
55
	ulong offset[NOFF];
56
};
57
 
58
Node *nodepool;
59
int nnodepool;
60
 
61
Node **hash;
62
uint nhash;
63
 
64
uint maxlen;
65
uint maxsame;
66
uint replacesame = 8*1024*1024;
67
 
68
Node *freelist, **freeend;
69
uint nalloc;
70
 
71
Node*
72
allocnode(void)
73
{
74
	int i;
75
	Node *n;
76
 
77
	if(nnodepool == 0){
78
		nnodepool = 256*1024;
79
		nodepool = malloc(sizeof(Node)*nnodepool);
80
	}
81
	if(freelist){
82
		n = freelist;
83
		freelist = n->link;
84
		return n;
85
	}
86
	assert(nnodepool > 0);
87
	nalloc++;
88
	n = &nodepool[--nnodepool];
89
	for(i=0; i<NOFF; i++)
90
		n->offset[i] = -1;
91
 
92
	return n;
93
}
94
 
95
void
96
freenode(Node *n)
97
{
98
	if(freelist == nil)
99
		freelist = n;
100
	else
101
		*freeend = n;
102
	freeend = &n->link;
103
	n->link = nil;
104
}
105
 
106
Node**
107
llookup(ulong key)
108
{
109
	uint c;
110
	Node **l, **top, *n;
111
 
112
	if(nhash == 0){
113
		uint x;
114
 
115
		x = length/8;
116
		for(nhash=1; nhash<x; nhash<<=1)
117
			;
118
		hash = sbrk(sizeof(Node*)*nhash);
119
	}
120
 
121
	top = &hash[key&(nhash-1)];
122
	c = 0;
123
	for(l=top; *l; l=&(*l)->link){
124
		c++;
125
		if((*l)->key == key){
126
			/* move to front */
127
			n = *l;
128
			*l = n->link;
129
			n->link = *top;
130
			*top = n;
131
			return top;
132
		}
133
	}
134
	if(c > maxlen)
135
		maxlen = c;
136
	return l;
137
}
138
 
139
Node*
140
lookup(ulong key)
141
{
142
	return *llookup(key);
143
}
144
 
145
void
146
insertnode(ulong key, ulong offset)
147
{
148
	int i;
149
	Node *n, **l;
150
 
151
	l = llookup(key);
152
	if(*l == nil){
153
		if(l==&hash[key&(nhash-1)])
154
			nnew++;
155
		*l = allocnode();
156
		(*l)->key = key;
157
	}
158
	n = *l;
159
 
160
	/* add or replace last */
161
	for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
162
		;
163
	n->offset[i] = offset;
164
}
165
 
166
void
167
Bputint(Biobufhdr *b, int n)
168
{
169
	uchar p[4];
170
 
171
	p[0] = n>>24;
172
	p[1] = n>>16;
173
	p[2] = n>>8;
174
	p[3] = n;
175
	Bwrite(b, p, 4);
176
}
177
 
178
void
179
flushraw(void)
180
{
181
	if(nraw){
182
		if(verbose)
183
			fprint(2, "Raw %d+%d\n", rawstart, nraw);
184
		zlength += 4+nraw;
185
		Bputint(&bout, (1<<31)|nraw);
186
		memmove(odat+nodat, data+rawstart, nraw);
187
		nodat += nraw;
188
		nraw = 0;
189
	}
190
}
191
 
192
int
193
rawbyte(int i)
194
{
195
	assert(acct == i);
196
	if(nraw == 0)
197
		rawstart = i;
198
	acct++;
199
	nraw++;
200
	return 1;
201
}
202
 
203
int
204
refblock(int i, int len, int off)
205
{
206
	assert(acct == i);
207
	acct += len;
208
	if(nraw)
209
		flushraw();
210
	if(verbose)
211
		fprint(2, "Copy %d+%d from %d\n", i, len, off);
212
	Bputint(&bout, len);
213
	Bputint(&bout, off);
214
	zlength += 4+4;
215
	return len;
216
}
217
 
218
int
219
cmprun(uchar *a, uchar *b, int len)
220
{
221
	int i;
222
 
223
	if(a==b)
224
		return 0;
225
	for(i=0; i<len && a[i]==b[i]; i++)
226
		;
227
	return i;
228
}
229
 
230
int
231
countrle(uchar *a)
232
{
233
	uchar a0;
234
	uchar *p;
235
 
236
	p = a;
237
	a0 = *p;
238
	while(*++p == a0)
239
		;
240
	return p - a;
241
}
242
 
243
void
244
compress(void)
245
{
246
	int best, i, j, o, rle, run, maxrun, maxoff;
247
	ulong sum;
248
	Node *n;
249
 
250
	sum = 0;
251
	for(i=0; i<win && i<length; i++)
252
		sum = (sum*256+data[i])%Prime;
253
	for(i=0; i<length-win; ){
254
		maxrun = 0;
255
		maxoff = 0;
256
		if(verbose)
257
			fprint(2, "look %.6lux\n", sum);
258
		n = lookup(sum);
259
		if(n){
260
			best = -1;
261
			for(o=0; o<NOFF; o++){
262
				if(n->offset[o] == -1)
263
					break;
264
				run = cmprun(data+i, data+n->offset[o], length-i);
265
				if(run > maxrun && n->offset[o]+mindist < i){
266
					maxrun = run;
267
					maxoff = n->offset[o];
268
					best = o;
269
				}
270
			}
271
			if(best > 0){
272
				o = n->offset[best];
273
				for(j=best; j>0; j--)
274
					n->offset[j] = n->offset[j-1];
275
				n->offset[0] = o;
276
			}
277
		}
278
 
279
		if(maxrun >= minrun)
280
			j = i+refblock(i, maxrun, maxoff);
281
		else
282
			j = i+rawbyte(i);
283
		for(; i<j; i++){
284
			/* avoid huge chains from large runs of same byte */
285
			rle = countrle(data+i);
286
			if(rle<4)
287
				insertnode(sum, i);
288
			else if(rle>maxrle[data[i]]){
289
				maxrle[data[i]] = rle;
290
				insertnode(sum, i);
291
			}
292
			sum = (sum*256+data[i+win]) % Prime;
293
			sum = (sum + data[i]*outn) % Prime;
294
		}
295
	}
296
	/* could do better here */
297
	for(; i<length; i++)
298
		rawbyte(i);
299
	flushraw();
300
}
301
 
302
void
303
usage(void)
304
{
305
	fprint(2, "usage: bflz [-n winsize] [file]\n");
306
	exits("usage");
307
}
308
 
309
void
310
main(int argc, char **argv)
311
{
312
	int fd, i, n;
313
	char buf[10485760];
314
 
315
	ARGBEGIN{
316
	case 'd':
317
		verbose = 1;
318
		break;
319
	case 's':
320
		replacesame = atoi(EARGF(usage()));
321
		break;
322
	case 'm':
323
		mindist = atoi(EARGF(usage()));
324
		break;
325
	case 'n':
326
		win = atoi(EARGF(usage()));
327
		minrun = win;
328
		break;
329
	default:
330
		usage();
331
	}ARGEND
332
 
333
	switch(argc){
334
	default:
335
		usage();
336
	case 0:
337
		fd = 0;
338
		break;
339
	case 1:
340
		if((fd = open(argv[0], OREAD)) < 0)
341
			sysfatal("open %s: %r", argv[0]);
342
		break;
343
	}
344
 
345
	while((n = readn(fd, buf, sizeof buf)) > 0){
346
		data = realloc(data, length+n);
347
		if(data == nil)
348
			sysfatal("realloc: %r");
349
		memmove(data+length, buf, n);
350
		length += n;
351
		if(n < sizeof buf)
352
			break;
353
	}
354
	odat = malloc(length);
355
	if(odat == nil)
356
		sysfatal("malloc: %r");
357
 
358
	Binit(&bout, 1, OWRITE);
359
	Bprint(&bout, "BLZ\n");
360
	Bputint(&bout, length);
361
	outn = 1;
362
	for(i=0; i<win; i++)
363
		outn = (outn * 256) % Prime;
364
 
365
	if(verbose)
366
		fprint(2, "256^%d = %.6lux\n", win, outn);
367
	outn = Prime - outn;
368
	if(verbose)
369
		fprint(2, "outn = %.6lux\n", outn);
370
 
371
	compress();
372
	Bwrite(&bout, odat, nodat);
373
	Bterm(&bout);
374
	fprint(2, "brk %p\n", sbrk(1));
375
	fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
376
	exits(nil);	
377
}