Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <bio.h>
4
#include <regexp.h>
5
#include <ctype.h>
6
#include "dict.h"
7
 
8
/*
9
 * Assumed index file structure: lines of form
10
 * 	[^\t]+\t[0-9]+
11
 * First field is key, second is byte offset into dictionary.
12
 * Should be sorted with args -u -t'	' +0f -1 +0 -1 +1n -2
13
 */
14
typedef struct Addr Addr;
15
 
16
struct Addr {
17
	int	n;		/* number of offsets */
18
	int	cur;		/* current position within doff array */
19
	int	maxn;		/* actual current size of doff array */
20
	ulong	doff[1];	/* doff[maxn], with 0..n-1 significant */
21
};
22
 
23
Biobuf	binbuf;
24
Biobuf	boutbuf;
25
Biobuf	*bin = &binbuf;		/* user cmd input */
26
Biobuf	*bout = &boutbuf;	/* output */
27
Biobuf	*bdict;			/* dictionary */
28
Biobuf	*bindex;		/* index file */
29
long	indextop;		/* index offset at end of file */
30
int	lastcmd;		/* last executed command */
31
Addr	*dot;			/* "current" address */
32
Dict	*dict;			/* current dictionary */
33
int	linelen;
34
int	breaklen = 60;
35
int	outinhibit;
36
int	debug;
37
 
38
void	execcmd(int);
39
int	getpref(char*, Rune*);
40
Entry	getentry(int);
41
int	getfield(Rune*);
42
long	locate(Rune*);
43
int	parseaddr(char*, char**);
44
int	parsecmd(char*);
45
int	search(char*, int);
46
long	seeknextline(Biobuf*, long);
47
void	setdotnext(void);
48
void	setdotprev(void);
49
void	sortaddr(Addr*);
50
void	usage(void);
51
 
52
enum {
53
	Plen=300,	/* max length of a search pattern */
54
	Fieldlen=200,	/* max length of an index field */
55
	Aslots=10,	/* initial number of slots in an address */
56
};
57
 
58
void
59
main(int argc, char **argv)
60
{
61
	int i, cmd, kflag;
62
	char *line, *p;
63
 
64
	Binit(&binbuf, 0, OREAD);
65
	Binit(&boutbuf, 1, OWRITE);
66
	kflag = 0;
67
	line = 0;
68
	dict = 0;
69
	for(i=0; dicts[i].name; i++){
70
		if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
71
			dict = &dicts[i];
72
			break;
73
		}
74
	}
75
	ARGBEGIN {
76
		case 'd':
77
			p = ARGF();
78
			dict = 0;
79
			if(p) {
80
				for(i=0; dicts[i].name; i++)
81
					if(strcmp(p, dicts[i].name)==0) {
82
						dict = &dicts[i];
83
						break;
84
					}
85
			}
86
			if(!dict)
87
				usage();
88
			break;
89
		case 'c':
90
			line = ARGF();
91
			if(!line)
92
				usage();
93
			break;
94
		case 'k':
95
			kflag++;
96
			break;
97
		case 'D':
98
			debug++;
99
			break;
100
		default:
101
			usage();
102
	ARGEND }
103
	if(dict == 0){
104
		err("no dictionaries present on this system");
105
		exits("nodict");
106
	}
107
 
108
	if(kflag) {
109
		(*dict->printkey)();
110
		exits(0);
111
	}
112
	if(argc > 1)
113
		usage();
114
	else if(argc == 1) {
115
		if(line)
116
			usage();
117
		p = argv[0];
118
		line = malloc(strlen(p)+5);
119
		sprint(line, "/%s/P\n", p);
120
	}
121
	bdict = Bopen(dict->path, OREAD);
122
	if(!bdict) {
123
		err("can't open dictionary %s", dict->path);
124
		exits("nodict");
125
	}
126
	bindex = Bopen(dict->indexpath, OREAD);
127
	if(!bindex) {
128
		err("can't open index %s", dict->indexpath);
129
		exits("noindex");
130
	}
131
	indextop = Bseek(bindex, 0L, 2);
132
 
133
	dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
134
	dot->n = 0;
135
	dot->cur = 0;
136
	dot->maxn = Aslots;
137
	lastcmd = 0;
138
 
139
	if(line) {
140
		cmd = parsecmd(line);
141
		if(cmd)
142
			execcmd(cmd);
143
	} else {
144
		for(;;) {
145
			Bprint(bout, "*");
146
			Bflush(bout);
147
			line = Brdline(bin, '\n');
148
			linelen = 0;
149
			if(!line)
150
				break;
151
			cmd = parsecmd(line);
152
			if(cmd) {
153
				execcmd(cmd);
154
				lastcmd = cmd;
155
			}
156
		}
157
	}
158
	exits(0);
159
}
160
 
161
void
162
usage(void)
163
{
164
	int i;
165
	char *a, *b;
166
 
167
	Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
168
	Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
169
	for(i = 0; dicts[i].name; i++){
170
		a = b = "";
171
		if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
172
			a = "[";
173
			b = "]";
174
		}
175
		Bprint(bout, "   %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
176
	}
177
	exits("usage");
178
}
179
 
180
int
181
parsecmd(char *line)
182
{
183
	char *e;
184
	int cmd, ans;
185
 
186
	if(parseaddr(line, &e) >= 0)
187
		line = e;
188
	else
189
		return 0;
190
	cmd = *line;
191
	ans = cmd;
192
	if(isupper(cmd))
193
		cmd = tolower(cmd);
194
	if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
195
	     cmd == '\n')) {
196
		err("unknown command %c", cmd);
197
		return 0;
198
	}
199
	if(cmd == '\n')
200
		switch(lastcmd) {
201
			case 0:	ans = 'H'; break;
202
			case 'H':	ans = 'p'; break;
203
			default :	ans = lastcmd; break;
204
		}
205
	else if(line[1] != '\n' && line[1] != 0)
206
		err("extra stuff after command %c ignored", cmd);
207
	return ans;
208
}
209
 
210
void
211
execcmd(int cmd)
212
{
213
	Entry e;
214
	int cur, doall;
215
 
216
	if(isupper(cmd)) {
217
		doall = 1;
218
		cmd = tolower(cmd);
219
		cur = 0;
220
	} else {
221
		doall = 0;
222
		cur = dot->cur;
223
	}
224
 
225
	if(debug && doall && cmd == 'a')
226
		Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
227
	for(;;){
228
		if(cur >= dot->n)
229
			break;
230
		if(doall) {
231
			Bprint(bout, "%d\t", cur+1);
232
			linelen += 4 + (cur >= 10);
233
		}
234
		switch(cmd) {
235
		case 'a':
236
			Bprint(bout, "#%lud\n", dot->doff[cur]);
237
			break;
238
		case 'h':
239
		case 'p':
240
		case 'r':
241
			e = getentry(cur);
242
			(*dict->printentry)(e, cmd);
243
			break;
244
		}
245
		cur++;
246
		if(doall) {
247
			if(cmd == 'p' || cmd == 'r') {
248
				Bputc(bout, '\n');
249
				linelen = 0;
250
			}
251
		} else
252
			break;
253
	}
254
	if(cur >= dot->n)
255
		cur = 0;
256
	dot->cur = cur;
257
}
258
 
259
/*
260
 * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
261
 * Answer goes in dot.
262
 * Return -1 if address starts, but get error.
263
 * Return 0 if no address.
264
 */
265
int
266
parseaddr(char *line, char **eptr)
267
{
268
	int delim, plen;
269
	ulong v;
270
	char *e;
271
	char pat[Plen];
272
 
273
	if(*line == '/' || *line == '!') {
274
		/* anchored regular expression match; '!' means no folding */
275
		if(*line == '/') {
276
			delim = '/';
277
			e = strpbrk(line+1, "/\n");
278
		} else {
279
			delim = '!';
280
			e = strpbrk(line+1, "!\n");
281
		}
282
		plen = e-line-1;
283
		if(plen >= Plen-3) {
284
			err("pattern too big");
285
			return -1;
286
		}
287
		pat[0] = '^';
288
		memcpy(pat+1, line+1, plen);
289
		pat[plen+1] = '$';
290
		pat[plen+2] = 0;
291
		if(*e == '\n')
292
			line = e;
293
		else
294
			line = e+1;
295
		if(!search(pat, delim == '/')) {
296
			err("pattern not found");
297
			return -1;
298
		}
299
	} else if(*line == '#') {
300
		/* absolute byte offset into dictionary */
301
		line++;
302
		if(!isdigit(*line))
303
			return -1;
304
		v = strtoul(line, &e, 10);
305
		line = e;
306
		dot->doff[0] = v;
307
		dot->n = 1;
308
		dot->cur = 0;
309
	} else if(isdigit(*line)) {
310
		v = strtoul(line, &e, 10);
311
		line = e;
312
		if(v < 1 || v > dot->n)
313
			err(".%d not in range [1,%d], ignored",
314
				v, dot->n);
315
		else
316
			dot->cur = v-1;
317
	} else if(*line == '.') {
318
		line++;
319
	} else {
320
		*eptr = line;
321
		return 0;
322
	}
323
	while(*line == '+' || *line == '-') {
324
		if(*line == '+')
325
			setdotnext();
326
		else
327
			setdotprev();
328
		line++;
329
	}
330
	*eptr = line;
331
	return 1;
332
}
333
 
334
/*
335
 * Index file is sorted by folded field1.
336
 * Method: find pre, a folded prefix of r.e. pat,
337
 * and then low = offset to beginning of
338
 * line in index file where first match of prefix occurs.
339
 * Then go through index until prefix no longer matches,
340
 * adding each line that matches real pattern to dot.
341
 * Finally, sort dot offsets (uniquing).
342
 * We know pat len < Plen, and that it is surrounded by ^..$
343
 */
344
int
345
search(char *pat, int dofold)
346
{
347
	int needre, prelen, match, n;
348
	Reprog *re;
349
	long ioff, v;
350
	Rune pre[Plen];
351
	Rune lit[Plen];
352
	Rune entry[Fieldlen];
353
	char fpat[Plen];
354
 
355
	prelen = getpref(pat+1, pre);
356
	if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
357
		runescpy(lit, pre);
358
		if(dofold)
359
			fold(lit);
360
		needre = 0;
361
		SET(re);
362
	} else {
363
		needre = 1;
364
		if(dofold) {
365
			foldre(fpat, pat);
366
			re = regcomp(fpat);
367
		} else
368
			re = regcomp(pat);
369
	}
370
	fold(pre);
371
	ioff = locate(pre);
372
	if(ioff < 0)
373
		return 0;
374
	dot->n = 0;
375
	Bseek(bindex, ioff, 0);
376
	for(;;) {
377
		if(!getfield(entry))
378
			break;
379
		if(dofold)
380
			fold(entry);
381
		if(needre)
382
			match = rregexec(re, entry, 0, 0);
383
		else
384
			match = (acomp(lit, entry) == 0);
385
		if(match) {
386
			if(!getfield(entry))
387
				break;
388
			v = runetol(entry);
389
			if(dot->n >= dot->maxn) {
390
				n = 2*dot->maxn;
391
				dot = realloc(dot,
392
					sizeof(Addr)+(n-1)*sizeof(long));
393
				if(!dot) {
394
					err("out of memory");
395
					exits("nomem");
396
				}
397
				dot->maxn = n;
398
			}
399
			dot->doff[dot->n++] = v;
400
		} else {
401
			if(!dofold)
402
				fold(entry);
403
			if(*pre) {
404
				n = acomp(pre, entry);
405
				if(n < -1 || (!needre && n < 0))
406
					break;
407
			}
408
			/* get to next index entry */
409
			if(!getfield(entry))
410
				break;
411
		}
412
	}
413
	sortaddr(dot);
414
	dot->cur = 0;
415
	return dot->n;
416
}
417
 
418
/*
419
 * Return offset in index file of first line whose folded
420
 * first field has pre as a prefix.  -1 if none found.
421
 */
422
long
423
locate(Rune *pre)
424
{
425
	long top, bot, mid;
426
	Rune entry[Fieldlen];
427
 
428
	if(*pre == 0)
429
		return 0;
430
	bot = 0;
431
	top = indextop;
432
	if(debug>1)
433
		fprint(2, "locate looking for prefix %S\n", pre);
434
	for(;;) {
435
		/*
436
		 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
437
		 * and bot < top, and bot,top point at beginning of lines
438
		 */
439
		mid = (top+bot) / 2;
440
		mid = seeknextline(bindex, mid);
441
		if(debug > 1)
442
			fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
443
				bot, (top+bot) / 2, mid, top);
444
		if(mid == top || !getfield(entry))
445
			break;
446
		if(debug > 1)
447
			fprint(2, "key=%S\n", entry);
448
		/*
449
		 * here mid is strictly between bot and top
450
		 */
451
		fold(entry);
452
		if(acomp(pre, entry) <= 0)
453
			top = mid;
454
		else
455
			bot = mid;
456
	}
457
	/*
458
	 * bot < top, but they don't necessarily point at successive lines
459
	 * Use linear search from bot to find first line that pre is a
460
	 * prefix of
461
	 */
462
	while((bot = seeknextline(bindex, bot)) <= top) {
463
		if(!getfield(entry))
464
			return -1;
465
		if(debug > 1)
466
			fprint(2, "key=%S\n", entry);
467
		fold(entry);
468
		switch(acomp(pre, entry)) {
469
		case -2:
470
			return -1;
471
		case -1:
472
		case 0:
473
			return bot;
474
		case 1:
475
		case 2:
476
			continue;
477
		}
478
	}
479
	return -1;
480
 
481
}
482
 
483
/*
484
 * Get prefix of non re-metacharacters, runified, into pre,
485
 * and return length
486
 */
487
int
488
getpref(char *pat, Rune *pre)
489
{
490
	int n, r;
491
	char *p;
492
 
493
	p = pat;
494
	while(*p) {
495
		n = chartorune(pre, p);
496
		r = *pre;
497
		switch(r) {
498
		case L'.': case L'*': case L'+': case L'?':
499
		case L'[': case L']': case L'(': case ')':
500
		case L'|': case L'^': case L'$':
501
			*pre = 0;
502
			return p-pat;
503
		case L'\\':
504
			p += n;
505
			p += chartorune(++pre, p);
506
			pre++;
507
			break;
508
		default:
509
			p += n;
510
			pre++;
511
		}
512
	}
513
	return p-pat;
514
}
515
 
516
long
517
seeknextline(Biobuf *b, long off)
518
{
519
	long c;
520
 
521
	Bseek(b, off, 0);
522
	do {
523
		c = Bgetrune(b);
524
	} while(c>=0 && c!='\n');
525
	return Boffset(b);
526
}
527
 
528
/*
529
 * Get next field out of index file (either tab- or nl- terminated)
530
 * Answer in *rp, assumed to be Fieldlen long.
531
 * Return 0 if read error first.
532
 */
533
int
534
getfield(Rune *rp)
535
{
536
	long c;
537
	int n;
538
 
539
	for(n=Fieldlen; n-- > 0; ) {
540
		if ((c = Bgetrune(bindex)) < 0)
541
			return 0;
542
		if(c == '\t' || c == '\n') {
543
			*rp = L'\0';
544
			return 1;
545
		}
546
		*rp++ = c;
547
	}
548
	err("word too long");
549
	return 0;
550
}
551
 
552
/*
553
 * A compare longs function suitable for qsort
554
 */
555
static int
556
longcmp(void *av, void *bv)
557
{
558
	long v;
559
	long *a, *b;
560
 
561
	a = av;
562
	b = bv;
563
 
564
	v = *a - *b;
565
	if(v < 0)
566
		return -1;
567
	else if(v == 0)
568
		return 0;
569
	else
570
		return 1;
571
}
572
 
573
void
574
sortaddr(Addr *a)
575
{
576
	int i, j;
577
	long v;
578
 
579
	if(a->n <= 1)
580
		return;
581
 
582
	qsort(a->doff, a->n, sizeof(long), longcmp);
583
 
584
	/* remove duplicates */
585
	for(i=0, j=0; j < a->n; j++) {
586
		v = a->doff[j];
587
		if(i > 0 && v == a->doff[i-1])
588
			continue;
589
		a->doff[i++] = v;
590
	}
591
	a->n = i;
592
}
593
 
594
Entry
595
getentry(int i)
596
{
597
	long b, e, n;
598
	static Entry ans;
599
	static int anslen = 0;
600
 
601
	b = dot->doff[i];
602
	e = (*dict->nextoff)(b+1);
603
	ans.doff = b;
604
	if(e < 0) {
605
		err("couldn't seek to entry");
606
		ans.start = 0;
607
		ans.end = 0;
608
	} else {
609
		n = e-b;
610
		if(n+1 > anslen) {
611
			ans.start = realloc(ans.start, n+1);
612
			if(!ans.start) {
613
				err("out of memory");
614
				exits("nomem");
615
			}
616
			anslen = n+1;
617
		}
618
		Bseek(bdict, b, 0);
619
		n = Bread(bdict, ans.start, n);
620
		ans.end = ans.start + n;
621
		*ans.end = 0;
622
	}
623
	return ans;
624
}
625
 
626
void
627
setdotnext(void)
628
{
629
	long b;
630
 
631
	b = (*dict->nextoff)(dot->doff[dot->cur]+1);
632
	if(b < 0) {
633
		err("couldn't find a next entry");
634
		return;
635
	}
636
	dot->doff[0] = b;
637
	dot->n = 1;
638
	dot->cur = 0;
639
}
640
 
641
void
642
setdotprev(void)
643
{
644
	int tryback;
645
	long here, last, p;
646
 
647
	if(dot->cur < 0 || dot->cur >= dot->n)
648
		return;
649
	tryback = 2000;
650
	here = dot->doff[dot->cur];
651
	last = 0;
652
	while(last == 0) {
653
		p = here - tryback;
654
		if(p < 0)
655
			p = 0;
656
		for(;;) {
657
			p = (*dict->nextoff)(p+1);
658
			if(p < 0)
659
				return; /* shouldn't happen */
660
			if(p >= here)
661
				break;
662
			last = p;
663
		}
664
		if(!last) {
665
			if(here - tryback < 0) {
666
				err("can't find a previous entry");
667
				return;
668
			}
669
			tryback = 2*tryback;
670
		}
671
	}
672
	dot->doff[0] = last;
673
	dot->n = 1;
674
	dot->cur = 0;
675
}