Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include "regexp.h"
4
#include "regcomp.h"
5
 
6
#define	TRUE	1
7
#define	FALSE	0
8
 
9
/*
10
 * Parser Information
11
 */
12
typedef
13
struct Node
14
{
15
	Reinst*	first;
16
	Reinst*	last;
17
}Node;
18
 
19
/* max character classes per program is nelem(reprog->class) */
20
static Reprog	*reprog;
21
 
22
/* max rune ranges per character class is nelem(classp->spans)/2 */
23
#define NCCRUNE	nelem(classp->spans)
24
 
25
#define	NSTACK	20
26
static	Node	andstack[NSTACK];
27
static	Node	*andp;
28
static	int	atorstack[NSTACK];
29
static	int*	atorp;
30
static	int	cursubid;		/* id of current subexpression */
31
static	int	subidstack[NSTACK];	/* parallel to atorstack */
32
static	int*	subidp;
33
static	int	lastwasand;	/* Last token was operand */
34
static	int	nbra;
35
static	char*	exprp;		/* pointer to next character in source expression */
36
static	int	lexdone;
37
static	int	nclass;
38
static	Reclass*classp;
39
static	Reinst*	freep;
40
static	int	errors;
41
static	Rune	yyrune;		/* last lex'd rune */
42
static	Reclass*yyclassp;	/* last lex'd class */
43
 
44
/* predeclared crap */
45
static	void	operator(int);
46
static	void	pushand(Reinst*, Reinst*);
47
static	void	pushator(int);
48
static	void	evaluntil(int);
49
static	int	bldcclass(void);
50
 
51
static jmp_buf regkaboom;
52
 
53
static	void
54
rcerror(char *s)
55
{
56
	errors++;
57
	regerror(s);
58
	longjmp(regkaboom, 1);
59
}
60
 
61
static	Reinst*
62
newinst(int t)
63
{
64
	freep->type = t;
65
	freep->left = 0;
66
	freep->right = 0;
67
	return freep++;
68
}
69
 
70
static	void
71
operand(int t)
72
{
73
	Reinst *i;
74
 
75
	if(lastwasand)
76
		operator(CAT);	/* catenate is implicit */
77
	i = newinst(t);
78
 
79
	if(t == CCLASS || t == NCCLASS)
80
		i->cp = yyclassp;
81
	if(t == RUNE)
82
		i->r = yyrune;
83
 
84
	pushand(i, i);
85
	lastwasand = TRUE;
86
}
87
 
88
static	void
89
operator(int t)
90
{
91
	if(t==RBRA && --nbra<0)
92
		rcerror("unmatched right paren");
93
	if(t==LBRA){
94
		if(++cursubid >= NSUBEXP)
95
			rcerror ("too many subexpressions");
96
		nbra++;
97
		if(lastwasand)
98
			operator(CAT);
99
	} else
100
		evaluntil(t);
101
	if(t != RBRA)
102
		pushator(t);
103
	lastwasand = FALSE;
104
	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
105
		lastwasand = TRUE;	/* these look like operands */
106
}
107
 
108
static	void
109
regerr2(char *s, int c)
110
{
111
	char buf[100];
112
	char *cp = buf;
113
	while(*s)
114
		*cp++ = *s++;
115
	*cp++ = c;
116
	*cp = '\0'; 
117
	rcerror(buf);
118
}
119
 
120
static	void
121
cant(char *s)
122
{
123
	char buf[100];
124
	strcpy(buf, "can't happen: ");
125
	strcat(buf, s);
126
	rcerror(buf);
127
}
128
 
129
static	void
130
pushand(Reinst *f, Reinst *l)
131
{
132
	if(andp >= &andstack[NSTACK])
133
		cant("operand stack overflow");
134
	andp->first = f;
135
	andp->last = l;
136
	andp++;
137
}
138
 
139
static	void
140
pushator(int t)
141
{
142
	if(atorp >= &atorstack[NSTACK])
143
		cant("operator stack overflow");
144
	*atorp++ = t;
145
	*subidp++ = cursubid;
146
}
147
 
148
static	Node*
149
popand(int op)
150
{
151
	Reinst *inst;
152
 
153
	if(andp <= &andstack[0]){
154
		regerr2("missing operand for ", op);
155
		inst = newinst(NOP);
156
		pushand(inst,inst);
157
	}
158
	return --andp;
159
}
160
 
161
static	int
162
popator(void)
163
{
164
	if(atorp <= &atorstack[0])
165
		cant("operator stack underflow");
166
	--subidp;
167
	return *--atorp;
168
}
169
 
170
static	void
171
evaluntil(int pri)
172
{
173
	Node *op1, *op2;
174
	Reinst *inst1, *inst2;
175
 
176
	while(pri==RBRA || atorp[-1]>=pri){
177
		switch(popator()){
178
		default:
179
			rcerror("unknown operator in evaluntil");
180
			break;
181
		case LBRA:		/* must have been RBRA */
182
			op1 = popand('(');
183
			inst2 = newinst(RBRA);
184
			inst2->subid = *subidp;
185
			op1->last->next = inst2;
186
			inst1 = newinst(LBRA);
187
			inst1->subid = *subidp;
188
			inst1->next = op1->first;
189
			pushand(inst1, inst2);
190
			return;
191
		case OR:
192
			op2 = popand('|');
193
			op1 = popand('|');
194
			inst2 = newinst(NOP);
195
			op2->last->next = inst2;
196
			op1->last->next = inst2;
197
			inst1 = newinst(OR);
198
			inst1->right = op1->first;
199
			inst1->left = op2->first;
200
			pushand(inst1, inst2);
201
			break;
202
		case CAT:
203
			op2 = popand(0);
204
			op1 = popand(0);
205
			op1->last->next = op2->first;
206
			pushand(op1->first, op2->last);
207
			break;
208
		case STAR:
209
			op2 = popand('*');
210
			inst1 = newinst(OR);
211
			op2->last->next = inst1;
212
			inst1->right = op2->first;
213
			pushand(inst1, inst1);
214
			break;
215
		case PLUS:
216
			op2 = popand('+');
217
			inst1 = newinst(OR);
218
			op2->last->next = inst1;
219
			inst1->right = op2->first;
220
			pushand(op2->first, inst1);
221
			break;
222
		case QUEST:
223
			op2 = popand('?');
224
			inst1 = newinst(OR);
225
			inst2 = newinst(NOP);
226
			inst1->left = inst2;
227
			inst1->right = op2->first;
228
			op2->last->next = inst2;
229
			pushand(inst1, inst2);
230
			break;
231
		}
232
	}
233
}
234
 
235
static	Reprog*
236
optimize(Reprog *pp)
237
{
238
	Reinst *inst, *target;
239
	int size;
240
	Reprog *npp;
241
	Reclass *cl;
242
	int diff;
243
 
244
	/*
245
	 *  get rid of NOOP chains
246
	 */
247
	for(inst=pp->firstinst; inst->type!=END; inst++){
248
		target = inst->next;
249
		while(target->type == NOP)
250
			target = target->next;
251
		inst->next = target;
252
	}
253
 
254
	/*
255
	 *  The original allocation is for an area larger than
256
	 *  necessary.  Reallocate to the actual space used
257
	 *  and then relocate the code.
258
	 */
259
	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
260
	npp = realloc(pp, size);
261
	if(npp==0 || npp==pp)
262
		return pp;
263
	diff = (char *)npp - (char *)pp;
264
	freep = (Reinst *)((char *)freep + diff);
265
	for(inst=npp->firstinst; inst<freep; inst++){
266
		switch(inst->type){
267
		case OR:
268
		case STAR:
269
		case PLUS:
270
		case QUEST:
271
			*(char **)&inst->right += diff;
272
			break;
273
		case CCLASS:
274
		case NCCLASS:
275
			*(char **)&inst->right += diff;
276
			cl = inst->cp;
277
			*(char **)&cl->end += diff;
278
			break;
279
		}
280
		*(char **)&inst->left += diff;
281
	}
282
	*(char **)&npp->startinst += diff;
283
	return npp;
284
}
285
 
286
#ifdef	DEBUG
287
static	void
288
dumpstack(void){
289
	Node *stk;
290
	int *ip;
291
 
292
	print("operators\n");
293
	for(ip=atorstack; ip<atorp; ip++)
294
		print("0%o\n", *ip);
295
	print("operands\n");
296
	for(stk=andstack; stk<andp; stk++)
297
		print("0%o\t0%o\n", stk->first->type, stk->last->type);
298
}
299
 
300
static	void
301
dump(Reprog *pp)
302
{
303
	Reinst *l;
304
	Rune *p;
305
 
306
	l = pp->firstinst;
307
	do{
308
		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
309
			l->left-pp->firstinst, l->right-pp->firstinst);
310
		if(l->type == RUNE)
311
			print("\t%C\n", l->r);
312
		else if(l->type == CCLASS || l->type == NCCLASS){
313
			print("\t[");
314
			if(l->type == NCCLASS)
315
				print("^");
316
			for(p = l->cp->spans; p < l->cp->end; p += 2)
317
				if(p[0] == p[1])
318
					print("%C", p[0]);
319
				else
320
					print("%C-%C", p[0], p[1]);
321
			print("]\n");
322
		} else
323
			print("\n");
324
	}while(l++->type);
325
}
326
#endif
327
 
328
static	Reclass*
329
newclass(void)
330
{
331
	if(nclass >= nelem(reprog->class))
332
		rcerror("too many character classes; increase Reprog.class size");
333
	return &(classp[nclass++]);
334
}
335
 
336
static	int
337
nextc(Rune *rp)
338
{
339
	if(lexdone){
340
		*rp = 0;
341
		return 1;
342
	}
343
	exprp += chartorune(rp, exprp);
344
	if(*rp == L'\\'){
345
		exprp += chartorune(rp, exprp);
346
		return 1;
347
	}
348
	if(*rp == 0)
349
		lexdone = 1;
350
	return 0;
351
}
352
 
353
static	int
354
lex(int literal, int dot_type)
355
{
356
	int quoted;
357
 
358
	quoted = nextc(&yyrune);
359
	if(literal || quoted){
360
		if(yyrune == 0)
361
			return END;
362
		return RUNE;
363
	}
364
 
365
	switch(yyrune){
366
	case 0:
367
		return END;
368
	case L'*':
369
		return STAR;
370
	case L'?':
371
		return QUEST;
372
	case L'+':
373
		return PLUS;
374
	case L'|':
375
		return OR;
376
	case L'.':
377
		return dot_type;
378
	case L'(':
379
		return LBRA;
380
	case L')':
381
		return RBRA;
382
	case L'^':
383
		return BOL;
384
	case L'$':
385
		return EOL;
386
	case L'[':
387
		return bldcclass();
388
	}
389
	return RUNE;
390
}
391
 
392
static int
393
bldcclass(void)
394
{
395
	int type;
396
	Rune r[NCCRUNE];
397
	Rune *p, *ep, *np;
398
	Rune rune;
399
	int quoted;
400
 
401
	/* we have already seen the '[' */
402
	type = CCLASS;
403
	yyclassp = newclass();
404
 
405
	/* look ahead for negation */
406
	/* SPECIAL CASE!!! negated classes don't match \n */
407
	ep = r;
408
	quoted = nextc(&rune);
409
	if(!quoted && rune == L'^'){
410
		type = NCCLASS;
411
		quoted = nextc(&rune);
412
		*ep++ = L'\n';
413
		*ep++ = L'\n';
414
	}
415
 
416
	/* parse class into a set of spans */
417
	while(ep < &r[NCCRUNE-1]){
418
		if(rune == 0){
419
			rcerror("malformed '[]'");
420
			return 0;
421
		}
422
		if(!quoted && rune == L']')
423
			break;
424
		if(!quoted && rune == L'-'){
425
			if(ep == r){
426
				rcerror("malformed '[]'");
427
				return 0;
428
			}
429
			quoted = nextc(&rune);
430
			if((!quoted && rune == L']') || rune == 0){
431
				rcerror("malformed '[]'");
432
				return 0;
433
			}
434
			*(ep-1) = rune;
435
		} else {
436
			*ep++ = rune;
437
			*ep++ = rune;
438
		}
439
		quoted = nextc(&rune);
440
	}
441
	if(ep >= &r[NCCRUNE-1]) {
442
		rcerror("char class too large; increase Reclass.spans size");
443
		return 0;
444
	}
445
 
446
	/* sort on span start */
447
	for(p = r; p < ep; p += 2){
448
		for(np = p; np < ep; np += 2)
449
			if(*np < *p){
450
				rune = np[0];
451
				np[0] = p[0];
452
				p[0] = rune;
453
				rune = np[1];
454
				np[1] = p[1];
455
				p[1] = rune;
456
			}
457
	}
458
 
459
	/* merge spans */
460
	np = yyclassp->spans;
461
	p = r;
462
	if(r == ep)
463
		yyclassp->end = np;
464
	else {
465
		np[0] = *p++;
466
		np[1] = *p++;
467
		for(; p < ep; p += 2)
468
			/* overlapping or adjacent ranges? */
469
			if(p[0] <= np[1] + 1){
470
				if(p[1] >= np[1])
471
					np[1] = p[1];	/* coalesce */
472
			} else {
473
				np += 2;
474
				np[0] = p[0];
475
				np[1] = p[1];
476
			}
477
		yyclassp->end = np+2;
478
	}
479
 
480
	return type;
481
}
482
 
483
static	Reprog*
484
regcomp1(char *s, int literal, int dot_type)
485
{
486
	int token;
487
	Reprog *pp;
488
 
489
	/* get memory for the program */
490
	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
491
	if(pp == 0){
492
		regerror("out of memory");
493
		return 0;
494
	}
495
	freep = pp->firstinst;
496
	classp = pp->class;
497
	errors = 0;
498
 
499
	if(setjmp(regkaboom))
500
		goto out;
501
 
502
	/* go compile the sucker */
503
	lexdone = 0;
504
	exprp = s;
505
	nclass = 0;
506
	nbra = 0;
507
	atorp = atorstack;
508
	andp = andstack;
509
	subidp = subidstack;
510
	lastwasand = FALSE;
511
	cursubid = 0;
512
 
513
	/* Start with a low priority operator to prime parser */
514
	pushator(START-1);
515
	while((token = lex(literal, dot_type)) != END){
516
		if((token&0300) == OPERATOR)
517
			operator(token);
518
		else
519
			operand(token);
520
	}
521
 
522
	/* Close with a low priority operator */
523
	evaluntil(START);
524
 
525
	/* Force END */
526
	operand(END);
527
	evaluntil(START);
528
#ifdef DEBUG
529
	dumpstack();
530
#endif
531
	if(nbra)
532
		rcerror("unmatched left paren");
533
	--andp;	/* points to first and only operand */
534
	pp->startinst = andp->first;
535
#ifdef DEBUG
536
	dump(pp);
537
#endif
538
	pp = optimize(pp);
539
#ifdef DEBUG
540
	print("start: %d\n", andp->first-pp->firstinst);
541
	dump(pp);
542
#endif
543
out:
544
	if(errors){
545
		free(pp);
546
		pp = 0;
547
	}
548
	return pp;
549
}
550
 
551
extern	Reprog*
552
regcomp(char *s)
553
{
554
	return regcomp1(s, 0, ANY);
555
}
556
 
557
extern	Reprog*
558
regcomplit(char *s)
559
{
560
	return regcomp1(s, 1, ANY);
561
}
562
 
563
extern	Reprog*
564
regcompnl(char *s)
565
{
566
	return regcomp1(s, 0, ANYNL);
567
}