Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <setjmp.h>
4
#include <string.h>
5
#include "regexp.h"
6
#include "regcomp.h"
7
 
8
#define	TRUE	1
9
#define	FALSE	0
10
 
11
/*
12
 * Parser Information
13
 */
14
typedef
15
struct Node
16
{
17
	Reinst*	first;
18
	Reinst*	last;
19
}Node;
20
 
21
#define	NSTACK	20
22
static	Node	andstack[NSTACK];
23
static	Node	*andp;
24
static	int	atorstack[NSTACK];
25
static	int*	atorp;
26
static	int	cursubid;		/* id of current subexpression */
27
static	int	subidstack[NSTACK];	/* parallel to atorstack */
28
static	int*	subidp;
29
static	int	lastwasand;	/* Last token was operand */
30
static	int	nbra;
31
static	char*	exprp;		/* pointer to next character in source expression */
32
static	int	lexdone;
33
static	int	nclass;
34
static	Reclass*classp;
35
static	Reinst*	freep;
36
static	int	errors;
37
static	wchar_t	yyrune;		/* last lex'd rune */
38
static	Reclass*yyclassp;	/* last lex'd class */
39
 
40
/* predeclared crap */
41
static	void	operator(int);
42
static	void	pushand(Reinst*, Reinst*);
43
static	void	pushator(int);
44
static	void	evaluntil(int);
45
static	int	bldcclass(void);
46
 
47
static jmp_buf regkaboom;
48
 
49
static	void
50
rcerror(char *s)
51
{
52
	errors++;
53
	regerror(s);
54
	longjmp(regkaboom, 1);
55
}
56
 
57
static	Reinst*
58
newinst(int t)
59
{
60
	freep->type = t;
61
	freep->l.left = 0;
62
	freep->r.right = 0;
63
	return freep++;
64
}
65
 
66
static	void
67
operand(int t)
68
{
69
	Reinst *i;
70
 
71
	if(lastwasand)
72
		operator(CAT);	/* catenate is implicit */
73
	i = newinst(t);
74
 
75
	if(t == CCLASS || t == NCCLASS)
76
		i->r.cp = yyclassp;
77
	if(t == RUNE)
78
		i->r.r = yyrune;
79
 
80
	pushand(i, i);
81
	lastwasand = TRUE;
82
}
83
 
84
static	void
85
operator(int t)
86
{
87
	if(t==RBRA && --nbra<0)
88
		rcerror("unmatched right paren");
89
	if(t==LBRA){
90
		if(++cursubid >= NSUBEXP)
91
			rcerror ("too many subexpressions");
92
		nbra++;
93
		if(lastwasand)
94
			operator(CAT);
95
	} else
96
		evaluntil(t);
97
	if(t != RBRA)
98
		pushator(t);
99
	lastwasand = FALSE;
100
	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
101
		lastwasand = TRUE;	/* these look like operands */
102
}
103
 
104
static	void
105
regerr2(char *s, int c)
106
{
107
	char buf[100];
108
	char *cp = buf;
109
	while(*s)
110
		*cp++ = *s++;
111
	*cp++ = c;
112
	*cp = '\0'; 
113
	rcerror(buf);
114
}
115
 
116
static	void
117
cant(char *s)
118
{
119
	char buf[100];
120
	strcpy(buf, "can't happen: ");
121
	strcat(buf, s);
122
	rcerror(buf);
123
}
124
 
125
static	void
126
pushand(Reinst *f, Reinst *l)
127
{
128
	if(andp >= &andstack[NSTACK])
129
		cant("operand stack overflow");
130
	andp->first = f;
131
	andp->last = l;
132
	andp++;
133
}
134
 
135
static	void
136
pushator(int t)
137
{
138
	if(atorp >= &atorstack[NSTACK])
139
		cant("operator stack overflow");
140
	*atorp++ = t;
141
	*subidp++ = cursubid;
142
}
143
 
144
static	Node*
145
popand(int op)
146
{
147
	Reinst *inst;
148
 
149
	if(andp <= &andstack[0]){
150
		regerr2("missing operand for ", op);
151
		inst = newinst(NOP);
152
		pushand(inst,inst);
153
	}
154
	return --andp;
155
}
156
 
157
static	int
158
popator(void)
159
{
160
	if(atorp <= &atorstack[0])
161
		cant("operator stack underflow");
162
	--subidp;
163
	return *--atorp;
164
}
165
 
166
static	void
167
evaluntil(int pri)
168
{
169
	Node *op1, *op2;
170
	Reinst *inst1, *inst2;
171
 
172
	while(pri==RBRA || atorp[-1]>=pri){
173
		switch(popator()){
174
		default:
175
			rcerror("unknown operator in evaluntil");
176
			break;
177
		case LBRA:		/* must have been RBRA */
178
			op1 = popand('(');
179
			inst2 = newinst(RBRA);
180
			inst2->r.subid = *subidp;
181
			op1->last->l.next = inst2;
182
			inst1 = newinst(LBRA);
183
			inst1->r.subid = *subidp;
184
			inst1->l.next = op1->first;
185
			pushand(inst1, inst2);
186
			return;
187
		case OR:
188
			op2 = popand('|');
189
			op1 = popand('|');
190
			inst2 = newinst(NOP);
191
			op2->last->l.next = inst2;
192
			op1->last->l.next = inst2;
193
			inst1 = newinst(OR);
194
			inst1->r.right = op1->first;
195
			inst1->l.left = op2->first;
196
			pushand(inst1, inst2);
197
			break;
198
		case CAT:
199
			op2 = popand(0);
200
			op1 = popand(0);
201
			op1->last->l.next = op2->first;
202
			pushand(op1->first, op2->last);
203
			break;
204
		case STAR:
205
			op2 = popand('*');
206
			inst1 = newinst(OR);
207
			op2->last->l.next = inst1;
208
			inst1->r.right = op2->first;
209
			pushand(inst1, inst1);
210
			break;
211
		case PLUS:
212
			op2 = popand('+');
213
			inst1 = newinst(OR);
214
			op2->last->l.next = inst1;
215
			inst1->r.right = op2->first;
216
			pushand(op2->first, inst1);
217
			break;
218
		case QUEST:
219
			op2 = popand('?');
220
			inst1 = newinst(OR);
221
			inst2 = newinst(NOP);
222
			inst1->l.left = inst2;
223
			inst1->r.right = op2->first;
224
			op2->last->l.next = inst2;
225
			pushand(inst1, inst2);
226
			break;
227
		}
228
	}
229
}
230
 
231
static	Reprog*
232
optimize(Reprog *pp)
233
{
234
	Reinst *inst, *target;
235
	int size;
236
	Reprog *npp;
237
	int diff;
238
 
239
	/*
240
	 *  get rid of NOOP chains
241
	 */
242
	for(inst=pp->firstinst; inst->type!=END; inst++){
243
		target = inst->l.next;
244
		while(target->type == NOP)
245
			target = target->l.next;
246
		inst->l.next = target;
247
	}
248
 
249
	/*
250
	 *  The original allocation is for an area larger than
251
	 *  necessary.  Reallocate to the actual space used
252
	 *  and then relocate the code.
253
	 */
254
	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
255
	npp = realloc(pp, size);
256
	if(npp==0 || npp==pp)
257
		return pp;
258
	diff = (char *)npp - (char *)pp;
259
	freep = (Reinst *)((char *)freep + diff);
260
	for(inst=npp->firstinst; inst<freep; inst++){
261
		switch(inst->type){
262
		case OR:
263
		case STAR:
264
		case PLUS:
265
		case QUEST:
266
		case CCLASS:
267
		case NCCLASS:
268
			*(char **)&inst->r.right += diff;
269
			break;
270
		}
271
		*(char **)&inst->l.left += diff;
272
	}
273
	*(char **)&npp->startinst += diff;
274
	return npp;
275
}
276
 
277
#ifdef	DEBUG
278
static	void
279
dumpstack(void){
280
	Node *stk;
281
	int *ip;
282
 
283
	print("operators\n");
284
	for(ip=atorstack; ip<atorp; ip++)
285
		print("0%o\n", *ip);
286
	print("operands\n");
287
	for(stk=andstack; stk<andp; stk++)
288
		print("0%o\t0%o\n", stk->first->type, stk->last->type);
289
}
290
 
291
static	void
292
dump(Reprog *pp)
293
{
294
	Reinst *l;
295
	wchar_t *p;
296
 
297
	l = pp->firstinst;
298
	do{
299
		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
300
			l->l.left-pp->firstinst, l->l.right-pp->firstinst);
301
		if(l->type == RUNE)
302
			print("\t%C\n", l->r);
303
		else if(l->type == CCLASS || l->type == NCCLASS){
304
			print("\t[");
305
			if(l->type == NCCLASS)
306
				print("^");
307
			for(p = l->r.cp->spans; p < l->r.cp->end; p += 2)
308
				if(p[0] == p[1])
309
					print("%C", p[0]);
310
				else
311
					print("%C-%C", p[0], p[1]);
312
			print("]\n");
313
		} else
314
			print("\n");
315
	}while(l++->type);
316
}
317
#endif
318
 
319
static	Reclass*
320
newclass(void)
321
{
322
	if(nclass >= NCLASS)
323
		regerr2("too many character classes; limit", NCLASS+'0');
324
	return &(classp[nclass++]);
325
}
326
 
327
static	int
328
nextc(wchar_t *rp)
329
{
330
	int n;
331
 
332
	if(lexdone){
333
		*rp = 0;
334
		return 1;
335
	}
336
	n = mbtowc(rp, exprp, MB_CUR_MAX);
337
	if (n <= 0)
338
		n = 1;
339
	exprp += n;
340
	if(*rp == L'\\'){
341
		n = mbtowc(rp, exprp, MB_CUR_MAX);
342
		if (n <= 0)
343
			n = 1;
344
		exprp += n;
345
		return 1;
346
	}
347
	if(*rp == 0)
348
		lexdone = 1;
349
	return 0;
350
}
351
 
352
static	int
353
lex(int literal, int dot_type)
354
{
355
	int quoted;
356
 
357
	quoted = nextc(&yyrune);
358
	if(literal || quoted){
359
		if(yyrune == 0)
360
			return END;
361
		return RUNE;
362
	}
363
 
364
	switch(yyrune){
365
	case 0:
366
		return END;
367
	case L'*':
368
		return STAR;
369
	case L'?':
370
		return QUEST;
371
	case L'+':
372
		return PLUS;
373
	case L'|':
374
		return OR;
375
	case L'.':
376
		return dot_type;
377
	case L'(':
378
		return LBRA;
379
	case L')':
380
		return RBRA;
381
	case L'^':
382
		return BOL;
383
	case L'$':
384
		return EOL;
385
	case L'[':
386
		return bldcclass();
387
	}
388
	return RUNE;
389
}
390
 
391
static int
392
bldcclass(void)
393
{
394
	int type;
395
	wchar_t r[NCCRUNE];
396
	wchar_t *p, *ep, *np;
397
	wchar_t rune;
398
	int quoted;
399
 
400
	/* we have already seen the '[' */
401
	type = CCLASS;
402
	yyclassp = newclass();
403
 
404
	/* look ahead for negation */
405
	ep = r;
406
	quoted = nextc(&rune);
407
	if(!quoted && rune == L'^'){
408
		type = NCCLASS;
409
		quoted = nextc(&rune);
410
		*ep++ = L'\n';
411
		*ep++ = L'\n';
412
	}
413
 
414
	/* parse class into a set of spans */
415
	for(; ep<&r[NCCRUNE];){
416
		if(rune == 0){
417
			rcerror("malformed '[]'");
418
			return 0;
419
		}
420
		if(!quoted && rune == L']')
421
			break;
422
		if(!quoted && rune == L'-'){
423
			if(ep == r){
424
				rcerror("malformed '[]'");
425
				return 0;
426
			}
427
			quoted = nextc(&rune);
428
			if((!quoted && rune == L']') || rune == 0){
429
				rcerror("malformed '[]'");
430
				return 0;
431
			}
432
			*(ep-1) = rune;
433
		} else {
434
			*ep++ = rune;
435
			*ep++ = rune;
436
		}
437
		quoted = nextc(&rune);
438
	}
439
 
440
	/* sort on span start */
441
	for(p = r; p < ep; p += 2){
442
		for(np = p; np < ep; np += 2)
443
			if(*np < *p){
444
				rune = np[0];
445
				np[0] = p[0];
446
				p[0] = rune;
447
				rune = np[1];
448
				np[1] = p[1];
449
				p[1] = rune;
450
			}
451
	}
452
 
453
	/* merge spans */
454
	np = yyclassp->spans;
455
	p = r;
456
	if(r == ep)
457
		yyclassp->end = np;
458
	else {
459
		np[0] = *p++;
460
		np[1] = *p++;
461
		for(; p < ep; p += 2)
462
			if(p[0] <= np[1]){
463
				if(p[1] > np[1])
464
					np[1] = p[1];
465
			} else {
466
				np += 2;
467
				np[0] = p[0];
468
				np[1] = p[1];
469
			}
470
		yyclassp->end = np+2;
471
	}
472
 
473
	return type;
474
}
475
 
476
static	Reprog*
477
regcomp1(char *s, int literal, int dot_type)
478
{
479
	int token;
480
	Reprog *pp;
481
 
482
	/* get memory for the program */
483
	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
484
	if(pp == 0){
485
		regerror("out of memory");
486
		return 0;
487
	}
488
	freep = pp->firstinst;
489
	classp = pp->class;
490
	errors = 0;
491
 
492
	if(setjmp(regkaboom))
493
		goto out;
494
 
495
	/* go compile the sucker */
496
	lexdone = 0;
497
	exprp = s;
498
	nclass = 0;
499
	nbra = 0;
500
	atorp = atorstack;
501
	andp = andstack;
502
	subidp = subidstack;
503
	lastwasand = FALSE;
504
	cursubid = 0;
505
 
506
	/* Start with a low priority operator to prime parser */
507
	pushator(START-1);
508
	while((token = lex(literal, dot_type)) != END){
509
		if((token&0300) == OPERATOR)
510
			operator(token);
511
		else
512
			operand(token);
513
	}
514
 
515
	/* Close with a low priority operator */
516
	evaluntil(START);
517
 
518
	/* Force END */
519
	operand(END);
520
	evaluntil(START);
521
#ifdef DEBUG
522
	dumpstack();
523
#endif
524
	if(nbra)
525
		rcerror("unmatched left paren");
526
	--andp;	/* points to first and only operand */
527
	pp->startinst = andp->first;
528
#ifdef DEBUG
529
	dump(pp);
530
#endif
531
	pp = optimize(pp);
532
#ifdef DEBUG
533
	print("start: %d\n", andp->first-pp->firstinst);
534
	dump(pp);
535
#endif
536
out:
537
	if(errors){
538
		free(pp);
539
		pp = 0;
540
	}
541
	return pp;
542
}
543
 
544
extern	Reprog*
545
regcomp(char *s)
546
{
547
	return regcomp1(s, 0, ANY);
548
}
549
 
550
extern	Reprog*
551
regcomplit(char *s)
552
{
553
	return regcomp1(s, 1, ANY);
554
}
555
 
556
extern	Reprog*
557
regcompnl(char *s)
558
{
559
	return regcomp1(s, 0, ANYNL);
560
}