Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/* From libregexp but leaks extra classes when it runs out */
2
 
3
 
4
#include <u.h>
5
#include <libc.h>
6
#include "regexp.h"
7
#include "/sys/src/libregexp/regcomp.h"
8
 
9
#define	TRUE	1
10
#define	FALSE	0
11
 
12
/*
13
 * Parser Information
14
 */
15
typedef
16
struct Node
17
{
18
	Reinst*	first;
19
	Reinst*	last;
20
}Node;
21
 
22
#define	NSTACK	20
23
static	Node	andstack[NSTACK];
24
static	Node	*andp;
25
static	int	atorstack[NSTACK];
26
static	int*	atorp;
27
static	int	cursubid;		/* id of current subexpression */
28
static	int	subidstack[NSTACK];	/* parallel to atorstack */
29
static	int*	subidp;
30
static	int	lastwasand;	/* Last token was operand */
31
static	int	nbra;
32
static	char*	exprp;		/* pointer to next character in source expression */
33
static	int	lexdone;
34
static	int	nclass;
35
static	Reclass*classp;
36
static	Reinst*	freep;
37
static	int	errors;
38
static	Rune	yyrune;		/* last lex'd rune */
39
static	Reclass*yyclassp;	/* last lex'd class */
40
 
41
/* predeclared crap */
42
static	void	operator(int);
43
static	void	pushand(Reinst*, Reinst*);
44
static	void	pushator(int);
45
static	void	evaluntil(int);
46
static	int	bldcclass(void);
47
 
48
static jmp_buf regkaboom;
49
 
50
static	void
51
rcerror(char *s)
52
{
53
	errors++;
54
	regerror(s);
55
	longjmp(regkaboom, 1);
56
}
57
 
58
static	Reinst*
59
newinst(int t)
60
{
61
	freep->type = t;
62
	freep->left = 0;
63
	freep->right = 0;
64
	return freep++;
65
}
66
 
67
static	void
68
operand(int t)
69
{
70
	Reinst *i;
71
 
72
	if(lastwasand)
73
		operator(CAT);	/* catenate is implicit */
74
	i = newinst(t);
75
 
76
	if(t == CCLASS || t == NCCLASS)
77
		i->cp = yyclassp;
78
	if(t == RUNE)
79
		i->r = yyrune;
80
 
81
	pushand(i, i);
82
	lastwasand = TRUE;
83
}
84
 
85
static	void
86
operator(int t)
87
{
88
	if(t==RBRA && --nbra<0)
89
		rcerror("unmatched right paren");
90
	if(t==LBRA){
91
		if(++cursubid >= NSUBEXP)
92
			rcerror ("too many subexpressions");
93
		nbra++;
94
		if(lastwasand)
95
			operator(CAT);
96
	} else
97
		evaluntil(t);
98
	if(t != RBRA)
99
		pushator(t);
100
	lastwasand = FALSE;
101
	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
102
		lastwasand = TRUE;	/* these look like operands */
103
}
104
 
105
static	void
106
regerr2(char *s, int c)
107
{
108
	char buf[100];
109
	char *cp = buf;
110
	while(*s)
111
		*cp++ = *s++;
112
	*cp++ = c;
113
	*cp = '\0'; 
114
	rcerror(buf);
115
}
116
 
117
static	void
118
cant(char *s)
119
{
120
	char buf[100];
121
	strcpy(buf, "can't happen: ");
122
	strcat(buf, s);
123
	rcerror(buf);
124
}
125
 
126
static	void
127
pushand(Reinst *f, Reinst *l)
128
{
129
	if(andp >= &andstack[NSTACK])
130
		cant("operand stack overflow");
131
	andp->first = f;
132
	andp->last = l;
133
	andp++;
134
}
135
 
136
static	void
137
pushator(int t)
138
{
139
	if(atorp >= &atorstack[NSTACK])
140
		cant("operator stack overflow");
141
	*atorp++ = t;
142
	*subidp++ = cursubid;
143
}
144
 
145
static	Node*
146
popand(int op)
147
{
148
	Reinst *inst;
149
 
150
	if(andp <= &andstack[0]){
151
		regerr2("missing operand for ", op);
152
		inst = newinst(NOP);
153
		pushand(inst,inst);
154
	}
155
	return --andp;
156
}
157
 
158
static	int
159
popator(void)
160
{
161
	if(atorp <= &atorstack[0])
162
		cant("operator stack underflow");
163
	--subidp;
164
	return *--atorp;
165
}
166
 
167
static	void
168
evaluntil(int pri)
169
{
170
	Node *op1, *op2;
171
	Reinst *inst1, *inst2;
172
 
173
	while(pri==RBRA || atorp[-1]>=pri){
174
		switch(popator()){
175
		default:
176
			rcerror("unknown operator in evaluntil");
177
			break;
178
		case LBRA:		/* must have been RBRA */
179
			op1 = popand('(');
180
			inst2 = newinst(RBRA);
181
			inst2->subid = *subidp;
182
			op1->last->next = inst2;
183
			inst1 = newinst(LBRA);
184
			inst1->subid = *subidp;
185
			inst1->next = op1->first;
186
			pushand(inst1, inst2);
187
			return;
188
		case OR:
189
			op2 = popand('|');
190
			op1 = popand('|');
191
			inst2 = newinst(NOP);
192
			op2->last->next = inst2;
193
			op1->last->next = inst2;
194
			inst1 = newinst(OR);
195
			inst1->right = op1->first;
196
			inst1->left = op2->first;
197
			pushand(inst1, inst2);
198
			break;
199
		case CAT:
200
			op2 = popand(0);
201
			op1 = popand(0);
202
			op1->last->next = op2->first;
203
			pushand(op1->first, op2->last);
204
			break;
205
		case STAR:
206
			op2 = popand('*');
207
			inst1 = newinst(OR);
208
			op2->last->next = inst1;
209
			inst1->right = op2->first;
210
			pushand(inst1, inst1);
211
			break;
212
		case PLUS:
213
			op2 = popand('+');
214
			inst1 = newinst(OR);
215
			op2->last->next = inst1;
216
			inst1->right = op2->first;
217
			pushand(op2->first, inst1);
218
			break;
219
		case QUEST:
220
			op2 = popand('?');
221
			inst1 = newinst(OR);
222
			inst2 = newinst(NOP);
223
			inst1->left = inst2;
224
			inst1->right = op2->first;
225
			op2->last->next = inst2;
226
			pushand(inst1, inst2);
227
			break;
228
		}
229
	}
230
}
231
 
232
static	Reprog*
233
optimize(Reprog *pp)
234
{
235
	Reinst *inst, *target;
236
	int size;
237
	Reprog *npp;
238
	Reclass *cl;
239
	int diff;
240
 
241
	/*
242
	 *  get rid of NOOP chains
243
	 */
244
	for(inst=pp->firstinst; inst->type!=END; inst++){
245
		target = inst->next;
246
		while(target->type == NOP)
247
			target = target->next;
248
		inst->next = target;
249
	}
250
 
251
	/*
252
	 *  The original allocation is for an area larger than
253
	 *  necessary.  Reallocate to the actual space used
254
	 *  and then relocate the code.
255
	 */
256
	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
257
	npp = realloc(pp, size);
258
	if(npp==0 || npp==pp)
259
		return pp;
260
	diff = (char *)npp - (char *)pp;
261
	freep = (Reinst *)((char *)freep + diff);
262
	for(inst=npp->firstinst; inst<freep; inst++){
263
		switch(inst->type){
264
		case OR:
265
		case STAR:
266
		case PLUS:
267
		case QUEST:
268
			*(char **)&inst->right += diff;
269
			break;
270
		case CCLASS:
271
		case NCCLASS:
272
			*(char **)&inst->right += diff;
273
			cl = inst->cp;
274
			*(char **)&cl->end += diff;
275
			break;
276
		}
277
		*(char **)&inst->left += diff;
278
	}
279
	*(char **)&npp->startinst += diff;
280
	return npp;
281
}
282
 
283
#ifdef	DEBUG
284
static	void
285
dumpstack(void){
286
	Node *stk;
287
	int *ip;
288
 
289
	print("operators\n");
290
	for(ip=atorstack; ip<atorp; ip++)
291
		print("0%o\n", *ip);
292
	print("operands\n");
293
	for(stk=andstack; stk<andp; stk++)
294
		print("0%o\t0%o\n", stk->first->type, stk->last->type);
295
}
296
 
297
static	void
298
dump(Reprog *pp)
299
{
300
	Reinst *l;
301
	Rune *p;
302
 
303
	l = pp->firstinst;
304
	do{
305
		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
306
			l->left-pp->firstinst, l->right-pp->firstinst);
307
		if(l->type == RUNE)
308
			print("\t%C\n", l->r);
309
		else if(l->type == CCLASS || l->type == NCCLASS){
310
			print("\t[");
311
			if(l->type == NCCLASS)
312
				print("^");
313
			for(p = l->cp->spans; p < l->cp->end; p += 2)
314
				if(p[0] == p[1])
315
					print("%C", p[0]);
316
				else
317
					print("%C-%C", p[0], p[1]);
318
			print("]\n");
319
		} else
320
			print("\n");
321
	}while(l++->type);
322
}
323
#endif
324
 
325
static	Reclass*
326
newclass(void)
327
{
328
	if(nclass <= 0){
329
		classp = mallocz(128*sizeof(Reclass), 1);
330
		if(classp == nil)
331
			regerror("out of memory");
332
		nclass = 128;
333
	}
334
	return &classp[--nclass];
335
}
336
 
337
static	int
338
nextc(Rune *rp)
339
{
340
	if(lexdone){
341
		*rp = 0;
342
		return 1;
343
	}
344
	exprp += chartorune(rp, exprp);
345
	if(*rp == L'\\'){
346
		exprp += chartorune(rp, exprp);
347
		return 1;
348
	}
349
	if(*rp == 0)
350
		lexdone = 1;
351
	return 0;
352
}
353
 
354
static	int
355
lex(int literal, int dot_type)
356
{
357
	int quoted;
358
 
359
	quoted = nextc(&yyrune);
360
	if(literal || quoted){
361
		if(yyrune == 0)
362
			return END;
363
		return RUNE;
364
	}
365
 
366
	switch(yyrune){
367
	case 0:
368
		return END;
369
	case L'*':
370
		return STAR;
371
	case L'?':
372
		return QUEST;
373
	case L'+':
374
		return PLUS;
375
	case L'|':
376
		return OR;
377
	case L'.':
378
		return dot_type;
379
	case L'(':
380
		return LBRA;
381
	case L')':
382
		return RBRA;
383
	case L'^':
384
		return BOL;
385
	case L'$':
386
		return EOL;
387
	case L'[':
388
		return bldcclass();
389
	}
390
	return RUNE;
391
}
392
 
393
static int
394
bldcclass(void)
395
{
396
	int type;
397
	Rune r[NCCRUNE];
398
	Rune *p, *ep, *np;
399
	Rune rune;
400
	int quoted;
401
 
402
	/* we have already seen the '[' */
403
	type = CCLASS;
404
	yyclassp = newclass();
405
 
406
	/* look ahead for negation */
407
	/* SPECIAL CASE!!! negated classes don't match \n */
408
	ep = r;
409
	quoted = nextc(&rune);
410
	if(!quoted && rune == L'^'){
411
		type = NCCLASS;
412
		quoted = nextc(&rune);
413
		*ep++ = L'\n';
414
		*ep++ = L'\n';
415
	}
416
 
417
	/* parse class into a set of spans */
418
	for(; ep<&r[NCCRUNE];){
419
		if(rune == 0){
420
			rcerror("malformed '[]'");
421
			return 0;
422
		}
423
		if(!quoted && rune == L']')
424
			break;
425
		if(!quoted && rune == L'-'){
426
			if(ep == r){
427
				rcerror("malformed '[]'");
428
				return 0;
429
			}
430
			quoted = nextc(&rune);
431
			if((!quoted && rune == L']') || rune == 0){
432
				rcerror("malformed '[]'");
433
				return 0;
434
			}
435
			*(ep-1) = rune;
436
		} else {
437
			*ep++ = rune;
438
			*ep++ = rune;
439
		}
440
		quoted = nextc(&rune);
441
	}
442
 
443
	/* sort on span start */
444
	for(p = r; p < ep; p += 2){
445
		for(np = p; np < ep; np += 2)
446
			if(*np < *p){
447
				rune = np[0];
448
				np[0] = p[0];
449
				p[0] = rune;
450
				rune = np[1];
451
				np[1] = p[1];
452
				p[1] = rune;
453
			}
454
	}
455
 
456
	/* merge spans */
457
	np = yyclassp->spans;
458
	p = r;
459
	if(r == ep)
460
		yyclassp->end = np;
461
	else {
462
		np[0] = *p++;
463
		np[1] = *p++;
464
		for(; p < ep; p += 2)
465
			if(p[0] <= np[1]){
466
				if(p[1] > np[1])
467
					np[1] = p[1];
468
			} else {
469
				np += 2;
470
				np[0] = p[0];
471
				np[1] = p[1];
472
			}
473
		yyclassp->end = np+2;
474
	}
475
 
476
	return type;
477
}
478
 
479
static	Reprog*
480
regcomp1(char *s, int literal, int dot_type)
481
{
482
	int token;
483
	Reprog *pp;
484
 
485
	/* get memory for the program */
486
	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
487
	if(pp == 0){
488
		regerror("out of memory");
489
		return 0;
490
	}
491
	freep = pp->firstinst;
492
	classp = pp->class;
493
	errors = 0;
494
 
495
	if(setjmp(regkaboom))
496
		goto out;
497
 
498
	/* go compile the sucker */
499
	lexdone = 0;
500
	exprp = s;
501
	nclass = NCLASS;
502
	nbra = 0;
503
	atorp = atorstack;
504
	andp = andstack;
505
	subidp = subidstack;
506
	lastwasand = FALSE;
507
	cursubid = 0;
508
 
509
	/* Start with a low priority operator to prime parser */
510
	pushator(START-1);
511
	while((token = lex(literal, dot_type)) != END){
512
		if((token&0300) == OPERATOR)
513
			operator(token);
514
		else
515
			operand(token);
516
	}
517
 
518
	/* Close with a low priority operator */
519
	evaluntil(START);
520
 
521
	/* Force END */
522
	operand(END);
523
	evaluntil(START);
524
#ifdef DEBUG
525
	dumpstack();
526
#endif
527
	if(nbra)
528
		rcerror("unmatched left paren");
529
	--andp;	/* points to first and only operand */
530
	pp->startinst = andp->first;
531
#ifdef DEBUG
532
	dump(pp);
533
#endif
534
	pp = optimize(pp);
535
#ifdef DEBUG
536
	print("start: %d\n", andp->first-pp->firstinst);
537
	dump(pp);
538
#endif
539
out:
540
	if(errors){
541
		free(pp);
542
		pp = 0;
543
	}
544
	return pp;
545
}
546
 
547
extern	Reprog*
548
regcomp(char *s)
549
{
550
	return regcomp1(s, 0, ANY);
551
}
552
 
553
extern	Reprog*
554
regcomplit(char *s)
555
{
556
	return regcomp1(s, 1, ANY);
557
}
558
 
559
extern	Reprog*
560
regcompnl(char *s)
561
{
562
	return regcomp1(s, 0, ANYNL);
563
}