Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include "sam.h"
2
 
3
Rangeset	sel;
4
String		lastregexp;
5
/*
6
 * Machine Information
7
 */
8
typedef struct Inst Inst;
9
 
10
struct Inst
11
{
12
	long	type;	/* <= Runemax ==> literal, otherwise action */
13
	union {
14
		int rsid;
15
		int rsubid;
16
		int class;
17
		struct Inst *rother;
18
		struct Inst *rright;
19
	} r;
20
	union{
21
		struct Inst *lleft;
22
		struct Inst *lnext;
23
	} l;
24
};
25
#define	sid	r.rsid
26
#define	subid	r.rsubid
27
#define	rclass	r.class
28
#define	other	r.rother
29
#define	right	r.rright
30
#define	left	l.lleft
31
#define	next	l.lnext
32
 
33
#define	NPROG	1024
34
Inst	program[NPROG];
35
Inst	*progp;
36
Inst	*startinst;	/* First inst. of program; might not be program[0] */
37
Inst	*bstartinst;	/* same for backwards machine */
38
 
39
typedef struct Ilist Ilist;
40
struct Ilist
41
{
42
	Inst	*inst;		/* Instruction of the thread */
43
	Rangeset se;
44
	Posn	startp;		/* first char of match */
45
};
46
 
47
#define	NLIST	127
48
 
49
Ilist	*tl, *nl;		/* This list, next list */
50
Ilist	list[2][NLIST+1];	/* +1 for trailing null */
51
static	Rangeset sempty;
52
 
53
/*
54
 * Actions and Tokens
55
 *
56
 *	0x100xx are operators, value == precedence
57
 *	0x200xx are tokens, i.e. operands for operators
58
 */
59
enum {
60
	OPERATOR = Runemask+1,	/* Bitmask of all operators */
61
	START	= OPERATOR,	/* Start, used for marker on stack */
62
	RBRA,			/* Right bracket, ) */
63
	LBRA,			/* Left bracket, ( */
64
	OR,			/* Alternation, | */
65
	CAT,			/* Concatentation, implicit operator */
66
	STAR,			/* Closure, * */
67
	PLUS,			/* a+ == aa* */
68
	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */
69
 
70
	ANY	= OPERATOR<<1,	/* Any character but newline, . */
71
	NOP,			/* No operation, internal use only */
72
	BOL,			/* Beginning of line, ^ */
73
	EOL,			/* End of line, $ */
74
	CCLASS,			/* Character class, [] */
75
	NCCLASS,		/* Negated character class, [^] */
76
	END,			/* Terminate: match found */
77
 
78
	ISATOR	= OPERATOR,
79
	ISAND	= OPERATOR<<1,
80
};
81
 
82
/*
83
 * Parser Information
84
 */
85
typedef struct Node Node;
86
struct Node
87
{
88
	Inst	*first;
89
	Inst	*last;
90
};
91
 
92
#define	NSTACK	20
93
Node	andstack[NSTACK];
94
Node	*andp;
95
int	atorstack[NSTACK];
96
int	*atorp;
97
int	lastwasand;	/* Last token was operand */
98
int	cursubid;
99
int	subidstack[NSTACK];
100
int	*subidp;
101
int	backwards;
102
int	nbra;
103
Rune	*exprp;		/* pointer to next character in source expression */
104
#define	DCLASS	10	/* allocation increment */
105
int	nclass;		/* number active */
106
int	Nclass;		/* high water mark */
107
Rune	**class;
108
int	negateclass;
109
 
110
int	addinst(Ilist *l, Inst *inst, Rangeset *sep);
111
void	newmatch(Rangeset*);
112
void	bnewmatch(Rangeset*);
113
void	pushand(Inst*, Inst*);
114
void	pushator(int);
115
Node	*popand(int);
116
int	popator(void);
117
void	startlex(Rune*);
118
int	lex(void);
119
void	operator(int);
120
void	operand(int);
121
void	evaluntil(int);
122
void	optimize(Inst*);
123
void	bldcclass(void);
124
 
125
void
126
regerror(Err e)
127
{
128
	Strzero(&lastregexp);
129
	error(e);
130
}
131
 
132
void
133
regerror_c(Err e, int c)
134
{
135
	Strzero(&lastregexp);
136
	error_c(e, c);
137
}
138
 
139
Inst *
140
newinst(int t)
141
{
142
	if(progp >= &program[NPROG])
143
		regerror(Etoolong);
144
	progp->type = t;
145
	progp->left = 0;
146
	progp->right = 0;
147
	return progp++;
148
}
149
 
150
Inst *
151
realcompile(Rune *s)
152
{
153
	int token;
154
 
155
	startlex(s);
156
	atorp = atorstack;
157
	andp = andstack;
158
	subidp = subidstack;
159
	cursubid = 0;
160
	lastwasand = FALSE;
161
	/* Start with a low priority operator to prime parser */
162
	pushator(START-1);
163
	while((token=lex()) != END){
164
		if((token&ISATOR) == OPERATOR)
165
			operator(token);
166
		else
167
			operand(token);
168
	}
169
	/* Close with a low priority operator */
170
	evaluntil(START);
171
	/* Force END */
172
	operand(END);
173
	evaluntil(START);
174
	if(nbra)
175
		regerror(Eleftpar);
176
	--andp;	/* points to first and only operand */
177
	return andp->first;
178
}
179
 
180
void
181
compile(String *s)
182
{
183
	int i;
184
	Inst *oprogp;
185
 
186
	if(Strcmp(s, &lastregexp)==0)
187
		return;
188
	for(i=0; i<nclass; i++)
189
		free(class[i]);
190
	nclass = 0;
191
	progp = program;
192
	backwards = FALSE;
193
	startinst = realcompile(s->s);
194
	optimize(program);
195
	oprogp = progp;
196
	backwards = TRUE;
197
	bstartinst = realcompile(s->s);
198
	optimize(oprogp);
199
	Strduplstr(&lastregexp, s);
200
}
201
 
202
void
203
operand(int t)
204
{
205
	Inst *i;
206
	if(lastwasand)
207
		operator(CAT);	/* catenate is implicit */
208
	i = newinst(t);
209
	if(t == CCLASS){
210
		if(negateclass)
211
			i->type = NCCLASS;	/* UGH */
212
		i->rclass = nclass-1;		/* UGH */
213
	}
214
	pushand(i, i);
215
	lastwasand = TRUE;
216
}
217
 
218
void
219
operator(int t)
220
{
221
	if(t==RBRA && --nbra<0)
222
		regerror(Erightpar);
223
	if(t==LBRA){
224
/*
225
 *		if(++cursubid >= NSUBEXP)
226
 *			regerror(Esubexp);
227
 */
228
		cursubid++;	/* silently ignored */
229
		nbra++;
230
		if(lastwasand)
231
			operator(CAT);
232
	}else
233
		evaluntil(t);
234
	if(t!=RBRA)
235
		pushator(t);
236
	lastwasand = FALSE;
237
	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
238
		lastwasand = TRUE;	/* these look like operands */
239
}
240
 
241
void
242
cant(char *s)
243
{
244
	char buf[100];
245
 
246
	sprint(buf, "regexp: can't happen: %s", s);
247
	panic(buf);
248
}
249
 
250
void
251
pushand(Inst *f, Inst *l)
252
{
253
	if(andp >= &andstack[NSTACK])
254
		cant("operand stack overflow");
255
	andp->first = f;
256
	andp->last = l;
257
	andp++;
258
}
259
 
260
void
261
pushator(int t)
262
{
263
	if(atorp >= &atorstack[NSTACK])
264
		cant("operator stack overflow");
265
	*atorp++=t;
266
	if(cursubid >= NSUBEXP)
267
		*subidp++= -1;
268
	else
269
		*subidp++=cursubid;
270
}
271
 
272
Node *
273
popand(int op)
274
{
275
	if(andp <= &andstack[0])
276
		if(op)
277
			regerror_c(Emissop, op);
278
		else
279
			regerror(Ebadregexp);
280
	return --andp;
281
}
282
 
283
int
284
popator(void)
285
{
286
	if(atorp <= &atorstack[0])
287
		cant("operator stack underflow");
288
	--subidp;
289
	return *--atorp;
290
}
291
 
292
void
293
evaluntil(int pri)
294
{
295
	Node *op1, *op2, *t;
296
	Inst *inst1, *inst2;
297
 
298
	while(pri==RBRA || atorp[-1]>=pri){
299
		switch(popator()){
300
		case LBRA:
301
			op1 = popand('(');
302
			inst2 = newinst(RBRA);
303
			inst2->subid = *subidp;
304
			op1->last->next = inst2;
305
			inst1 = newinst(LBRA);
306
			inst1->subid = *subidp;
307
			inst1->next = op1->first;
308
			pushand(inst1, inst2);
309
			return;		/* must have been RBRA */
310
		default:
311
			panic("unknown regexp operator");
312
			break;
313
		case OR:
314
			op2 = popand('|');
315
			op1 = popand('|');
316
			inst2 = newinst(NOP);
317
			op2->last->next = inst2;
318
			op1->last->next = inst2;
319
			inst1 = newinst(OR);
320
			inst1->right = op1->first;
321
			inst1->left = op2->first;
322
			pushand(inst1, inst2);
323
			break;
324
		case CAT:
325
			op2 = popand(0);
326
			op1 = popand(0);
327
			if(backwards && op2->first->type!=END)
328
				t = op1, op1 = op2, op2 = t;
329
			op1->last->next = op2->first;
330
			pushand(op1->first, op2->last);
331
			break;
332
		case STAR:
333
			op2 = popand('*');
334
			inst1 = newinst(OR);
335
			op2->last->next = inst1;
336
			inst1->right = op2->first;
337
			pushand(inst1, inst1);
338
			break;
339
		case PLUS:
340
			op2 = popand('+');
341
			inst1 = newinst(OR);
342
			op2->last->next = inst1;
343
			inst1->right = op2->first;
344
			pushand(op2->first, inst1);
345
			break;
346
		case QUEST:
347
			op2 = popand('?');
348
			inst1 = newinst(OR);
349
			inst2 = newinst(NOP);
350
			inst1->left = inst2;
351
			inst1->right = op2->first;
352
			op2->last->next = inst2;
353
			pushand(inst1, inst2);
354
			break;
355
		}
356
	}
357
}
358
 
359
 
360
void
361
optimize(Inst *start)
362
{
363
	Inst *inst, *target;
364
 
365
	for(inst=start; inst->type!=END; inst++){
366
		target = inst->next;
367
		while(target->type == NOP)
368
			target = target->next;
369
		inst->next = target;
370
	}
371
}
372
 
373
#ifdef	DEBUG
374
void
375
dumpstack(void){
376
	Node *stk;
377
	int *ip;
378
 
379
	dprint("operators\n");
380
	for(ip = atorstack; ip<atorp; ip++)
381
		dprint("0%o\n", *ip);
382
	dprint("operands\n");
383
	for(stk = andstack; stk<andp; stk++)
384
		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
385
}
386
void
387
dump(void){
388
	Inst *l;
389
 
390
	l = program;
391
	do{
392
		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
393
			l->left-program, l->right-program);
394
	}while(l++->type);
395
}
396
#endif
397
 
398
void
399
startlex(Rune *s)
400
{
401
	exprp = s;
402
	nbra = 0;
403
}
404
 
405
 
406
int
407
lex(void){
408
	int c= *exprp++;
409
 
410
	switch(c){
411
	case '\\':
412
		if(*exprp)
413
			if((c= *exprp++)=='n')
414
				c='\n';
415
		break;
416
	case 0:
417
		c = END;
418
		--exprp;	/* In case we come here again */
419
		break;
420
	case '*':
421
		c = STAR;
422
		break;
423
	case '?':
424
		c = QUEST;
425
		break;
426
	case '+':
427
		c = PLUS;
428
		break;
429
	case '|':
430
		c = OR;
431
		break;
432
	case '.':
433
		c = ANY;
434
		break;
435
	case '(':
436
		c = LBRA;
437
		break;
438
	case ')':
439
		c = RBRA;
440
		break;
441
	case '^':
442
		c = BOL;
443
		break;
444
	case '$':
445
		c = EOL;
446
		break;
447
	case '[':
448
		c = CCLASS;
449
		bldcclass();
450
		break;
451
	}
452
	return c;
453
}
454
 
455
long
456
nextrec(void){
457
	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
458
		regerror(Ebadclass);
459
	if(exprp[0] == '\\'){
460
		exprp++;
461
		if(*exprp=='n'){
462
			exprp++;
463
			return '\n';
464
		}
465
		return *exprp++|(Runemax+1);
466
	}
467
	return *exprp++;
468
}
469
 
470
void
471
bldcclass(void)
472
{
473
	long c1, c2, n, na;
474
	Rune *classp;
475
 
476
	classp = emalloc(DCLASS*RUNESIZE);
477
	n = 0;
478
	na = DCLASS;
479
	/* we have already seen the '[' */
480
	if(*exprp == '^'){
481
		classp[n++] = '\n';	/* don't match newline in negate case */
482
		negateclass = TRUE;
483
		exprp++;
484
	}else
485
		negateclass = FALSE;
486
	while((c1 = nextrec()) != ']'){
487
		if(c1 == '-'){
488
    Error:
489
			free(classp);
490
			regerror(Ebadclass);
491
		}
492
		if(n+4 >= na){		/* 3 runes plus NUL */
493
			na += DCLASS;
494
			classp = erealloc(classp, na*RUNESIZE);
495
		}
496
		if(*exprp == '-'){
497
			exprp++;	/* eat '-' */
498
			if((c2 = nextrec()) == ']')
499
				goto Error;
500
			classp[n+0] = Runemax;
501
			classp[n+1] = c1;
502
			classp[n+2] = c2;
503
			n += 3;
504
		}else
505
			classp[n++] = c1;
506
	}
507
	classp[n] = 0;
508
	if(nclass == Nclass){
509
		Nclass += DCLASS;
510
		class = erealloc(class, Nclass*sizeof(Rune*));
511
	}
512
	class[nclass++] = classp;
513
}
514
 
515
int
516
classmatch(int classno, int c, int negate)
517
{
518
	Rune *p;
519
 
520
	p = class[classno];
521
	while(*p){
522
		if(*p == Runemax){
523
			if(p[1]<=c && c<=p[2])
524
				return !negate;
525
			p += 3;
526
		}else if(*p++ == c)
527
			return !negate;
528
	}
529
	return negate;
530
}
531
 
532
/*
533
 * Note optimization in addinst:
534
 * 	*l must be pending when addinst called; if *l has been looked
535
 *		at already, the optimization is a bug.
536
 */
537
int
538
addinst(Ilist *l, Inst *inst, Rangeset *sep)
539
{
540
	Ilist *p;
541
 
542
	for(p = l; p->inst; p++){
543
		if(p->inst==inst){
544
			if((sep)->p[0].p1 < p->se.p[0].p1)
545
				p->se= *sep;	/* this would be bug */
546
			return 0;	/* It's already there */
547
		}
548
	}
549
	p->inst = inst;
550
	p->se= *sep;
551
	(p+1)->inst = 0;
552
	return 1;
553
}
554
 
555
int
556
execute(File *f, Posn startp, Posn eof)
557
{
558
	int flag = 0;
559
	Inst *inst;
560
	Ilist *tlp;
561
	Posn p = startp;
562
	int nnl = 0, ntl;
563
	int c;
564
	int wrapped = 0;
565
	int startchar = startinst->type<OPERATOR? startinst->type : 0;
566
 
567
	list[0][0].inst = list[1][0].inst = 0;
568
	sel.p[0].p1 = -1;
569
	/* Execute machine once for each character */
570
	for(;;p++){
571
	doloop:
572
		c = filereadc(f, p);
573
		if(p>=eof || c<0){
574
			switch(wrapped++){
575
			case 0:		/* let loop run one more click */
576
			case 2:
577
				break;
578
			case 1:		/* expired; wrap to beginning */
579
				if(sel.p[0].p1>=0 || eof!=INFINITY)
580
					goto Return;
581
				list[0][0].inst = list[1][0].inst = 0;
582
				p = 0;
583
				goto doloop;
584
			default:
585
				goto Return;
586
			}
587
		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
588
			break;
589
		/* fast check for first char */
590
		if(startchar && nnl==0 && c!=startchar)
591
			continue;
592
		tl = list[flag];
593
		nl = list[flag^=1];
594
		nl->inst = 0;
595
		ntl = nnl;
596
		nnl = 0;
597
		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
598
			/* Add first instruction to this list */
599
			sempty.p[0].p1 = p;
600
			if(addinst(tl, startinst, &sempty))
601
			if(++ntl >= NLIST)
602
	Overflow:
603
				error(Eoverflow);
604
		}
605
		/* Execute machine until this list is empty */
606
		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
607
	Switchstmt:
608
			switch(inst->type){
609
			default:	/* regular character */
610
				if(inst->type==c){
611
	Addinst:
612
					if(addinst(nl, inst->next, &tlp->se))
613
					if(++nnl >= NLIST)
614
						goto Overflow;
615
				}
616
				break;
617
			case LBRA:
618
				if(inst->subid>=0)
619
					tlp->se.p[inst->subid].p1 = p;
620
				inst = inst->next;
621
				goto Switchstmt;
622
			case RBRA:
623
				if(inst->subid>=0)
624
					tlp->se.p[inst->subid].p2 = p;
625
				inst = inst->next;
626
				goto Switchstmt;
627
			case ANY:
628
				if(c!='\n')
629
					goto Addinst;
630
				break;
631
			case BOL:
632
				if(p==0 || filereadc(f, p - 1)=='\n'){
633
	Step:
634
					inst = inst->next;
635
					goto Switchstmt;
636
				}
637
				break;
638
			case EOL:
639
				if(c == '\n')
640
					goto Step;
641
				break;
642
			case CCLASS:
643
				if(c>=0 && classmatch(inst->rclass, c, 0))
644
					goto Addinst;
645
				break;
646
			case NCCLASS:
647
				if(c>=0 && classmatch(inst->rclass, c, 1))
648
					goto Addinst;
649
				break;
650
			case OR:
651
				/* evaluate right choice later */
652
				if(addinst(tlp, inst->right, &tlp->se))
653
				if(++ntl >= NLIST)
654
					goto Overflow;
655
				/* efficiency: advance and re-evaluate */
656
				inst = inst->left;
657
				goto Switchstmt;
658
			case END:	/* Match! */
659
				tlp->se.p[0].p2 = p;
660
				newmatch(&tlp->se);
661
				break;
662
			}
663
		}
664
	}
665
    Return:
666
	return sel.p[0].p1>=0;
667
}
668
 
669
void
670
newmatch(Rangeset *sp)
671
{
672
	int i;
673
 
674
	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
675
	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
676
		for(i = 0; i<NSUBEXP; i++)
677
			sel.p[i] = sp->p[i];
678
}
679
 
680
int
681
bexecute(File *f, Posn startp)
682
{
683
	int flag = 0;
684
	Inst *inst;
685
	Ilist *tlp;
686
	Posn p = startp;
687
	int nnl = 0, ntl;
688
	int c;
689
	int wrapped = 0;
690
	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
691
 
692
	list[0][0].inst = list[1][0].inst = 0;
693
	sel.p[0].p1= -1;
694
	/* Execute machine once for each character, including terminal NUL */
695
	for(;;--p){
696
	doloop:
697
		if((c = filereadc(f, p - 1))==-1){
698
			switch(wrapped++){
699
			case 0:		/* let loop run one more click */
700
			case 2:
701
				break;
702
			case 1:		/* expired; wrap to end */
703
				if(sel.p[0].p1>=0)
704
			case 3:
705
					goto Return;
706
				list[0][0].inst = list[1][0].inst = 0;
707
				p = f->nc;
708
				goto doloop;
709
			default:
710
				goto Return;
711
			}
712
		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
713
			break;
714
		/* fast check for first char */
715
		if(startchar && nnl==0 && c!=startchar)
716
			continue;
717
		tl = list[flag];
718
		nl = list[flag^=1];
719
		nl->inst = 0;
720
		ntl = nnl;
721
		nnl = 0;
722
		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
723
			/* Add first instruction to this list */
724
			/* the minus is so the optimizations in addinst work */
725
			sempty.p[0].p1 = -p;
726
			if(addinst(tl, bstartinst, &sempty))
727
			if(++ntl >= NLIST)
728
	Overflow:
729
				error(Eoverflow);
730
		}
731
		/* Execute machine until this list is empty */
732
		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
733
	Switchstmt:
734
			switch(inst->type){
735
			default:	/* regular character */
736
				if(inst->type == c){
737
	Addinst:
738
					if(addinst(nl, inst->next, &tlp->se))
739
					if(++nnl >= NLIST)
740
						goto Overflow;
741
				}
742
				break;
743
			case LBRA:
744
				if(inst->subid>=0)
745
					tlp->se.p[inst->subid].p1 = p;
746
				inst = inst->next;
747
				goto Switchstmt;
748
			case RBRA:
749
				if(inst->subid >= 0)
750
					tlp->se.p[inst->subid].p2 = p;
751
				inst = inst->next;
752
				goto Switchstmt;
753
			case ANY:
754
				if(c != '\n')
755
					goto Addinst;
756
				break;
757
			case BOL:
758
				if(c=='\n' || p==0){
759
	Step:
760
					inst = inst->next;
761
					goto Switchstmt;
762
				}
763
				break;
764
			case EOL:
765
				if(p==f->nc || filereadc(f, p)=='\n')
766
					goto Step;
767
				break;
768
			case CCLASS:
769
				if(c>=0 && classmatch(inst->rclass, c, 0))
770
					goto Addinst;
771
				break;
772
			case NCCLASS:
773
				if(c>=0 && classmatch(inst->rclass, c, 1))
774
					goto Addinst;
775
				break;
776
			case OR:
777
				/* evaluate right choice later */
778
				if(addinst(tl, inst->right, &tlp->se))
779
				if(++ntl >= NLIST)
780
					goto Overflow;
781
				/* efficiency: advance and re-evaluate */
782
				inst = inst->left;
783
				goto Switchstmt;
784
			case END:	/* Match! */
785
				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
786
				tlp->se.p[0].p2 = p;
787
				bnewmatch(&tlp->se);
788
				break;
789
			}
790
		}
791
	}
792
    Return:
793
	return sel.p[0].p1>=0;
794
}
795
 
796
void
797
bnewmatch(Rangeset *sp)
798
{
799
        int  i;
800
        if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
801
                for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
802
                        sel.p[i].p1 = sp->p[i].p2;
803
                        sel.p[i].p2 = sp->p[i].p1;
804
                }
805
}