Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <bin.h>
4
#include <bio.h>
5
#include <regexp.h>
6
#include "/sys/src/libregexp/regcomp.h"
7
#include "dfa.h"
8
 
9
void rdump(Reprog*);
10
void dump(Dreprog*);
11
 
12
/*
13
 * Standard NFA determinization and DFA minimization.
14
 */
15
typedef struct Deter Deter;
16
typedef struct Reiset Reiset;
17
 
18
void ddump(Deter*);
19
 
20
/* state of determinization */
21
struct Deter
22
{
23
	jmp_buf kaboom;	/* jmp on error */
24
 
25
	Bin *bin;		/* bin for temporary allocations */
26
 
27
	Reprog *p;	/* program being determinized */
28
	uint ninst;		/* number of instructions in program */
29
 
30
	Reiset *alloc;	/* chain of all Reisets */
31
	Reiset **last;
32
 
33
	Reiset **hash;	/* hash of all Reisets */
34
	uint nhash;
35
 
36
	Reiset *tmp;	/* temporaries for walk */
37
	uchar *bits;
38
 
39
	Rune *c;		/* ``interesting'' characters */
40
	uint nc;
41
};
42
 
43
/* set of Reinsts: perhaps we should use a bit list instead of the indices? */
44
struct Reiset
45
{
46
	uint *inst;		/* indices of instructions in set */
47
	uint ninst;		/* size of set */
48
 
49
	Reiset *next;	/* d.alloc chain */
50
	Reiset *hash;	/* d.hash chain */
51
	Reiset **delta;	/* where to go on each interesting char */
52
	uint id;		/* assigned id during minimization */
53
	uint isfinal;	/* is an accepting (final) state */
54
};
55
 
56
static Reiset*
57
ralloc(Deter *d, int ninst)
58
{
59
	Reiset *t;
60
 
61
	t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);
62
	if(t == nil)
63
		longjmp(d->kaboom, 1);
64
	t->delta = (Reiset**)&t[1];
65
	t->inst = (uint*)&t->delta[2*d->nc];
66
	return t;
67
}
68
 
69
/* find the canonical form a given Reiset */
70
static Reiset*
71
findreiset(Deter *d, Reiset *s)
72
{
73
	int i, szinst;
74
	uint h;
75
	Reiset *t;
76
 
77
	h = 0;
78
	for(i=0; i<s->ninst; i++)
79
		h = h*1000003 + s->inst[i];
80
	h %= d->nhash;
81
 
82
	szinst = s->ninst*sizeof(s->inst[0]);
83
	for(t=d->hash[h]; t; t=t->hash)
84
		if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)
85
			return t;
86
 
87
	t = ralloc(d, s->ninst);
88
	t->hash = d->hash[h];
89
	d->hash[h] = t;
90
 
91
	*d->last = t;
92
	d->last = &t->next;
93
	t->next = 0;
94
 
95
	t->ninst = s->ninst;
96
	memmove(t->inst, s->inst, szinst);
97
 
98
	/* delta is filled in later */
99
 
100
	return t;
101
}
102
 
103
/* convert bits to a real reiset */
104
static Reiset*
105
bits2reiset(Deter *d, uchar *bits)
106
{
107
	int k;
108
	Reiset *s;
109
 
110
	s = d->tmp;
111
	s->ninst = 0;
112
	for(k=0; k<d->ninst; k++)
113
		if(bits[k])
114
			s->inst[s->ninst++] = k;
115
	return findreiset(d, s);
116
}
117
 
118
/* add n to state set; if n < k, need to go around again */
119
static int
120
add(int n, uchar *bits, int k)
121
{
122
	if(bits[n])
123
		return 0;
124
	bits[n] = 1;
125
	return n < k;
126
}
127
 
128
/* update bits to follow all the empty (non-character-related) transitions possible */
129
static void
130
followempty(Deter *d, uchar *bits, int bol, int eol)
131
{
132
	int again, k;
133
	Reinst *i;
134
 
135
	do{
136
		again = 0;
137
		for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
138
			if(!bits[k])
139
				continue;
140
			switch(i->type){
141
			case RBRA:
142
			case LBRA:
143
				again |= add(i->next - d->p->firstinst, bits, k);
144
				break;
145
			case OR:
146
				again |= add(i->left - d->p->firstinst, bits, k);
147
				again |= add(i->right - d->p->firstinst, bits, k);
148
				break;
149
			case BOL:
150
				if(bol)
151
					again |= add(i->next - d->p->firstinst, bits, k);
152
				break;
153
			case EOL:
154
				if(eol)
155
					again |= add(i->next - d->p->firstinst, bits, k);
156
				break;
157
			}
158
		}
159
	}while(again);
160
 
161
	/*
162
	 * Clear bits for useless transitions.  We could do this during
163
	 * the switch above, but then we have no guarantee of termination
164
	 * if we get a loop in the regexp.
165
	 */
166
	for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
167
		if(!bits[k])
168
			continue;
169
		switch(i->type){
170
		case RBRA:
171
		case LBRA:
172
		case OR:
173
		case BOL:
174
		case EOL:
175
			bits[k] = 0;
176
			break;
177
		}
178
	}
179
}
180
 
181
/*
182
 * Where does s go if it sees rune r?
183
 * Eol is true if a $ matches the string at the position just after r.
184
 */
185
static Reiset*
186
transition(Deter *d, Reiset *s, Rune r, uint eol)
187
{
188
	int k;
189
	uchar *bits;
190
	Reinst *i, *inst0;
191
	Rune *rp, *ep;
192
 
193
	bits = d->bits;
194
	memset(bits, 0, d->ninst);
195
 
196
	inst0 = d->p->firstinst;
197
	for(k=0; k < s->ninst; k++){
198
		i = inst0 + s->inst[k];
199
		switch(i->type){
200
		default:
201
			werrstr("bad reprog: got type %d", i->type);
202
			longjmp(d->kaboom, 1);
203
		case RBRA:
204
		case LBRA:
205
		case OR:
206
		case BOL:
207
		case EOL:
208
			werrstr("internal error: got type %d", i->type);
209
			longjmp(d->kaboom, 1);
210
 
211
		case RUNE:
212
			if(r == i->r)
213
				bits[i->next - inst0] = 1;
214
			break;
215
		case ANY:
216
			if(r != L'\n')
217
				bits[i->next - inst0] = 1;
218
			break;
219
		case ANYNL:
220
			bits[i->next - inst0] = 1;
221
			break;
222
		case NCCLASS:
223
			if(r == L'\n')
224
				break;
225
			/* fall through */
226
		case CCLASS:
227
			ep = i->cp->end;
228
			for(rp = i->cp->spans; rp < ep; rp += 2)
229
				if(rp[0] <= r && r <= rp[1])
230
					break;
231
			if((rp < ep) ^! (i->type == CCLASS))
232
				bits[i->next - inst0] = 1;
233
			break;
234
		case END:
235
			break;
236
		}
237
	}
238
 
239
	followempty(d, bits, r=='\n', eol);
240
	return bits2reiset(d, bits);
241
}
242
 
243
static int
244
countinst(Reprog *pp)
245
{
246
	int n;
247
	Reinst *l;
248
 
249
	n = 0;
250
	l = pp->firstinst;
251
	while(l++->type)
252
		n++;
253
	return n;
254
}
255
 
256
static void
257
set(Deter *d, u32int **tab, Rune r)
258
{
259
	u32int *u;
260
 
261
	if((u = tab[r/4096]) == nil){
262
		u = binalloc(&d->bin, 4096/8, 1);
263
		if(u == nil)
264
			longjmp(d->kaboom, 1);
265
		tab[r/4096] = u;
266
	}
267
	u[(r%4096)/32] |= 1<<(r%32);
268
}
269
 
270
/*
271
 * Compute the list of important characters. 
272
 * Other characters behave like the ones that surround them.
273
 */
274
static void
275
findchars(Deter *d, Reprog *p)
276
{
277
	u32int *tab[65536/4096], *u, x;
278
	Reinst *i;
279
	Rune *rp, *ep;
280
	int k, m, n, a;
281
 
282
	memset(tab, 0, sizeof tab);
283
	set(d, tab, 0);
284
	set(d, tab, 0xFFFF);
285
	for(i=p->firstinst; i->type; i++){
286
		switch(i->type){
287
		case ANY:
288
			set(d, tab, L'\n'-1);
289
			set(d, tab, L'\n');
290
			set(d, tab, L'\n'+1);
291
			break;
292
		case RUNE:
293
			set(d, tab, i->r-1);
294
			set(d, tab, i->r);
295
			set(d, tab, i->r+1);
296
			break;
297
		case NCCLASS:
298
			set(d, tab, L'\n'-1);
299
			set(d, tab, L'\n');
300
			set(d, tab, L'\n'+1);
301
			/* fall through */
302
		case CCLASS:
303
			ep = i->cp->end;
304
			for(rp = i->cp->spans; rp < ep; rp += 2){
305
				set(d, tab, rp[0]-1);
306
				set(d, tab, rp[0]);
307
				set(d, tab, rp[1]);
308
				set(d, tab, rp[1]+1);
309
			}
310
			break;
311
		}
312
	}
313
 
314
	n = 0;
315
	for(k=0; k<nelem(tab); k++){
316
		if((u = tab[k]) == nil)
317
			continue;
318
		for(m=0; m<4096/32; m++){
319
			if((x = u[m]) == 0)
320
				continue;
321
			for(a=0; a<32; a++)
322
				if(x&(1<<a))
323
					n++;
324
		}
325
	}
326
 
327
	d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
328
	if(d->c == 0)
329
		longjmp(d->kaboom, 1);
330
	d->nc = n;
331
 
332
	n = 0;
333
	for(k=0; k<nelem(tab); k++){
334
		if((u = tab[k]) == nil)
335
			continue;
336
		for(m=0; m<4096/32; m++){
337
			if((x = u[m]) == 0)
338
				continue;
339
			for(a=0; a<32; a++)
340
				if(x&(1<<a))
341
					d->c[n++] = k*4096+m*32+a;
342
		}
343
	}
344
 
345
	d->c[n] = 0;
346
	if(n != d->nc)
347
		abort();
348
}
349
 
350
/*
351
 * convert the Deter and Reisets into a Dreprog.
352
 * if dp and c are nil, just return the count of Drecases needed.
353
 */
354
static int
355
buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
356
{
357
	int i, j, id, n, nn;
358
	Dreinst *di;
359
	Reiset *s;
360
 
361
	nn = 0;
362
	di = 0;
363
	for(i=0; i<nid; i++){
364
		s = id2set[i];
365
		if(c){
366
			di = &dp->inst[i];
367
			di->isfinal = s->isfinal;
368
		}
369
		n = 0;
370
		id = -1;
371
		for(j=0; j<2*d->nc; j++){
372
			if(s->delta[j]->id != id){
373
				id = s->delta[j]->id;
374
				if(c){
375
					c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];
376
					c[n].next = &dp->inst[id];
377
				}
378
				n++;
379
			}
380
		}
381
		if(c){
382
			if(n == 1 && c[0].next == di)
383
				di->isloop = 1;
384
			di->c = c;
385
			di->nc = n;
386
			c += n;
387
		}
388
		nn += n;
389
	}
390
	return nn;
391
}
392
 
393
Dreprog*
394
dregcvt(Reprog *p)
395
{
396
	uchar *bits;
397
	uint again, n, nid, id;
398
	Deter d;
399
	Reiset **id2set, *s, *t, *start[4];
400
	Dreprog *dp;
401
	Drecase *c;
402
 
403
	memset(&d, 0, sizeof d);
404
 
405
	if(setjmp(d.kaboom)){
406
		binfree(&d.bin);
407
		return nil;
408
	}
409
 
410
	d.p = p;
411
	d.ninst = countinst(p);
412
 
413
	d.last = &d.alloc;
414
 
415
	n = d.ninst;
416
	/* round up to power of two; this loop is the least of our efficiency problems */
417
	while(n&(n-1))
418
		n++;
419
	d.nhash = n;
420
	d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);
421
 
422
	/* get list of important runes */
423
	findchars(&d, p);
424
 
425
#ifdef DUMP
426
	print("relevant chars are: «%S»\n", d.c+1);
427
#endif
428
 
429
	d.bits = bits = binalloc(&d.bin, d.ninst, 0);
430
	d.tmp = ralloc(&d, d.ninst);
431
 
432
	/*
433
	 * Convert to DFA
434
	 */
435
 
436
	/* 4 start states, depending on initial bol, eol */
437
	for(n=0; n<4; n++){
438
		memset(bits, 0, d.ninst);
439
		bits[p->startinst - p->firstinst] = 1;
440
		followempty(&d, bits, n&1, n&2);
441
		start[n] = bits2reiset(&d, bits);
442
	}
443
 
444
	/* explore the reiset space */
445
	for(s=d.alloc; s; s=s->next)
446
		for(n=0; n<2*d.nc; n++)
447
			s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
448
 
449
#ifdef DUMP
450
	nid = 0;
451
	for(s=d.alloc; s; s=s->next)
452
		s->id = nid++;
453
	ddump(&d);
454
#endif
455
 
456
	/*
457
	 * Minimize.
458
	 */
459
 
460
	/* first class division is final or not */
461
	for(s=d.alloc; s; s=s->next){
462
		s->isfinal = 0;
463
		for(n=0; n<s->ninst; n++)
464
			if(p->firstinst[s->inst[n]].type == END)
465
				s->isfinal = 1;
466
		s->id = s->isfinal;
467
	}
468
 
469
	/* divide states with different transition tables in id space */
470
	nid = 2;
471
	do{
472
		again = 0;
473
		for(s=d.alloc; s; s=s->next){
474
			id = -1;
475
			for(t=s->next; t; t=t->next){
476
				if(s->id != t->id)
477
					continue;
478
				for(n=0; n<2*d.nc; n++){
479
					/* until we finish the for(t) loop, s->id and id are same */
480
					if((s->delta[n]->id == t->delta[n]->id)
481
					|| (s->delta[n]->id == s->id && t->delta[n]->id == id)
482
					|| (s->delta[n]->id == id && t->delta[n]->id == s->id))
483
						continue;
484
					break;
485
				}
486
				if(n == 2*d.nc)
487
					continue;
488
				if(id == -1)
489
					id = nid++;
490
				t->id = id;
491
				again = 1;
492
			}
493
		}
494
	}while(again);
495
 
496
#ifdef DUMP
497
	ddump(&d);
498
#endif
499
 
500
	/* build dreprog */
501
	id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
502
	if(id2set == nil)
503
		longjmp(d.kaboom, 1);
504
	for(s=d.alloc; s; s=s->next)
505
		id2set[s->id] = s;
506
 
507
	n = buildprog(&d, id2set, nid, nil, nil);
508
	dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
509
	if(dp == nil)
510
		longjmp(d.kaboom, 1);
511
	c = (Drecase*)&dp->inst[nid];
512
	buildprog(&d, id2set, nid, dp, c);
513
 
514
	for(n=0; n<4; n++)
515
		dp->start[n] = &dp->inst[start[n]->id];
516
	dp->ninst = nid;
517
 
518
	binfree(&d.bin);
519
	return dp;
520
}
521
 
522
int
523
dregexec(Dreprog *p, char *s, int bol)
524
{
525
	Rune r;
526
	ulong rr;
527
	Dreinst *i;
528
	Drecase *c, *ec;
529
	int best, n;
530
	char *os;
531
 
532
	i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];
533
	best = -1;
534
	os = s;
535
	for(; *s; s+=n){
536
		if(i->isfinal)
537
			best = s - os;
538
		if(i->isloop){
539
			if(i->isfinal)
540
				return strlen(os);
541
			else
542
				return best;
543
		}
544
		if((*s&0xFF) < Runeself){
545
			r = *s;
546
			n = 1;
547
		}else
548
			n = chartorune(&r, s);
549
		c = i->c;
550
		ec = c+i->nc;
551
		rr = r;
552
		if(s[n] == '\n' || s[n] == '\0')
553
			rr |= 0x10000;
554
		for(; c<ec; c++){
555
			if(c->start > rr){
556
				i = c[-1].next;
557
				goto Out;
558
			}
559
		}
560
		i = ec[-1].next;
561
	Out:;
562
	}
563
	if(i->isfinal)
564
		best = s - os;
565
	return best;
566
}
567
 
568
 
569
#ifdef DUMP
570
void
571
ddump(Deter *d)
572
{
573
	int i, id;
574
	Reiset *s;
575
 
576
	for(s=d->alloc; s; s=s->next){
577
		print("%d ", s->id);
578
		id = -1;
579
		for(i=0; i<2*d->nc; i++){
580
			if(id != s->delta[i]->id){
581
				if(i==0)
582
					print(" [");
583
				else if(i/d->nc)
584
					print(" [%C$", d->c[i%d->nc]);
585
				else
586
					print(" [%C", d->c[i%d->nc]);
587
				print(" %d]", s->delta[i]->id);
588
				id = s->delta[i]->id;
589
			}
590
		}
591
		print("\n");
592
	}
593
}
594
 
595
void
596
rdump(Reprog *pp)
597
{
598
	Reinst *l;
599
	Rune *p;
600
 
601
	l = pp->firstinst;
602
	do{
603
		print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,
604
			l->left-pp->firstinst, l->right-pp->firstinst);
605
		if(l->type == RUNE)
606
			print("\t%C\n", l->r);
607
		else if(l->type == CCLASS || l->type == NCCLASS){
608
			print("\t[");
609
			if(l->type == NCCLASS)
610
				print("^");
611
			for(p = l->cp->spans; p < l->cp->end; p += 2)
612
				if(p[0] == p[1])
613
					print("%C", p[0]);
614
				else
615
					print("%C-%C", p[0], p[1]);
616
			print("]\n");
617
		} else
618
			print("\n");
619
	}while(l++->type);
620
}
621
 
622
void
623
dump(Dreprog *pp)
624
{
625
	int i, j;
626
	Dreinst *l;
627
 
628
	print("start %ld %ld %ld %ld\n",
629
		pp->start[0]-pp->inst,
630
		pp->start[1]-pp->inst,
631
		pp->start[2]-pp->inst,
632
		pp->start[3]-pp->inst);
633
 
634
	for(i=0; i<pp->ninst; i++){
635
		l = &pp->inst[i];
636
		print("%d:", i);
637
		for(j=0; j<l->nc; j++){
638
			print(" [");
639
			if(j == 0)
640
				if(l->c[j].start != 1)
641
					abort();
642
			if(j != 0)
643
				print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
644
			print("-");
645
			if(j != l->nc-1)
646
				print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
647
			print("] %ld", l->c[j].next - pp->inst);
648
		}
649
		if(l->isfinal)
650
			print(" final");
651
		if(l->isloop)
652
			print(" loop");
653
		print("\n");
654
	}
655
}
656
 
657
 
658
void
659
main(int argc, char **argv)
660
{
661
	int i;
662
	Reprog *p;
663
	Dreprog *dp;
664
 
665
	i = 1;
666
		p = regcomp(argv[i]);
667
		if(p == 0){
668
			print("=== %s: bad regexp\n", argv[i]);
669
		}
670
	//	print("=== %s\n", argv[i]);
671
	//	rdump(p);
672
		dp = dregcvt(p);
673
		print("=== dfa\n");
674
		dump(dp);
675
 
676
	for(i=2; i<argc; i++)
677
		print("match %d\n", dregexec(dp, argv[i], 0));
678
	exits(0);
679
}
680
#endif
681
 
682
void
683
Bprintdfa(Biobuf *b, Dreprog *p)
684
{
685
	int i, j, nc;
686
 
687
	Bprint(b, "# dreprog\n");
688
	nc = 0;
689
	for(i=0; i<p->ninst; i++)
690
		nc += p->inst[i].nc;
691
	Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc,
692
		p->start[0]-p->inst, p->start[1]-p->inst,
693
		p->start[2]-p->inst, p->start[3]-p->inst);
694
	for(i=0; i<p->ninst; i++){
695
		Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);
696
		for(j=0; j<p->inst[i].nc; j++)
697
			Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);
698
		Bprint(b, "\n");
699
	}
700
}
701
 
702
static char*
703
egetline(Biobuf *b, int c, jmp_buf jb)
704
{
705
	char *p;
706
 
707
	p = Brdline(b, c);
708
	if(p == nil)
709
		longjmp(jb, 1);
710
	p[Blinelen(b)-1] = '\0';
711
	return p;
712
}
713
 
714
static void
715
egetc(Biobuf *b, int c, jmp_buf jb)
716
{
717
	if(Bgetc(b) != c)
718
		longjmp(jb, 1);
719
}
720
 
721
static int
722
egetnum(Biobuf *b, int want, jmp_buf jb)
723
{
724
	int c;
725
	int n, first;
726
 
727
	n = 0;
728
	first = 1;
729
	while((c = Bgetc(b)) != Beof){
730
		if(c < '0' || c > '9'){
731
			if(want == 0){
732
				Bungetc(b);
733
				c = 0;
734
			}
735
			if(first || c != want){
736
				werrstr("format error");
737
				longjmp(jb, 1);
738
			}
739
			return n;
740
		}
741
		n = n*10 + c - '0';
742
		first = 0;
743
	}
744
	werrstr("unexpected eof");
745
	longjmp(jb, 1);
746
	return -1;
747
}
748
 
749
Dreprog*
750
Breaddfa(Biobuf *b)
751
{
752
	char *s;
753
	int ninst, nc;
754
	jmp_buf jb;
755
	Dreprog *p;
756
	Drecase *c;
757
	Dreinst *l;
758
	int j, k;
759
 
760
	p = nil;
761
	if(setjmp(jb)){
762
		free(p);
763
		return nil;
764
	}
765
 
766
	s = egetline(b, '\n', jb);
767
	if(strcmp(s, "# dreprog") != 0){
768
		werrstr("format error");
769
		longjmp(jb, 1);
770
	}
771
 
772
	ninst = egetnum(b, ' ', jb);
773
	nc = egetnum(b, ' ', jb);
774
 
775
	p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
776
	if(p == nil)
777
		longjmp(jb, 1);
778
	c = (Drecase*)&p->inst[ninst];
779
 
780
	p->start[0] = &p->inst[egetnum(b, ' ', jb)];
781
	p->start[1] = &p->inst[egetnum(b, ' ', jb)];
782
	p->start[2] = &p->inst[egetnum(b, ' ', jb)];
783
	p->start[3] = &p->inst[egetnum(b, '\n', jb)];
784
 
785
	for(j=0; j<ninst; j++){
786
		l = &p->inst[j];
787
		l->isfinal = egetnum(b, ' ', jb);
788
		l->isloop = egetnum(b, ' ', jb);
789
		l->nc = egetnum(b, 0, jb);
790
		l->c = c;
791
		for(k=0; k<l->nc; k++){
792
			egetc(b, ' ', jb);
793
			c->start = egetnum(b, ' ', jb);
794
			c->next = &p->inst[egetnum(b, 0, jb)];
795
			c++;
796
		}
797
		egetc(b, '\n', jb);
798
	}
799
	return p;
800
}