Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/* $Source: /u/mark/src/pax/RCS/regexp.c,v $
2
 *
3
 * $Revision: 1.2 $
4
 *
5
 * regexp.c - regular expression matching
6
 *
7
 * DESCRIPTION
8
 *
9
 *	Underneath the reformatting and comment blocks which were added to 
10
 *	make it consistent with the rest of the code, you will find a
11
 *	modified version of Henry Specer's regular expression library.
12
 *	Henry's functions were modified to provide the minimal regular
13
 *	expression matching, as required by P1003.  Henry's code was
14
 *	copyrighted, and copy of the copyright message and restrictions
15
 *	are provided, verbatim, below:
16
 *
17
 *	Copyright (c) 1986 by University of Toronto.
18
 *	Written by Henry Spencer.  Not derived from licensed software.
19
 *
20
 *	Permission is granted to anyone to use this software for any
21
 *	purpose on any computer system, and to redistribute it freely,
22
 *	subject to the following restrictions:
23
 *
24
 *	1. The author is not responsible for the consequences of use of
25
 *         this software, no matter how awful, even if they arise
26
 *	   from defects in it.
27
 *
28
 *	2. The origin of this software must not be misrepresented, either
29
 *	   by explicit claim or by omission.
30
 *
31
 *	3. Altered versions must be plainly marked as such, and must not
32
 *	   be misrepresented as being the original software.
33
 *
34
 * 	Beware that some of this code is subtly aware of the way operator
35
 * 	precedence is structured in regular expressions.  Serious changes in
36
 * 	regular-expression syntax might require a total rethink.
37
 *
38
 * AUTHORS
39
 *
40
 *     Mark H. Colburn, NAPS International (mark@jhereg.mn.org)
41
 *     Henry Spencer, University of Torronto (henry@utzoo.edu)
42
 *
43
 * Sponsored by The USENIX Association for public distribution. 
44
 *
45
 * $Log:	regexp.c,v $
46
 * Revision 1.2  89/02/12  10:05:39  mark
47
 * 1.2 release fixes
48
 * 
49
 * Revision 1.1  88/12/23  18:02:32  mark
50
 * Initial revision
51
 * 
52
 */
53
 
54
/* Headers */
55
 
56
#include "pax.h"
57
 
58
#ifndef lint
59
static char    *Ident = "$Id: regexp.c,v 1.2 89/02/12 10:05:39 mark Exp $";
60
#endif
61
 
62
 
63
/*
64
 * The "internal use only" fields in regexp.h are present to pass info from
65
 * compile to execute that permits the execute phase to run lots faster on
66
 * simple cases.  They are:
67
 *
68
 * regstart	char that must begin a match; '\0' if none obvious
69
 * reganch	is the match anchored (at beginning-of-line only)?
70
 * regmust	string (pointer into program) that match must include, or NULL
71
 * regmlen	length of regmust string
72
 *
73
 * Regstart and reganch permit very fast decisions on suitable starting points
74
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
75
 * of lines that cannot possibly match.  The regmust tests are costly enough
76
 * that regcomp() supplies a regmust only if the r.e. contains something
77
 * potentially expensive (at present, the only such thing detected is * or +
78
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
79
 * supplied because the test in regexec() needs it and regcomp() is computing
80
 * it anyway.
81
 */
82
 
83
/*
84
 * Structure for regexp "program".  This is essentially a linear encoding
85
 * of a nondeterministic finite-state machine (aka syntax charts or
86
 * "railroad normal form" in parsing technology).  Each node is an opcode
87
 * plus a "nxt" pointer, possibly plus an operand.  "Nxt" pointers of
88
 * all nodes except BRANCH implement concatenation; a "nxt" pointer with
89
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
90
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
91
 * opposed to a collection of them) is never concatenated with anything
92
 * because of operator precedence.)  The operand of some types of node is
93
 * a literal string; for others, it is a node leading into a sub-FSM.  In
94
 * particular, the operand of a BRANCH node is the first node of the branch.
95
 * (NB this is *not* a tree structure:  the tail of the branch connects
96
 * to the thing following the set of BRANCHes.)  The opcodes are:
97
 */
98
 
99
/* definition	number	opnd?	meaning */
100
#define	END	0		/* no	End of program. */
101
#define	BOL	1		/* no	Match "" at beginning of line. */
102
#define	EOL	2		/* no	Match "" at end of line. */
103
#define	ANY	3		/* no	Match any one character. */
104
#define	ANYOF	4		/* str	Match any character in this string. */
105
#define	ANYBUT	5		/* str	Match any character not in this
106
				 * string. */
107
#define	BRANCH	6		/* node	Match this alternative, or the
108
				 * nxt... */
109
#define	BACK	7		/* no	Match "", "nxt" ptr points backward. */
110
#define	EXACTLY	8		/* str	Match this string. */
111
#define	NOTHING	9		/* no	Match empty string. */
112
#define	STAR	10		/* node	Match this (simple) thing 0 or more
113
				 * times. */
114
#define	OPEN	20		/* no	Mark this point in input as start of
115
				 * #n. */
116
 /* OPEN+1 is number 1, etc. */
117
#define	CLOSE	30		/* no	Analogous to OPEN. */
118
 
119
/*
120
 * Opcode notes:
121
 *
122
 * BRANCH	The set of branches constituting a single choice are hooked
123
 *		together with their "nxt" pointers, since precedence prevents
124
 *		anything being concatenated to any individual branch.  The
125
 *		"nxt" pointer of the last BRANCH in a choice points to the
126
 *		thing following the whole choice.  This is also where the
127
 *		final "nxt" pointer of each individual branch points; each
128
 *		branch starts with the operand node of a BRANCH node.
129
 *
130
 * BACK		Normal "nxt" pointers all implicitly point forward; BACK
131
 *		exists to make loop structures possible.
132
 *
133
 * STAR		complex '*', are implemented as circular BRANCH structures 
134
 *		using BACK.  Simple cases (one character per match) are 
135
 *		implemented with STAR for speed and to minimize recursive 
136
 *		plunges.
137
 *
138
 * OPEN,CLOSE	...are numbered at compile time.
139
 */
140
 
141
/*
142
 * A node is one char of opcode followed by two chars of "nxt" pointer.
143
 * "Nxt" pointers are stored as two 8-bit pieces, high order first.  The
144
 * value is a positive offset from the opcode of the node containing it.
145
 * An operand, if any, simply follows the node.  (Note that much of the
146
 * code generation knows about this implicit relationship.)
147
 *
148
 * Using two bytes for the "nxt" pointer is vast overkill for most things,
149
 * but allows patterns to get big without disasters.
150
 */
151
#define	OP(p)	(*(p))
152
#define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
153
#define	OPERAND(p)	((p) + 3)
154
 
155
/*
156
 * Utility definitions.
157
 */
158
 
159
#define	FAIL(m)	{ regerror(m); return(NULL); }
160
#define	ISMULT(c)	((c) == '*')
161
#define	META	"^$.[()|*\\"
162
#ifndef CHARBITS
163
#define	UCHARAT(p)	((int)*(unsigned char *)(p))
164
#else
165
#define	UCHARAT(p)	((int)*(p)&CHARBITS)
166
#endif
167
 
168
/*
169
 * Flags to be passed up and down.
170
 */
171
#define	HASWIDTH	01	/* Known never to match null string. */
172
#define	SIMPLE		02	/* Simple enough to be STAR operand. */
173
#define	SPSTART		04	/* Starts with * */
174
#define	WORST		0	/* Worst case. */
175
 
176
/*
177
 * Global work variables for regcomp().
178
 */
179
static char    *regparse;	/* Input-scan pointer. */
180
static int      regnpar;	/* () count. */
181
static char     regdummy;
182
static char    *regcode;	/* Code-emit pointer; &regdummy = don't. */
183
static long     regsize;	/* Code size. */
184
 
185
/*
186
 * Forward declarations for regcomp()'s friends.
187
 */
188
#ifndef STATIC
189
#define	STATIC	static
190
#endif
191
STATIC char    *reg();
192
STATIC char    *regbranch();
193
STATIC char    *regpiece();
194
STATIC char    *regatom();
195
STATIC char    *regnode();
196
STATIC char    *regnext();
197
STATIC void     regc();
198
STATIC void     reginsert();
199
STATIC void     regtail();
200
STATIC void     regoptail();
201
#ifdef STRCSPN
202
STATIC int      strcspn();
203
#endif
204
 
205
/*
206
 - regcomp - compile a regular expression into internal code
207
 *
208
 * We can't allocate space until we know how big the compiled form will be,
209
 * but we can't compile it (and thus know how big it is) until we've got a
210
 * place to put the code.  So we cheat:  we compile it twice, once with code
211
 * generation turned off and size counting turned on, and once "for real".
212
 * This also means that we don't allocate space until we are sure that the
213
 * thing really will compile successfully, and we never have to move the
214
 * code and thus invalidate pointers into it.  (Note that it has to be in
215
 * one piece because free() must be able to free it all.)
216
 *
217
 * Beware that the optimization-preparation code in here knows about some
218
 * of the structure of the compiled regexp.
219
 */
220
regexp *regcomp(exp)
221
char           *exp;
222
{
223
    register regexp *r;
224
    register char  *scan;
225
    register char  *longest;
226
    register int    len;
227
    int             flags;
228
    extern char    *malloc();
229
 
230
    if (exp == (char *)NULL)
231
	FAIL("NULL argument");
232
 
233
    /* First pass: determine size, legality. */
234
    regparse = exp;
235
    regnpar = 1;
236
    regsize = 0L;
237
    regcode = &regdummy;
238
    regc(MAGIC);
239
    if (reg(0, &flags) == (char *)NULL)
240
	return ((regexp *)NULL);
241
 
242
    /* Small enough for pointer-storage convention? */
243
    if (regsize >= 32767L)	/* Probably could be 65535L. */
244
	FAIL("regexp too big");
245
 
246
    /* Allocate space. */
247
    r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
248
    if (r == (regexp *) NULL)
249
	FAIL("out of space");
250
 
251
    /* Second pass: emit code. */
252
    regparse = exp;
253
    regnpar = 1;
254
    regcode = r->program;
255
    regc(MAGIC);
256
    if (reg(0, &flags) == NULL)
257
	return ((regexp *) NULL);
258
 
259
    /* Dig out information for optimizations. */
260
    r->regstart = '\0';		/* Worst-case defaults. */
261
    r->reganch = 0;
262
    r->regmust = NULL;
263
    r->regmlen = 0;
264
    scan = r->program + 1;	/* First BRANCH. */
265
    if (OP(regnext(scan)) == END) {	/* Only one top-level choice. */
266
	scan = OPERAND(scan);
267
 
268
	/* Starting-point info. */
269
	if (OP(scan) == EXACTLY)
270
	    r->regstart = *OPERAND(scan);
271
	else if (OP(scan) == BOL)
272
	    r->reganch++;
273
 
274
	/*
275
	 * If there's something expensive in the r.e., find the longest
276
	 * literal string that must appear and make it the regmust.  Resolve
277
	 * ties in favor of later strings, since the regstart check works
278
	 * with the beginning of the r.e. and avoiding duplication
279
	 * strengthens checking.  Not a strong reason, but sufficient in the
280
	 * absence of others. 
281
	 */
282
	if (flags & SPSTART) {
283
	    longest = NULL;
284
	    len = 0;
285
	    for (; scan != NULL; scan = regnext(scan))
286
		if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
287
		    longest = OPERAND(scan);
288
		    len = strlen(OPERAND(scan));
289
		}
290
	    r->regmust = longest;
291
	    r->regmlen = len;
292
	}
293
    }
294
    return (r);
295
}
296
 
297
/*
298
 - reg - regular expression, i.e. main body or parenthesized thing
299
 *
300
 * Caller must absorb opening parenthesis.
301
 *
302
 * Combining parenthesis handling with the base level of regular expression
303
 * is a trifle forced, but the need to tie the tails of the branches to what
304
 * follows makes it hard to avoid.
305
 */
306
static char *reg(paren, flagp)
307
int             paren;		/* Parenthesized? */
308
int            *flagp;
309
{
310
    register char  *ret;
311
    register char  *br;
312
    register char  *ender;
313
    register int    parno;
314
    int             flags;
315
 
316
    *flagp = HASWIDTH;		/* Tentatively. */
317
 
318
    /* Make an OPEN node, if parenthesized. */
319
    if (paren) {
320
	if (regnpar >= NSUBEXP)
321
	    FAIL("too many ()");
322
	parno = regnpar;
323
	regnpar++;
324
	ret = regnode(OPEN + parno);
325
    } else
326
	ret = (char *)NULL;
327
 
328
    /* Pick up the branches, linking them together. */
329
    br = regbranch(&flags);
330
    if (br == (char *)NULL)
331
	return ((char *)NULL);
332
    if (ret != (char *)NULL)
333
	regtail(ret, br);	/* OPEN -> first. */
334
    else
335
	ret = br;
336
    if (!(flags & HASWIDTH))
337
	*flagp &= ~HASWIDTH;
338
    *flagp |= flags & SPSTART;
339
    while (*regparse == '|') {
340
	regparse++;
341
	br = regbranch(&flags);
342
	if (br == (char *)NULL)
343
	    return ((char *)NULL);
344
	regtail(ret, br);	/* BRANCH -> BRANCH. */
345
	if (!(flags & HASWIDTH))
346
	    *flagp &= ~HASWIDTH;
347
	*flagp |= flags & SPSTART;
348
    }
349
 
350
    /* Make a closing node, and hook it on the end. */
351
    ender = regnode((paren) ? CLOSE + parno : END);
352
    regtail(ret, ender);
353
 
354
    /* Hook the tails of the branches to the closing node. */
355
    for (br = ret; br != (char *)NULL; br = regnext(br))
356
	regoptail(br, ender);
357
 
358
    /* Check for proper termination. */
359
    if (paren && *regparse++ != ')') {
360
	FAIL("unmatched ()");
361
    } else if (!paren && *regparse != '\0') {
362
	if (*regparse == ')') {
363
	    FAIL("unmatched ()");
364
	} else
365
	    FAIL("junk on end");/* "Can't happen". */
366
	/* NOTREACHED */
367
    }
368
    return (ret);
369
}
370
 
371
/*
372
 - regbranch - one alternative of an | operator
373
 *
374
 * Implements the concatenation operator.
375
 */
376
static char  *regbranch(flagp)
377
int            *flagp;
378
{
379
    register char  *ret;
380
    register char  *chain;
381
    register char  *latest;
382
    int             flags;
383
 
384
    *flagp = WORST;		/* Tentatively. */
385
 
386
    ret = regnode(BRANCH);
387
    chain = (char *)NULL;
388
    while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
389
	latest = regpiece(&flags);
390
	if (latest == (char *)NULL)
391
	    return ((char *)NULL);
392
	*flagp |= flags & HASWIDTH;
393
	if (chain == (char *)NULL)	/* First piece. */
394
	    *flagp |= flags & SPSTART;
395
	else
396
	    regtail(chain, latest);
397
	chain = latest;
398
    }
399
    if (chain == (char *)NULL)		/* Loop ran zero times. */
400
	regnode(NOTHING);
401
 
402
    return (ret);
403
}
404
 
405
/*
406
 - regpiece - something followed by possible [*]
407
 *
408
 * Note that the branching code sequence used for * is somewhat optimized:  
409
 * they use the same NOTHING node as both the endmarker for their branch 
410
 * list and the body of the last branch.  It might seem that this node could 
411
 * be dispensed with entirely, but the endmarker role is not redundant.
412
 */
413
static char *regpiece(flagp)
414
int            *flagp;
415
{
416
    register char  *ret;
417
    register char   op;
418
    register char  *nxt;
419
    int             flags;
420
 
421
    ret = regatom(&flags);
422
    if (ret == (char *)NULL)
423
	return ((char *)NULL);
424
 
425
    op = *regparse;
426
    if (!ISMULT(op)) {
427
	*flagp = flags;
428
	return (ret);
429
    }
430
    if (!(flags & HASWIDTH))
431
	FAIL("* operand could be empty");
432
    *flagp = (WORST | SPSTART);
433
 
434
    if (op == '*' && (flags & SIMPLE))
435
	reginsert(STAR, ret);
436
    else if (op == '*') {
437
	/* Emit x* as (x&|), where & means "self". */
438
	reginsert(BRANCH, ret);	/* Either x */
439
	regoptail(ret, regnode(BACK));	/* and loop */
440
	regoptail(ret, ret);	/* back */
441
	regtail(ret, regnode(BRANCH));	/* or */
442
	regtail(ret, regnode(NOTHING));	/* null. */
443
    } 
444
    regparse++;
445
    if (ISMULT(*regparse))
446
	FAIL("nested *");
447
 
448
    return (ret);
449
}
450
 
451
/*
452
 - regatom - the lowest level
453
 *
454
 * Optimization:  gobbles an entire sequence of ordinary characters so that
455
 * it can turn them into a single node, which is smaller to store and
456
 * faster to run.  Backslashed characters are exceptions, each becoming a
457
 * separate node; the code is simpler that way and it's not worth fixing.
458
 */
459
static char *regatom(flagp)
460
int            *flagp;
461
{
462
    register char  *ret;
463
    int             flags;
464
 
465
    *flagp = WORST;		/* Tentatively. */
466
 
467
    switch (*regparse++) {
468
    case '^':
469
	ret = regnode(BOL);
470
	break;
471
    case '$':
472
	ret = regnode(EOL);
473
	break;
474
    case '.':
475
	ret = regnode(ANY);
476
	*flagp |= HASWIDTH | SIMPLE;
477
	break;
478
    case '[':{
479
	    register int    class;
480
	    register int    classend;
481
 
482
	    if (*regparse == '^') {	/* Complement of range. */
483
		ret = regnode(ANYBUT);
484
		regparse++;
485
	    } else
486
		ret = regnode(ANYOF);
487
	    if (*regparse == ']' || *regparse == '-')
488
		regc(*regparse++);
489
	    while (*regparse != '\0' && *regparse != ']') {
490
		if (*regparse == '-') {
491
		    regparse++;
492
		    if (*regparse == ']' || *regparse == '\0')
493
			regc('-');
494
		    else {
495
			class = UCHARAT(regparse - 2) + 1;
496
			classend = UCHARAT(regparse);
497
			if (class > classend + 1)
498
			    FAIL("invalid [] range");
499
			for (; class <= classend; class++)
500
			    regc(class);
501
			regparse++;
502
		    }
503
		} else
504
		    regc(*regparse++);
505
	    }
506
	    regc('\0');
507
	    if (*regparse != ']')
508
		FAIL("unmatched []");
509
	    regparse++;
510
	    *flagp |= HASWIDTH | SIMPLE;
511
	}
512
	break;
513
    case '(':
514
	ret = reg(1, &flags);
515
	if (ret == (char *)NULL)
516
	    return ((char *)NULL);
517
	*flagp |= flags & (HASWIDTH | SPSTART);
518
	break;
519
    case '\0':
520
    case '|':
521
    case ')':
522
	FAIL("internal urp");	/* Supposed to be caught earlier. */
523
	break;
524
    case '*':
525
	FAIL("* follows nothing");
526
	break;
527
    case '\\':
528
	if (*regparse == '\0')
529
	    FAIL("trailing \\");
530
	ret = regnode(EXACTLY);
531
	regc(*regparse++);
532
	regc('\0');
533
	*flagp |= HASWIDTH | SIMPLE;
534
	break;
535
    default:{
536
	    register int    len;
537
	    register char   ender;
538
 
539
	    regparse--;
540
	    len = strcspn(regparse, META);
541
	    if (len <= 0)
542
		FAIL("internal disaster");
543
	    ender = *(regparse + len);
544
	    if (len > 1 && ISMULT(ender))
545
		len--;		/* Back off clear of * operand. */
546
	    *flagp |= HASWIDTH;
547
	    if (len == 1)
548
		*flagp |= SIMPLE;
549
	    ret = regnode(EXACTLY);
550
	    while (len > 0) {
551
		regc(*regparse++);
552
		len--;
553
	    }
554
	    regc('\0');
555
	}
556
	break;
557
    }
558
 
559
    return (ret);
560
}
561
 
562
/*
563
 - regnode - emit a node
564
 */
565
static char *regnode(op)
566
char            op;
567
{
568
    register char  *ret;
569
    register char  *ptr;
570
 
571
    ret = regcode;
572
    if (ret == &regdummy) {
573
	regsize += 3;
574
	return (ret);
575
    }
576
    ptr = ret;
577
    *ptr++ = op;
578
    *ptr++ = '\0';		/* Null "nxt" pointer. */
579
    *ptr++ = '\0';
580
    regcode = ptr;
581
 
582
    return (ret);
583
}
584
 
585
/*
586
 - regc - emit (if appropriate) a byte of code
587
 */
588
static void regc(b)
589
char            b;
590
{
591
    if (regcode != &regdummy)
592
	*regcode++ = b;
593
    else
594
	regsize++;
595
}
596
 
597
/*
598
 - reginsert - insert an operator in front of already-emitted operand
599
 *
600
 * Means relocating the operand.
601
 */
602
static void reginsert(op, opnd)
603
char            op;
604
char           *opnd;
605
{
606
    register char  *src;
607
    register char  *dst;
608
    register char  *place;
609
 
610
    if (regcode == &regdummy) {
611
	regsize += 3;
612
	return;
613
    }
614
    src = regcode;
615
    regcode += 3;
616
    dst = regcode;
617
    while (src > opnd)
618
	*--dst = *--src;
619
 
620
    place = opnd;		/* Op node, where operand used to be. */
621
    *place++ = op;
622
    *place++ = '\0';
623
    *place++ = '\0';
624
}
625
 
626
/*
627
 - regtail - set the next-pointer at the end of a node chain
628
 */
629
static void regtail(p, val)
630
char           *p;
631
char           *val;
632
{
633
    register char  *scan;
634
    register char  *temp;
635
    register int    offset;
636
 
637
    if (p == &regdummy)
638
	return;
639
 
640
    /* Find last node. */
641
    scan = p;
642
    for (;;) {
643
	temp = regnext(scan);
644
	if (temp == (char *)NULL)
645
	    break;
646
	scan = temp;
647
    }
648
 
649
    if (OP(scan) == BACK)
650
	offset = scan - val;
651
    else
652
	offset = val - scan;
653
    *(scan + 1) = (offset >> 8) & 0377;
654
    *(scan + 2) = offset & 0377;
655
}
656
 
657
/*
658
 - regoptail - regtail on operand of first argument; nop if operandless
659
 */
660
static void regoptail(p, val)
661
char           *p;
662
char           *val;
663
{
664
    /* "Operandless" and "op != BRANCH" are synonymous in practice. */
665
    if (p == (char *)NULL || p == &regdummy || OP(p) != BRANCH)
666
	return;
667
    regtail(OPERAND(p), val);
668
}
669
 
670
/*
671
 * regexec and friends
672
 */
673
 
674
/*
675
 * Global work variables for regexec().
676
 */
677
static char    *reginput;	/* String-input pointer. */
678
static char    *regbol;		/* Beginning of input, for ^ check. */
679
static char   **regstartp;	/* Pointer to startp array. */
680
static char   **regendp;	/* Ditto for endp. */
681
 
682
/*
683
 * Forwards.
684
 */
685
STATIC int      regtry();
686
STATIC int      regmatch();
687
STATIC int      regrepeat();
688
 
689
#ifdef DEBUG
690
int             regnarrate = 0;
691
void            regdump();
692
STATIC char    *regprop();
693
#endif
694
 
695
/*
696
 - regexec - match a regexp against a string
697
 */
698
int regexec(prog, string)
699
register regexp *prog;
700
register char  *string;
701
{
702
    register char  *s;
703
 
704
    /* Be paranoid... */
705
    if (prog == (regexp *)NULL || string == (char *)NULL) {
706
	regerror("NULL parameter");
707
	return (0);
708
    }
709
    /* Check validity of program. */
710
    if (UCHARAT(prog->program) != MAGIC) {
711
	regerror("corrupted program");
712
	return (0);
713
    }
714
    /* If there is a "must appear" string, look for it. */
715
    if (prog->regmust != (char *)NULL) {
716
	s = string;
717
	while ((s = strchr(s, prog->regmust[0])) != (char *)NULL) {
718
	    if (strncmp(s, prog->regmust, prog->regmlen) == 0)
719
		break;		/* Found it. */
720
	    s++;
721
	}
722
	if (s == (char *)NULL)		/* Not present. */
723
	    return (0);
724
    }
725
    /* Mark beginning of line for ^ . */
726
    regbol = string;
727
 
728
    /* Simplest case:  anchored match need be tried only once. */
729
    if (prog->reganch)
730
	return (regtry(prog, string));
731
 
732
    /* Messy cases:  unanchored match. */
733
    s = string;
734
    if (prog->regstart != '\0')
735
	/* We know what char it must start with. */
736
	while ((s = strchr(s, prog->regstart)) != (char *)NULL) {
737
	    if (regtry(prog, s))
738
		return (1);
739
	    s++;
740
	}
741
    else
742
	/* We don't -- general case. */
743
	do {
744
	    if (regtry(prog, s))
745
		return (1);
746
	} while (*s++ != '\0');
747
 
748
    /* Failure. */
749
    return (0);
750
}
751
 
752
/*
753
 - regtry - try match at specific point
754
 */
755
#ifdef __STDC__
756
 
757
static int regtry(regexp *prog, char *string)
758
 
759
#else
760
 
761
static int regtry(prog, string)
762
regexp         *prog;
763
char           *string;
764
 
765
#endif
766
{
767
    register int    i;
768
    register char **sp;
769
    register char **ep;
770
 
771
    reginput = string;
772
    regstartp = prog->startp;
773
    regendp = prog->endp;
774
 
775
    sp = prog->startp;
776
    ep = prog->endp;
777
    for (i = NSUBEXP; i > 0; i--) {
778
	*sp++ = (char *)NULL;
779
	*ep++ = (char *)NULL;
780
    }
781
    if (regmatch(prog->program + 1)) {
782
	prog->startp[0] = string;
783
	prog->endp[0] = reginput;
784
	return (1);
785
    } else
786
	return (0);
787
}
788
 
789
/*
790
 - regmatch - main matching routine
791
 *
792
 * Conceptually the strategy is simple:  check to see whether the current
793
 * node matches, call self recursively to see whether the rest matches,
794
 * and then act accordingly.  In practice we make some effort to avoid
795
 * recursion, in particular by going through "ordinary" nodes (that don't
796
 * need to know whether the rest of the match failed) by a loop instead of
797
 * by recursion.
798
 */
799
#ifdef __STDC__
800
 
801
static int regmatch(char *prog)
802
 
803
#else
804
 
805
static int regmatch(prog)
806
char           *prog;
807
 
808
#endif
809
{
810
    register char  *scan;	/* Current node. */
811
    char           *nxt;	/* nxt node. */
812
 
813
    scan = prog;
814
#ifdef DEBUG
815
    if (scan != (char *)NULL && regnarrate)
816
	fprintf(stderr, "%s(\n", regprop(scan));
817
#endif
818
    while (scan != (char *)NULL) {
819
#ifdef DEBUG
820
	if (regnarrate)
821
	    fprintf(stderr, "%s...\n", regprop(scan));
822
#endif
823
	nxt = regnext(scan);
824
 
825
	switch (OP(scan)) {
826
	case BOL:
827
	    if (reginput != regbol)
828
		return (0);
829
	    break;
830
	case EOL:
831
	    if (*reginput != '\0')
832
		return (0);
833
	    break;
834
	case ANY:
835
	    if (*reginput == '\0')
836
		return (0);
837
	    reginput++;
838
	    break;
839
	case EXACTLY:{
840
		register int    len;
841
		register char  *opnd;
842
 
843
		opnd = OPERAND(scan);
844
		/* Inline the first character, for speed. */
845
		if (*opnd != *reginput)
846
		    return (0);
847
		len = strlen(opnd);
848
		if (len > 1 && strncmp(opnd, reginput, len) != 0)
849
		    return (0);
850
		reginput += len;
851
	    }
852
	    break;
853
	case ANYOF:
854
	    if (*reginput == '\0' || 
855
		 strchr(OPERAND(scan), *reginput) == (char *)NULL)
856
		return (0);
857
	    reginput++;
858
	    break;
859
	case ANYBUT:
860
	    if (*reginput == '\0' || 
861
		 strchr(OPERAND(scan), *reginput) != (char *)NULL)
862
		return (0);
863
	    reginput++;
864
	    break;
865
	case NOTHING:
866
	    break;
867
	case BACK:
868
	    break;
869
	case OPEN + 1:
870
	case OPEN + 2:
871
	case OPEN + 3:
872
	case OPEN + 4:
873
	case OPEN + 5:
874
	case OPEN + 6:
875
	case OPEN + 7:
876
	case OPEN + 8:
877
	case OPEN + 9:{
878
		register int    no;
879
		register char  *save;
880
 
881
		no = OP(scan) - OPEN;
882
		save = reginput;
883
 
884
		if (regmatch(nxt)) {
885
		    /*
886
		     * Don't set startp if some later invocation of the same
887
		     * parentheses already has. 
888
		     */
889
		    if (regstartp[no] == (char *)NULL)
890
			regstartp[no] = save;
891
		    return (1);
892
		} else
893
		    return (0);
894
	    }
895
	    break;
896
	case CLOSE + 1:
897
	case CLOSE + 2:
898
	case CLOSE + 3:
899
	case CLOSE + 4:
900
	case CLOSE + 5:
901
	case CLOSE + 6:
902
	case CLOSE + 7:
903
	case CLOSE + 8:
904
	case CLOSE + 9:{
905
		register int    no;
906
		register char  *save;
907
 
908
		no = OP(scan) - CLOSE;
909
		save = reginput;
910
 
911
		if (regmatch(nxt)) {
912
		    /*
913
		     * Don't set endp if some later invocation of the same
914
		     * parentheses already has. 
915
		     */
916
		    if (regendp[no] == (char *)NULL)
917
			regendp[no] = save;
918
		    return (1);
919
		} else
920
		    return (0);
921
	    }
922
	    break;
923
	case BRANCH:{
924
		register char  *save;
925
 
926
		if (OP(nxt) != BRANCH)	/* No choice. */
927
		    nxt = OPERAND(scan);	/* Avoid recursion. */
928
		else {
929
		    do {
930
			save = reginput;
931
			if (regmatch(OPERAND(scan)))
932
			    return (1);
933
			reginput = save;
934
			scan = regnext(scan);
935
		    } while (scan != (char *)NULL && OP(scan) == BRANCH);
936
		    return (0);
937
		    /* NOTREACHED */
938
		}
939
	    }
940
	    break;
941
	case STAR:{
942
		register char   nextch;
943
		register int    no;
944
		register char  *save;
945
		register int    minimum;
946
 
947
		/*
948
		 * Lookahead to avoid useless match attempts when we know
949
		 * what character comes next. 
950
		 */
951
		nextch = '\0';
952
		if (OP(nxt) == EXACTLY)
953
		    nextch = *OPERAND(nxt);
954
		minimum = (OP(scan) == STAR) ? 0 : 1;
955
		save = reginput;
956
		no = regrepeat(OPERAND(scan));
957
		while (no >= minimum) {
958
		    /* If it could work, try it. */
959
		    if (nextch == '\0' || *reginput == nextch)
960
			if (regmatch(nxt))
961
			    return (1);
962
		    /* Couldn't or didn't -- back up. */
963
		    no--;
964
		    reginput = save + no;
965
		}
966
		return (0);
967
	    }
968
	    break;
969
	case END:
970
	    return (1);		/* Success! */
971
	    break;
972
	default:
973
	    regerror("memory corruption");
974
	    return (0);
975
	    break;
976
	}
977
 
978
	scan = nxt;
979
    }
980
 
981
    /*
982
     * We get here only if there's trouble -- normally "case END" is the
983
     * terminating point. 
984
     */
985
    regerror("corrupted pointers");
986
    return (0);
987
}
988
 
989
/*
990
 - regrepeat - repeatedly match something simple, report how many
991
 */
992
#ifdef __STDC__
993
 
994
static int regrepeat(char *p)
995
 
996
#else
997
 
998
static int regrepeat(p)
999
char           *p;
1000
 
1001
#endif
1002
{
1003
    register int    count = 0;
1004
    register char  *scan;
1005
    register char  *opnd;
1006
 
1007
    scan = reginput;
1008
    opnd = OPERAND(p);
1009
    switch (OP(p)) {
1010
    case ANY:
1011
	count = strlen(scan);
1012
	scan += count;
1013
	break;
1014
    case EXACTLY:
1015
	while (*opnd == *scan) {
1016
	    count++;
1017
	    scan++;
1018
	}
1019
	break;
1020
    case ANYOF:
1021
	while (*scan != '\0' && strchr(opnd, *scan) != (char *)NULL) {
1022
	    count++;
1023
	    scan++;
1024
	}
1025
	break;
1026
    case ANYBUT:
1027
	while (*scan != '\0' && strchr(opnd, *scan) == (char *)NULL) {
1028
	    count++;
1029
	    scan++;
1030
	}
1031
	break;
1032
    default:			/* Oh dear.  Called inappropriately. */
1033
	regerror("internal foulup");
1034
	count = 0;		/* Best compromise. */
1035
	break;
1036
    }
1037
    reginput = scan;
1038
 
1039
    return (count);
1040
}
1041
 
1042
 
1043
/*
1044
 - regnext - dig the "nxt" pointer out of a node
1045
 */
1046
#ifdef __STDC__
1047
 
1048
static char *regnext(register char *p)
1049
 
1050
#else
1051
 
1052
static char *regnext(p)
1053
register char  *p;
1054
 
1055
#endif
1056
{
1057
    register int    offset;
1058
 
1059
    if (p == &regdummy)
1060
	return ((char *)NULL);
1061
 
1062
    offset = NEXT(p);
1063
    if (offset == 0)
1064
	return ((char *)NULL);
1065
 
1066
    if (OP(p) == BACK)
1067
	return (p - offset);
1068
    else
1069
	return (p + offset);
1070
}
1071
 
1072
#ifdef DEBUG
1073
 
1074
STATIC char    *regprop();
1075
 
1076
/*
1077
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1078
 */
1079
#ifdef __STDC__
1080
 
1081
void regdump(regexp *r)
1082
 
1083
#else
1084
 
1085
void regdump(r)
1086
regexp         *r;
1087
 
1088
#endif
1089
{
1090
    register char  *s;
1091
    register char   op = EXACTLY;	/* Arbitrary non-END op. */
1092
    register char  *nxt;
1093
    extern char    *strchr();
1094
 
1095
 
1096
    s = r->program + 1;
1097
    while (op != END) {		/* While that wasn't END last time... */
1098
	op = OP(s);
1099
	printf("%2d%s", s - r->program, regprop(s));	/* Where, what. */
1100
	nxt = regnext(s);
1101
	if (nxt == (char *)NULL)	/* nxt ptr. */
1102
	    printf("(0)");
1103
	else
1104
	    printf("(%d)", (s - r->program) + (nxt - s));
1105
	s += 3;
1106
	if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1107
	    /* Literal string, where present. */
1108
	    while (*s != '\0') {
1109
		putchar(*s);
1110
		s++;
1111
	    }
1112
	    s++;
1113
	}
1114
	putchar('\n');
1115
    }
1116
 
1117
    /* Header fields of interest. */
1118
    if (r->regstart != '\0')
1119
	printf("start `%c' ", r->regstart);
1120
    if (r->reganch)
1121
	printf("anchored ");
1122
    if (r->regmust != (char *)NULL)
1123
	printf("must have \"%s\"", r->regmust);
1124
    printf("\n");
1125
}
1126
 
1127
/*
1128
 - regprop - printable representation of opcode
1129
 */
1130
#ifdef __STDC__
1131
 
1132
static char *regprop(char *op)
1133
 
1134
#else
1135
 
1136
static char *regprop(op)
1137
char           *op;
1138
 
1139
#endif
1140
{
1141
    register char  *p;
1142
    static char     buf[50];
1143
 
1144
    strcpy(buf, ":");
1145
 
1146
    switch (OP(op)) {
1147
    case BOL:
1148
	p = "BOL";
1149
	break;
1150
    case EOL:
1151
	p = "EOL";
1152
	break;
1153
    case ANY:
1154
	p = "ANY";
1155
	break;
1156
    case ANYOF:
1157
	p = "ANYOF";
1158
	break;
1159
    case ANYBUT:
1160
	p = "ANYBUT";
1161
	break;
1162
    case BRANCH:
1163
	p = "BRANCH";
1164
	break;
1165
    case EXACTLY:
1166
	p = "EXACTLY";
1167
	break;
1168
    case NOTHING:
1169
	p = "NOTHING";
1170
	break;
1171
    case BACK:
1172
	p = "BACK";
1173
	break;
1174
    case END:
1175
	p = "END";
1176
	break;
1177
    case OPEN + 1:
1178
    case OPEN + 2:
1179
    case OPEN + 3:
1180
    case OPEN + 4:
1181
    case OPEN + 5:
1182
    case OPEN + 6:
1183
    case OPEN + 7:
1184
    case OPEN + 8:
1185
    case OPEN + 9:
1186
	sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1187
	p = (char *)NULL;
1188
	break;
1189
    case CLOSE + 1:
1190
    case CLOSE + 2:
1191
    case CLOSE + 3:
1192
    case CLOSE + 4:
1193
    case CLOSE + 5:
1194
    case CLOSE + 6:
1195
    case CLOSE + 7:
1196
    case CLOSE + 8:
1197
    case CLOSE + 9:
1198
	sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1199
	p = (char *)NULL;
1200
	break;
1201
    case STAR:
1202
	p = "STAR";
1203
	break;
1204
    default:
1205
	regerror("corrupted opcode");
1206
	break;
1207
    }
1208
    if (p != (char *)NULL)
1209
	strcat(buf, p);
1210
    return (buf);
1211
}
1212
#endif
1213
 
1214
/*
1215
 * The following is provided for those people who do not have strcspn() in
1216
 * their C libraries.  They should get off their butts and do something
1217
 * about it; at least one public-domain implementation of those (highly
1218
 * useful) string routines has been published on Usenet.
1219
 */
1220
#ifdef STRCSPN
1221
/*
1222
 * strcspn - find length of initial segment of s1 consisting entirely
1223
 * of characters not from s2
1224
 */
1225
 
1226
#ifdef __STDC__
1227
 
1228
static int strcspn(char *s1, char *s2)
1229
 
1230
#else
1231
 
1232
static int strcspn(s1, s2)
1233
char           *s1;
1234
char           *s2;
1235
 
1236
#endif
1237
{
1238
    register char  *scan1;
1239
    register char  *scan2;
1240
    register int    count;
1241
 
1242
    count = 0;
1243
    for (scan1 = s1; *scan1 != '\0'; scan1++) {
1244
	for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1245
	    if (*scan1 == *scan2++)
1246
		return (count);
1247
	count++;
1248
    }
1249
    return (count);
1250
}
1251
#endif
1252
 
1253
 
1254
/*
1255
 - regsub - perform substitutions after a regexp match
1256
 */
1257
#ifdef __STDC__
1258
 
1259
void regsub(regexp *prog, char *source, char *dest)
1260
 
1261
#else
1262
 
1263
void regsub(prog, source, dest)
1264
regexp         *prog;
1265
char           *source;
1266
char           *dest;
1267
 
1268
#endif
1269
{
1270
    register char  *src;
1271
    register char  *dst;
1272
    register char   c;
1273
    register int    no;
1274
    register int    len;
1275
    extern char    *strncpy();
1276
 
1277
    if (prog == (regexp *)NULL || 
1278
	source == (char *)NULL || dest == (char *)NULL) {
1279
	regerror("NULL parm to regsub");
1280
	return;
1281
    }
1282
    if (UCHARAT(prog->program) != MAGIC) {
1283
	regerror("damaged regexp fed to regsub");
1284
	return;
1285
    }
1286
    src = source;
1287
    dst = dest;
1288
    while ((c = *src++) != '\0') {
1289
	if (c == '&')
1290
	    no = 0;
1291
	else if (c == '\\' && '0' <= *src && *src <= '9')
1292
	    no = *src++ - '0';
1293
	else
1294
	    no = -1;
1295
 
1296
	if (no < 0) {		/* Ordinary character. */
1297
	    if (c == '\\' && (*src == '\\' || *src == '&'))
1298
		c = *src++;
1299
	    *dst++ = c;
1300
	} else if (prog->startp[no] != (char *)NULL && 
1301
		   prog->endp[no] != (char *)NULL) {
1302
	    len = prog->endp[no] - prog->startp[no];
1303
	    strncpy(dst, prog->startp[no], len);
1304
	    dst += len;
1305
	    if (len != 0 && *(dst - 1) == '\0') {	/* strncpy hit NUL. */
1306
		regerror("damaged match string");
1307
		return;
1308
	    }
1309
	}
1310
    }
1311
    *dst++ = '\0';
1312
}
1313
 
1314
 
1315
#ifdef __STDC__
1316
 
1317
void regerror(char *s)
1318
 
1319
#else
1320
 
1321
void regerror(s)
1322
char           *s;
1323
 
1324
#endif
1325
{
1326
    fprintf(stderr, "regexp(3): %s", s);
1327
    exit(1);
1328
}