Subversion Repositories planix.SVN

Rev

Rev 2 | Blame | Compare with Previous | Last modification | View Log | RSS feed

/* From libregexp but leaks extra classes when it runs out */


#include <u.h>
#include <libc.h>
#include "regexp.h"
#include "/sys/src/libregexp/regcomp.h"

#define TRUE    1
#define FALSE   0

/*
 * Parser Information
 */
typedef
struct Node
{
        Reinst* first;
        Reinst* last;
}Node;

#define NSTACK  20
static  Node    andstack[NSTACK];
static  Node    *andp;
static  int     atorstack[NSTACK];
static  int*    atorp;
static  int     cursubid;               /* id of current subexpression */
static  int     subidstack[NSTACK];     /* parallel to atorstack */
static  int*    subidp;
static  int     lastwasand;     /* Last token was operand */
static  int     nbra;
static  char*   exprp;          /* pointer to next character in source expression */
static  int     lexdone;
static  int     nclass;
static  Reclass*classp;
static  Reinst* freep;
static  int     errors;
static  Rune    yyrune;         /* last lex'd rune */
static  Reclass*yyclassp;       /* last lex'd class */

/* predeclared crap */
static  void    operator(int);
static  void    pushand(Reinst*, Reinst*);
static  void    pushator(int);
static  void    evaluntil(int);
static  int     bldcclass(void);

static jmp_buf regkaboom;

static  void
rcerror(char *s)
{
        errors++;
        regerror(s);
        longjmp(regkaboom, 1);
}

static  Reinst*
newinst(int t)
{
        freep->type = t;
        freep->left = 0;
        freep->right = 0;
        return freep++;
}

static  void
operand(int t)
{
        Reinst *i;

        if(lastwasand)
                operator(CAT);  /* catenate is implicit */
        i = newinst(t);

        if(t == CCLASS || t == NCCLASS)
                i->cp = yyclassp;
        if(t == RUNE)
                i->r = yyrune;

        pushand(i, i);
        lastwasand = TRUE;
}

static  void
operator(int t)
{
        if(t==RBRA && --nbra<0)
                rcerror("unmatched right paren");
        if(t==LBRA){
                if(++cursubid >= NSUBEXP)
                        rcerror ("too many subexpressions");
                nbra++;
                if(lastwasand)
                        operator(CAT);
        } else
                evaluntil(t);
        if(t != RBRA)
                pushator(t);
        lastwasand = FALSE;
        if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
                lastwasand = TRUE;      /* these look like operands */
}

static  void
regerr2(char *s, int c)
{
        char buf[100];
        char *cp = buf;
        while(*s)
                *cp++ = *s++;
        *cp++ = c;
        *cp = '\0'; 
        rcerror(buf);
}

static  void
cant(char *s)
{
        char buf[100];
        strcpy(buf, "can't happen: ");
        strcat(buf, s);
        rcerror(buf);
}

static  void
pushand(Reinst *f, Reinst *l)
{
        if(andp >= &andstack[NSTACK])
                cant("operand stack overflow");
        andp->first = f;
        andp->last = l;
        andp++;
}

static  void
pushator(int t)
{
        if(atorp >= &atorstack[NSTACK])
                cant("operator stack overflow");
        *atorp++ = t;
        *subidp++ = cursubid;
}

static  Node*
popand(int op)
{
        Reinst *inst;

        if(andp <= &andstack[0]){
                regerr2("missing operand for ", op);
                inst = newinst(NOP);
                pushand(inst,inst);
        }
        return --andp;
}

static  int
popator(void)
{
        if(atorp <= &atorstack[0])
                cant("operator stack underflow");
        --subidp;
        return *--atorp;
}

static  void
evaluntil(int pri)
{
        Node *op1, *op2;
        Reinst *inst1, *inst2;

        while(pri==RBRA || atorp[-1]>=pri){
                switch(popator()){
                default:
                        rcerror("unknown operator in evaluntil");
                        break;
                case LBRA:              /* must have been RBRA */
                        op1 = popand('(');
                        inst2 = newinst(RBRA);
                        inst2->subid = *subidp;
                        op1->last->next = inst2;
                        inst1 = newinst(LBRA);
                        inst1->subid = *subidp;
                        inst1->next = op1->first;
                        pushand(inst1, inst2);
                        return;
                case OR:
                        op2 = popand('|');
                        op1 = popand('|');
                        inst2 = newinst(NOP);
                        op2->last->next = inst2;
                        op1->last->next = inst2;
                        inst1 = newinst(OR);
                        inst1->right = op1->first;
                        inst1->left = op2->first;
                        pushand(inst1, inst2);
                        break;
                case CAT:
                        op2 = popand(0);
                        op1 = popand(0);
                        op1->last->next = op2->first;
                        pushand(op1->first, op2->last);
                        break;
                case STAR:
                        op2 = popand('*');
                        inst1 = newinst(OR);
                        op2->last->next = inst1;
                        inst1->right = op2->first;
                        pushand(inst1, inst1);
                        break;
                case PLUS:
                        op2 = popand('+');
                        inst1 = newinst(OR);
                        op2->last->next = inst1;
                        inst1->right = op2->first;
                        pushand(op2->first, inst1);
                        break;
                case QUEST:
                        op2 = popand('?');
                        inst1 = newinst(OR);
                        inst2 = newinst(NOP);
                        inst1->left = inst2;
                        inst1->right = op2->first;
                        op2->last->next = inst2;
                        pushand(inst1, inst2);
                        break;
                }
        }
}

static  Reprog*
optimize(Reprog *pp)
{
        Reinst *inst, *target;
        int size;
        Reprog *npp;
        Reclass *cl;
        int diff;

        /*
         *  get rid of NOOP chains
         */
        for(inst=pp->firstinst; inst->type!=END; inst++){
                target = inst->next;
                while(target->type == NOP)
                        target = target->next;
                inst->next = target;
        }

        /*
         *  The original allocation is for an area larger than
         *  necessary.  Reallocate to the actual space used
         *  and then relocate the code.
         */
        size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
        npp = realloc(pp, size);
        if(npp==0 || npp==pp)
                return pp;
        diff = (char *)npp - (char *)pp;
        freep = (Reinst *)((char *)freep + diff);
        for(inst=npp->firstinst; inst<freep; inst++){
                switch(inst->type){
                case OR:
                case STAR:
                case PLUS:
                case QUEST:
                        *(char **)&inst->right += diff;
                        break;
                case CCLASS:
                case NCCLASS:
                        *(char **)&inst->right += diff;
                        cl = inst->cp;
                        *(char **)&cl->end += diff;
                        break;
                }
                *(char **)&inst->left += diff;
        }
        *(char **)&npp->startinst += diff;
        return npp;
}

#ifdef  DEBUG
static  void
dumpstack(void){
        Node *stk;
        int *ip;

        print("operators\n");
        for(ip=atorstack; ip<atorp; ip++)
                print("0%o\n", *ip);
        print("operands\n");
        for(stk=andstack; stk<andp; stk++)
                print("0%o\t0%o\n", stk->first->type, stk->last->type);
}

static  void
dump(Reprog *pp)
{
        Reinst *l;
        Rune *p;

        l = pp->firstinst;
        do{
                print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
                        l->left-pp->firstinst, l->right-pp->firstinst);
                if(l->type == RUNE)
                        print("\t%C\n", l->r);
                else if(l->type == CCLASS || l->type == NCCLASS){
                        print("\t[");
                        if(l->type == NCCLASS)
                                print("^");
                        for(p = l->cp->spans; p < l->cp->end; p += 2)
                                if(p[0] == p[1])
                                        print("%C", p[0]);
                                else
                                        print("%C-%C", p[0], p[1]);
                        print("]\n");
                } else
                        print("\n");
        }while(l++->type);
}
#endif

static  Reclass*
newclass(void)
{
        if(nclass <= 0){
                classp = mallocz(128*sizeof(Reclass), 1);
                if(classp == nil)
                        regerror("out of memory");
                nclass = 128;
        }
        return &classp[--nclass];
}

static  int
nextc(Rune *rp)
{
        if(lexdone){
                *rp = 0;
                return 1;
        }
        exprp += chartorune(rp, exprp);
        if(*rp == L'\\'){
                exprp += chartorune(rp, exprp);
                return 1;
        }
        if(*rp == 0)
                lexdone = 1;
        return 0;
}

static  int
lex(int literal, int dot_type)
{
        int quoted;

        quoted = nextc(&yyrune);
        if(literal || quoted){
                if(yyrune == 0)
                        return END;
                return RUNE;
        }

        switch(yyrune){
        case 0:
                return END;
        case L'*':
                return STAR;
        case L'?':
                return QUEST;
        case L'+':
                return PLUS;
        case L'|':
                return OR;
        case L'.':
                return dot_type;
        case L'(':
                return LBRA;
        case L')':
                return RBRA;
        case L'^':
                return BOL;
        case L'$':
                return EOL;
        case L'[':
                return bldcclass();
        }
        return RUNE;
}

static int
bldcclass(void)
{
        int type;
        Rune r[NCCRUNE];
        Rune *p, *ep, *np;
        Rune rune;
        int quoted;

        /* we have already seen the '[' */
        type = CCLASS;
        yyclassp = newclass();

        /* look ahead for negation */
        /* SPECIAL CASE!!! negated classes don't match \n */
        ep = r;
        quoted = nextc(&rune);
        if(!quoted && rune == L'^'){
                type = NCCLASS;
                quoted = nextc(&rune);
                *ep++ = L'\n';
                *ep++ = L'\n';
        }

        /* parse class into a set of spans */
        for(; ep<&r[NCCRUNE];){
                if(rune == 0){
                        rcerror("malformed '[]'");
                        return 0;
                }
                if(!quoted && rune == L']')
                        break;
                if(!quoted && rune == L'-'){
                        if(ep == r){
                                rcerror("malformed '[]'");
                                return 0;
                        }
                        quoted = nextc(&rune);
                        if((!quoted && rune == L']') || rune == 0){
                                rcerror("malformed '[]'");
                                return 0;
                        }
                        *(ep-1) = rune;
                } else {
                        *ep++ = rune;
                        *ep++ = rune;
                }
                quoted = nextc(&rune);
        }

        /* sort on span start */
        for(p = r; p < ep; p += 2){
                for(np = p; np < ep; np += 2)
                        if(*np < *p){
                                rune = np[0];
                                np[0] = p[0];
                                p[0] = rune;
                                rune = np[1];
                                np[1] = p[1];
                                p[1] = rune;
                        }
        }

        /* merge spans */
        np = yyclassp->spans;
        p = r;
        if(r == ep)
                yyclassp->end = np;
        else {
                np[0] = *p++;
                np[1] = *p++;
                for(; p < ep; p += 2)
                        if(p[0] <= np[1]){
                                if(p[1] > np[1])
                                        np[1] = p[1];
                        } else {
                                np += 2;
                                np[0] = p[0];
                                np[1] = p[1];
                        }
                yyclassp->end = np+2;
        }

        return type;
}

static  Reprog*
regcomp1(char *s, int literal, int dot_type)
{
        int token;
        Reprog *pp;

        /* get memory for the program */
        pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
        if(pp == 0){
                regerror("out of memory");
                return 0;
        }
        freep = pp->firstinst;
        classp = pp->class;
        errors = 0;

        if(setjmp(regkaboom))
                goto out;

        /* go compile the sucker */
        lexdone = 0;
        exprp = s;
        nclass = NCLASS;
        nbra = 0;
        atorp = atorstack;
        andp = andstack;
        subidp = subidstack;
        lastwasand = FALSE;
        cursubid = 0;

        /* Start with a low priority operator to prime parser */
        pushator(START-1);
        while((token = lex(literal, dot_type)) != END){
                if((token&0300) == OPERATOR)
                        operator(token);
                else
                        operand(token);
        }

        /* Close with a low priority operator */
        evaluntil(START);

        /* Force END */
        operand(END);
        evaluntil(START);
#ifdef DEBUG
        dumpstack();
#endif
        if(nbra)
                rcerror("unmatched left paren");
        --andp; /* points to first and only operand */
        pp->startinst = andp->first;
#ifdef DEBUG
        dump(pp);
#endif
        pp = optimize(pp);
#ifdef DEBUG
        print("start: %d\n", andp->first-pp->firstinst);
        dump(pp);
#endif
out:
        if(errors){
                free(pp);
                pp = 0;
        }
        return pp;
}

extern  Reprog*
regcomp(char *s)
{
        return regcomp1(s, 0, ANY);
}

extern  Reprog*
regcomplit(char *s)
{
        return regcomp1(s, 1, ANY);
}

extern  Reprog*
regcompnl(char *s)
{
        return regcomp1(s, 0, ANYNL);
}