Subversion Repositories planix.SVN

Rev

Blame | Last modification | View Log | RSS feed

#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include "dfa.h"

/***
 * Regular expressions for matching: patterns for text to ignore,
 * and the pattern for a keyword.
 */

/*
 * Patterns for input that should be ignored.  buildre joins them with '|'
 * (after passing each through strcpycase) into one regular expression.
 *
 * Tabs are always spelled \t: a literal tab inside a string is invisible
 * and is silently turned into spaces by tab expansion, which changes
 * what the pattern matches.
 *
 * strcpycase does not case-fold inside [...], so character classes that
 * must be case-insensitive list both cases explicitly.
 */
char *ignore[] =
{
        /* HTML that isn't A, IMG, or FONT */
        /* Must have a space somewhere to avoid catching <email@address> */
        "<[ \t\n\r]*("
                "[^aAiIfF]|"
                "a[^> \t\r\n]|"
                "i[^mM \t\r\n]|"
                "im[^gG \t\r\n]|"
                "img[^> \t\r\n]|"
                "f[^oO \t\r\n]|"
                "fo[^Nn \t\r\n]|"
                "fon[^tT \t\r\n]|"
                "font[^> \r\t\n]"
        ")[^>]*[ \t\n\r][^>]*>",
        /* tags that are a proper prefix of IMG or FONT, followed by white space */
        "<[ \t\n\r]*("
                "i|im|f|fo|fon"
        ")[ \t\r\n][^>]*>",

        /* ignore html comments */
        "<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",

        /* random mail strings (header lines, including white-space-indented continuation lines) */
        "^message-id:.*\n([ \t].*\n)*",
        "^in-reply-to:.*\n([ \t].*\n)*",
        "^references:.*\n([ \t].*\n)*",
        "^date:.*\n([ \t].*\n)*",
        "^delivery-date:.*\n([ \t].*\n)*",
        "e?smtp id .*",
        "^\tid.*",
        "boundary=.*",
        "name=\"",
        "filename=\"",
        "news:<[^>]+>",
        "^--[^ \t]*$",

        /* base64 encoding */
        "^[0-9a-zA-Z+\\-=/]+$",

        /* uu encoding */
        "^[!-Z]+$",

        /* little things: any single character or newline not claimed above */
        ".",
        "\n",
};

/*
 * Patterns for keywords.  buildre joins them with '|' (after passing
 * each through strcpycase) into one regular expression.
 *
 * The single pattern matches a run of one or more items, where an item is
 * either one character from the class (ASCII letters, ', `, $, !, and the
 * range starting at ¡) or a digit followed by any number of [.,]digit pairs.
 */
char *keywords[] =
{
        "([a-zA-Z'`$!¡-￿]|[0-9]([.,][0-9])*)+",
};

/* set to 1 by the -d option; when set, dregcomp prints each pattern before compiling it */
int debug;

/*
 * Compile the regular expression text in buf into a Dreprog.
 *
 * The text is first compiled with regcomp and the result converted with
 * dregcvt; the intermediate Reprog is freed before returning.  If either
 * step returns nil the program terminates through sysfatal.  When the
 * global debug flag is set, the pattern is printed first.
 */
Dreprog*
dregcomp(char *buf)
{
        Reprog *prog;
        Dreprog *dfa;

        if(debug)
                print(">>> '%s'\n", buf);

        if((prog = regcomp(buf)) == nil)
                sysfatal("regcomp");
        if((dfa = dregcvt(prog)) == nil)
                sysfatal("dregcomp");

        /* only the converted program is handed back to the caller */
        free(prog);
        return dfa;
}

/*
 * Copy the regular expression s to d, rewriting every unescaped
 * lower-case ASCII letter that lies outside a [...] character class
 * as the two-letter class [xX], so that the copy matches either case.
 * All other bytes are copied unchanged.
 *
 * A backslash and the character it escapes are copied verbatim: an
 * escaped '[' or ']' neither opens nor closes a class, and an escaped
 * letter is not expanded (expanding it would leave the backslash
 * escaping the newly inserted '[').
 *
 * Within a class a further '[' is an ordinary member, and a ']' seen
 * outside any class does not disturb the class tracking.
 *
 * d must have room for 4 bytes per byte of s.  The output is NOT
 * NUL-terminated; the return value points just past the last byte
 * written so that the caller can continue appending.
 */
char*
strcpycase(char *d, char *s)
{
        int inclass, esc;

        inclass = 0;
        esc = 0;
        for(; *s; s++){
                if(esc){
                        /* escaped character: literal, never special */
                        esc = 0;
                        *d++ = *s;
                        continue;
                }
                if(*s == '\\'){
                        esc = 1;
                        *d++ = *s;
                        continue;
                }
                if(*s == '[')
                        inclass = 1;
                else if(*s == ']')
                        inclass = 0;
                if(!inclass && 'a' <= *s && *s <= 'z'){
                        *d++ = '[';
                        *d++ = *s;
                        *d++ = *s+'A'-'a';
                        *d++ = ']';
                }else
                        *d++ = *s;
        }
        return d;
}

/*
 * Report a regular expression error: terminates the program through
 * sysfatal with msg appended to "regerror: ".
 * NOTE(review): presumably this is the error hook the regexp library
 * (<regexp.h>) calls when regcomp rejects a pattern -- confirm.
 */
void
regerror(char *msg)
{
        sysfatal("regerror: %s", msg);
}

/*
 * Build in buf (capacity nbuf bytes) the alternation
 * pat[0]|pat[1]|...|pat[n-1], passing each pattern through strcpycase,
 * and NUL-terminate the result.  Terminates the program through
 * sysfatal rather than write past the end of buf.
 */
static void
joinre(char *buf, long nbuf, char **pat, int n)
{
        int i;
        long need;
        char *s;

        s = buf;
        for(i=0; i<n; i++){
                /*
                 * strcpycase emits at most 4 bytes per input byte;
                 * allow one more for the '|' separator and one for
                 * the terminating NUL.
                 */
                need = 4*strlen(pat[i]) + 2;
                if(need > nbuf - (s - buf))
                        sysfatal("buildre: regular expression too long");
                if(i != 0)
                        *s++ = '|';
                s = strcpycase(s, pat[i]);
        }
        *s = 0;
}

/*
 * Compile the three regular expressions and store them in re:
 *      re[0]   the fixed pattern "^From "
 *      re[1]   the alternation of everything in keywords[]
 *      re[2]   the alternation of everything in ignore[]
 * Any compilation failure is fatal (see dregcomp).
 */
void
buildre(Dreprog *re[3])
{
        /* static: too large to keep comfortably on the stack; reused for both tables */
        static char buf[16384];

        re[0] = dregcomp("^From ");

        joinre(buf, sizeof buf, keywords, nelem(keywords));
        re[1] = dregcomp(buf);

        joinre(buf, sizeof buf, ignore, nelem(ignore));
        re[2] = dregcomp(buf);
}

/*
 * Print the usage line on file descriptor 2 and exit with status "usage".
 */
void
usage(void)
{
        fprint(2, "usage: regen [-d]\n");
        exits("usage");
}

/*
 * regen [-d]
 *
 * Builds the three regular expressions with buildre and writes each one
 * with Bprintdfa to a Biobuf opened for writing on file descriptor 1.
 * -d sets the debug flag; any other option, or any remaining argument,
 * prints the usage message and exits.
 */
void
main(int argc, char **argv)
{
        Biobuf out;
        Dreprog *re[3];

        ARGBEGIN{
        case 'd':
                debug = 1;
                break;
        default:
                usage();
        }ARGEND

        if(argc != 0)
                usage();

        buildre(re);

        Binit(&out, 1, OWRITE);
        Bprintdfa(&out, re[0]);
        Bprintdfa(&out, re[1]);
        Bprintdfa(&out, re[2]);
        exits(0);
}

        

Generated by GNU Enscript 1.6.6.