/*
 * Subversion Repositories planix.SVN
 *
 * Rev
 *
 * Rev 2 | Blame | Compare with Previous | Last modification | View Log | RSS feed
 */

#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include "spam.h"

/*
 * Sizing constants used by the canonicalizer below.
 */
enum {
        Quanta  = 8*1024,       /* bytes requested per Bread while hunting for end of header */
        Minbody = 6000,         /* readmsg tries to have at least this much body */
        HdrMax  = 15,           /* scratch buffer size in endofhdr */
};

/*
 * A literal word and the number of leading characters that
 * isword() compares against candidate text.
 */
typedef struct word Word;
struct word {
        char    *string;        /* text to look for; 0 ends a table */
        int     n;              /* number of characters of string to compare */
};

/*
 * Maps an action name from the pattern file to its action code.
 */
typedef struct keyword Keyword;
struct keyword {
        char    *string;        /* action name; 0 ends the table */
        int     value;          /* code returned by findkey() */
};

Word    htmlcmds[] =
{
        "html",         4,
        "!doctype html", 13,
        0,

};

Word    hrefs[] =
{
        "a href=",      7,
        "a title=",     8,
        "a target=",    9,
        "base href=",   10,
        "img src=",     8,
        "img border=",  11,
        "form action=", 12,
        "!--",          3,
        0,

};

/*
 *      RFC822 header keywords to look for when detecting a fractured header.
 *      all lengths must be less than HdrMax defined above.
 */
Word    hdrwords[] =
{
        "cc:",                  3,
        "bcc:",                 4,
        "to:",                  3,
        0,                      0,

};

Keyword keywords[] =
{
        "header",       HoldHeader,
        "line",         SaveLine,
        "hold",         Hold,
        "dump",         Dump,
        "loff",         Lineoff,
        0,              Nactions,
};

/*
 * Per-action pattern tables, indexed by action code.
 * The first member of each entry is a label string; the other
 * two start out as 0.  parsepats() later fills in the .strings
 * and .regexps members of patterns[action].
 * NOTE(review): Patterns is declared outside this file's visible
 * text; the order of its members is assumed from this initializer.
 */
Patterns patterns[] = {
[Dump]          { "DUMP:", 0, 0 },
[HoldHeader]    { "HEADER:", 0, 0 },
[Hold]          { "HOLD:", 0, 0 },
[SaveLine]      { "LINE:", 0, 0 },
[Lineoff]       { "LINEOFF:", 0, 0 },
[Nactions]      { 0, 0, 0 },
};

/*
 * Forward declarations for file-local helpers that are
 * used before their definitions.
 */
static char*    endofhdr(char*, char*);
static  int     escape(char**);
static  int     extract(char*);
static  int     findkey(char*);
static  int     hash(int);
static  int     isword(Word*, char*, int);
static  void    parsealt(Biobuf*, char*, Spat**);

/*
 *      The canonicalizer: convert input to canonical representation
 */
/*
 * Read a message from bp into a freshly allocated, NUL-terminated
 * buffer.
 *
 * When hsize is non-nil, input is read in Quanta-sized chunks until
 * endofhdr() finds the end of the header, then more is read so that
 * at least Minbody bytes of body are present (if available).
 * When hsize is nil, no header search is done: the first chunk is
 * read and topped up to Minbody bytes.
 *
 * On return *hsize (if non-nil) is the offset of the first body byte
 * and *bufsize (if non-nil) is the number of bytes in the buffer.
 * If the header grows to Maxread bytes without an end being found,
 * everything read so far is reported as header.
 *
 * Returns the buffer, which the caller must free, or 0 on a read
 * error or when the input is empty.
 */
char*
readmsg(Biobuf *bp, int *hsize, int *bufsize)
{
        char *p, *buf;
        int n, offset, eoh, bsize, delta;

        buf = 0;
        offset = 0;
        if(bufsize)
                *bufsize = 0;
        if(hsize)
                *hsize = 0;
        for(;;) {
                buf = Realloc(buf, offset+Quanta+1);
                n = Bread(bp, buf+offset, Quanta);
                if(n < 0){
                        free(buf);
                        return 0;
                }
                p = buf+offset;                 /* start of this chunk */
                offset += n;                    /* end of this chunk */
                buf[offset] = 0;
                if(n == 0){
                        if(offset == 0){
                                /* empty input: release the chunk we just allocated */
                                free(buf);
                                return 0;
                        }
                        /* EOF with no end of header: p == buf+offset, all header */
                        break;
                }

                if(hsize == 0)                  /* don't process header */
                        break;
                if(p != buf && p[-1] == '\n')   /* check for EOH across buffer split */
                        p--;
                p = endofhdr(p, buf+offset);
                if(p)
                        break;
                if(offset >= Maxread)           /* gargantuan header - just punt*/
                {
                        if(hsize)
                                *hsize = offset;
                        if(bufsize)
                                *bufsize = offset;
                        return buf;
                }
        }
        eoh = p-buf;                            /* End of header */
        bsize = offset - eoh;                   /* amount of body already read */

                /* Read at least Minbody bytes of the body */
        if (bsize < Minbody){
                delta = Minbody-bsize;
                buf = Realloc(buf, offset+delta+1);
                n = Bread(bp, buf+offset, delta);
                /* a failed or empty read here is tolerated: keep what we have */
                if(n > 0) {
                        offset += n;
                        buf[offset] = 0;
                }
        }
        if(hsize)
                *hsize = eoh;
        if(bufsize)
                *bufsize = offset;
        return buf;
}

/*
 * Report whether text (len characters, not necessarily
 * NUL-terminated) begins with any word in the table wp.
 * The table ends at the first entry whose string is 0.
 * Returns 1 on a match, 0 otherwise.
 */
static  int
isword(Word *wp, char *text, int len)
{
        Word *w;

        for(w = wp; w->string != 0; w++){
                if(w->n > len)
                        continue;       /* text too short for this word */
                if(strncmp(text, w->string, w->n) == 0)
                        return 1;
        }
        return 0;
}

/*
 * Search [raw, end) for the blank line that ends the header.
 * A blank line followed by one of the hdrwords ("cc:", "bcc:",
 * "to:") is treated as a fractured header and skipped.
 * Returns a pointer to the first character after the blank
 * line, or 0 if no end of header was found.
 *
 * The scan reads one byte beyond the pair of newlines and stops
 * copying at a NUL, so the caller's buffer is expected to be
 * NUL-terminated at end (readmsg does this).
 */
static char*
endofhdr(char *raw, char *end)
{
        char word[HdrMax];
        char *s, *t;
        int len;

        /*
         * strchr can't be used to find the newlines because
         * the text may contain embedded NULs.
         */
        for(s = raw; s < end; s++){
                if(s[0] == '\n' && s[1] == '\n'){
                        s++;
                        /* lower-case what follows, up to ':' or newline */
                        len = 0;
                        for(t = s+1; len < sizeof(word) && *t; t++){
                                word[len++] = tolower(*t);
                                if(*t == ':' || *t == '\n')
                                        break;
                        }
                        if(!isword(hdrwords, word, len))
                                return s+1;
                }
        }
        return 0;
}

/*
 * Canonicalize the tag text starting at text (just after '<')
 * and test it against the word table wp.
 *
 * Characters are gathered up to '>' (or end, or until the local
 * buffer is full): '=' sequences go through escape(), NUL and
 * '\r' are dropped, runs of blank/tab/newline become one space,
 * and everything else is lower-cased.
 *
 * If n is non-nil, *n receives the number of input characters
 * consumed, including the '>' when one was found.
 * Returns the result of isword() on the canonical text.
 */
static  int
htmlmatch(Word *wp, char *text, char *end, int *n)
{
        char tag[MaxHtml];
        char *s;
        int len, ch, prev, done;

        len = 0;
        prev = 0;
        done = 0;
        s = text;
        while(!done && s < end && len < sizeof(tag)-1){
                ch = *s++;
                if(ch == '=')
                        ch = escape(&s);
                if(ch == 0 || ch == '\r')
                        continue;
                if(ch == '>'){
                        done = 1;       /* end of tag */
                        continue;
                }
                if(ch == '\n' || ch == ' ' || ch == '\t'){
                        if(prev == ' ')
                                continue;       /* squeeze white space */
                        ch = ' ';
                } else
                        ch = tolower(ch);
                tag[len++] = prev = ch;
        }
        tag[len] = 0;
        if(n)
                *n = s-text;
        return isword(wp, tag, len);
}

/*
 * Interpret the text following an '=' (quoted-printable style).
 * *msg points just past the '='.
 *
 *      "=\n"   soft line break: return the character after the
 *              newline and step past it
 *      "=2e"   '.'     "=2f"   '/'     "=20"   ' '
 *      "=3d"   '='
 *      anything else: return '=' and consume nothing
 *
 * The hex letter is matched case-insensitively.  *msg is
 * advanced past whatever was consumed.
 */
static int
escape(char **msg)
{
        char *s;
        int result;

        s = *msg;
        result = '=';
        switch(*s){
        case '\n':
                result = s[1];
                s += 2;
                break;
        case '2':
                switch(tolower(s[1])){
                case 'e':
                        result = '.';
                        s += 2;
                        break;
                case 'f':
                        result = '/';
                        s += 2;
                        break;
                case '0':
                        result = ' ';
                        s += 2;
                        break;
                default:
                        break;
                }
                break;
        case '3':
                if(tolower(s[1]) == 'd')
                        s += 2;
                break;
        default:
                break;
        }
        *msg = s;
        return result;
}

/*
 * Decide what to do with an HTML-looking tag.  *msg points just
 * past a '<'.
 *
 * Until an htmlcmds tag has been seen, tags are copied unless
 * they start with '!' (comments), which are skipped.  After
 * that, only tags listed in hrefs are copied; all others are
 * skipped.  The "seen HTML" state is a function-local static,
 * so it persists across calls.
 *
 * Returns '<' with *msg untouched when the tag is to be copied.
 * Otherwise skips the tag, returns the character that follows
 * it, and leaves *msg just past that character.
 */
static int
htmlchk(char **msg, char *end)
{
        static int ishtml;
        char *start;
        int skip;

        start = *msg;
        if(ishtml){
                /* inside HTML: keep only the interesting tags */
                if(htmlmatch(hrefs, start, end, &skip))
                        return '<';
        } else {
                ishtml = htmlmatch(htmlcmds, start, end, &skip);
                /* neither an HTML keyword nor a comment: copy it */
                if(!ishtml && *start != '!')
                        return '<';
        }

        /* uninteresting tag: step over it */
        start += skip;
        *msg = start+1;
        return *start;
}

/*
 * Decode a base64-encoded body in [msg, end) and canonicalize the
 * decoded text into buf by calling convert() in body mode.
 * buf and bufsize are passed to convert() unchanged.
 *
 * The decoded text is NUL-terminated before conversion: convert()
 * and its helpers may look at the byte at the end pointer (for
 * example after an unterminated '<' or a trailing '='), and without
 * the terminator that byte would be uninitialized.
 * NOTE(review): dec64 is not visible here; storing the terminator is
 * harmless if it already writes one.
 */
void
conv64(char *msg, char *end, char *buf, int bufsize)
{
        int len, i;
        char *cp;

        len = end - msg;
        if(len < 0)
                len = 0;
        i = (len*3)/4+1;        /* room for max chars + null */
        cp = Malloc(i);
        len = dec64((uchar*)cp, i, msg, len);
        if(len < 0)             /* treat a decode failure as empty text */
                len = 0;
        else if(len >= i)       /* keep the terminator inside the buffer */
                len = i-1;
        cp[len] = 0;
        convert(cp, cp+len, buf, bufsize, 1);
        free(cp);
}

/*
 * Canonicalize the text in [msg, end) into buf and NUL-terminate it.
 * At most bufsize characters are stored before the terminator, so
 * buf needs room for bufsize+1 bytes.
 *
 * NUL and '\r' are dropped, runs of tab/blank/newline collapse to a
 * single blank, and all other characters are lower-cased.  When
 * isbody is non-zero, '<' is handed to htmlchk() and '=' to escape()
 * first, so most HTML tags and some MIME escapes are removed.
 *
 * Returns 1 if isbody is zero and the text contains
 * "Content-Transfer-Encoding: base64" (matched case-insensitively),
 * otherwise 0.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{

        char *p;
        int c, lastc, base64;

        lastc = 0;
        base64 = 0;
        while(msg < end && bufsize > 0){
                c = *msg++;

                /*
                 * In the body only, try to strip most HTML and
                 * replace certain MIME escape sequences with the character.
                 * The loop repeats for as long as a helper consumed input,
                 * because the character it hands back may itself be
                 * another '<' or '='.
                 */
                if(isbody) {
                        do{
                                p = msg;
                                if(c == '<')
                                        c = htmlchk(&msg, end);
                                if(c == '=')
                                        c = escape(&msg);
                        } while(p != msg && p < end);
                }
                switch(c){
                case 0:
                case '\r':
                        continue;
                case '\t':
                case ' ':
                case '\n':
                        if(lastc == ' ')        /* squeeze white space */
                                continue;
                        c = ' ';
                        break;
                case 'C':       /* check for MIME base 64 encoding in header */
                case 'c':
                        if(isbody == 0)
                        if(msg < end-32 && *msg == 'o' && msg[1] == 'n')
                        if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
                                base64 = 1;
                        c = 'c';
                        break;
                default:
                        c = tolower(c);
                        break;
                }
                *buf++ = c;
                lastc = c;
                bufsize--;
        }
        *buf = 0;
        return base64;
}

/*
 *      The pattern parser: build data structures from the pattern file
 */

/*
 * Bucket index for a pattern's first character:
 * the low seven bits of c, so the result is always 0..127.
 */
static int
hash(int c)
{
        return c & 0x7f;
}

/*
 * Look up an action name in keywords[].
 * Returns the matching entry's value, or the value stored in
 * the terminating entry (Nactions) when val is not found.
 */
static  int
findkey(char *val)
{
        Keyword *k;

        k = keywords;
        while(k->string != 0 && strcmp(val, k->string) != 0)
                k++;
        return k->value;
}

/* true for blank or tab; note that c is evaluated twice */
#define whitespace(c)   ((c) == ' ' || (c) == '\t')

/*
 * Read the pattern file from bp and add each pattern to the
 * global patterns[] table.
 *
 * Each line has the form  [*]action: pattern[~~alt~~alt...]
 * Blank lines and lines starting with '#' are ignored.  A leading
 * '*' marks a literal string pattern; otherwise the pattern is a
 * regular expression.  Lines with no ':' or with an action name
 * that findkey() does not know are skipped.  The text after the
 * ':' is cleaned up in place by extract(); "~~" separates the
 * pattern from its alternates, which parsealt() collects.
 *
 * Regexps are pushed on patterns[action].regexps.  Strings are
 * hung on a hash chain, selected by their first character, in
 * patterns[action].strings.
 */
void
parsepats(Biobuf *bp)
{
        Pattern *p, *new;
        char *cp, *qp;
        int type, action, n, h;
        Spat *spat;

        for(;;){
                cp = Brdline(bp, '\n');
                if(cp == 0)
                        break;
                cp[Blinelen(bp)-1] = 0;         /* replace the newline */
                while(*cp == ' ' || *cp == '\t')
                        cp++;
                if(*cp == '#' || *cp == 0)
                        continue;
                type = regexp;
                if(*cp == '*'){
                        type = string;
                        cp++;
                }
                qp = strchr(cp, ':');
                if(qp == 0)
                        continue;
                *qp = 0;
                if(debug)
                        fprint(2, "action = %s\n", cp);
                action = findkey(cp);
                if(action >= Nactions)          /* unknown action name */
                        continue;
                cp = qp+1;
                n = extract(cp);
                if(n <= 0 || *cp == 0)
                        continue;

                /* cut the pattern off at the first alternate separator */
                qp = strstr(cp, "~~");
                if(qp){
                        *qp = 0;
                        n = strlen(cp);
                }
                if(debug)
                        fprint(2, " Pattern: `%s'\n", cp);

                        /* Hook regexps into a chain */
                if(type == regexp) {
                        new = Malloc(sizeof(Pattern));
                        new->action = action;
                        new->pat = regcomp(cp);
                        if(new->pat == 0){
                                free(new);
                                continue;
                        }
                        new->type = regexp;
                        new->alt = 0;
                        new->next = 0;

                        /* cp was consumed by regcomp above; parsealt may read more lines */
                        if(qp)
                                parsealt(bp, qp+2, &new->alt);

                        new->next = patterns[action].regexps;
                        patterns[action].regexps = new;
                        continue;

                }
                        /* not a Regexp - hook strings into Pattern hash chain */
                spat = Malloc(sizeof(*spat));
                spat->next = 0;
                spat->alt = 0;
                spat->len = n;
                spat->string = Malloc(n+1);
                spat->c1 = cp[1];               /* second character, checked first by matchpat */
                strcpy(spat->string, cp);       /* copy before parsealt reads another line */

                if(qp)
                        parsealt(bp, qp+2, &spat->alt);

                /* the per-action string table is created on first use */
                p = patterns[action].strings;
                if(p == 0) {
                        p = Malloc(sizeof(Pattern));
                        memset(p, 0, sizeof(*p));
                        p->action = action;
                        p->type = string;
                        patterns[action].strings = p;
                }
                h = hash(*spat->string);
                spat->next = p->spat[h];
                p->spat[h] = spat;
        }
}

/*
 * Collect the "~~"-separated alternates that follow a pattern and
 * push each non-empty one on the list at *head.
 *
 * cp points at the text after the pattern's first "~~".  When an
 * alternate list ends with "~~" at the end of a line, it continues
 * on the next line that still has text after extract(); lines are
 * read from bp until one is found or input runs out.
 *
 * Each entry's string is copied with Malloc+strcpy and its fields
 * are filled in the same way parsepats() fills a string pattern,
 * rather than using an unchecked strdup() and leaving the other
 * fields uninitialized.
 * NOTE(review): Malloc is assumed to handle allocation failure
 * itself, as every other caller in this file relies on it.
 */
static void
parsealt(Biobuf *bp, char *cp, Spat** head)
{
        char *p;
        Spat *alt;
        int n;

        while(cp){
                if(*cp == 0){           /*escaped newline*/
                        do{
                                cp = Brdline(bp, '\n');
                                if(cp == 0)
                                        return;
                                cp[Blinelen(bp)-1] = 0;
                        } while(extract(cp) <= 0 || *cp == 0);
                }

                p = cp;
                cp = strstr(p, "~~");
                if(cp){
                        *cp = 0;
                        cp += 2;
                }
                n = strlen(p);
                if(n > 0){
                        alt = Malloc(sizeof(*alt));
                        alt->string = Malloc(n+1);
                        strcpy(alt->string, p);
                        alt->len = n;
                        alt->c1 = p[1];
                        alt->alt = 0;
                        alt->next = *head;
                        *head = alt;
                }
        }
}

/*
 * Clean up pattern text in place and return its new length.
 *
 * Leading blanks and tabs are dropped, upper-case ASCII letters
 * are folded to lower case, and an unquoted '#' starts a comment
 * that ends the text.  Text inside double quotes is copied
 * without the quotes; inside quotes \" stands for a literal
 * quote.  Trailing blanks and tabs are trimmed, but never
 * back into text that came from a quoted string.
 */
static int
extract(char *cp)
{
        char *src, *dst, *keep;
        int ch;

        src = dst = keep = cp;
        while(whitespace(*src))
                src++;
        for(;;){
                ch = *src++;
                if(ch == 0 || ch == '#')
                        break;
                if(ch != '"'){
                        if(ch >= 'A' && ch <= 'Z')
                                ch += 'a'-'A';
                        *dst++ = ch;
                        continue;
                }
                /* quoted string: copy up to the closing quote */
                while(*src != 0 && *src != '"'){
                        if(src[0] == '\\' && src[1] == '"')
                                src++;
                        ch = *src++;
                        if(ch >= 'A' && ch <= 'Z')
                                ch += 'a'-'A';
                        *dst++ = ch;
                }
                if(*src != 0)
                        src++;
                keep = dst;     /* never back up over a quoted string */
        }
        while(dst > keep && whitespace(dst[-1]))
                dst--;
        *dst = 0;
        return dst-cp;
}

/*
 *      The matching engine: compare canonical input to pattern structures
 */

/*
 * Look for any alternate string from the list alt in the
 * message, in the global header text (from header+1), or,
 * when it is non-empty, in the global cmd text.  The globals
 * are only searched when they are not the message itself.
 * Returns the first alternate found, or 0 if none occurs.
 */
static Spat*
isalt(char *message, Spat *alt)
{
        for(; alt != 0; alt = alt->next){
                if(*cmd != 0 && message != cmd && strstr(cmd, alt->string) != 0)
                        return alt;
                if(message != header+1 && strstr(header+1, alt->string) != 0)
                        return alt;
                if(strstr(message, alt->string) != 0)
                        return alt;
        }
        return 0;
}

/*
 * Match one pattern against the NUL-terminated canonical message.
 *
 * For a string pattern table, every position in message is tried
 * against the hash chain for the character at that position; a
 * candidate must also agree on the following character (c1) before
 * the full comparison is made.  On success m->sp and m->ep bracket
 * the matched text.
 * For a regexp, regexec() fills in m.
 *
 * In both cases a match is rejected when isalt() finds one of the
 * pattern's alternate strings.
 * Returns 1 for an accepted match, 0 otherwise.
 */
int
matchpat(Pattern *p, char *message, Resub *m)
{
        Spat *spat;
        char *s;
        int c, c1;

        if(p->type == string){
                c1 = *message;
                for(s=message; c=c1; s++){
                        c1 = s[1];
                        for(spat=p->spat[hash(c)]; spat; spat=spat->next){
                                if(c1 == spat->c1)
                                /*
                                 * strncmp, not memcmp: near the end of message fewer
                                 * than spat->len bytes remain, and strncmp stops at
                                 * the terminating NUL instead of reading past it.
                                 */
                                if(strncmp(s, spat->string, spat->len) == 0)
                                if(!isalt(message, spat->alt)){
                                        m->sp = s;
                                        m->ep = s + spat->len;
                                        return 1;
                                }
                        }
                }
                return 0;
        }
        m->sp = m->ep = 0;
        if(regexec(p->pat, message, m, 1) == 0)
                return 0;
        if(isalt(message, p->alt))
                return 0;
        return 1;
}


/*
 * Print a match with some surrounding context on fd as
 *      type before~match~after
 * Nothing is printed when m holds no match.
 *
 * The context is widened to about 30 characters on each side
 * and then out to the nearest blank.  The backward scan stops
 * only at a NUL.
 * NOTE(review): this presumably relies on a NUL byte stored
 * just before the start of the text - confirm against callers.
 */
void
xprint(int fd, char *type, Resub *m)
{
        char *p, *q;
        int i;

        if(m->sp == 0 || m->ep == 0)
                return;

        /* back up approx 30 characters, then on to whitespace */
        p = m->sp;
        i = 0;
        while(*p != 0 && i < 30){
                i++;
                p--;
        }
        while(*p != 0 && *p != ' ')
                p--;
        p++;

        /* take about 30 more characters beyond the end of the match */
        q = m->ep;
        i = 0;
        while(*q != 0 && i < 30){
                i++;
                q++;
        }
        while(*q != 0 && *q != ' ')
                q++;

        fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
}

/* marks bytes in t64d[] that are not part of the base64 alphabet */
enum {
        INVAL = 0xFF
};

/*
 * Base64 decoding table, indexed by input byte.
 * 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61,
 * '+' to 62 and '/' to 63; every other byte is INVAL.
 * The comment at the start of each pair of rows gives the
 * hexadecimal index of the row's first entry.
 */
static uchar t64d[256] = {
/*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*10*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*20*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
/*30*/     52,    53,    54,    55,    56,    57,    58,    59,
           60,    61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*40*/  INVAL,    0,      1,     2,     3,     4,     5,     6,
            7,    8,      9,    10,    11,    12,    13,    14,
/*50*/     15,   16,     17,    18,    19,    20,    21,    22,
           23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
/*60*/  INVAL,   26,     27,    28,    29,    30,    31,    32,
           33,   34,     35,    36,    37,    38,    39,    40,
/*70*/     41,   42,     43,    44,    45,    46,    47,    48,
           49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
/*80*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*90*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*A0*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*B0*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*C0*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*D0*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*E0*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*F0*/  INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
        INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
};