/*
 * Subversion Repositories planix.SVN
 *
 * (repository web-viewer navigation residue:
 *  Rev | Blame | Last modification | View Log | RSS feed)
 */

#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "msgdb.h"

/*
 * Write the command-line synopsis to file descriptor 2 and
 * terminate the process with exit status "usage".
 */
void
usage(void)
{
        static char synopsis[] = "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n";

        fprint(2, "%s", synopsis);
        exits("usage");
}

/* Fixed table sizes used throughout this file. */
enum
{
        MAXBEST = 32,   /* capacity of the best[] array */
        MAXLEN = 64,    /* size of the token buffer Word.s, terminator included */
        MAXTAB = 256,   /* capacity of db[] and of the per-class arrays in Word */
};

/* One message class, declared on the command line with -d name dbfile. */
typedef struct Ndb Ndb;
struct Ndb
{
        char *name;     /* class name: first argument of -d, printed in the result */
        char *file;     /* database file: second argument of -d */
        Msgdb *db;      /* handle returned by mdopen(file, 0) */
        double p;       /* probability computed for this class by main, normalised over all classes */
        long nmsg;      /* value stored under "*From*" in db; presumably the message count -- confirm against msgdb */
};

/* Per-token statistics; only the first ndb entries of count[] and p[] are filled in by main. */
typedef struct Word Word;
struct Word
{
        char s[MAXLEN];         /* token text, truncated to fit by noteword */
        int count[MAXTAB];      /* occurrences of the token in each class database */
        double p[MAXTAB];       /* per-class probability for the token */
        double mp;              /* largest entry of p[] */
        int mi; /* w.p[w.mi] = w.mp */
        int nmsg;               /* NOTE(review): never assigned in the visible source */
};

/* Class table; one entry is appended per -d option. */
Ndb db[MAXTAB];
int ndb;                /* number of entries in use in db[] */

int add;                /* set by -a: add the input tokens to the winning class database */
int mul;                /* set by -m: multiplier applied to token counts when adding (0 if -m is absent) */
Msgdb *indb;            /* database opened with mdopen(nil, 1) that accumulates the input tokens */

/* Tokens selected by noteword, ordered by decreasing mp. */
Word best[MAXBEST];
int mbest = 15;         /* maximum number of entries kept in best[] */
int nbest;              /* number of entries currently in best[] */

void process(Biobuf*, char*);
void lockfile(char*);

/*
 * Record a copy of *w, labelled with token s, in best[].
 *
 * best[0..nbest) is kept ordered by decreasing mp and never grows
 * beyond mbest entries.  The new word is placed after every existing
 * entry whose mp is greater than w->mp; if that position falls outside
 * the first mbest slots the word is discarded, and if the table is
 * already full its last entry is dropped to make room.
 * s is copied into the entry's fixed buffer, truncated to MAXLEN.
 */
void
noteword(Word *w, char *s)
{
        int slot;
        Word *dst;

        /* back up from the tail until an entry with a larger mp is found */
        slot = nbest;
        while(slot > 0 && !(w->mp < best[slot-1].mp))
                slot--;

        if(slot >= mbest)
                return;

        /* table full: forget the current last entry */
        if(nbest == mbest)
                nbest--;

        dst = &best[slot];
        if(slot < nbest)
                memmove(dst+1, dst, (nbest-slot)*sizeof *dst);
        *dst = *w;
        strecpy(dst->s, dst->s+MAXLEN, s);
        nbest++;
}

/*
 * Classify a message from its token counts.
 *
 * Options:
 *      -a              add the input tokens to the winning class database
 *      -d name dbfile  declare a class (at least one is required)
 *      -l lockfile     take the lock file before opening the databases
 *      -m mul          multiplier for counts written with -a
 *      -t thresh       report "inconclusive" if the best class scores below thresh
 * Remaining arguments are token files; with none, standard input is read.
 *
 * Output on fd 1: a line with the chosen class (or "inconclusive")
 * followed by name=probability for every class, then one line per
 * selected token with its per-class probabilities.
 */
void
main(int argc, char **argv)
{
        int i, bad, m, tot, nn, j;
        Biobuf bin, *b, bout;
        char *s, *lf;
        double totp, p, thresh;
        long n;
        Word w;

        lf = nil;
        thresh = 0;
        ARGBEGIN{
        case 'a':
                add = 1;
                break;
        case 'd':
                if(ndb >= MAXTAB)
                        sysfatal("too many db classes");
                db[ndb].name = EARGF(usage());
                db[ndb].file = EARGF(usage());
                ndb++;
                break;
        case 'l':
                lf = EARGF(usage());
                break;
        case 'm':
                mul = atoi(EARGF(usage()));
                break;
        case 't':
                thresh = atof(EARGF(usage()));
                break;
        default:
                usage();
        }ARGEND

        if(ndb == 0){
                fprint(2, "must have at least one -d option\n");
                usage();
        }

        /* collect the input tokens into indb */
        indb = mdopen(nil, 1);
        if(argc == 0){
                Binit(&bin, 0, OREAD);
                process(&bin, "<stdin>");
                Bterm(&bin);
        }else{
                bad = 0;
                for(i=0; i<argc; i++){
                        if((b = Bopen(argv[i], OREAD)) == nil){
                                fprint(2, "opening %s: %r\n", argv[i]);
                                bad = 1;
                                continue;
                        }
                        process(b, argv[i]);
                        Bterm(b);
                }
                if(bad)
                        exits("open inputs");
        }

        /* open every class database, reporting all failures before giving up */
        lockfile(lf);
        bad = 0;
        for(i=0; i<ndb; i++){
                if((db[i].db = mdopen(db[i].file, 0)) == nil){
                        fprint(2, "opendb %s: %r\n", db[i].file);
                        bad = 1;
                        continue;       /* no handle: do not query it */
                }
                db[i].nmsg = mdget(db[i].db, "*From*");
        }
        if(bad)
                exits("open databases");

        /* run conditional probabilities of input words, getting 15 most specific */
        mdenum(indb);
        nbest = 0;
        while(mdnext(indb, &s, &n) >= 0){
                tot = 0;
                totp = 0.0;
                for(i=0; i<ndb; i++){
                        /* counts from the first class are weighted by three */
                        nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
                        tot += nn;
                        w.count[i] = nn;
                        /*
                         * A class with no recorded messages would divide by
                         * zero; give it the value the clamp below would yield.
                         */
                        if(db[i].nmsg > 0)
                                p = w.count[i]/(double)db[i].nmsg;
                        else
                                p = w.count[i] > 0 ? 1.0 : 0.0;
                        if(p >= 1.0)
                                p = 1.0;
                        w.p[i] = p;
                        totp += p;
                }
                /* ignore tokens seen fewer than twice across all classes */
                if(tot < 2)
                        continue;
                w.mp = 0.0;
                w.mi = 0;
                for(i=0; i<ndb; i++){
                        /* normalise across classes, then keep away from 0 and 1 */
                        p = w.p[i];
                        p /= totp;
                        if(p < 0.001)
                                p = 0.001;
                        else if(p > 0.999)
                                p = 0.999;
                        if(p > w.mp){
                                w.mp = p;
                                w.mi = i;
                        }
                        w.p[i] = p;
                }
                noteword(&w, s);
        }

        /* compute conditional probabilities of message classes using 15 most specific */
        totp = 0.0;
        for(i=0; i<ndb; i++){
                p = 1.0;
                for(j=0; j<nbest; j++)
                        p *= best[j].p[i];
                db[i].p = p;
                totp += p;
        }
        for(i=0; i<ndb; i++)
                db[i].p /= totp;
        m = 0;
        for(i=1; i<ndb; i++)
                if(db[i].p > db[m].p)
                        m = i;

        Binit(&bout, 1, OWRITE);
        if(db[m].p < thresh)
                m = -1;         /* best class is not convincing enough */
        if(m >= 0)
                Bprint(&bout, "%s", db[m].name);
        else
                Bprint(&bout, "inconclusive");
        for(j=0; j<ndb; j++)
                Bprint(&bout, " %s=%g", db[j].name, db[j].p);
        Bprint(&bout, "\n");
        for(i=0; i<nbest; i++){
                Bprint(&bout, "%s", best[i].s);
                for(j=0; j<ndb; j++)
                        Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
                Bprint(&bout, "\n");
        }
        Bterm(&bout);

        /*
         * NOTE(review): mul is only set by -m in the visible source, so
         * -a without -m adds n*0 and leaves the stored counts unchanged.
         */
        if(m >= 0 && add){
                mdenum(indb);
                while(mdnext(indb, &s, &n) >= 0)
                        mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
                mdclose(db[m].db);
        }
        exits(nil);
}

/*
 * Read token lines from b and accumulate their counts in indb.
 *
 * Each line is split at its last space: the text before it is the
 * token and the text after it is parsed with atoi as the count.
 * A line with no space is a token with count 1.
 * The second parameter is unused.
 */
void
process(Biobuf *b, char*)
{
        char *line, *sep;
        long cnt;

        for(;;){
                line = Brdline(b, '\n');
                if(line == nil)
                        break;
                /* overwrite the final byte of the line with a terminator */
                line[Blinelen(b)-1] = 0;

                cnt = 1;
                sep = strrchr(line, ' ');
                if(sep != nil){
                        *sep = 0;
                        cnt = atoi(sep+1);
                }
                mdput(indb, line, mdget(indb, line)+cnt);
        }
}

/* Process id of the child forked by lockfile. */
int tpid;

/*
 * Post the note "die" to process tpid.
 * lockfile registers this with atexit.
 */
void
killtickle(void)
{
        postnote(PNPROC, tpid, "die");
}

/*
 * Take the lock file s and hold it until the program exits.
 * Does nothing when s is nil.
 *
 * The file is opened for reading; while the open fails with an error
 * mentioning "file is locked" or "exclusive lock" it is retried with a
 * growing delay (starting at 50, multiplied by 3/2 until it reaches
 * 1000) for a little over 120*1000 sleep units in total.  Any other
 * error, or running out of time, is fatal.
 *
 * Once the file is open a child process is forked that keeps its copy
 * of the descriptor and calls dirfstat on it every 30*1000 sleep units
 * (presumably to keep the lock from expiring -- confirm against the
 * file server's exclusive-use semantics).  The parent closes its own
 * descriptor and arranges for killtickle to run at exit.
 */
void
lockfile(char *s)
{
        int fd, t, w;
        char err[ERRMAX];

        if(s == nil)
                return;
        w = 50;
        t = 0;
        for(;;){
                fd = open(s, OREAD);
                if(fd >= 0)
                        break;
                rerrstr(err, sizeof err);
                /* only a lock conflict is worth retrying */
                if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)
                        break;
                sleep(w);
                t += w;
                if(w < 1000)
                        w = (w*3)/2;
                if(t > 120*1000)
                        break;
        }
        if(fd < 0)
                sysfatal("could not lock %s", s);
        switch(tpid = fork()){
        case -1:
                sysfatal("fork: %r");
        case 0:
                /* child: touch the descriptor periodically until killed */
                for(;;){
                        sleep(30*1000);
                        free(dirfstat(fd));
                }
                _exits(nil);
        default:
                break;
        }
        close(fd);
        atexit(killtickle);
}