Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <bio.h>
4
#include <ctype.h>
5
#include "msgdb.h"
6
 
7
/*
 * Print the command-line synopsis on file descriptor 2 and
 * terminate the process with the exit string "usage".
 */
void
usage(void)
{
	fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n");
	exits("usage");
}
13
 
14
/* Fixed capacities of the tables declared in this file. */
enum
{
	MAXBEST = 32,	/* capacity of best[] */
	MAXLEN = 64,	/* size of Word.s, terminating NUL included */
	MAXTAB = 256,	/* capacity of db[] and of the per-class arrays in Word */
};
20
 
21
typedef struct Ndb Ndb;
22
struct Ndb
23
{
24
	char *name;
25
	char *file;
26
	Msgdb *db;
27
	double p;
28
	long nmsg;
29
};
30
 
31
typedef struct Word Word;
32
struct Word
33
{
34
	char s[MAXLEN];
35
	int count[MAXTAB];
36
	double p[MAXTAB];
37
	double mp;
38
	int mi; /* w.p[w.mi] = w.mp */
39
	int nmsg;
40
};
41
 
42
Ndb db[MAXTAB];
43
int ndb;
44
 
45
int add;
46
int mul;
47
Msgdb *indb;
48
 
49
Word best[MAXBEST];
50
int mbest = 15;
51
int nbest;
52
 
53
void process(Biobuf*, char*);
54
void lockfile(char*);
55
 
56
void
57
noteword(Word *w, char *s)
58
{
59
	int i;
60
 
61
	for(i=nbest-1; i>=0; i--)
62
		if(w->mp < best[i].mp)
63
			break;
64
	i++;
65
 
66
	if(i >= mbest)
67
		return;
68
	if(nbest == mbest)
69
		nbest--;
70
	if(i < nbest)
71
		memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
72
	best[i] = *w;
73
	strecpy(best[i].s, best[i].s+MAXLEN, s);
74
	nbest++;
75
}
76
 
77
void
78
main(int argc, char **argv)
79
{
80
	int i, bad, m, tot, nn, j;
81
	Biobuf bin, *b, bout;
82
	char *s, *lf;
83
	double totp, p, thresh;
84
	long n;
85
	Word w;
86
 
87
	lf = nil;
88
	thresh = 0;
89
	ARGBEGIN{
90
	case 'a':
91
		add = 1;
92
		break;
93
	case 'd':
94
		if(ndb >= MAXTAB)
95
			sysfatal("too many db classes");
96
		db[ndb].name = EARGF(usage());
97
		db[ndb].file = EARGF(usage());
98
		ndb++;
99
		break;
100
	case 'l':
101
		lf = EARGF(usage());
102
		break;
103
	case 'm':
104
		mul = atoi(EARGF(usage()));
105
		break;
106
	case 't':
107
		thresh = atof(EARGF(usage()));
108
		break;
109
	default:
110
		usage();
111
	}ARGEND
112
 
113
	if(ndb == 0){
114
		fprint(2, "must have at least one -d option\n");
115
		usage();
116
	}
117
 
118
	indb = mdopen(nil, 1);
119
	if(argc == 0){
120
		Binit(&bin, 0, OREAD);
121
		process(&bin, "<stdin>");
122
		Bterm(&bin);
123
	}else{
124
		bad = 0;
125
		for(i=0; i<argc; i++){
126
			if((b = Bopen(argv[i], OREAD)) == nil){
127
				fprint(2, "opening %s: %r\n", argv[i]);
128
				bad = 1;
129
				continue;
130
			}
131
			process(b, argv[i]);
132
			Bterm(b);
133
		}
134
		if(bad)
135
			exits("open inputs");
136
	}
137
 
138
	lockfile(lf);
139
	bad = 0;
140
	for(i=0; i<ndb; i++){
141
		if((db[i].db = mdopen(db[i].file, 0)) == nil){
142
			fprint(2, "opendb %s: %r\n", db[i].file);
143
			bad = 1;
144
		}
145
		db[i].nmsg = mdget(db[i].db, "*From*");
146
	}
147
	if(bad)
148
		exits("open databases");
149
 
150
	/* run conditional probabilities of input words, getting 15 most specific */
151
	mdenum(indb);
152
	nbest = 0;
153
	while(mdnext(indb, &s, &n) >= 0){
154
		tot = 0;
155
		totp = 0.0;
156
		for(i=0; i<ndb; i++){
157
			nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
158
			tot += nn;
159
			w.count[i] = nn;
160
			p = w.count[i]/(double)db[i].nmsg;
161
			if(p >= 1.0)
162
				p = 1.0;
163
			w.p[i] = p;
164
			totp += p;
165
		}
166
//fprint(2, "%s tot %d totp %g\n", s, tot, totp);
167
		if(tot < 2)
168
			continue;
169
		w.mp = 0.0;
170
		for(i=0; i<ndb; i++){
171
			p = w.p[i];
172
			p /= totp;
173
			if(p < 0.001)
174
				p = 0.001;
175
			else if(p > 0.999)
176
				p = 0.999;
177
			if(p > w.mp){
178
				w.mp = p;
179
				w.mi = i;
180
			}
181
			w.p[i] = p;
182
		}
183
		noteword(&w, s);
184
	}
185
 
186
	/* compute conditional probabilities of message classes using 15 most specific */
187
	totp = 0.0;
188
	for(i=0; i<ndb; i++){
189
		p = 1.0;
190
		for(j=0; j<nbest; j++)
191
			p *= best[j].p[i];
192
		db[i].p = p;
193
		totp += p;
194
	}
195
	for(i=0; i<ndb; i++)
196
		db[i].p /= totp;
197
	m = 0;
198
	for(i=1; i<ndb; i++)
199
		if(db[i].p > db[m].p)
200
			m = i;
201
 
202
	Binit(&bout, 1, OWRITE);
203
	if(db[m].p < thresh)
204
		m = -1;
205
	if(m >= 0)
206
		Bprint(&bout, "%s", db[m].name);
207
	else
208
		Bprint(&bout, "inconclusive");
209
	for(j=0; j<ndb; j++)
210
		Bprint(&bout, " %s=%g", db[j].name, db[j].p);
211
	Bprint(&bout, "\n");
212
	for(i=0; i<nbest; i++){
213
		Bprint(&bout, "%s", best[i].s);
214
		for(j=0; j<ndb; j++)
215
			Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
216
		Bprint(&bout, "\n");
217
	}
218
		Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]);
219
	Bterm(&bout);
220
 
221
	if(m >= 0 && add){
222
		mdenum(indb);
223
		while(mdnext(indb, &s, &n) >= 0)
224
			mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
225
		mdclose(db[m].db);
226
	}
227
	exits(nil);
228
}
229
 
230
void
231
process(Biobuf *b, char*)
232
{
233
	char *s;
234
	char *p;
235
	long n;
236
 
237
	while((s = Brdline(b, '\n')) != nil){
238
		s[Blinelen(b)-1] = 0;
239
		if((p = strrchr(s, ' ')) != nil){
240
			*p++ = 0;
241
			n = atoi(p);
242
		}else
243
			n = 1;
244
		mdput(indb, s, mdget(indb, s)+n);
245
	}
246
}
247
 
248
int tpid;
249
void
250
killtickle(void)
251
{
252
	postnote(PNPROC, tpid, "die");
253
}
254
 
255
void
256
lockfile(char *s)
257
{
258
	int fd, t, w;
259
	char err[ERRMAX];
260
 
261
	if(s == nil)
262
		return;
263
	w = 50;
264
	t = 0;
265
	for(;;){
266
		fd = open(s, OREAD);
267
		if(fd >= 0)
268
			break;
269
		rerrstr(err, sizeof err);
270
		if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil))
271
			break;
272
		sleep(w);
273
		t += w;
274
		if(w < 1000)
275
			w = (w*3)/2;
276
		if(t > 120*1000)
277
			break;
278
	}
279
	if(fd < 0)
280
		sysfatal("could not lock %s", s);
281
	switch(tpid = fork()){
282
	case -1:
283
		sysfatal("fork: %r");
284
	case 0:
285
		for(;;){
286
			sleep(30*1000);
287
			free(dirfstat(fd));
288
		}
289
		_exits(nil);
290
	default:
291
		break;
292
	}
293
	close(fd);
294
	atexit(killtickle);
295
}
296