Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/*
2
 * RFC822 message tokenizer (really feature generator) for spam filter.
3
 * 
4
 * See Paul Graham's musings on spam filtering for theory.
5
 */
6
 
7
#include <u.h>
8
#include <libc.h>
9
#include <bio.h>
10
#include <regexp.h>
11
#include <ctype.h>
12
#include "dfa.h"
13
 
14
/* forward declarations for routines defined after main */
void	buildre(Dreprog*[3]);
int	trim(char*);

/* set by -D; when non-zero main echoes every consumed span to fd 2 */
int	debug;

/* file the three DFAs are read from by buildre; -r replaces it */
char	*refile = "/mail/lib/classify.re";

/* longest match main will emit as a keyword; -n replaces it */
int	maxtoklen = 20;
19
 
20
/*
 * Print the command-line synopsis on file descriptor 2 and exit
 * with status "usage".  Lists every flag main's ARGBEGIN accepts,
 * including -n (maximum keyword length).
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
26
 
27
void
28
main(int argc, char **argv)
29
{
30
	int i, hdr, n, eof, off;
31
	Dreprog *re[3];
32
	int m[3];
33
	char *p, *ep, *tag;
34
	Biobuf bout, bin;
35
	char msg[1024+1];
36
	char buf[1024];
37
 
38
	buildre(re);
39
	ARGBEGIN{
40
	case 'D':
41
		debug = 1;
42
		break;
43
	case 'n':
44
		maxtoklen = atoi(EARGF(usage()));
45
		break;
46
	case 'r':
47
		refile = EARGF(usage());
48
		break;
49
	default:
50
		usage();
51
	}ARGEND;
52
 
53
	if(argc > 1)
54
		usage();
55
	if(argc == 1){
56
		close(0);
57
		if(open(argv[0], OREAD) < 0)
58
			sysfatal("open %s: %r", argv[0]);
59
	}
60
 
61
	tag = nil;
62
	Binit(&bin, 0, OREAD);
63
	Binit(&bout, 1, OWRITE);
64
	ep = msg;
65
	p = msg;
66
	eof = 0;
67
	off = 0;
68
	hdr = 1;
69
	for(;;){
70
		/* replenish buffer */
71
		if(ep - p < 512 && !eof){
72
			if(p > msg + 1){
73
				n = ep - p;
74
				memmove(msg, p-1, ep-(p-1));
75
				off += (p-1) - msg;
76
				p = msg+1;
77
				ep = p + n;
78
			}
79
			n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
80
			if(n < 0)
81
				sysfatal("read error: %r");
82
			if(n == 0)
83
				eof = 1;
84
			ep += n;
85
			*ep = 0;
86
		}
87
		if(p >= ep)
88
			break;
89
 
90
		if(*p == 0){
91
			p++;
92
			continue;
93
		}
94
 
95
		if(hdr && p[-1]=='\n'){
96
			if(p[0]=='\n')
97
				hdr = 0;
98
			else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
99
				tag = "From*";
100
			else if(cistrncmp(p-1, "\nto:", 4) == 0)
101
				tag = "To*";
102
			else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
103
				tag = "Subject*";
104
			else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
105
				tag = "Return-Path*";
106
			else
107
				tag = nil;
108
		}
109
		m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
110
		m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
111
		m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
112
 
113
		n = m[0];
114
		if(n < m[1])
115
			n = m[1];
116
		if(n < m[2])
117
			n = m[2];
118
		if(n <= 0){
119
fprint(2, "«%s» %.2ux", p, p[0]);
120
			sysfatal("no regexps matched at %ld", off + (p-msg));
121
		}
122
 
123
		if(m[0] >= m[1] && m[0] >= m[2]){
124
			/* "From " marks start of new message */
125
			Bprint(&bout, "*From*\n");
126
			n = m[0];
127
			hdr = 1;
128
		}else if(m[2] > 1){
129
			/* ignore */
130
			n = m[2];
131
		}else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
132
			/* keyword */
133
			/* should do UTF-aware lowercasing, too much bother */
134
/*
135
			for(i=0; i<n; i++)
136
				if('A' <= p[i] && p[i] <= 'Z')
137
					p[i] += 'a' - 'A';
138
*/
139
			if(tag){
140
				i = strlen(tag);	
141
				memmove(buf, tag, i);
142
				memmove(buf+i, p, m[1]);
143
				buf[i+m[1]] = 0;
144
			}else{
145
				memmove(buf, p, m[1]);
146
				buf[m[1]] = 0;
147
			}
148
			Bprint(&bout, "%s\n", buf);
149
			while(trim(buf) >= 0)
150
				Bprint(&bout, "stem*%s\n", buf);
151
			n = m[1];
152
		}else
153
			n = m[2];
154
		if(debug)
155
			fprint(2, "%.*s¦", utfnlen(p, n), p);
156
		p += n;
157
	}
158
	Bterm(&bout);
159
	exits(0);
160
}
161
 
162
void
163
buildre(Dreprog *re[3])
164
{
165
	Biobuf *b;
166
 
167
	if((b = Bopen(refile, OREAD)) == nil)
168
		sysfatal("open %s: %r", refile);
169
 
170
	re[0] = Breaddfa(b);
171
	re[1] = Breaddfa(b);
172
	re[2] = Breaddfa(b);
173
 
174
	if(re[0]==nil || re[1]==nil || re[2]==nil)
175
		sysfatal("Breaddfa: %r");
176
	Bterm(b);
177
}
178
 
179
/*
 * Reduce the token in s, in place, by one normalization step.
 * (Perhaps this belongs in the tokenizer.)
 *
 * Each call first drops everything up to and including the first '*'
 * (if any) plus any following bytes that are not letters, then performs
 * the first of these steps that applies:
 *
 *	free!!! -> free!	collapse a run of trailing punctuation to one
 *	free!   -> free		drop a single trailing punctuation byte
 *	FREE    -> Free		lowercase everything after the first byte
 *	Free    -> free		lowercase the first byte
 *
 * Returns 0 if one of those steps was applied, -1 if none was, if s
 * begins with '*', or if fewer than two bytes remain after the prefix
 * is dropped.  Callers loop until -1 is returned.
 *
 * NOTE(review): when only the prefix is dropped, s has been modified
 * but -1 is still returned - confirm that is intended.
 */
int
trim(char *s)
{
	char *p, *op;
	long n;
	int mix, mix1;

	if(*s == '*')
		return -1;

	/*
	 * strip leading punctuation, starting at the '*' if there is one.
	 * ctype arguments are cast to unsigned char: passing a negative
	 * char is undefined in standard C.  Bytes >= 0x80 are skipped here
	 * whenever isalpha is false for them.
	 */
	p = strchr(s, '*');
	if(p == 0)
		p = s;
	while(*p && !isalpha((unsigned char)*p))
		p++;
	n = strlen(p);
	if(n < 2)
		return -1;
	memmove(s, p, n+1);

	/* strip suffix of punctuation; bytes >= 0x80 are never punctuation */
	op = s + n;
	p = op;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first byte is upper case; mix1: some later byte is */
	mix = isupper((unsigned char)s[0]) ? 1 : 0;
	mix1 = 0;
	for(p = s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p = s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p = tolower((unsigned char)*p);
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s = tolower((unsigned char)*s);
		return 0;
	}
	return -1;
}
245