Warning: Attempt to read property "date" on null in /usr/local/www/websvn.planix.org/blame.php on line 247

Warning: Attempt to read property "msg" on null in /usr/local/www/websvn.planix.org/blame.php on line 247
WebSVN – planix.SVN – Blame – /os/branches/feature_unix/sys/src/cmd/upas/scanmail/common.c – Rev 2

Subversion Repositories planix.SVN

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <bio.h>
4
#include <regexp.h>
5
#include "spam.h"
6
 
7
/*
 * Sizes used while reading and scanning a message.
 */
enum {
	Quanta	= 8*1024,	/* bytes requested from each Bread while hunting for the end of the header */
	Minbody	= 6000,		/* readmsg tries to buffer at least this many body bytes */
	HdrMax	= 15,		/* size of endofhdr's scratch buffer for a candidate header keyword */
};
12
 
13
/*
 * A literal string and its length; tables of these end with a nil string.
 */
struct word {
	char	*string;	/* text to compare against */
	int	n;		/* number of characters of string that must match */
};
typedef struct word Word;

/*
 * A name used in the pattern file and the action code it selects.
 */
struct keyword {
	char	*string;	/* keyword as written before the ':' */
	int	value;		/* action code returned by findkey */
};
typedef struct keyword Keyword;
25
 
26
Word	htmlcmds[] =
27
{
28
	"html",		4,
29
	"!doctype html", 13,
30
	0,
31
 
32
};
33
 
34
Word	hrefs[] =
35
{
36
	"a href=",	7,
37
	"a title=",	8,
38
	"a target=",	9,
39
	"base href=",	10,
40
	"img src=",	8,
41
	"img border=",	11,
42
	"form action=", 12,
43
	"!--",		3,
44
	0,
45
 
46
};
47
 
48
/*
49
 *	RFC822 header keywords to look for for fractured header.
50
 *	all lengths must be less than HdrMax defined above.
51
 */
52
Word	hdrwords[] =
53
{
54
	"cc:",			3,
55
	"bcc:", 		4,
56
	"to:",			3,
57
	0,			0,
58
 
59
};
60
 
61
Keyword	keywords[] =
62
{
63
	"header",	HoldHeader,
64
	"line",		SaveLine,
65
	"hold",		Hold,
66
	"dump",		Dump,
67
	"loff",		Lineoff,
68
	0,		Nactions,
69
};
70
 
71
Patterns patterns[] = {
72
[Dump]		{ "DUMP:", 0, 0 },
73
[HoldHeader]	{ "HEADER:", 0, 0 },
74
[Hold]		{ "HOLD:", 0, 0 },
75
[SaveLine]	{ "LINE:", 0, 0 },
76
[Lineoff]	{ "LINEOFF:", 0, 0 },
77
[Nactions]	{ 0, 0, 0 },
78
};
79
 
80
static char*	endofhdr(char*, char*);
81
static	int	escape(char**);
82
static	int	extract(char*);
83
static	int	findkey(char*);
84
static	int	hash(int);
85
static	int	isword(Word*, char*, int);
86
static	void	parsealt(Biobuf*, char*, Spat**);
87
 
88
/*
89
 *	The canonicalizer: convert input to canonical representation
90
 */
91
char*
92
readmsg(Biobuf *bp, int *hsize, int *bufsize)
93
{
94
	char *p, *buf;
95
	int n, offset, eoh, bsize, delta;
96
 
97
	buf = 0;
98
	offset = 0;
99
	if(bufsize)
100
		*bufsize = 0;
101
	if(hsize)
102
		*hsize = 0;
103
	for(;;) {
104
		buf = Realloc(buf, offset+Quanta+1);
105
		n = Bread(bp, buf+offset, Quanta);
106
		if(n < 0){
107
			free(buf);
108
			return 0;
109
		}
110
		p = buf+offset;			/* start of this chunk */
111
		offset += n;			/* end of this chunk */
112
		buf[offset] = 0;
113
		if(n == 0){
114
			if(offset == 0)
115
				return 0;
116
			break;
117
		}
118
 
119
		if(hsize == 0)			/* don't process header */
120
			break;
121
		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
122
			p--;
123
		p = endofhdr(p, buf+offset);
124
		if(p)
125
			break;
126
		if(offset >= Maxread)		/* gargantuan header - just punt*/
127
		{
128
			if(hsize)
129
				*hsize = offset;
130
			if(bufsize)
131
				*bufsize = offset;
132
			return buf;
133
		}
134
	}
135
	eoh = p-buf;				/* End of header */
136
	bsize = offset - eoh;			/* amount of body already read */
137
 
138
		/* Read at least Minbody bytes of the body */
139
	if (bsize < Minbody){
140
		delta = Minbody-bsize;
141
		buf = Realloc(buf, offset+delta+1);
142
		n = Bread(bp, buf+offset, delta);
143
		if(n > 0) {
144
			offset += n;
145
			buf[offset] = 0;
146
		}
147
	}
148
	if(hsize)
149
		*hsize = eoh;
150
	if(bufsize)
151
		*bufsize = offset;
152
	return buf;
153
}
154
 
155
static	int
156
isword(Word *wp, char *text, int len)
157
{
158
	for(;wp->string; wp++)
159
		if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
160
			return 1;
161
	return 0;
162
}
163
 
164
static char*
165
endofhdr(char *raw, char *end)
166
{
167
	int i;
168
	char *p, *q;
169
	char buf[HdrMax];
170
 
171
	/*
172
 	 * can't use strchr to search for newlines because
173
	 * there may be embedded NULL's.
174
	 */
175
	for(p = raw; p < end; p++){
176
		if(*p != '\n' || p[1] != '\n')
177
			continue;
178
		p++;
179
		for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
180
			buf[i++] = tolower(*q);
181
			if(*q == ':' || *q == '\n')
182
				break;
183
		}
184
		if(!isword(hdrwords, buf, i))
185
			return p+1;
186
	}
187
	return 0;
188
}
189
 
190
static	int
191
htmlmatch(Word *wp, char *text, char *end, int *n)
192
{
193
	char *cp;
194
	int i, c, lastc;
195
	char buf[MaxHtml];
196
 
197
	/*
198
	 * extract a string up to '>'
199
	 */
200
 
201
	i = lastc = 0;
202
	cp = text;
203
	while (cp < end && i < sizeof(buf)-1){
204
		c = *cp++;
205
		if(c == '=')
206
			c = escape(&cp);
207
		switch(c){
208
		case 0:
209
		case '\r':
210
			continue;
211
		case '>':
212
			goto out;
213
		case '\n':
214
		case ' ':
215
		case '\t':
216
			if(lastc == ' ')
217
				continue;
218
			c = ' ';
219
			break;
220
		default:
221
			c = tolower(c);
222
			break;
223
		}
224
		buf[i++] = lastc = c;
225
	}
226
out:
227
	buf[i] = 0;
228
	if(n)
229
		*n = cp-text;
230
	return isword(wp, buf, i);
231
}
232
 
233
/*
 * Decode the sequence that follows an '='.  On entry *msg points
 * just past the '='; on return it points past whatever was consumed.
 *
 *	=\n	line continuation: returns the character after the
 *		newline and steps past it
 *	=2e	returns '.'	(hex digit in either case)
 *	=2f	returns '/'
 *	=20	returns ' '
 *	=3d	returns '='
 *
 * Anything else returns '=' and consumes nothing.
 */
static int
escape(char **msg)
{
	char *s;
	int ch;

	s = *msg;
	ch = '=';
	switch(*s){
	case '\n':
		ch = s[1];
		s += 2;
		break;
	case '2':
		switch(tolower(s[1])){
		case 'e':
			ch = '.';
			s += 2;
			break;
		case 'f':
			ch = '/';
			s += 2;
			break;
		case '0':
			ch = ' ';
			s += 2;
			break;
		}
		break;
	case '3':
		if(tolower(s[1]) == 'd')
			s += 2;
		break;
	}
	*msg = s;
	return ch;
}
268
 
269
static int
270
htmlchk(char **msg, char *end)
271
{
272
	int n;
273
	char *p;
274
 
275
	static int ishtml;
276
 
277
	p = *msg;
278
	if(ishtml == 0){
279
		ishtml = htmlmatch(htmlcmds, p, end, &n);
280
 
281
		/* If not an HTML keyword, check if it's
282
		 * an HTML comment (<!comment>).  if so,
283
		 * skip over it; otherwise copy it in.
284
		 */
285
		if(ishtml == 0 && *p != '!')	/* not comment */
286
			return '<';		/* copy it */
287
 
288
	} else if(htmlmatch(hrefs, p, end, &n))	/* if special HTML string  */
289
		return '<';			/* copy it */
290
 
291
	/*
292
	 * this is an uninteresting HTML command; skip over it.
293
	 */
294
	p += n;
295
	*msg = p+1;
296
	return *p;
297
}
298
 
299
/*
300
 * decode a base 64 encode body
301
 */
302
void
303
conv64(char *msg, char *end, char *buf, int bufsize)
304
{
305
	int len, i;
306
	char *cp;
307
 
308
	len = end - msg;
309
	i = (len*3)/4+1;	// room for max chars + null
310
	cp = Malloc(i);
311
	len = dec64((uchar*)cp, i, msg, len);
312
	convert(cp, cp+len, buf, bufsize, 1);
313
	free(cp);
314
}
315
 
316
/*
 * Copy [msg, end) into buf in canonical form: NUL and '\r' are
 * dropped, runs of blanks, tabs and newlines become one blank, and
 * everything else is lower-cased.  When isbody is set, '<' tags go
 * through htmlchk() and '=' sequences through escape() first.
 *
 * At most bufsize characters are stored and a NUL is then appended,
 * so buf must have room for bufsize+1 bytes.
 *
 * Returns 1 if, with isbody clear, a
 * "content-transfer-encoding: base64" header was seen; else 0.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{
	char *mark;
	int ch, prev, isb64;

	prev = 0;
	isb64 = 0;
	while(msg < end && bufsize > 0){
		ch = *msg++;

		/*
		 * body only: strip most HTML and decode certain MIME
		 * escapes, repeating for as long as input was consumed
		 */
		if(isbody){
			do{
				mark = msg;
				if(ch == '<')
					ch = htmlchk(&msg, end);
				if(ch == '=')
					ch = escape(&msg);
			}while(mark != msg && mark < end);
		}

		if(ch == 0 || ch == '\r')
			continue;
		if(ch == '\t' || ch == ' ' || ch == '\n'){
			if(prev == ' ')
				continue;
			ch = ' ';
		}else if(ch == 'C' || ch == 'c'){
			/* check for MIME base 64 encoding in header */
			if(isbody == 0 && msg < end-32 && msg[0] == 'o' && msg[1] == 'n'
			&& cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
				isb64 = 1;
			ch = 'c';
		}else
			ch = tolower(ch);

		*buf++ = ch;
		prev = ch;
		bufsize--;
	}
	*buf = 0;
	return isb64;
}
371
 
372
/*
373
 *	The pattern parser: build data structures from the pattern file
374
 */
375
 
376
/*
 * Bucket index for a string pattern: the low seven bits of its
 * first character.
 */
static int
hash(int c)
{
	return c & 0x7F;
}
381
 
382
static	int
383
findkey(char *val)
384
{
385
	Keyword *kp;
386
 
387
	for(kp = keywords; kp->string; kp++)
388
		if(strcmp(val, kp->string) == 0)
389
				break;
390
	return kp->value;
391
}
392
 
393
/* true for a blank or a tab; c may be evaluated twice, so pass no side effects */
#define	whitespace(c)	((c) == ' ' || (c) == '\t')
394
 
395
void
396
parsepats(Biobuf *bp)
397
{
398
	Pattern *p, *new;
399
	char *cp, *qp;
400
	int type, action, n, h;
401
	Spat *spat;
402
 
403
	for(;;){
404
		cp = Brdline(bp, '\n');
405
		if(cp == 0)
406
			break;
407
		cp[Blinelen(bp)-1] = 0;
408
		while(*cp == ' ' || *cp == '\t')
409
			cp++;
410
		if(*cp == '#' || *cp == 0)
411
			continue;
412
		type = regexp;
413
		if(*cp == '*'){
414
			type = string;
415
			cp++;
416
		}
417
		qp = strchr(cp, ':');
418
		if(qp == 0)
419
			continue;
420
		*qp = 0;
421
		if(debug)
422
			fprint(2, "action = %s\n", cp);
423
		action = findkey(cp);
424
		if(action >= Nactions)
425
			continue;
426
		cp = qp+1;
427
		n = extract(cp);
428
		if(n <= 0 || *cp == 0)
429
			continue;
430
 
431
		qp = strstr(cp, "~~");
432
		if(qp){
433
			*qp = 0;
434
			n = strlen(cp);
435
		}
436
		if(debug)
437
			fprint(2, " Pattern: `%s'\n", cp);
438
 
439
			/* Hook regexps into a chain */
440
		if(type == regexp) {
441
			new = Malloc(sizeof(Pattern));
442
			new->action = action;
443
			new->pat = regcomp(cp);
444
			if(new->pat == 0){
445
				free(new);
446
				continue;
447
			}
448
			new->type = regexp;
449
			new->alt = 0;
450
			new->next = 0;
451
 
452
			if(qp)
453
				parsealt(bp, qp+2, &new->alt);
454
 
455
			new->next = patterns[action].regexps;
456
			patterns[action].regexps = new;
457
			continue;
458
 
459
		}
460
			/* not a Regexp - hook strings into Pattern hash chain */
461
		spat = Malloc(sizeof(*spat));
462
		spat->next = 0;
463
		spat->alt = 0;
464
		spat->len = n;
465
		spat->string = Malloc(n+1);
466
		spat->c1 = cp[1];
467
		strcpy(spat->string, cp);
468
 
469
		if(qp)
470
			parsealt(bp, qp+2, &spat->alt);
471
 
472
		p = patterns[action].strings;
473
		if(p == 0) {
474
			p = Malloc(sizeof(Pattern));
475
			memset(p, 0, sizeof(*p));
476
			p->action = action;
477
			p->type = string;
478
			patterns[action].strings = p;
479
		}
480
		h = hash(*spat->string);
481
		spat->next = p->spat[h];
482
		p->spat[h] = spat;
483
	}
484
}
485
 
486
static void
487
parsealt(Biobuf *bp, char *cp, Spat** head)
488
{
489
	char *p;
490
	Spat *alt;
491
 
492
	while(cp){
493
		if(*cp == 0){		/*escaped newline*/
494
			do{
495
				cp = Brdline(bp, '\n');
496
				if(cp == 0)
497
					return;
498
				cp[Blinelen(bp)-1] = 0;
499
			} while(extract(cp) <= 0 || *cp == 0);
500
		}
501
 
502
		p = cp;
503
		cp = strstr(p, "~~");
504
		if(cp){
505
			*cp = 0;
506
			cp += 2;
507
		}
508
		if(strlen(p)){
509
			alt = Malloc(sizeof(*alt));
510
			alt->string = strdup(p);
511
			alt->next = *head;
512
			*head = alt;
513
		}
514
	}
515
}
516
 
517
/*
 * Clean up a pattern in place and return its new length.
 *
 * Leading blanks and tabs are dropped, upper-case ASCII letters are
 * lower-cased, and an unquoted '#' ends the text.  Double quotes are
 * removed but their contents are kept as they are (apart from case),
 * with \" standing for a literal quote.  Trailing blanks and tabs
 * are trimmed, but never back into a quoted section.
 */
static int
extract(char *cp)
{
	char *src, *dst, *keep;
	int ch;

	src = cp;
	dst = cp;
	keep = cp;		/* trimming may not go below this point */
	while(*src == ' ' || *src == '\t')
		src++;
	for(;;){
		ch = *src++;
		if(ch == 0 || ch == '#')
			break;
		if(ch != '"'){
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
			continue;
		}

		/* quoted section: copy up to the closing quote */
		for(; *src != 0 && *src != '"'; src++){
			if(src[0] == '\\' && src[1] == '"')
				src++;
			ch = *src;
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
		}
		if(*src != 0)
			src++;
		keep = dst;	/* never back up over a quoted string */
	}
	while(dst > keep && (dst[-1] == ' ' || dst[-1] == '\t'))
		dst--;
	*dst = 0;
	return dst-cp;
}
552
 
553
/*
554
 *	The matching engine: compare canonical input to pattern structures
555
 */
556
 
557
/*
 * Return the first alternative in the list alt whose string occurs
 * in cmd (only if cmd is non-empty), in header+1, or in message;
 * return 0 if none does.  cmd and header are globals defined
 * elsewhere.  A buffer that is message itself is not searched
 * twice.
 */
static Spat*
isalt(char *message, Spat *alt)
{
	Spat *a;
	char *hdr;

	hdr = header+1;
	for(a = alt; a != 0; a = a->next){
		if(*cmd != 0 && message != cmd && strstr(cmd, a->string) != 0)
			return a;
		if(message != hdr && strstr(hdr, a->string) != 0)
			return a;
		if(strstr(message, a->string) != 0)
			return a;
	}
	return 0;
}
572
 
573
int
574
matchpat(Pattern *p, char *message, Resub *m)
575
{
576
	Spat *spat;
577
	char *s;
578
	int c, c1;
579
 
580
	if(p->type == string){
581
		c1 = *message;
582
		for(s=message; c=c1; s++){
583
			c1 = s[1];
584
			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
585
				if(c1 == spat->c1)
586
				if(memcmp(s, spat->string, spat->len) == 0)
587
				if(!isalt(message, spat->alt)){
588
					m->sp = s;
589
					m->ep = s + spat->len;
590
					return 1;
591
				}
592
			}
593
		}
594
		return 0;
595
	}
596
	m->sp = m->ep = 0;
597
	if(regexec(p->pat, message, m, 1) == 0)
598
		return 0;
599
	if(isalt(message, p->alt))
600
		return 0;
601
	return 1;
602
}
603
 
604
 
605
void
606
xprint(int fd, char *type, Resub *m)
607
{
608
	char *p, *q;
609
	int i;
610
 
611
	if(m->sp == 0 || m->ep == 0)
612
		return;
613
 
614
		/* back up approx 30 characters to whitespace */
615
	for(p = m->sp, i = 0; *p && i < 30; i++, p--)
616
			;
617
	while(*p && *p != ' ')
618
		p--;
619
	p++;
620
 
621
		/* grab about 30 more chars beyond the end of the match */
622
	for(q = m->ep, i = 0; *q && i < 30; i++, q++)
623
			;
624
	while(*q && *q != ' ')
625
		q++;
626
 
627
	fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
628
}
629
 
630
enum {
	INVAL	= 0xFF,		/* t64d entry for a byte that is not a base64 digit */
};
633
 
634
static uchar t64d[256] = {
635
/*00 */	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
636
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
637
/*10*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
638
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
639
/*20*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
640
	INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
641
/*30*/	   52,	  53,	 54,	55,    56,    57,    58,    59,
642
	   60,	  61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
643
/*40*/	INVAL,    0,      1,     2,     3,     4,     5,     6,
644
	    7,    8,      9,    10,    11,    12,    13,    14,
645
/*50*/	   15,   16,     17,    18,    19,    20,    21,    22,
646
	   23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
647
/*60*/	INVAL,   26,     27,    28,    29,    30,    31,    32,
648
	   33,   34,     35,    36,    37,    38,    39,    40,
649
/*70*/	   41,   42,     43,    44,    45,    46,    47,    48,
650
	   49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
651
/*80*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
652
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
653
/*90*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
654
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
655
/*A0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
656
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
657
/*B0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
658
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
659
/*C0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
660
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
661
/*D0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
662
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
663
/*E0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
664
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
665
/*F0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
666
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
667
};