Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <stdio.h>
4
#include "cpp.h"
5
 
6
/*
7
 * lexical FSM encoding
8
 *   when in state state, and one of the characters
9
 *   in ch arrives, enter nextstate.
10
 *   States >= S_SELF are either final, or at least require special action.
11
 *   In 'fsm' there is a line for each state X charset X nextstate.
12
 *   List chars that overwrite previous entries later (e.g. C_ALPH
13
 *   can be overridden by '_' by a later entry; and C_XX is the
14
 *   the universal set, and should always be first.
15
 *   States above S_SELF are represented in the big table as negative values.
16
 *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
17
 *   These actions differ in that S_SELF doesn't have a lookahead char,
18
 *   S_SELFB does.
19
 *
20
 *   The encoding is blown out into a big table for time-efficiency.
21
 *   Entries have
22
 *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
23
 */
24
 
25
#define	MAXSTATE 32
26
#define	ACT(tok,act)	((tok<<7)+act)
27
#define	QBSBIT	0100
28
#define	GETACT(st)	(st>>7)&0x1ff
29
 
30
#define	UTF2(c)		((c)>=0xA0 && (c)<0xE0)		/* 2-char UTF seq */
31
#define	UTF3(c)		((c)>=0xE0 && (c)<0xF0)		/* 3-char UTF seq */
32
 
33
/* character classes */
34
#define	C_WS	1
35
#define	C_ALPH	2
36
#define	C_NUM	3
37
#define	C_EOF	4
38
#define	C_XX	5
39
 
40
enum state {
41
	START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
42
	CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
43
	CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
44
	S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
45
	S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
46
};
47
 
48
struct	fsm {
49
	int	state;		/* if in this state */
50
	uchar	ch[4];		/* and see one of these characters */
51
	int	nextstate;	/* enter this state if +ve */
52
};
53
 
54
/*const*/ struct fsm fsm[] = {
55
	/* start state */
56
	START,	{ C_XX },	ACT(UNCLASS,S_SELF),
57
	START,	{ ' ', '\t', '\v', '\r' },	WS1,
58
	START,	{ C_NUM },	NUM1,
59
	START,	{ '.' },	NUM3,
60
	START,	{ C_ALPH },	ID1,
61
	START,	{ 'L' },	ST1,
62
	START,	{ '"' },	ST2,
63
	START,	{ '\'' },	CC1,
64
	START,	{ '/' },	COM1,
65
	START,	{ EOFC },	S_EOF,
66
	START,	{ '\n' },	S_NL,
67
	START,	{ '-' },	MINUS1,
68
	START,	{ '+' },	PLUS1,
69
	START,	{ '<' },	LT1,
70
	START,	{ '>' },	GT1,
71
	START,	{ '=' },	ASG1,
72
	START,	{ '!' },	NOT1,
73
	START,	{ '&' },	AND1,
74
	START,	{ '|' },	OR1,
75
	START,	{ '#' },	SHARP1,
76
	START,	{ '%' },	PCT1,
77
	START,	{ '[' },	ACT(SBRA,S_SELF),
78
	START,	{ ']' },	ACT(SKET,S_SELF),
79
	START,	{ '(' },	ACT(LP,S_SELF),
80
	START,	{ ')' },	ACT(RP,S_SELF),
81
	START,	{ '*' },	STAR1,
82
	START,	{ ',' },	ACT(COMMA,S_SELF),
83
	START,	{ '?' },	ACT(QUEST,S_SELF),
84
	START,	{ ':' },	ACT(COLON,S_SELF),
85
	START,	{ ';' },	ACT(SEMIC,S_SELF),
86
	START,	{ '{' },	ACT(CBRA,S_SELF),
87
	START,	{ '}' },	ACT(CKET,S_SELF),
88
	START,	{ '~' },	ACT(TILDE,S_SELF),
89
	START,	{ '^' },	CIRC1,
90
 
91
	/* saw a digit */
92
	NUM1,	{ C_XX },	ACT(NUMBER,S_SELFB),
93
	NUM1,	{ C_NUM, C_ALPH, '.' },	NUM1,
94
	NUM1,	{ 'E', 'e' },	NUM2,
95
	NUM1,	{ '_' },	ACT(NUMBER,S_SELFB),
96
 
97
	/* saw possible start of exponent, digits-e */
98
	NUM2,	{ C_XX },	ACT(NUMBER,S_SELFB),
99
	NUM2,	{ '+', '-' },	NUM1,
100
	NUM2,	{ C_NUM, C_ALPH },	NUM1,
101
	NUM2,	{ '_' },	ACT(NUMBER,S_SELFB),
102
 
103
	/* saw a '.', which could be a number or an operator */
104
	NUM3,	{ C_XX },	ACT(DOT,S_SELFB),
105
	NUM3,	{ '.' },	DOTS1,
106
	NUM3,	{ C_NUM },	NUM1,
107
 
108
	DOTS1,	{ C_XX },	ACT(UNCLASS, S_SELFB),
109
	DOTS1,	{ C_NUM },	NUM1,
110
	DOTS1,	{ '.' },	ACT(ELLIPS, S_SELF),
111
 
112
	/* saw a letter or _ */
113
	ID1,	{ C_XX },	ACT(NAME,S_NAME),
114
	ID1,	{ C_ALPH, C_NUM },	ID1,
115
 
116
	/* saw L (start of wide string?) */
117
	ST1,	{ C_XX },	ACT(NAME,S_NAME),
118
	ST1,	{ C_ALPH, C_NUM },	ID1,
119
	ST1,	{ '"' },	ST2,
120
	ST1,	{ '\'' },	CC1,
121
 
122
	/* saw " beginning string */
123
	ST2,	{ C_XX },	ST2,
124
	ST2,	{ '"' },	ACT(STRING, S_SELF),
125
	ST2,	{ '\\' },	ST3,
126
	ST2,	{ '\n' },	S_STNL,
127
	ST2,	{ EOFC },	S_EOFSTR,
128
 
129
	/* saw \ in string */
130
	ST3,	{ C_XX },	ST2,
131
	ST3,	{ '\n' },	S_STNL,
132
	ST3,	{ EOFC },	S_EOFSTR,
133
 
134
	/* saw ' beginning character const */
135
	CC1,	{ C_XX },	CC1,
136
	CC1,	{ '\'' },	ACT(CCON, S_SELF),
137
	CC1,	{ '\\' },	CC2,
138
	CC1,	{ '\n' },	S_STNL,
139
	CC1,	{ EOFC },	S_EOFSTR,
140
 
141
	/* saw \ in ccon */
142
	CC2,	{ C_XX },	CC1,
143
	CC2,	{ '\n' },	S_STNL,
144
	CC2,	{ EOFC },	S_EOFSTR,
145
 
146
	/* saw /, perhaps start of comment */
147
	COM1,	{ C_XX },	ACT(SLASH, S_SELFB),
148
	COM1,	{ '=' },	ACT(ASSLASH, S_SELF),
149
	COM1,	{ '*' },	COM2,
150
	COM1,	{ '/' },	COM4,
151
 
152
	/* saw "/*", start of comment */
153
	COM2,	{ C_XX },	COM2,
154
	COM2,	{ '\n' },	S_COMNL,
155
	COM2,	{ '*' },	COM3,
156
	COM2,	{ EOFC },	S_EOFCOM,
157
 
158
	/* saw the * possibly ending a comment */
159
	COM3,	{ C_XX },	COM2,
160
	COM3,	{ '\n' },	S_COMNL,
161
	COM3,	{ '*' },	COM3,
162
	COM3,	{ '/' },	S_COMMENT,
163
	COM3,	{ EOFC },	S_EOFCOM,
164
 
165
	/* // comment */
166
	COM4,	{ C_XX },	COM4,
167
	COM4,	{ '\n' },	S_NL,
168
	COM4,	{ EOFC },	S_EOFCOM,
169
 
170
	/* saw white space, eat it up */
171
	WS1,	{ C_XX },	S_WS,
172
	WS1,	{ ' ', '\t', '\v', '\r'},	WS1,
173
 
174
	/* saw -, check --, -=, -> */
175
	MINUS1,	{ C_XX },	ACT(MINUS, S_SELFB),
176
	MINUS1,	{ '-' },	ACT(MMINUS, S_SELF),
177
	MINUS1,	{ '=' },	ACT(ASMINUS,S_SELF),
178
	MINUS1,	{ '>' },	ACT(ARROW,S_SELF),
179
 
180
	/* saw +, check ++, += */
181
	PLUS1,	{ C_XX },	ACT(PLUS, S_SELFB),
182
	PLUS1,	{ '+' },	ACT(PPLUS, S_SELF),
183
	PLUS1,	{ '=' },	ACT(ASPLUS, S_SELF),
184
 
185
	/* saw <, check <<, <<=, <= */
186
	LT1,	{ C_XX },	ACT(LT, S_SELFB),
187
	LT1,	{ '<' },	LT2,
188
	LT1,	{ '=' },	ACT(LEQ, S_SELF),
189
	LT2,	{ C_XX },	ACT(LSH, S_SELFB),
190
	LT2,	{ '=' },	ACT(ASLSH, S_SELF),
191
 
192
	/* saw >, check >>, >>=, >= */
193
	GT1,	{ C_XX },	ACT(GT, S_SELFB),
194
	GT1,	{ '>' },	GT2,
195
	GT1,	{ '=' },	ACT(GEQ, S_SELF),
196
	GT2,	{ C_XX },	ACT(RSH, S_SELFB),
197
	GT2,	{ '=' },	ACT(ASRSH, S_SELF),
198
 
199
	/* = */
200
	ASG1,	{ C_XX },	ACT(ASGN, S_SELFB),
201
	ASG1,	{ '=' },	ACT(EQ, S_SELF),
202
 
203
	/* ! */
204
	NOT1,	{ C_XX },	ACT(NOT, S_SELFB),
205
	NOT1,	{ '=' },	ACT(NEQ, S_SELF),
206
 
207
	/* & */
208
	AND1,	{ C_XX },	ACT(AND, S_SELFB),
209
	AND1,	{ '&' },	ACT(LAND, S_SELF),
210
	AND1,	{ '=' },	ACT(ASAND, S_SELF),
211
 
212
	/* | */
213
	OR1,	{ C_XX },	ACT(OR, S_SELFB),
214
	OR1,	{ '|' },	ACT(LOR, S_SELF),
215
	OR1,	{ '=' },	ACT(ASOR, S_SELF),
216
 
217
	/* # */
218
	SHARP1,	{ C_XX },	ACT(SHARP, S_SELFB),
219
	SHARP1,	{ '#' },	ACT(DSHARP, S_SELF),
220
 
221
	/* % */
222
	PCT1,	{ C_XX },	ACT(PCT, S_SELFB),
223
	PCT1,	{ '=' },	ACT(ASPCT, S_SELF),
224
 
225
	/* * */
226
	STAR1,	{ C_XX },	ACT(STAR, S_SELFB),
227
	STAR1,	{ '=' },	ACT(ASSTAR, S_SELF),
228
 
229
	/* ^ */
230
	CIRC1,	{ C_XX },	ACT(CIRC, S_SELFB),
231
	CIRC1,	{ '=' },	ACT(ASCIRC, S_SELF),
232
 
233
	-1
234
};
235
 
236
/* first index is char, second is state */
237
/* increase #states to power of 2 to encourage use of shift */
238
short	bigfsm[256][MAXSTATE];
239
 
240
void
241
expandlex(void)
242
{
243
	/*const*/ struct fsm *fp;
244
	int i, j, nstate;
245
 
246
	for (fp = fsm; fp->state>=0; fp++) {
247
		for (i=0; fp->ch[i]; i++) {
248
			nstate = fp->nextstate;
249
			if (nstate >= S_SELF)
250
				nstate = ~nstate;
251
			switch (fp->ch[i]) {
252
 
253
			case C_XX:		/* random characters */
254
				for (j=0; j<256; j++)
255
					bigfsm[j][fp->state] = nstate;
256
				continue;
257
			case C_ALPH:
258
				for (j=0; j<=256; j++)
259
					if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
260
					  || UTF2(j) || UTF3(j) || j=='_')
261
						bigfsm[j][fp->state] = nstate;
262
				continue;
263
			case C_NUM:
264
				for (j='0'; j<='9'; j++)
265
					bigfsm[j][fp->state] = nstate;
266
				continue;
267
			default:
268
				bigfsm[fp->ch[i]][fp->state] = nstate;
269
			}
270
		}
271
	}
272
	/* install special cases for ? (trigraphs),  \ (splicing), runes, and EOB */
273
	for (i=0; i<MAXSTATE; i++) {
274
		for (j=0; j<0xFF; j++)
275
			if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) {
276
				if (bigfsm[j][i]>0)
277
					bigfsm[j][i] = ~bigfsm[j][i];
278
				bigfsm[j][i] &= ~QBSBIT;
279
			}
280
		bigfsm[EOB][i] = ~S_EOB;
281
		if (bigfsm[EOFC][i]>=0)
282
			bigfsm[EOFC][i] = ~S_EOF;
283
	}
284
}
285
 
286
void
287
fixlex(void)
288
{
289
	/* do C++ comments? */
290
	if (Cplusplus==0)
291
		bigfsm['/'][COM1] = bigfsm['x'][COM1];
292
}
293
 
294
/*
295
 * fill in a row of tokens from input, terminated by NL or END
296
 * First token is put at trp->lp.
297
 * Reset is non-zero when the input buffer can be "rewound."
298
 * The value is a flag indicating that possible macros have
299
 * been seen in the row.
300
 */
301
int
302
gettokens(Tokenrow *trp, int reset)
303
{
304
	register int c, state, oldstate;
305
	register uchar *ip;
306
	register Token *tp, *maxp;
307
	int runelen;
308
	Source *s = cursource;
309
	int nmac = 0;
310
	extern char outbuf[];
311
 
312
	tp = trp->lp;
313
	ip = s->inp;
314
	if (reset) {
315
		s->lineinc = 0;
316
		if (ip>=s->inl) {		/* nothing in buffer */
317
			s->inl = s->inb;
318
			fillbuf(s);
319
			ip = s->inp = s->inb;
320
		} else if (ip >= s->inb+(3*s->ins/4)) {
321
			memmove(s->inb, ip, 4+s->inl-ip);
322
			s->inl = s->inb+(s->inl-ip);
323
			ip = s->inp = s->inb;
324
		}
325
	}
326
	maxp = &trp->bp[trp->max];
327
	runelen = 1;
328
	for (;;) {
329
	   continue2:
330
		if (tp>=maxp) {
331
			trp->lp = tp;
332
			tp = growtokenrow(trp);
333
			maxp = &trp->bp[trp->max];
334
		}
335
		tp->type = UNCLASS;
336
		tp->hideset = 0;
337
		tp->t = ip;
338
		tp->wslen = 0;
339
		tp->flag = 0;
340
		state = START;
341
		for (;;) {
342
			oldstate = state;
343
			c = *ip;
344
			if ((state = bigfsm[c][state]) >= 0) {
345
				ip += runelen;
346
				runelen = 1;
347
				continue;
348
			}
349
			state = ~state;
350
		reswitch:
351
			switch (state&0177) {
352
			case S_SELF:
353
				ip += runelen;
354
				runelen = 1;
355
			case S_SELFB:
356
				tp->type = GETACT(state);
357
				tp->len = ip - tp->t;
358
				tp++;
359
				goto continue2;
360
 
361
			case S_NAME:	/* like S_SELFB but with nmac check */
362
				tp->type = NAME;
363
				tp->len = ip - tp->t;
364
				nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0);
365
				tp++;
366
				goto continue2;
367
 
368
			case S_WS:
369
				tp->wslen = ip - tp->t;
370
				tp->t = ip;
371
				state = START;
372
				continue;
373
 
374
			default:
375
				if ((state&QBSBIT)==0) {
376
					ip += runelen;
377
					runelen = 1;
378
					continue;
379
				}
380
				state &= ~QBSBIT;
381
				s->inp = ip;
382
				if (c=='?') { 	/* check trigraph */
383
					if (trigraph(s)) {
384
						state = oldstate;
385
						continue;
386
					}
387
					goto reswitch;
388
				}
389
				if (c=='\\') { /* line-folding */
390
					if (foldline(s)) {
391
						s->lineinc++;
392
						state = oldstate;
393
						continue;
394
					}
395
					goto reswitch;
396
				}
397
				if (UTF2(c)) {
398
					runelen = 2;
399
					goto reswitch;
400
				}
401
				if (UTF3(c)) {
402
					runelen = 3;
403
					goto reswitch;
404
				}
405
				error(WARNING, "Lexical botch in cpp");
406
				ip += runelen;
407
				runelen = 1;
408
				continue;
409
 
410
			case S_EOB:
411
				s->inp = ip;
412
				fillbuf(cursource);
413
				state = oldstate;
414
				continue;
415
 
416
			case S_EOF:
417
				tp->type = END;
418
				tp->len = 0;
419
				s->inp = ip;
420
				if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1)
421
					error(WARNING,"No newline at end of file");
422
				trp->lp = tp+1;
423
				return nmac;
424
 
425
			case S_STNL:
426
				error(ERROR, "Unterminated string or char const");
427
			case S_NL:
428
				tp->t = ip;
429
				tp->type = NL;
430
				tp->len = 1;
431
				tp->wslen = 0;
432
				s->lineinc++;
433
				s->inp = ip+1;
434
				trp->lp = tp+1;
435
				return nmac;
436
 
437
			case S_EOFSTR:
438
				error(FATAL, "EOF in string or char constant");
439
				break;
440
 
441
			case S_COMNL:
442
				s->lineinc++;
443
				state = COM2;
444
				ip += runelen;
445
				runelen = 1;
446
 				if (ip >= s->inb+(7*s->ins/8)) { /* very long comment */
447
					memmove(tp->t, ip, 4+s->inl-ip);
448
					s->inl -= ip-tp->t;
449
					ip = tp->t+1;
450
				}
451
				continue;
452
 
453
			case S_EOFCOM:
454
				error(WARNING, "EOF inside comment");
455
				--ip;
456
			case S_COMMENT:
457
				++ip;
458
				tp->t = ip;
459
				tp->t[-1] = ' ';
460
				tp->wslen = 1;
461
				state = START;
462
				continue;
463
			}
464
			break;
465
		}
466
		ip += runelen;
467
		runelen = 1;
468
		tp->len = ip - tp->t;
469
		tp++;
470
	}
471
}
472
 
473
/* have seen ?; handle the trigraph it starts (if any) else 0 */
474
int
475
trigraph(Source *s)
476
{
477
	int c;
478
 
479
	while (s->inp+2 >= s->inl && fillbuf(s)!=EOF)
480
		;
481
	if (s->inp[1]!='?')
482
		return 0;
483
	c = 0;
484
	switch(s->inp[2]) {
485
	case '=':
486
		c = '#'; break;
487
	case '(':
488
		c = '['; break;
489
	case '/':
490
		c = '\\'; break;
491
	case ')':
492
		c = ']'; break;
493
	case '\'':
494
		c = '^'; break;
495
	case '<':
496
		c = '{'; break;
497
	case '!':
498
		c = '|'; break;
499
	case '>':
500
		c = '}'; break;
501
	case '-':
502
		c = '~'; break;
503
	}
504
	if (c) {
505
		*s->inp = c;
506
		memmove(s->inp+1, s->inp+3, s->inl-s->inp+2);
507
		s->inl -= 2;
508
	}
509
	return c;
510
}
511
 
512
int
513
foldline(Source *s)
514
{
515
	int ncr = 0;
516
 
517
recheck:
518
	while (s->inp+1 >= s->inl && fillbuf(s)!=EOF)
519
		;
520
	if (s->inp[ncr+1] == '\r') {	/* nonstandardly, ignore CR before line-folding */
521
		ncr++;
522
		goto recheck;
523
	}
524
	if (s->inp[ncr+1] == '\n') {
525
		memmove(s->inp, s->inp+2+ncr, s->inl-s->inp+3-ncr);
526
		s->inl -= 2+ncr;
527
		return 1;
528
	}
529
	return 0;
530
}
531
 
532
int
533
fillbuf(Source *s)
534
{
535
	int n;
536
 
537
	while((char *)s->inl+s->ins/8 > (char *)s->inb+s->ins) {
538
		int l = s->inl - s->inb;
539
		int p = s->inp - s->inb;
540
		if(l < 0) 
541
			error(FATAL, "negative end of input!?");
542
		if(p < 0)
543
			error(FATAL, "negative input pointer!?");
544
		/* double the buffer size and try again */
545
		s->ins *= 2;
546
		s->inb = dorealloc(s->inb, s->ins);
547
		s->inl = s->inb + l;
548
		s->inp = s->inb + p;
549
	}
550
	if (s->fd<0 || (n=read(s->fd, (char *)s->inl, s->ins/8)) <= 0)
551
		n = 0;
552
	if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */
553
		*s->inp = EOFC;
554
	s->inl += n;
555
	s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB;
556
	if (n==0) {
557
		s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC;
558
		return EOF;
559
	}
560
	return 0;
561
}
562
 
563
/*
564
 * Push down to new source of characters.
565
 * If fd>0 and str==NULL, then from a file `name';
566
 * if fd==-1 and str, then from the string.
567
 */
568
Source *
569
setsource(char *name, int fd, char *str)
570
{
571
	Source *s = new(Source);
572
	int len;
573
 
574
	s->line = 1;
575
	s->lineinc = 0;
576
	s->fd = fd;
577
	s->filename = name;
578
	s->next = cursource;
579
	s->ifdepth = 0;
580
	cursource = s;
581
	/* slop at right for EOB */
582
	if (str) {
583
		len = strlen(str);
584
		s->inb = domalloc(len+4);
585
		s->inp = s->inb;
586
		strncpy((char *)s->inp, str, len);
587
	} else {
588
		Dir *d;
589
		int junk;
590
		ulong length = 0;
591
		d = dirfstat(fd);
592
		if (d != nil) {
593
			length = d->length;
594
			free(d);
595
		}
596
		junk = length;
597
		if (junk<INS)
598
			junk = INS;
599
		s->inb = domalloc((junk)+4);
600
		s->inp = s->inb;
601
		len = 0;
602
	}
603
 
604
	s->ins = INS;	
605
	s->inl = s->inp+len;
606
	s->inl[0] = s->inl[1] = EOB;
607
	return s;
608
}
609
 
610
void
611
unsetsource(void)
612
{
613
	Source *s = cursource;
614
 
615
	if (s->fd>=0) {
616
		close(s->fd);
617
		dofree(s->inb);
618
	}
619
	cursource = s->next;
620
	dofree(s);
621
}