Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <draw.h>
4
#include <ctype.h>
5
#include <html.h>
6
#include "impl.h"
7
 
8
typedef struct TokenSource TokenSource;
9
struct TokenSource
10
{
11
	int			i;		// index of next byte to use
12
	uchar*		data;		// all the data
13
	int			edata;	// data[0:edata] is valid
14
	int			chset;	// one of US_Ascii, etc.
15
	int			mtype;	// TextHtml or TextPlain
16
};
17
 
18
enum {
19
	EOF = -2,
20
	EOB = -1
21
};
22
 
23
#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
24
 
25
#define SMALLBUFSIZE 240
26
#define BIGBUFSIZE 2000
27
 
28
// HTML 4.0 tag names.
29
// Keep sorted, and in correspondence with enum in impl.h.
30
Rune* tagnames[] = {
31
	L" ",
32
	L"!",
33
	L"a", 
34
	L"abbr",
35
	L"acronym",
36
	L"address",
37
	L"applet", 
38
	L"area",
39
	L"b",
40
	L"base",
41
	L"basefont",
42
	L"bdo",
43
	L"big",
44
	L"blink",
45
	L"blockquote",
46
	L"body",
47
	L"bq",
48
	L"br",
49
	L"button",
50
	L"caption",
51
	L"center",
52
	L"cite",
53
	L"code",
54
	L"col",
55
	L"colgroup",
56
	L"dd",
57
	L"del",
58
	L"dfn",
59
	L"dir",
60
	L"div",
61
	L"dl",
62
	L"dt",
63
	L"em",
64
	L"fieldset",
65
	L"font",
66
	L"form",
67
	L"frame",
68
	L"frameset",
69
	L"h1",
70
	L"h2",
71
	L"h3",
72
	L"h4",
73
	L"h5",
74
	L"h6",
75
	L"head",
76
	L"hr",
77
	L"html",
78
	L"i",
79
	L"iframe",
80
	L"img",
81
	L"input",
82
	L"ins",
83
	L"isindex",
84
	L"kbd",
85
	L"label",
86
	L"legend",
87
	L"li",
88
	L"link",
89
	L"map",
90
	L"menu",
91
	L"meta",
92
	L"nobr",
93
	L"noframes",
94
	L"noscript",
95
	L"object",
96
	L"ol",
97
	L"optgroup",
98
	L"option",
99
	L"p",
100
	L"param",
101
	L"pre",
102
	L"q",
103
	L"s",
104
	L"samp",
105
	L"script",
106
	L"select",
107
	L"small",
108
	L"span",
109
	L"strike",
110
	L"strong",
111
	L"style",
112
	L"sub",
113
	L"sup",
114
	L"table",
115
	L"tbody",
116
	L"td",
117
	L"textarea",
118
	L"tfoot",
119
	L"th",
120
	L"thead",
121
	L"title",
122
	L"tr",
123
	L"tt",
124
	L"u",
125
	L"ul",
126
	L"var"
127
};
128
 
129
// HTML 4.0 attribute names.
130
// Keep sorted, and in correspondence with enum in impl.h.
131
Rune* attrnames[] = {
132
	L"abbr",
133
	L"accept-charset",
134
	L"access-key",
135
	L"action",
136
	L"align",
137
	L"alink",
138
	L"alt",
139
	L"archive",
140
	L"axis",
141
	L"background",
142
	L"bgcolor",
143
	L"border",
144
	L"cellpadding",
145
	L"cellspacing",
146
	L"char",
147
	L"charoff",
148
	L"charset",
149
	L"checked",
150
	L"cite",
151
	L"class",
152
	L"classid",
153
	L"clear",
154
	L"code",
155
	L"codebase",
156
	L"codetype",
157
	L"color",
158
	L"cols",
159
	L"colspan",
160
	L"compact",
161
	L"content",
162
	L"coords",
163
	L"data",
164
	L"datetime",
165
	L"declare",
166
	L"defer",
167
	L"dir",
168
	L"disabled",
169
	L"enctype",
170
	L"face",
171
	L"for",
172
	L"frame",
173
	L"frameborder",
174
	L"headers",
175
	L"height",
176
	L"href",
177
	L"hreflang",
178
	L"hspace",
179
	L"http-equiv",
180
	L"id",
181
	L"ismap",
182
	L"label",
183
	L"lang",
184
	L"link",
185
	L"longdesc",
186
	L"marginheight",
187
	L"marginwidth",
188
	L"maxlength",
189
	L"media",
190
	L"method",
191
	L"multiple",
192
	L"name",
193
	L"nohref",
194
	L"noresize",
195
	L"noshade",
196
	L"nowrap",
197
	L"object",
198
	L"onblur",
199
	L"onchange",
200
	L"onclick",
201
	L"ondblclick",
202
	L"onfocus",
203
	L"onkeypress",
204
	L"onkeyup",
205
	L"onload",
206
	L"onmousedown",
207
	L"onmousemove",
208
	L"onmouseout",
209
	L"onmouseover",
210
	L"onmouseup",
211
	L"onreset",
212
	L"onselect",
213
	L"onsubmit",
214
	L"onunload",
215
	L"profile",
216
	L"prompt",
217
	L"readonly",
218
	L"rel",
219
	L"rev",
220
	L"rows",
221
	L"rowspan",
222
	L"rules",
223
	L"scheme",
224
	L"scope",
225
	L"scrolling",
226
	L"selected",
227
	L"shape",
228
	L"size",
229
	L"span",
230
	L"src",
231
	L"standby",
232
	L"start",
233
	L"style",
234
	L"summary",
235
	L"tabindex",
236
	L"target",
237
	L"text",
238
	L"title",
239
	L"type",
240
	L"usemap",
241
	L"valign",
242
	L"value",
243
	L"valuetype",
244
	L"version",
245
	L"vlink",
246
	L"vspace",
247
	L"width"
248
};
249
 
250
 
251
// Character entity to unicode character number map.
252
// Keep sorted by name.
253
StringInt	chartab[]= {
254
	{L"AElig", 198},
255
	{L"Aacute", 193},
256
	{L"Acirc", 194},
257
	{L"Agrave", 192},
258
	{L"Alpha", 913},
259
	{L"Aring", 197},
260
	{L"Atilde", 195},
261
	{L"Auml", 196},
262
	{L"Beta", 914},
263
	{L"Ccedil", 199},
264
	{L"Chi", 935},
265
	{L"Dagger", 8225},
266
	{L"Delta", 916},
267
	{L"ETH", 208},
268
	{L"Eacute", 201},
269
	{L"Ecirc", 202},
270
	{L"Egrave", 200},
271
	{L"Epsilon", 917},
272
	{L"Eta", 919},
273
	{L"Euml", 203},
274
	{L"Gamma", 915},
275
	{L"Iacute", 205},
276
	{L"Icirc", 206},
277
	{L"Igrave", 204},
278
	{L"Iota", 921},
279
	{L"Iuml", 207},
280
	{L"Kappa", 922},
281
	{L"Lambda", 923},
282
	{L"Mu", 924},
283
	{L"Ntilde", 209},
284
	{L"Nu", 925},
285
	{L"OElig", 338},
286
	{L"Oacute", 211},
287
	{L"Ocirc", 212},
288
	{L"Ograve", 210},
289
	{L"Omega", 937},
290
	{L"Omicron", 927},
291
	{L"Oslash", 216},
292
	{L"Otilde", 213},
293
	{L"Ouml", 214},
294
	{L"Phi", 934},
295
	{L"Pi", 928},
296
	{L"Prime", 8243},
297
	{L"Psi", 936},
298
	{L"Rho", 929},
299
	{L"Scaron", 352},
300
	{L"Sigma", 931},
301
	{L"THORN", 222},
302
	{L"Tau", 932},
303
	{L"Theta", 920},
304
	{L"Uacute", 218},
305
	{L"Ucirc", 219},
306
	{L"Ugrave", 217},
307
	{L"Upsilon", 933},
308
	{L"Uuml", 220},
309
	{L"Xi", 926},
310
	{L"Yacute", 221},
311
	{L"Yuml", 376},
312
	{L"Zeta", 918},
313
	{L"aacute", 225},
314
	{L"acirc", 226},
315
	{L"acute", 180},
316
	{L"aelig", 230},
317
	{L"agrave", 224},
318
	{L"alefsym", 8501},
319
	{L"alpha", 945},
320
	{L"amp", 38},
321
	{L"and", 8743},
322
	{L"ang", 8736},
323
	{L"aring", 229},
324
	{L"asymp", 8776},
325
	{L"atilde", 227},
326
	{L"auml", 228},
327
	{L"bdquo", 8222},
328
	{L"beta", 946},
329
	{L"brvbar", 166},
330
	{L"bull", 8226},
331
	{L"cap", 8745},
332
	{L"ccedil", 231},
333
	{L"cdots", 8943},
334
	{L"cedil", 184},
335
	{L"cent", 162},
336
	{L"chi", 967},
337
	{L"circ", 710},
338
	{L"clubs", 9827},
339
	{L"cong", 8773},
340
	{L"copy", 169},
341
	{L"crarr", 8629},
342
	{L"cup", 8746},
343
	{L"curren", 164},
344
	{L"dArr", 8659},
345
	{L"dagger", 8224},
346
	{L"darr", 8595},
347
	{L"ddots", 8945},
348
	{L"deg", 176},
349
	{L"delta", 948},
350
	{L"diams", 9830},
351
	{L"divide", 247},
352
	{L"eacute", 233},
353
	{L"ecirc", 234},
354
	{L"egrave", 232},
355
	{L"emdash", 8212},	/* non-standard but commonly used */
356
	{L"empty", 8709},
357
	{L"emsp", 8195},
358
	{L"endash", 8211},	/* non-standard but commonly used */
359
	{L"ensp", 8194},
360
	{L"epsilon", 949},
361
	{L"equiv", 8801},
362
	{L"eta", 951},
363
	{L"eth", 240},
364
	{L"euml", 235},
365
	{L"euro", 8364},
366
	{L"exist", 8707},
367
	{L"fnof", 402},
368
	{L"forall", 8704},
369
	{L"frac12", 189},
370
	{L"frac14", 188},
371
	{L"frac34", 190},
372
	{L"frasl", 8260},
373
	{L"gamma", 947},
374
	{L"ge", 8805},
375
	{L"gt", 62},
376
	{L"hArr", 8660},
377
	{L"harr", 8596},
378
	{L"hearts", 9829},
379
	{L"hellip", 8230},
380
	{L"iacute", 237},
381
	{L"icirc", 238},
382
	{L"iexcl", 161},
383
	{L"igrave", 236},
384
	{L"image", 8465},
385
	{L"infin", 8734},
386
	{L"int", 8747},
387
	{L"iota", 953},
388
	{L"iquest", 191},
389
	{L"isin", 8712},
390
	{L"iuml", 239},
391
	{L"kappa", 954},
392
	{L"lArr", 8656},
393
	{L"lambda", 955},
394
	{L"lang", 9001},
395
	{L"laquo", 171},
396
	{L"larr", 8592},
397
	{L"lceil", 8968},
398
	{L"ldots", 8230},
399
	{L"ldquo", 8220},
400
	{L"le", 8804},
401
	{L"lfloor", 8970},
402
	{L"lowast", 8727},
403
	{L"loz", 9674},
404
	{L"lrm", 8206},
405
	{L"lsaquo", 8249},
406
	{L"lsquo", 8216},
407
	{L"lt", 60},
408
	{L"macr", 175},
409
	{L"mdash", 8212},
410
	{L"micro", 181},
411
	{L"middot", 183},
412
	{L"minus", 8722},
413
	{L"mu", 956},
414
	{L"nabla", 8711},
415
	{L"nbsp", 160},
416
	{L"ndash", 8211},
417
	{L"ne", 8800},
418
	{L"ni", 8715},
419
	{L"not", 172},
420
	{L"notin", 8713},
421
	{L"nsub", 8836},
422
	{L"ntilde", 241},
423
	{L"nu", 957},
424
	{L"oacute", 243},
425
	{L"ocirc", 244},
426
	{L"oelig", 339},
427
	{L"ograve", 242},
428
	{L"oline", 8254},
429
	{L"omega", 969},
430
	{L"omicron", 959},
431
	{L"oplus", 8853},
432
	{L"or", 8744},
433
	{L"ordf", 170},
434
	{L"ordm", 186},
435
	{L"oslash", 248},
436
	{L"otilde", 245},
437
	{L"otimes", 8855},
438
	{L"ouml", 246},
439
	{L"para", 182},
440
	{L"part", 8706},
441
	{L"permil", 8240},
442
	{L"perp", 8869},
443
	{L"phi", 966},
444
	{L"pi", 960},
445
	{L"piv", 982},
446
	{L"plusmn", 177},
447
	{L"pound", 163},
448
	{L"prime", 8242},
449
	{L"prod", 8719},
450
	{L"prop", 8733},
451
	{L"psi", 968},
452
	{L"quad", 8193},
453
	{L"quot", 34},
454
	{L"rArr", 8658},
455
	{L"radic", 8730},
456
	{L"rang", 9002},
457
	{L"raquo", 187},
458
	{L"rarr", 8594},
459
	{L"rceil", 8969},
460
	{L"rdquo", 8221},
461
	{L"real", 8476},
462
	{L"reg", 174},
463
	{L"rfloor", 8971},
464
	{L"rho", 961},
465
	{L"rlm", 8207},
466
	{L"rsaquo", 8250},
467
	{L"rsquo", 8217},
468
	{L"sbquo", 8218},
469
	{L"scaron", 353},
470
	{L"sdot", 8901},
471
	{L"sect", 167},
472
	{L"shy", 173},
473
	{L"sigma", 963},
474
	{L"sigmaf", 962},
475
	{L"sim", 8764},
476
	{L"sp", 8194},
477
	{L"spades", 9824},
478
	{L"sub", 8834},
479
	{L"sube", 8838},
480
	{L"sum", 8721},
481
	{L"sup", 8835},
482
	{L"sup1", 185},
483
	{L"sup2", 178},
484
	{L"sup3", 179},
485
	{L"supe", 8839},
486
	{L"szlig", 223},
487
	{L"tau", 964},
488
	{L"there4", 8756},
489
	{L"theta", 952},
490
	{L"thetasym", 977},
491
	{L"thinsp", 8201},
492
	{L"thorn", 254},
493
	{L"tilde", 732},
494
	{L"times", 215},
495
	{L"trade", 8482},
496
	{L"uArr", 8657},
497
	{L"uacute", 250},
498
	{L"uarr", 8593},
499
	{L"ucirc", 251},
500
	{L"ugrave", 249},
501
	{L"uml", 168},
502
	{L"upsih", 978},
503
	{L"upsilon", 965},
504
	{L"uuml", 252},
505
	{L"varepsilon", 8712},
506
	{L"varphi", 981},
507
	{L"varpi", 982},
508
	{L"varrho", 1009},
509
	{L"vdots", 8942},
510
	{L"vsigma", 962},
511
	{L"vtheta", 977},
512
	{L"weierp", 8472},
513
	{L"xi", 958},
514
	{L"yacute", 253},
515
	{L"yen", 165},
516
	{L"yuml", 255},
517
	{L"zeta", 950},
518
	{L"zwj", 8205},
519
	{L"zwnj", 8204}
520
};
521
#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
522
 
523
// Characters Winstart..Winend are those that Windows
524
// uses interpolated into the Latin1 set.
525
// They aren't supposed to appear in HTML, but they do....
526
enum {
527
	Winstart = 127,
528
	Winend = 159
529
};
530
 
531
static int	winchars[]= { 8226,	// 8226 is a bullet
532
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
533
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
534
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
535
	732, 8482, 353, 8250, 339, 8226, 8226, 376};
536
 
537
static StringInt*	tagtable;		// initialized from tagnames
538
static StringInt*	attrtable;		// initialized from attrnames
539
 
540
static void	lexinit(void);
541
static int		getplaindata(TokenSource* ts, Token* a, int* pai);
542
static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
543
static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
544
static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
545
static Rune*	buftostr(Rune* s, Rune* buf, int j);
546
static int		comment(TokenSource* ts);
547
static int		findstr(TokenSource* ts, Rune* s);
548
static int		ampersand(TokenSource* ts);
549
static int		lowerc(int c);
550
static int		getchar(TokenSource* ts);
551
static void		ungetchar(TokenSource* ts, int c);
552
static void		backup(TokenSource* ts, int savei);
553
static void		freeinsidetoken(Token* t);
554
static void		freeattrs(Attr* ahead);
555
static Attr*	newattr(int attid, Rune* value, Attr* link);
556
static int		Tconv(Fmt* f);
557
 
558
int	dbglex = 0;
559
static int lexinited = 0;
560
 
561
static void
562
lexinit(void)
563
{
564
	tagtable = _makestrinttab(tagnames, Numtags);
565
	attrtable = _makestrinttab(attrnames, Numattrs);
566
	fmtinstall('T', Tconv);
567
	lexinited = 1;
568
}
569
 
570
static TokenSource*
571
newtokensource(uchar* data, int edata, int chset, int mtype)
572
{
573
	TokenSource*	ans;
574
 
575
	assert(chset == US_Ascii || chset == ISO_8859_1 ||
576
			chset == UTF_8 || chset == Unicode);
577
	ans = (TokenSource*)emalloc(sizeof(TokenSource));
578
	ans->i = 0;
579
	ans->data = data;
580
	ans->edata = edata;
581
	ans->chset = chset;
582
	ans->mtype = mtype;
583
	return ans;
584
}
585
 
586
enum {
587
	ToksChunk = 500,
588
};
589
 
590
// Call this to get the tokens.
591
//  The number of returned tokens is returned in *plen.
592
Token*
593
_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
594
{
595
	TokenSource*	ts;
596
	Token*		a;
597
	int	alen;
598
	int	ai;
599
	int	starti;
600
	int	c;
601
	int	tag;
602
 
603
	if(!lexinited)
604
		lexinit();
605
	ts = newtokensource(data, datalen, chset, mtype);
606
	if(dbglex)
607
		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
608
	alen = 0;
609
	ai = 0;
610
	a = 0;
611
	if(ts->mtype == TextHtml) {
612
		for(;;) {
613
			if(alen - ai < ToksChunk/32) {
614
				alen += ToksChunk;
615
				a = erealloc(a, alen*sizeof *a);
616
			}
617
			starti = ts->i;
618
			c = getchar(ts);
619
			if(c < 0)
620
				break;
621
			if(c == '<') {
622
				tag = gettag(ts, starti, a, &ai);
623
				if(tag == Tscript || tag == Tstyle) {
624
					// special rules for getting Data after....
625
					starti = ts->i;
626
					c = getchar(ts);
627
					tag = getscriptdata(ts, c, starti, a, &ai, tag);
628
				}
629
			}
630
			else
631
				tag = getdata(ts, c, starti, a, &ai);
632
			if(tag == -1)
633
				break;
634
			else if(dbglex > 1 && tag != Comment)
635
				fprint(2, "lex: got token %T\n", &a[ai-1]);
636
		}
637
	}
638
	else {
639
		// plain text (non-html) tokens
640
		for(;;) {
641
			if(alen - ai < ToksChunk/32) {
642
				alen += ToksChunk;
643
				a = erealloc(a, alen*sizeof *a);
644
			}
645
			tag = getplaindata(ts, a, &ai);
646
			if(tag == -1)
647
				break;
648
			if(dbglex > 1)
649
				fprint(2, "lex: got token %T\n", &a[ai]);
650
		}
651
	}
652
	free(ts);
653
	if(dbglex)
654
		fprint(2, "lex: returning %d tokens\n", ai);
655
	*plen = ai;
656
	if(ai == 0){
657
		free(a);
658
		a = 0;
659
	}
660
	return a;
661
}
662
 
663
// For case where source isn't HTML.
664
// Just make data tokens, one per line (or partial line,
665
// at end of buffer), ignoring non-whitespace control
666
// characters and dumping \r's.
667
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
668
// Otherwise return -1;
669
static int
670
getplaindata(TokenSource* ts, Token* a, int* pai)
671
{
672
	Rune*	s;
673
	int	j;
674
	int	starti;
675
	int	c;
676
	Token*	tok;
677
	Rune	buf[BIGBUFSIZE];
678
 
679
	s = nil;
680
	j = 0;
681
	starti = ts->i;
682
	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
683
		if(c < ' ') {
684
			if(isspace(c)) {
685
				if(c == '\r') {
686
					// ignore it unless no following '\n',
687
					// in which case treat it like '\n'
688
					c = getchar(ts);
689
					if(c != '\n') {
690
						if(c >= 0)
691
							ungetchar(ts, c);
692
						c = '\n';
693
					}
694
				}
695
			}
696
			else
697
				c = 0;
698
		}
699
		if(c != 0) {
700
			buf[j++] = c;
701
			if(j == nelem(buf)-1) {
702
				s = buftostr(s, buf, j);
703
				j = 0;
704
			}
705
		}
706
		if(c == '\n')
707
			break;
708
	}
709
	s = buftostr(s, buf, j);
710
	if(s == nil)
711
		return -1;
712
	tok = &a[(*pai)++];
713
	tok->tag = Data;
714
	tok->text = s;
715
	tok->attr = nil;
716
	tok->starti = starti;
717
	return Data;
718
}
719
 
720
// Return concatenation of s and buf[0:j]
721
static Rune*
722
buftostr(Rune* s, Rune* buf, int j)
723
{
724
	int i;
725
 
726
	if(s == nil)
727
		s = _Strndup(buf, j);
728
	else {
729
		i = _Strlen(s);
730
		s = realloc(s, ( i+j+1)*sizeof *s);
731
		memcpy(&s[i], buf, j*sizeof *s);
732
		s[i+j] = 0;
733
	}
734
	return s;
735
}
736
 
737
// Gather data up to next start-of-tag or end-of-buffer.
738
// Translate entity references (&amp;).
739
// Ignore non-whitespace control characters and get rid of \r's.
740
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
741
// Otherwise return -1;
742
static int
743
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
744
{
745
	Rune*	s;
746
	int	j;
747
	int	c;
748
	Token*	tok;
749
	Rune	buf[SMALLBUFSIZE];
750
 
751
	s = nil;
752
	j = 0;
753
	for(c = firstc; c >= 0; c = getchar(ts)){
754
		if(c == '&') {
755
			c = ampersand(ts);
756
			if(c < 0)
757
				break;
758
		}
759
		else if(c < ' ') {
760
			if(isspace(c)) {
761
				if(c == '\r') {
762
					// ignore it unless no following '\n',
763
					// in which case treat it like '\n'
764
					c = getchar(ts);
765
					if(c != '\n') {
766
						if(c >= 0)
767
							ungetchar(ts, c);
768
						c = '\n';
769
					}
770
				}
771
			}
772
			else {
773
				if(warn)
774
					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
775
				c = 0;
776
			}
777
		}
778
		else if(c == '<') {
779
			ungetchar(ts, c);
780
			break;
781
		}
782
		if(c != 0) {
783
			buf[j++] = c;
784
			if(j == nelem(buf)-1) {
785
				s = buftostr(s, buf, j);
786
				j = 0;
787
			}
788
		}
789
	}
790
	s = buftostr(s, buf, j);
791
	if(s == nil)
792
		return -1;
793
	tok = &a[(*pai)++];
794
	tok->tag = Data;
795
	tok->text = s;
796
	tok->attr = nil;
797
	tok->starti = starti;
798
	return Data;
799
}
800
 
801
// The rules for lexing scripts are different (ugh).
802
// Gather up everything until see an "</" tagnames[tok] ">"
803
static int
804
getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
805
{
806
	Rune*	s;
807
	int	j;
808
	int	tstarti;
809
	int	savei;
810
	int	c;
811
	int	tag;
812
	int	done;
813
	Token*	tok;
814
	Rune	buf[BIGBUFSIZE];
815
 
816
	s = nil;
817
	j = 0;
818
	tstarti = starti;
819
	c = firstc;
820
	done = 0;
821
	while(c >= 0) {
822
		if(c == '<') {
823
			// other browsers ignore stuff to end of line after <!
824
			savei = ts->i;
825
			c = getchar(ts);
826
			if(c == '!') {
827
				if(comment(ts) == -1)
828
					break;
829
				if(c == '\r')
830
					c = getchar(ts);
831
				if(c == '\n')
832
					c = getchar(ts);
833
			}
834
			else if(c >= 0) {
835
				backup(ts, savei);
836
				tag = gettag(ts, tstarti, a, pai);
837
				if(tag == -1)
838
					break;
839
				if(tag != Comment)
840
					(*pai)--;
841
				backup(ts, tstarti);
842
				if(tag == findtag + RBRA) {
843
					done = 1;
844
					break;
845
				}
846
				// here tag was not the one we were looking for, so take as regular data
847
				c = getchar(ts);
848
			}
849
		}
850
		if(c < 0)
851
			break;
852
		if(c != 0) {
853
			buf[j++] = c;
854
			if(j == nelem(buf)-1) {
855
				s = buftostr(s, buf, j);
856
				j = 0;
857
			}
858
		}
859
		tstarti = ts->i;
860
		c = getchar(ts);
861
	}
862
	if(done || ts->i == ts->edata) {
863
		s = buftostr(s, buf, j);
864
		tok = &a[(*pai)++];
865
		tok->tag = Data;
866
		tok->text = s;
867
		tok->attr = nil;
868
		tok->starti = starti;
869
		return Data;
870
	}
871
	free(s);
872
	backup(ts, starti);
873
	return -1;
874
}
875
 
876
// We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
877
// ends before then, return -1).
878
// If it's a tag, look up the name, gather the attributes, and return
879
// the appropriate token.
880
// Else it's either just plain data or some kind of ignorable stuff:
881
// return Data or Comment as appropriate.
882
// If it's not a Comment, put it in a[*pai] and bump *pai.
883
static int
884
gettag(TokenSource* ts, int starti, Token* a, int* pai)
885
{
886
	int	rbra;
887
	int	ans;
888
	Attr*	al;
889
	int	nexti;
890
	int	c;
891
	int	ti;
892
	int	afnd;
893
	int	attid;
894
	int	quote;
895
	Rune*	val;
896
	int	nv;
897
	int	i;
898
	int	tag;
899
	Token*	tok;
900
	Rune	buf[BIGBUFSIZE];
901
 
902
	rbra = 0;
903
	nexti = ts->i;
904
	tok = &a[*pai];
905
	tok->tag = Notfound;
906
	tok->text = nil;
907
	tok->attr = nil;
908
	tok->starti = starti;
909
	c = getchar(ts);
910
	if(c == '/') {
911
		rbra = RBRA;
912
		c = getchar(ts);
913
	}
914
	if(c < 0)
915
		goto eob_done;
916
	if(c >= 256 || !isalpha(c)) {
917
		// not a tag
918
		if(c == '!') {
919
			ans = comment(ts);
920
			if(ans != -1)
921
				return ans;
922
			goto eob_done;
923
		}
924
		else {
925
			backup(ts, nexti);
926
			tok->tag = Data;
927
			tok->text = _Strdup(L"<");
928
			(*pai)++;
929
			return Data;
930
		}
931
	}
932
	// c starts a tagname
933
	buf[0] = c;
934
	i = 1;
935
	while(1) {
936
		c = getchar(ts);
937
		if(c < 0)
938
			goto eob_done;
939
		if(!ISNAMCHAR(c))
940
			break;
941
		// if name is bigger than buf it won't be found anyway...
942
		if(i < BIGBUFSIZE)
943
			buf[i++] = c;
944
	}
945
	if(_lookup(tagtable, Numtags, buf, i, &tag))
946
		tok->tag = tag + rbra;
947
	else
948
		tok->text = _Strndup(buf, i);	// for warning print, in build
949
	// attribute gathering loop
950
	al = nil;
951
	while(1) {
952
		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
953
		// skip whitespace
954
attrloop_continue:
955
		while(c < 256 && isspace(c)) {
956
			c = getchar(ts);
957
			if(c < 0)
958
				goto eob_done;
959
		}
960
		if(c == '>')
961
			goto attrloop_done;
962
		if(c == '<') {
963
			if(warn)
964
				fprint(2, "warning: unclosed tag\n");
965
			ungetchar(ts, c);
966
			goto attrloop_done;
967
		}
968
		if(c >= 256 || !isalpha(c)) {
969
			if(warn)
970
				fprint(2, "warning: expected attribute name\n");
971
			// skipt to next attribute name
972
			while(1) {
973
				c = getchar(ts);
974
				if(c < 0)
975
					goto eob_done;
976
				if(c < 256 && isalpha(c))
977
					goto attrloop_continue;
978
				if(c == '<') {
979
					if(warn)
980
						fprint(2, "warning: unclosed tag\n");
981
					ungetchar(ts, 60);
982
					goto attrloop_done;
983
				}
984
				if(c == '>')
985
					goto attrloop_done;
986
			}
987
		}
988
		// gather attribute name
989
		buf[0] = c;
990
		i = 1;
991
		while(1) {
992
			c = getchar(ts);
993
			if(c < 0)
994
				goto eob_done;
995
			if(!ISNAMCHAR(c))
996
				break;
997
			if(i < BIGBUFSIZE-1)
998
				buf[i++] = c;
999
		}
1000
		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1001
		if(warn && !afnd) {
1002
			buf[i] = 0;
1003
			fprint(2, "warning: unknown attribute name %S\n", buf);
1004
		}
1005
		// skip whitespace
1006
		while(c < 256 && isspace(c)) {
1007
			c = getchar(ts);
1008
			if(c < 0)
1009
				goto eob_done;
1010
		}
1011
		if(c != '=') {
1012
			if(afnd)
1013
				al = newattr(attid, nil, al);
1014
			goto attrloop_continue;
1015
		}
1016
		//# c is '=' here;  skip whitespace
1017
		while(1) {
1018
			c = getchar(ts);
1019
			if(c < 0)
1020
				goto eob_done;
1021
			if(c >= 256 || !isspace(c))
1022
				break;
1023
		}
1024
		quote = 0;
1025
		if(c == '\'' || c == '"') {
1026
			quote = c;
1027
			c = getchar(ts);
1028
			if(c < 0)
1029
				goto eob_done;
1030
		}
1031
		val = nil;
1032
		nv = 0;
1033
		while(1) {
1034
valloop_continue:
1035
			if(c < 0)
1036
				goto eob_done;
1037
			if(c == '>') {
1038
				if(quote) {
1039
					// c might be part of string (though not good style)
1040
					// but if line ends before close quote, assume
1041
					// there was an unmatched quote
1042
					ti = ts->i;
1043
					while(1) {
1044
						c = getchar(ts);
1045
						if(c < 0)
1046
							goto eob_done;
1047
						if(c == quote) {
1048
							backup(ts, ti);
1049
							buf[nv++] = '>';
1050
							if(nv == BIGBUFSIZE-1) {
1051
								val = buftostr(val, buf, nv);
1052
								nv = 0;
1053
							}
1054
							c = getchar(ts);
1055
							goto valloop_continue;
1056
						}
1057
						if(c == '\n') {
1058
							if(warn)
1059
								fprint(2, "warning: apparent unmatched quote\n");
1060
							backup(ts, ti);
1061
							c = '>';
1062
							goto valloop_done;
1063
						}
1064
					}
1065
				}
1066
				else
1067
					goto valloop_done;
1068
			}
1069
			if(quote) {
1070
				if(c == quote) {
1071
					c = getchar(ts);
1072
					if(c < 0)
1073
						goto eob_done;
1074
					goto valloop_done;
1075
				}
1076
				if(c == '\r') {
1077
					c = getchar(ts);
1078
					goto valloop_continue;
1079
				}
1080
				if(c == '\t' || c == '\n')
1081
					c = ' ';
1082
			}
1083
			else {
1084
				if(c < 256 && isspace(c))
1085
					goto valloop_done;
1086
			}
1087
			if(c == '&') {
1088
				c = ampersand(ts);
1089
				if(c == -1)
1090
					goto eob_done;
1091
			}
1092
			buf[nv++] = c;
1093
			if(nv == BIGBUFSIZE-1) {
1094
				val = buftostr(val, buf, nv);
1095
				nv = 0;
1096
			}
1097
			c = getchar(ts);
1098
		}
1099
valloop_done:
1100
		if(afnd) {
1101
			val = buftostr(val, buf, nv);
1102
			al = newattr(attid, val, al);
1103
		}
1104
	}
1105
 
1106
attrloop_done:
1107
	tok->attr = al;
1108
	(*pai)++;
1109
	return tok->tag;
1110
 
1111
eob_done:
1112
	if(warn)
1113
		fprint(2, "warning: incomplete tag at end of page\n");
1114
	backup(ts, nexti);
1115
	tok->tag = Data;
1116
	tok->text = _Strdup(L"<");
1117
	return Data;
1118
}
1119
 
1120
// We've just read a '<!' at position starti,
1121
// so this may be a comment or other ignored section, or it may
1122
// be just a literal string if there is no close before end of file
1123
// (other browsers do that).
1124
// The accepted practice seems to be (note: contrary to SGML spec!):
1125
// If see <!--, look for --> to close, or if none, > to close.
1126
// If see <!(not --), look for > to close.
1127
// If no close before end of file, leave original characters in as literal data.
1128
//
1129
// If we see ignorable stuff, return Comment.
1130
// Else return nil (caller should back up and try again when more data arrives,
1131
// unless at end of file, in which case caller should just make '<' a data token).
1132
static int
1133
comment(TokenSource* ts)
1134
{
1135
	int	nexti;
1136
	int	havecomment;
1137
	int	c;
1138
 
1139
	nexti = ts->i;
1140
	havecomment = 0;
1141
	c = getchar(ts);
1142
	if(c == '-') {
1143
		c = getchar(ts);
1144
		if(c == '-') {
1145
			if(findstr(ts, L"-->"))
1146
				havecomment = 1;
1147
			else
1148
				backup(ts, nexti);
1149
		}
1150
	}
1151
	if(!havecomment) {
1152
		if(c == '>')
1153
			havecomment = 1;
1154
		else if(c >= 0) {
1155
			if(findstr(ts, L">"))
1156
				havecomment = 1;
1157
		}
1158
	}
1159
	if(havecomment)
1160
		return Comment;
1161
	return -1;
1162
}
1163
 
1164
// Look for string s in token source.
1165
// If found, return 1, with buffer at next char after s,
1166
// else return 0 (caller should back up).
1167
static int
1168
findstr(TokenSource* ts, Rune* s)
1169
{
1170
	int	c0;
1171
	int	n;
1172
	int	nexti;
1173
	int	i;
1174
	int	c;
1175
 
1176
	c0 = s[0];
1177
	n = runestrlen(s);
1178
	while(1) {
1179
		c = getchar(ts);
1180
		if(c < 0)
1181
			break;
1182
		if(c == c0) {
1183
			if(n == 1)
1184
				return 1;
1185
			nexti = ts->i;
1186
			for(i = 1; i < n; i++) {
1187
				c = getchar(ts);
1188
				if(c < 0)
1189
					goto mainloop_done;
1190
				if(c != s[i])
1191
					break;
1192
			}
1193
			if(i == n)
1194
				return 1;
1195
			backup(ts, nexti);
1196
		}
1197
	}
1198
mainloop_done:
1199
	return 0;
1200
}
1201
 
1202
// We've just read an '&'; look for an entity reference
1203
// name, and if found, return translated char.
1204
// if there is a complete entity name but it isn't known,
1205
// back up to just past the '&' and return '&'.
1206
// If the entity can't be completed in the current buffer, back up
1207
// to the '&' and return -1.
1208
static int
1209
ampersand(TokenSource* ts)
1210
{
1211
	int	savei;
1212
	int	c;
1213
	int	fnd;
1214
	int	ans;
1215
	int	v;
1216
	int	k;
1217
	Rune	buf[25];
1218
 
1219
	savei = ts->i;
1220
	c = getchar(ts);
1221
	fnd = 0;
1222
	ans = -1;
1223
	if(c == '#') {
1224
		c = getchar(ts);
1225
		v = 0;
1226
		if(c == 'X' || c == 'x')
1227
			for(c = getchar(ts); c < 256; c = getchar(ts))
1228
				if(c >= '0' && c <= '9')
1229
					v = v*16+c-'0';
1230
				else if(c >= 'A' && c<= 'F')
1231
					v = v*16+c-'A'+10;
1232
				else if(c >= 'a' && c <= 'f')
1233
					v = v*16+c-'a'+10;
1234
				else
1235
					break;
1236
		else
1237
			while(c >= 0) {
1238
				if(!(c < 256 && isdigit(c)))
1239
					break;
1240
				v = v*10 + c - 48;
1241
				c = getchar(ts);
1242
			}
1243
		if(c >= 0) {
1244
			if(!(c == ';' || c == '\n' || c == '\r'))
1245
				ungetchar(ts, c);
1246
			c = v;
1247
			if(c == 160)
1248
				c = 160;
1249
			if(c >= Winstart && c <= Winend) {
1250
				c = winchars[c - Winstart];
1251
			}
1252
			ans = c;
1253
			fnd = 1;
1254
		}
1255
	}
1256
	else if(c < 256 && isalpha(c)) {
1257
		buf[0] = c;
1258
		k = 1;
1259
		while(1) {
1260
			c = getchar(ts);
1261
			if(c < 0)
1262
				break;
1263
			if(c < 256 && (isalpha(c) || isdigit(c))) {
1264
				if(k < nelem(buf)-1)
1265
					buf[k++] = c;
1266
			}
1267
			else {
1268
				if(!(c == ';' || c == '\n' || c == '\r'))
1269
					ungetchar(ts, c);
1270
				break;
1271
			}
1272
		}
1273
		if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1274
			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1275
	}
1276
	if(!fnd) {
1277
		backup(ts, savei);
1278
		ans = '&';
1279
	}
1280
	return ans;
1281
}
1282
 
1283
// Get next char, obeying ts.chset.
1284
// Returns -1 if no complete character left before current end of data.
1285
static int
1286
getchar(TokenSource* ts)
1287
{
1288
	uchar*	buf;
1289
	int	c;
1290
	int	n;
1291
	int	ok;
1292
	Rune	r;
1293
 
1294
	if(ts->i >= ts->edata)
1295
		return -1;
1296
	buf = ts->data;
1297
	c = buf[ts->i];
1298
	switch(ts->chset) {
1299
	case ISO_8859_1:
1300
		if(c >= Winstart && c <= Winend)
1301
			c = winchars[c - Winstart];
1302
		ts->i++;
1303
		break;
1304
	case US_Ascii:
1305
		if(c > 127) {
1306
			if(warn)
1307
				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1308
		}
1309
		ts->i++;
1310
		break;
1311
	case UTF_8:
1312
		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1313
		n = chartorune(&r, (char*)(buf+ts->i));
1314
		if(ok) {
1315
			if(warn && c == 0x80)
1316
				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1317
			ts->i += n;
1318
			c = r;
1319
		}
1320
		else {
1321
			// not enough bytes in buf to complete utf-8 char
1322
			ts->i = ts->edata;	// mark "all used"
1323
			c = -1;
1324
		}
1325
		break;
1326
	case Unicode:
1327
		if(ts->i < ts->edata - 1) {
1328
			//standards say most-significant byte first
1329
			c = (c << 8)|(buf[ts->i + 1]);
1330
			ts->i += 2;
1331
		}
1332
		else {
1333
			ts->i = ts->edata;	// mark "all used"
1334
			c = -1;
1335
		}
1336
		break;
1337
	default:
1338
		return -1;
1339
	}
1340
	return c;
1341
}
1342
 
1343
// Assuming c was the last character returned by getchar, set
1344
// things up so that next getchar will get that same character
1345
// followed by the current 'next character', etc.
1346
static void
1347
ungetchar(TokenSource* ts, int c)
1348
{
1349
	int	n;
1350
	Rune	r;
1351
	char	a[UTFmax];
1352
 
1353
	n = 1;
1354
	switch(ts->chset) {
1355
	case UTF_8:
1356
		if(c >= 128) {
1357
			r = c;
1358
			n = runetochar(a, &r);
1359
		}
1360
		break;
1361
	case Unicode:
1362
		n = 2;
1363
		break;
1364
	}
1365
	ts->i -= n;
1366
}
1367
 
1368
// Restore ts so that it is at the state where the index was savei.
1369
static void
1370
backup(TokenSource* ts, int savei)
1371
{
1372
	if(dbglex)
1373
		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1374
	ts->i = savei;
1375
}
1376
 
1377
 
1378
// Look for value associated with attribute attid in token t.
1379
// If there is one, return 1 and put the value in *pans,
1380
// else return 0.
1381
// If xfer is true, transfer ownership of the string to the caller
1382
// (nil it out here); otherwise, caller must duplicate the answer
1383
// if it needs to save it.
1384
// OK to have pans==0, in which case this is just looking
1385
// to see if token is present.
1386
int
1387
_tokaval(Token* t, int attid, Rune** pans, int xfer)
1388
{
1389
	Attr*	attr;
1390
 
1391
	attr = t->attr;
1392
	while(attr != nil) {
1393
		if(attr->attid == attid) {
1394
			if(pans != nil)
1395
				*pans = attr->value;
1396
			if(xfer)
1397
				attr->value = nil;
1398
			return 1;
1399
		}
1400
		attr = attr->next;
1401
	}
1402
	if(pans != nil)
1403
		*pans = nil;
1404
	return 0;
1405
}
1406
 
1407
static int
1408
Tconv(Fmt *f)
1409
{
1410
	Token*	t;
1411
	int	i;
1412
	int	tag;
1413
	char*	srbra;
1414
	Rune*	aname;
1415
	Rune*	tname;
1416
	Attr*	a;
1417
	char	buf[BIGBUFSIZE];
1418
 
1419
	t = va_arg(f->args, Token*);
1420
	if(t == nil)
1421
		sprint(buf, "<null>");
1422
	else {
1423
		i = 0;
1424
		if(dbglex > 1)
1425
			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1426
		tag = t->tag;
1427
		if(tag == Data) {
1428
			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1429
		}
1430
		else {
1431
			srbra = "";
1432
			if(tag >= RBRA) {
1433
				tag -= RBRA;
1434
				srbra = "/";
1435
			}
1436
			tname = tagnames[tag];
1437
			if(tag == Notfound)
1438
				tname = L"?";
1439
			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1440
			for(a = t->attr; a != nil; a = a->next) {
1441
				aname = attrnames[a->attid];
1442
				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1443
				if(a->value != nil)
1444
					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1445
			}
1446
			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1447
		}
1448
		buf[i] = 0;
1449
	}
1450
	return fmtstrcpy(f, buf);
1451
}
1452
 
1453
// Attrs own their constituent strings, but build may eventually
1454
// transfer some values to its items and nil them out in the Attr.
1455
static Attr*
1456
newattr(int attid, Rune* value, Attr* link)
1457
{
1458
	Attr* ans;
1459
 
1460
	ans = (Attr*)emalloc(sizeof(Attr));
1461
	ans->attid = attid;
1462
	ans->value = value;
1463
	ans->next = link;
1464
	return ans;
1465
}
1466
 
1467
// Free list of Attrs linked through next field
1468
static void
1469
freeattrs(Attr* ahead)
1470
{
1471
	Attr* a;
1472
	Attr* nexta;
1473
 
1474
	a = ahead;
1475
	while(a != nil) {
1476
		nexta = a->next;
1477
		free(a->value);
1478
		free(a);
1479
		a = nexta;
1480
	}
1481
}
1482
 
1483
// Free array of Tokens.
1484
// Allocated space might have room for more than n tokens,
1485
// but only n of them are initialized.
1486
// If caller has transferred ownership of constitutent strings
1487
// or attributes, it must have nil'd out the pointers in the Tokens.
1488
void
1489
_freetokens(Token* tarray, int n)
1490
{
1491
	int i;
1492
	Token* t;
1493
 
1494
	if(tarray == nil)
1495
		return;
1496
	for(i = 0; i < n; i++) {
1497
		t = &tarray[i];
1498
		free(t->text);
1499
		freeattrs(t->attr);
1500
	}
1501
	free(tarray);
1502
}