Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/*
2
 * This is a URL parser, written to parse "Common Internet Scheme" URL
3
 * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs 
4
 * are supported, using "server-based" naming authorities in the schemes.
5
 * Support for literal IPv6 addresses is included, per RFC2732.
6
 *
7
 * Current "known" schemes: http, ftp, file.
8
 *
9
 * We can do all the parsing operations without Runes since URLs are
10
 * defined to be composed of US-ASCII printable characters.
11
 * See RFC1738, RFC2396.
12
 */
13
 
14
#include <u.h>
15
#include <libc.h>
16
#include <ctype.h>
17
#include <regexp.h>
18
#include <plumb.h>
19
#include <thread.h>
20
#include <fcall.h>
21
#include <9p.h>
22
#include "dat.h"
23
#include "fns.h"
24
 
25
/* When nonzero, the routines in this file print trace output on fd 2. */
int urldebug;

/*
 * Compile-time switches.
 *
 * RemoveExtraRelDotDots: if nonzero, ".." segments still leading a
 * merged relative path are trimmed off.
 *
 * ExpandCurrentDocUrls: if nonzero, a reference to the current document
 * is rebuilt as a full URL from the base; if zero, it is left as given.
 */
#define RemoveExtraRelDotDots	0
#define ExpandCurrentDocUrls	1
30
 
31
static char*
32
schemestrtab[] =
33
{
34
	nil,
35
	"http",
36
	"https",
37
	"ftp",
38
	"file",
39
};
40
 
41
static int
42
ischeme(char *s)
43
{
44
	int i;
45
 
46
	for(i=0; i<nelem(schemestrtab); i++)
47
		if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
48
			return i;
49
	return USunknown;
50
}
51
 
52
/*
53
 * URI splitting regexp is from RFC2396, Appendix B: 
54
 *		^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
55
 *		 12            3  4          5       6  7        8 9
56
 *
57
 * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
58
 * $2 = scheme			"http"
59
 * $4 = authority		"www.ics.uci.edu"
60
 * $5 = path			"/pub/ietf/uri/"
61
 * $7 = query			<undefined>
62
 * $9 = fragment		"Related"
63
 */
64
 
65
/*
66
 * RFC2396, Sec 3.1, contains:
67
 *
68
 * Scheme names consist of a sequence of characters beginning with a
69
 * lower case letter and followed by any combination of lower case
70
 * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
71
 * resiliency, programs interpreting URI should treat upper case letters
72
 * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
73
 * well as "http").
74
 */
75
 
76
/*
77
 * For server-based naming authorities (RFC2396 Sec 3.2.2):
78
 *    server        = [ [ userinfo "@" ] hostport ]
79
 *    userinfo      = *( unreserved | escaped |
80
 *                      ";" | ":" | "&" | "=" | "+" | "$" | "," )
81
 *    hostport      = host [ ":" port ]
82
 *    host          = hostname | IPv4address
83
 *    hostname      = *( domainlabel "." ) toplabel [ "." ]
84
 *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
85
 *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
86
 *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
87
 *    port          = *digit
88
 *
89
 *  The host is a domain name of a network host, or its IPv4 address as a
90
 *  set of four decimal digit groups separated by ".".  Literal IPv6
91
 *  addresses are not supported.
92
 *
93
 * Note that literal IPv6 address support is outlined in RFC2732:
94
 *    host          = hostname | IPv4address | IPv6reference
95
 *    ipv6reference = "[" IPv6address "]"		(RFC2373)
96
 *
97
 * Since hostnames and numbers will have to be resolved by the OS anyway,
98
 * we don't have to parse them too pedantically (counting '.'s, checking 
99
 * for well-formed literal IP addresses, etc.).
100
 *
101
 * In FTP/file paths, we reject most ";param"s and queries.  In HTTP paths,
102
 * we just pass them through.
103
 *
104
 * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests, 
105
 * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent
106
 * path yields a nil substring match, instead of an empty one.
107
 *
108
 * We're more restrictive than RFC2396 indicates with "userinfo" strings,
109
 * insisting they have the form "[user[:password]]".  This may need to
110
 * change at some point, however.
111
 */
112
 
113
/*
 * Building blocks for the regular expressions in retab.
 *
 * The first group are fragments of character classes: they only make
 * sense when pasted between '[' and ']'.
 */
#define PUNCT			"\\-_.!~*'()"
#define RES			";/?:@&=+$,"
#define ALNUM		"a-zA-Z0-9"
#define HEX			"0-9a-fA-F"
#define UNRES			ALNUM PUNCT

/*
 * The second group are complete sub-expressions.  The _N suffix records
 * how many parenthesized subexpressions the macro expands to, which
 * matters when numbering the groups of an enclosing expression.
 */

/* one %XX escape */
#define ESCAPED_1			"(%[" HEX "][" HEX "])"

/* any URI character: reserved, unreserved or escaped */
#define URIC_2			"([" RES UNRES "]|" ESCAPED_1 ")"

/* as URIC_2, but without '/' */
#define URICNOSLASH_2		"([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"

/* characters allowed in the userinfo part of an authority */
#define USERINFO_2		"([" UNRES ";:&=+$,]|" ESCAPED_1 ")"

/* characters allowed inside one path segment */
#define PCHAR_2			"([" UNRES ":@&=+$,]|" ESCAPED_1 ")"

/* path segment characters plus the '/' and ';' separators */
#define PSEGCHAR_3		"([/;]|" PCHAR_2 ")"
127
 
128
typedef struct Retab Retab;
129
struct Retab
130
{
131
	char	*str;
132
	Reprog	*prog;
133
	int		size;
134
	int		ind[5];
135
};
136
 
137
/*
 * Indices into retab[], one per regular expression, followed by the
 * size used for the Resub arrays handed to the matcher.
 */
enum
{
	/* splits a whole URI into its components */
	REsplit = 0,

	/* validators for the individual components */
	REscheme,
	REunknowndata,
	REauthority,
	REhost,
	REuserinfo,
	REabspath,
	REquery,
	REfragment,

	/* scheme-specific path checks */
	REhttppath,
	REftppath,
	REfilepath,

	/* initurl() aborts if any expression needs more slots than this */
	MaxResub=	20,
};
154
 
155
Retab retab[] =	/* view in constant width Font */
156
{
157
[REsplit]
158
	"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
159
	/* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */
160
	{  2,              4,         5,          7,          9},
161
 
162
[REscheme]
163
	"^[a-z][a-z0-9+-.]*$", nil, 0,
164
	{ 0, },
165
 
166
[REunknowndata]
167
	"^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
168
	{ 0, },
169
 
170
[REauthority]
171
	"^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
172
	/* |----user info-----|  |--------host----------------|  |-port-| */
173
	{  3,                    7,                              11, },
174
 
175
[REhost]
176
	"^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
177
	/* |--regular host--|     |-IPv6 literal-| */
178
	{  2,                     4, },
179
 
180
[REuserinfo]
181
	"^(([^:]*)(:([^:]*))?)$", nil, 0,
182
	/* |user-|  |pass-| */
183
	{  2,       4, },
184
 
185
[REabspath]
186
	"^/" PSEGCHAR_3 "*$", nil, 0,
187
	{ 0, },
188
 
189
[REquery]
190
	"^" URIC_2 "*$", nil, 0,
191
	{ 0, },
192
 
193
[REfragment]
194
	"^" URIC_2 "*$", nil, 0,
195
	{ 0, },
196
 
197
[REhttppath]
198
	"^.*$", nil, 0,
199
	{ 0, },
200
 
201
[REftppath]
202
	"^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
203
	/*|--|-path              |ftptype-| */
204
	{ 1,                     3, }, 
205
 
206
[REfilepath]
207
	"^.*$", nil, 0,
208
	{ 0, },
209
};
210
 
211
/*
 * Count every '(' byte in s.  Escaped parentheses and ones inside
 * character classes are counted too, so for a regular expression the
 * result is an upper bound on its number of subexpressions.
 */
static int
countleftparen(char *s)
{
	char *p;
	int total;

	total = 0;
	p = s;
	while(*p != '\0'){
		if(*p == '(')
			total++;
		p++;
	}
	return total;
}
222
 
223
void
224
initurl(void)
225
{
226
	int i, j;
227
 
228
	for(i=0; i<nelem(retab); i++){
229
		retab[i].prog = regcomp(retab[i].str);
230
		if(retab[i].prog == nil)
231
			sysfatal("recomp(%s): %r", retab[i].str);
232
		retab[i].size = countleftparen(retab[i].str)+1;
233
		for(j=0; j<nelem(retab[i].ind); j++)
234
			if(retab[i].ind[j] >= retab[i].size)
235
				sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
236
					i, j, retab[i].ind[j], retab[i].size);
237
		if(MaxResub < retab[i].size)
238
			sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
239
	}
240
}
241
 
242
/*
 * Result of splitting a URL string.  Each member delimits one piece of
 * the string that was split: s points at its first byte and e just
 * past its last.  spliturl() copies both straight from the regexp
 * matches, so s is nil for a piece that did not take part in the match.
 */
typedef struct SplitUrl SplitUrl;
struct SplitUrl
{
	struct {
		char *s;	/* start of the piece */
		char *e;	/* end of the piece (exclusive) */
	}
		url,		/* the whole match */
		scheme,
		authority,
		path,
		query,
		fragment;
};
250
 
251
/*
252
 * Implements the algorithm in RFC2396 sec 5.2 step 6.
253
 * Returns number of chars written, excluding NUL terminator.
254
 * dest is known to be >= strlen(base)+rel_len.
255
 */
256
static void
257
merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
258
{
259
	char *s, *p, *e, *pdest;
260
 
261
	pdest = dest;
262
 
263
	/* 6a: start with base, discard last segment */
264
	if(base && base[0]){
265
		/* Empty paths don't match in our scheme; 'base' should be nil */
266
		assert(base[0] == '/');
267
		e = strrchr(base, '/');
268
		e++;
269
		memmove(pdest, base, e-base);
270
		pdest += e-base;
271
	}else{
272
		/* Artistic license on my part */
273
		*pdest++ = '/';
274
	}
275
 
276
	/* 6b: append relative component */
277
	if(rel_st){
278
		memmove(pdest, rel_st, rel_len);
279
		pdest += rel_len;
280
	}
281
 
282
	/* 6c: remove any occurrences of "./" as a complete segment */
283
	s = dest;
284
	*pdest = '\0';
285
	while(e = strstr(s, "./")){
286
		if((e == dest) || (*(e-1) == '/')){
287
 			memmove(e, e+2, pdest+1-(e+2));	/* +1 for NUL */
288
			pdest -= 2;
289
		}else
290
			s = e+1;
291
	}
292
 
293
	/* 6d: remove a trailing "." as a complete segment */
294
	if(pdest>dest && *(pdest-1)=='.' && 
295
	  (pdest==dest+1 || *(pdest-2)=='/'))
296
		*--pdest = '\0';
297
 
298
	/* 6e: remove occurences of "seg/../", where seg != "..", left->right */
299
	s = dest+1;
300
	while(e = strstr(s, "/../")){
301
		p = e - 1;
302
		while(p >= dest && *p != '/')
303
			p--;
304
		if(memcmp(p, "/../", 4) != 0){
305
			memmove(p+1, e+4, pdest+1-(e+4));
306
			pdest -= (e+4) - (p+1);
307
		}else
308
			s = e+1;
309
	}
310
 
311
	/* 6f: remove a trailing "seg/..", where seg isn't ".."  */
312
	if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
313
		p = pdest-3 - 1;
314
		while(p >= dest && *p != '/')
315
			p--;
316
		if(memcmp(p, "/../", 4) != 0){
317
			pdest = p+1;
318
			*pdest = '\0';
319
		}
320
	}
321
 
322
	/* 6g: leading ".." segments are errors -- we'll just blat them out. */
323
	if(RemoveExtraRelDotDots){
324
		p = dest;
325
		if (p[0] == '/')
326
			p++;
327
		s = p;
328
		while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
329
			s += 3;
330
		if(s > p){
331
			memmove(p, s, pdest+1-s);
332
			pdest -= s-p;
333
		}
334
	}
335
	USED(pdest);
336
 
337
	if(urldebug)
338
		fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, 
339
			rel_st, dest);
340
}
341
 
342
/*
343
 * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
344
 *
345
 * If successful, this just ends up freeing and replacing "u->url".
346
 */
347
static int
348
resolve_relative(SplitUrl *su, Url *base, Url *u)
349
{
350
	char *url, *path;
351
	char *purl, *ppath;
352
	int currentdoc, ulen, plen;
353
 
354
	if(base == nil){
355
		werrstr("relative URI given without base");
356
		return -1;
357
	}
358
	if(base->scheme == nil){
359
		werrstr("relative URI given with no scheme");
360
		return -1;
361
	}
362
	if(base->ischeme == USunknown){
363
		werrstr("relative URI given with unknown scheme");
364
		return -1;
365
	}
366
	if(base->ischeme == UScurrent){
367
		werrstr("relative URI given with incomplete base");
368
		return -1;
369
	}
370
	assert(su->scheme.s == nil);
371
 
372
	/* Sec 5.2 step 2 */
373
	currentdoc = 0;
374
	if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
375
		/* Reference is to current document */
376
		if(urldebug)
377
			fprint(2, "url %s is relative to current document\n", u->url);
378
		u->ischeme = UScurrent;
379
		if(!ExpandCurrentDocUrls)
380
			return 0;
381
		currentdoc = 1;
382
	}
383
 
384
	/* Over-estimate the maximum lengths, for allocation purposes */
385
	/* (constants are for separators) */
386
	plen = 1;
387
	if(base->path)
388
		plen += strlen(base->path);
389
	if(su->path.s)
390
		plen += 1 + (su->path.e - su->path.s);
391
 
392
	ulen = 0;
393
	ulen += strlen(base->scheme) + 1;
394
	if(su->authority.s)
395
		ulen += 2 + (su->authority.e - su->authority.s);
396
	else
397
		ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
398
	ulen += plen;
399
	if(su->query.s)
400
		ulen += 1 + (su->query.e - su->query.s);
401
	else if(currentdoc && base->query)
402
		ulen += 1 + strlen(base->query);
403
	if(su->fragment.s)
404
		ulen += 1 + (su->fragment.e - su->fragment.s);
405
	else if(currentdoc && base->fragment)
406
		ulen += 1 + strlen(base->fragment);
407
	url = emalloc(ulen+1);
408
	path = emalloc(plen+1);
409
 
410
	url[0] = '\0';
411
	purl = url;
412
	path[0] = '\0';
413
	ppath = path;
414
 
415
	if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
416
		/* Is a "network-path" or "absolute-path"; don't merge with base path */
417
		/* Sec 5.2 steps 4,5 */
418
		if(su->path.s){
419
			memmove(ppath, su->path.s, su->path.e - su->path.s);
420
			ppath += su->path.e - su->path.s;
421
			*ppath = '\0';
422
		}
423
	}else if(currentdoc){
424
		/* Is a current-doc reference; just copy the path from the base URL */
425
		if(base->path){
426
			strcpy(ppath, base->path);
427
			ppath += strlen(ppath);
428
		}
429
		USED(ppath);
430
	}else{
431
		/* Is a relative-path reference; we have to merge it */
432
		/* Sec 5.2 step 6 */
433
		merge_relative_path(base->path,
434
			su->path.s, su->path.e - su->path.s, ppath);
435
	}
436
 
437
	/* Build new URL from pieces, inheriting from base where needed */
438
	strcpy(purl, base->scheme);
439
	purl += strlen(purl);
440
	*purl++ = ':';
441
	if(su->authority.s){
442
		strcpy(purl, "//");
443
		purl += strlen(purl);
444
		memmove(purl, su->authority.s, su->authority.e - su->authority.s);
445
		purl += su->authority.e - su->authority.s;
446
	}else if(base->authority){
447
		strcpy(purl, "//");
448
		purl += strlen(purl);
449
		strcpy(purl, base->authority);
450
		purl += strlen(purl);
451
	}
452
	assert((path[0] == '\0') || (path[0] == '/'));
453
	strcpy(purl, path);
454
	purl += strlen(purl);
455
 
456
	/*
457
	 * The query and fragment are not inherited from the base,
458
	 * except in case of "current document" URLs, which inherit any query
459
	 * and may inherit the fragment.
460
	 */
461
	if(su->query.s){
462
		*purl++ = '?';
463
		memmove(purl, su->query.s, su->query.e - su->query.s);
464
		purl += su->query.e - su->query.s;
465
	}else if(currentdoc && base->query){
466
		*purl++ = '?';
467
		strcpy(purl, base->query);
468
		purl += strlen(purl);
469
	}
470
 
471
	if(su->fragment.s){
472
		*purl++ = '#';
473
		memmove(purl, su->query.s, su->query.e - su->query.s);
474
		purl += su->fragment.e - su->fragment.s;
475
	}else if(currentdoc && base->fragment){
476
		*purl++ = '#';
477
		strcpy(purl, base->fragment);
478
		purl += strlen(purl);
479
	}
480
	USED(purl);
481
 
482
	if(urldebug)
483
		fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
484
	free(u->url);
485
	u->url = url;
486
	free(path);
487
	return 0;
488
}
489
 
490
int
491
regx(Reprog *prog, char *s, Resub *m, int nm)
492
{
493
	int i;
494
 
495
	if(s == nil)
496
		s = m[0].sp;	/* why is this necessary? */
497
 
498
	i = regexec(prog, s, m, nm);
499
/*
500
	if(i >= 0)
501
		for(j=0; j<nm; j++)
502
			fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
503
*/
504
	return i;
505
}
506
 
507
static int
508
ismatch(int i, char *s, char *desc)
509
{
510
	Resub m[1];
511
 
512
	m[0].sp = m[0].ep = nil;
513
	if(!regx(retab[i].prog, s, m, 1)){
514
		werrstr("malformed %s: %q", desc, s);
515
		return 0;
516
	}
517
	return 1;
518
}
519
 
520
static int
521
spliturl(char *url, SplitUrl *su)
522
{
523
	Resub m[MaxResub];
524
	Retab *t;
525
 
526
	/*
527
	 * Newlines are not valid in a URI, but regexp(2) treats them specially 
528
	 * so it's best to make sure there are none before proceeding.
529
	 */
530
	if(strchr(url, '\n')){
531
		werrstr("newline in URI");
532
		return -1;
533
	}
534
 
535
	/*
536
	 * Because we use NUL-terminated strings, as do many client and server
537
	 * implementations, an escaped NUL ("%00") will quite likely cause problems
538
	 * when unescaped.  We can check for such a sequence once before examining
539
 	 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
540
	 * in URIs to _always_ indicate escape sequences.  Something like "%2500"
541
	 * will still get by, but that's legitimate, and if it ends up causing
542
	 * a NUL then someone is unescaping too many times.
543
	 */
544
	if(strstr(url, "%00")){
545
		werrstr("escaped NUL in URI");
546
		return -1;
547
	}
548
 
549
	m[0].sp = m[0].ep = nil;
550
	t = &retab[REsplit];
551
	if(!regx(t->prog, url, m, t->size)){
552
		werrstr("malformed URI: %q", url);
553
		return -1;
554
	}
555
 
556
	su->url.s = m[0].sp;
557
	su->url.e = m[0].ep;
558
	su->scheme.s = m[t->ind[0]].sp;
559
	su->scheme.e = m[t->ind[0]].ep;
560
	su->authority.s = m[t->ind[1]].sp;
561
	su->authority.e = m[t->ind[1]].ep;
562
	su->path.s = m[t->ind[2]].sp;
563
	su->path.e = m[t->ind[2]].ep;
564
	su->query.s = m[t->ind[3]].sp;
565
	su->query.e = m[t->ind[3]].ep;
566
	su->fragment.s = m[t->ind[4]].sp;
567
	su->fragment.e = m[t->ind[4]].ep;
568
 
569
	if(urldebug)
570
		fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
571
			url,
572
			su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
573
			su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
574
			su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
575
			su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
576
			su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
577
			su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
578
 
579
	return 0;
580
}
581
 
582
static int
583
parse_scheme(SplitUrl *su, Url *u)
584
{
585
	if(su->scheme.s == nil){
586
		werrstr("missing scheme");
587
		return -1;
588
	}
589
	u->scheme = estredup(su->scheme.s, su->scheme.e);
590
	strlower(u->scheme);
591
 
592
	if(!ismatch(REscheme, u->scheme, "scheme"))
593
		return -1;
594
 
595
	u->ischeme = ischeme(u->scheme);
596
	if(urldebug)
597
		fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
598
	return 0;
599
}
600
 
601
static int
602
parse_unknown_part(SplitUrl *su, Url *u)
603
{
604
	char *s, *e;
605
 
606
	assert(u->ischeme == USunknown);
607
	assert(su->scheme.e[0] == ':');
608
 
609
	s = su->scheme.e+1;
610
	if(su->fragment.s){
611
		e = su->fragment.s-1;
612
		assert(*e == '#');
613
	}else
614
		e = s+strlen(s);
615
 
616
	u->schemedata = estredup(s, e);
617
	if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
618
		return -1;
619
	return 0;
620
}
621
 
622
static int
623
parse_userinfo(char *s, char *e, Url *u)
624
{
625
	Resub m[MaxResub];
626
	Retab *t;
627
 
628
	m[0].sp = s;
629
	m[0].ep = e;
630
	t = &retab[REuserinfo];
631
	if(!regx(t->prog, nil, m, t->size)){
632
		werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
633
		return -1;
634
	}
635
	if(m[t->ind[0]].sp)
636
		u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
637
	if(m[t->ind[1]].sp)
638
		u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
639
	return 0;
640
}
641
 
642
static int
643
parse_host(char *s, char *e, Url *u)
644
{
645
	Resub m[MaxResub];
646
	Retab *t;
647
 
648
	m[0].sp = s;
649
	m[0].ep = e;
650
	t = &retab[REhost];
651
	if(!regx(t->prog, nil, m, t->size)){
652
		werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
653
		return -1;
654
	}
655
 
656
	assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
657
 
658
	if(m[t->ind[0]].sp)	/* regular */
659
		u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
660
	else
661
		u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
662
	return 0;
663
}
664
 
665
static int
666
parse_authority(SplitUrl *su, Url *u)
667
{
668
	Resub m[MaxResub];
669
	Retab *t;
670
	char *host;
671
	char *userinfo;
672
 
673
	if(su->authority.s == nil)
674
		return 0;
675
 
676
	u->authority = estredup(su->authority.s, su->authority.e);
677
	m[0].sp = m[0].ep = nil;
678
	t = &retab[REauthority];
679
	if(!regx(t->prog, u->authority, m, t->size)){
680
		werrstr("malformed authority: %q", u->authority);
681
		return -1;
682
	}
683
 
684
	if(m[t->ind[0]].sp)
685
		if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
686
			return -1;
687
	if(m[t->ind[1]].sp)
688
		if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
689
			return -1;
690
	if(m[t->ind[2]].sp)
691
		u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
692
 
693
 
694
	if(urldebug > 0){
695
		userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 
696
		host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
697
		fprint(2, "port: %q, authority %q\n", u->port, u->authority);
698
		fprint(2, "host %q, userinfo %q\n", host, userinfo);
699
		free(host);
700
		free(userinfo);
701
	}
702
	return 0;
703
}
704
 
705
static int
706
parse_abspath(SplitUrl *su, Url *u)
707
{
708
	if(su->path.s == nil)
709
		return 0;
710
	u->path = estredup(su->path.s, su->path.e);
711
	if(!ismatch(REabspath, u->path, "absolute path"))
712
		return -1;
713
	return 0;
714
}
715
 
716
static int
717
parse_query(SplitUrl *su, Url *u)
718
{
719
	if(su->query.s == nil)
720
		return 0;
721
	u->query = estredup(su->query.s, su->query.e);
722
	if(!ismatch(REquery, u->query, "query"))
723
		return -1;
724
	return 0;
725
}
726
 
727
static int
728
parse_fragment(SplitUrl *su, Url *u)
729
{
730
	if(su->fragment.s == nil)
731
		return 0;
732
	u->fragment = estredup(su->fragment.s, su->fragment.e);
733
	if(!ismatch(REfragment, u->fragment, "fragment"))
734
		return -1;
735
	return 0;
736
}
737
 
738
static int
739
postparse_http(Url *u)
740
{
741
	u->open = httpopen;
742
	u->read = httpread;
743
	u->close = httpclose;
744
 
745
	if(u->authority==nil){
746
		werrstr("missing authority (hostname, port, etc.)");
747
		return -1;
748
	}
749
	if(u->host == nil){
750
		werrstr("missing host specification");
751
		return -1;
752
	}
753
 
754
	if(u->path == nil){
755
		u->http.page_spec = estrdup("/");
756
		return 0;
757
	}
758
 
759
	if(!ismatch(REhttppath, u->path, "http path"))
760
		return -1;
761
	if(u->query){
762
		u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
763
		strcpy(u->http.page_spec, u->path);
764
		strcat(u->http.page_spec, "?");
765
		strcat(u->http.page_spec, u->query);
766
	}else
767
		u->http.page_spec = estrdup(u->path);
768
 
769
	return 0;
770
}
771
 
772
static int
773
postparse_ftp(Url *u)
774
{
775
	Resub m[MaxResub];
776
	Retab *t;
777
 
778
	if(u->authority==nil){
779
		werrstr("missing authority (hostname, port, etc.)");
780
		return -1;
781
	}
782
	if(u->query){
783
		werrstr("unexpected \"?query\" in ftp path");
784
		return -1;
785
	}
786
	if(u->host == nil){
787
		werrstr("missing host specification");
788
		return -1;
789
	}
790
 
791
	if(u->path == nil){
792
		u->ftp.path_spec = estrdup("/");
793
		return 0;
794
	}
795
 
796
	m[0].sp = m[0].ep = nil;
797
	t = &retab[REftppath];
798
	if(!regx(t->prog, u->path, m, t->size)){
799
		werrstr("malformed ftp path: %q", u->path);
800
		return -1;
801
	}
802
 
803
	if(m[t->ind[0]].sp){
804
		u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
805
		if(strchr(u->ftp.path_spec, ';')){
806
			werrstr("unexpected \";param\" in ftp path");
807
			return -1;
808
		}
809
	}else
810
		u->ftp.path_spec = estrdup("/");
811
 
812
	if(m[t->ind[1]].sp){
813
		u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
814
		strlower(u->ftp.type);
815
	}
816
	return 0;
817
}
818
 
819
static int
820
postparse_file(Url *u)
821
{
822
	if(u->user || u->passwd){
823
		werrstr("user information not valid with file scheme");
824
		return -1;
825
	}
826
	if(u->query){
827
		werrstr("unexpected \"?query\" in file path");
828
		return -1;
829
	}
830
	if(u->port){
831
		werrstr("port not valid with file scheme");
832
		return -1;
833
	}
834
	if(u->path == nil){
835
		werrstr("missing path in file scheme");
836
		return -1;
837
	}
838
	if(strchr(u->path, ';')){
839
		werrstr("unexpected \";param\" in file path");
840
		return -1;
841
	}
842
 
843
	if(!ismatch(REfilepath, u->path, "file path"))
844
		return -1;
845
 
846
	/* "localhost" is equivalent to no host spec, we'll chose the latter */
847
	if(u->host && cistrcmp(u->host, "localhost") == 0){
848
		free(u->host);
849
		u->host = nil;
850
	}
851
	return 0;
852
}
853
 
854
/*
 * Scheme-specific finishing routines, indexed by u->ischeme.  The
 * order follows schemestrtab; a nil entry means nothing more to do.
 */
static int (*postparse[])(Url*) = {
	nil,			/* slot 0: no scheme name */
	postparse_http,		/* "http" */
	postparse_http,		/* "https" is finished the same way */
	postparse_ftp,		/* "ftp" */
	postparse_file,		/* "file" */
};
861
 
862
/*
 * Parse the string url into a newly allocated Url.  A url without a
 * scheme is treated as relative and is first resolved against base.
 * Returns the new Url, or nil after freeing it if any step fails (the
 * failing step has set the error string).
 *
 * Two kinds of URL return early, with only part of the structure
 * filled in: a reference to the current document (fragment only) and
 * a URL with an unknown scheme (scheme, fragment and schemedata).
 */
Url*
parseurl(char *url, Url *base)
{
	Url *u;
	SplitUrl su;

	if(urldebug)
		fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");

	u = emalloc(sizeof(Url));
	u->url = estrdup(url);
	if(spliturl(u->url, &su) < 0)
		goto Fail;

	/* RFC2396 sec 3.1: a relative URI is one with no scheme */
	if(su.scheme.s == nil){
		if(urldebug)
			fprint(2, "parseurl has nil scheme\n");
		if(resolve_relative(&su, base, u) < 0)
			goto Fail;
		/* u->url may have been replaced, so split it afresh */
		if(spliturl(u->url, &su) < 0)
			goto Fail;
		if(u->ischeme == UScurrent){
			/* 'u.url' refers to current document; set fragment and return */
			if(parse_fragment(&su, u) < 0)
				goto Fail;
			return u;
		}
	}

	if(parse_scheme(&su, u) < 0)
		goto Fail;
	if(parse_fragment(&su, u) < 0)
		goto Fail;

	if(u->ischeme == USunknown){
		if(parse_unknown_part(&su, u) < 0)
			goto Fail;
		return u;
	}

	if(parse_query(&su, u) < 0)
		goto Fail;
	if(parse_authority(&su, u) < 0)
		goto Fail;
	if(parse_abspath(&su, u) < 0)
		goto Fail;

	if(u->ischeme < nelem(postparse) && postparse[u->ischeme] != nil)
		if((*postparse[u->ischeme])(u) < 0)
			goto Fail;

	setmalloctag(u, getcallerpc(&url));
	return u;

Fail:
	freeurl(u);
	return nil;
}
915
 
916
void
917
freeurl(Url *u)
918
{
919
	if(u == nil)
920
		return;
921
	free(u->url);
922
	free(u->scheme);
923
	free(u->schemedata);
924
	free(u->authority);
925
	free(u->user);
926
	free(u->passwd);
927
	free(u->host);
928
	free(u->port);
929
	free(u->path);
930
	free(u->query);
931
	free(u->fragment);
932
	switch(u->ischeme){
933
	case UShttp:
934
		free(u->http.page_spec);
935
		break;
936
	case USftp:
937
		free(u->ftp.path_spec);
938
		free(u->ftp.type);
939
		break;
940
	}
941
	free(u);
942
}
943
 
944
void
945
rewriteurl(Url *u)
946
{
947
	char *s;
948
 
949
	if(u->schemedata)
950
		s = estrmanydup(u->scheme, ":", u->schemedata, nil);
951
	else
952
		s = estrmanydup(u->scheme, "://", 
953
			u->user ? u->user : "",
954
			u->passwd ? ":" : "", u->passwd ? u->passwd : "",
955
			u->user ? "@" : "", u->host ? u->host : "", 
956
			u->port ? ":" : "", u->port ? u->port : "",
957
			u->path,
958
			u->query ? "?" : "", u->query ? u->query : "",
959
			u->fragment ? "#" : "", u->fragment ? u->fragment : "",
960
			nil);
961
	free(u->url);
962
	u->url = s;
963
}
964
 
965
int
966
seturlquery(Url *u, char *query)
967
{
968
	if(query == nil){
969
		free(u->query);
970
		u->query = nil;
971
		return 0;
972
	}
973
 
974
	if(!ismatch(REquery, query, "query"))
975
		return -1;
976
 
977
	free(u->query);
978
	u->query = estrdup(query);
979
	return 0;
980
}
981
 
982
static void
983
dupp(char **p)
984
{
985
	if(*p)
986
		*p = estrdup(*p);
987
}
988
 
989
/*
 * Return a deep copy of u: the structure is copied wholesale and then
 * every string it owns is duplicated, so the copy can be freed
 * independently of the original.
 * NOTE(review): scheme-specific strings are duplicated only for UShttp
 * and USftp; a URL in the "https" table slot would share page_spec
 * with the original.  This mirrors freeurl and must change together
 * with it -- confirm against dat.h.
 */
Url*
copyurl(Url *u)
{
	Url *v;

	v = emalloc(sizeof(Url));
	*v = *u;	/* copies non-string members; string pointers are replaced below */

	/* components shared by all schemes */
	dupp(&v->url);
	dupp(&v->scheme);
	dupp(&v->schemedata);
	dupp(&v->authority);
	dupp(&v->user);
	dupp(&v->passwd);
	dupp(&v->host);
	dupp(&v->port);
	dupp(&v->path);
	dupp(&v->query);
	dupp(&v->fragment);

	/* scheme-specific extras */
	if(v->ischeme == UShttp)
		dupp(&v->http.page_spec);
	else if(v->ischeme == USftp){
		dupp(&v->ftp.path_spec);
		dupp(&v->ftp.type);
	}
	return v;
}
1019
 
1020
/*
 * Value of the hexadecimal digit c (either case), 0..15.
 * Anything that is not a hex digit yields 0.
 */
static int
dhex(char c)
{
	int v;

	v = 0;
	if(c >= 'A' && c <= 'F')
		v = 10 + (c - 'A');
	else if(c >= 'a' && c <= 'f')
		v = 10 + (c - 'a');
	else if(c >= '0' && c <= '9')
		v = c - '0';
	return v;
}
1031
 
1032
char*
1033
escapeurl(char *s, int (*needesc)(int))
1034
{
1035
	int n;
1036
	char *t, *u;
1037
	Rune r;
1038
	static char *hex = "0123456789abcdef";
1039
 
1040
	n = 0;
1041
	for(t=s; *t; t++)
1042
		if((*needesc)(*t))
1043
			n++;
1044
 
1045
	u = emalloc(strlen(s)+2*n+1);
1046
	t = u;
1047
	for(; *s; s++){
1048
		s += chartorune(&r, s);
1049
		if(r >= 0xFF){
1050
			werrstr("URLs cannot contain Runes > 0xFF");
1051
			free(t);
1052
			return nil;
1053
		}
1054
		if((*needesc)(r)){
1055
			*u++ = '%';
1056
			*u++ = hex[(r>>4)&0xF];
1057
			*u++ = hex[r&0xF];
1058
		}else
1059
			*u++ = r;
1060
	}
1061
	*u = '\0';
1062
	return t;
1063
}
1064
 
1065
char*
1066
unescapeurl(char *s)
1067
{
1068
	char *r, *w;
1069
	Rune rune;
1070
 
1071
	s = estrdup(s);
1072
	for(r=w=s; *r; r++){
1073
		if(*r=='%'){
1074
			r++;
1075
			if(!isxdigit(r[0]) || !isxdigit(r[1])){
1076
				werrstr("bad escape sequence '%.3s' in URL", r);
1077
				return nil;
1078
			}
1079
			if(r[0]=='0' && r[2]=='0'){
1080
				werrstr("escaped NUL in URL");
1081
				return nil;
1082
			}
1083
			rune = (dhex(r[0])<<4)|dhex(r[1]);	/* latin1 */
1084
			w += runetochar(w, &rune);
1085
			r += 2;
1086
		}else
1087
			*w++ = *r;
1088
	}
1089
	*w = '\0';
1090
	return s;
1091
}
1092