Subversion Repositories SE.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
12 7u83 1
%% @author Bob Ippolito <bob@mochimedia.com>
2
%% @copyright 2007 Mochi Media, Inc.
3
%%
4
%% Permission is hereby granted, free of charge, to any person obtaining a
5
%% copy of this software and associated documentation files (the "Software"),
6
%% to deal in the Software without restriction, including without limitation
7
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
%% and/or sell copies of the Software, and to permit persons to whom the
9
%% Software is furnished to do so, subject to the following conditions:
10
%%
11
%% The above copyright notice and this permission notice shall be included in
12
%% all copies or substantial portions of the Software.
13
%%
14
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20
%% DEALINGS IN THE SOFTWARE.
21
 
22
%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
-endif.

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\"). %% $\"
-define(SQUOTE, $\'). %% $\'

%% Advance the decoder N columns/bytes on the same line.
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
%% Advance the decoder one column/byte on the same line.
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
%% Advance the decoder to the start of the next line.
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
%% Advance the decoder past character C, tracking line breaks.
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).

-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
-define(IS_LETTER(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z))).
%% Characters that may appear unquoted inside a DOCTYPE declaration.
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

%% Decoder position state: 1-based line/column plus 0-based byte offset
%% into the input binary.
-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}
77
 
78
%% External API.
79
 
80
%% @spec parse(string() | binary()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).
84
 
85
%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions
    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.
92
 
93
%% Scan forward to the first non-singleton start tag, tracking whether an
%% HTML5 doctype was seen so a missing <html> wrapper can be synthesized.
find_document(Tokens=[{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
    maybe_add_html_tag(Tokens, Mode);
find_document([{doctype, [<<"html">>]} | Rest], _Mode) ->
    find_document(Rest, html5);
find_document([_T | Rest], Mode) ->
    find_document(Rest, Mode);
find_document([], _Mode) ->
    [].
101
 
102
%% In html5 mode, prepend a synthetic <html> start tag when the document's
%% first element is something else; otherwise return the tokens unchanged.
maybe_add_html_tag(Tokens=[{start_tag, Tag, _Attrs, false} | _], html5)
  when Tag =/= <<"html">> ->
    [{start_tag, <<"html">>, [], false} | Tokens];
maybe_add_html_tag(Tokens, _Mode) ->
    Tokens.
107
 
108
%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).
112
 
113
%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    case is_singleton(Tag) of
        true ->
            %% Singleton tags never emit children or an end_tag.
            to_tokens([], [{start_tag, Tag, Attrs, true}]);
        false ->
            to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, false}])
    end.
134
 
135
%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to a HTML document.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).
141
 
142
%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []);
escape(S) when is_list(S) ->
    escape(S, []).
150
 
151
%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;).
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    %% mochinum gives the shortest decimal representation of the float.
    escape_attr(mochinum:digits(F), []).
164
 
165
%% Render a token stream to an iolist accumulator (reversed, flattened at
%% the end).  One clause per token kind.
to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    %% Inline HTML passes through unescaped.
    to_html(Rest, [Content | Acc]);
to_html([{pi, Bin} | Rest], Acc) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    %% BUGFIX: previously the outer Acc was used to seed the doctype
    %% accumulator, which would duplicate any already-rendered output
    %% inside the <!DOCTYPE ...> when the doctype token was not first.
    Inside = doctype_to_html(Parts, []),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).
198
 
199
%% Render the word list of a doctype declaration; words containing only
%% literal-safe characters stay bare, anything else is quoted and escaped.
doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.
209
 
210
%% Render a [{Key, Value}] attribute list as ` key="value"` iolist parts.
attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).
216
 
217
%% Character-at-a-time HTML escaping over a char list with a reversed
%% accumulator; lists:reverse/2 prepends the entity reversed in one pass.
escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape("<" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape(">" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape("&" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).
227
 
228
%% Like escape/2 but additionally escapes double quotes for attribute
%% value context.
escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr("<" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr(">" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr("&" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).
240
 
241
%% Normalize a tag given as an atom, list or binary to a lowercase binary.
to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).
245
 
246
%% Worklist-driven flattening of the permissive html_node() tuple forms
%% into a token list.  The worklist holds {Tag, RemainingChildren} frames;
%% an exhausted frame emits the end_tag.
to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            %% Singleton children contribute only a start_tag.
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
297
 
298
%% Main tokenizer loop: pull one token at a time until the offset reaches
%% the end of the binary.  script/textarea start tags switch to raw-text
%% tokenization for their body so markup inside them is not parsed.
tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.
315
 
316
%% Classify a token: non-singleton <script>/<textarea> start tags require
%% raw-text body tokenization; everything else is handled normally.
parse_flag({start_tag, B, _, false}) ->
    case string:to_lower(binary_to_list(B)) of
        "script" ->
            script;
        "textarea" ->
            textarea;
        _ ->
            none
    end;
parse_flag(_) ->
    none.
327
 
328
%% Dispatch on the next characters at the current offset to produce one
%% token.  Clause order matters: longer/more specific prefixes first.
tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!doctype", _/binary>> ->
            %% NOTE(review): "<!doctype" is 9 chars but 10 columns are
            %% consumed, eating the character after it (normally a space);
            %% kept as-is since tokenize_doctype skips whitespace anyway.
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            %% PHP blocks are kept raw up to the closing "?>".
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            %% Generic processing instruction with attributes.
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>>
                when ?IS_WHITESPACE(C); not ?IS_LETTER(C) ->
            %% This isn't really strict HTML: a bare "<" not starting a
            %% tag is treated as character data.
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.
366
 
367
%% Coalesce a run of consecutive data tokens into one binary, tracking
%% whether the whole run was whitespace-only.
tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
tree_data(Rest, AllWhitespace, Acc) ->
    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.
371
 
372
%% Fold the token stream into a tree using an explicit stack of open
%% elements.  Returns {Tree, RemainingTokens}.
tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            %% Stack fully unwound: the document element was closed.
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            %% Whitespace-only runs between elements are dropped.
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    tree(Rest, Stack).
400
 
401
%% Normalize a tag (and optionally its attribute list) to the internal
%% {Binary, Attrs, Children} node form with lowercase names.
norm({Tag, Attrs}) ->
    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).
407
 
408
%% Push a new open element, implicitly closing a same-named sibling for
%% li/option, and closing dd/dt when followed by either dd or dt.
stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].
417
 
418
%% Attach a completed child node to the element on top of the stack.
append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
    [{Name, Attrs, [StartTag | Acc]} | Stack].
420
 
421
%% Close the named element: unwind the open-element stack down to (and
%% including) the matching tag, folding closed elements into their parent.
%% Returns the new stack, or a finished tree if the root was closed.
destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    %% Predicate is false exactly at the frame whose tag matches TagName.
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            %% Reparent the children accumulated after the
                            %% mis-closed singleton into that singleton.
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.
460
 
461
%% Collapse the whole remaining stack into a single tree, reversing each
%% child list back into document order on the way down.
destack([{Tag, Attrs, Acc}]) ->
    {Tag, Attrs, lists:reverse(Acc)};
destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
465
 
466
%% True for HTML void (singleton) elements, which never take children or
%% an end tag.  Tag must already be a lowercase binary.
is_singleton(<<"area">>) -> true;
is_singleton(<<"base">>) -> true;
is_singleton(<<"br">>) -> true;
is_singleton(<<"col">>) -> true;
is_singleton(<<"embed">>) -> true;
is_singleton(<<"hr">>) -> true;
is_singleton(<<"img">>) -> true;
is_singleton(<<"input">>) -> true;
is_singleton(<<"keygen">>) -> true;
is_singleton(<<"link">>) -> true;
is_singleton(<<"meta">>) -> true;
is_singleton(<<"param">>) -> true;
is_singleton(<<"source">>) -> true;
is_singleton(<<"track">>) -> true;
is_singleton(<<"wbr">>) -> true;
is_singleton(_) -> false.
482
 
483
%% Consume character data up to the next "<" or "&", tracking whether it
%% was all whitespace.  Returns the slice as a sub-binary (no copy).
tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.
496
 
497
%% Collect {Name, Value} attribute pairs until the tag terminator
%% (">", "/", "?>") or end of input.
tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            %% Processing-instruction close.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.
515
 
516
%% Parse the optional "=value" part of an attribute; a bare attribute
%% (no "=") gets its own name as value, matching browser behavior.
tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            {Attr, S1}
    end.
526
 
527
%% Dispatch on the first value character: quoted (single or double) vs
%% unquoted attribute value.
tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            {[], S};
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.
537
 
538
%% Accumulate a quoted attribute value up to the matching quote Q,
%% expanding character references along the way.
tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            %% Unterminated quote: return what we have.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data | Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C | Acc], Q)
    end.
550
 
551
%% Accumulate an unquoted attribute value up to "/>", ">" or whitespace,
%% expanding character references along the way.
tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data | Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            %% Leave "/>" for find_gt so the singleton flag is detected.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C | Acc])
    end.
565
 
566
%% Advance the decoder past any run of whitespace characters.
skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.
573
 
574
%% Read a tag or attribute name.  A leading ">", "/" or "=" is returned
%% as a one-character literal so the caller always makes progress.
tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                    orelse C =:= $/
                                    orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.
585
 
586
%% Accumulate literal characters (expanding charrefs) until whitespace or
%% one of ">", "/", "=";  the result is lowercased.
tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.
599
 
600
%% Consume raw content up to (and past) a "?>" terminator; at end of
%% input, return everything remaining.
raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.
615
 
616
%% Advance past a processing-instruction close: "?>", ">" or "/>".
%% Intermediate characters are consumed by tokenize_attributes before
%% this is called, so only the terminators (or EOF) are expected here.
find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
            ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
            ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.
630
 
631
%% Advance to just past the next ">", remembering whether a "/" was seen
%% on the way (used to mark the tag as a self-closed singleton).
find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            {S, HasSlash}
    end.
645
 
646
%% Decode a character reference starting just after "&".  Surrogate code
%% points are combined with a following reference into one UTF-8 char.
%% Any malformed reference falls back to literal "&" with no advance.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        case tokenize_charref_raw(Bin, S, O) of
            {C1, S1} when C1 >= 16#D800 andalso C1 =< 16#DFFF ->
                %% Surrogate pair
                tokeninize_charref_surrogate_pair(Bin, S1, C1);
            {Unichar, S1} when is_integer(Unichar) ->
                {{data, mochiutf8:codepoint_to_bytes(Unichar), false},
                 S1};
            {Unichars, S1} when is_list(Unichars) ->
                {{data, unicode:characters_to_binary(Unichars), false},
                 S1};
            {undefined, _} ->
                throw(invalid_charref)
        end
    catch
        throw:invalid_charref ->
            {{data, <<"&">>, false}, S}
    end.
665
 
666
%% Combine a high surrogate C1 with the immediately following "&...;"
%% reference into a single UTF-8 encoded character.  (Function name typo
%% "tokeninize" kept: it is referenced by tokenize_charref.)
tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
                %% BUGFIX: the upper bound previously re-checked C1
                %% (already known to be a surrogate) instead of C2,
                %% accepting any C2 >= 16#D800 — even values far above
                %% the surrogate range.
                {C2, S1} when C2 >= 16#D800 andalso C2 =< 16#DFFF ->
                    {{data,
                      unicode:characters_to_binary(
                        <<C1:16, C2:16>>,
                        utf16,
                        utf8),
                      false},
                     S1};
                _ ->
                    throw(invalid_charref)
            end;
        _ ->
            throw(invalid_charref)
    end.
684
 
685
%% Scan the name of a character reference up to ";" and resolve it via
%% mochiweb_charref.  Throws invalid_charref on EOF or any delimiter
%% that cannot occur inside a reference.
tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            throw(invalid_charref);
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $> ->
            throw(invalid_charref);
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {mochiweb_charref:charref(Raw), ?INC_COL(S)};
        _ ->
            tokenize_charref_raw(Bin, ?INC_COL(S), Start)
    end.
702
 
703
%% Collect the whitespace-separated words of a doctype declaration up to
%% ">" or end of input.
tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.
718
 
719
%% A doctype word is either quoted (single or double) or a bare literal.
%% Deliberately has no whitespace clause: the caller skips whitespace.
tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.
727
 
728
%% Read a quoted word up to the matching Quote character, expanding
%% character references; EOF terminates the word.
tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.
743
 
744
%% Consume a CDATA section's raw content up to "]]>"; at end of input,
%% take everything remaining.
tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.
759
 
760
%% Consume a comment's raw content up to "-->"; an unterminated comment
%% runs to the end of input.
tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.
774
 
775
%% Consume raw text up to a case-insensitive "</script" followed by a
%% probable close.  The end tag itself is left in the input so the main
%% loop tokenizes it separately.
tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
        when (SS =:= $s orelse SS =:= $S) andalso
             (CC =:= $c orelse CC =:= $C) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (II =:= $i orelse II =:= $I) andalso
             (PP =:= $p orelse PP =:= $P) andalso
             (TT =:= $t orelse TT =:= $T) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.
797
 
798
%% Consume raw text up to a case-insensitive "</textarea" followed by a
%% probable close.  The end tag itself is left in the input so the main
%% loop tokenizes it separately.
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
        when (TT =:= $t orelse TT =:= $T) andalso
             (EE =:= $e orelse EE =:= $E) andalso
             (XX =:= $x orelse XX =:= $X) andalso
             (TT2 =:= $t orelse TT2 =:= $T) andalso
             (AA =:= $a orelse AA =:= $A) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (EE2 =:= $e orelse EE2 =:= $E) andalso
             (AA2 =:= $a orelse AA2 =:= $A) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.