12 |
7u83 |
1 |
%% @author Bob Ippolito <bob@mochimedia.com>
|
|
|
2 |
%% @copyright 2007 Mochi Media, Inc.
|
|
|
3 |
%%
|
|
|
4 |
%% Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
5 |
%% copy of this software and associated documentation files (the "Software"),
|
|
|
6 |
%% to deal in the Software without restriction, including without limitation
|
|
|
7 |
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
8 |
%% and/or sell copies of the Software, and to permit persons to whom the
|
|
|
9 |
%% Software is furnished to do so, subject to the following conditions:
|
|
|
10 |
%%
|
|
|
11 |
%% The above copyright notice and this permission notice shall be included in
|
|
|
12 |
%% all copies or substantial portions of the Software.
|
|
|
13 |
%%
|
|
|
14 |
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
15 |
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
16 |
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
17 |
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
18 |
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
19 |
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
|
20 |
%% DEALINGS IN THE SOFTWARE.
|
|
|
21 |
|
|
|
22 |
%% @doc Loosely tokenizes and generates parse trees for HTML 4.
|
|
|
23 |
-module(mochiweb_html).
|
|
|
24 |
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
|
|
|
25 |
escape_attr/1, to_html/1]).
|
|
|
26 |
-ifdef(TEST).
|
|
|
27 |
-export([destack/1, destack/2, is_singleton/1]).
|
|
|
28 |
-endif.
|
|
|
29 |
|
|
|
30 |
%% This is a macro to placate syntax highlighters..
|
|
|
31 |
%% Character constants for the two quote styles; written as macros so
%% editors' syntax highlighters don't get confused by the escaped quote.
-define(QUOTE, $\"). %% $\"
-define(SQUOTE, $\'). %% $\'
%% Advance the decoder N characters on the current line (column and
%% byte offset both move by N).
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
%% Advance the decoder by exactly one non-newline character.
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
%% Advance past a newline: reset column, bump line and offset.
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
%% Advance past character C, treating $\n as a line break and anything
%% else as a plain column increment.
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).

%% Guard-safe character class tests used throughout the tokenizer.
-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
-define(IS_LETTER(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z))).
%% ASCII alphanumerics: safe to emit unquoted (e.g. in doctype parts).
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
%% Character that very likely terminates an (unquoted) tag context.
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

%% Scanner position: 1-based line and column for diagnostics, plus the
%% 0-based byte offset into the input binary that drives all matching.
-record(decoder, {line=1,
                  column=1,
                  offset=0}).
|
|
|
67 |
|
|
|
68 |
%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
|
|
|
69 |
%% @type html_attr() = {string(), string()}
|
|
|
70 |
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
|
|
|
71 |
%% @type html_data() = {data, string(), Whitespace::boolean()}
|
|
|
72 |
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
|
|
|
73 |
%% @type end_tag() = {end_tag, Name}
|
|
|
74 |
%% @type html_comment() = {comment, Comment}
|
|
|
75 |
%% @type html_doctype() = {doctype, [Doctype]}
|
|
|
76 |
%% @type inline_html() = {'=', iolist()}
|
|
|
77 |
|
|
|
78 |
%% External API.
|
|
|
79 |
|
|
|
80 |
%% @spec parse(string() | binary()) -> html_node()
|
|
|
81 |
%% @doc tokenize and then transform the token stream into a HTML tree.
|
|
|
82 |
%% @spec parse(string() | binary()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).
|
|
|
84 |
|
|
|
85 |
%% @spec parse_tokens([html_token()]) -> html_node()
|
|
|
86 |
%% @doc Transform the output of tokens(Doc) into a HTML tree.
|
|
|
87 |
%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
%% Crashes (badmatch) if no suitable root start_tag is found.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions
    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.
|
|
|
92 |
|
|
|
93 |
%% Scan forward to the first non-singleton start_tag, remembering (via
%% Mode) whether an <!DOCTYPE html> was seen so an implicit <html> root
%% can be synthesized for html5 fragments. Returns [] if no start_tag
%% exists (parse_tokens/1 will then badmatch).
find_document(Tokens=[{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
    maybe_add_html_tag(Tokens, Mode);
find_document([{doctype, [<<"html">>]} | Rest], _Mode) ->
    find_document(Rest, html5);
find_document([_T | Rest], Mode) ->
    find_document(Rest, Mode);
find_document([], _Mode) ->
    [].
|
|
|
101 |
|
|
|
102 |
%% In html5 mode, prepend a synthetic <html> start_tag when the first
%% element is not already <html>; otherwise pass the tokens through.
maybe_add_html_tag(Tokens=[{start_tag, Tag, _Attrs, false} | _], html5)
  when Tag =/= <<"html">> ->
    [{start_tag, <<"html">>, [], false} | Tokens];
maybe_add_html_tag(Tokens, _Mode) ->
    Tokens.
|
|
|
107 |
|
|
|
108 |
%% @spec tokens(StringOrBinary) -> [html_token()]
|
|
|
109 |
%% @doc Transform the input UTF-8 HTML into a token stream.
|
|
|
110 |
%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).
|
|
|
112 |
|
|
|
113 |
%% @spec to_tokens(html_node()) -> [html_token()]
|
|
|
114 |
%% @doc Convert a html_node() tree to a list of tokens.
|
|
|
115 |
%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens. Accepts several
%% shorthand forms ({Tag}, {Tag, Children}, raw/doctype/comment tuples)
%% in addition to the canonical {Tag, Attrs, Children}.
to_tokens({Tag0}) ->
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    case is_singleton(Tag) of
        true ->
            %% Singletons emit a self-closing start_tag; children ignored.
            to_tokens([], [{start_tag, Tag, Attrs, true}]);
        false ->
            %% Push the node onto the work stack so its children are
            %% rendered and an end_tag is produced afterwards.
            to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, false}])
    end.
|
|
|
134 |
|
|
|
135 |
%% @spec to_html([html_token()] | html_node()) -> iolist()
|
|
|
136 |
%% @doc Convert a list of html_token() to a HTML document.
|
|
|
137 |
%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() (or a single tree node) to a
%% HTML document iolist.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).
|
|
|
141 |
|
|
|
142 |
%% @spec escape(string() | atom() | binary()) -> binary()
|
|
|
143 |
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
|
|
|
144 |
%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
%% Normalizes binaries and atoms to a char list before escaping.
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []);
escape(S) when is_list(S) ->
    escape(S, []).
|
|
|
150 |
|
|
|
151 |
%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
|
|
|
152 |
%% @doc Escape a string such that it's safe for HTML attrs
|
|
|
153 |
%% (amp; lt; gt; quot;).
|
|
|
154 |
%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%% (amp; lt; gt; quot;). Numbers are rendered to text first.
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    %% mochinum:digits/1 gives the shortest exact decimal rendering.
    escape_attr(mochinum:digits(F), []).
|
|
|
164 |
|
|
|
165 |
%% Accumulator loop behind to_html/1: renders each token to an iolist
%% fragment, prepending to Acc and reversing at the end.
to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    %% Raw inline HTML is emitted verbatim, unescaped.
    to_html(Rest, [Content | Acc]);
to_html([{pi, Bin} | Rest], Acc) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    %% BUGFIX: render the doctype words from a fresh accumulator.
    %% Previously this passed Acc, and since doctype_to_html/2 reverses
    %% its accumulator into the result, all previously rendered output
    %% was duplicated inside <!DOCTYPE ...> whenever the doctype was
    %% not the first token.
    Inside = doctype_to_html(Parts, []),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    %% Text content is HTML-escaped.
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).
|
|
|
198 |
|
|
|
199 |
%% Render doctype parts: plain alphanumeric words are emitted bare,
%% anything else is double-quoted and attribute-escaped. Accumulator
%% is reversed into the result at the end.
doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.
|
|
|
209 |
|
|
|
210 |
%% Render an attribute list as iolist fragments: ` key="escaped value"`.
attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).
|
|
|
216 |
|
|
|
217 |
%% Character-by-character HTML escaping over a char list; the escaped
%% entity text is pushed onto Acc pre-reversed so the final reverse
%% restores order. Produces a binary.
escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape([$< | Remaining], Acc) ->
    escape(Remaining, lists:reverse("&lt;", Acc));
escape([$> | Remaining], Acc) ->
    escape(Remaining, lists:reverse("&gt;", Acc));
escape([$& | Remaining], Acc) ->
    escape(Remaining, lists:reverse("&amp;", Acc));
escape([Ch | Remaining], Acc) ->
    escape(Remaining, [Ch | Acc]).
|
|
|
227 |
|
|
|
228 |
%% Attribute-value escaping over a char list: like escape/2 but also
%% replaces double quotes ($\", the ?QUOTE macro's expansion) with
%% &quot;. Produces a binary.
escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr([$< | Remaining], Acc) ->
    escape_attr(Remaining, lists:reverse("&lt;", Acc));
escape_attr([$> | Remaining], Acc) ->
    escape_attr(Remaining, lists:reverse("&gt;", Acc));
escape_attr([$& | Remaining], Acc) ->
    escape_attr(Remaining, lists:reverse("&amp;", Acc));
escape_attr([$\" | Remaining], Acc) ->
    escape_attr(Remaining, lists:reverse("&quot;", Acc));
escape_attr([Ch | Remaining], Acc) ->
    escape_attr(Remaining, [Ch | Acc]).
|
|
|
240 |
|
|
|
241 |
%% Normalize a tag given as an atom, string, or binary into a
%% lowercased binary via norm/1.
to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).
|
|
|
245 |
|
|
|
246 |
%% Work-stack loop behind to_tokens/1. The first argument is a stack of
%% {Tag, RemainingChildren} frames; each clause either normalizes one
%% shorthand child form or emits tokens. Clause order matters: the
%% shorthand patterns must be tried before the generic list/binary ones.
to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    %% Frame exhausted: close the element.
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            %% Singleton child: emit self-closing tag, drop its children.
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            %% Push a new frame for the child's children.
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
|
|
|
297 |
|
|
|
298 |
%% Main tokenizer loop: repeatedly call tokenize/2 until the offset
%% reaches the end of the binary. After a <script>/<textarea> start_tag,
%% switch to the raw-text tokenizers so their contents are not parsed
%% as markup.
tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            %% Offset is at end of input: done.
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.
|
|
|
315 |
|
|
|
316 |
%% Classify a freshly produced token: non-self-closing <script> and
%% <textarea> start tags (case-insensitive) switch the tokenizer into
%% raw-text mode; everything else is 'none'.
parse_flag({start_tag, TagName, _Attrs, false}) ->
    Lowered = string:to_lower(binary_to_list(TagName)),
    if
        Lowered =:= "script" -> script;
        Lowered =:= "textarea" -> textarea;
        true -> none
    end;
parse_flag(_Other) ->
    none.
|
|
|
327 |
|
|
|
328 |
%% Produce the next token starting at the decoder's byte offset.
%% Dispatches on the leading bytes; clause order matters (e.g. "<!--"
%% before "<", "<?php" before "<?").
tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!doctype", _/binary>> ->
            %% NOTE(review): "<!doctype" is 9 chars but 10 are skipped,
            %% consuming the (usual) following space as well — same as
            %% the uppercase clause below; confirm intentional.
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            %% Raw PHP block: grab everything up to "?>" verbatim.
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            %% Generic processing instruction with attributes.
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>>
          when ?IS_WHITESPACE(C); not ?IS_LETTER(C) ->
            %% This isn't really strict HTML
            %% A "<" not followed by a letter is treated as literal data.
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            %% "/>" or a known void element marks the tag as a singleton.
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.
|
|
|
366 |
|
|
|
367 |
%% Coalesce a leading run of {data, _, _} tokens into a single binary.
%% Returns {Joined, AllWhitespace, RemainingTokens} where AllWhitespace
%% is true only if every merged chunk was whitespace-only.
tree_data(Tokens, AllWhitespace, Acc) ->
    case Tokens of
        [{data, Chunk, ChunkWs} | More] ->
            tree_data(More, AllWhitespace andalso ChunkWs, [Chunk | Acc]);
        Remaining ->
            {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Remaining}
    end.
|
|
|
371 |
|
|
|
372 |
%% Build the HTML tree from the token stream with an explicit stack of
%% open elements ({Tag, Attrs, ReversedChildren} tuples).
tree([], Stack) ->
    %% Input exhausted: collapse whatever is still open.
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            %% destack/2 returned a finished tree (root was closed).
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    %% Singleton: attach directly to the current parent, never opened.
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            %% Whitespace-only runs between elements are dropped.
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    %% Doctypes inside the document are ignored.
    tree(Rest, Stack).
|
|
|
400 |
|
|
|
401 |
%% Normalize a tag (and optionally its attributes) into canonical form:
%% lowercased binary tag names, binary attribute values, and an empty
%% child list for {Tag, Attrs} input. Binaries pass through unchanged.
norm({Tag, Attrs}) ->
    NormAttr = fun ({K, V}) -> {norm(K), iolist_to_binary(V)} end,
    {norm(Tag), lists:map(NormAttr, Attrs), []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).
|
|
|
407 |
|
|
|
408 |
%% Push a newly opened element onto the stack, auto-closing elements
%% that HTML allows to be implicitly terminated: a new <li>/<option>
%% closes a same-named sibling, and <dd>/<dt> close each other.
stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].
|
|
|
417 |
|
|
|
418 |
%% Attach a completed node (or data binary) as a child of the element
%% on top of the stack; children are kept in reverse order.
append_stack_child(StartTag, [Top | Stack]) ->
    {Name, Attrs, Children} = Top,
    [{Name, Attrs, [StartTag | Children]} | Stack].
|
|
|
420 |
|
|
|
421 |
%% Handle an end_tag for TagName against the open-element stack.
%% Returns either a new stack (list) or, if the root itself was closed,
%% the finished tree tuple.
destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    %% F is true for frames that are NOT the tag being closed, so
    %% splitwith partitions Stack into {ElementsAbove, [Match | Below]}.
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    %% The matching tag may be among the top frame's
                    %% already-collected children rather than open.
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            %% Reparent the children gathered since the
                            %% singleton under it.
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.
|
|
|
460 |
|
|
|
461 |
%% Collapse the whole open-element stack into a single tree node,
%% folding each frame into its parent and un-reversing child lists.
destack(Stack) ->
    case Stack of
        [{Tag, Attrs, Children}] ->
            {Tag, Attrs, lists:reverse(Children)};
        [{ChildTag, ChildAttrs, ChildAcc}, {Tag, Attrs, Children} | Rest] ->
            Folded = {ChildTag, ChildAttrs, lists:reverse(ChildAcc)},
            destack([{Tag, Attrs, [Folded | Children]} | Rest])
    end.
|
|
|
465 |
|
|
|
466 |
%% True for HTML void elements, which never have an end tag or children.
is_singleton(Tag) ->
    lists:member(Tag, [<<"area">>, <<"base">>, <<"br">>, <<"col">>,
                       <<"embed">>, <<"hr">>, <<"img">>, <<"input">>,
                       <<"keygen">>, <<"link">>, <<"meta">>, <<"param">>,
                       <<"source">>, <<"track">>, <<"wbr">>]).
|
|
|
482 |
|
|
|
483 |
%% Consume a run of text up to the next "<" or "&" (or end of input)
%% and emit it as {data, Binary, AllWhitespace}.
tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

%% Start marks the first byte of the run; Whitespace stays true only
%% while every consumed character is whitespace.
tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.
|
|
|
496 |
|
|
|
497 |
%% Parse the attribute list inside a tag, stopping at ">", "/", "?>" or
%% end of input. Returns {[{Name, Value}], State}.
tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            %% End of a processing instruction.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            %% A bare attribute with no "=value" gets its own name as value.
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.
|
|
|
515 |
|
|
|
516 |
%% After an attribute name: if an "=" follows (whitespace allowed), read
%% the value; otherwise the attribute is valueless and its name doubles
%% as its value.
tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            {Attr, S1}
    end.
|
|
|
526 |
|
|
|
527 |
%% Dispatch on the first character of an attribute value: a single or
%% double quote starts a quoted value, otherwise read an unquoted one.
tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            { [], S };
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.
|
|
|
537 |
|
|
|
538 |
%% Read a quoted attribute value up to the closing quote Q, expanding
%% character references along the way.
tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            %% Unterminated value: return what we have.
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data|Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S) };
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C|Acc], Q)
    end.
|
|
|
550 |
|
|
|
551 |
%% Read an unquoted attribute value, stopping at "/>", ">", whitespace,
%% or end of input; character references are expanded.
tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data|Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            %% Leave "/>" for the tag scanner (find_gt) to consume.
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C|Acc])
    end.
|
|
|
565 |
|
|
|
566 |
%% Advance the decoder past any run of whitespace characters.
skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.
|
|
|
573 |
|
|
|
574 |
%% Read a tag or attribute name: characters up to whitespace, ">", "/"
%% or "=", with character references expanded, lowercased to a binary.
tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                         orelse C =:= $/
                                         orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.

tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.
|
|
|
599 |
|
|
|
600 |
%% Capture everything up to (but not including) "?>" as a raw binary,
%% used for <?php ... ?> blocks. At end of input, return the remainder.
raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            %% Unterminated: take the rest of the input.
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.
|
|
|
615 |
|
|
|
616 |
%% Skip past the terminator of a processing instruction: "?>", ">" or
%% "/>". Assumes tokenize_attributes has already consumed any other
%% characters before the terminator.
find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
            ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
            ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.
|
|
|
630 |
|
|
|
631 |
%% Scan forward to the next ">", reporting whether a "/" was seen on
%% the way (used to detect self-closing "/>" tags).
find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            %% End of input without ">".
            {S, HasSlash}
    end.
|
|
|
645 |
|
|
|
646 |
%% Decode a character reference following "&". Invalid or unterminated
%% references fall back to emitting a literal "&" and leaving the
%% decoder state unchanged.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        case tokenize_charref_raw(Bin, S, O) of
            {C1, S1} when C1 >= 16#D800 andalso C1 =< 16#DFFF ->
                %% Surrogate pair
                tokeninize_charref_surrogate_pair(Bin, S1, C1);
            {Unichar, S1} when is_integer(Unichar) ->
                {{data, mochiutf8:codepoint_to_bytes(Unichar), false},
                 S1};
            {Unichars, S1} when is_list(Unichars) ->
                %% Some entities expand to multiple codepoints.
                {{data, unicode:characters_to_binary(Unichars), false},
                 S1};
            {undefined, _} ->
                throw(invalid_charref)
        end
    catch
        throw:invalid_charref ->
            {{data, <<"&">>, false}, S}
    end.
|
|
|
665 |
|
|
|
666 |
%% Decode the second half of a UTF-16 surrogate pair written as two
%% adjacent numeric character references. C1 is the first surrogate
%% code unit already decoded by tokenize_charref/2; the next reference
%% must be another surrogate code unit or the whole pair is invalid.
tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
                {C2, S1} when C2 >= 16#D800 andalso C2 =< 16#DFFF ->
                    %% BUGFIX: both bounds now test C2. The old guard's
                    %% "C1 =< 16#DFFF" was always true (C1 is already
                    %% known to be a surrogate), so any C2 >= 16#D800 —
                    %% including non-surrogate values — slipped through
                    %% to the UTF-16 conversion below.
                    {{data,
                      unicode:characters_to_binary(
                        <<C1:16, C2:16>>,
                        utf16,
                        utf8),
                      false},
                     S1};
                _ ->
                    throw(invalid_charref)
            end;
        _ ->
            throw(invalid_charref)
    end.
|
|
|
684 |
|
|
|
685 |
%% Scan the body of a character reference up to ";", then resolve it
%% via mochiweb_charref. Throws invalid_charref on end of input or any
%% character that cannot appear inside a reference.
tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            throw(invalid_charref);
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $> ->
            throw(invalid_charref);
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {mochiweb_charref:charref(Raw), ?INC_COL(S)};
        _ ->
            tokenize_charref_raw(Bin, ?INC_COL(S), Start)
    end.
|
|
|
702 |
|
|
|
703 |
%% Parse the words of a <!DOCTYPE ...> declaration up to ">" or end of
%% input, producing {doctype, [Word]}.
tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.
|
|
|
718 |
|
|
|
719 |
%% Read the next doctype part: a quoted word if it starts with a quote,
%% otherwise a bare literal. The caller guarantees we are not on
%% whitespace, so the missing whitespace clause is intentional.
tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.
|
|
|
727 |
|
|
|
728 |
%% Read a quoted word up to the matching Quote character, expanding
%% character references; returns the contents as a binary.
tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            %% Unterminated quote: return what we have.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.
|
|
|
743 |
|
|
|
744 |
%% Capture a <![CDATA[ ... ]]> section verbatim as a data token; the
%% contents are not escaped or parsed.
tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            %% Unterminated CDATA: take the rest of the input.
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.
|
|
|
759 |
|
|
|
760 |
%% Capture a <!-- ... --> comment body verbatim; an unterminated
%% comment takes the rest of the input.
tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.
|
|
|
774 |
|
|
|
775 |
%% Raw-text mode after a <script> start tag: consume everything up to
%% a case-insensitive "</script" followed by ">" or whitespace, leaving
%% the end tag itself for the regular tokenizer.
tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
          when (SS =:= $s orelse SS =:= $S) andalso
               (CC =:= $c orelse CC =:= $C) andalso
               (RR =:= $r orelse RR =:= $R) andalso
               (II =:= $i orelse II =:= $I) andalso
               (PP =:= $p orelse PP =:= $P) andalso
               (TT=:= $t orelse TT =:= $T) andalso
               ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Unterminated script: take the rest of the input.
            {{data, Raw, false}, S}
    end.
|
|
|
797 |
|
|
|
798 |
%% Raw-text mode after a <textarea> start tag: consume everything up to
%% a case-insensitive "</textarea" followed by ">" or whitespace,
%% leaving the end tag itself for the regular tokenizer.
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
          when (TT =:= $t orelse TT =:= $T) andalso
               (EE =:= $e orelse EE =:= $E) andalso
               (XX =:= $x orelse XX =:= $X) andalso
               (TT2 =:= $t orelse TT2 =:= $T) andalso
               (AA =:= $a orelse AA =:= $A) andalso
               (RR =:= $r orelse RR =:= $R) andalso
               (EE2 =:= $e orelse EE2 =:= $E) andalso
               (AA2 =:= $a orelse AA2 =:= $A) andalso
               ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Unterminated textarea: take the rest of the input.
            {{data, Raw, false}, S}
    end.
|