Subversion Repositories SE.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
12 7u83 1
%% @author Bob Ippolito <bob@mochimedia.com>
2
%% @copyright 2007 Mochi Media, Inc.
3
%%
4
%% Permission is hereby granted, free of charge, to any person obtaining a
5
%% copy of this software and associated documentation files (the "Software"),
6
%% to deal in the Software without restriction, including without limitation
7
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
%% and/or sell copies of the Software, and to permit persons to whom the
9
%% Software is furnished to do so, subject to the following conditions:
10
%%
11
%% The above copyright notice and this permission notice shall be included in
12
%% all copies or substantial portions of the Software.
13
%%
14
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20
%% DEALINGS IN THE SOFTWARE.
21
 
22
%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
-endif.

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\"). %% $\"
-define(SQUOTE, $\'). %% $\'

%% Advance the decoder N columns/bytes on the same line.
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
%% Advance the decoder one column/byte on the same line.
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
%% Advance the decoder to the start of the next line.
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
%% Advance the decoder past character C, tracking line breaks.
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).

-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
-define(IS_LETTER(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z))).
%% Characters that may appear unquoted inside a DOCTYPE declaration.
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

%% Decoder position state: 1-based line/column plus 0-based byte offset
%% into the input binary.
-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}
77
 
78
%% External API.
79
 
80
%% @spec parse(string() | binary()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).
84
 
85
%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions
    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.
92
 
93
%% Scan forward to the first non-singleton start tag, tracking whether an
%% HTML5 doctype was seen so a missing <html> wrapper can be synthesized.
find_document(Tokens=[{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
    maybe_add_html_tag(Tokens, Mode);
find_document([{doctype, [<<"html">>]} | Rest], _Mode) ->
    find_document(Rest, html5);
find_document([_T | Rest], Mode) ->
    find_document(Rest, Mode);
find_document([], _Mode) ->
    [].
101
 
102
%% In html5 mode, prepend a synthetic <html> start tag when the document's
%% first element is something else; otherwise return the tokens unchanged.
maybe_add_html_tag(Tokens=[{start_tag, Tag, _Attrs, false} | _], html5)
  when Tag =/= <<"html">> ->
    [{start_tag, <<"html">>, [], false} | Tokens];
maybe_add_html_tag(Tokens, _Mode) ->
    Tokens.
107
 
108
%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).
112
 
113
%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    case is_singleton(Tag) of
        true ->
            %% Singleton tags never emit children or an end_tag.
            to_tokens([], [{start_tag, Tag, Attrs, true}]);
        false ->
            to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, false}])
    end.
134
 
135
%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to a HTML document.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).
141
 
142
%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []);
escape(S) when is_list(S) ->
    escape(S, []).
150
 
151
%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;).
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    %% mochinum gives the shortest decimal representation of the float.
    escape_attr(mochinum:digits(F), []).
164
 
165
%% Render a token stream to an iolist accumulator (reversed, flattened at
%% the end).  One clause per token kind.
to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    %% Inline HTML passes through unescaped.
    to_html(Rest, [Content | Acc]);
to_html([{pi, Bin} | Rest], Acc) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    %% BUGFIX: previously the outer Acc was used to seed the doctype
    %% accumulator, which would duplicate any already-rendered output
    %% inside the <!DOCTYPE ...> when the doctype token was not first.
    Inside = doctype_to_html(Parts, []),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).
198
 
199
%% Render the word list of a doctype declaration; words containing only
%% literal-safe characters stay bare, anything else is quoted and escaped.
doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.
209
 
210
%% Render a [{Key, Value}] attribute list as ` key="value"` iolist parts.
attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).
216
 
217
%% Character-at-a-time HTML escaping over a char list with a reversed
%% accumulator; lists:reverse/2 prepends the entity reversed in one pass.
escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape("<" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape(">" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape("&" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).
227
 
228
%% Like escape/2 but additionally escapes double quotes for attribute
%% value context.
escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr("<" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr(">" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr("&" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).
240
 
241
%% Normalize a tag given as an atom, list or binary to a lowercase binary.
to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).
245
 
246
%% Worklist-driven flattening of the permissive html_node() tuple forms
%% into a token list.  The worklist holds {Tag, RemainingChildren} frames;
%% an exhausted frame emits the end_tag.
to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            %% Singleton children contribute only a start_tag.
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
297
 
298
%% Main tokenizer loop: pull one token at a time until the offset reaches
%% the end of the binary.  script/textarea start tags switch to raw-text
%% tokenization for their body so markup inside them is not parsed.
tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.
315
 
316
%% Classify a token: non-singleton <script>/<textarea> start tags require
%% raw-text body tokenization; everything else is handled normally.
parse_flag({start_tag, B, _, false}) ->
    case string:to_lower(binary_to_list(B)) of
        "script" ->
            script;
        "textarea" ->
            textarea;
        _ ->
            none
    end;
parse_flag(_) ->
    none.
327
 
328
%% Dispatch on the next characters at the current offset to produce one
%% token.  Clause order matters: longer/more specific prefixes first.
tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!doctype", _/binary>> ->
            %% NOTE(review): "<!doctype" is 9 chars but 10 columns are
            %% consumed, eating the character after it (normally a space);
            %% kept as-is since tokenize_doctype skips whitespace anyway.
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            %% PHP blocks are kept raw up to the closing "?>".
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            %% Generic processing instruction with attributes.
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>>
                when ?IS_WHITESPACE(C); not ?IS_LETTER(C) ->
            %% This isn't really strict HTML: a bare "<" not starting a
            %% tag is treated as character data.
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.
366
 
367
%% Coalesce a run of consecutive data tokens into one binary, tracking
%% whether the whole run was whitespace-only.
tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
tree_data(Rest, AllWhitespace, Acc) ->
    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.
371
 
372
%% Fold the token stream into a tree using an explicit stack of open
%% elements.  Returns {Tree, RemainingTokens}.
tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            %% Stack fully unwound: the document element was closed.
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            %% Whitespace-only runs between elements are dropped.
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    tree(Rest, Stack).
400
 
401
%% Normalize a tag (and optionally its attribute list) to the internal
%% {Binary, Attrs, Children} node form with lowercase names.
norm({Tag, Attrs}) ->
    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).
407
 
408
%% Push a new open element, implicitly closing a same-named sibling for
%% li/option, and closing dd/dt when followed by either dd or dt.
stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].
417
 
418
%% Attach a completed child node to the element on top of the stack.
append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
    [{Name, Attrs, [StartTag | Acc]} | Stack].
420
 
421
%% Close the named element: unwind the open-element stack down to (and
%% including) the matching tag, folding closed elements into their parent.
%% Returns the new stack, or a finished tree if the root was closed.
destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    %% Predicate is false exactly at the frame whose tag matches TagName.
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            %% Reparent the children accumulated after the
                            %% mis-closed singleton into that singleton.
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.
460
 
461
%% Collapse the whole remaining stack into a single tree, reversing each
%% child list back into document order on the way down.
destack([{Tag, Attrs, Acc}]) ->
    {Tag, Attrs, lists:reverse(Acc)};
destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
465
 
466
%% True for HTML void (singleton) elements, which never take children or
%% an end tag.  Tag must already be a lowercase binary.
is_singleton(<<"area">>) -> true;
is_singleton(<<"base">>) -> true;
is_singleton(<<"br">>) -> true;
is_singleton(<<"col">>) -> true;
is_singleton(<<"embed">>) -> true;
is_singleton(<<"hr">>) -> true;
is_singleton(<<"img">>) -> true;
is_singleton(<<"input">>) -> true;
is_singleton(<<"keygen">>) -> true;
is_singleton(<<"link">>) -> true;
is_singleton(<<"meta">>) -> true;
is_singleton(<<"param">>) -> true;
is_singleton(<<"source">>) -> true;
is_singleton(<<"track">>) -> true;
is_singleton(<<"wbr">>) -> true;
is_singleton(_) -> false.
482
 
483
%% Consume character data up to the next "<" or "&", tracking whether it
%% was all whitespace.  Returns the slice as a sub-binary (no copy).
tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.
496
 
497
%% Collect {Name, Value} attribute pairs until the tag terminator
%% (">", "/", "?>") or end of input.
tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            %% Processing-instruction close.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.
515
 
516
%% Parse the optional "=value" part of an attribute; a bare attribute
%% (no "=") gets its own name as value, matching browser behavior.
tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            {Attr, S1}
    end.
526
 
527
%% Dispatch on the first value character: quoted (single or double) vs
%% unquoted attribute value.
tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            {[], S};
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.
537
 
538
%% Accumulate a quoted attribute value up to the matching quote Q,
%% expanding character references along the way.
tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            %% Unterminated quote: return what we have.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data | Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C | Acc], Q)
    end.
550
 
551
%% Accumulate an unquoted attribute value up to "/>", ">" or whitespace,
%% expanding character references along the way.
tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data | Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            %% Leave "/>" for find_gt so the singleton flag is detected.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C | Acc])
    end.
565
 
566
%% Advance the decoder past any run of whitespace characters.
skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.
573
 
574
%% Read a tag or attribute name.  A leading ">", "/" or "=" is returned
%% as a one-character literal so the caller always makes progress.
tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                    orelse C =:= $/
                                    orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.
585
 
586
%% Accumulate literal characters (expanding charrefs) until whitespace or
%% one of ">", "/", "=";  the result is lowercased.
tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.
599
 
600
%% Consume raw content up to (and past) a "?>" terminator; at end of
%% input, return everything remaining.
raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.
615
 
616
%% Advance past a processing-instruction close: "?>", ">" or "/>".
%% Intermediate characters are consumed by tokenize_attributes before
%% this is called, so only the terminators (or EOF) are expected here.
find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
            ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
            ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.
630
 
631
%% Advance to just past the next ">", remembering whether a "/" was seen
%% on the way (used to mark the tag as a self-closed singleton).
find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            {S, HasSlash}
    end.
645
 
646
%% Decode a character reference starting just after "&".  Surrogate code
%% points are combined with a following reference into one UTF-8 char.
%% Any malformed reference falls back to literal "&" with no advance.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        case tokenize_charref_raw(Bin, S, O) of
            {C1, S1} when C1 >= 16#D800 andalso C1 =< 16#DFFF ->
                %% Surrogate pair
                tokeninize_charref_surrogate_pair(Bin, S1, C1);
            {Unichar, S1} when is_integer(Unichar) ->
                {{data, mochiutf8:codepoint_to_bytes(Unichar), false},
                 S1};
            {Unichars, S1} when is_list(Unichars) ->
                {{data, unicode:characters_to_binary(Unichars), false},
                 S1};
            {undefined, _} ->
                throw(invalid_charref)
        end
    catch
        throw:invalid_charref ->
            {{data, <<"&">>, false}, S}
    end.
665
 
666
%% Combine a high surrogate C1 with the immediately following "&...;"
%% reference into a single UTF-8 encoded character.  (Function name typo
%% "tokeninize" kept: it is referenced by tokenize_charref.)
tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
                %% BUGFIX: the upper bound previously re-checked C1
                %% (already known to be a surrogate) instead of C2,
                %% accepting any C2 >= 16#D800 — even values far above
                %% the surrogate range.
                {C2, S1} when C2 >= 16#D800 andalso C2 =< 16#DFFF ->
                    {{data,
                      unicode:characters_to_binary(
                        <<C1:16, C2:16>>,
                        utf16,
                        utf8),
                      false},
                     S1};
                _ ->
                    throw(invalid_charref)
            end;
        _ ->
            throw(invalid_charref)
    end.
684
 
685
%% Scan the name of a character reference up to ";" and resolve it via
%% mochiweb_charref.  Throws invalid_charref on EOF or any delimiter
%% that cannot occur inside a reference.
tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            throw(invalid_charref);
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $> ->
            throw(invalid_charref);
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {mochiweb_charref:charref(Raw), ?INC_COL(S)};
        _ ->
            tokenize_charref_raw(Bin, ?INC_COL(S), Start)
    end.
702
 
703
%% Collect the whitespace-separated words of a doctype declaration up to
%% ">" or end of input.
tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.
718
 
719
%% A doctype word is either quoted (single or double) or a bare literal.
%% Deliberately has no whitespace clause: the caller skips whitespace.
tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.
727
 
728
%% Read a quoted word up to the matching Quote character, expanding
%% character references; EOF terminates the word.
tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.
743
 
744
%% Consume a CDATA section's raw content up to "]]>"; at end of input,
%% take everything remaining.
tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.
759
 
760
%% Consume a comment's raw content up to "-->"; an unterminated comment
%% runs to the end of input.
tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.
774
 
775
%% Consume raw text up to a case-insensitive "</script" followed by a
%% probable close.  The end tag itself is left in the input so the main
%% loop tokenizes it separately.
tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
        when (SS =:= $s orelse SS =:= $S) andalso
             (CC =:= $c orelse CC =:= $C) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (II =:= $i orelse II =:= $I) andalso
             (PP =:= $p orelse PP =:= $P) andalso
             (TT =:= $t orelse TT =:= $T) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.
797
 
798
%% Consume raw text up to a case-insensitive "</textarea" followed by a
%% probable close.  The end tag itself is left in the input so the main
%% loop tokenizes it separately.
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
        when (TT =:= $t orelse TT =:= $T) andalso
             (EE =:= $e orelse EE =:= $E) andalso
             (XX =:= $x orelse XX =:= $X) andalso
             (TT2 =:= $t orelse TT2 =:= $T) andalso
             (AA =:= $a orelse AA =:= $A) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (EE2 =:= $e orelse EE2 =:= $E) andalso
             (AA2 =:= $a orelse AA2 =:= $A) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.