JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
test for correct number of parse errors
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
22 #
23 # Instead, the data structure produced by this parser is an array of nodes.
24 #
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
28
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
31 TYPE_COMMENT = 2
32 TYPE_DOCTYPE = 3
33 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
35 TYPE_END_TAG = 5 # name
36 TYPE_EOF = 6
37
38 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
39 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
40 digits = "0123456789"
41 alnum = lc_alpha + uc_alpha + digits
42 hex_chars = digits + "abcdefABCDEF"
43 scopers = { # FIXME these are supposed to be namespace specific
44         'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
45         'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
46         'foreignObject', 'desc', 'title'
47 }
48
49 # some SVG elements have dashes in them
50 tag_name_chars = alnum + "-"
51
52 # http://www.w3.org/TR/html5/infrastructure.html#space-character
53 space_chars = "\u0009\u000a\u000c\u000d\u0020"
54
55 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
56 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
57
58 # These are the character references that don't need a terminating semicolon
59 # min length: 2, max: 6, none are a prefix of any other.
60 legacy_char_refs = {
61         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
62         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
63         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
64         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
65         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
66         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
67         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
68         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
69         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
70         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
71         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
72         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
73         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
74         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
75         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
76         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
77         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
78         yen: '¥', yuml: 'ÿ'
79 }
80
81 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
82 raw_text_elements = ['script', 'style']
83 escapable_raw_text_elements = ['textarea', 'title']
84 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
85 svg_elements = [
86         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
87         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
88         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
89         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
90         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
91         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
92         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
93         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
94         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
95         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
96         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
97         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
98         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
99         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
100         'view', 'vkern'
101 ]
102
103 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
104 mathml_elements = [
105         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
106         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
107         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
108         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
109         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
110         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
111         'determinant', 'diff', 'divergence', 'divide', 'domain',
112         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
113         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
114         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
115         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
116         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
117         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
118         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
119         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
120         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
121         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
122         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
123         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
124         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
125         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
126         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
127         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
128         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
129         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
130         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
131         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
132         'vectorproduct', 'xor'
133 ]
134 # foreign_elements = [svg_elements..., mathml_elements...]
135 #normal_elements = All other allowed HTML elements are normal elements.
136
137 special_elements = {
138         # from HTML:
139         address: true, applet: true, area: true, article: true, aside: true,
140         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
141         br: true, button: true, caption: true, center: true, col: true,
142         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
143         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
144         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
145         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
146         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
147         isindex: true, li: true, link: true, listing: true, main: true,
148         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
149         noscript: true, object: true, ol: true, p: true, param: true,
150         plaintext: true, pre: true, script: true, section: true, select: true,
151         source: true, style: true, summary: true, table: true, tbody: true,
152         td: true, template: true, textarea: true, tfoot: true, th: true,
153         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
154         xmp: true,
155
156         # from MathML:
157         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
158
159         # from SVG:
160         foreignObject: true, desc: true, title: true
161 }
162
163 formatting_elements = {
164          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
165          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
166          u: true
167 }
168
169
170 # decode_named_char_ref()
171 #
172 # The list of named character references is _huge_ so ask the browser to decode
173 # for us instead of wasting bandwidth/space on including the table here.
174 #
175 # Pass without the "&" but with the ";" examples:
176 #    for "&amp" pass "amp;"
177 #    for "&#x2032" pass "x2032;"
178 g_dncr = {
179         cache: {}
180         textarea: document.createElement('textarea')
181 }
182 # TODO test this in IE8
183 decode_named_char_ref = (txt) ->
184         txt = "&#{txt}"
185         decoded = g_dncr.cache[txt]
186         return decoded if decoded?
187         g_dncr.textarea.innerHTML = txt
188         decoded = g_dncr.textarea.value
189         return null if decoded is txt
190         return g_dncr.cache[txt] = decoded
191
192 parse_html = (txt, parse_error_cb = null) ->
193         cur = 0 # index of next char in txt to be parsed
194         # declare tree and tokenizer variables so they're in scope below
195         tree = null
196         open_tags = [] # stack of open elements
197         tree_state = null
198         tok_state = null
199         tok_cur_tag = null # partially parsed tag
200         flag_frameset_ok = null
201         flag_parsing = null
202
203         parse_error = ->
204                 if parse_error_cb?
205                         parse_error_cb cur
206                 else
207                         console.log "Parse error at character #{cur} of #{txt.length}"
208
209
210         # the functions below impliment the Tree Contstruction algorithm
211         # http://www.w3.org/TR/html5/syntax.html#tree-construction
212
213         # But first... the helpers
214         template_tag_is_open = ->
215                 for t of open_tags
216                         if t[0] is TYPE_TAG and t[1] is 'template'
217                                 return true
218                 return false
219         is_in_scope = (tag_name) ->
220                 for t of open_tags
221                         if t[0] is TYPE_TAG and t[1] is tag_name
222                                 return true
223                         # FIXME bail if in scopers
224                 return false
225
226         reconstruct_active_formatting_elements = ->
227                 # FIXME implement this
228
229         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
230         # FIXME implement this
231         close_p_if_in_button_scope = ->
232                 if open_tags[0][1] is 'p' # FIXME
233                         open_tags.pop()
234                 return
235                 #p = find_button_scope 'p'
236                 #if p?
237                         # TODO generate_implied_end_tags except for p tags
238                         # TODO parse_error unless open_tags[0][1] is 'p'
239                         # TODO pop stack until 'p' popped
240
241
242
243         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
244         tree_insert_a_character = (t) ->
245                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
246                 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
247                         open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
248                 else
249                         open_tags[0][3].push t
250
251         # FIXME read spec, do this right
252         # note: this assumes it's an open tag
253         tree_insert_tag = (t) ->
254                 t[0] = TYPE_TAG # not TYPE_OPEN_TAG
255                 # convert attributes into a hash
256                 attrs = {}
257                 while t[2].length
258                         a = t[2].pop()
259                         attrs[a[0]] = a[1]
260                 t[2] = attrs
261                 open_tags[0][3].push t
262                 open_tags.unshift t
263
264         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
265         tree_insert_a_comment = (t) ->
266                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
267                 open_tags[0][3].push t
268
269         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
270         tree_in_body = (t) ->
271                 switch t[0]
272                         when TYPE_TEXT
273                                 switch t[1]
274                                         when "\u0000"
275                                                 parse_error()
276                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
277                                                 reconstruct_active_formatting_elements()
278                                                 tree_insert_a_character t
279                                         else
280                                                 reconstruct_active_formatting_elements()
281                                                 tree_insert_a_character t
282                                                 flag_frameset_ok = false
283                         when TYPE_COMMENT
284                                 tree_insert_a_comment t
285                         when TYPE_DOCTYPE
286                                 parse_error()
287                         when TYPE_OPEN_TAG
288                                 switch t[1]
289                                         when 'html'
290                                                 parse_error()
291                                                 return if template_tag_is_open()
292                                                 root_attrs = open_tags[open_tags.length - 1][3]
293                                                 for k, v of t[2]
294                                                         root_attrs[k] = v unless root_attrs[k]?
295                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
296                                                 # FIXME also do this for </template> (end tag)
297                                                 return tree_in_head t
298                                         when 'body'
299                                                 parse_error()
300                                                 # TODO
301                                         when 'frameset'
302                                                 parse_error()
303                                                 # TODO
304                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
305                                                 close_p_if_in_button_scope()
306                                                 tree_insert_tag t
307                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
308                                                 close_p_if_in_button_scope()
309                                                 if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
310                                                         parse_error()
311                                                         open_tags.shift()
312                                                 tree_insert_tag t
313                                         # TODO lots more to implement here
314                                         else # any other start tag
315                                                 reconstruct_active_formatting_elements()
316                                                 tree_insert_tag t
317                         when TYPE_EOF
318                                 ok_tags = {
319                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
320                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
321                                 }
322                                 for t in open_tags
323                                         unless ok_tags[t[1]]?
324                                                 parse_error()
325                                                 break
326                                 # TODO stack of template insertion modes thing
327                                 flag_parsing = false # stop parsing
328                         when TYPE_END_TAG
329                                 switch t[1]
330                                         when 'body'
331                                                 unless is_in_scope 'body'
332                                                         parse_error()
333                                                         return
334                                                 # TODO implement parse error and move to tree_after_body
335                                         when 'html'
336                                                 unless is_in_scope 'body' # weird, but it's what the spec says
337                                                         parse_error()
338                                                         return
339                                                 # TODO implement parse error and move to tree_after_body, reprocess
340                                         # TODO lots more close tags to implement here
341                                         else
342                                                 for node, i in open_tags
343                                                         if node[1] is t[1]
344                                                                 # FIXME generate implied end tags except those with name==t[1]
345                                                                 parse_error() unless i is 0
346                                                                 while i > 0
347                                                                         open_tags.shift()
348                                                                         i -= 1
349                                                                 open_tags.shift()
350                                                                 return
351                                                         if special_elements[node[1]]?
352                                                                 parse_error()
353                                                                 return
354
355
356         # the functions below implement the tokenizer stats described here:
357         # http://www.w3.org/TR/html5/syntax.html#tokenization
358
359         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
360         tok_state_data = ->
361                 switch c = txt.charAt(cur++)
362                         when '&'
363                                 return [TYPE_TEXT, tokenize_character_reference()]
364                         when '<'
365                                 tok_state = tok_state_tag_open
366                         when "\u0000"
367                                 parse_error()
368                                 return [TYPE_TEXT, c]
369                         when '' # EOF
370                                 return [TYPE_EOF]
371                         else
372                                 return [TYPE_TEXT, c]
373                 return null
374
375         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
376         # not needed: tok_state_character_reference_in_data = ->
377         # just call tok_state_character_reference_in_data()
378
379         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
380         tok_state_tag_open = ->
381                 switch c = txt.charAt(cur++)
382                         when '!'
383                                 tok_state = tok_state_markup_declaration_open
384                         when '/'
385                                 tok_state = tok_state_end_tag_open
386                         when '?'
387                                 parse_error()
388                                 tok_state = tok_state_bogus_comment
389                         else
390                                 if lc_alpha.indexOf(c) > -1
391                                         tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
392                                         tok_state = tok_state_tag_name
393                                 else if uc_alpha.indexOf(c) > -1
394                                         tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
395                                         tok_state = tok_state_tag_name
396                                 else
397                                         parse_error()
398                                         tok_state = tok_state_data
399                                         cur -= 1 # we didn't parse/handle the char after <
400                                         return [TYPE_TEXT, '<']
401                 return null
402
403         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
404         tok_state_end_tag_open = ->
405                 switch c = txt.charAt(cur++)
406                         when '>'
407                                 parse_error()
408                                 tok_state = tok_state_data
409                         when '' # EOF
410                                 parse_error()
411                                 tok_state = tok_state_data
412                                 return [TYPE_TEXT, '</']
413                         else
414                                 if uc_alpha.indexOf(c) > -1
415                                         tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
416                                         tok_state = tok_state_tag_name
417                                 else if lc_alpha.indexOf(c) > -1
418                                         tok_cur_tag = [TYPE_END_TAG, c, [], []]
419                                         tok_state = tok_state_tag_name
420                                 else
421                                         parse_error()
422                                         tok_state = tok_state_bogus_comment
423                 return null
424
425         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
426         tok_state_tag_name = ->
427                 switch c = txt.charAt(cur++)
428                         when "\t", "\n", "\u000c", ' '
429                                 tok_state = tok_state_before_attribute_name
430                         when '/'
431                                 tok_state = tok_state_self_closing_start_tag
432                         when '>'
433                                 tok_state = tok_state_data
434                                 tmp = tok_cur_tag
435                                 tok_cur_tag = null
436                                 return tmp
437                         when "\u0000"
438                                 parse_error()
439                                 tok_cur_tag[1] += "\ufffd"
440                         when '' # EOF
441                                 parse_error()
442                                 tok_state = tok_state_data
443                         else
444                                 if uc_alpha.indexOf(c) > -1
445                                         tok_cur_tag[1] += c.toLowerCase()
446                                 else
447                                         tok_cur_tag[1] += c
448                 return null
449
450         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
451         tok_state_before_attribute_name = ->
452                 attr_name = null
453                 switch c = txt.charAt(cur++)
454                         when "\t", "\n", "\u000c", ' '
455                                 return null
456                         when '/'
457                                 tok_state = tok_state_self_closing_start_tag
458                                 return null
459                         when '>'
460                                 tok_state = tok_state_data
461                                 tmp = tok_cur_tag
462                                 tok_cur_tag = null
463                                 return tmp
464                         when "\u0000"
465                                 parse_error()
466                                 attr_name = "\ufffd"
467                         when '"', "'", '<', '='
468                                 parse_error()
469                                 attr_name = c
470                         when '' # EOF
471                                 parse_error()
472                                 tok_state = tok_state_data
473                         else
474                                 if uc_alpha.indexOf(c) > -1
475                                         attr_name = c.toLowerCase()
476                                 else
477                                         attr_name = c
478                 if attr_name?
479                         tok_cur_tag[2].unshift [attr_name, '']
480                         tok_state = tok_state_attribute_name
481                 return null
482
483         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
484         tok_state_attribute_name = ->
485                 switch c = txt.charAt(cur++)
486                         when "\t", "\n", "\u000c", ' '
487                                 tok_state = tok_state_after_attribute_name
488                         when '/'
489                                 tok_state = tok_state_self_closing_start_tag
490                         when '='
491                                 tok_state = tok_state_before_attribute_value
492                         when '>'
493                                 tok_state = tok_state_data
494                                 tmp = tok_cur_tag
495                                 tok_cur_tag = null
496                                 return tmp
497                         when "\u0000"
498                                 parse_error()
499                                 tok_cur_tag[2][0][0] = "\ufffd"
500                         when '"', "'", '<'
501                                 parse_error()
502                                 tok_cur_tag[2][0][0] = c
503                         when '' # EOF
504                                 parse_error()
505                                 tok_state = tok_state_data
506                         else
507                                 if uc_alpha.indexOf(c) > -1
508                                         tok_cur_tag[2][0][0] = c.toLowerCase()
509                                 else
510                                         tok_cur_tag[2][0][0] += c
511                 return null
512
513         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
514         tok_state_before_attribute_value = ->
515                 switch c = txt.charAt(cur++)
516                         when "\t", "\n", "\u000c", ' '
517                                 return null
518                         when '"'
519                                 tok_state = tok_state_attribute_value_double_quoted
520                         when '&'
521                                 tok_state = tok_state_attribute_value_unquoted
522                                 cur -= 1
523                         when "'"
524                                 tok_state = tok_state_attribute_value_single_quoted
525                         when "\u0000"
526                                 # Parse error
527                                 tok_cur_tag[2][0][1] += "\ufffd"
528                                 tok_state = tok_state_attribute_value_unquoted
529                         when '>'
530                                 # Parse error
531                                 tok_state = tok_state_data
532                                 tmp = tok_cur_tag
533                                 tok_cur_tag = null
534                                 return tmp
535                         when '' # EOF
536                                 parse_error()
537                                 tok_state = tok_state_data
538                         else
539                                 tok_cur_tag[2][0][1] += c
540                                 tok_state = tok_state_attribute_value_unquoted
541                 return null
542
543         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
544         tok_state_attribute_value_double_quoted = ->
545                 switch c = txt.charAt(cur++)
546                         when '"'
547                                 tok_state = tok_state_after_attribute_value_quoted
548                         when '&'
549                                 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
550                         when "\u0000"
551                                 # Parse error
552                                 tok_cur_tag[2][0][1] += "\ufffd"
553                         when '' # EOF
554                                 parse_error()
555                                 tok_state = tok_state_data
556                         else
557                                 tok_cur_tag[2][0][1] += c
558                 return null
559
560         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
561         tok_state_attribute_value_single_quoted = ->
562                 switch c = txt.charAt(cur++)
563                         when "'"
564                                 tok_state = tok_state_after_attribute_value_quoted
565                         when '&'
566                                 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
567                         when "\u0000"
568                                 # Parse error
569                                 tok_cur_tag[2][0][1] += "\ufffd"
570                         when '' # EOF
571                                 parse_error()
572                                 tok_state = tok_state_data
573                         else
574                                 tok_cur_tag[2][0][1] += c
575                 return null
576
577         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
578         tok_state_attribute_value_unquoted = ->
579                 switch c = txt.charAt(cur++)
580                         when "\t", "\n", "\u000c", ' '
581                                 tok_state = tok_state_before_attribute_name
582                         when '&'
583                                 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
584                         when '>'
585                                 tok_state = tok_state_data
586                                 tmp = tok_cur_tag
587                                 tok_cur_tag = null
588                                 return tmp
589                         when "\u0000"
590                                 tok_cur_tag[2][0][1] += "\ufffd"
591                         when '' # EOF
592                                 parse_error()
593                                 tok_state = tok_state_data
594                         else
595                                 # Parse Error if ', <, = or ` (backtick)
596                                 tok_cur_tag[2][0][1] += c
597                 return null
598
599         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
600         tok_state_after_attribute_value_quoted = ->
601                 switch c = txt.charAt(cur++)
602                         when "\t", "\n", "\u000c", ' '
603                                 tok_state = tok_state_before_attribute_name
604                         when '/'
605                                 tok_state = tok_state_self_closing_start_tag
606                         when '>'
607                                 tok_state = tok_state_data
608                                 tmp = tok_cur_tag
609                                 tok_cur_tag = null
610                                 return tmp
611                         when '' # EOF
612                                 parse_error()
613                                 tok_state = tok_state_data
614                         else
615                                 # Parse Error
616                                 tok_state = tok_state_before_attribute_name
617                                 cur -= 1 # we didn't handle that char
618                 return null
619
620         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
621         # Don't set this as a state, just call it
622         # returns a string (NOT a text node)
623         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
624                 if cur >= txt.length
625                         return '&'
626                 switch c = txt.charAt(cur)
627                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
628                                 # explicitly not a parse error
629                                 return '&'
630                         when ';'
631                                 # there has to be "one or more" alnums between & and ; to be a parse error
632                                 return '&'
633                         when '#'
634                                 if cur + 1 >= txt.length
635                                         return '&'
636                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
637                                         prefix = '#x'
638                                         charset = hex_chars
639                                         start = cur + 2
640                                 else
641                                         charset = digits
642                                         start = cur + 1
643                                         prefix = '#'
644                                 i = 0
645                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
646                                         i += 1
647                                 if i is 0
648                                         return '&'
649                                 if txt.charAt(start + i) is ';'
650                                         i += 1
651                                 # FIXME This is supposed to generate parse errors for some chars
652                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
653                                 if decoded?
654                                         cur = start + i
655                                         return decoded
656                                 return '&'
657                         else
658                                 for i in [0...31]
659                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
660                                                 break
661                                 if i is 0
662                                         # exit early, because parse_error() below needs at least one alnum
663                                         return '&'
664                                 if txt.charAt(cur + i) is ';'
665                                         i += 1 # include ';' terminator in value
666                                         decoded = decode_named_char_ref txt.substr(cur, i)
667                                         if decoded?
668                                                 cur += i
669                                                 return decoded
670                                         parse_error()
671                                         return '&'
672                                 else
673                                         # no ';' terminator (only legacy char refs)
674                                         max = i
675                                         for i in [2..max] # no prefix matches, so ok to check shortest first
676                                                 c = legacy_char_refs[txt.substr(cur, i)]
677                                                 if c?
678                                                         if in_attr
679                                                                 if txt.charAt(cur + i) is '='
680                                                                         # "because some legacy user agents will
681                                                                         # misinterpret the markup in those cases"
682                                                                         parse_error()
683                                                                         return '&'
684                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
685                                                                         # this makes attributes forgiving about url args
686                                                                         return '&'
687                                                         # ok, and besides the weird exceptions for attributes...
688                                                         # return the matching char
689                                                         cur += i # consume entity chars
690                                                         parse_error() # because no terminating ";"
691                                                         return c
692                                         parse_error()
693                                         return '&'
694                 return # never reached
695
696         # tree constructor initialization
697         # see comments on TYPE_TAG/etc for the structure of this data
698         tree = [TYPE_TAG, 'html', {}, []]
699         open_tags = [tree]
700         tree_state = tree_in_body
701         flag_frameset_ok = true
702         flag_parsing = true
703
704         # tokenizer initialization
705         tok_state = tok_state_data
706
707         # proccess input
708         while flag_parsing
709                 t = tok_state()
710                 if t?
711                         tree_state t
712         return tree[3]
713
714 # everything below is tests on the above
715 test_equals = (description, output, expected_output) ->
716         if output is expected_output
717                 console.log "passed." # don't say name, so smart consoles can merge all of these
718         else
719                 console.log "FAILED: \"#{description}\""
720                 console.log "   Expected: #{expected_output}"
721                 console.log "     Actual: #{output}"
722 test_parser = (args) ->
723         parse_errors = []
724         errors_cb = (i) ->
725                 parse_errors.push i
726         parsed = parse_html args.html, errors_cb
727         parsed = JSON.stringify parsed
728         if parsed isnt args.expected or parse_errors.length isnt args.errors
729                 console.log "test FAILED: \"#{args.name}\""
730         else
731                 console.log 'test passed'
732         if parsed isnt args.expected
733                 console.log "      Input: #{args.html}"
734                 console.log "    Correct: #{args.expected}"
735                 console.log "     Output: #{parsed}"
736         if parse_errors.length isnt args.errors
737                 console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
738
739 test_parser name: "empty", \
740         html: "",
741         expected: '[]',
742         errors: 0
743 test_parser name: "just text", \
744         html: "abc",
745         expected: '[[1,"abc"]]',
746         errors: 0
747 test_parser name: "named entity", \
748         html: "a&amp;1234",
749         expected: '[[1,"a&1234"]]',
750         errors: 0
751 test_parser name: "broken named character references", \
752         html: "1&amp2&&amp;3&aabbcc;",
753         expected: '[[1,"1&2&&3&aabbcc;"]]',
754         errors: 2
755 test_parser name: "numbered entity overrides", \
756         html: "1&#X80&#x80; &#x83",
757         expected: '[[1,"1€€ ƒ"]]',
758         errors: 0
759 test_parser name: "open tag", \
760         html: "foo<span>bar",
761         expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]',
762         errors: 1 # no close tag
763 test_parser name: "open tag with attributes", \
764         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
765         expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]',
766         errors: 1 # no close tag
767 test_parser name: "open tag with attributes of various quotings", \
768         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
769         expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]',
770         errors: 1 # no close tag
771 test_parser name: "attribute entity exceptions dq", \
772         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
773         expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
774         errors: 2 # no close tag, &amp= in attr
775 test_parser name: "attribute entity exceptions sq", \
776         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
777         expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
778         errors: 2 # no close tag, &amp= in attr
779 test_parser name: "attribute entity exceptions uq", \
780         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
781         expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
782         errors: 2 # no close tag, &amp= in attr
783 test_parser name: "matching closing tags", \
784         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
785         expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]',
786         errors: 0
787 test_parser name: "mis-matched closing tags", \
788         html: "foo<div>bar<span>baz</div>qux",
789         expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]',
790         errors: 1 # close tag mismatch