JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
start implementing tree construction algorithm
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
22 #
23 # Instead, the data structure produced by this parser is an array of nodes.
24 #
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
28
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
31 TYPE_COMMENT = 2
32 TYPE_DOCTYPE = 3
33 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
35 TYPE_END_TAG = 5 # name
36 TYPE_EOF = 6
37
38 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
39 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
40 digits = "0123456789"
41 alnum = lc_alpha + uc_alpha + digits
42 hex_chars = digits + "abcdefABCDEF"
43 scopers = { # FIXME these are supposed to be namespace specific
44         'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
45         'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
46         'foreignObject', 'desc', 'title'
47 }
48
49 # some SVG elements have dashes in them
50 tag_name_chars = alnum + "-"
51
52 # http://www.w3.org/TR/html5/infrastructure.html#space-character
53 space_chars = "\u0009\u000a\u000c\u000d\u0020"
54
55 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
56 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
57
58 # These are the character references that don't need a terminating semicolon
59 # min length: 2, max: 6, none are a prefix of any other.
60 legacy_char_refs = {
61         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
62         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
63         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
64         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
65         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
66         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
67         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
68         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
69         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
70         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
71         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
72         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
73         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
74         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
75         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
76         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
77         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
78         yen: '¥', yuml: 'ÿ'
79 }
80
81 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
82 raw_text_elements = ['script', 'style']
83 escapable_raw_text_elements = ['textarea', 'title']
84 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
85 svg_elements = [
86         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
87         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
88         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
89         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
90         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
91         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
92         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
93         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
94         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
95         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
96         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
97         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
98         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
99         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
100         'view', 'vkern'
101 ]
102
103 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
104 mathml_elements = [
105         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
106         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
107         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
108         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
109         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
110         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
111         'determinant', 'diff', 'divergence', 'divide', 'domain',
112         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
113         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
114         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
115         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
116         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
117         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
118         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
119         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
120         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
121         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
122         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
123         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
124         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
125         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
126         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
127         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
128         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
129         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
130         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
131         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
132         'vectorproduct', 'xor'
133 ]
134 # foreign_elements = [svg_elements..., mathml_elements...]
135 #normal_elements = All other allowed HTML elements are normal elements.
136
137 special_elements = {
138         # from HTML:
139         address: true, applet: true, area: true, article: true, aside: true,
140         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
141         br: true, button: true, caption: true, center: true, col: true,
142         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
143         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
144         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
145         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
146         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
147         isindex: true, li: true, link: true, listing: true, main: true,
148         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
149         noscript: true, object: true, ol: true, p: true, param: true,
150         plaintext: true, pre: true, script: true, section: true, select: true,
151         source: true, style: true, summary: true, table: true, tbody: true,
152         td: true, template: true, textarea: true, tfoot: true, th: true,
153         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
154         xmp: true,
155
156         # from MathML:
157         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
158
159         # from SVG:
160         foreignObject: true, desc: true, title: true
161 }
162
163 formatting_elements = {
164          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
165          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
166          u: true
167 }
168
169
170 # decode_named_char_ref()
171 #
172 # The list of named character references is _huge_ so ask the browser to decode
173 # for us instead of wasting bandwidth/space on including the table here.
174 #
175 # Pass without the "&" but with the ";" examples:
176 #    for "&amp" pass "amp;"
177 #    for "&#x2032" pass "x2032;"
178 g_dncr = {
179         cache: {}
180         textarea: document.createElement('textarea')
181 }
182 # TODO test this in IE8
183 decode_named_char_ref = (txt) ->
184         txt = "&#{txt}"
185         decoded = g_dncr.cache[txt]
186         return decoded if decoded?
187         g_dncr.textarea.innerHTML = txt
188         decoded = g_dncr.textarea.value
189         return null if decoded is txt
190         return g_dncr.cache[txt] = decoded
191
192 parse_html = (txt) ->
193         cur = 0 # index of next char in txt to be parsed
194         # declare tree and tokenizer variables so they're in scope below
195         tree = null
196         open_tags = [] # stack of open elements
197         tree_state = null
198         tok_state = null
199         tok_cur_tag = null # partially parsed tag
200         flag_frameset_ok = null
201         flag_parsing = null
202
203         parse_error = ->
204                 console.log "Parse error at character #{cur} of #{txt.length}"
205
206
207         # the functions below impliment the Tree Contstruction algorithm
208         # http://www.w3.org/TR/html5/syntax.html#tree-construction
209
210         # But first... the helpers
211         template_tag_is_open = ->
212                 for t of open_tags
213                         if t[0] is TYPE_TAG and t[1] is 'template'
214                                 return true
215                 return false
216         is_in_scope = (tag_name) ->
217                 for t of open_tags
218                         if t[0] is TYPE_TAG and t[1] is tag_name
219                                 return true
220                         # FIXME bail if in scopers
221                 return false
222
223         reconstruct_active_formatting_elements = ->
224                 # FIXME implement this
225
226         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
227         # FIXME implement this
228         close_p_if_in_button_scope = ->
229                 if open_tags[0][1] is 'p' # FIXME
230                         open_tags.pop()
231                 return
232                 #p = find_button_scope 'p'
233                 #if p?
234                         # TODO generate_implied_end_tags except for p tags
235                         # TODO parse_error unless open_tags[0][1] is 'p'
236                         # TODO pop stack until 'p' popped
237
238
239
240         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
241         tree_insert_a_character = (t) ->
242                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
243                 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
244                         open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
245                 else
246                         open_tags[0][3].push t
247
248         # FIXME read spec, do this right
249         # note: this assumes it's an open tag
250         tree_insert_tag = (t) ->
251                 t[0] = TYPE_TAG # not TYPE_OPEN_TAG
252                 # convert attributes into a hash
253                 attrs = {}
254                 while t[2].length
255                         a = t[2].pop()
256                         attrs[a[0]] = a[1]
257                 t[2] = attrs
258                 open_tags[0][3].push t
259                 open_tags.unshift t
260
261         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
262         tree_insert_a_comment = (t) ->
263                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
264                 open_tags[0][3].push t
265
266         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
267         tree_in_body = (t) ->
268                 switch t[0]
269                         when TYPE_TEXT
270                                 switch t[1]
271                                         when "\u0000"
272                                                 parse_error()
273                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
274                                                 reconstruct_active_formatting_elements()
275                                                 tree_insert_a_character t
276                                         else
277                                                 reconstruct_active_formatting_elements()
278                                                 tree_insert_a_character t
279                                                 flag_frameset_ok = false
280                         when TYPE_COMMENT
281                                 tree_insert_a_comment t
282                         when TYPE_DOCTYPE
283                                 parse_error()
284                         when TYPE_OPEN_TAG
285                                 switch t[1]
286                                         when 'html'
287                                                 parse_error()
288                                                 return if template_tag_is_open()
289                                                 root_attrs = open_tags[open_tags.length - 1][3]
290                                                 for k, v of t[2]
291                                                         root_attrs[k] = v unless root_attrs[k]?
292                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
293                                                 # FIXME also do this for </template> (end tag)
294                                                 return tree_in_head t
295                                         when 'body'
296                                                 parse_error()
297                                                 # TODO
298                                         when 'frameset'
299                                                 parse_error()
300                                                 # TODO
301                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
302                                                 close_p_if_in_button_scope()
303                                                 tree_insert_tag t
304                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
305                                                 close_p_if_in_button_scope()
306                                                 if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
307                                                         parse_error()
308                                                         open_tags.shift()
309                                                 tree_insert_tag t
310                                         # TODO lots more to implement here
311                                         else # any other start tag
312                                                 reconstruct_active_formatting_elements()
313                                                 tree_insert_tag t
314                         when TYPE_EOF
315                                 ok_tags = {
316                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
317                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
318                                 }
319                                 for t in open_tags
320                                         unless ok_tags[t[1]]?
321                                                 parse_error()
322                                                 break
323                                 # TODO stack of template insertion modes thing
324                                 flag_parsing = false # stop parsing
325                         when TYPE_END_TAG
326                                 switch t[1]
327                                         when 'body'
328                                                 unless is_in_scope 'body'
329                                                         parse_error()
330                                                         return
331                                                 # TODO implement parse error and move to tree_after_body
332                                         when 'html'
333                                                 unless is_in_scope 'body' # weird, but it's what the spec says
334                                                         parse_error()
335                                                         return
336                                                 # TODO implement parse error and move to tree_after_body, reprocess
337                                         # TODO lots more close tags to implement here
338                                         else
339                                                 for node, i in open_tags
340                                                         if node[1] is t[1]
341                                                                 # FIXME generate implied end tags except those with name==t[1]
342                                                                 parse_error() unless i is 0
343                                                                 while i > 0
344                                                                         open_tags.shift()
345                                                                         i -= 1
346                                                                 open_tags.shift()
347                                                                 return
348                                                         if special_elements[node[1]]?
349                                                                 parse_error()
350                                                                 return
351
352
353         # the functions below implement the tokenizer stats described here:
354         # http://www.w3.org/TR/html5/syntax.html#tokenization
355
356         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
357         tok_state_data = ->
358                 switch c = txt.charAt(cur++)
359                         when '&'
360                                 return [TYPE_TEXT, tokenize_character_reference()]
361                         when '<'
362                                 tok_state = tok_state_tag_open
363                         when "\u0000"
364                                 parse_error()
365                                 return [TYPE_TEXT, c]
366                         when '' # EOF
367                                 return [TYPE_EOF]
368                         else
369                                 return [TYPE_TEXT, c]
370                 return null
371
372         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
373         # not needed: tok_state_character_reference_in_data = ->
374         # just call tok_state_character_reference_in_data()
375
376         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
377         tok_state_tag_open = ->
378                 switch c = txt.charAt(cur++)
379                         when '!'
380                                 tok_state = tok_state_markup_declaration_open
381                         when '/'
382                                 tok_state = tok_state_end_tag_open
383                         when '?'
384                                 parse_error()
385                                 tok_state = tok_state_bogus_comment
386                         else
387                                 if lc_alpha.indexOf(c) > -1
388                                         tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
389                                         tok_state = tok_state_tag_name
390                                 else if uc_alpha.indexOf(c) > -1
391                                         tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
392                                         tok_state = tok_state_tag_name
393                                 else
394                                         parse_error()
395                                         tok_state = tok_state_data
396                                         cur -= 1 # we didn't parse/handle the char after <
397                                         return [TYPE_TEXT, '<']
398                 return null
399
400         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
401         tok_state_end_tag_open = ->
402                 switch c = txt.charAt(cur++)
403                         when '>'
404                                 parse_error()
405                                 tok_state = tok_state_data
406                         when '' # EOF
407                                 parse_error()
408                                 tok_state = tok_state_data
409                                 return [TYPE_TEXT, '</']
410                         else
411                                 if uc_alpha.indexOf(c) > -1
412                                         tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
413                                         tok_state = tok_state_tag_name
414                                 else if lc_alpha.indexOf(c) > -1
415                                         tok_cur_tag = [TYPE_END_TAG, c, [], []]
416                                         tok_state = tok_state_tag_name
417                                 else
418                                         parse_error()
419                                         tok_state = tok_state_bogus_comment
420                 return null
421
422         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
423         tok_state_tag_name = ->
424                 switch c = txt.charAt(cur++)
425                         when "\t", "\n", "\u000c", ' '
426                                 tok_state = tok_state_before_attribute_name
427                         when '/'
428                                 tok_state = tok_state_self_closing_start_tag
429                         when '>'
430                                 tok_state = tok_state_data
431                                 tmp = tok_cur_tag
432                                 tok_cur_tag = null
433                                 return tmp
434                         when "\u0000"
435                                 parse_error()
436                                 tok_cur_tag[1] += "\ufffd"
437                         when '' # EOF
438                                 parse_error()
439                                 tok_state = tok_state_data
440                         else
441                                 if uc_alpha.indexOf(c) > -1
442                                         tok_cur_tag[1] += c.toLowerCase()
443                                 else
444                                         tok_cur_tag[1] += c
445                 return null
446
447         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
448         tok_state_before_attribute_name = ->
449                 attr_name = null
450                 switch c = txt.charAt(cur++)
451                         when "\t", "\n", "\u000c", ' '
452                                 return null
453                         when '/'
454                                 tok_state = tok_state_self_closing_start_tag
455                                 return null
456                         when '>'
457                                 tok_state = tok_state_data
458                                 tmp = tok_cur_tag
459                                 tok_cur_tag = null
460                                 return tmp
461                         when "\u0000"
462                                 parse_error()
463                                 attr_name = "\ufffd"
464                         when '"', "'", '<', '='
465                                 parse_error()
466                                 attr_name = c
467                         when '' # EOF
468                                 parse_error()
469                                 tok_state = tok_state_data
470                         else
471                                 if uc_alpha.indexOf(c) > -1
472                                         attr_name = c.toLowerCase()
473                                 else
474                                         attr_name = c
475                 if attr_name?
476                         tok_cur_tag[2].unshift [attr_name, '']
477                         tok_state = tok_state_attribute_name
478                 return null
479
480         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
481         tok_state_attribute_name = ->
482                 switch c = txt.charAt(cur++)
483                         when "\t", "\n", "\u000c", ' '
484                                 tok_state = tok_state_after_attribute_name
485                         when '/'
486                                 tok_state = tok_state_self_closing_start_tag
487                         when '='
488                                 tok_state = tok_state_before_attribute_value
489                         when '>'
490                                 tok_state = tok_state_data
491                                 tmp = tok_cur_tag
492                                 tok_cur_tag = null
493                                 return tmp
494                         when "\u0000"
495                                 parse_error()
496                                 tok_cur_tag[2][0][0] = "\ufffd"
497                         when '"', "'", '<'
498                                 parse_error()
499                                 tok_cur_tag[2][0][0] = c
500                         when '' # EOF
501                                 parse_error()
502                                 tok_state = tok_state_data
503                         else
504                                 if uc_alpha.indexOf(c) > -1
505                                         tok_cur_tag[2][0][0] = c.toLowerCase()
506                                 else
507                                         tok_cur_tag[2][0][0] += c
508                 return null
509
510         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
511         tok_state_before_attribute_value = ->
512                 switch c = txt.charAt(cur++)
513                         when "\t", "\n", "\u000c", ' '
514                                 return null
515                         when '"'
516                                 tok_state = tok_state_attribute_value_double_quoted
517                         when '&'
518                                 tok_state = tok_state_attribute_value_unquoted
519                                 cur -= 1
520                         when "'"
521                                 tok_state = tok_state_attribute_value_single_quoted
522                         when "\u0000"
523                                 # Parse error
524                                 tok_cur_tag[2][0][1] += "\ufffd"
525                                 tok_state = tok_state_attribute_value_unquoted
526                         when '>'
527                                 # Parse error
528                                 tok_state = tok_state_data
529                                 tmp = tok_cur_tag
530                                 tok_cur_tag = null
531                                 return tmp
532                         when '' # EOF
533                                 parse_error()
534                                 tok_state = tok_state_data
535                         else
536                                 tok_cur_tag[2][0][1] += c
537                                 tok_state = tok_state_attribute_value_unquoted
538                 return null
539
540         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
541         tok_state_attribute_value_double_quoted = ->
542                 switch c = txt.charAt(cur++)
543                         when '"'
544                                 tok_state = tok_state_after_attribute_value_quoted
545                         when '&'
546                                 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
547                         when "\u0000"
548                                 # Parse error
549                                 tok_cur_tag[2][0][1] += "\ufffd"
550                         when '' # EOF
551                                 parse_error()
552                                 tok_state = tok_state_data
553                         else
554                                 tok_cur_tag[2][0][1] += c
555                 return null
556
557         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
558         tok_state_attribute_value_single_quoted = ->
559                 switch c = txt.charAt(cur++)
560                         when "'"
561                                 tok_state = tok_state_after_attribute_value_quoted
562                         when '&'
563                                 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
564                         when "\u0000"
565                                 # Parse error
566                                 tok_cur_tag[2][0][1] += "\ufffd"
567                         when '' # EOF
568                                 parse_error()
569                                 tok_state = tok_state_data
570                         else
571                                 tok_cur_tag[2][0][1] += c
572                 return null
573
574         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
575         tok_state_attribute_value_unquoted = ->
576                 switch c = txt.charAt(cur++)
577                         when "\t", "\n", "\u000c", ' '
578                                 tok_state = tok_state_before_attribute_name
579                         when '&'
580                                 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
581                         when '>'
582                                 tok_state = tok_state_data
583                                 tmp = tok_cur_tag
584                                 tok_cur_tag = null
585                                 return tmp
586                         when "\u0000"
587                                 tok_cur_tag[2][0][1] += "\ufffd"
588                         when '' # EOF
589                                 parse_error()
590                                 tok_state = tok_state_data
591                         else
592                                 # Parse Error if ', <, = or ` (backtick)
593                                 tok_cur_tag[2][0][1] += c
594                 return null
595
596         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
597         tok_state_after_attribute_value_quoted = ->
598                 switch c = txt.charAt(cur++)
599                         when "\t", "\n", "\u000c", ' '
600                                 tok_state = tok_state_before_attribute_name
601                         when '/'
602                                 tok_state = tok_state_self_closing_start_tag
603                         when '>'
604                                 tok_state = tok_state_data
605                                 tmp = tok_cur_tag
606                                 tok_cur_tag = null
607                                 return tmp
608                         when '' # EOF
609                                 parse_error()
610                                 tok_state = tok_state_data
611                         else
612                                 # Parse Error
613                                 tok_state = tok_state_before_attribute_name
614                                 cur -= 1 # we didn't handle that char
615                 return null
616
617         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
618         # Don't set this as a state, just call it
619         # returns a string (NOT a text node)
620         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
621                 if cur >= txt.length
622                         return '&'
623                 switch c = txt.charAt(cur)
624                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
625                                 # explicitly not a parse error
626                                 return '&'
627                         when ';'
628                                 # there has to be "one or more" alnums between & and ; to be a parse error
629                                 return '&'
630                         when '#'
631                                 if cur + 1 >= txt.length
632                                         return '&'
633                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
634                                         prefix = '#x'
635                                         charset = hex_chars
636                                         start = cur + 2
637                                 else
638                                         charset = digits
639                                         start = cur + 1
640                                         prefix = '#'
641                                 i = 0
642                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
643                                         i += 1
644                                 if i is 0
645                                         return '&'
646                                 if txt.charAt(start + i) is ';'
647                                         i += 1
648                                 # FIXME This is supposed to generate parse errors for some chars
649                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
650                                 if decoded?
651                                         cur = start + i
652                                         return decoded
653                                 return '&'
654                         else
655                                 for i in [0...31]
656                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
657                                                 break
658                                 if i is 0
659                                         # exit early, because parse_error() below needs at least one alnum
660                                         return '&'
661                                 if txt.charAt(cur + i) is ';'
662                                         i += 1 # include ';' terminator in value
663                                         decoded = decode_named_char_ref txt.substr(cur, i)
664                                         if decoded?
665                                                 cur += i
666                                                 return decoded
667                                         parse_error()
668                                         return '&'
669                                 else
670                                         # no ';' terminator (only legacy char refs)
671                                         max = i
672                                         for i in [2..max] # no prefix matches, so ok to check shortest first
673                                                 c = legacy_char_refs[txt.substr(cur, i)]
674                                                 if c?
675                                                         if in_attr
676                                                                 if txt.charAt(cur + i) is '='
677                                                                         # "because some legacy user agents will
678                                                                         # misinterpret the markup in those cases"
679                                                                         parse_error()
680                                                                         return '&'
681                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
682                                                                         # this makes attributes forgiving about url args
683                                                                         return '&'
684                                                         # ok, and besides the weird exceptions for attributes...
685                                                         # return the matching char
686                                                         cur += i # consume entity chars
687                                                         parse_error() # because no terminating ";"
688                                                         return c
689                                         parse_error()
690                                         return '&'
691                 return # never reached
692
693         # tree constructor initialization
694         # see comments on TYPE_TAG/etc for the structure of this data
695         tree = [TYPE_TAG, 'html', {}, []]
696         open_tags = [tree]
697         tree_state = tree_in_body
698         flag_frameset_ok = true
699         flag_parsing = true
700
701         # tokenizer initialization
702         tok_state = tok_state_data
703
704         # proccess input
705         while flag_parsing
706                 t = tok_state()
707                 if t?
708                         tree_state t
709         return tree[3]
710
711 # everything below is tests on the above
712 test_equals = (description, fn, args..., expected_output) ->
713         output = fn.apply this, args
714         if output is expected_output
715                 console.log "passed: #{description}."
716         else
717                 console.log "FAILED: #{description}..."
718                 console.log "   Expected: #{expected_output}"
719                 console.log "     Actual: #{output}"
720 html_to_json = (html) ->
721         return JSON.stringify parse_html html
722 test_equals "empty", html_to_json, "", '[]'
723 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
724 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
725 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
726 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
727 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
728 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
729 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
730 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
731 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
732 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
733 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
734 test_equals "mis-matched closing tags", html_to_json, "foo<div>bar<span>baz</div>qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'