JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
use Node class for data structures, improve is_in_scope
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
22 #
23 # Instead, the data structure produced by this parser is an array of nodes.
24 #
25 # Each node is an obect of the Node class. Here are the Node types:
26 TYPE_TAG = 0 # name, {attributes}, [children]
27 TYPE_TEXT = 1 # "text"
28 TYPE_COMMENT = 2
29 TYPE_DOCTYPE = 3
30 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
32 TYPE_END_TAG = 5 # name
33 TYPE_EOF = 6
34
35 class Node
36         constructor: (type, args = {}) ->
37                 @type = type # one of the TYPE_* constants above
38                 @name = args.name ? '' # tag name
39                 @text = args.text ? '' # contents for text/comment nodes
40                 @attrs = args.attrs ? {}
41                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
42                 @children = args.children ? []
43         serialize: -> # for unit tests
44                 ret = ''
45                 switch @type
46                         when TYPE_TAG
47                                 ret += 'tag:'
48                                 ret += JSON.stringify @name
49                                 ret += ','
50                                 ret += JSON.stringify @attrs
51                                 ret += ','
52                                 sep = '['
53                                 for c in @children
54                                         ret += sep
55                                         sep = ','
56                                         ret += c.serialize()
57                                 ret += ']'
58                         when TYPE_TEXT
59                                 ret += 'text:'
60                                 ret += JSON.stringify @text
61                         when TYPE_COMMENT
62                                 ret += 'comment:'
63                                 ret += JSON.stringify @text
64                         when TYPE_DOCTYPE
65                                 ret += 'doctype'
66                                 # FIXME
67                         else
68                                 ret += 'unknown:'
69                 return ret
70
71
72 # helpers: (only take args that are normally known when parser creates nodes)
73 new_open_tag = (name) ->
74         return new Node TYPE_OPEN_TAG, name: name
75 new_end_tag = (name) ->
76         return new Node TYPE_END_TAG, name: name
77 new_text_node = (txt) ->
78         return new Node TYPE_TEXT, text: txt
79 new_comment_node = (txt) ->
80         return new Node TYPE_COMMENT, text: txt
81 new_eof_token = ->
82         return new Node TYPE_EOF
83
84 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
85 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
86 digits = "0123456789"
87 alnum = lc_alpha + uc_alpha + digits
88 hex_chars = digits + "abcdefABCDEF"
89 scopers = { # FIXME these are supposed to be namespace specific
90         'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
91         'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
92         'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
93         'foreignObject': true, 'desc': true, 'title'
94 }
95
96 # some SVG elements have dashes in them
97 tag_name_chars = alnum + "-"
98
99 # http://www.w3.org/TR/html5/infrastructure.html#space-character
100 space_chars = "\u0009\u000a\u000c\u000d\u0020"
101
102 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
103 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
104
105 # These are the character references that don't need a terminating semicolon
106 # min length: 2, max: 6, none are a prefix of any other.
107 legacy_char_refs = {
108         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
109         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
110         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
111         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
112         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
113         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
114         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
115         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
116         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
117         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
118         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
119         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
120         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
121         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
122         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
123         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
124         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
125         yen: '¥', yuml: 'ÿ'
126 }
127
128 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
129 raw_text_elements = ['script', 'style']
130 escapable_raw_text_elements = ['textarea', 'title']
131 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
132 svg_elements = [
133         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
134         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
135         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
136         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
137         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
138         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
139         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
140         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
141         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
142         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
143         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
144         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
145         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
146         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
147         'view', 'vkern'
148 ]
149
150 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
151 mathml_elements = [
152         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
153         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
154         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
155         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
156         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
157         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
158         'determinant', 'diff', 'divergence', 'divide', 'domain',
159         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
160         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
161         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
162         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
163         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
164         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
165         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
166         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
167         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
168         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
169         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
170         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
171         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
172         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
173         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
174         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
175         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
176         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
177         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
178         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
179         'vectorproduct', 'xor'
180 ]
181 # foreign_elements = [svg_elements..., mathml_elements...]
182 #normal_elements = All other allowed HTML elements are normal elements.
183
184 special_elements = {
185         # from HTML:
186         address: true, applet: true, area: true, article: true, aside: true,
187         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
188         br: true, button: true, caption: true, center: true, col: true,
189         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
190         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
191         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
192         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
193         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
194         isindex: true, li: true, link: true, listing: true, main: true,
195         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
196         noscript: true, object: true, ol: true, p: true, param: true,
197         plaintext: true, pre: true, script: true, section: true, select: true,
198         source: true, style: true, summary: true, table: true, tbody: true,
199         td: true, template: true, textarea: true, tfoot: true, th: true,
200         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
201         xmp: true,
202
203         # from MathML:
204         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
205
206         # from SVG:
207         foreignObject: true, desc: true, title: true
208 }
209
210 formatting_elements = {
211          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
212          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
213          u: true
214 }
215
216
217 # decode_named_char_ref()
218 #
219 # The list of named character references is _huge_ so ask the browser to decode
220 # for us instead of wasting bandwidth/space on including the table here.
221 #
222 # Pass without the "&" but with the ";" examples:
223 #    for "&amp" pass "amp;"
224 #    for "&#x2032" pass "x2032;"
225 g_dncr = {
226         cache: {}
227         textarea: document.createElement('textarea')
228 }
229 # TODO test this in IE8
230 decode_named_char_ref = (txt) ->
231         txt = "&#{txt}"
232         decoded = g_dncr.cache[txt]
233         return decoded if decoded?
234         g_dncr.textarea.innerHTML = txt
235         decoded = g_dncr.textarea.value
236         return null if decoded is txt
237         return g_dncr.cache[txt] = decoded
238
239 parse_html = (txt, parse_error_cb = null) ->
240         cur = 0 # index of next char in txt to be parsed
241         # declare tree and tokenizer variables so they're in scope below
242         tree = null
243         open_tags = [] # stack of open elements
244         tree_state = null
245         tok_state = null
246         tok_cur_tag = null # partially parsed tag
247         flag_frameset_ok = null
248         flag_parsing = null
249
250         parse_error = ->
251                 if parse_error_cb?
252                         parse_error_cb cur
253                 else
254                         console.log "Parse error at character #{cur} of #{txt.length}"
255
256
257         # the functions below impliment the Tree Contstruction algorithm
258         # http://www.w3.org/TR/html5/syntax.html#tree-construction
259
260         # But first... the helpers
261         template_tag_is_open = ->
262                 for t of open_tags
263                         if t.type is TYPE_TAG and t.name is 'template'
264                                 return true
265                 return false
266         is_in_scope = (tag_name) ->
267                 for t of open_tags
268                         if t.name is tag_name
269                                 return true
270                         if t.name of scopers
271                                 return false
272                 return false
273
274         reconstruct_active_formatting_elements = ->
275                 # FIXME implement this
276
277         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
278         # FIXME implement this
279         close_p_if_in_button_scope = ->
280                 if open_tags[0].name is 'p'
281                         open_tags.pop()
282                 return
283                 #p = find_button_scope 'p'
284                 #if p?
285                         # TODO generate_implied_end_tags except for p tags
286                         # TODO parse_error unless open_tags[0].name is 'p'
287                         # TODO pop stack until 'p' popped
288
289
290
291         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
292         tree_insert_a_character = (t) ->
293                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
294                 dest = open_tags[0].children
295                 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
296                         dest[dest.length - 1].text += t.text
297                 else
298                         dest.push t
299
300         # FIXME read spec, do this right
301         # note: this assumes it's an open tag
302         tree_insert_tag = (t) ->
303                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
304                 # convert attributes into a hash
305                 while t.attrs_a.length
306                         a = t.attrs_a.pop()
307                         t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
308                 open_tags[0].children.push t
309                 open_tags.unshift t
310
311         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
312         tree_insert_a_comment = (t) ->
313                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
314                 open_tags[0].children.push t
315
316         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
317         tree_in_body = (t) ->
318                 switch t.type
319                         when TYPE_TEXT
320                                 switch t.text
321                                         when "\u0000"
322                                                 parse_error()
323                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
324                                                 reconstruct_active_formatting_elements()
325                                                 tree_insert_a_character t
326                                         else
327                                                 reconstruct_active_formatting_elements()
328                                                 tree_insert_a_character t
329                                                 flag_frameset_ok = false
330                         when TYPE_COMMENT
331                                 tree_insert_a_comment t
332                         when TYPE_DOCTYPE
333                                 parse_error()
334                         when TYPE_OPEN_TAG
335                                 switch t.name
336                                         when 'html'
337                                                 parse_error()
338                                                 return if template_tag_is_open()
339                                                 root_attrs = open_tags[open_tags.length - 1].children
340                                                 for k, v of t.attrs
341                                                         root_attrs[k] = v unless root_attrs[k]?
342                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
343                                                 # FIXME also do this for </template> (end tag)
344                                                 return tree_in_head t
345                                         when 'body'
346                                                 parse_error()
347                                                 # TODO
348                                         when 'frameset'
349                                                 parse_error()
350                                                 # TODO
351                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
352                                                 close_p_if_in_button_scope()
353                                                 tree_insert_tag t
354                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
355                                                 close_p_if_in_button_scope()
356                                                 if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
357                                                         parse_error()
358                                                         open_tags.shift()
359                                                 tree_insert_tag t
360                                         # TODO lots more to implement here
361                                         else # any other start tag
362                                                 reconstruct_active_formatting_elements()
363                                                 tree_insert_tag t
364                         when TYPE_EOF
365                                 ok_tags = {
366                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
367                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
368                                 }
369                                 for t in open_tags
370                                         unless ok_tags[t.name]?
371                                                 parse_error()
372                                                 break
373                                 # TODO stack of template insertion modes thing
374                                 flag_parsing = false # stop parsing
375                         when TYPE_END_TAG
376                                 switch t.name
377                                         when 'body'
378                                                 unless is_in_scope 'body'
379                                                         parse_error()
380                                                         return
381                                                 # TODO implement parse error and move to tree_after_body
382                                         when 'html'
383                                                 unless is_in_scope 'body' # weird, but it's what the spec says
384                                                         parse_error()
385                                                         return
386                                                 # TODO implement parse error and move to tree_after_body, reprocess
387                                         # TODO lots more close tags to implement here
388                                         else
389                                                 for node, i in open_tags
390                                                         if node.name is t.name
391                                                                 # FIXME generate implied end tags except those with name==t.name
392                                                                 parse_error() unless i is 0
393                                                                 while i > 0
394                                                                         open_tags.shift()
395                                                                         i -= 1
396                                                                 open_tags.shift()
397                                                                 return
398                                                         if special_elements[node.name]?
399                                                                 parse_error()
400                                                                 return
401
402
403         # the functions below implement the tokenizer stats described here:
404         # http://www.w3.org/TR/html5/syntax.html#tokenization
405
406         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
407         tok_state_data = ->
408                 switch c = txt.charAt(cur++)
409                         when '&'
410                                 return new_text_node tokenize_character_reference()
411                         when '<'
412                                 tok_state = tok_state_tag_open
413                         when "\u0000"
414                                 parse_error()
415                                 return new_text_node c
416                         when '' # EOF
417                                 return new_eof_token()
418                         else
419                                 return new_text_node c
420                 return null
421
422         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
423         # not needed: tok_state_character_reference_in_data = ->
424         # just call tok_state_character_reference_in_data()
425
426         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
427         tok_state_tag_open = ->
428                 switch c = txt.charAt(cur++)
429                         when '!'
430                                 tok_state = tok_state_markup_declaration_open
431                         when '/'
432                                 tok_state = tok_state_end_tag_open
433                         when '?'
434                                 parse_error()
435                                 tok_state = tok_state_bogus_comment
436                         else
437                                 if lc_alpha.indexOf(c) > -1
438                                         tok_cur_tag = new_open_tag c
439                                         tok_state = tok_state_tag_name
440                                 else if uc_alpha.indexOf(c) > -1
441                                         tok_cur_tag = new_open_tag c.toLowerCase()
442                                         tok_state = tok_state_tag_name
443                                 else
444                                         parse_error()
445                                         tok_state = tok_state_data
446                                         cur -= 1 # we didn't parse/handle the char after <
447                                         return new_text_node '<'
448                 return null
449
450         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
451         tok_state_end_tag_open = ->
452                 switch c = txt.charAt(cur++)
453                         when '>'
454                                 parse_error()
455                                 tok_state = tok_state_data
456                         when '' # EOF
457                                 parse_error()
458                                 tok_state = tok_state_data
459                                 return new_text_node '</'
460                         else
461                                 if uc_alpha.indexOf(c) > -1
462                                         tok_cur_tag = new_end_tag c.toLowerCase()
463                                         tok_state = tok_state_tag_name
464                                 else if lc_alpha.indexOf(c) > -1
465                                         tok_cur_tag = new_end_tag c
466                                         tok_state = tok_state_tag_name
467                                 else
468                                         parse_error()
469                                         tok_state = tok_state_bogus_comment
470                 return null
471
472         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
473         tok_state_tag_name = ->
474                 switch c = txt.charAt(cur++)
475                         when "\t", "\n", "\u000c", ' '
476                                 tok_state = tok_state_before_attribute_name
477                         when '/'
478                                 tok_state = tok_state_self_closing_start_tag
479                         when '>'
480                                 tok_state = tok_state_data
481                                 tmp = tok_cur_tag
482                                 tok_cur_tag = null
483                                 return tmp
484                         when "\u0000"
485                                 parse_error()
486                                 tok_cur_tag.name += "\ufffd"
487                         when '' # EOF
488                                 parse_error()
489                                 tok_state = tok_state_data
490                         else
491                                 if uc_alpha.indexOf(c) > -1
492                                         tok_cur_tag.name += c.toLowerCase()
493                                 else
494                                         tok_cur_tag.name += c
495                 return null
496
497         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
498         tok_state_before_attribute_name = ->
499                 attr_name = null
500                 switch c = txt.charAt(cur++)
501                         when "\t", "\n", "\u000c", ' '
502                                 return null
503                         when '/'
504                                 tok_state = tok_state_self_closing_start_tag
505                                 return null
506                         when '>'
507                                 tok_state = tok_state_data
508                                 tmp = tok_cur_tag
509                                 tok_cur_tag = null
510                                 return tmp
511                         when "\u0000"
512                                 parse_error()
513                                 attr_name = "\ufffd"
514                         when '"', "'", '<', '='
515                                 parse_error()
516                                 attr_name = c
517                         when '' # EOF
518                                 parse_error()
519                                 tok_state = tok_state_data
520                         else
521                                 if uc_alpha.indexOf(c) > -1
522                                         attr_name = c.toLowerCase()
523                                 else
524                                         attr_name = c
525                 if attr_name?
526                         tok_cur_tag.attrs_a.unshift [attr_name, '']
527                         tok_state = tok_state_attribute_name
528                 return null
529
530         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
531         tok_state_attribute_name = ->
532                 switch c = txt.charAt(cur++)
533                         when "\t", "\n", "\u000c", ' '
534                                 tok_state = tok_state_after_attribute_name
535                         when '/'
536                                 tok_state = tok_state_self_closing_start_tag
537                         when '='
538                                 tok_state = tok_state_before_attribute_value
539                         when '>'
540                                 tok_state = tok_state_data
541                                 tmp = tok_cur_tag
542                                 tok_cur_tag = null
543                                 return tmp
544                         when "\u0000"
545                                 parse_error()
546                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
547                         when '"', "'", '<'
548                                 parse_error()
549                                 tok_cur_tag.attrs_a[0][0] = c
550                         when '' # EOF
551                                 parse_error()
552                                 tok_state = tok_state_data
553                         else
554                                 if uc_alpha.indexOf(c) > -1
555                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
556                                 else
557                                         tok_cur_tag.attrs_a[0][0] += c
558                 return null
559
560         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
561         tok_state_before_attribute_value = ->
562                 switch c = txt.charAt(cur++)
563                         when "\t", "\n", "\u000c", ' '
564                                 return null
565                         when '"'
566                                 tok_state = tok_state_attribute_value_double_quoted
567                         when '&'
568                                 tok_state = tok_state_attribute_value_unquoted
569                                 cur -= 1
570                         when "'"
571                                 tok_state = tok_state_attribute_value_single_quoted
572                         when "\u0000"
573                                 # Parse error
574                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
575                                 tok_state = tok_state_attribute_value_unquoted
576                         when '>'
577                                 # Parse error
578                                 tok_state = tok_state_data
579                                 tmp = tok_cur_tag
580                                 tok_cur_tag = null
581                                 return tmp
582                         when '' # EOF
583                                 parse_error()
584                                 tok_state = tok_state_data
585                         else
586                                 tok_cur_tag.attrs_a[0][1] += c
587                                 tok_state = tok_state_attribute_value_unquoted
588                 return null
589
590         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
591         tok_state_attribute_value_double_quoted = ->
592                 switch c = txt.charAt(cur++)
593                         when '"'
594                                 tok_state = tok_state_after_attribute_value_quoted
595                         when '&'
596                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
597                         when "\u0000"
598                                 # Parse error
599                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
600                         when '' # EOF
601                                 parse_error()
602                                 tok_state = tok_state_data
603                         else
604                                 tok_cur_tag.attrs_a[0][1] += c
605                 return null
606
607         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
608         tok_state_attribute_value_single_quoted = ->
609                 switch c = txt.charAt(cur++)
610                         when "'"
611                                 tok_state = tok_state_after_attribute_value_quoted
612                         when '&'
613                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
614                         when "\u0000"
615                                 # Parse error
616                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
617                         when '' # EOF
618                                 parse_error()
619                                 tok_state = tok_state_data
620                         else
621                                 tok_cur_tag.attrs_a[0][1] += c
622                 return null
623
624         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
625         tok_state_attribute_value_unquoted = ->
626                 switch c = txt.charAt(cur++)
627                         when "\t", "\n", "\u000c", ' '
628                                 tok_state = tok_state_before_attribute_name
629                         when '&'
630                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
631                         when '>'
632                                 tok_state = tok_state_data
633                                 tmp = tok_cur_tag
634                                 tok_cur_tag = null
635                                 return tmp
636                         when "\u0000"
637                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
638                         when '' # EOF
639                                 parse_error()
640                                 tok_state = tok_state_data
641                         else
642                                 # Parse Error if ', <, = or ` (backtick)
643                                 tok_cur_tag.attrs_a[0][1] += c
644                 return null
645
646         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
647         tok_state_after_attribute_value_quoted = ->
648                 switch c = txt.charAt(cur++)
649                         when "\t", "\n", "\u000c", ' '
650                                 tok_state = tok_state_before_attribute_name
651                         when '/'
652                                 tok_state = tok_state_self_closing_start_tag
653                         when '>'
654                                 tok_state = tok_state_data
655                                 tmp = tok_cur_tag
656                                 tok_cur_tag = null
657                                 return tmp
658                         when '' # EOF
659                                 parse_error()
660                                 tok_state = tok_state_data
661                         else
662                                 # Parse Error
663                                 tok_state = tok_state_before_attribute_name
664                                 cur -= 1 # we didn't handle that char
665                 return null
666
667         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
668         # Don't set this as a state, just call it
669         # returns a string (NOT a text node)
670         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
671                 if cur >= txt.length
672                         return '&'
673                 switch c = txt.charAt(cur)
674                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
675                                 # explicitly not a parse error
676                                 return '&'
677                         when ';'
678                                 # there has to be "one or more" alnums between & and ; to be a parse error
679                                 return '&'
680                         when '#'
681                                 if cur + 1 >= txt.length
682                                         return '&'
683                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
684                                         prefix = '#x'
685                                         charset = hex_chars
686                                         start = cur + 2
687                                 else
688                                         charset = digits
689                                         start = cur + 1
690                                         prefix = '#'
691                                 i = 0
692                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
693                                         i += 1
694                                 if i is 0
695                                         return '&'
696                                 if txt.charAt(start + i) is ';'
697                                         i += 1
698                                 # FIXME This is supposed to generate parse errors for some chars
699                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
700                                 if decoded?
701                                         cur = start + i
702                                         return decoded
703                                 return '&'
704                         else
705                                 for i in [0...31]
706                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
707                                                 break
708                                 if i is 0
709                                         # exit early, because parse_error() below needs at least one alnum
710                                         return '&'
711                                 if txt.charAt(cur + i) is ';'
712                                         i += 1 # include ';' terminator in value
713                                         decoded = decode_named_char_ref txt.substr(cur, i)
714                                         if decoded?
715                                                 cur += i
716                                                 return decoded
717                                         parse_error()
718                                         return '&'
719                                 else
720                                         # no ';' terminator (only legacy char refs)
721                                         max = i
722                                         for i in [2..max] # no prefix matches, so ok to check shortest first
723                                                 c = legacy_char_refs[txt.substr(cur, i)]
724                                                 if c?
725                                                         if in_attr
726                                                                 if txt.charAt(cur + i) is '='
727                                                                         # "because some legacy user agents will
728                                                                         # misinterpret the markup in those cases"
729                                                                         parse_error()
730                                                                         return '&'
731                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
732                                                                         # this makes attributes forgiving about url args
733                                                                         return '&'
734                                                         # ok, and besides the weird exceptions for attributes...
735                                                         # return the matching char
736                                                         cur += i # consume entity chars
737                                                         parse_error() # because no terminating ";"
738                                                         return c
739                                         parse_error()
740                                         return '&'
741                 return # never reached
742
743         # tree constructor initialization
744         # see comments on TYPE_TAG/etc for the structure of this data
745         tree = new Node TYPE_TAG, name: 'html'
746         open_tags = [tree]
747         tree_state = tree_in_body
748         flag_frameset_ok = true
749         flag_parsing = true
750
751         # tokenizer initialization
752         tok_state = tok_state_data
753
754         # proccess input
755         while flag_parsing
756                 t = tok_state()
757                 if t?
758                         tree_state t
759         return tree.children
760
761 # everything below is tests on the above
762 test_equals = (description, output, expected_output) ->
763         if output is expected_output
764                 console.log "passed." # don't say name, so smart consoles can merge all of these
765         else
766                 console.log "FAILED: \"#{description}\""
767                 console.log "   Expected: #{expected_output}"
768                 console.log "     Actual: #{output}"
769 test_parser = (args) ->
770         parse_errors = []
771         errors_cb = (i) ->
772                 parse_errors.push i
773         parsed = parse_html args.html, errors_cb
774         serialized = ''
775         sep = ''
776         for t in parsed
777                 serialized += sep
778                 sep = ','
779                 serialized += t.serialize()
780         if serialized isnt args.expected or parse_errors.length isnt args.errors
781                 console.log "test FAILED: \"#{args.name}\""
782         else
783                 console.log 'test passed'
784         if serialized isnt args.expected
785                 console.log "      Input: #{args.html}"
786                 console.log "    Correct: #{args.expected}"
787                 console.log "     Output: #{serialized}"
788         if parse_errors.length isnt args.errors
789                 console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
790
791 test_parser name: "empty", \
792         html: "",
793         expected: '',
794         errors: 0
795 test_parser name: "just text", \
796         html: "abc",
797         expected: 'text:"abc"',
798         errors: 0
799 test_parser name: "named entity", \
800         html: "a&amp;1234",
801         expected: 'text:"a&1234"',
802         errors: 0
803 test_parser name: "broken named character references", \
804         html: "1&amp2&&amp;3&aabbcc;",
805         expected: 'text:"1&2&&3&aabbcc;"',
806         errors: 2
807 test_parser name: "numbered entity overrides", \
808         html: "1&#X80&#x80; &#x83",
809         expected: 'text:"1€€ ƒ"',
810         errors: 0
811 test_parser name: "open tag", \
812         html: "foo<span>bar",
813         expected: 'text:"foo",tag:"span",{},[text:"bar"]',
814         errors: 1 # no close tag
815 test_parser name: "open tag with attributes", \
816         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
817         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
818         errors: 1 # no close tag
819 test_parser name: "open tag with attributes of various quotings", \
820         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
821         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
822         errors: 1 # no close tag
823 test_parser name: "attribute entity exceptions dq", \
824         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
825         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
826         errors: 2 # no close tag, &amp= in attr
827 test_parser name: "attribute entity exceptions sq", \
828         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
829         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
830         errors: 2 # no close tag, &amp= in attr
831 test_parser name: "attribute entity exceptions uq", \
832         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
833         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
834         errors: 2 # no close tag, &amp= in attr
835 test_parser name: "matching closing tags", \
836         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
837         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
838         errors: 0
839 test_parser name: "mis-matched closing tags", \
840         html: "foo<div>bar<span>baz</div>qux",
841         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
842         errors: 1 # close tag mismatch