JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement scope checkers, test for formatting els
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
22 #
23 # Instead, the data structure produced by this parser is an array of nodes.
24 #
25 # Each node is an obect of the Node class. Here are the Node types:
26 TYPE_TAG = 0 # name, {attributes}, [children]
27 TYPE_TEXT = 1 # "text"
28 TYPE_COMMENT = 2
29 TYPE_DOCTYPE = 3
30 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
32 TYPE_END_TAG = 5 # name
33 TYPE_EOF = 6
34
35 class Node
36         constructor: (type, args = {}) ->
37                 @type = type # one of the TYPE_* constants above
38                 @name = args.name ? '' # tag name
39                 @text = args.text ? '' # contents for text/comment nodes
40                 @attrs = args.attrs ? {}
41                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
42                 @children = args.children ? []
43         serialize: -> # for unit tests
44                 ret = ''
45                 switch @type
46                         when TYPE_TAG
47                                 ret += 'tag:'
48                                 ret += JSON.stringify @name
49                                 ret += ','
50                                 ret += JSON.stringify @attrs
51                                 ret += ','
52                                 sep = '['
53                                 for c in @children
54                                         ret += sep
55                                         sep = ','
56                                         ret += c.serialize()
57                                 ret += ']'
58                         when TYPE_TEXT
59                                 ret += 'text:'
60                                 ret += JSON.stringify @text
61                         when TYPE_COMMENT
62                                 ret += 'comment:'
63                                 ret += JSON.stringify @text
64                         when TYPE_DOCTYPE
65                                 ret += 'doctype'
66                                 # FIXME
67                         else
68                                 ret += 'unknown:'
69                 return ret
70
71
72 # helpers: (only take args that are normally known when parser creates nodes)
73 new_open_tag = (name) ->
74         return new Node TYPE_OPEN_TAG, name: name
75 new_end_tag = (name) ->
76         return new Node TYPE_END_TAG, name: name
77 new_text_node = (txt) ->
78         return new Node TYPE_TEXT, text: txt
79 new_comment_node = (txt) ->
80         return new Node TYPE_COMMENT, text: txt
81 new_eof_token = ->
82         return new Node TYPE_EOF
83
84 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
85 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
86 digits = "0123456789"
87 alnum = lc_alpha + uc_alpha + digits
88 hex_chars = digits + "abcdefABCDEF"
89
90 # some SVG elements have dashes in them
91 tag_name_chars = alnum + "-"
92
93 # http://www.w3.org/TR/html5/infrastructure.html#space-character
94 space_chars = "\u0009\u000a\u000c\u000d\u0020"
95
96 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
97 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
98
99 # These are the character references that don't need a terminating semicolon
100 # min length: 2, max: 6, none are a prefix of any other.
101 legacy_char_refs = {
102         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
103         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
104         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
105         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
106         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
107         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
108         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
109         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
110         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
111         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
112         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
113         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
114         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
115         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
116         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
117         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
118         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
119         yen: '¥', yuml: 'ÿ'
120 }
121
122 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
123 raw_text_elements = ['script', 'style']
124 escapable_raw_text_elements = ['textarea', 'title']
125 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
126 svg_elements = [
127         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
128         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
129         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
130         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
131         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
132         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
133         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
134         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
135         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
136         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
137         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
138         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
139         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
140         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
141         'view', 'vkern'
142 ]
143
144 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
145 mathml_elements = [
146         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
147         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
148         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
149         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
150         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
151         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
152         'determinant', 'diff', 'divergence', 'divide', 'domain',
153         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
154         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
155         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
156         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
157         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
158         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
159         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
160         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
161         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
162         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
163         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
164         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
165         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
166         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
167         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
168         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
169         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
170         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
171         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
172         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
173         'vectorproduct', 'xor'
174 ]
175 # foreign_elements = [svg_elements..., mathml_elements...]
176 #normal_elements = All other allowed HTML elements are normal elements.
177
178 special_elements = {
179         # from HTML:
180         address: true, applet: true, area: true, article: true, aside: true,
181         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
182         br: true, button: true, caption: true, center: true, col: true,
183         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
184         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
185         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
186         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
187         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
188         isindex: true, li: true, link: true, listing: true, main: true,
189         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
190         noscript: true, object: true, ol: true, p: true, param: true,
191         plaintext: true, pre: true, script: true, section: true, select: true,
192         source: true, style: true, summary: true, table: true, tbody: true,
193         td: true, template: true, textarea: true, tfoot: true, th: true,
194         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
195         xmp: true,
196
197         # from MathML:
198         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
199
200         # from SVG:
201         foreignObject: true, desc: true, title: true
202 }
203
204 formatting_elements = {
205          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
206          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
207          u: true
208 }
209
210
211 # decode_named_char_ref()
212 #
213 # The list of named character references is _huge_ so ask the browser to decode
214 # for us instead of wasting bandwidth/space on including the table here.
215 #
216 # Pass without the "&" but with the ";" examples:
217 #    for "&amp" pass "amp;"
218 #    for "&#x2032" pass "x2032;"
219 g_dncr = {
220         cache: {}
221         textarea: document.createElement('textarea')
222 }
223 # TODO test this in IE8
224 decode_named_char_ref = (txt) ->
225         txt = "&#{txt}"
226         decoded = g_dncr.cache[txt]
227         return decoded if decoded?
228         g_dncr.textarea.innerHTML = txt
229         decoded = g_dncr.textarea.value
230         return null if decoded is txt
231         return g_dncr.cache[txt] = decoded
232
233 parse_html = (txt, parse_error_cb = null) ->
234         cur = 0 # index of next char in txt to be parsed
235         # declare tree and tokenizer variables so they're in scope below
236         tree = null
237         open_tags = [] # stack of open elements
238         tree_state = null
239         tok_state = null
240         tok_cur_tag = null # partially parsed tag
241         flag_frameset_ok = null
242         flag_parsing = null
243
244         parse_error = ->
245                 if parse_error_cb?
246                         parse_error_cb cur
247                 else
248                         console.log "Parse error at character #{cur} of #{txt.length}"
249
250
251         # the functions below impliment the Tree Contstruction algorithm
252         # http://www.w3.org/TR/html5/syntax.html#tree-construction
253
254         # But first... the helpers
255         template_tag_is_open = ->
256                 for t in open_tags
257                         if t.type is TYPE_TAG and t.name is 'template'
258                                 return true
259                 return false
260         is_in_scope_x = (tag_name, scope) ->
261                 for t in open_tags
262                         if t.name is tag_name
263                                 return true
264                         if t.name of scope
265                                 return false
266                 return false
267         is_in_scope_x_y = (tag_name, scope, scope2) ->
268                 for t in open_tags
269                         if t.name is tag_name
270                                 return true
271                         if t.name of scope
272                                 return false
273                         if t.name of scope2
274                                 return false
275                 return false
276         standard_scopers = { # FIXME these are supposed to be namespace specific
277                 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
278                 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
279                 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
280                 'foreignObject': true, 'desc': true, 'title'
281         }
282         button_scopers = button: true
283         li_scopers = ol: true, ul: true
284         table_scopers = html: true, table: true, template: true
285         is_in_scope = (tag_name) ->
286                 return is_in_scope_x tag_name, standard_scopers
287         is_in_button_scope = (tag_name) ->
288                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers
289         is_in_table_scope = (tag_name) ->
290                 return is_in_scope_x tag_name, table_scopers
291         is_in_select_scope = (tag_name) ->
292                 for t in open_tags
293                         if t.name is tag_name
294                                 return true
295                         if t.name isnt 'optgroup' and t.name isnt 'option'
296                                 return false
297                 return false
298
299         reconstruct_active_formatting_elements = ->
300                 # FIXME implement this
301
302         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
303         # FIXME implement this
304         close_p_if_in_button_scope = ->
305                 if open_tags[0].name is 'p'
306                         open_tags.pop()
307                 return
308                 #p = find_button_scope 'p'
309                 #if p?
310                         # TODO generate_implied_end_tags except for p tags
311                         # TODO parse_error unless open_tags[0].name is 'p'
312                         # TODO pop stack until 'p' popped
313
314
315
316         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
317         tree_insert_a_character = (t) ->
318                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
319                 dest = open_tags[0].children
320                 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
321                         dest[dest.length - 1].text += t.text
322                 else
323                         dest.push t
324
325         # FIXME read spec, do this right
326         # note: this assumes it's an open tag
327         tree_insert_tag = (t) ->
328                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
329                 # convert attributes into a hash
330                 while t.attrs_a.length
331                         a = t.attrs_a.pop()
332                         t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
333                 open_tags[0].children.push t
334                 open_tags.unshift t
335
336         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
337         tree_insert_a_comment = (t) ->
338                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
339                 open_tags[0].children.push t
340
341         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
342         tree_in_body = (t) ->
343                 switch t.type
344                         when TYPE_TEXT
345                                 switch t.text
346                                         when "\u0000"
347                                                 parse_error()
348                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
349                                                 reconstruct_active_formatting_elements()
350                                                 tree_insert_a_character t
351                                         else
352                                                 reconstruct_active_formatting_elements()
353                                                 tree_insert_a_character t
354                                                 flag_frameset_ok = false
355                         when TYPE_COMMENT
356                                 tree_insert_a_comment t
357                         when TYPE_DOCTYPE
358                                 parse_error()
359                         when TYPE_OPEN_TAG
360                                 switch t.name
361                                         when 'html'
362                                                 parse_error()
363                                                 return if template_tag_is_open()
364                                                 root_attrs = open_tags[open_tags.length - 1].children
365                                                 for k, v of t.attrs
366                                                         root_attrs[k] = v unless root_attrs[k]?
367                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
368                                                 # FIXME also do this for </template> (end tag)
369                                                 return tree_in_head t
370                                         when 'body'
371                                                 parse_error()
372                                                 # TODO
373                                         when 'frameset'
374                                                 parse_error()
375                                                 # TODO
376                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
377                                                 close_p_if_in_button_scope()
378                                                 tree_insert_tag t
379                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
380                                                 close_p_if_in_button_scope()
381                                                 if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
382                                                         parse_error()
383                                                         open_tags.shift()
384                                                 tree_insert_tag t
385                                         # TODO lots more to implement here
386                                         else # any other start tag
387                                                 reconstruct_active_formatting_elements()
388                                                 tree_insert_tag t
389                         when TYPE_EOF
390                                 ok_tags = {
391                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
392                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
393                                 }
394                                 for t in open_tags
395                                         unless ok_tags[t.name]?
396                                                 parse_error()
397                                                 break
398                                 # TODO stack of template insertion modes thing
399                                 flag_parsing = false # stop parsing
400                         when TYPE_END_TAG
401                                 switch t.name
402                                         when 'body'
403                                                 unless is_in_scope 'body'
404                                                         parse_error()
405                                                         return
406                                                 # TODO implement parse error and move to tree_after_body
407                                         when 'html'
408                                                 unless is_in_scope 'body' # weird, but it's what the spec says
409                                                         parse_error()
410                                                         return
411                                                 # TODO implement parse error and move to tree_after_body, reprocess
412                                         # TODO lots more close tags to implement here
413                                         else
414                                                 for node, i in open_tags
415                                                         if node.name is t.name
416                                                                 # FIXME generate implied end tags except those with name==t.name
417                                                                 parse_error() unless i is 0
418                                                                 while i > 0
419                                                                         open_tags.shift()
420                                                                         i -= 1
421                                                                 open_tags.shift()
422                                                                 return
423                                                         if special_elements[node.name]?
424                                                                 parse_error()
425                                                                 return
426
427
428         # the functions below implement the tokenizer stats described here:
429         # http://www.w3.org/TR/html5/syntax.html#tokenization
430
431         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
432         tok_state_data = ->
433                 switch c = txt.charAt(cur++)
434                         when '&'
435                                 return new_text_node tokenize_character_reference()
436                         when '<'
437                                 tok_state = tok_state_tag_open
438                         when "\u0000"
439                                 parse_error()
440                                 return new_text_node c
441                         when '' # EOF
442                                 return new_eof_token()
443                         else
444                                 return new_text_node c
445                 return null
446
447         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
448         # not needed: tok_state_character_reference_in_data = ->
449         # just call tok_state_character_reference_in_data()
450
451         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
452         tok_state_tag_open = ->
453                 switch c = txt.charAt(cur++)
454                         when '!'
455                                 tok_state = tok_state_markup_declaration_open
456                         when '/'
457                                 tok_state = tok_state_end_tag_open
458                         when '?'
459                                 parse_error()
460                                 tok_state = tok_state_bogus_comment
461                         else
462                                 if lc_alpha.indexOf(c) > -1
463                                         tok_cur_tag = new_open_tag c
464                                         tok_state = tok_state_tag_name
465                                 else if uc_alpha.indexOf(c) > -1
466                                         tok_cur_tag = new_open_tag c.toLowerCase()
467                                         tok_state = tok_state_tag_name
468                                 else
469                                         parse_error()
470                                         tok_state = tok_state_data
471                                         cur -= 1 # we didn't parse/handle the char after <
472                                         return new_text_node '<'
473                 return null
474
475         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
476         tok_state_end_tag_open = ->
477                 switch c = txt.charAt(cur++)
478                         when '>'
479                                 parse_error()
480                                 tok_state = tok_state_data
481                         when '' # EOF
482                                 parse_error()
483                                 tok_state = tok_state_data
484                                 return new_text_node '</'
485                         else
486                                 if uc_alpha.indexOf(c) > -1
487                                         tok_cur_tag = new_end_tag c.toLowerCase()
488                                         tok_state = tok_state_tag_name
489                                 else if lc_alpha.indexOf(c) > -1
490                                         tok_cur_tag = new_end_tag c
491                                         tok_state = tok_state_tag_name
492                                 else
493                                         parse_error()
494                                         tok_state = tok_state_bogus_comment
495                 return null
496
497         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
498         tok_state_tag_name = ->
499                 switch c = txt.charAt(cur++)
500                         when "\t", "\n", "\u000c", ' '
501                                 tok_state = tok_state_before_attribute_name
502                         when '/'
503                                 tok_state = tok_state_self_closing_start_tag
504                         when '>'
505                                 tok_state = tok_state_data
506                                 tmp = tok_cur_tag
507                                 tok_cur_tag = null
508                                 return tmp
509                         when "\u0000"
510                                 parse_error()
511                                 tok_cur_tag.name += "\ufffd"
512                         when '' # EOF
513                                 parse_error()
514                                 tok_state = tok_state_data
515                         else
516                                 if uc_alpha.indexOf(c) > -1
517                                         tok_cur_tag.name += c.toLowerCase()
518                                 else
519                                         tok_cur_tag.name += c
520                 return null
521
522         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
523         tok_state_before_attribute_name = ->
524                 attr_name = null
525                 switch c = txt.charAt(cur++)
526                         when "\t", "\n", "\u000c", ' '
527                                 return null
528                         when '/'
529                                 tok_state = tok_state_self_closing_start_tag
530                                 return null
531                         when '>'
532                                 tok_state = tok_state_data
533                                 tmp = tok_cur_tag
534                                 tok_cur_tag = null
535                                 return tmp
536                         when "\u0000"
537                                 parse_error()
538                                 attr_name = "\ufffd"
539                         when '"', "'", '<', '='
540                                 parse_error()
541                                 attr_name = c
542                         when '' # EOF
543                                 parse_error()
544                                 tok_state = tok_state_data
545                         else
546                                 if uc_alpha.indexOf(c) > -1
547                                         attr_name = c.toLowerCase()
548                                 else
549                                         attr_name = c
550                 if attr_name?
551                         tok_cur_tag.attrs_a.unshift [attr_name, '']
552                         tok_state = tok_state_attribute_name
553                 return null
554
555         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
556         tok_state_attribute_name = ->
557                 switch c = txt.charAt(cur++)
558                         when "\t", "\n", "\u000c", ' '
559                                 tok_state = tok_state_after_attribute_name
560                         when '/'
561                                 tok_state = tok_state_self_closing_start_tag
562                         when '='
563                                 tok_state = tok_state_before_attribute_value
564                         when '>'
565                                 tok_state = tok_state_data
566                                 tmp = tok_cur_tag
567                                 tok_cur_tag = null
568                                 return tmp
569                         when "\u0000"
570                                 parse_error()
571                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
572                         when '"', "'", '<'
573                                 parse_error()
574                                 tok_cur_tag.attrs_a[0][0] = c
575                         when '' # EOF
576                                 parse_error()
577                                 tok_state = tok_state_data
578                         else
579                                 if uc_alpha.indexOf(c) > -1
580                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
581                                 else
582                                         tok_cur_tag.attrs_a[0][0] += c
583                 return null
584
585         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
586         tok_state_before_attribute_value = ->
587                 switch c = txt.charAt(cur++)
588                         when "\t", "\n", "\u000c", ' '
589                                 return null
590                         when '"'
591                                 tok_state = tok_state_attribute_value_double_quoted
592                         when '&'
593                                 tok_state = tok_state_attribute_value_unquoted
594                                 cur -= 1
595                         when "'"
596                                 tok_state = tok_state_attribute_value_single_quoted
597                         when "\u0000"
598                                 # Parse error
599                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
600                                 tok_state = tok_state_attribute_value_unquoted
601                         when '>'
602                                 # Parse error
603                                 tok_state = tok_state_data
604                                 tmp = tok_cur_tag
605                                 tok_cur_tag = null
606                                 return tmp
607                         when '' # EOF
608                                 parse_error()
609                                 tok_state = tok_state_data
610                         else
611                                 tok_cur_tag.attrs_a[0][1] += c
612                                 tok_state = tok_state_attribute_value_unquoted
613                 return null
614
615         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
616         tok_state_attribute_value_double_quoted = ->
617                 switch c = txt.charAt(cur++)
618                         when '"'
619                                 tok_state = tok_state_after_attribute_value_quoted
620                         when '&'
621                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
622                         when "\u0000"
623                                 # Parse error
624                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
625                         when '' # EOF
626                                 parse_error()
627                                 tok_state = tok_state_data
628                         else
629                                 tok_cur_tag.attrs_a[0][1] += c
630                 return null
631
632         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
633         tok_state_attribute_value_single_quoted = ->
634                 switch c = txt.charAt(cur++)
635                         when "'"
636                                 tok_state = tok_state_after_attribute_value_quoted
637                         when '&'
638                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
639                         when "\u0000"
640                                 # Parse error
641                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
642                         when '' # EOF
643                                 parse_error()
644                                 tok_state = tok_state_data
645                         else
646                                 tok_cur_tag.attrs_a[0][1] += c
647                 return null
648
649         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
650         tok_state_attribute_value_unquoted = ->
651                 switch c = txt.charAt(cur++)
652                         when "\t", "\n", "\u000c", ' '
653                                 tok_state = tok_state_before_attribute_name
654                         when '&'
655                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
656                         when '>'
657                                 tok_state = tok_state_data
658                                 tmp = tok_cur_tag
659                                 tok_cur_tag = null
660                                 return tmp
661                         when "\u0000"
662                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
663                         when '' # EOF
664                                 parse_error()
665                                 tok_state = tok_state_data
666                         else
667                                 # Parse Error if ', <, = or ` (backtick)
668                                 tok_cur_tag.attrs_a[0][1] += c
669                 return null
670
671         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
672         tok_state_after_attribute_value_quoted = ->
673                 switch c = txt.charAt(cur++)
674                         when "\t", "\n", "\u000c", ' '
675                                 tok_state = tok_state_before_attribute_name
676                         when '/'
677                                 tok_state = tok_state_self_closing_start_tag
678                         when '>'
679                                 tok_state = tok_state_data
680                                 tmp = tok_cur_tag
681                                 tok_cur_tag = null
682                                 return tmp
683                         when '' # EOF
684                                 parse_error()
685                                 tok_state = tok_state_data
686                         else
687                                 # Parse Error
688                                 tok_state = tok_state_before_attribute_name
689                                 cur -= 1 # we didn't handle that char
690                 return null
691
692         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
693         # Don't set this as a state, just call it
694         # returns a string (NOT a text node)
695         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
696                 if cur >= txt.length
697                         return '&'
698                 switch c = txt.charAt(cur)
699                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
700                                 # explicitly not a parse error
701                                 return '&'
702                         when ';'
703                                 # there has to be "one or more" alnums between & and ; to be a parse error
704                                 return '&'
705                         when '#'
706                                 if cur + 1 >= txt.length
707                                         return '&'
708                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
709                                         prefix = '#x'
710                                         charset = hex_chars
711                                         start = cur + 2
712                                 else
713                                         charset = digits
714                                         start = cur + 1
715                                         prefix = '#'
716                                 i = 0
717                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
718                                         i += 1
719                                 if i is 0
720                                         return '&'
721                                 if txt.charAt(start + i) is ';'
722                                         i += 1
723                                 # FIXME This is supposed to generate parse errors for some chars
724                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
725                                 if decoded?
726                                         cur = start + i
727                                         return decoded
728                                 return '&'
729                         else
730                                 for i in [0...31]
731                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
732                                                 break
733                                 if i is 0
734                                         # exit early, because parse_error() below needs at least one alnum
735                                         return '&'
736                                 if txt.charAt(cur + i) is ';'
737                                         i += 1 # include ';' terminator in value
738                                         decoded = decode_named_char_ref txt.substr(cur, i)
739                                         if decoded?
740                                                 cur += i
741                                                 return decoded
742                                         parse_error()
743                                         return '&'
744                                 else
745                                         # no ';' terminator (only legacy char refs)
746                                         max = i
747                                         for i in [2..max] # no prefix matches, so ok to check shortest first
748                                                 c = legacy_char_refs[txt.substr(cur, i)]
749                                                 if c?
750                                                         if in_attr
751                                                                 if txt.charAt(cur + i) is '='
752                                                                         # "because some legacy user agents will
753                                                                         # misinterpret the markup in those cases"
754                                                                         parse_error()
755                                                                         return '&'
756                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
757                                                                         # this makes attributes forgiving about url args
758                                                                         return '&'
759                                                         # ok, and besides the weird exceptions for attributes...
760                                                         # return the matching char
761                                                         cur += i # consume entity chars
762                                                         parse_error() # because no terminating ";"
763                                                         return c
764                                         parse_error()
765                                         return '&'
766                 return # never reached
767
768         # tree constructor initialization
769         # see comments on TYPE_TAG/etc for the structure of this data
770         tree = new Node TYPE_TAG, name: 'html'
771         open_tags = [tree]
772         tree_state = tree_in_body
773         flag_frameset_ok = true
774         flag_parsing = true
775
776         # tokenizer initialization
777         tok_state = tok_state_data
778
779         # proccess input
780         while flag_parsing
781                 t = tok_state()
782                 if t?
783                         tree_state t
784         return tree.children
785
786 # everything below is tests on the above
787 test_equals = (description, output, expected_output) ->
788         if output is expected_output
789                 console.log "passed." # don't say name, so smart consoles can merge all of these
790         else
791                 console.log "FAILED: \"#{description}\""
792                 console.log "   Expected: #{expected_output}"
793                 console.log "     Actual: #{output}"
794 test_parser = (args) ->
795         parse_errors = []
796         errors_cb = (i) ->
797                 parse_errors.push i
798         parsed = parse_html args.html, errors_cb
799         serialized = ''
800         sep = ''
801         for t in parsed
802                 serialized += sep
803                 sep = ','
804                 serialized += t.serialize()
805         if serialized isnt args.expected or parse_errors.length isnt args.errors
806                 console.log "FAILED: \"#{args.name}\""
807         else
808                 console.log "passed \"#{args.name}\""
809         if serialized isnt args.expected
810                 console.log "      Input: #{args.html}"
811                 console.log "    Correct: #{args.expected}"
812                 console.log "     Output: #{serialized}"
813         if parse_errors.length isnt args.errors
814                 console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
815
816 test_parser name: "empty", \
817         html: "",
818         expected: '',
819         errors: 0
820 test_parser name: "just text", \
821         html: "abc",
822         expected: 'text:"abc"',
823         errors: 0
824 test_parser name: "named entity", \
825         html: "a&amp;1234",
826         expected: 'text:"a&1234"',
827         errors: 0
828 test_parser name: "broken named character references", \
829         html: "1&amp2&&amp;3&aabbcc;",
830         expected: 'text:"1&2&&3&aabbcc;"',
831         errors: 2
832 test_parser name: "numbered entity overrides", \
833         html: "1&#X80&#x80; &#x83",
834         expected: 'text:"1€€ ƒ"',
835         errors: 0
836 test_parser name: "open tag", \
837         html: "foo<span>bar",
838         expected: 'text:"foo",tag:"span",{},[text:"bar"]',
839         errors: 1 # no close tag
840 test_parser name: "open tag with attributes", \
841         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
842         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
843         errors: 1 # no close tag
844 test_parser name: "open tag with attributes of various quotings", \
845         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
846         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
847         errors: 1 # no close tag
848 test_parser name: "attribute entity exceptions dq", \
849         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
850         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
851         errors: 2 # no close tag, &amp= in attr
852 test_parser name: "attribute entity exceptions sq", \
853         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
854         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
855         errors: 2 # no close tag, &amp= in attr
856 test_parser name: "attribute entity exceptions uq", \
857         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
858         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
859         errors: 2 # no close tag, &amp= in attr
860 test_parser name: "matching closing tags", \
861         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
862         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
863         errors: 0
864 test_parser name: "missing closing tag inside", \
865         html: "foo<div>bar<span>baz</div>qux",
866         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
867         errors: 1 # close tag mismatch
868 test_parser name: "mis-matched closing tags", \
869         html: "<span>12<div>34</span>56</div>78",
870         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
871         errors: 2 # misplaced </span>, no </span> at the end
872 test_parser name: "mis-matched formatting elements", \
873         html: "12<b>34<i>56</b>78</i>90",
874         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
875         errors: 2 # FIXME dunno how many there should be