JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
rest of insertion modes (untested)
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
# The spec uses many different words to indicate which end of a list/stack it
# is talking about (and relative movement within the list/stack). This section
# explains them. I'm implementing "lists" (afe and open_els) the same way
# (both as stacks).
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50
# Each node is an object of the Node class. Here are the Node types:
# (Node stores one of these in its "type" property)
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2 # "text" (Node.serialize prints the text property for comments)
TYPE_DOCTYPE = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm

# namespace constants
# (Node stores one of these in its "namespace" property, defaulting to NS_HTML)
NS_HTML = 1
NS_MATHML = 2
NS_SVG = 3
67
# Messages collected since the last reset, oldest first.
g_debug_log = []

# Throw away everything logged so far.
debug_log_reset = ->
        g_debug_log = []

# Append one message to the log.
debug_log = (message) ->
        g_debug_log.push message

# Call cb once for every logged message, in the order they were logged.
debug_log_each = (cb) ->
        cb entry for entry in g_debug_log
76
prev_node_id = 0 # id most recently handed out by the Node constructor
class Node
        # type: one of the TYPE_* constants above
        # args: optional fields (name, text, attrs, attrs_a, children, namespace,
        #       parent, token, id); anything omitted gets the default shown below
        constructor: (type, args = {}) ->
                @type = type
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_START_TAG only. Read from "attrs_a" (the
                # property's own name); the older "attr_k" key is still honoured so
                # existing callers keep working.
                @attrs_a = args.attrs_a ? args.attr_k ? []
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                if args.id?
                        # derived from an existing node: mark the id with a "+"
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"

        # return a new node that's the same except without the children or parent
        shallow_clone: ->
                # WARNING this doesn't work right on open tags that are still being parsed
                attrs = {}
                attrs[k] = v for k, v of @attrs
                return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token

        # Record that a self-closing flag was acknowledged. The flag is set on the
        # originating token when there is one, otherwise on this node. Both paths
        # now pass the same arguments to flag().
        acknowledge_self_closing: ->
                target = @token ? @
                target.flag 'did_self_close', true

        flag: ->
                # fixfull

        # Return a compact string describing this node (and, unless shallow is
        # true, the attributes and children of tags). For unit tests.
        #   shallow:  for tags, stop after the name (and id)
        #   show_ids: include "#<id>," after the tag name
        serialize: (shallow = false, show_ids = false) ->
                switch @type
                        when TYPE_TAG
                                ret = 'tag:' + JSON.stringify(@name) + ','
                                ret += "##{@id}," if show_ids
                                return ret if shallow
                                # attributes are printed sorted by key so output is stable
                                attr_keys = (k for k of @attrs).sort()
                                pairs = ("#{JSON.stringify k}:#{JSON.stringify @attrs[k]}" for k in attr_keys)
                                kids = (c.serialize(shallow, show_ids) for c in @children)
                                attr_txt = pairs.join ','
                                child_txt = kids.join ','
                                return ret + '{' + attr_txt + '},[' + child_txt + ']'
                        when TYPE_TEXT
                                return 'text:' + JSON.stringify(@text)
                        when TYPE_COMMENT
                                return 'comment:' + JSON.stringify(@text)
                        when TYPE_DOCTYPE
                                # FIXME
                                return 'doctype'
                        when TYPE_AFE_MARKER
                                return 'marker'
                        when TYPE_AAA_BOOKMARK
                                return 'aaa_bookmark'
                        else
                                # Log plain fields only: @parent and @children can form
                                # cycles, and JSON.stringify throws on circular structures.
                                summary = JSON.stringify type: @type, name: @name, text: @text, attrs: @attrs, id: @id
                                console.log "unknown: #{summary}" # backtrace is just as well
                                return 'unknown:'
150
# helpers: (only take args that are normally known when parser creates nodes)
# Each one wraps the Node constructor with the matching TYPE_* constant.

# tokens carrying a tag name
new_open_tag = (name) -> new Node TYPE_START_TAG, name: name
new_end_tag = (name) -> new Node TYPE_END_TAG, name: name

# tree nodes
new_element = (name) -> new Node TYPE_TAG, name: name
new_text_node = (txt) -> new Node TYPE_TEXT, text: txt
# character tokens are represented the same way as text nodes
new_character_token = new_text_node
new_comment_node = (txt) -> new Node TYPE_COMMENT, text: txt

# tokens/entries that carry no data
new_eof_token = -> new Node TYPE_EOF
new_afe_marker = -> new Node TYPE_AFE_MARKER
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
169
# Character classes, each stored as a plain string of its member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF" # hexadecimal digits, both letter cases

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
178
# The HTML "space characters": tab, line feed, form feed, carriage return, space
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# true only when txt is exactly one character and that character is an HTML space
is_space = (txt) ->
        return false unless txt.length is 1
        return space_chars.indexOf(txt) isnt -1

# same test, applied to the text of a TYPE_TEXT token/node
is_space_tok = (t) ->
        return t.type is TYPE_TEXT and is_space t.text

# A wider set than space_chars: it also lists non-ASCII whitespace characters.
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
188
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
#
# Keys are the reference names (without "&"), values are the decoded text.
# Invisible characters (nbsp, shy) are written as \u escapes so they survive
# editors and tools that strip or normalise them.
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
        aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
        aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
        curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
        ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
        euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
        igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
        lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
        Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
        Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
        pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
        shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
        times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
        ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
        yen: '¥', yuml: 'ÿ'
}
211
# Element categories, as lists of tag names.
void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# (names keep their mixed-case spelling, e.g. 'foreignObject')
svg_elements = [
        'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
        'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
        'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
        'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
        'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
        'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
        'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
        'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
        'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
        'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
        'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
        'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
        'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
        'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
        'view', 'vkern'
]
233
# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# Tag names of MathML elements, in alphabetical order, each listed once.
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
265 # foreign_elements = [svg_elements..., mathml_elements...]
266 #normal_elements = All other allowed HTML elements are normal elements.
267
# Elements in the "special" category, keyed by tag name; the value is the
# namespace in which that name is special. Use el_is_special() to query.
#
# "title" is special in both the HTML and the SVG namespace. An object literal
# can hold only one value per key, so this table keeps the SVG entry (the value
# that took effect when both were listed here) and the HTML entry lives in
# special_elements_extra below.
special_elements = {
        # HTML:
        address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
        aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
        blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
        caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
        details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
        embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
        footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
        h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
        header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
        noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
        ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
        script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
        style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
        template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
        thead:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
        wbr:NS_HTML, xmp:NS_HTML,

        # MathML:
        mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
        'annotation-xml':NS_MATHML,

        # SVG:
        foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}

# Second namespace for names that are special in more than one namespace.
special_elements_extra = {
        title: NS_HTML
}

# Tag names of formatting elements.
formatting_elements = {
         a: true, b: true, big: true, code: true, em: true, font: true, i: true,
         nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
         u: true
}

foster_parenting_targets = {
        table: true
        tbody: true
        tfoot: true
        thead: true
        tr: true
}

# all html I presume
end_tag_implied = {
        dd: true
        dt: true
        li: true
        option: true
        optgroup: true
        p: true
        rb: true
        rp: true
        rt: true
        rtc: true
}

# true when e (anything with name and namespace properties) is in the
# "special" category: its name must be listed for its own namespace.
el_is_special = (e) ->
        return true if special_elements[e.name] is e.namespace
        return special_elements_extra[e.name] is e.namespace
327
# decode_named_char_ref()
#
# The list of named character references is _huge_ so ask the browser to decode
# for us instead of wasting bandwidth/space on including the table here.
#
# Pass the reference without the leading "&" but with the trailing ";":
#    for "&amp;" pass "amp;"
#    for "&#x2032;" pass "#x2032;"
#
# Returns the decoded text, or null when the browser hands the text back
# unchanged. Successful lookups are remembered in g_dncr.cache.
g_dncr =
        cache: {}
        textarea: document.createElement 'textarea'
# TODO test this in IE8
decode_named_char_ref = (txt) ->
        entity = "&#{txt}"
        cached = g_dncr.cache[entity]
        return cached if cached?
        # let the browser parse the reference, then read back the plain text
        scratch = g_dncr.textarea
        scratch.innerHTML = entity
        result = scratch.value
        if result is entity
                return null
        g_dncr.cache[entity] = result
        return result
349
350 parse_html = (txt, parse_error_cb = null) ->
        # Parser state shared by the nested helper functions defined below.
        # Everything except cur starts out as null here.
        # NOTE(review): presumably each of these is assigned a real value before
        # parsing begins — confirm against the rest of parse_html.
        cur = 0 # index of next char in txt to be parsed
        # declare doc and tokenizer variables so they're in scope below
        doc = null
        open_els = null # stack of open elements
        afe = null # active formatting elements
        template_insertion_modes = null
        insertion_mode = null
        original_insertion_mode = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag
        flag_scripting = null
        flag_frameset_ok = null
        flag_parsing = null
        flag_foster_parenting = null
        form_element_pointer = null
        temporary_buffer = null
        pending_table_character_tokens = null
        head_element_pointer = null
        flag_fragment_parsing = null
370
        # Clear the flag_parsing flag.
        # NOTE(review): presumably the main parse loop checks this flag — confirm.
        stop_parsing = ->
                flag_parsing = false
373
374         parse_error = ->
375                 if parse_error_cb?
376                         parse_error_cb cur
377                 else
378                         console.log "Parse error at character #{cur} of #{txt.length}"
379
380         afe_push = (new_el) ->
381                 matches = 0
382                 for el, i in afe
383                         if el.name is new_el.name and el.namespace is new_el.namespace
384                                 for k, v of el.attrs
385                                         continue unless new_el.attrs[k] is v
386                                 for k, v of new_el.attrs
387                                         continue unless el.attrs[k] is v
388                                 matches += 1
389                                 if matches is 3
390                                         afe.splice i, 1
391                                         break
392                 afe.unshift new_el
393         afe_push_marker = ->
394                 afe.unshift new_afe_marker()
395
        # the functions below implement the Tree Construction algorithm
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
398
399         # But first... the helpers
400         template_tag_is_open = ->
401                 for t in open_els
402                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
403                                 return true
404                 return false
405         is_in_scope_x = (tag_name, scope, namespace) ->
406                 for t in open_els
407                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
408                                 return true
409                         if scope[t.name] is t.namespace
410                                 return false
411                 return false
412         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
413                 for t in open_els
414                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
415                                 return true
416                         if scope[t.name] is t.namespace
417                                 return false
418                         if scope2[t.name] is t.namespace
419                                 return false
420                 return false
        # Scope boundary tables: tag name -> namespace constant. The scope
        # helpers treat an open element as a boundary when
        # table[el.name] is el.namespace.
        standard_scopers = { # FIXME these are supposed to be namespace specific
                applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
                td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
                template: NS_HTML, mi: NS_MATHML,

                mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
                'annotation-xml': NS_MATHML,

                foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
        }
        # extra boundary used (together with standard_scopers) by is_in_button_scope
        button_scopers = button: NS_HTML
        # NOTE(review): no user of li_scopers appears among the neighbouring
        # scope helpers — confirm it is referenced elsewhere.
        li_scopers = ol: NS_HTML, ul: NS_HTML
        # boundaries used by is_in_table_scope
        table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
434         is_in_scope = (tag_name, namespace = null) ->
435                 return is_in_scope_x tag_name, standard_scopers, namespace
436         is_in_button_scope = (tag_name, namespace = null) ->
437                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
438         is_in_table_scope = (tag_name, namespace = null) ->
439                 return is_in_scope_x tag_name, table_scopers, namespace
440         is_in_select_scope = (tag_name, namespace = null) ->
441                 for t in open_els
442                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
443                                 return true
444                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
445                                 return false
446                 return false
447         # this checks for a particular element, not by name
448         el_is_in_scope = (el) ->
449                 for t in open_els
450                         if t is el
451                                 return true
452                         if standard_scopers[t.name] is t.namespace
453                                 return false
454                 return false
455
456         clear_to_table_stopers = {
457                 'table': true
458                 'template': true
459                 'html': true
460         }
461         clear_stack_to_table_context = ->
462                 loop
463                         if clear_to_table_stopers[open_els[0].name]?
464                                 break
465                         open_els.shift()
466                 return
467         clear_to_table_body_stopers = {
468                 'tbody': true
469                 'tfoot': true
470                 'thead': true
471                 'template': true
472                 'html': true
473         }
474         clear_stack_to_table_body_context = ->
475                 loop
476                         if clear_to_table_body_stopers[open_els[0].name]?
477                                 break
478                         open_els.shift()
479                 return
480         clear_to_table_row_stopers = {
481                 'tr': true
482                 'template': true
483                 'html': true
484         }
485         clear_stack_to_table_row_context = ->
486                 loop
487                         if clear_to_table_row_stopers[open_els[0].name]?
488                                 break
489                         open_els.shift()
490                 return
491         clear_afe_to_marker = ->
492                 loop
493                         el = afe.shift()
494                         if el.type is TYPE_AFE_MARKER
495                                 return
496
        # 8.2.3.1 ...
        # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
        #
        # Walks the stack of open elements from the current node (index 0)
        # towards the first node (highest index) and sets insertion_mode
        # according to the first rule below that matches. The numbered comments
        # are the steps from the spec.
        #
        # NOTE(review): elements are matched by name only; the namespace is not
        # checked — confirm that's intended.
        # NOTE(review): with an empty open_els, node is undefined and node.name
        # throws — confirm callers guarantee a non-empty stack.
        reset_insertion_mode = ->
                # 1. Let last be false.
                last = false
                # 2. Let node be the last node in the stack of open elements.
                node_i = 0
                node = open_els[node_i]
                # 3. Loop: If node is the first node in the stack of open elements,
                # then set last to true, and, if the parser was originally created as
                # part of the HTML fragment parsing algorithm (fragment case) set node
                # to the context element.
                loop
                        if node_i is open_els.length - 1
                                last = true
                                # fixfull (fragment case)

                        # 4. If node is a select element, run these substeps:
                        if node.name is 'select'
                                # 1. If last is true, jump to the step below labeled done.
                                unless last
                                        # 2. Let ancestor be node.
                                        ancestor_i = node_i
                                        ancestor = node
                                        # 3. Loop: If ancestor is the first node in the stack of
                                        # open elements, jump to the step below labeled done.
                                        loop
                                                if ancestor_i is open_els.length - 1
                                                        break
                                                # 4. Let ancestor be the node before ancestor in the stack
                                                # of open elements.
                                                ancestor_i += 1
                                                ancestor = open_els[ancestor_i]
                                                # 5. If ancestor is a template node, jump to the step below
                                                # labeled done.
                                                if ancestor.name is 'template'
                                                        break
                                                # 6. If ancestor is a table node, switch the insertion mode
                                                # to "in select in table" and abort these steps.
                                                if ancestor.name is 'table'
                                                        insertion_mode = ins_mode_in_select_in_table
                                                        return
                                                # 7. Jump back to the step labeled loop.
                                # 8. Done: Switch the insertion mode to "in select" and abort
                                # these steps.
                                insertion_mode = ins_mode_in_select
                                return
                        # 5. If node is a td or th element and last is false, then switch
                        # the insertion mode to "in cell" and abort these steps.
                        if (node.name is 'td' or node.name is 'th') and last is false
                                insertion_mode = ins_mode_in_cell
                                return
                        # 6. If node is a tr element, then switch the insertion mode to "in
                        # row" and abort these steps.
                        if node.name is 'tr'
                                insertion_mode = ins_mode_in_row
                                return
                        # 7. If node is a tbody, thead, or tfoot element, then switch the
                        # insertion mode to "in table body" and abort these steps.
                        if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
                                insertion_mode = ins_mode_in_table_body
                                return
                        # 8. If node is a caption element, then switch the insertion mode
                        # to "in caption" and abort these steps.
                        if node.name is 'caption'
                                insertion_mode = ins_mode_in_caption
                                return
                        # 9. If node is a colgroup element, then switch the insertion mode
                        # to "in column group" and abort these steps.
                        if node.name is 'colgroup'
                                insertion_mode = ins_mode_in_column_group
                                return
                        # 10. If node is a table element, then switch the insertion mode to
                        # "in table" and abort these steps.
                        if node.name is 'table'
                                insertion_mode = ins_mode_in_table
                                return
                        # 11. If node is a template element, then switch the insertion mode
                        # to the current template insertion mode and abort these steps.
                        # fixfull (template insertion mode stack)

                        # 12. If node is a head element and last is true, then switch the
                        # insertion mode to "in body" ("in body"! not "in head"!) and abort
                        # these steps. (fragment case)
                        if node.name is 'head' and last
                                insertion_mode = ins_mode_in_body
                                return
                        # 13. If node is a head element and last is false, then switch the
                        # insertion mode to "in head" and abort these steps.
                        if node.name is 'head' and last is false
                                insertion_mode = ins_mode_in_head
                                return
                        # 14. If node is a body element, then switch the insertion mode to
                        # "in body" and abort these steps.
                        if node.name is 'body'
                                insertion_mode = ins_mode_in_body
                                return
                        # 15. If node is a frameset element, then switch the insertion mode
                        # to "in frameset" and abort these steps. (fragment case)
                        if node.name is 'frameset'
                                insertion_mode = ins_mode_in_frameset
                                return
                        # 16. If node is an html element, run these substeps:
                        if node.name is 'html'
                                # 1. If the head element pointer is null, switch the insertion
                                # mode to "before head" and abort these steps. (fragment case)
                                # fixfull (fragment case)

                                # 2. Otherwise, the head element pointer is not null, switch
                                # the insertion mode to "after head" and abort these steps.
                                # (this implementation switches to "in body" instead; see FIXME)
                                insertion_mode = ins_mode_in_body # FIXME fixfull
                                return
                        # 17. If last is true, then switch the insertion mode to "in body"
                        # and abort these steps. (fragment case)
                        if last
                                insertion_mode = ins_mode_in_body
                                return
                        # 18. Let node now be the node before node in the stack of open
                        # elements.
                        node_i += 1
                        node = open_els[node_i]
                        # 19. Return to the step labeled loop.
619
620         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
621         # Re-creates the newest entries of afe that are not in open_els: each one
622         # is replaced by a shallow clone that is inserted into the tree.
623         # Capitalized comments are the step labels used at the link above.
624         reconstruct_active_formatting_elements = ->
625                 return unless afe.length > 0
626                 rafe_newest = afe[0]
627                 return if rafe_newest.type is TYPE_AFE_MARKER or rafe_newest in open_els
628                 # Rewind: move to older entries (higher indexes) for as long as
629                 # the next one is neither a marker nor in open_els
630                 rafe_i = 0
631                 while rafe_i < afe.length - 1
632                         rafe_older = afe[rafe_i + 1]
633                         break if rafe_older.type is TYPE_AFE_MARKER or rafe_older in open_els
634                         rafe_i += 1
635                 # Create
636                 loop
637                         rafe_clone = afe[rafe_i].shallow_clone()
638                         tree_insert_element rafe_clone
639                         afe[rafe_i] = rafe_clone
640                         break unless rafe_i > 0
641                         # Advance: on to the next newer entry
642                         rafe_i -= 1
643
644         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
645         # adoption agency algorithm; subject is a tag name (it is compared to .name)
646         # edits open_els, afe and node parent/children links in place. overview here:
647         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
648         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
649         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
650         adoption_agency = (subject) ->
651                 debug_log "adoption_agency()"
652                 debug_log "tree: #{serialize_els doc.children, false, true}"
653                 debug_log "open_els: #{serialize_els open_els, true, true}"
654                 debug_log "afe: #{serialize_els afe, true, true}"
655                 if open_els[0].name is subject
656                         el = open_els[0]
657                         open_els.shift()
658                         # remove it from the list of active formatting elements (if found)
659                         for t, i in afe
660                                 if t is el
661                                         afe.splice i, 1
662                                         break
663                         debug_log "aaa: starting off with subject on top of stack, exiting"
664                         return
665                 outer = 0 # outer loop counter; the loop below gives up after 8 passes
666                 loop
667                         if outer >= 8
668                                 return
669                         outer += 1
670                         # 5. Let formatting element be the last element in the list of
671                         # active formatting elements that: is between the end of the list
672                         # and the last scope marker in the list, if any, or the start of
673                         # the list otherwise, and has the tag name subject.
674                         fe = null
675                         for t, fe_of_afe in afe
676                                 if t.type is TYPE_AFE_MARKER
677                                         break
678                                 if t.name is subject
679                                         fe = t
680                                         break
681                         # If there is no such element, then abort these steps and instead
682                         # act as described in the "any other end tag" entry above.
683                         if fe is null
684                                 debug_log "aaa: fe not found in afe"
685                                 in_body_any_other_end_tag subject
686                                 return
687                         # 6. If formatting element is not in the stack of open elements,
688                         # then this is a parse error; remove the element from the list, and
689                         # abort these steps.
690                         in_open_els = false
691                         for t, fe_of_open_els in open_els
692                                 if t is fe
693                                         in_open_els = true
694                                         break
695                         unless in_open_els
696                                 debug_log "aaa: fe not found in open_els"
697                                 parse_error()
698                                 # "remove it from the list" must mean afe, since it's not in open_els
699                                 afe.splice fe_of_afe, 1
700                                 return
701                         # 7. If formatting element is in the stack of open elements, but
702                         # the element is not in scope, then this is a parse error; abort
703                         # these steps.
704                         unless el_is_in_scope fe
705                                 debug_log "aaa: fe not in scope"
706                                 parse_error()
707                                 return
708                         # 8. If formatting element is not the current node, this is a parse
709                         # error. (But do not abort these steps.)
710                         unless open_els[0] is fe
711                                 parse_error()
712                                 # continue
713                         # 9. Let furthest block be the topmost node in the stack of open
714                         # elements that is lower in the stack than formatting element, and
715                         # is an element in the special category. There might not be one.
716                         fb = null
717                         fb_of_open_els = null
718                         for t, i in open_els
719                                 if t is fe
720                                         break
721                                 if el_is_special t
722                                         fb = t
723                                         fb_of_open_els = i
724                                         # and continue, to see if there's one that's more "topmost"
725                         # 10. If there is no furthest block, then the UA must first pop all
726                         # the nodes from the bottom of the stack of open elements, from the
727                         # current node up to and including formatting element, then remove
728                         # formatting element from the list of active formatting elements,
729                         # and finally abort these steps.
730                         if fb is null
731                                 debug_log "aaa: no fb"
732                                 loop
733                                         t = open_els.shift()
734                                         if t is fe
735                                                 afe.splice fe_of_afe, 1
736                                                 return
737                         # 11. Let common ancestor be the element immediately above
738                         # formatting element in the stack of open elements.
739                         ca = open_els[fe_of_open_els + 1] # common ancestor
740
741                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
742                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
743                         bookmark = new_aaa_bookmark()
744                         for t, i in afe
745                                 if t is fe
746                                         afe.splice i, 0, bookmark
747                                         break
748                         node = last_node = fb
749                         inner = 0 # inner loop counter
750                         loop
751                                 inner += 1
752                                 # 3. Let node be the element immediately above node in the
753                                 # stack of open elements, or if node is no longer in the stack
754                                 # of open elements (e.g. because it got removed by this
755                                 # algorithm), the element that was immediately above node in
756                                 # the stack of open elements before node was removed.
757                                 node_next = null
758                                 for t, i in open_els
759                                         if t is node
760                                                 node_next = open_els[i + 1]
761                                                 break
762                                 node = node_next ? node_above
763                                 debug_log "inner loop #{inner}"
764                                 debug_log "tree: #{serialize_els doc.children, false, true}"
765                                 debug_log "open_els: #{serialize_els open_els, true, true}"
766                                 debug_log "afe: #{serialize_els afe, true, true}"
767                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
768                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
769                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
770                                 debug_log "node: #{node.serialize true, true}"
771                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
772
773                                 # 4. If node is formatting element, then go to the next step in
774                                 # the overall algorithm.
775                                 if node is fe
776                                         break
777                                 debug_log "the meat"
778                                 # 5. If inner loop counter is greater than three and node is in
779                                 # the list of active formatting elements, then remove node from
780                                 # the list of active formatting elements.
781                                 node_in_afe = false
782                                 for t, i in afe
783                                         if t is node
784                                                 if inner > 3
785                                                         afe.splice i, 1
786                                                         debug_log "max out inner"
787                                                 else
788                                                         node_in_afe = true
789                                                         debug_log "in afe"
790                                                 break
791                                 # 6. If node is not in the list of active formatting elements,
792                                 # then remove node from the stack of open elements and then go
793                                 # back to the step labeled inner loop.
794                                 unless node_in_afe
795                                         debug_log "not in afe"
796                                         for t, i in open_els
797                                                 if t is node
798                                                         node_above = open_els[i + 1]
799                                                         open_els.splice i, 1
800                                                         break
801                                         continue
802                                 debug_log "the bones"
803                                 # 7. create an element for the token for which the element node
804                                 # was created, in the HTML namespace, with common ancestor as
805                                 # the intended parent; replace the entry for node in the list
806                                 # of active formatting elements with an entry for the new
807                                 # element, replace the entry for node in the stack of open
808                                 # elements with an entry for the new element, and let node be
809                                 # the new element.
810                                 new_node = node.shallow_clone()
811                                 for t, i in afe
812                                         if t is node
813                                                 afe[i] = new_node
814                                                 debug_log "replaced in afe"
815                                                 break
816                                 for t, i in open_els
817                                         if t is node
818                                                 node_above = open_els[i + 1]
819                                                 open_els[i] = new_node
820                                                 debug_log "replaced in open_els"
821                                                 break
822                                 node = new_node
823                                 # 8. If last node is furthest block, then move the
824                                 # aforementioned bookmark to be immediately after the new node
825                                 # in the list of active formatting elements.
826                                 if last_node is fb
827                                         for t, i in afe
828                                                 if t is bookmark
829                                                         afe.splice i, 1
830                                                         debug_log "removed bookmark"
831                                                         break
832                                         for t, i in afe
833                                                 if t is node
834                                                         # "after" means lower
835                                                         afe.splice i, 0, bookmark # bookmark takes index i; node moves to i + 1
836                                                         debug_log "placed bookmark after node"
837                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
838                                                         break
839                                 # 9. Insert last node into node, first removing it from its
840                                 # previous parent node if any.
841                                 if last_node.parent?
842                                         debug_log "last_node has parent"
843                                         for c, i in last_node.parent.children
844                                                 if c is last_node
845                                                         debug_log "removing last_node from parent"
846                                                         last_node.parent.children.splice i, 1
847                                                         break
848                                 node.children.push last_node
849                                 last_node.parent = node
850                                 # 10. Let last node be node.
851                                 last_node = node
852                                 debug_log "at last"
853                                 # 11. Return to the step labeled inner loop.
854                         # 14. Insert whatever last node ended up being in the previous step
855                         # at the appropriate place for inserting a node, but using common
856                         # ancestor as the override target.
857
858                         # In the case where fe is immediately followed by fb:
859                         #   * inner loop exits out early (node==fe)
860                         #   * last_node is fb
861                         #   * last_node is still in the tree (not a duplicate)
862                         if last_node.parent?
863                                 debug_log "FEFIRST? last_node has parent"
864                                 for c, i in last_node.parent.children
865                                         if c is last_node
866                                                 debug_log "removing last_node from parent"
867                                                 last_node.parent.children.splice i, 1
868                                                 break
869
870                         debug_log "after aaa inner loop"
871                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
872                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
873                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
874                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
875                         debug_log "tree: #{serialize_els doc.children, false, true}"
876
877                         debug_log "insert"
878
879
880                         # can't use standard insert token thing, because it's already in
881                         # open_els and must stay at its current position in open_els
882                         dest = adjusted_insertion_location ca
883                         dest[0].children.splice dest[1], 0, last_node
884                         last_node.parent = dest[0]
885
886
887                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
888                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
889                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
890                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
891                         debug_log "tree: #{serialize_els doc.children, false, true}"
892
893                         # 15. Create an element for the token for which formatting element
894                         # was created, in the HTML namespace, with furthest block as the
895                         # intended parent.
896                         new_element = fe.shallow_clone() # FIXME intended parent thing
897                         # 16. Take all of the child nodes of furthest block and append them
898                         # to the element created in the last step.
899                         while fb.children.length
900                                 t = fb.children.shift()
901                                 t.parent = new_element
902                                 new_element.children.push t
903                         # 17. Append that new element to furthest block.
904                         new_element.parent = fb
905                         fb.children.push new_element
906                         # 18. Remove formatting element from the list of active formatting
907                         # elements, and insert the new element into the list of active
908                         # formatting elements at the position of the aforementioned
909                         # bookmark.
910                         for t, i in afe
911                                 if t is fe
912                                         afe.splice i, 1
913                                         break
914                         for t, i in afe
915                                 if t is bookmark
916                                         afe[i] = new_element
917                                         break
918                         # 19. Remove formatting element from the stack of open elements,
919                         # and insert the new element into the stack of open elements
920                         # immediately below the position of furthest block in that stack.
921                         for t, i in open_els
922                                 if t is fe
923                                         open_els.splice i, 1
924                                         break
925                         for t, i in open_els
926                                 if t is fb
927                                         open_els.splice i, 0, new_element
928                                         break
929                         # 20. Jump back to the step labeled outer loop.
930                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
931                         debug_log "tree: #{serialize_els doc.children, false, true}"
932                         debug_log "open_els: #{serialize_els open_els, true, true}"
933                         debug_log "afe: #{serialize_els afe, true, true}"
934                 debug_log "AAA DONE" # NOTE(review): looks unreachable; the loop above only exits via return
935
936         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
937         close_p_element = ->
938                 generate_implied_end_tags 'p' # 'p' is the exception
939                 parse_error() unless open_els[0].name is 'p'
940                 # pop up to and including the nearest p; the last remaining
941                 # entry of open_els is never popped (just in case)
942                 until open_els.length <= 1
943                         return if open_els.shift().name is 'p'
944                 return
945         # runs close_p_element, but only when is_in_button_scope 'p' is truthy
946         close_p_if_in_button_scope = ->
947                 close_p_element() if is_in_button_scope 'p'
948
949         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
950         # aka insert_a_character = (t) ->
951         # Puts text token t at the adjusted insertion location; if the node just
952         # before that spot is text, t.text is appended to it instead.
953         insert_character = (t) ->
954                 [text_parent, text_i] = adjusted_insertion_location()
955                 # fixfull check for Document node
956                 if text_i > 0 and text_parent.children[text_i - 1].type is TYPE_TEXT
957                         text_parent.children[text_i - 1].text += t.text
958                         return
959                 text_parent.children.splice text_i, 0, t
960
961         # 8.2.5.1
962         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
963         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
964         # Returns [node, index]: the new child belongs at node.children[index].
965         # override_target, when given, is used as the target instead of the
966         # current node (open_els[0]).
967         adjusted_insertion_location = (override_target = null) ->
968                 # 1. If there was an override target specified, then let target be the
969                 # override target. Otherwise, let target be the current node.
970                 target = override_target ? open_els[0]
971                 # 2. Determine the adjusted insertion location using the first matching
972                 # steps from the following list:
973                 #
974                 # If foster parenting is enabled and target is a table, tbody, tfoot,
975                 # thead, or tr element Foster parenting happens when content is
976                 # misnested in tables.
977                 if flag_foster_parenting and foster_parenting_targets[target.name]
978                         loop # once. this is here so we can ``break`` to "abort these substeps"
979                                 # 1. Let last template be the last template element in the
980                                 # stack of open elements, if any.
981                                 last_template = null
982                                 last_template_i = null
983                                 for el, i in open_els
984                                         if el.name is 'template'
985                                                 last_template = el
986                                                 last_template_i = i
987                                                 break
988                                 # 2. Let last table be the last table element in the stack of
989                                 # open elements, if any.
990                                 last_table = null
991                                 last_table_i = null
992                                 for el, i in open_els
993                                         if el.name is 'table'
994                                                 last_table = el
995                                                 last_table_i = i
996                                                 break
997                                 # 3. If there is a last template and either there is no last
998                                 # table, or there is one, but last template is lower (more
999                                 # recently added) than last table in the stack of open
1000                                 # elements, then: let adjusted insertion location be inside
1001                                 # last template's template contents, after its last child (if
1002                                 # any), and abort these substeps.
1003                                 if last_template and (last_table is null or last_template_i < last_table_i)
1004                                         target = last_template # fixfull should be its contents
1005                                         target_i = target.children.length
1006                                         break
1007                                 # 4. If there is no last table, then let adjusted insertion
1008                                 # location be inside the first element in the stack of open
1009                                 # elements (the html element), after its last child (if any),
1010                                 # and abort these substeps. (fragment case)
1011                                 if last_table is null
1012                                         target = open_els[open_els.length - 1]
1013                                         target_i = target.children.length
1014                                         break # abort: the steps below dereference last_table
1015                                 # 5. If last table has a parent element, then let adjusted
1016                                 # insertion location be inside last table's parent element,
1017                                 # immediately before last table, and abort these substeps.
1018                                 if last_table.parent?
1019                                         for c, i in last_table.parent.children
1020                                                 if c is last_table
1021                                                         target = last_table.parent
1022                                                         target_i = i
1023                                                         break
1024                                         break
1025                                 # 6. Let previous element be the element immediately above last
1026                                 # table in the stack of open elements.
1027                                 #
1028                                 # huh? how could it not have a parent?
1029                                 previous_element = open_els[last_table_i + 1]
1030                                 # 7. Let adjusted insertion location be inside previous
1031                                 # element, after its last child (if any).
1032                                 target = previous_element
1033                                 target_i = target.children.length
1034                                 # Note: These steps are involved in part because it's possible
1035                                 # for elements, the table element in this case in particular,
1036                                 # to have been moved by a script around in the DOM, or indeed
1037                                 # removed from the DOM entirely, after the element was inserted
1038                                 # by the parser.
1039                                 break # don't really loop
1040                 else
1041                         # Otherwise Let adjusted insertion location be inside target, after
1042                         # its last child (if any).
1043                         target_i = target.children.length
1044
1045                 # 3. If the adjusted insertion location is inside a template element,
1046                 # let it instead be inside the template element's template contents,
1047                 # after its last child (if any).
1048                 # fixfull (template)
1049
1050                 # 4. Return the adjusted insertion location.
1051                 return [target, target_i]
1052
1053         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1054         # aka create_an_element_for_token
1055         # Converts token t into a TYPE_TAG Node. Side effects on t: its type is
1056         # overwritten and its attrs_a list ends up empty. intended_parent is unused.
1057         token_to_element = (t, namespace, intended_parent) ->
1058                 t.type = TYPE_TAG # not TYPE_START_TAG
1059                 # move the [name, value] pairs, last one first, into a hash
1060                 attr_hash = {}
1061                 until t.attrs_a.length is 0
1062                         [attr_name, attr_value] = t.attrs_a.pop()
1063                         attr_hash[attr_name] = attr_value # TODO check what to do with duplicate attrs
1064                 created_el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attr_hash, token: t
1065
1066                 # TODO 2. If the newly created element has an xmlns attribute in the
1067                 # XMLNS namespace whose value is not exactly the same as the element's
1068                 # namespace, that is a parse error. Similarly, if the newly created
1069                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1070                 # value is not the XLink Namespace, that is a parse error.
1071                 # fixfull: the spec says stuff about form pointers and ownerDocument
1072                 return created_el
1073
1074         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1075         # Creates an element for token, links it into the tree at the adjusted
1076         # insertion location, puts it at index 0 of open_els and returns it.
1077         insert_foreign_element = (token, namespace) ->
1078                 [ail_parent, ail_pos] = adjusted_insertion_location()
1079                 created_el = token_to_element token, namespace, ail_parent
1080                 # TODO skip this next step if it's broken (eg ail_parent is document with child already)
1081                 ail_parent.children.splice ail_pos, 0, created_el
1082                 created_el.parent = ail_parent
1083                 open_els.unshift created_el
1084                 return created_el
1085         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1086         insert_html_element = insert_foreign_element # alias, so same signature: (token, namespace) ->
1087
1088         # Inserts el (an element, or a start tag token, which is converted first) at
1089         # the adjusted insertion location, makes it the current node and returns it.
1090         # An element that ends up without a namespace takes the one of its new parent.
1091         # FIXME read spec, do this right (what part of the spec is this?)
1092         # TODO look through all callers of this, and see what they should really be doing.
1093         #   eg probably insert_html_element for tokens
1094         tree_insert_element = (el, override_target = null, namespace = null) ->
1095                 if namespace?
1096                         el.namespace = namespace
1097                 dest = adjusted_insertion_location override_target
1098                 if el.type is TYPE_START_TAG # means it's a "token"
1099                         el = token_to_element el, namespace, dest[0]
1100                 unless el.namespace?
1101                         # dest is [node, index]; the parent node is dest[0]
1102                         el.namespace = dest[0].namespace
1103                 # fixfull: Document nodes sometimes can't accept more children
1104                 dest[0].children.splice dest[1], 0, el
1105                 el.parent = dest[0]
1106                 open_els.unshift el
1107                 return el
1108
1109         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1110         # position, when given, is [node, index_within_children]; when it is
             # omitted the adjusted insertion location is used instead
1111         insert_comment = (t, position = null) ->
1112                 unless position?
                             position = adjusted_insertion_location()
1113                 position[0].children.splice position[1], 0, t
1114
1115         # 8.2.5.2
1116         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1117         parse_generic_raw_text = (t) ->
1118                 insert_html_element t
1119                 # remember the mode to come back to, and put the tokenizer in rawtext state
1120                 [original_insertion_mode, tok_state] = [insertion_mode, tok_state_rawtext]
1121                 insertion_mode = ins_mode_text
1122         parse_generic_rcdata_text = (t) ->
1123                 insert_html_element t
1124                 # remember the mode to come back to, and put the tokenizer in rcdata state
1125                 [original_insertion_mode, tok_state] = [insertion_mode, tok_state_rcdata]
1126                 insertion_mode = ins_mode_text
1127
1128         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1129         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
             # Pops elements off the front of open_els until the front element is named
             # `except` or its name is not listed in end_tag_implied.
1130         generate_implied_end_tags = (except = null) ->
1131                 until (not end_tag_implied[open_els[0].name]) or open_els[0].name is except
1132                         open_els.shift()
1133
1134         # 8.2.5.4 The rules for parsing tokens in HTML content
1135         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1136
1137         # 8.2.5.4.1 The "initial" insertion mode
1138         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1139         ins_mode_initial = (t) ->
1140                 # space tokens are dropped in this mode
1141                 return if is_space_tok t
1142                 switch t.type
1143                         when TYPE_COMMENT
1144                                 # fixfull this is supposed to be "the last child of the document object"
1145                                 doc.children.push t
1146                         when TYPE_DOCTYPE
1147                                 # fixfull
1148                                 t.name = 'html'
1149                                 doc.children.push t
1150                                 insertion_mode = ins_mode_before_html
1151                         else # Anything else
1152                                 #fixfull (iframe, quirks)
1153                                 insertion_mode = ins_mode_before_html
1154                                 insertion_mode t # reprocess the token
1155                 return
1157
1158         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
             # Handles tokens until a root html element exists. Doctypes are parse errors,
             # comments are appended to doc, space tokens are dropped. An <html> start tag
             # (or, for anything else, an implied one) becomes the root element and the
             # parser moves on to ins_mode_before_head.
1159         ins_mode_before_html = (t) ->
1160                 if t.type is TYPE_DOCTYPE
1161                         parse_error()
1162                         return
1163                 if t.type is TYPE_COMMENT
1164                         doc.children.push t
1165                         return
1166                 if is_space_tok t
1167                         return
1168                 if t.type is TYPE_START_TAG and t.name is 'html'
1169                         el = token_to_element t, NS_HTML, doc
                             # attach the root element to the document, exactly as the
                             # "anything else" branch below does for the implied html element
                             doc.children.push el
1170                         open_els.unshift(el)
1171                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1172                         insertion_mode = ins_mode_before_head
1173                         return
1174                 if t.type is TYPE_END_TAG
1175                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1176                                 # fall through to "anything else"
1177                         else
1178                                 parse_error()
1179                                 return
1180                 # Anything else
1181                 html_tok = new_open_tag 'html'
1182                 el = token_to_element html_tok, NS_HTML, doc
1183                 doc.children.push el
1184                 open_els.unshift el
1185                 # ?fixfull browsing context
1186                 insertion_mode = ins_mode_before_head
1187                 insertion_mode t
1188                 return
1189
1190         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
             # Handles tokens until a head element has been inserted. A <head> start tag
             # is inserted as-is; for anything else an implied head is inserted and the
             # token is reprocessed by ins_mode_in_head. Either way head_element_pointer
             # ends up referring to the inserted head element.
1191         ins_mode_before_head = (t) ->
1192                 if is_space_tok t
1193                         return
1194                 if t.type is TYPE_COMMENT
1195                         insert_comment t
1196                         return
1197                 if t.type is TYPE_DOCTYPE
1198                         parse_error()
1199                         return
1200                 if t.type is TYPE_START_TAG and t.name is 'html'
1201                         ins_mode_in_body t
1202                         return
1203                 if t.type is TYPE_START_TAG and t.name is 'head'
1204                         el = insert_html_element t
1205                         head_element_pointer = el
1206                         insertion_mode = ins_mode_in_head
                             # this token is fully handled; without the return a second, implied
                             # head would be inserted below and the token reprocessed
                             return
1207                 if t.type is TYPE_END_TAG
1208                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1209                                 # fall through to Anything else below
1210                         else
1211                                 parse_error()
1212                                 return
1213                 # Anything else
1214                 head_tok = new_open_tag 'head'
1215                 el = insert_html_element head_tok
1216                 head_element_pointer = el
1217                 insertion_mode = ins_mode_in_head
1218                 insertion_mode t # reprocess current token
1219
1220         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
             # "Anything else" fallback used by ins_mode_in_head: removes the front
             # element of open_els, switches to ins_mode_after_head and hands t to it.
1221         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1222                 open_els.shift() # spec says this will be a 'head' node
1223                 insertion_mode = ins_mode_after_head
1224                 insertion_mode t
1225         ins_mode_in_head = (t) ->
                     # Handle token t per the "in head" insertion mode. Each branch below
                     # matches one token kind / tag name; tokens that match none of them go
                     # to ins_mode_in_head_else, which closes the head and reprocesses t.
1226                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1227                         insert_character t
1228                         return
1229                 if t.type is TYPE_COMMENT
1230                         insert_comment t
1231                         return
1232                 if t.type is TYPE_DOCTYPE
1233                         parse_error()
1234                         return
1235                 if t.type is TYPE_START_TAG and t.name is 'html'
1236                         ins_mode_in_body t
1237                         return
1238                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1239                         el = insert_html_element t
1240                         open_els.shift() # these elements are closed again right away
1241                         t.acknowledge_self_closing()
1242                         return
1243                 if t.type is TYPE_START_TAG and t.name is 'meta'
1244                         el = insert_html_element t
1245                         open_els.shift()
1246                         t.acknowledge_self_closing()
1247                         # fixfull encoding stuff
1248                         return
1249                 if t.type is TYPE_START_TAG and t.name is 'title'
                             # the rcdata helper in this file is named parse_generic_rcdata_text
1250                         parse_generic_rcdata_text t
1251                         return
1252                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1253                         parse_generic_raw_text t
1254                         return
1255                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1256                         insert_html_element t
1257                         insertion_mode = ins_mode_in_head_noscript # FIXME implement
1258                         return
1259                 if t.type is TYPE_START_TAG and t.name is 'script'
1260                         ail = adjusted_insertion_location() # [parent node, index in parent.children]
                             # the intended parent is the node of the pair, not the pair itself
1261                         el = token_to_element t, NS_HTML, ail[0]
1262                         el.flag_parser_inserted true # FIXME implement
1263                         # fixfull fragment case
1264                         ail[0].children.splice ail[1], 0, el
                             el.parent = ail[0] # keep the parent link, as insert_foreign_element does
1265                         open_els.unshift el
1266                         tok_state = tok_state_script_data
1267                         original_insertion_mode = insertion_mode # make sure orig... is defined
1268                         insertion_mode = ins_mode_text # FIXME implement
1269                         return
1270                 if t.type is TYPE_END_TAG and t.name is 'head'
1271                         open_els.shift() # will be a head element... spec says so
1272                         insertion_mode = ins_mode_after_head
1273                         return
1274                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1275                         ins_mode_in_head_else t
1276                         return
1277                 if t.type is TYPE_START_TAG and t.name is 'template'
1278                         insert_html_element t
1279                         afe_push_marker()
1280                         flag_frameset_ok = false
1281                         insertion_mode = ins_mode_in_template
1282                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1283                         return
1284                 if t.type is TYPE_END_TAG and t.name is 'template'
1285                         if template_tag_is_open()
                                     # parentheses are required: a bare name is only a reference, not a call
1286                                 generate_implied_end_tags()
1287                                 if open_els[0].name isnt 'template'
1288                                         parse_error()
1289                                 loop # pop up to and including the template element
1290                                         el = open_els.shift()
1291                                         if el.name is 'template'
1292                                                 break
1293                                 clear_afe_to_marker()
1294                                 template_insertion_modes.shift()
1295                                 reset_insertion_mode()
1296                         else
1297                                 parse_error()
1298                         return
1299                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1300                         parse_error()
1301                         return
1302                 ins_mode_in_head_else t
1303         
1304         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
             # Placeholder: does nothing with t, only logs that this mode is unimplemented.
1305         ins_mode_in_head_noscript = (t) ->
1306                 # FIXME ?fixfull
1307                 console.log "ins_mode_in_head_noscript unimplemented"
1308         
1309         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1310         ins_mode_after_head_else = (t) ->
1311                 # insert a body element from a synthesized start tag, then let the
1312                 # in-body mode process t
1313                 insert_html_element new_open_tag 'body'
1314                 insertion_mode = ins_mode_in_body
1315                 insertion_mode t # reprocess token
                     return
1316         ins_mode_after_head = (t) ->
                     # Handle token t per the "after head" insertion mode: the head has been
                     # closed and a body (or frameset) start tag is expected. Head-only tags
                     # seen here are a parse error and are processed by ins_mode_in_head with
                     # the head element temporarily put back on open_els.
1317                 if is_space_tok t
1318                         insert_character t
1319                         return
1320                 if t.type is TYPE_COMMENT
1321                         insert_comment t
1322                         return
1323                 if t.type is TYPE_DOCTYPE
1324                         parse_error()
1325                         return
1326                 if t.type is TYPE_START_TAG and t.name is 'html'
1327                         ins_mode_in_body t
1328                         return
1329                 if t.type is TYPE_START_TAG and t.name is 'body'
1330                         insert_html_element t
1331                         flag_frameset_ok = false
1332                         insertion_mode = ins_mode_in_body
1333                         return
1334                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1335                         insert_html_element t
1336                         insertion_mode = ins_mode_in_frameset
1337                         return
1338                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1339                         parse_error()
1340                         open_els.unshift head_element_pointer
1341                         ins_mode_in_head t
                             # take the head element back out, wherever it is now. This must be
                             # "in" (array elements); "of" would put the index in el and never match
1342                         for el, i in open_els
1343                                 if el is head_element_pointer
1344                                         open_els.splice i, 1
1345                                         return
1346                         console.log "warning: 23904 couldn't find head element in open_els"
1347                         return
1348                 if t.type is TYPE_END_TAG and t.name is 'template'
1349                         ins_mode_in_head t
1350                         return
1351                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1352                         ins_mode_after_head_else t
1353                         return
1354                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1355                         parse_error()
1356                         return
1357                 # Anything else
1358                 ins_mode_after_head_else t
1359
1360         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
             # "Any other end tag" steps of the in-body mode. Walks open_els from the
             # front: at the first element called `name`, generates implied end tags
             # (except for `name`) and pops up to and including that element; if an
             # element listed in special_elements comes first, it is a parse error and
             # the end tag is ignored.
1361         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1362                 for node, i in open_els
1363                         if node.name is name # FIXME check namespace too
1364                                 generate_implied_end_tags name # arg is exception
                                     # implied end tags may just have popped elements, which makes the
                                     # index i stale: compare and pop by identity instead
1365                                 parse_error() unless node is open_els[0]
1366                                 while open_els.length > 0
1367                                         break if open_els.shift() is node
1369                                 return
1370                         if special_elements[node.name]? # FIXME check namespace too
1371                                 parse_error()
1372                                 return
1373         ins_mode_in_body = (t) ->
                     # Handle token t per the "in body" insertion mode: dispatch on the token
                     # type and, for start/end tags, on the tag name. Many tag names are not
                     # implemented yet (see the TODOs) and take the generic branches.
1374                 switch t.type
1375                         when TYPE_TEXT
1376                                 switch t.text
1377                                         when "\u0000"
1378                                                 parse_error()
1379                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1380                                                 reconstruct_active_formatting_elements()
1381                                                 insert_character t
1382                                         else
1383                                                 reconstruct_active_formatting_elements()
1384                                                 insert_character t
1385                                                 flag_frameset_ok = false
1386                         when TYPE_COMMENT
1387                                 insert_comment t
1388                         when TYPE_DOCTYPE
1389                                 parse_error()
1390                         when TYPE_START_TAG
1391                                 switch t.name
1392                                         when 'html'
1393                                                 parse_error()
1394                                                 return if template_tag_is_open()
1395                                                 root_attrs = open_els[open_els.length - 1].attrs
                                                     # NOTE(review): start tag tokens elsewhere carry attrs_a (an
                                                     # array of pairs); confirm that t.attrs is populated here
1396                                                 for k, v of t.attrs
1397                                                         root_attrs[k] = v unless root_attrs[k]?
1398                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1399                                                 # FIXME also do this for </template> (end tag)
1400                                                 return ins_mode_in_head t
1401                                         when 'body'
1402                                                 parse_error()
1403                                                 # TODO
1404                                         when 'frameset'
1405                                                 parse_error()
1406                                                 # TODO
1407                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1408                                                 close_p_if_in_button_scope()
1409                                                 insert_html_element t
1410                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1411                                                 close_p_if_in_button_scope()
1412                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1413                                                         parse_error()
1414                                                         open_els.shift()
1415                                                 insert_html_element t
1416                                         # TODO lots more to implement here
1417                                         when 'a'
1418                                                 # If the list of active formatting elements
1419                                                 # contains an a element between the end of the list and
1420                                                 # the last marker on the list (or the start of the list
1421                                                 # if there is no marker on the list), then this is a
1422                                                 # parse error; run the adoption agency algorithm for
1423                                                 # the tag name "a", then remove that element from the
1424                                                 # list of active formatting elements and the stack of
1425                                                 # open elements if the adoption agency algorithm didn't
1426                                                 # already remove it (it might not have if the element
1427                                                 # is not in table scope).
1428                                                 found = false
1429                                                 for el in afe
1430                                                         if el.type is TYPE_AFE_MARKER
1431                                                                 break
1432                                                         if el.name is 'a'
1433                                                                 found = el
                                                     # plain truthiness test: found starts out as false, and
                                                     # "found?" would be true for false as well
1434                                                 if found
1435                                                         parse_error()
1436                                                         adoption_agency 'a'
1437                                                         for el, i in afe
1438                                                                 if el is found
1439                                                                         afe.splice i, 1
1440                                                         for el, i in open_els
1441                                                                 if el is found
1442                                                                         open_els.splice i, 1
1443                                                 reconstruct_active_formatting_elements()
1444                                                 el = insert_html_element t
1445                                                 afe_push el
1446                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1447                                                 reconstruct_active_formatting_elements()
1448                                                 el = insert_html_element t
1449                                                 afe_push el
1450                                         when 'table'
1451                                                 # fixfull quirksmode thing
1452                                                 close_p_if_in_button_scope()
1453                                                 insert_html_element t
1454                                                 insertion_mode = ins_mode_in_table
1455                                         # TODO lots more to implement here
1456                                         else # any other start tag
1457                                                 reconstruct_active_formatting_elements()
1458                                                 insert_html_element t
1459                         when TYPE_EOF
1460                                 ok_tags = {
1461                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1462                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1463                                 }
                                     # loop with el rather than t: loop variables are function-scoped in
                                     # CoffeeScript, so "for t in ..." would overwrite the token parameter
1464                                 for el in open_els
1465                                         unless ok_tags[el.name]?
1466                                                 parse_error()
1467                                                 break
1468                                 # TODO stack of template insertion modes thing
1469                                 stop_parsing()
1470                         when TYPE_END_TAG
1471                                 switch t.name
1472                                         when 'body'
1473                                                 unless is_in_scope 'body'
1474                                                         parse_error()
1475                                                         return
1476                                                 # TODO implement parse error and move to tree_after_body
1477                                         when 'html'
1478                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1479                                                         parse_error()
1480                                                         return
1481                                                 # TODO implement parse error and move to tree_after_body, reprocess
1482                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1483                                                 unless is_in_scope t.name, NS_HTML
1484                                                         parse_error()
1485                                                         return
1486                                                 generate_implied_end_tags()
1487                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1488                                                         parse_error()
1489                                                 loop # pop up to and including the matching element
1490                                                         el = open_els.shift()
1491                                                         if el.name is t.name and el.namespace is NS_HTML
1492                                                                 return
1493                                         # TODO lots more close tags to implement here
1494                                         when 'p'
1495                                                 unless is_in_button_scope 'p'
1496                                                         parse_error()
1497                                                         insert_html_element new_open_tag 'p'
1498                                                 close_p_element()
1499                                         # TODO lots more close tags to implement here
1500                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1501                                                 adoption_agency t.name
1502                                         # TODO lots more close tags to implement here
1503                                         else
1504                                                 in_body_any_other_end_tag t.name
1505                 return
1506
1507         ins_mode_in_table_else = (t) ->
                     # Fallback used by ins_mode_in_table: reports a parse error, then processes
                     # t with the in-body rules while flag_foster_parenting is set. The flag is
                     # cleared again once ins_mode_in_body returns.
1508                 parse_error()
1509                 flag_foster_parenting = true # FIXME
1510                 ins_mode_in_body t
1511                 flag_foster_parenting = false
1512         can_in_table = { # FIXME do this inline like everywhere else
                     # lookup table keyed by tag name, consulted by ins_mode_in_table
1513                 table: true, tbody: true, tfoot: true,
1514                 thead: true, tr: true,
1518         }
1519
1520         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1521         ins_mode_text = (t) ->
1522                 switch t.type
1523                         when TYPE_TEXT
1524                                 insert_character t
1525                         when TYPE_EOF
1526                                 parse_error()
1527                                 if open_els[0].name is 'script'
1528                                         open_els[0].flag 'already started', true
1529                                 open_els.shift()
1530                                 # go back to the mode that was active before, and let it see the EOF
1531                                 insertion_mode = original_insertion_mode
1532                                 insertion_mode t
1533                         when TYPE_END_TAG
1534                                 # </script> and all other end tags are handled the same way here
1535                                 # fixfull the spec seems to assume that I'm going to run the script
1536                                 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1537                                 open_els.shift()
1538                                 insertion_mode = original_insertion_mode
1539                         else
1540                                 console.log 'warning: end of ins_mode_text reached'
1541                 return
1544
1545         # the functions below implement the tokenizer states described here:
1546         # http://www.w3.org/TR/html5/syntax.html#tokenization
1547
1548         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1549         ins_mode_in_table = (t) ->
1550                 switch t.type
1551                         when TYPE_TEXT
1552                                 if can_in_table[t.name]
1553                                         original_insertion_mode = insertion_mode
1554                                         insertion_mode = ins_mode_in_table_text
1555                                         insertion_mode t
1556                                 else
1557                                         ins_mode_in_table_else t
1558                         when TYPE_COMMENT
1559                                 insert_comment t
1560                         when TYPE_DOCTYPE
1561                                 parse_error()
1562                         when TYPE_START_TAG
1563                                 switch t.name
1564                                         when 'caption'
1565                                                 clear_stack_to_table_context()
1566                                                 afe_push_marker()
1567                                                 insert_html_element t
1568                                                 insertion_mode = ins_mode_in_caption
1569                                         when 'colgroup'
1570                                                 clear_stack_to_table_context()
1571                                                 insert_html_element t
1572                                                 insertion_mode = ins_mode_in_column_group
1573                                         when 'col'
1574                                                 clear_stack_to_table_context()
1575                                                 insert_html_element new_open_tag 'colgroup'
1576                                                 insertion_mode = ins_mode_in_column_group
1577                                                 insertion_mode t
1578                                         when 'tbody', 'tfoot', 'thead'
1579                                                 clear_stack_to_table_context()
1580                                                 insert_html_element t
1581                                                 insertion_mode = ins_mode_in_table_body
1582                                         when 'td', 'th', 'tr'
1583                                                 clear_stack_to_table_context()
1584                                                 insert_html_element new_open_tag 'tbody'
1585                                                 insertion_mode = ins_mode_in_table_body
1586                                                 insertion_mode t
1587                                         when 'table'
1588                                                 parse_error()
1589                                                 if is_in_table_scope 'table'
1590                                                         loop
1591                                                                 el = open_els.shift()
1592                                                                 if el.name is 'table'
1593                                                                         break
1594                                                         reset_insertion_mode()
1595                                                         insertion_mode t
1596                                         when 'style', 'script', 'template'
1597                                                 ins_mode_in_head t
1598                                         when 'input'
1599                                                 if token_is_input_hidden t
1600                                                         ins_mode_in_table_else t
1601                                                 else
1602                                                         parse_error()
1603                                                         el = insert_html_element t
1604                                                         open_els.shift()
1605                                                         t.acknowledge_self_closing()
1606                                         when 'form'
1607                                                 parse_error()
1608                                                 if form_element_pointer?
1609                                                         return
1610                                                 if template_tag_is_open()
1611                                                         return
1612                                                 form_element_pointer = insert_html_element t
1613                                                 open_els.shift()
1614                                         else
1615                                                 ins_mode_in_table_else t
1616                         when TYPE_END_TAG
1617                                 switch t.name
1618                                         when 'table'
1619                                                 if is_in_table_scope 'table'
1620                                                         loop
1621                                                                 el = open_els.shift()
1622                                                                 if el.name is 'table'
1623                                                                         break
1624                                                         reset_insertion_mode()
1625                                                 else
1626                                                         parse_error
1627                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1628                                                 parse_error()
1629                                         when 'template'
1630                                                 ins_mode_in_head t
1631                                         else
1632                                                 ins_mode_in_table_else t
1633                         when TYPE_EOF
1634                                 ins_mode_in_body t
1635                         else
1636                                 ins_mode_in_table_else t
1637
1638
1639         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
             #
             # "in table text" insertion mode. Text tokens are collected in
             # pending_table_character_tokens. The first non-text token flushes that
             # list, restores original_insertion_mode and is reprocessed there.
             #
             # t: the token to process
1640         ins_mode_in_table_text = (t) ->
1641                 if t.type is TYPE_TEXT and t.text is "\u0000"
1642                         # huh? I thought the tokenizer didn't emit these
1643                         parse_error()
1644                         return
1645                 if t.type is TYPE_TEXT
1646                         pending_table_character_tokens.push t
1647                         return
1648                 # Anything else
1649                 all_space = true
1650                 for old in pending_table_character_tokens
1651                         unless is_space_tok old
1652                                 all_space = false
1653                                 break
1654                 if all_space
1655                         for old in pending_table_character_tokens
1656                                 insert_character old
1657                 else
                             # at least one buffered token is not whitespace: run every
                             # buffered token through ins_mode_in_table_else, the handler
                             # ins_mode_in_table uses for its own fall-through cases
                             # (this previously called the undefined ins_mode_table_else)
1658                         for old in pending_table_character_tokens
1659                                 ins_mode_in_table_else old
1660                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1661                 insertion_mode = original_insertion_mode
1662                 insertion_mode t
1663
1664         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
             #
             # "in caption" insertion mode. Closes the open caption on </caption>, or
             # implicitly when a table-structure start tag or </table> arrives; every
             # token not handled here is passed to ins_mode_in_body.
             #
             # t: the token to process
1665         ins_mode_in_caption = (t) ->
1666                 if t.type is TYPE_END_TAG and t.name is 'caption'
1667                         if is_in_table_scope 'caption'
1668                                 generate_implied_end_tags()
1669                                 if open_els[0].name isnt 'caption'
1670                                         parse_error()
                                     # pop up to and including the caption element
1671                                 loop
1672                                         el = open_els.shift()
1673                                         if el.name is 'caption'
1674                                                 break
1675                                 clear_afe_to_marker()
                                     # was "in_table", which is not defined anywhere; the mode
                                     # function is named ins_mode_in_table
1676                                 insertion_mode = ins_mode_in_table
1677                         else
1678                                 parse_error()
1679                                 # fragment case
1680                         return
1681                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1682                         parse_error()
1683                         if is_in_table_scope 'caption'
1684                                 loop
1685                                         el = open_els.shift()
1686                                         if el.name is 'caption'
1687                                                 break
1688                                 clear_afe_to_marker()
                                     # same fix as above, then reprocess the token in that mode
1689                                 insertion_mode = ins_mode_in_table
1690                                 insertion_mode t
1691                         # else fragment case
1692                         return
1693                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1694                         parse_error()
1695                         return
1696                 # Anything else
1697                 ins_mode_in_body t
1698
1699         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
             #
             # "in column group" insertion mode. Inserts <col> elements (popping them
             # straight away), closes the colgroup on </colgroup>, and for any token not
             # listed pops the colgroup and reprocesses the token in ins_mode_in_table.
             #
             # t: the token to process
1700         ins_mode_in_column_group = (t) ->
1701                 if is_space_tok t
1702                         insert_character t
1703                         return
1704                 if t.type is TYPE_COMMENT
1705                         insert_comment t
1706                         return
1707                 if t.type is TYPE_DOCTYPE
1708                         parse_error()
1709                         return
1710                 if t.type is TYPE_START_TAG and t.name is 'html'
1711                         ins_mode_in_body t
1712                         return
1713                 if t.type is TYPE_START_TAG and t.name is 'col'
1714                         el = insert_html_element t
                             # col never stays open: pop it immediately
1715                         open_els.shift()
1716                         t.acknowledge_self_closing()
1717                         return
1718                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1719                         if open_els[0].name is 'colgroup'
                                     # pop the colgroup off the stack (this used to call
                                     # .shift() on the element itself instead of on open_els)
1720                                 open_els.shift()
1721                                 insertion_mode = ins_mode_in_table
1722                         else
1723                                 parse_error()
1724                         return
1725                 if t.type is TYPE_END_TAG and t.name is 'col'
1726                         parse_error()
1727                         return
1728                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1729                         ins_mode_in_head t
1730                         return
1731                 if t.type is TYPE_EOF
1732                         ins_mode_in_body t
1733                         return
1734                 # Anything else
1735                 if open_els[0].name isnt 'colgroup'
1736                         parse_error()
1737                         return
1738                 open_els.shift()
1739                 insertion_mode = ins_mode_in_table
1740                 insertion_mode t
1741                 return
1742
1743         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
             #
             # "in table body" insertion mode (inside tbody/thead/tfoot). Opens rows,
             # closes the current table section, and passes every token not handled
             # here to ins_mode_in_table.
             #
             # t: the token to process
1744         ins_mode_in_table_body = (t) ->
1745                 if t.type is TYPE_START_TAG and t.name is 'tr'
1746                         clear_stack_to_table_body_context()
1747                         insert_html_element t
1748                         insertion_mode = ins_mode_in_row
1749                         return
1750                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
                             # cell without a row: insert an implied <tr>, then reprocess
                             # the cell token in ins_mode_in_row
1751                         parse_error()
1752                         clear_stack_to_table_body_context()
1753                         insert_html_element new_open_tag 'tr'
1754                         insertion_mode = ins_mode_in_row
1755                         insertion_mode t
1756                         return
1757                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1758                         unless is_in_table_scope t.name # fixfull check namespace
1759                                 parse_error()
1760                                 return
1761                         clear_stack_to_table_body_context()
1762                         open_els.shift()
1763                         insertion_mode = ins_mode_in_table
1764                         return
1765                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
                             # walk open_els from the current node (index 0) looking for a
                             # tbody/tfoot/thead; stop early at any element whose name has an
                             # entry in table_scopers
1766                         has = false
1767                         for el in open_els
1768                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1769                                         has = true
1770                                         break
1771                                 if table_scopers[el.name]
1772                                         break
1773                         if !has
1774                                 parse_error()
1775                                 return
                             # pop the current node (after the clear above), then reprocess
                             # the token in ins_mode_in_table
1776                         clear_stack_to_table_body_context()
1777                         open_els.shift()
1778                         insertion_mode = ins_mode_in_table
1779                         insertion_mode t
1780                         return
1781                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1782                         parse_error()
1783                         return
1784                 # Anything else
1785                 ins_mode_in_table t
1786
1787         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
             #
             # "in row" insertion mode (inside a tr). Opens cells, closes the row on
             # </tr> or implicitly on table-structure tokens, and passes every token
             # not handled here to ins_mode_in_table.
             #
             # t: the token to process
1788         ins_mode_in_row = (t) ->
1789                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1790                         clear_stack_to_table_row_context()
1791                         insert_html_element t
1792                         insertion_mode = ins_mode_in_cell
1793                         afe_push_marker()
1794                         return
1795                 if t.type is TYPE_END_TAG and t.name is 'tr'
1796                         if is_in_table_scope 'tr'
1797                                 clear_stack_to_table_row_context()
1798                                 open_els.shift()
1799                                 insertion_mode = ins_mode_in_table_body
1800                         else
1801                                 parse_error()
1802                         return
1803                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
                             # implicitly close the row, then reprocess the token in
                             # ins_mode_in_table_body
1804                         if is_in_table_scope 'tr'
1805                                 clear_stack_to_table_row_context()
1806                                 open_els.shift()
1807                                 insertion_mode = ins_mode_in_table_body
1808                                 insertion_mode t
1809                         else
1810                                 parse_error()
1811                         return
1812                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1813                         if is_in_table_scope t.name # fixfull namespace
                                     # when no tr is in table scope the token is dropped without
                                     # reporting a parse error
1814                                 if is_in_table_scope 'tr'
1815                                         clear_stack_to_table_row_context()
1816                                         open_els.shift()
1817                                         insertion_mode = ins_mode_in_table_body
1818                                         insertion_mode t
1819                         else
1820                                 parse_error()
1821                         return
1822                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1823                         parse_error()
1824                         return
1825                 # Anything else
1826                 ins_mode_in_table t
1827
1828         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
             #
             # Closes the currently open td/th: generates implied end tags, pops
             # open_els up to and including the cell, clears the active formatting
             # elements back to the last marker and switches to ins_mode_in_row.
1829         close_the_cell = ->
1830                 generate_implied_end_tags()
                     # compare the element's name in both halves; the 'th' half used to
                     # compare the element object itself to a string, which is never equal,
                     # so every th cell reported a parse error
1831                 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1832                         parse_error()
1833                 loop
1834                         el = open_els.shift()
1835                         if el.name is 'td' or el.name is 'th'
1836                                 break
1837                 clear_afe_to_marker()
1838                 insertion_mode = ins_mode_in_row
1839
1840         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
             #
             # "in cell" insertion mode (inside td/th). Closes the cell on its own end
             # tag, or implicitly (via close_the_cell) on table-structure tokens, which
             # are then reprocessed; every other token goes to ins_mode_in_body.
             #
             # t: the token to process
1841         ins_mode_in_cell = (t) ->
1842                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1843                         if is_in_table_scope t.name
1844                                 generate_implied_end_tags()
1845                                 if open_els[0].name isnt t.name
                                             # a bare "parse_error" only references the function;
                                             # the parentheses are needed to actually call it
1846                                         parse_error()
1847                                 loop
1848                                         el = open_els.shift()
1849                                         if el.name is t.name
1850                                                 break
1851                                 clear_afe_to_marker()
1852                                 insertion_mode = ins_mode_in_row
1853                         else
1854                                 parse_error()
1855                         return
1856                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
                             # walk open_els from the current node looking for a td/th; stop
                             # early at any element whose name has an entry in table_scopers
1857                         has = false
1858                         for el in open_els
1859                                 if el.name is 'td' or el.name is 'th'
1860                                         has = true
1861                                         break
1862                                 if table_scopers[el.name]
1863                                         break
1864                         if !has
1865                                 parse_error()
1866                                 return
                             # close_the_cell sets insertion_mode, so this reprocesses the
                             # token in the new mode
1867                         close_the_cell()
1868                         insertion_mode t
1869                         return
1870                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1871                         parse_error()
1872                         return
1873                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1874                         if is_in_table_scope t.name # fixfull namespace
1875                                 close_the_cell()
1876                                 insertion_mode t
1877                         else
1878                                 parse_error()
1879                         return
1880                 # Anything Else
1881                 ins_mode_in_body t
1882
1883         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
             #
             # "in select" insertion mode. Handles text, comments, option/optgroup
             # nesting and the tokens that close the select; any token not listed is
             # reported with parse_error() and dropped.
             #
             # t: the token to process
1884         ins_mode_in_select = (t) ->
1885                 if t.type is TYPE_TEXT and t.text is "\u0000"
1886                         parse_error()
1887                         return
1888                 if t.type is TYPE_TEXT
1889                         insert_character t
1890                         return
1891                 if t.type is TYPE_COMMENT
1892                         insert_comment t
1893                         return
1894                 if t.type is TYPE_DOCTYPE
1895                         parse_error()
1896                         return
1897                 if t.type is TYPE_START_TAG and t.name is 'html'
1898                         ins_mode_in_body t
1899                         return
1900                 if t.type is TYPE_START_TAG and t.name is 'option'
                             # a new option implicitly closes an open one
1901                         if open_els[0].name is 'option'
1902                                 open_els.shift()
1903                         insert_html_element t
1904                         return
1905                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
                             # a new optgroup implicitly closes an open option and optgroup
1906                         if open_els[0].name is 'option'
1907                                 open_els.shift()
1908                         if open_els[0].name is 'optgroup'
1909                                 open_els.shift()
1910                         insert_html_element t
1911                         return
1912                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1913                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1914                                 open_els.shift()
1915                         if open_els[0].name is 'optgroup'
1916                                 open_els.shift()
1917                         else
1918                                 parse_error()
1919                         return
1920                 if t.type is TYPE_END_TAG and t.name is 'option'
1921                         if open_els[0].name is 'option'
1922                                 open_els.shift()
1923                         else
1924                                 parse_error()
1925                         return
1926                 if t.type is TYPE_END_TAG and t.name is 'select'
1927                         if is_in_select_scope 'select'
1928                                 loop
1929                                         el = open_els.shift()
1930                                         if el.name is 'select'
1931                                                 break
1932                                 reset_insertion_mode()
1933                         else
1934                                 parse_error()
1935                         return
1936                 if t.type is TYPE_START_TAG and t.name is 'select'
1937                         parse_error()
1938                         loop
1939                                 el = open_els.shift()
1940                                 if el.name is 'select'
1941                                         break
1942                         reset_insertion_mode()
1943                         # spec says that this is the same as </select> but it doesn't say
1944                         # to check scope first
1945                         return
1946                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1947                         parse_error()
                             # ignore the token only when there is NO select in select scope
                             # (fragment case). The test used to be inverted: it dropped the
                             # token in the normal case and otherwise popped the stack looking
                             # for a select that was not there.
1948                         unless is_in_select_scope 'select'
1949                                 return
1950                         loop
1951                                 el = open_els.shift()
1952                                 if el.name is 'select'
1953                                         break
1954                         reset_insertion_mode()
1955                         insertion_mode t
1956                         return
1957                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1958                         ins_mode_in_head t
1959                         return
1960                 if t.type is TYPE_EOF
1961                         ins_mode_in_body t
1962                         return
1963                 # Anything else
1964                 parse_error()
1965                 return
1966
1967         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
             #
             # "in select in table" insertion mode. Table-structure start and end tags
             # close the open select (popping open_els through it) and are reprocessed
             # after reset_insertion_mode(); every other token goes to ins_mode_in_select.
             #
             # t: the token to process
1968         ins_mode_in_select_in_table = (t) ->
1969                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1970                         parse_error()
1971                         loop
1972                                 el = open_els.shift()
1973                                 if el.name is 'select'
1974                                         break
1975                         reset_insertion_mode()
1976                         insertion_mode t
1977                         return
1978                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1979                         parse_error()
                             # an end tag with no matching element in table scope is dropped
                             # NOTE(review): other callers in this part of the file pass
                             # is_in_table_scope a single argument -- confirm it accepts a
                             # namespace as second argument
1980                         unless is_in_table_scope t.name, NS_HTML
1981                                 return
1982                         loop
1983                                 el = open_els.shift()
1984                                 if el.name is 'select'
1985                                         break
1986                         reset_insertion_mode()
1987                         insertion_mode t
1988                         return
1989                 # Anything else
1990                 ins_mode_in_select t
1991                 return
1992
1993         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
             #
             # "in template" insertion mode. For a start tag, replaces the entry at
             # index 0 of template_insertion_modes with the mode chosen for that tag,
             # switches to it and reprocesses the token there. At EOF, closes a
             # still-open template and reprocesses the EOF token.
             #
             # t: the token to process
1994         ins_mode_in_template = (t) ->
1995                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
1996                         ins_mode_in_body t
1997                         return
1998                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1999                         ins_mode_in_head t
2000                         return
2001                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2002                         template_insertion_modes.shift()
2003                         template_insertion_modes.unshift ins_mode_in_table
2004                         insertion_mode = ins_mode_in_table
2005                         insertion_mode t
2006                         return
2007                 if t.type is TYPE_START_TAG and t.name is 'col'
2008                         template_insertion_modes.shift()
2009                         template_insertion_modes.unshift ins_mode_in_column_group
2010                         insertion_mode = ins_mode_in_column_group
2011                         insertion_mode t
2012                         return
2013                 if t.type is TYPE_START_TAG and t.name is 'tr'
2014                         template_insertion_modes.shift()
2015                         template_insertion_modes.unshift ins_mode_in_table_body
2016                         insertion_mode = ins_mode_in_table_body
2017                         insertion_mode t
2018                         return
2019                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2020                         template_insertion_modes.shift()
2021                         template_insertion_modes.unshift ins_mode_in_row
2022                         insertion_mode = ins_mode_in_row
2023                         insertion_mode t
2024                         return
2025                 if t.type is TYPE_START_TAG
2026                         template_insertion_modes.shift()
2027                         template_insertion_modes.unshift ins_mode_in_body
2028                         insertion_mode = ins_mode_in_body
2029                         insertion_mode t
2030                         return
2031                 if t.type is TYPE_END_TAG
2032                         parse_error()
2033                         return
                     # token types are the TYPE_* constants; this used to compare against
                     # a bare "EOF", so the branch could never match an EOF token
2034                 if t.type is TYPE_EOF
2035                         unless template_tag_is_open()
2036                                 stop_parsing()
2037                                 return
2038                         parse_error()
2039                         loop
2040                                 el = open_els.shift()
2041                                 if el.name is 'template' # fixfull check namespace
2042                                         break
2043                         clear_afe_to_marker()
2044                         template_insertion_modes.shift()
2045                         reset_insertion_mode()
2046                         insertion_mode t
2047
2048         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
             #
             # "after body" insertion mode. Whitespace and <html> go to
             # ins_mode_in_body, </html> switches to ins_mode_after_after_body, EOF
             # stops parsing; any other token is a parse error and is reprocessed in
             # ins_mode_in_body.
             #
             # t: the token to process
2049         ins_mode_after_body = (t) ->
2050                 if is_space_tok t
2051                         ins_mode_in_body t
2052                         return
2053                 if t.type is TYPE_COMMENT
                             # the spec appends the comment to the *first* element in the
                             # stack of open elements. open_els keeps the current node at
                             # index 0, so the first (topmost) element is the last entry --
                             # not open_els[0], which this used to target.
2054                         insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2055                         return
2056                 if t.type is TYPE_DOCTYPE
2057                         parse_error()
2058                         return
2059                 if t.type is TYPE_START_TAG and t.name is 'html'
2060                         ins_mode_in_body t
2061                         return
2062                 if t.type is TYPE_END_TAG and t.name is 'html'
2063                         # fixfull fragment case
2064                         insertion_mode = ins_mode_after_after_body
2065                         return
2066                 if t.type is TYPE_EOF
2067                         stop_parsing()
2068                         return
2069                 # Anything else
2070                 parse_error()
2071                 insertion_mode = ins_mode_in_body
2072                 insertion_mode t
2073
2074         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
             #
             # "in frameset" insertion mode. Inserts nested frameset/frame elements,
             # pops the frameset on </frameset>, sends <noframes> to ins_mode_in_head;
             # any token not listed is reported with parse_error() and dropped.
             #
             # t: the token to process
2075         ins_mode_in_frameset = (t) ->
2076                 if is_space_tok t
2077                         insert_character t
2078                         return
2079                 if t.type is TYPE_COMMENT
2080                         insert_comment t
2081                         return
2082                 if t.type is TYPE_DOCTYPE
2083                         parse_error()
2084                         return
2085                 if t.type is TYPE_START_TAG and t.name is 'html'
2086                         ins_mode_in_body t
2087                         return
2088                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2089                         insert_html_element t
2090                         return
2091                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2092                         # TODO ?correct for: "if the current node is the root html element"
2093                         if open_els.length is 1
2094                                 parse_error()
2095                                 return # fragment case
2096                         open_els.shift()
2097                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2098                                 insertion_mode = ins_mode_after_frameset
2099                         return
2100                 if t.type is TYPE_START_TAG and t.name is 'frame'
2101                         insert_html_element t
                             # frame never stays open: pop it immediately
2102                         open_els.shift()
2103                         t.acknowledge_self_closing()
2104                         return
                     # was "TYPE_START TAG" (space instead of underscore), which CoffeeScript
                     # reads as a call TYPE_START(TAG and ...) rather than a comparison
2105                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2106                         ins_mode_in_head t
2107                         return
2108                 if t.type is TYPE_EOF
2109                         # TODO ?correct for: "if the current node is not the root html element"
2110                         if open_els.length isnt 1
2111                                 parse_error()
2112                         stop_parsing()
2113                         return
2114                 # Anything else
2115                 parse_error()
2116                 return
2117
2118         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
             #
             # "after frameset" insertion mode. Accepts whitespace, comments and
             # <noframes>; </html> switches to ins_mode_after_after_frameset; EOF stops
             # parsing; any token not listed is reported with parse_error() and dropped.
             #
             # t: the token to process
2119         ins_mode_after_frameset = (t) ->
2120                 if is_space_tok t
2121                         insert_character t
2122                         return
2123                 if t.type is TYPE_COMMENT
2124                         insert_comment t
2125                         return
2126                 if t.type is TYPE_DOCTYPE
2127                         parse_error()
2128                         return
2129                 if t.type is TYPE_START_TAG and t.name is 'html'
2130                         ins_mode_in_body t
2131                         return
2132                 if t.type is TYPE_END_TAG and t.name is 'html'
                             # the parser's mode variable is insertion_mode; this used to
                             # assign to a stray "insert_mode", so the mode never changed
2133                         insertion_mode = ins_mode_after_after_frameset
2134                         return
2135                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2136                         ins_mode_in_head t
2137                         return
2138                 if t.type is TYPE_EOF
2139                         stop_parsing()
2140                         return
2141                 # Anything else
2142                 parse_error()
2143                 return
2144
2145         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
             #
             # "after after body" insertion mode. Comments are appended to doc;
             # doctype, whitespace and <html> go to ins_mode_in_body; EOF stops
             # parsing; any other token is a parse error and is reprocessed in
             # ins_mode_in_body.
             #
             # t: the token to process
2146         ins_mode_after_after_body = (t) ->
2147                 if t.type is TYPE_COMMENT
2148                         insert_comment t, [doc, doc.children.length]
2149                         return
2150                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2151                         ins_mode_in_body t
2152                         return
2153                 if t.type is TYPE_EOF
2154                         stop_parsing()
2155                         return
2156                 # Anything else
2157                 parse_error()
2158                 insertion_mode = ins_mode_in_body
                     # reprocess the token in the new mode (the spec requires it, and
                     # ins_mode_after_body does the same); without this call the token
                     # was silently dropped
                     insertion_mode t
2159                 return
2160
2161         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
             #
             # "after after frameset" insertion mode. Comments are appended to doc;
             # doctype, whitespace and <html> go to ins_mode_in_body; <noframes> goes
             # to ins_mode_in_head; EOF stops parsing; any token not listed is reported
             # with parse_error() and dropped.
             #
             # t: the token to process
2162         ins_mode_after_after_frameset = (t) ->
2163                 if t.type is TYPE_COMMENT
                             # append as the last child of doc
2164                         insert_comment t, [doc, doc.children.length]
2165                         return
2166                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2167                         ins_mode_in_body t
2168                         return
2169                 if t.type is TYPE_EOF
2170                         stop_parsing()
2171                         return
2172                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2173                         ins_mode_in_head t
2174                         return
2175                 # Anything else
2176                 parse_error()
2177                 return
2178
2179
2180
2181
2182
2183         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2184         tok_state_data = ->
2185                 switch c = txt.charAt(cur++)
2186                         when '&'
2187                                 return new_text_node parse_character_reference()
2188                         when '<'
2189                                 tok_state = tok_state_tag_open
2190                         when "\u0000"
2191                                 parse_error()
2192                                 return new_text_node c
2193                         when '' # EOF
2194                                 return new_eof_token()
2195                         else
2196                                 return new_text_node c
2197                 return null
2198
2199         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2200         # not needed: tok_state_character_reference_in_data = ->
2201         # just call parse_character_reference()
2202
2203         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2204         tok_state_rcdata = ->
2205                 switch c = txt.charAt(cur++)
2206                         when '&'
2207                                 return new_text_node parse_character_reference()
2208                         when '<'
2209                                 tok_state = tok_state_rcdata_less_than_sign
2210                         when "\u0000"
2211                                 parse_error()
2212                                 return new_character_token "\ufffd"
2213                         when '' # EOF
2214                                 return new_eof_token()
2215                         else
2216                                 return new_character_token c
2217                 return null
2218
2219         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2220         # not needed: tok_state_character_reference_in_rcdata = ->
2221         # just call parse_character_reference()
2222
2223         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2224         tok_state_rawtext = ->
2225                 switch c = txt.charAt(cur++)
2226                         when '<'
2227                                 tok_state = tok_state_rawtext_less_than_sign
2228                         when "\u0000"
2229                                 parse_error()
2230                                 return new_character_token "\ufffd"
2231                         when '' # EOF
2232                                 return new_eof_token()
2233                         else
2234                                 return new_character_token c
2235                 return null
2236
2237         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2238         tok_state_script_data = ->
2239                 switch c = txt.charAt(cur++)
2240                         when '<'
2241                                 tok_state = tok_state_script_data_less_than_sign
2242                         when "\u0000"
2243                                 parse_error()
2244                                 return new_character_token "\ufffd"
2245                         when '' # EOF
2246                                 return new_eof_token()
2247                         else
2248                                 return new_character_token c
2249                 return null
2250
2251         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2252         tok_state_plaintext = ->
2253                 switch c = txt.charAt(cur++)
2254                         when "\u0000"
2255                                 parse_error()
2256                                 return new_character_token "\ufffd"
2257                         when '' # EOF
2258                                 return new_eof_token()
2259                         else
2260                                 return new_character_token c
2261                 return null
2262
2263
2264         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2265         tok_state_tag_open = ->
2266                 switch c = txt.charAt(cur++)
2267                         when '!'
2268                                 tok_state = tok_state_markup_declaration_open
2269                         when '/'
2270                                 tok_state = tok_state_end_tag_open
2271                         when '?'
2272                                 parse_error()
2273                                 tok_state = tok_state_bogus_comment
2274                         else
2275                                 if lc_alpha.indexOf(c) > -1
2276                                         tok_cur_tag = new_open_tag c
2277                                         tok_state = tok_state_tag_name
2278                                 else if uc_alpha.indexOf(c) > -1
2279                                         tok_cur_tag = new_open_tag c.toLowerCase()
2280                                         tok_state = tok_state_tag_name
2281                                 else
2282                                         parse_error()
2283                                         tok_state = tok_state_data
2284                                         cur -= 1 # we didn't parse/handle the char after <
2285                                         return new_text_node '<'
2286                 return null
2287
2288         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2289         tok_state_end_tag_open = ->
2290                 switch c = txt.charAt(cur++)
2291                         when '>'
2292                                 parse_error()
2293                                 tok_state = tok_state_data
2294                         when '' # EOF
2295                                 parse_error()
2296                                 tok_state = tok_state_data
2297                                 return new_text_node '</'
2298                         else
2299                                 if uc_alpha.indexOf(c) > -1
2300                                         tok_cur_tag = new_end_tag c.toLowerCase()
2301                                         tok_state = tok_state_tag_name
2302                                 else if lc_alpha.indexOf(c) > -1
2303                                         tok_cur_tag = new_end_tag c
2304                                         tok_state = tok_state_tag_name
2305                                 else
2306                                         parse_error()
2307                                         tok_state = tok_state_bogus_comment
2308                 return null
2309
2310         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2311         tok_state_tag_name = ->
2312                 switch c = txt.charAt(cur++)
2313                         when "\t", "\n", "\u000c", ' '
2314                                 tok_state = tok_state_before_attribute_name
2315                         when '/'
2316                                 tok_state = tok_state_self_closing_start_tag
2317                         when '>'
2318                                 tok_state = tok_state_data
2319                                 tmp = tok_cur_tag
2320                                 tok_cur_tag = null
2321                                 return tmp
2322                         when "\u0000"
2323                                 parse_error()
2324                                 tok_cur_tag.name += "\ufffd"
2325                         when '' # EOF
2326                                 parse_error()
2327                                 tok_state = tok_state_data
2328                         else
2329                                 if uc_alpha.indexOf(c) > -1
2330                                         tok_cur_tag.name += c.toLowerCase()
2331                                 else
2332                                         tok_cur_tag.name += c
2333                 return null
2334
2335         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2336         tok_state_rcdata_less_than_sign = ->
2337                 c = txt.charAt(cur++)
2338                 if c is '/'
2339                         temporary_buffer = ''
2340                         tok_state = tok_state_rcdata_end_tag_open
2341                         return null
2342                 # Anything else
2343                 tok_state = tok_state_rcdata
2344                 cur -= 1 # reconsume the input character
2345                 return new_character_token '<'
2346
2347         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2348         tok_state_rcdata_end_tag_open = ->
2349                 c = txt.charAt(cur++)
2350                 if uc_alpha.indexOf(c) > -1
2351                         tok_cur_tag = new_end_tag c.toLowerCase()
2352                         temporary_buffer += c
2353                         tok_state = tok_state_rcdata_end_tag_name
2354                         return null
2355                 if lc_alpha.indexOf(c) > -1
2356                         tok_cur_tag = new_end_tag c
2357                         temporary_buffer += c
2358                         tok_state = tok_state_rcdata_end_tag_name
2359                         return null
2360                 # Anything else
2361                 tok_state = tok_state_rcdata
2362                 cur -= 1 # reconsume the input character
2363                 return new_character_token "</" # fixfull separate these
2364
2365         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2366         is_appropriate_end_tag = (t) ->
2367                 # spec says to check against "the tag name of the last start tag to
2368                 # have been emitted from this tokenizer", but this is only called from
2369                 # the various "raw" states, which I'm pretty sure all push the start
2370                 # token onto open_els. TODO: verify this after the script data states
2371                 # are implemented
2372                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2373                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2374
2375         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2376         tok_state_rcdata_end_tag_name = ->
2377                 c = txt.charAt(cur++)
2378                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2379                         if is_appropriate_end_tag tok_cur_tag
2380                                 tok_state = tok_state_before_attribute_name
2381                                 return
2382                         # else fall through to "Anything else"
2383                 if c is '/'
2384                         if is_appropriate_end_tag tok_cur_tag
2385                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2386                                 return
2387                         # else fall through to "Anything else"
2388                 if c is '>'
2389                         if is_appropriate_end_tag tok_cur_tag
2390                                 tok_state = tok_state_data
2391                                 return tok_cur_tag
2392                         # else fall through to "Anything else"
2393                 if uc_alpha.indexOf(c) > -1
2394                         tok_cur_tag.name += c.toLowerCase()
2395                         temporary_buffer += c
2396                         return null
2397                 if lc_alpha.indexOf(c) > -1
2398                         tok_cur_tag.name += c
2399                         temporary_buffer += c
2400                         return null
2401                 # Anything else
2402                 tok_state = tok_state_rcdata
2403                 cur -= 1 # reconsume the input character
2404                 return new_character_token '</' + temporary_buffer # fixfull separate these
2405
2406         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2407         tok_state_rawtext_less_than_sign = ->
2408                 c = txt.charAt(cur++)
2409                 if c is '/'
2410                         temporary_buffer = ''
2411                         tok_state = tok_state_rawtext_end_tag_open
2412                         return null
2413                 # Anything else
2414                 tok_state = tok_state_rawtext
2415                 cur -= 1 # reconsume the input character
2416                 return new_character_token '<'
2417
2418         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2419         tok_state_rawtext_end_tag_open = ->
2420                 c = txt.charAt(cur++)
2421                 if uc_alpha.indexOf(c) > -1
2422                         tok_cur_tag = new_end_tag c.toLowerCase()
2423                         temporary_buffer += c
2424                         tok_state = tok_state_rawtext_end_tag_name
2425                         return null
2426                 if lc_alpha.indexOf(c) > -1
2427                         tok_cur_tag = new_end_tag c
2428                         temporary_buffer += c
2429                         tok_state = tok_state_rawtext_end_tag_name
2430                         return null
2431                 # Anything else
2432                 tok_state = tok_state_rawtext
2433                 cur -= 1 # reconsume the input character
2434                 return new_character_token "</" # fixfull separate these
2435
2436         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2437         tok_state_rawtext_end_tag_name = ->
2438                 c = txt.charAt(cur++)
2439                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2440                         if is_appropriate_end_tag tok_cur_tag
2441                                 tok_state = tok_state_before_attribute_name
2442                                 return
2443                         # else fall through to "Anything else"
2444                 if c is '/'
2445                         if is_appropriate_end_tag tok_cur_tag
2446                                 tok_state = tok_state_self_closing_start_tag
2447                                 return
2448                         # else fall through to "Anything else"
2449                 if c is '>'
2450                         if is_appropriate_end_tag tok_cur_tag
2451                                 tok_state = tok_state_data
2452                                 return tok_cur_tag
2453                         # else fall through to "Anything else"
2454                 if uc_alpha.indexOf(c) > -1
2455                         tok_cur_tag.name += c.toLowerCase()
2456                         temporary_buffer += c
2457                         return null
2458                 if lc_alpha.indexOf(c) > -1
2459                         tok_cur_tag.name += c
2460                         temporary_buffer += c
2461                         return null
2462                 # Anything else
2463                 tok_state = tok_state_rawtext
2464                 cur -= 1 # reconsume the input character
2465                 return new_character_token '</' + temporary_buffer # fixfull separate these
2466
2467         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2468
2469         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2470         tok_state_before_attribute_name = ->
2471                 attr_name = null
2472                 switch c = txt.charAt(cur++)
2473                         when "\t", "\n", "\u000c", ' '
2474                                 return null
2475                         when '/'
2476                                 tok_state = tok_state_self_closing_start_tag
2477                                 return null
2478                         when '>'
2479                                 tok_state = tok_state_data
2480                                 tmp = tok_cur_tag
2481                                 tok_cur_tag = null
2482                                 return tmp
2483                         when "\u0000"
2484                                 parse_error()
2485                                 attr_name = "\ufffd"
2486                         when '"', "'", '<', '='
2487                                 parse_error()
2488                                 attr_name = c
2489                         when '' # EOF
2490                                 parse_error()
2491                                 tok_state = tok_state_data
2492                         else
2493                                 if uc_alpha.indexOf(c) > -1
2494                                         attr_name = c.toLowerCase()
2495                                 else
2496                                         attr_name = c
2497                 if attr_name?
2498                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2499                         tok_state = tok_state_attribute_name
2500                 return null
2501
2502         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2503         tok_state_attribute_name = ->
2504                 switch c = txt.charAt(cur++)
2505                         when "\t", "\n", "\u000c", ' '
2506                                 tok_state = tok_state_after_attribute_name
2507                         when '/'
2508                                 tok_state = tok_state_self_closing_start_tag
2509                         when '='
2510                                 tok_state = tok_state_before_attribute_value
2511                         when '>'
2512                                 tok_state = tok_state_data
2513                                 tmp = tok_cur_tag
2514                                 tok_cur_tag = null
2515                                 return tmp
2516                         when "\u0000"
2517                                 parse_error()
2518                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2519                         when '"', "'", '<'
2520                                 parse_error()
2521                                 tok_cur_tag.attrs_a[0][0] = c
2522                         when '' # EOF
2523                                 parse_error()
2524                                 tok_state = tok_state_data
2525                         else
2526                                 if uc_alpha.indexOf(c) > -1
2527                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2528                                 else
2529                                         tok_cur_tag.attrs_a[0][0] += c
2530                 return null
2531
2532         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2533         tok_state_after_attribute_name = ->
2534                 c = txt.charAt(cur++)
2535                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2536                         return
2537                 if c is '/'
2538                         tok_state = tok_state_self_closing_start_tag
2539                         return
2540                 if c is '='
2541                         tok_state = tok_state_before_attribute_value
2542                         return
2543                 if c is '>'
2544                         tok_state = tok_state_data
2545                         return
2546                 if uc_alpha.indexOf(c) > -1
2547                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2548                         tok_state = tok_state_attribute_name
2549                         return
2550                 if c is "\u0000"
2551                         parse_error()
2552                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2553                         tok_state = tok_state_attribute_name
2554                         return
2555                 if c is '' # EOF
2556                         parse_error()
2557                         tok_state = tok_state_data
2558                         cur -= 1 # reconsume
2559                         return
2560                 if c is '"' or c is "'" or c is '<'
2561                         parse_error()
2562                         # fall through to Anything else
2563                 # Anything else
2564                 tok_cur_tag.attrs_a.unshift [c, '']
2565                 tok_state = tok_state_attribute_name
2566
2567         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2568         tok_state_before_attribute_value = ->
2569                 switch c = txt.charAt(cur++)
2570                         when "\t", "\n", "\u000c", ' '
2571                                 return null
2572                         when '"'
2573                                 tok_state = tok_state_attribute_value_double_quoted
2574                         when '&'
2575                                 tok_state = tok_state_attribute_value_unquoted
2576                                 cur -= 1
2577                         when "'"
2578                                 tok_state = tok_state_attribute_value_single_quoted
2579                         when "\u0000"
2580                                 # Parse error
2581                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2582                                 tok_state = tok_state_attribute_value_unquoted
2583                         when '>'
2584                                 # Parse error
2585                                 tok_state = tok_state_data
2586                                 tmp = tok_cur_tag
2587                                 tok_cur_tag = null
2588                                 return tmp
2589                         when '' # EOF
2590                                 parse_error()
2591                                 tok_state = tok_state_data
2592                         else
2593                                 tok_cur_tag.attrs_a[0][1] += c
2594                                 tok_state = tok_state_attribute_value_unquoted
2595                 return null
2596
2597         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2598         tok_state_attribute_value_double_quoted = ->
2599                 switch c = txt.charAt(cur++)
2600                         when '"'
2601                                 tok_state = tok_state_after_attribute_value_quoted
2602                         when '&'
2603                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2604                         when "\u0000"
2605                                 # Parse error
2606                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2607                         when '' # EOF
2608                                 parse_error()
2609                                 tok_state = tok_state_data
2610                         else
2611                                 tok_cur_tag.attrs_a[0][1] += c
2612                 return null
2613
2614         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2615         tok_state_attribute_value_single_quoted = ->
2616                 switch c = txt.charAt(cur++)
2617                         when "'"
2618                                 tok_state = tok_state_after_attribute_value_quoted
2619                         when '&'
2620                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2621                         when "\u0000"
2622                                 # Parse error
2623                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2624                         when '' # EOF
2625                                 parse_error()
2626                                 tok_state = tok_state_data
2627                         else
2628                                 tok_cur_tag.attrs_a[0][1] += c
2629                 return null
2630
2631         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2632         tok_state_attribute_value_unquoted = ->
2633                 switch c = txt.charAt(cur++)
2634                         when "\t", "\n", "\u000c", ' '
2635                                 tok_state = tok_state_before_attribute_name
2636                         when '&'
2637                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2638                         when '>'
2639                                 tok_state = tok_state_data
2640                                 tmp = tok_cur_tag
2641                                 tok_cur_tag = null
2642                                 return tmp
2643                         when "\u0000"
2644                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2645                         when '' # EOF
2646                                 parse_error()
2647                                 tok_state = tok_state_data
2648                         else
2649                                 # Parse Error if ', <, = or ` (backtick)
2650                                 tok_cur_tag.attrs_a[0][1] += c
2651                 return null
2652
2653         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2654         tok_state_after_attribute_value_quoted = ->
2655                 switch c = txt.charAt(cur++)
2656                         when "\t", "\n", "\u000c", ' '
2657                                 tok_state = tok_state_before_attribute_name
2658                         when '/'
2659                                 tok_state = tok_state_self_closing_start_tag
2660                         when '>'
2661                                 tok_state = tok_state_data
2662                                 tmp = tok_cur_tag
2663                                 tok_cur_tag = null
2664                                 return tmp
2665                         when '' # EOF
2666                                 parse_error()
2667                                 tok_state = tok_state_data
2668                         else
2669                                 # Parse Error
2670                                 tok_state = tok_state_before_attribute_name
2671                                 cur -= 1 # we didn't handle that char
2672                 return null
2673
2674         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2675         # Don't set this as a state, just call it
2676         # returns a string (NOT a text node)
2677         parse_character_reference = (allowed_char = null, in_attr = false) ->
2678                 if cur >= txt.length
2679                         return '&'
2680                 switch c = txt.charAt(cur)
2681                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2682                                 # explicitly not a parse error
2683                                 return '&'
2684                         when ';'
2685                                 # there has to be "one or more" alnums between & and ; to be a parse error
2686                                 return '&'
2687                         when '#'
2688                                 if cur + 1 >= txt.length
2689                                         return '&'
2690                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2691                                         prefix = '#x'
2692                                         charset = hex_chars
2693                                         start = cur + 2
2694                                 else
2695                                         charset = digits
2696                                         start = cur + 1
2697                                         prefix = '#'
2698                                 i = 0
2699                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2700                                         i += 1
2701                                 if i is 0
2702                                         return '&'
2703                                 if txt.charAt(start + i) is ';'
2704                                         i += 1
2705                                 # FIXME This is supposed to generate parse errors for some chars
2706                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2707                                 if decoded?
2708                                         cur = start + i
2709                                         return decoded
2710                                 return '&'
2711                         else
2712                                 for i in [0...31]
2713                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2714                                                 break
2715                                 if i is 0
2716                                         # exit early, because parse_error() below needs at least one alnum
2717                                         return '&'
2718                                 if txt.charAt(cur + i) is ';'
2719                                         i += 1 # include ';' terminator in value
2720                                         decoded = decode_named_char_ref txt.substr(cur, i)
2721                                         if decoded?
2722                                                 cur += i
2723                                                 return decoded
2724                                         parse_error()
2725                                         return '&'
2726                                 else
2727                                         # no ';' terminator (only legacy char refs)
2728                                         max = i
2729                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2730                                                 c = legacy_char_refs[txt.substr(cur, i)]
2731                                                 if c?
2732                                                         if in_attr
2733                                                                 if txt.charAt(cur + i) is '='
2734                                                                         # "because some legacy user agents will
2735                                                                         # misinterpret the markup in those cases"
2736                                                                         parse_error()
2737                                                                         return '&'
2738                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2739                                                                         # this makes attributes forgiving about url args
2740                                                                         return '&'
2741                                                         # ok, and besides the weird exceptions for attributes...
2742                                                         # return the matching char
2743                                                         cur += i # consume entity chars
2744                                                         parse_error() # because no terminating ";"
2745                                                         return c
2746                                         parse_error()
2747                                         return '&'
2748                 return # never reached
2749
2750         # tree constructor initialization
2751         # see comments on TYPE_TAG/etc for the structure of this data
2752         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2753         open_els = [doc]
2754         afe = [] # active formatting elements
2755         template_insertion_modes = []
2756         insertion_mode = ins_mode_initial
2757         original_insertion_mode = insertion_mode # TODO check spec
2758         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2759         flag_frameset_ok = true
2760         flag_parsing = true
2761         flag_foster_parenting = false
2762         form_element_pointer = null
2763         temporary_buffer = null
2764         pending_table_character_tokens = []
2765         head_element_pointer = null
2766         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
2767
2768         # tokenizer initialization
2769         tok_state = tok_state_data
2770
2771         # proccess input
2772         while flag_parsing
2773                 t = tok_state()
2774                 if t?
2775                         insertion_mode t
2776                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
2777         return doc.children
2778
# Running tally of test outcomes: test_parser increments these counters and
# test_summary prints them.
test_results =
        passed: 0
        failed: 0
# everything below is tests on the above
# Compare `output` with `expected_output` using strict equality (`is`), log the
# outcome, and record it in the shared `test_results` tally so that the totals
# printed by test_summary include these checks as well.
#
# description:     label printed when the check fails
# output:          the value actually produced
# expected_output: the value the caller expects
#
# Returns nothing.
test_equals = (description, output, expected_output) ->
        if output is expected_output
                console.log "passed." # don't say name, so smart consoles can merge all of these
                test_results.passed += 1
        else
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
                test_results.failed += 1
        return
# Serialize every element of `els` and join the pieces with commas.
#
# els:      array of objects that expose a `serialize` method
# shallow:  passed through unchanged as the first argument of `serialize`
# show_ids: passed through unchanged as the second argument of `serialize`
#
# Returns the comma-separated string ('' when `els` is empty).
serialize_els = (els, shallow, show_ids) ->
        return ("#{t.serialize shallow, show_ids}" for t in els).join ','
# Run one parser test case and record the outcome in test_results.
#
# args.name:     label printed when the test fails
# args.html:     markup handed to parse_html
# args.expected: serialized form of the nodes expected inside <body>; it is
#                wrapped in the html/head/body skeleton before comparing
#
# On a mismatch the debug log gathered during the parse is dumped, followed by
# the input, the expected and actual serializations and any parse errors that
# were reported through the error callback.
test_parser = (args) ->
        debug_log_reset()
        parse_errors = []
        errors_cb = (i) -> parse_errors.push i
        # restart node numbering for this test (assumes prev_node_id is the
        # file-level id counter -- it is not declared in this function)
        prev_node_id = 0
        parsed = parse_html args.html, errors_cb
        serialized = serialize_els parsed, false, false
        expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]'

        # passing case first: just count it and leave
        if serialized is expected
                #console.log "passed \"#{args.name}\""
                return test_results.passed += 1

        debug_log_each (str) ->
                console.log str
        console.log "FAILED: \"#{args.name}\""
        console.log "      Input: #{args.html}"
        console.log "    Correct: #{expected}"
        console.log "     Output: #{serialized}"
        if parse_errors.length <= 0
                console.log "   No parse errors"
        else
                console.log " parse errs: #{JSON.stringify parse_errors}"
        test_results.failed += 1
# Print the pass/fail totals accumulated in test_results, one line each.
test_summary = ->
        console.log 'Tests passed: ' + test_results.passed
        console.log 'Tests Failed: ' + test_results.failed
2823
# Baseline cases: empty input, plain text, and character references (named,
# malformed and numeric) appearing in text content. `expected` is the
# serialized form of the nodes inside <body> (see test_parser).
test_parser name: "empty", \
        html: "",
        expected: ''
test_parser name: "just text", \
        html: "abc",
        expected: 'text:"abc"'
test_parser name: "named entity", \
        html: "a&amp;1234",
        expected: 'text:"a&1234"'
# per `expected`: "&amp" without ';' still decodes, a bare "&" stays as-is and
# "&aabbcc;" is left as literal text
test_parser name: "broken named character references", \
        html: "1&amp2&&amp;3&aabbcc;",
        expected: 'text:"1&2&&3&aabbcc;"'
# per `expected`: references 0x80 and 0x83 come out as "€" and "ƒ"
test_parser name: "numbered entity overrides", \
        html: "1&#X80&#x80; &#x83",
        expected: 'text:"1€€ ƒ"'
# Start tags and attributes. Note that the expected strings list attributes in
# alphabetical order by name, regardless of their order in the input.
test_parser name: "open tag", \
        html: "foo<span>bar",
        expected: 'text:"foo",tag:"span",{},[text:"bar"]'
test_parser name: "open tag with attributes", \
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
test_parser name: "open tag with attributes of various quotings", \
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
        expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
# Inside attribute values an unterminated "&amp" followed by "=" or by an
# alphanumeric character is expected to stay undecoded, while "&amp;" decodes.
# The same input is checked with double-quoted (dq), single-quoted (sq) and
# unquoted (uq) attribute values.
test_parser name: "attribute entity exceptions dq", \
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions sq", \
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions uq", \
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
# End-tag handling: properly matched end tags, a missing end tag, and
# mis-nested end tags, including mis-nested formatting elements (b, i, a, s,
# tt). The "8.2.8.x" names presumably refer to sections of the HTML parsing
# spec -- verify against the spec revision the parser targets.
test_parser name: "matching closing tags", \
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
test_parser name: "missing closing tag inside", \
        html: "foo<div>bar<span>baz</div>qux",
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
test_parser name: "mis-matched closing tags", \
        html: "<span>12<div>34</span>56</div>78",
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
test_parser name: "mis-matched formatting elements", \
        html: "12<b>34<i>56</b>78</i>90",
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
        html: '<p>1<b>2<i>3</b>4</i>5</p>',
        expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
        html: '<b>1<p>2</b>3</p>',
        expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
# browsers disagree on this input; the expectation follows the Firefox result
test_parser name: "crazy formatting elements test", \
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
# ("aaa" in the test names presumably stands for the adoption agency
# algorithm -- confirm against the spec)
test_parser name: "html5lib aaa 1", \
        html: '<a><p></a></p>',
        expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
test_parser name: "html5lib aaa 2", \
        html: '<a>1<p>2</a>3</p>',
        expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser name: "html5lib aaa 3", \
        html: '<a>1<button>2</a>3</button>',
        expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser name: "html5lib aaa 4", \
        html: '<a>1<b>2</a>3</b>',
        expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
test_parser name: "html5lib aaa 5 (two divs deep)", \
        html: '<a>1<div>2<div>3</a>4</div>5</div>',
        expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
# per `expected`, content that is not allowed directly inside <table> ends up
# before the table element
test_parser name: "html5lib aaa 6 (foster parenting)", \
        html: '<table><a>1<p>2</a>3</p>',
        expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
        html: '<b><b><a><p></a>',
        expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
        html: '<b><a><b><p></a>',
        expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
        html: '<a><b><b><p></a>',
        expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
        html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
        expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
        html: '<table><a>1<td>2</td>3</table>',
        expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
# per `expected`, the text on both sides of the cell ends up in one text node
# ("AC") in front of the table
test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
        html: '<table>A<td>B</td>C</table>',
        expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
# Later "html5lib aaa" cases (13-17) plus two local variations on attribute
# handling. Case 13 stays disabled until svg/namespace support exists.
# TODO implement svg and namespacing
#test_parser name: "html5lib aaa 13 (svg tr input)", \
#       html: '<a><svg><tr><input></a>',
#       expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
        html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
        html: '<div><a><b><u><i><code><div></a>',
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
        html: '<b><b><b><b>x</b></b></b></b>y',
        expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
# four identical <b> are open when the second <p> starts, but `expected` shows
# only three of them re-created inside it
test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
        html: '<p><b><b><b><b><p>x',
        expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
# same as above, but the <b> tags carry the same attributes in differing order
test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
        html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
        expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
# per `expected`, the stray "," after the closing quote becomes an attribute
# named "," with an empty value
test_parser name: "junk after attribute close-quote", \
        html: '<p><b c="d", e="f">foo<p>x',
        expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
# "aaa02" cases -- presumably taken from html5lib-tests
# tree-construction/adoption02.dat (TODO confirm the source file)
test_parser name: "html5lib aaa02 1", \
        html: '<b>1<i>2<p>3</b>4',
        expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
test_parser name: "html5lib aaa02 2", \
        html: '<a><div><style></style><address><a>',
        expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# Table parsing cases ("html5lib tables N" -- presumably from the html5lib-tests
# tree-construction table tests; TODO confirm the source file). The expected
# strings show implied tbody/tr/colgroup elements being created and content
# that is not allowed directly inside <table> ending up before the table.
test_parser name: "html5lib tables 1", \
        html: '<table><th>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
test_parser name: "html5lib tables 2", \
        html: '<table><td>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
test_parser name: "html5lib tables 3", \
        html: "<table><col foo='bar'>",
        expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
test_parser name: "html5lib tables 4", \
        html: '<table><colgroup></html>foo',
        expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
test_parser name: "html5lib tables 5", \
        html: '<table></table><p>foo',
        expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
# per `expected`, none of the stray end tags leaves a trace in the tree
test_parser name: "html5lib tables 6", \
        html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
test_parser name: "html5lib tables 7", \
        html: '<table><select><option>3</select></table>',
        expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
test_parser name: "html5lib tables 8", \
        html: '<table><select><table></table></select></table>',
        expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
test_parser name: "html5lib tables 9", \
        html: '<table><select></table>',
        expected: 'tag:"select",{},[],tag:"table",{},[]'
test_parser name: "html5lib tables 10", \
        html: '<table><select><option>A<tr><td>B</td></tr></table>',
        expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
test_parser name: "html5lib tables 11", \
        html: '<table><td></body></caption></col></colgroup></html>foo',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
test_parser name: "html5lib tables 12", \
        html: '<table><td>A</table>B',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
test_parser name: "html5lib tables 13", \
        html: '<table><tr><caption>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
test_parser name: "html5lib tables 14", \
        html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
test_parser name: "html5lib tables 15", \
        html: '<table><td><tr>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
test_parser name: "html5lib tables 16", \
        html: '<table><td><button><td>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
# TODO implement svg parsing
#test_parser name: "html5lib tables 17", \
#       html: '<table><tr><td><svg><desc><td>',
#       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'
# all cases have run: print the pass/fail totals collected in test_results
test_summary()