JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
switch to tests from html5lib-tests/tree-construction
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
54         window.wheic = {}
55         module = exports: window.wheic
56
57 # Each node is an obect of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
60 TYPE_COMMENT = 2
61 TYPE_DOCTYPE = 3
62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
65 TYPE_EOF = 6
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
68
69 # namespace constants
70 NS_HTML = 1
71 NS_MATHML = 2
72 NS_SVG = 3
73
74 g_debug_log = []
75 debug_log_reset = ->
76         g_debug_log = []
77 debug_log = (str) ->
78         g_debug_log.push str
79 debug_log_each = (cb) ->
80         for str in g_debug_log
81                 cb str
82
83 prev_node_id = 0
84 class Node
85         constructor: (type, args = {}) ->
86                 @type = type # one of the TYPE_* constants above
87                 @name = args.name ? '' # tag name
88                 @text = args.text ? '' # contents for text/comment nodes
89                 @attrs = args.attrs ? {}
90                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91                 @children = args.children ? []
92                 @namespace = args.namespace ? NS_HTML
93                 @parent = args.parent ? null
94                 @token = args.token ? null
95                 if args.id?
96                         @id = "#{args.id}+"
97                 else
98                         @id = "#{++prev_node_id}"
99         shallow_clone: -> # return a new node that's the same except without the children or parent
100                 # WARNING this doesn't work right on open tags that are still being parsed
101                 attrs = {}
102                 attrs[k] = v for k, v of @attrs
103                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104         acknowledge_self_closing: ->
105                 if @token?
106                         @token.flag 'did_self_close'
107                 else
108                         @flag 'did_self_close', true
109         flag: ->
110                 # fixfull
111         serialize: (shallow = false, show_ids = false) -> # for unit tests
112                 ret = ''
113                 switch @type
114                         when TYPE_TAG
115                                 ret += 'tag:'
116                                 ret += JSON.stringify @name
117                                 ret += ','
118                                 if show_ids
119                                         ret += "##{@id},"
120                                 if shallow
121                                         break
122                                 attr_keys = []
123                                 for k of @attrs
124                                         attr_keys.push k
125                                 attr_keys.sort()
126                                 ret += '{'
127                                 sep = ''
128                                 for k in attr_keys
129                                         ret += sep
130                                         sep = ','
131                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
132                                 ret += '},['
133                                 sep = ''
134                                 for c in @children
135                                         ret += sep
136                                         sep = ','
137                                         ret += c.serialize shallow, show_ids
138                                 ret += ']'
139                         when TYPE_TEXT
140                                 ret += 'text:'
141                                 ret += JSON.stringify @text
142                         when TYPE_COMMENT
143                                 ret += 'comment:'
144                                 ret += JSON.stringify @text
145                         when TYPE_DOCTYPE
146                                 ret += 'doctype'
147                                 # FIXME
148                         when TYPE_AFE_MARKER
149                                 ret += 'marker'
150                         when TYPE_AAA_BOOKMARK
151                                 ret += 'aaa_bookmark'
152                         else
153                                 ret += 'unknown:'
154                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155                 return ret
156
157 # helpers: (only take args that are normally known when parser creates nodes)
158 new_open_tag = (name) ->
159         return new Node TYPE_START_TAG, name: name
160 new_end_tag = (name) ->
161         return new Node TYPE_END_TAG, name: name
162 new_element = (name) ->
163         return new Node TYPE_TAG, name: name
164 new_text_node = (txt) ->
165         return new Node TYPE_TEXT, text: txt
166 new_character_token = new_text_node
167 new_comment_node = (txt) ->
168         return new Node TYPE_COMMENT, text: txt
169 new_eof_token = ->
170         return new Node TYPE_EOF
171 new_afe_marker = ->
172         return new Node TYPE_AFE_MARKER
173 new_aaa_bookmark = ->
174         return new Node TYPE_AAA_BOOKMARK
175
176 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
177 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
178 digits = "0123456789"
179 alnum = lc_alpha + uc_alpha + digits
180 hex_chars = digits + "abcdefABCDEF"
181
182 # some SVG elements have dashes in them
183 tag_name_chars = alnum + "-"
184
185 # http://www.w3.org/TR/html5/infrastructure.html#space-character
186 space_chars = "\u0009\u000a\u000c\u000d\u0020"
187 is_space = (txt) ->
188         return txt.length is 1 and space_chars.indexOf(txt) > -1
189 is_space_tok = (t) ->
190         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
191
192 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
193 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
194
195 # These are the character references that don't need a terminating semicolon
196 # min length: 2, max: 6, none are a prefix of any other.
197 legacy_char_refs = {
198         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
199         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
200         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
201         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
202         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
203         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
204         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
205         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
206         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
207         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
208         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
209         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
210         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
211         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
212         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
213         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
214         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
215         yen: '¥', yuml: 'ÿ'
216 }
217
218 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
219 raw_text_elements = ['script', 'style']
220 escapable_raw_text_elements = ['textarea', 'title']
221 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
222 svg_elements = [
223         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
224         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
225         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
226         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
227         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
228         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
229         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
230         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
231         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
232         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
233         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
234         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
235         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
236         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
237         'view', 'vkern'
238 ]
239
240 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
241 mathml_elements = [
242         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
243         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
244         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
245         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
246         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
247         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
248         'determinant', 'diff', 'divergence', 'divide', 'domain',
249         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
250         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
251         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
252         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
253         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
254         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
255         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
256         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
257         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
258         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
259         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
260         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
261         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
262         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
263         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
264         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
265         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
266         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
267         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
268         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
269         'vectorproduct', 'xor'
270 ]
271 # foreign_elements = [svg_elements..., mathml_elements...]
272 #normal_elements = All other allowed HTML elements are normal elements.
273
274 special_elements = {
275         # HTML:
276         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
277         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
278         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
279         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
280         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
281         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
282         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
283         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
284         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
285         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
286         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
287         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
288         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
289         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
290         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
291         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
292         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
293         wbr:NS_HTML, xmp:NS_HTML,
294
295         # MathML:
296         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
297         'annotation-xml':NS_MATHML,
298
299         # SVG:
300         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
301 }
302
303 formatting_elements = {
304          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
305          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
306          u: true
307 }
308
309 foster_parenting_targets = {
310         table: true
311         tbody: true
312         tfoot: true
313         thead: true
314         tr: true
315 }
316
317 # all html I presume
318 end_tag_implied = {
319         dd: true
320         dt: true
321         li: true
322         option: true
323         optgroup: true
324         p: true
325         rb: true
326         rp: true
327         rt: true
328         rtc: true
329 }
330
331 el_is_special = (e) ->
332         return special_elements[e.name] is e.namespace
333
334 # decode_named_char_ref()
335 #
336 # The list of named character references is _huge_ so ask the browser to decode
337 # for us instead of wasting bandwidth/space on including the table here.
338 #
339 # Pass without the "&" but with the ";" examples:
340 #    for "&amp" pass "amp;"
341 #    for "&#x2032" pass "x2032;"
342 g_dncr = {
343         cache: {}
344         textarea: document.createElement('textarea')
345 }
346 # TODO test this in IE8
347 decode_named_char_ref = (txt) ->
348         txt = "&#{txt}"
349         decoded = g_dncr.cache[txt]
350         return decoded if decoded?
351         g_dncr.textarea.innerHTML = txt
352         decoded = g_dncr.textarea.value
353         return null if decoded is txt
354         return g_dncr.cache[txt] = decoded
355
356 parse_html = (txt, parse_error_cb = null) ->
357         cur = 0 # index of next char in txt to be parsed
358         # declare doc and tokenizer variables so they're in scope below
359         doc = null
360         open_els = null # stack of open elements
361         afe = null # active formatting elements
362         template_insertion_modes = null
363         insertion_mode = null
364         original_insertion_mode = null
365         tok_state = null
366         tok_cur_tag = null # partially parsed tag
367         flag_scripting = null
368         flag_frameset_ok = null
369         flag_parsing = null
370         flag_foster_parenting = null
371         form_element_pointer = null
372         temporary_buffer = null
373         pending_table_character_tokens = null
374         head_element_pointer = null
375         flag_fragment_parsing = null
376
377         stop_parsing = ->
378                 flag_parsing = false
379
380         parse_error = ->
381                 if parse_error_cb?
382                         parse_error_cb cur
383                 else
384                         console.log "Parse error at character #{cur} of #{txt.length}"
385
386         afe_push = (new_el) ->
387                 matches = 0
388                 for el, i in afe
389                         if el.name is new_el.name and el.namespace is new_el.namespace
390                                 for k, v of el.attrs
391                                         continue unless new_el.attrs[k] is v
392                                 for k, v of new_el.attrs
393                                         continue unless el.attrs[k] is v
394                                 matches += 1
395                                 if matches is 3
396                                         afe.splice i, 1
397                                         break
398                 afe.unshift new_el
399         afe_push_marker = ->
400                 afe.unshift new_afe_marker()
401
402         # the functions below impliment the Tree Contstruction algorithm
403         # http://www.w3.org/TR/html5/syntax.html#tree-construction
404
405         # But first... the helpers
406         template_tag_is_open = ->
407                 for t in open_els
408                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
409                                 return true
410                 return false
411         is_in_scope_x = (tag_name, scope, namespace) ->
412                 for t in open_els
413                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
414                                 return true
415                         if scope[t.name] is t.namespace
416                                 return false
417                 return false
418         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
419                 for t in open_els
420                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
421                                 return true
422                         if scope[t.name] is t.namespace
423                                 return false
424                         if scope2[t.name] is t.namespace
425                                 return false
426                 return false
427         standard_scopers = { # FIXME these are supposed to be namespace specific
428                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
429                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
430                 template: NS_HTML, mi: NS_MATHML,
431
432                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
433                 'annotation-xml': NS_MATHML,
434
435                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
436         }
437         button_scopers = button: NS_HTML
438         li_scopers = ol: NS_HTML, ul: NS_HTML
439         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
440         is_in_scope = (tag_name, namespace = null) ->
441                 return is_in_scope_x tag_name, standard_scopers, namespace
442         is_in_button_scope = (tag_name, namespace = null) ->
443                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
444         is_in_table_scope = (tag_name, namespace = null) ->
445                 return is_in_scope_x tag_name, table_scopers, namespace
446         is_in_select_scope = (tag_name, namespace = null) ->
447                 for t in open_els
448                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
449                                 return true
450                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
451                                 return false
452                 return false
453         # this checks for a particular element, not by name
454         el_is_in_scope = (el) ->
455                 for t in open_els
456                         if t is el
457                                 return true
458                         if standard_scopers[t.name] is t.namespace
459                                 return false
460                 return false
461
462         clear_to_table_stopers = {
463                 'table': true
464                 'template': true
465                 'html': true
466         }
467         clear_stack_to_table_context = ->
468                 loop
469                         if clear_to_table_stopers[open_els[0].name]?
470                                 break
471                         open_els.shift()
472                 return
473         clear_to_table_body_stopers = {
474                 'tbody': true
475                 'tfoot': true
476                 'thead': true
477                 'template': true
478                 'html': true
479         }
480         clear_stack_to_table_body_context = ->
481                 loop
482                         if clear_to_table_body_stopers[open_els[0].name]?
483                                 break
484                         open_els.shift()
485                 return
486         clear_to_table_row_stopers = {
487                 'tr': true
488                 'template': true
489                 'html': true
490         }
491         clear_stack_to_table_row_context = ->
492                 loop
493                         if clear_to_table_row_stopers[open_els[0].name]?
494                                 break
495                         open_els.shift()
496                 return
497         clear_afe_to_marker = ->
498                 loop
499                         el = afe.shift()
500                         if el.type is TYPE_AFE_MARKER
501                                 return
502
503         # 8.2.3.1 ...
504         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
505         reset_insertion_mode = ->
506                 # 1. Let last be false.
507                 last = false
508                 # 2. Let node be the last node in the stack of open elements.
509                 node_i = 0
510                 node = open_els[node_i]
511                 # 3. Loop: If node is the first node in the stack of open elements,
512                 # then set last to true, and, if the parser was originally created as
513                 # part of the HTML fragment parsing algorithm (fragment case) set node
514                 # to the context element.
515                 loop
516                         if node_i is open_els.length - 1
517                                 last = true
518                                 # fixfull (fragment case)
519
520                         # 4. If node is a select element, run these substeps:
521                         if node.name is 'select'
522                                 # 1. If last is true, jump to the step below labeled done.
523                                 unless last
524                                         # 2. Let ancestor be node.
525                                         ancestor_i = node_i
526                                         ancestor = node
527                                         # 3. Loop: If ancestor is the first node in the stack of
528                                         # open elements, jump to the step below labeled done.
529                                         loop
530                                                 if ancestor_i is open_els.length - 1
531                                                         break
532                                                 # 4. Let ancestor be the node before ancestor in the stack
533                                                 # of open elements.
534                                                 ancestor_i += 1
535                                                 ancestor = open_els[ancestor_i]
536                                                 # 5. If ancestor is a template node, jump to the step below
537                                                 # labeled done.
538                                                 if ancestor.name is 'template'
539                                                         break
540                                                 # 6. If ancestor is a table node, switch the insertion mode
541                                                 # to "in select in table" and abort these steps.
542                                                 if ancestor.name is 'table'
543                                                         insertion_mode = ins_mode_in_select_in_table
544                                                         return
545                                                 # 7. Jump back to the step labeled loop.
546                                 # 8. Done: Switch the insertion mode to "in select" and abort
547                                 # these steps.
548                                 insertion_mode = ins_mode_in_select
549                                 return
550                         # 5. If node is a td or th element and last is false, then switch
551                         # the insertion mode to "in cell" and abort these steps.
552                         if (node.name is 'td' or node.name is 'th') and last is false
553                                 insertion_mode = ins_mode_in_cell
554                                 return
555                         # 6. If node is a tr element, then switch the insertion mode to "in
556                         # row" and abort these steps.
557                         if node.name is 'tr'
558                                 insertion_mode = ins_mode_in_row
559                                 return
560                         # 7. If node is a tbody, thead, or tfoot element, then switch the
561                         # insertion mode to "in table body" and abort these steps.
562                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
563                                 insertion_mode = ins_mode_in_table_body
564                                 return
565                         # 8. If node is a caption element, then switch the insertion mode
566                         # to "in caption" and abort these steps.
567                         if node.name is 'caption'
568                                 insertion_mode = ins_mode_in_caption
569                                 return
570                         # 9. If node is a colgroup element, then switch the insertion mode
571                         # to "in column group" and abort these steps.
572                         if node.name is 'colgroup'
573                                 insertion_mode = ins_mode_in_column_group
574                                 return
575                         # 10. If node is a table element, then switch the insertion mode to
576                         # "in table" and abort these steps.
577                         if node.name is 'table'
578                                 insertion_mode = ins_mode_in_table
579                                 return
580                         # 11. If node is a template element, then switch the insertion mode
581                         # to the current template insertion mode and abort these steps.
582                         # fixfull (template insertion mode stack)
583
584                         # 12. If node is a head element and last is true, then switch the
585                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
586                         # these steps. (fragment case)
587                         if node.name is 'head' and last
588                                 insertion_mode = ins_mode_in_body
589                                 return
590                         # 13. If node is a head element and last is false, then switch the
591                         # insertion mode to "in head" and abort these steps.
592                         if node.name is 'head' and last is false
593                                 insertion_mode = ins_mode_in_head
594                                 return
595                         # 14. If node is a body element, then switch the insertion mode to
596                         # "in body" and abort these steps.
597                         if node.name is 'body'
598                                 insertion_mode = ins_mode_in_body
599                                 return
600                         # 15. If node is a frameset element, then switch the insertion mode
601                         # to "in frameset" and abort these steps. (fragment case)
602                         if node.name is 'frameset'
603                                 insertion_mode = ins_mode_in_frameset
604                                 return
605                         # 16. If node is an html element, run these substeps:
606                         if node.name is 'html'
607                                 # 1. If the head element pointer is null, switch the insertion
608                                 # mode to "before head" and abort these steps. (fragment case)
609                                 # fixfull (fragment case)
610
611                                 # 2. Otherwise, the head element pointer is not null, switch
612                                 # the insertion mode to "after head" and abort these steps.
613                                 insertion_mode = ins_mode_in_body # FIXME fixfull
614                                 return
615                         # 17. If last is true, then switch the insertion mode to "in body"
616                         # and abort these steps. (fragment case)
617                         if last
618                                 insertion_mode = ins_mode_in_body
619                                 return
620                         # 18. Let node now be the node before node in the stack of open
621                         # elements.
622                         node_i += 1
623                         node = open_els[node_i]
624                         # 19. Return to the step labeled loop.
625
626         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
627         # this implementation is structured (mostly) as described at the link above.
628         # capitalized comments are the "labels" described at the link above.
629         reconstruct_active_formatting_elements = ->
630                 return if afe.length is 0
631                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
632                         return
633                 # Rewind
634                 i = 0
635                 loop
636                         if i is afe.length - 1
637                                 break
638                         i += 1
639                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
640                                 i -= 1 # Advance
641                                 break
642                 # Create
643                 loop
644                         el = afe[i].shallow_clone()
645                         tree_insert_element el
646                         afe[i] = el
647                         break if i is 0
648                         i -= 1 # Advance
649
650         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
651         # adoption agency algorithm
652         # overview here:
653         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
654         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
655         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
656         adoption_agency = (subject) ->
657                 debug_log "adoption_agency()"
658                 debug_log "tree: #{serialize_els doc.children, false, true}"
659                 debug_log "open_els: #{serialize_els open_els, true, true}"
660                 debug_log "afe: #{serialize_els afe, true, true}"
661                 if open_els[0].name is subject
662                         el = open_els[0]
663                         open_els.shift()
664                         # remove it from the list of active formatting elements (if found)
665                         for t, i in afe
666                                 if t is el
667                                         afe.splice i, 1
668                                         break
669                         debug_log "aaa: starting off with subject on top of stack, exiting"
670                         return
671                 outer = 0
672                 loop
673                         if outer >= 8
674                                 return
675                         outer += 1
676                         # 5. Let formatting element be the last element in the list of
677                         # active formatting elements that: is between the end of the list
678                         # and the last scope marker in the list, if any, or the start of
679                         # the list otherwise, and  has the tag name subject.
680                         fe = null
681                         for t, fe_of_afe in afe
682                                 if t.type is TYPE_AFE_MARKER
683                                         break
684                                 if t.name is subject
685                                         fe = t
686                                         break
687                         # If there is no such element, then abort these steps and instead
688                         # act as described in the "any other end tag" entry above.
689                         if fe is null
690                                 debug_log "aaa: fe not found in afe"
691                                 in_body_any_other_end_tag subject
692                                 return
693                         # 6. If formatting element is not in the stack of open elements,
694                         # then this is a parse error; remove the element from the list, and
695                         # abort these steps.
696                         in_open_els = false
697                         for t, fe_of_open_els in open_els
698                                 if t is fe
699                                         in_open_els = true
700                                         break
701                         unless in_open_els
702                                 debug_log "aaa: fe not found in open_els"
703                                 parse_error()
704                                 # "remove it from the list" must mean afe, since it's not in open_els
705                                 afe.splice fe_of_afe, 1
706                                 return
707                         # 7. If formatting element is in the stack of open elements, but
708                         # the element is not in scope, then this is a parse error; abort
709                         # these steps.
710                         unless el_is_in_scope fe
711                                 debug_log "aaa: fe not in scope"
712                                 parse_error()
713                                 return
714                         # 8. If formatting element is not the current node, this is a parse
715                         # error. (But do not abort these steps.)
716                         unless open_els[0] is fe
717                                 parse_error()
718                                 # continue
719                         # 9. Let furthest block be the topmost node in the stack of open
720                         # elements that is lower in the stack than formatting element, and
721                         # is an element in the special category. There might not be one.
722                         fb = null
723                         fb_of_open_els = null
724                         for t, i in open_els
725                                 if t is fe
726                                         break
727                                 if el_is_special t
728                                         fb = t
729                                         fb_of_open_els = i
730                                         # and continue, to see if there's one that's more "topmost"
731                         # 10. If there is no furthest block, then the UA must first pop all
732                         # the nodes from the bottom of the stack of open elements, from the
733                         # current node up to and including formatting element, then remove
734                         # formatting element from the list of active formatting elements,
735                         # and finally abort these steps.
736                         if fb is null
737                                 debug_log "aaa: no fb"
738                                 loop
739                                         t = open_els.shift()
740                                         if t is fe
741                                                 afe.splice fe_of_afe, 1
742                                                 return
743                         # 11. Let common ancestor be the element immediately above
744                         # formatting element in the stack of open elements.
745                         ca = open_els[fe_of_open_els + 1] # common ancestor
746
747                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
748                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
749                         bookmark = new_aaa_bookmark()
750                         for t, i in afe
751                                 if t is fe
752                                         afe.splice i, 0, bookmark
753                                         break
754                         node = last_node = fb
755                         inner = 0
756                         loop
757                                 inner += 1
758                                 # 3. Let node be the element immediately above node in the
759                                 # stack of open elements, or if node is no longer in the stack
760                                 # of open elements (e.g. because it got removed by this
761                                 # algorithm), the element that was immediately above node in
762                                 # the stack of open elements before node was removed.
763                                 node_next = null
764                                 for t, i in open_els
765                                         if t is node
766                                                 node_next = open_els[i + 1]
767                                                 break
768                                 node = node_next ? node_above
769                                 debug_log "inner loop #{inner}"
770                                 debug_log "tree: #{serialize_els doc.children, false, true}"
771                                 debug_log "open_els: #{serialize_els open_els, true, true}"
772                                 debug_log "afe: #{serialize_els afe, true, true}"
773                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
774                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
775                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
776                                 debug_log "node: #{node.serialize true, true}"
777                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
778
779                                 # 4. If node is formatting element, then go to the next step in
780                                 # the overall algorithm.
781                                 if node is fe
782                                         break
783                                 debug_log "the meat"
784                                 # 5. If inner loop counter is greater than three and node is in
785                                 # the list of active formatting elements, then remove node from
786                                 # the list of active formatting elements.
787                                 node_in_afe = false
788                                 for t, i in afe
789                                         if t is node
790                                                 if inner > 3
791                                                         afe.splice i, 1
792                                                         debug_log "max out inner"
793                                                 else
794                                                         node_in_afe = true
795                                                         debug_log "in afe"
796                                                 break
797                                 # 6. If node is not in the list of active formatting elements,
798                                 # then remove node from the stack of open elements and then go
799                                 # back to the step labeled inner loop.
800                                 unless node_in_afe
801                                         debug_log "not in afe"
802                                         for t, i in open_els
803                                                 if t is node
804                                                         node_above = open_els[i + 1]
805                                                         open_els.splice i, 1
806                                                         break
807                                         continue
808                                 debug_log "the bones"
809                                 # 7. create an element for the token for which the element node
810                                 # was created, in the HTML namespace, with common ancestor as
811                                 # the intended parent; replace the entry for node in the list
812                                 # of active formatting elements with an entry for the new
813                                 # element, replace the entry for node in the stack of open
814                                 # elements with an entry for the new element, and let node be
815                                 # the new element.
816                                 new_node = node.shallow_clone()
817                                 for t, i in afe
818                                         if t is node
819                                                 afe[i] = new_node
820                                                 debug_log "replaced in afe"
821                                                 break
822                                 for t, i in open_els
823                                         if t is node
824                                                 node_above = open_els[i + 1]
825                                                 open_els[i] = new_node
826                                                 debug_log "replaced in open_els"
827                                                 break
828                                 node = new_node
829                                 # 8. If last node is furthest block, then move the
830                                 # aforementioned bookmark to be immediately after the new node
831                                 # in the list of active formatting elements.
832                                 if last_node is fb
833                                         for t, i in afe
834                                                 if t is bookmark
835                                                         afe.splice i, 1
836                                                         debug_log "removed bookmark"
837                                                         break
838                                         for t, i in afe
839                                                 if t is node
840                                                         # "after" means lower
841                                                         afe.splice i, 0, bookmark # "after as <-
842                                                         debug_log "placed bookmark after node"
843                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
844                                                         break
845                                 # 9. Insert last node into node, first removing it from its
846                                 # previous parent node if any.
847                                 if last_node.parent?
848                                         debug_log "last_node has parent"
849                                         for c, i in last_node.parent.children
850                                                 if c is last_node
851                                                         debug_log "removing last_node from parent"
852                                                         last_node.parent.children.splice i, 1
853                                                         break
854                                 node.children.push last_node
855                                 last_node.parent = node
856                                 # 10. Let last node be node.
857                                 last_node = node
858                                 debug_log "at last"
859                                 # 11. Return to the step labeled inner loop.
860                         # 14. Insert whatever last node ended up being in the previous step
861                         # at the appropriate place for inserting a node, but using common
862                         # ancestor as the override target.
863
864                         # In the case where fe is immediately followed by fb:
865                         #   * inner loop exits out early (node==fe)
866                         #   * last_node is fb
867                         #   * last_node is still in the tree (not a duplicate)
868                         if last_node.parent?
869                                 debug_log "FEFIRST? last_node has parent"
870                                 for c, i in last_node.parent.children
871                                         if c is last_node
872                                                 debug_log "removing last_node from parent"
873                                                 last_node.parent.children.splice i, 1
874                                                 break
875
876                         debug_log "after aaa inner loop"
877                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
878                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
879                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
880                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
881                         debug_log "tree: #{serialize_els doc.children, false, true}"
882
883                         debug_log "insert"
884
885
886                         # can't use standard insert token thing, because it's already in
887                         # open_els and must stay at it's current position in open_els
888                         dest = adjusted_insertion_location ca
889                         dest[0].children.splice dest[1], 0, last_node
890                         last_node.parent = dest[0]
891
892
893                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
894                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
895                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
896                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
897                         debug_log "tree: #{serialize_els doc.children, false, true}"
898
899                         # 15. Create an element for the token for which formatting element
900                         # was created, in the HTML namespace, with furthest block as the
901                         # intended parent.
902                         new_element = fe.shallow_clone() # FIXME intended parent thing
903                         # 16. Take all of the child nodes of furthest block and append them
904                         # to the element created in the last step.
905                         while fb.children.length
906                                 t = fb.children.shift()
907                                 t.parent = new_element
908                                 new_element.children.push t
909                         # 17. Append that new element to furthest block.
910                         new_element.parent = fb
911                         fb.children.push new_element
912                         # 18. Remove formatting element from the list of active formatting
913                         # elements, and insert the new element into the list of active
914                         # formatting elements at the position of the aforementioned
915                         # bookmark.
916                         for t, i in afe
917                                 if t is fe
918                                         afe.splice i, 1
919                                         break
920                         for t, i in afe
921                                 if t is bookmark
922                                         afe[i] = new_element
923                                         break
924                         # 19. Remove formatting element from the stack of open elements,
925                         # and insert the new element into the stack of open elements
926                         # immediately below the position of furthest block in that stack.
927                         for t, i in open_els
928                                 if t is fe
929                                         open_els.splice i, 1
930                                         break
931                         for t, i in open_els
932                                 if t is fb
933                                         open_els.splice i, 0, new_element
934                                         break
935                         # 20. Jump back to the step labeled outer loop.
936                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
937                         debug_log "tree: #{serialize_els doc.children, false, true}"
938                         debug_log "open_els: #{serialize_els open_els, true, true}"
939                         debug_log "afe: #{serialize_els afe, true, true}"
940                 debug_log "AAA DONE"
941
942         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
943         close_p_element = ->
944                 generate_implied_end_tags 'p' # arg is exception
945                 if open_els[0].name isnt 'p'
946                         parse_error()
947                 while open_els.length > 1 # just in case
948                         el = open_els.shift()
949                         if el.name is 'p'
950                                 return
951         close_p_if_in_button_scope = ->
952                 if is_in_button_scope 'p'
953                         close_p_element()
954
955         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
956         # aka insert_a_character = (t) ->
957         insert_character = (t) ->
958                 dest = adjusted_insertion_location()
959                 # fixfull check for Document node
960                 if dest[1] > 0
961                         prev = dest[0].children[dest[1] - 1]
962                         if prev.type is TYPE_TEXT
963                                 prev.text += t.text
964                                 return
965                 dest[0].children.splice dest[1], 0, t
966
967         # 8.2.5.1
968         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
969         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
970         adjusted_insertion_location = (override_target = null) ->
971                 # 1. If there was an override target specified, then let target be the
972                 # override target.
973                 if override_target?
974                         target = override_target
975                 else # Otherwise, let target be the current node.
976                         target = open_els[0]
977                 # 2. Determine the adjusted insertion location using the first matching
978                 # steps from the following list:
979                 #
980                 # If foster parenting is enabled and target is a table, tbody, tfoot,
981                 # thead, or tr element Foster parenting happens when content is
982                 # misnested in tables.
983                 if flag_foster_parenting and foster_parenting_targets[target.name]
984                         loop # once. this is here so we can ``break`` to "abort these substeps"
985                                 # 1. Let last template be the last template element in the
986                                 # stack of open elements, if any.
987                                 last_template = null
988                                 last_template_i = null
989                                 for el, i in open_els
990                                         if el.name is 'template'
991                                                 last_template = el
992                                                 last_template_i = i
993                                                 break
994                                 # 2. Let last table be the last table element in the stack of
995                                 # open elements, if any.
996                                 last_table = null
997                                 last_table_i
998                                 for el, i in open_els
999                                         if el.name is 'table'
1000                                                 last_table = el
1001                                                 last_table_i = i
1002                                                 break
1003                                 # 3. If there is a last template and either there is no last
1004                                 # table, or there is one, but last template is lower (more
1005                                 # recently added) than last table in the stack of open
1006                                 # elements, then: let adjusted insertion location be inside
1007                                 # last template's template contents, after its last child (if
1008                                 # any), and abort these substeps.
1009                                 if last_template and (last_table is null or last_template_i < last_table_i)
1010                                         target = template # fixfull should be it's contents
1011                                         target_i = target.children.length
1012                                         break
1013                                 # 4. If there is no last table, then let adjusted insertion
1014                                 # location be inside the first element in the stack of open
1015                                 # elements (the html element), after its last child (if any),
1016                                 # and abort these substeps. (fragment case)
1017                                 if last_table is null
1018                                         # this is odd
1019                                         target = open_els[open_els.length - 1]
1020                                         target_i = target.children.length
1021                                 # 5. If last table has a parent element, then let adjusted
1022                                 # insertion location be inside last table's parent element,
1023                                 # immediately before last table, and abort these substeps.
1024                                 if last_table.parent?
1025                                         for c, i in last_table.parent.children
1026                                                 if c is last_table
1027                                                         target = last_table.parent
1028                                                         target_i = i
1029                                                         break
1030                                         break
1031                                 # 6. Let previous element be the element immediately above last
1032                                 # table in the stack of open elements.
1033                                 #
1034                                 # huh? how could it not have a parent?
1035                                 previous_element = open_els[last_table_i + 1]
1036                                 # 7. Let adjusted insertion location be inside previous
1037                                 # element, after its last child (if any).
1038                                 target = previous_element
1039                                 target_i = target.children.length
1040                                 # Note: These steps are involved in part because it's possible
1041                                 # for elements, the table element in this case in particular,
1042                                 # to have been moved by a script around in the DOM, or indeed
1043                                 # removed from the DOM entirely, after the element was inserted
1044                                 # by the parser.
1045                                 break # don't really loop
1046                 else
1047                         # Otherwise Let adjusted insertion location be inside target, after
1048                         # its last child (if any).
1049                         target_i = target.children.length
1050
1051                 # 3. If the adjusted insertion location is inside a template element,
1052                 # let it instead be inside the template element's template contents,
1053                 # after its last child (if any).
1054                 # fixfull (template)
1055
1056                 # 4. Return the adjusted insertion location.
1057                 return [target, target_i]
1058
1059         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1060         # aka create_an_element_for_token
1061         token_to_element = (t, namespace, intended_parent) ->
1062                 t.type = TYPE_TAG # not TYPE_START_TAG
1063                 # convert attributes into a hash
1064                 attrs = {}
1065                 while t.attrs_a.length
1066                         a = t.attrs_a.pop()
1067                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1068                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1069
1070                 # TODO 2. If the newly created element has an xmlns attribute in the
1071                 # XMLNS namespace whose value is not exactly the same as the element's
1072                 # namespace, that is a parse error. Similarly, if the newly created
1073                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1074                 # value is not the XLink Namespace, that is a parse error.
1075
1076                 # fixfull: the spec says stuff about form pointers and ownerDocument
1077
1078                 return el
1079
1080         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1081         insert_foreign_element = (token, namespace) ->
1082                 ail = adjusted_insertion_location()
1083                 ail_el = ail[0]
1084                 ail_i = ail[1]
1085                 el = token_to_element token, namespace, ail_el
1086                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1087                 el.parent = ail_el
1088                 ail_el.children.splice ail_i, 0, el
1089                 open_els.unshift el
1090                 return el
1091         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1092         insert_html_element = insert_foreign_element # (token, namespace) ->
1093
1094         # FIXME read implement "foster parenting" part
1095         # FIXME read spec, do this right
1096         # FIXME implement the override target thing
1097         # note: this assumes it's an open tag
1098         # FIXME what part of the spec is this?
1099         # TODO look through all callers of this, and see what they should really be doing.
1100         #   eg probably insert_html_element for tokens
1101         tree_insert_element = (el, override_target = null, namespace = null) ->
1102                 if namespace?
1103                         el.namespace = namespace
1104                 dest = adjusted_insertion_location override_target
1105                 if el.type is TYPE_START_TAG # means it's a "token"
1106                         el = token_to_element el, namespace, dest[0]
1107                 unless el.namespace?
1108                         namespace = dest.namespace
1109                 # fixfull: Document nodes sometimes can't accept more chidren
1110                 dest[0].children.splice dest[1], 0, el
1111                 el.parent = dest[0]
1112                 open_els.unshift el
1113                 return el
1114
1115         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1116         # position should be [node, index_within_children]
1117         insert_comment = (t, position = null) ->
1118                 position ?= adjusted_insertion_location()
1119                 position[0].children.splice position[1], 0, t
1120
1121         # 8.2.5.2
1122         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1123         parse_generic_raw_text = (t) ->
1124                 insert_html_element t
1125                 tok_state = tok_state_rawtext
1126                 original_insertion_mode = insertion_mode
1127                 insertion_mode = ins_mode_text
1128         parse_generic_rcdata_text = (t) ->
1129                 insert_html_element t
1130                 tok_state = tok_state_rcdata
1131                 original_insertion_mode = insertion_mode
1132                 insertion_mode = ins_mode_text
1133
1134         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1135         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1136         generate_implied_end_tags = (except = null) ->
1137                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1138                         open_els.shift()
1139
1140         # 8.2.5.4 The rules for parsing tokens in HTML content
1141         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1142
1143         # 8.2.5.4.1 The "initial" insertion mode
1144         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1145         ins_mode_initial = (t) ->
1146                 if is_space_tok t
1147                         return
1148                 if t.type is TYPE_COMMENT
1149                         # fixfull this is supposed to be "the last child of the document object"
1150                         doc.children.push t
1151                         return
1152                 if t.type is TYPE_DOCTYPE
1153                         # fixfull
1154                         t.name = 'html'
1155                         doc.children.push t
1156                         insertion_mode = ins_mode_before_html
1157                         return
1158                 # Anything else
1159                 #fixfull (iframe, quirks)
1160                 insertion_mode = ins_mode_before_html
1161                 insertion_mode t # reprocess the token
1162                 return
1163
1164         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1165         ins_mode_before_html = (t) ->
1166                 if t.type is TYPE_DOCTYPE
1167                         parse_error()
1168                         return
1169                 if t.type is TYPE_COMMENT
1170                         doc.children.push t
1171                         return
1172                 if is_space_tok t
1173                         return
1174                 if t.type is TYPE_START_TAG and t.name is 'html'
1175                         el = token_to_element t, NS_HTML, doc
1176                         open_els.unshift(el)
1177                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1178                         insertion_mode = ins_mode_before_head
1179                         return
1180                 if t.type is TYPE_END_TAG
1181                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1182                                 # fall through to "anything else"
1183                         else
1184                                 parse_error()
1185                                 return
1186                 # Anything else
1187                 html_tok = new_open_tag 'html'
1188                 el = token_to_element html_tok, NS_HTML, doc
1189                 doc.children.push el
1190                 open_els.unshift el
1191                 # ?fixfull browsing context
1192                 insertion_mode = ins_mode_before_head
1193                 insertion_mode t
1194                 return
1195
1196         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1197         ins_mode_before_head = (t) ->
1198                 if is_space_tok t
1199                         return
1200                 if t.type is TYPE_COMMENT
1201                         insert_comment t
1202                         return
1203                 if t.type is TYPE_DOCTYPE
1204                         parse_error()
1205                         return
1206                 if t.type is TYPE_START_TAG and t.name is 'html'
1207                         ins_mode_in_body t
1208                         return
1209                 if t.type is TYPE_START_TAG and t.name is 'head'
1210                         el = insert_html_element t
1211                         head_element_pointer = el
1212                         insertion_mode = ins_mode_in_head
1213                 if t.type is TYPE_END_TAG
1214                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1215                                 # fall through to Anything else below
1216                         else
1217                                 parse_error()
1218                                 return
1219                 # Anything else
1220                 head_tok = new_open_tag 'head'
1221                 el = insert_html_element head_tok
1222                 head_element_pointer = el
1223                 insertion_mode = ins_mode_in_head
1224                 insertion_mode t # reprocess current token
1225
1226         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1227         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1228                 open_els.shift() # spec says this will be a 'head' node
1229                 insertion_mode = ins_mode_after_head
1230                 insertion_mode t
1231         ins_mode_in_head = (t) ->
1232                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1233                         insert_character t
1234                         return
1235                 if t.type is TYPE_COMMENT
1236                         insert_comment t
1237                         return
1238                 if t.type is TYPE_DOCTYPE
1239                         parse_error()
1240                         return
1241                 if t.type is TYPE_START_TAG and t.name is 'html'
1242                         ins_mode_in_body t
1243                         return
1244                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1245                         el = insert_html_element t
1246                         open_els.shift()
1247                         t.acknowledge_self_closing()
1248                         return
1249                 if t.type is TYPE_START_TAG and t.name is 'meta'
1250                         el = insert_html_element t
1251                         open_els.shift()
1252                         t.acknowledge_self_closing()
1253                         # fixfull encoding stuff
1254                         return
1255                 if t.type is TYPE_START_TAG and t.name is 'title'
1256                         parse_generic_rcdata_element t
1257                         return
1258                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1259                         parse_generic_raw_text t
1260                         return
1261                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1262                         insert_html_element t
1263                         insertion_mode = in_head_noscript # FIXME implement
1264                         return
1265                 if t.type is TYPE_START_TAG and t.name is 'script'
1266                         ail = adjusted_insertion_location()
1267                         el = token_to_element t, NS_HTML, ail
1268                         el.flag_parser_inserted true # FIXME implement
1269                         # fixfull frament case
1270                         ail[0].children.splice ail[1], 0, el
1271                         open_els.unshift el
1272                         tok_state = tok_state_script_data
1273                         original_insertion_mode = insertion_mode # make sure orig... is defined
1274                         insertion_mode = ins_mode_text # FIXME implement
1275                         return
1276                 if t.type is TYPE_END_TAG and t.name is 'head'
1277                         open_els.shift() # will be a head element... spec says so
1278                         insertion_mode = ins_mode_after_head
1279                         return
1280                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1281                         ins_mode_in_head_else t
1282                         return
1283                 if t.type is TYPE_START_TAG and t.name is 'template'
1284                         insert_html_element t
1285                         afe_push_marker()
1286                         flag_frameset_ok = false
1287                         insertion_mode = ins_mode_in_template
1288                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1289                         return
1290                 if t.type is TYPE_END_TAG and t.name is 'template'
1291                         if template_tag_is_open()
1292                                 generate_implied_end_tags
1293                                 if open_els[0].name isnt 'template'
1294                                         parse_error()
1295                                 loop
1296                                         el = open_els.shift()
1297                                         if el.name is 'template'
1298                                                 break
1299                                 clear_afe_to_marker()
1300                                 template_insertion_modes.shift()
1301                                 reset_insertion_mode()
1302                         else
1303                                 parse_error()
1304                         return
1305                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1306                         parse_error()
1307                         return
1308                 ins_mode_in_head_else t
1309         
1310         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1311         ins_mode_in_head_noscript = (t) ->
1312                 # FIXME ?fixfull
1313                 console.log "ins_mode_in_head_noscript unimplemented"
1314         
1315         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1316         ins_mode_after_head_else = (t) ->
1317                 body_tok = new_open_tag 'body'
1318                 insert_html_element body_tok
1319                 insertion_mode = ins_mode_in_body
1320                 insertion_mode t # reprocess token
1321                 return
1322         ins_mode_after_head = (t) ->
1323                 if is_space_tok t
1324                         insert_character t
1325                         return
1326                 if t.type is TYPE_COMMENT
1327                         insert_comment t
1328                         return
1329                 if t.type is TYPE_DOCTYPE
1330                         parse_error()
1331                         return
1332                 if t.type is TYPE_START_TAG and t.name is 'html'
1333                         ins_mode_in_body t
1334                         return
1335                 if t.type is TYPE_START_TAG and t.name is 'body'
1336                         insert_html_element t
1337                         flag_frameset_ok = false
1338                         insertion_mode = ins_mode_in_body
1339                         return
1340                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1341                         insert_html_element t
1342                         insertion_mode = ins_mode_in_frameset
1343                         return
1344                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1345                         parse_error()
1346                         open_els.unshift head_element_pointer
1347                         ins_mode_in_head t
1348                         for el, i of open_els
1349                                 if el is head_element_pointer
1350                                         open_els.splice i, 1
1351                                         return
1352                         console.log "warning: 23904 couldn't find head element in open_els"
1353                         return
1354                 if t.type is TYPE_END_TAG and t.name is 'template'
1355                         ins_mode_in_head t
1356                         return
1357                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1358                         ins_mode_after_head_else t
1359                         return
1360                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1361                         parse_error()
1362                         return
1363                 # Anything else
1364                 ins_mode_after_head_else t
1365
1366         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1367         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1368                 for node, i in open_els
1369                         if node.name is name # FIXME check namespace too
1370                                 generate_implied_end_tags name # arg is exception
1371                                 parse_error() unless i is 0
1372                                 while i >= 0
1373                                         open_els.shift()
1374                                         i -= 1
1375                                 return
1376                         if special_elements[node.name]? # FIXME check namespac too
1377                                 parse_error()
1378                                 return
1379         ins_mode_in_body = (t) ->
1380                 switch t.type
1381                         when TYPE_TEXT
1382                                 switch t.text
1383                                         when "\u0000"
1384                                                 parse_error()
1385                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1386                                                 reconstruct_active_formatting_elements()
1387                                                 insert_character t
1388                                         else
1389                                                 reconstruct_active_formatting_elements()
1390                                                 insert_character t
1391                                                 flag_frameset_ok = false
1392                         when TYPE_COMMENT
1393                                 insert_comment t
1394                         when TYPE_DOCTYPE
1395                                 parse_error()
1396                         when TYPE_START_TAG
1397                                 switch t.name
1398                                         when 'html'
1399                                                 parse_error()
1400                                                 return if template_tag_is_open()
1401                                                 root_attrs = open_els[open_els.length - 1].attrs
1402                                                 for k, v of t.attrs
1403                                                         root_attrs[k] = v unless root_attrs[k]?
1404                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1405                                                 # FIXME also do this for </template> (end tag)
1406                                                 return ins_mode_in_head t
1407                                         when 'body'
1408                                                 parse_error()
1409                                                 # TODO
1410                                         when 'frameset'
1411                                                 parse_error()
1412                                                 # TODO
1413                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1414                                                 close_p_if_in_button_scope()
1415                                                 insert_html_element t
1416                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1417                                                 close_p_if_in_button_scope()
1418                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1419                                                         parse_error()
1420                                                         open_els.shift()
1421                                                 insert_html_element t
1422                                         # TODO lots more to implement here
1423                                         when 'a'
1424                                                 # If the list of active formatting elements
1425                                                 # contains an a element between the end of the list and
1426                                                 # the last marker on the list (or the start of the list
1427                                                 # if there is no marker on the list), then this is a
1428                                                 # parse error; run the adoption agency algorithm for
1429                                                 # the tag name "a", then remove that element from the
1430                                                 # list of active formatting elements and the stack of
1431                                                 # open elements if the adoption agency algorithm didn't
1432                                                 # already remove it (it might not have if the element
1433                                                 # is not in table scope).
1434                                                 found = false
1435                                                 for el in afe
1436                                                         if el.type is TYPE_AFE_MARKER
1437                                                                 break
1438                                                         if el.name is 'a'
1439                                                                 found = el
1440                                                 if found?
1441                                                         parse_error()
1442                                                         adoption_agency 'a'
1443                                                         for el, i in afe
1444                                                                 if el is found
1445                                                                         afe.splice i, 1
1446                                                         for el, i in open_els
1447                                                                 if el is found
1448                                                                         open_els.splice i, 1
1449                                                 reconstruct_active_formatting_elements()
1450                                                 el = insert_html_element t
1451                                                 afe_push el
1452                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1453                                                 reconstruct_active_formatting_elements()
1454                                                 el = insert_html_element t
1455                                                 afe_push el
1456                                         when 'table'
1457                                                 # fixfull quirksmode thing
1458                                                 close_p_if_in_button_scope()
1459                                                 insert_html_element t
1460                                                 insertion_mode = ins_mode_in_table
1461                                         # TODO lots more to implement here
1462                                         else # any other start tag
1463                                                 reconstruct_active_formatting_elements()
1464                                                 insert_html_element t
1465                         when TYPE_EOF
1466                                 ok_tags = {
1467                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1468                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1469                                 }
1470                                 for t in open_els
1471                                         unless ok_tags[t.name]?
1472                                                 parse_error()
1473                                                 break
1474                                 # TODO stack of template insertion modes thing
1475                                 stop_parsing()
1476                         when TYPE_END_TAG
1477                                 switch t.name
1478                                         when 'body'
1479                                                 unless is_in_scope 'body'
1480                                                         parse_error()
1481                                                         return
1482                                                 # TODO implement parse error and move to tree_after_body
1483                                         when 'html'
1484                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1485                                                         parse_error()
1486                                                         return
1487                                                 # TODO implement parse error and move to tree_after_body, reprocess
1488                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1489                                                 unless is_in_scope t.name, NS_HTML
1490                                                         parse_error()
1491                                                         return
1492                                                 generate_implied_end_tags()
1493                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1494                                                         parse_error()
1495                                                 loop
1496                                                         el = open_els.shift()
1497                                                         if el.name is t.name and el.namespace is NS_HTML
1498                                                                 return
1499                                         # TODO lots more close tags to implement here
1500                                         when 'p'
1501                                                 unless is_in_button_scope 'p'
1502                                                         parse_error()
1503                                                         insert_html_element new_open_tag 'p'
1504                                                 close_p_element()
1505                                         # TODO lots more close tags to implement here
1506                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1507                                                 adoption_agency t.name
1508                                         # TODO lots more close tags to implement here
1509                                         else
1510                                                 in_body_any_other_end_tag t.name
1511                 return
1512
1513         ins_mode_in_table_else = (t) ->
1514                 parse_error()
1515                 flag_foster_parenting = true # FIXME
1516                 ins_mode_in_body t
1517                 flag_foster_parenting = false
1518         can_in_table = { # FIXME do this inline like everywhere else
1519                 'table': true
1520                 'tbody': true
1521                 'tfoot': true
1522                 'thead': true
1523                 'tr': true
1524         }
1525
1526         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1527         ins_mode_text = (t) ->
1528                 if t.type is TYPE_TEXT
1529                         insert_character t
1530                         return
1531                 if t.type is TYPE_EOF
1532                         parse_error()
1533                         if open_els[0].name is 'script'
1534                                 open_els[0].flag 'already started', true
1535                         open_els.shift()
1536                         insertion_mode = original_insertion_mode
1537                         insertion_mode t
1538                         return
1539                 if t.type is TYPE_END_TAG and t.name is 'script'
1540                         open_els.shift()
1541                         insertion_mode = original_insertion_mode
1542                         # fixfull the spec seems to assume that I'm going to run the script
1543                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1544                         return
1545                 if t.type is TYPE_END_TAG
1546                         open_els.shift()
1547                         insertion_mode = original_insertion_mode
1548                         return
1549                 console.log 'warning: end of ins_mode_text reached'
1550
1551         # the functions below implement the tokenizer stats described here:
1552         # http://www.w3.org/TR/html5/syntax.html#tokenization
1553
1554         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1555         ins_mode_in_table = (t) ->
1556                 switch t.type
1557                         when TYPE_TEXT
1558                                 if can_in_table[t.name]
1559                                         original_insertion_mode = insertion_mode
1560                                         insertion_mode = ins_mode_in_table_text
1561                                         insertion_mode t
1562                                 else
1563                                         ins_mode_in_table_else t
1564                         when TYPE_COMMENT
1565                                 insert_comment t
1566                         when TYPE_DOCTYPE
1567                                 parse_error()
1568                         when TYPE_START_TAG
1569                                 switch t.name
1570                                         when 'caption'
1571                                                 clear_stack_to_table_context()
1572                                                 afe_push_marker()
1573                                                 insert_html_element t
1574                                                 insertion_mode = ins_mode_in_caption
1575                                         when 'colgroup'
1576                                                 clear_stack_to_table_context()
1577                                                 insert_html_element t
1578                                                 insertion_mode = ins_mode_in_column_group
1579                                         when 'col'
1580                                                 clear_stack_to_table_context()
1581                                                 insert_html_element new_open_tag 'colgroup'
1582                                                 insertion_mode = ins_mode_in_column_group
1583                                                 insertion_mode t
1584                                         when 'tbody', 'tfoot', 'thead'
1585                                                 clear_stack_to_table_context()
1586                                                 insert_html_element t
1587                                                 insertion_mode = ins_mode_in_table_body
1588                                         when 'td', 'th', 'tr'
1589                                                 clear_stack_to_table_context()
1590                                                 insert_html_element new_open_tag 'tbody'
1591                                                 insertion_mode = ins_mode_in_table_body
1592                                                 insertion_mode t
1593                                         when 'table'
1594                                                 parse_error()
1595                                                 if is_in_table_scope 'table'
1596                                                         loop
1597                                                                 el = open_els.shift()
1598                                                                 if el.name is 'table'
1599                                                                         break
1600                                                         reset_insertion_mode()
1601                                                         insertion_mode t
1602                                         when 'style', 'script', 'template'
1603                                                 ins_mode_in_head t
1604                                         when 'input'
1605                                                 if token_is_input_hidden t
1606                                                         ins_mode_in_table_else t
1607                                                 else
1608                                                         parse_error()
1609                                                         el = insert_html_element t
1610                                                         open_els.shift()
1611                                                         t.acknowledge_self_closing()
1612                                         when 'form'
1613                                                 parse_error()
1614                                                 if form_element_pointer?
1615                                                         return
1616                                                 if template_tag_is_open()
1617                                                         return
1618                                                 form_element_pointer = insert_html_element t
1619                                                 open_els.shift()
1620                                         else
1621                                                 ins_mode_in_table_else t
1622                         when TYPE_END_TAG
1623                                 switch t.name
1624                                         when 'table'
1625                                                 if is_in_table_scope 'table'
1626                                                         loop
1627                                                                 el = open_els.shift()
1628                                                                 if el.name is 'table'
1629                                                                         break
1630                                                         reset_insertion_mode()
1631                                                 else
1632                                                         parse_error
1633                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1634                                                 parse_error()
1635                                         when 'template'
1636                                                 ins_mode_in_head t
1637                                         else
1638                                                 ins_mode_in_table_else t
1639                         when TYPE_EOF
1640                                 ins_mode_in_body t
1641                         else
1642                                 ins_mode_in_table_else t
1643
1644
1645         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1646         ins_mode_in_table_text = (t) ->
1647                 if t.type is TYPE_TEXT and t.text is "\u0000"
1648                         # huh? I thought the tokenizer didn't emit these
1649                         parse_error()
1650                         return
1651                 if t.type is TYPE_TEXT
1652                         pending_table_character_tokens.push t
1653                         return
1654                 # Anything else
1655                 all_space = true
1656                 for old in pending_table_character_tokens
1657                         unless is_space_tok old
1658                                 all_space = false
1659                                 break
1660                 if all_space
1661                         for old in pending_table_character_tokens
1662                                 insert_character old
1663                 else
1664                         for old in pending_table_character_tokens
1665                                 ins_mode_table_else old
1666                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1667                 insertion_mode = original_insertion_mode
1668                 insertion_mode t
1669
1670         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1671         ins_mode_in_caption = (t) ->
1672                 if t.type is TYPE_END_TAG and t.name is 'caption'
1673                         if is_in_table_scope 'caption'
1674                                 generate_implied_end_tags()
1675                                 if open_els[0].name isnt 'caption'
1676                                         parse_error()
1677                                 loop
1678                                         el = open_els.shift()
1679                                         if el.name is 'caption'
1680                                                 break
1681                                 clear_afe_to_marker()
1682                                 insertion_mode = in_table
1683                         else
1684                                 parse_error()
1685                                 # fragment case
1686                         return
1687                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1688                         parse_error()
1689                         if is_in_table_scope 'caption'
1690                                 loop
1691                                         el = open_els.shift()
1692                                         if el.name is 'caption'
1693                                                 break
1694                                 clear_afe_to_marker()
1695                                 insertion_mode = in_table
1696                                 insertion_mode t
1697                         # else fragment case
1698                         return
1699                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1700                         parse_error()
1701                         return
1702                 # Anything else
1703                 ins_mode_in_body t
1704
1705         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1706         ins_mode_in_column_group = (t) ->
1707                 if is_space_tok t
1708                         insert_character t
1709                         return
1710                 if t.type is TYPE_COMMENT
1711                         insert_comment t
1712                         return
1713                 if t.type is TYPE_DOCTYPE
1714                         parse_error()
1715                         return
1716                 if t.type is TYPE_START_TAG and t.name is 'html'
1717                         ins_mode_in_body t
1718                         return
1719                 if t.type is TYPE_START_TAG and t.name is 'col'
1720                         el = insert_html_element t
1721                         open_els.shift()
1722                         t.acknowledge_self_closing()
1723                         return
1724                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1725                         if open_els[0].name is 'colgroup'
1726                                 open_els[0].shift()
1727                                 insertion_mode = ins_mode_in_table
1728                         else
1729                                 parse_error()
1730                         return
1731                 if t.type is TYPE_END_TAG and t.name is 'col'
1732                         parse_error()
1733                         return
1734                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1735                         ins_mode_in_head t
1736                         return
1737                 if t.type is TYPE_EOF
1738                         ins_mode_in_body t
1739                         return
1740                 # Anything else
1741                 if open_els[0].name isnt 'colgroup'
1742                         parse_error()
1743                         return
1744                 open_els.shift()
1745                 insertion_mode = ins_mode_in_table
1746                 insertion_mode t
1747                 return
1748
1749         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1750         ins_mode_in_table_body = (t) ->
1751                 if t.type is TYPE_START_TAG and t.name is 'tr'
1752                         clear_stack_to_table_body_context()
1753                         insert_html_element t
1754                         insertion_mode = ins_mode_in_row
1755                         return
1756                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1757                         parse_error()
1758                         clear_stack_to_table_body_context()
1759                         insert_html_element new_open_tag 'tr'
1760                         insertion_mode = ins_mode_in_row
1761                         insertion_mode t
1762                         return
1763                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1764                         unless is_in_table_scope t.name # fixfull check namespace
1765                                 parse_error()
1766                                 return
1767                         clear_stack_to_table_body_context()
1768                         open_els.shift()
1769                         insertion_mode = ins_mode_in_table
1770                         return
1771                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1772                         has = false
1773                         for el in open_els
1774                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1775                                         has = true
1776                                         break
1777                                 if table_scopers[el.name]
1778                                         break
1779                         if !has
1780                                 parse_error()
1781                                 return
1782                         clear_stack_to_table_body_context()
1783                         open_els.shift()
1784                         insertion_mode = ins_mode_in_table
1785                         insertion_mode t
1786                         return
1787                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1788                         parse_error()
1789                         return
1790                 # Anything else
1791                 ins_mode_in_table t
1792
1793         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1794         ins_mode_in_row = (t) ->
1795                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1796                         clear_stack_to_table_row_context()
1797                         insert_html_element t
1798                         insertion_mode = ins_mode_in_cell
1799                         afe_push_marker()
1800                         return
1801                 if t.type is TYPE_END_TAG and t.name is 'tr'
1802                         if is_in_table_scope 'tr'
1803                                 clear_stack_to_table_row_context()
1804                                 open_els.shift()
1805                                 insertion_mode = ins_mode_in_table_body
1806                         else
1807                                 parse_error()
1808                         return
1809                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1810                         if is_in_table_scope 'tr'
1811                                 clear_stack_to_table_row_context()
1812                                 open_els.shift()
1813                                 insertion_mode = ins_mode_in_table_body
1814                                 insertion_mode t
1815                         else
1816                                 parse_error()
1817                         return
1818                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1819                         if is_in_table_scope t.name # fixfull namespace
1820                                 if is_in_table_scope 'tr'
1821                                         clear_stack_to_table_row_context()
1822                                         open_els.shift()
1823                                         insertion_mode = ins_mode_in_table_body
1824                                         insertion_mode t
1825                         else
1826                                 parse_error()
1827                         return
1828                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1829                         parse_error()
1830                         return
1831                 # Anything else
1832                 ins_mode_in_table t
1833
1834         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1835         close_the_cell = ->
1836                 generate_implied_end_tags()
1837                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1838                         parse_error()
1839                 loop
1840                         el = open_els.shift()
1841                         if el.name is 'td' or el.name is 'th'
1842                                 break
1843                 clear_afe_to_marker()
1844                 insertion_mode = ins_mode_in_row
1845
1846         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1847         ins_mode_in_cell = (t) ->
1848                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1849                         if is_in_table_scope t.name
1850                                 generate_implied_end_tags()
1851                                 if open_els[0].name isnt t.name
1852                                         parse_error
1853                                 loop
1854                                         el = open_els.shift()
1855                                         if el.name is t.name
1856                                                 break
1857                                 clear_afe_to_marker()
1858                                 insertion_mode = ins_mode_in_row
1859                         else
1860                                 parse_error()
1861                         return
1862                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1863                         has = false
1864                         for el in open_els
1865                                 if el.name is 'td' or el.name is 'th'
1866                                         has = true
1867                                         break
1868                                 if table_scopers[el.name]
1869                                         break
1870                         if !has
1871                                 parse_error()
1872                                 return
1873                         close_the_cell()
1874                         insertion_mode t
1875                         return
1876                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1877                         parse_error()
1878                         return
1879                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1880                         if is_in_table_scope t.name # fixfull namespace
1881                                 close_the_cell()
1882                                 insertion_mode t
1883                         else
1884                                 parse_error()
1885                         return
1886                 # Anything Else
1887                 ins_mode_in_body t
1888
1889         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1890         ins_mode_in_select = (t) ->
1891                 if t.type is TYPE_TEXT and t.text is "\u0000"
1892                         parse_error()
1893                         return
1894                 if t.type is TYPE_TEXT
1895                         insert_character t
1896                         return
1897                 if t.type is TYPE_COMMENT
1898                         insert_comment t
1899                         return
1900                 if t.type is TYPE_DOCTYPE
1901                         parse_error()
1902                         return
1903                 if t.type is TYPE_START_TAG and t.name is 'html'
1904                         ins_mode_in_body t
1905                         return
1906                 if t.type is TYPE_START_TAG and t.name is 'option'
1907                         if open_els[0].name is 'option'
1908                                 open_els.shift()
1909                         insert_html_element t
1910                         return
1911                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1912                         if open_els[0].name is 'option'
1913                                 open_els.shift()
1914                         if open_els[0].name is 'optgroup'
1915                                 open_els.shift()
1916                         insert_html_element t
1917                         return
1918                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1919                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1920                                 open_els.shift()
1921                         if open_els[0].name is 'optgroup'
1922                                 open_els.shift()
1923                         else
1924                                 parse_error()
1925                         return
1926                 if t.type is TYPE_END_TAG and t.name is 'option'
1927                         if open_els[0].name is 'option'
1928                                 open_els.shift()
1929                         else
1930                                 parse_error()
1931                         return
1932                 if t.type is TYPE_END_TAG and t.name is 'select'
1933                         if is_in_select_scope 'select'
1934                                 loop
1935                                         el = open_els.shift()
1936                                         if el.name is 'select'
1937                                                 break
1938                                 reset_insertion_mode()
1939                         else
1940                                 parse_error()
1941                         return
1942                 if t.type is TYPE_START_TAG and t.name is 'select'
1943                         parse_error()
1944                         loop
1945                                 el = open_els.shift()
1946                                 if el.name is 'select'
1947                                         break
1948                         reset_insertion_mode()
1949                         # spec says that this is the same as </select> but it doesn't say
1950                         # to check scope first
1951                         return
1952                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1953                         parse_error()
1954                         if is_in_select_scope 'select'
1955                                 return
1956                         loop
1957                                 el = open_els.shift()
1958                                 if el.name is 'select'
1959                                         break
1960                         reset_insertion_mode()
1961                         insertion_mode t
1962                         return
1963                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1964                         ins_mode_in_head t
1965                         return
1966                 if t.type is TYPE_EOF
1967                         ins_mode_in_body t
1968                         return
1969                 # Anything else
1970                 parse_error()
1971                 return
1972
1973         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1974         ins_mode_in_select_in_table = (t) ->
1975                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1976                         parse_error()
1977                         loop
1978                                 el = open_els.shift()
1979                                 if el.name is 'select'
1980                                         break
1981                         reset_insertion_mode()
1982                         insertion_mode t
1983                         return
1984                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1985                         parse_error()
1986                         unless is_in_table_scope t.name, NS_HTML
1987                                 return
1988                         loop
1989                                 el = open_els.shift()
1990                                 if el.name is 'select'
1991                                         break
1992                         reset_insertion_mode()
1993                         insertion_mode t
1994                         return
1995                 # Anything else
1996                 ins_mode_in_select t
1997                 return
1998
1999         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2000         ins_mode_in_template = (t) ->
2001                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2002                         ins_mode_in_body t
2003                         return
2004                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2005                         ins_mode_in_head t
2006                         return
2007                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2008                         template_insertion_modes.shift()
2009                         template_insertion_modes.unshift ins_mode_in_table
2010                         insertion_mode = ins_mode_in_table
2011                         insertion_mode t
2012                         return
2013                 if t.type is TYPE_START_TAG and t.name is 'col'
2014                         template_insertion_modes.shift()
2015                         template_insertion_modes.unshift ins_mode_in_column_group
2016                         insertion_mode = ins_mode_in_column_group
2017                         insertion_mode t
2018                         return
2019                 if t.type is TYPE_START_TAG and t.name is 'tr'
2020                         template_insertion_modes.shift()
2021                         template_insertion_modes.unshift ins_mode_in_table_body
2022                         insertion_mode = ins_mode_in_table_body
2023                         insertion_mode t
2024                         return
2025                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2026                         template_insertion_modes.shift()
2027                         template_insertion_modes.unshift ins_mode_in_row
2028                         insertion_mode = ins_mode_in_row
2029                         insertion_mode t
2030                         return
2031                 if t.type is TYPE_START_TAG
2032                         template_insertion_modes.shift()
2033                         template_insertion_modes.unshift ins_mode_in_body
2034                         insertion_mode = ins_mode_in_body
2035                         insertion_mode t
2036                         return
2037                 if t.type is TYPE_END_TAG
2038                         parse_error()
2039                         return
2040                 if t.type is EOF
2041                         unless template_tag_is_open()
2042                                 stop_parsing()
2043                                 return
2044                         parse_error()
2045                         loop
2046                                 el = open_els.shift()
2047                                 if el.name is 'template' # fixfull check namespace
2048                                         break
2049                         clear_afe_to_marker()
2050                         template_insertion_modes.shift()
2051                         reset_insertion_mode()
2052                         insertion_mode t
2053
2054         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2055         ins_mode_after_body = (t) ->
2056                 if is_space_tok t
2057                         ins_mode_in_body t
2058                         return
2059                 if t.type is TYPE_COMMENT
2060                         insert_comment t, [open_els[0], open_els[0].children.length]
2061                         return
2062                 if t.type is TYPE_DOCTYPE
2063                         parse_error()
2064                         return
2065                 if t.type is TYPE_START_TAG and t.name is 'html'
2066                         ins_mode_in_body t
2067                         return
2068                 if t.type is TYPE_END_TAG and t.name is 'html'
2069                         # fixfull fragment case
2070                         insertion_mode = ins_mode_after_after_body
2071                         return
2072                 if t.type is TYPE_EOF
2073                         stop_parsing()
2074                         return
2075                 # Anything ELse
2076                 parse_error()
2077                 insertion_mode = ins_mode_in_body
2078                 insertion_mode t
2079
2080         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2081         ins_mode_in_frameset = (t) ->
2082                 if is_space_tok t
2083                         insert_character t
2084                         return
2085                 if t.type is TYPE_COMMENT
2086                         insert_comment t
2087                         return
2088                 if t.type is TYPE_DOCTYPE
2089                         parse_error()
2090                         return
2091                 if t.type is TYPE_START_TAG and t.name is 'html'
2092                         ins_mode_in_body t
2093                         return
2094                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2095                         insert_html_element t
2096                         return
2097                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2098                         # TODO ?correct for: "if the current node is the root html element"
2099                         if open_els.length is 1
2100                                 parse_error()
2101                                 return # fragment case
2102                         open_els.shift()
2103                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2104                                 insertion_mode = ins_mode_after_frameset
2105                         return
2106                 if t.type is TYPE_START_TAG and t.name is 'frame'
2107                         insert_html_element t
2108                         open_els.shift()
2109                         t.acknowledge_self_closing()
2110                         return
2111                 if t.type is TYPE_START TAG and t.name is 'noframes'
2112                         ins_mode_in_head t
2113                         return
2114                 if t.type is TYPE_EOF
2115                         # TODO ?correct for: "if the current node is not the root html element"
2116                         if open_els.length isnt 1
2117                                 parse_error()
2118                         stop_parsing()
2119                         return
2120                 # Anything else
2121                 parse_error()
2122                 return
2123
2124         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2125         ins_mode_after_frameset = (t) ->
2126                 if is_space_tok t
2127                         insert_character t
2128                         return
2129                 if t.type is TYPE_COMMENT
2130                         insert_comment t
2131                         return
2132                 if t.type is TYPE_DOCTYPE
2133                         parse_error()
2134                         return
2135                 if t.type is TYPE_START_TAG and t.name is 'html'
2136                         ins_mode_in_body t
2137                         return
2138                 if t.type is TYPE_END_TAG and t.name is 'html'
2139                         insert_mode = ins_mode_after_after_frameset
2140                         return
2141                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2142                         ins_mode_in_head t
2143                         return
2144                 if t.type is TYPE_EOF
2145                         stop_parsing()
2146                         return
2147                 # Anything else
2148                 parse_error()
2149                 return
2150
2151         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2152         ins_mode_after_after_body = (t) ->
2153                 if t.type is TYPE_COMMENT
2154                         insert_comment t, [doc, doc.children.length]
2155                         return
2156                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2157                         ins_mode_in_body t
2158                         return
2159                 if t.type is TYPE_EOF
2160                         stop_parsing()
2161                         return
2162                 # Anything else
2163                 parse_error()
2164                 insertion_mode = ins_mode_in_body
2165                 return
2166
2167         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2168         ins_mode_after_after_frameset = (t) ->
2169                 if t.type is TYPE_COMMENT
2170                         insert_comment t, [doc, doc.children.length]
2171                         return
2172                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2173                         ins_mode_in_body t
2174                         return
2175                 if t.type is TYPE_EOF
2176                         stop_parsing()
2177                         return
2178                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2179                         ins_mode_in_head t
2180                         return
2181                 # Anything else
2182                 parse_error()
2183                 return
2184
2185
2186
2187
2188
2189         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2190         tok_state_data = ->
2191                 switch c = txt.charAt(cur++)
2192                         when '&'
2193                                 return new_text_node parse_character_reference()
2194                         when '<'
2195                                 tok_state = tok_state_tag_open
2196                         when "\u0000"
2197                                 parse_error()
2198                                 return new_text_node c
2199                         when '' # EOF
2200                                 return new_eof_token()
2201                         else
2202                                 return new_text_node c
2203                 return null
2204
2205         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2206         # not needed: tok_state_character_reference_in_data = ->
2207         # just call parse_character_reference()
2208
2209         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2210         tok_state_rcdata = ->
2211                 switch c = txt.charAt(cur++)
2212                         when '&'
2213                                 return new_text_node parse_character_reference()
2214                         when '<'
2215                                 tok_state = tok_state_rcdata_less_than_sign
2216                         when "\u0000"
2217                                 parse_error()
2218                                 return new_character_token "\ufffd"
2219                         when '' # EOF
2220                                 return new_eof_token()
2221                         else
2222                                 return new_character_token c
2223                 return null
2224
2225         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2226         # not needed: tok_state_character_reference_in_rcdata = ->
2227         # just call parse_character_reference()
2228
2229         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2230         tok_state_rawtext = ->
2231                 switch c = txt.charAt(cur++)
2232                         when '<'
2233                                 tok_state = tok_state_rawtext_less_than_sign
2234                         when "\u0000"
2235                                 parse_error()
2236                                 return new_character_token "\ufffd"
2237                         when '' # EOF
2238                                 return new_eof_token()
2239                         else
2240                                 return new_character_token c
2241                 return null
2242
2243         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2244         tok_state_script_data = ->
2245                 switch c = txt.charAt(cur++)
2246                         when '<'
2247                                 tok_state = tok_state_script_data_less_than_sign
2248                         when "\u0000"
2249                                 parse_error()
2250                                 return new_character_token "\ufffd"
2251                         when '' # EOF
2252                                 return new_eof_token()
2253                         else
2254                                 return new_character_token c
2255                 return null
2256
2257         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2258         tok_state_plaintext = ->
2259                 switch c = txt.charAt(cur++)
2260                         when "\u0000"
2261                                 parse_error()
2262                                 return new_character_token "\ufffd"
2263                         when '' # EOF
2264                                 return new_eof_token()
2265                         else
2266                                 return new_character_token c
2267                 return null
2268
2269
2270         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2271         tok_state_tag_open = ->
2272                 switch c = txt.charAt(cur++)
2273                         when '!'
2274                                 tok_state = tok_state_markup_declaration_open
2275                         when '/'
2276                                 tok_state = tok_state_end_tag_open
2277                         when '?'
2278                                 parse_error()
2279                                 tok_state = tok_state_bogus_comment
2280                         else
2281                                 if lc_alpha.indexOf(c) > -1
2282                                         tok_cur_tag = new_open_tag c
2283                                         tok_state = tok_state_tag_name
2284                                 else if uc_alpha.indexOf(c) > -1
2285                                         tok_cur_tag = new_open_tag c.toLowerCase()
2286                                         tok_state = tok_state_tag_name
2287                                 else
2288                                         parse_error()
2289                                         tok_state = tok_state_data
2290                                         cur -= 1 # we didn't parse/handle the char after <
2291                                         return new_text_node '<'
2292                 return null
2293
2294         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2295         tok_state_end_tag_open = ->
2296                 switch c = txt.charAt(cur++)
2297                         when '>'
2298                                 parse_error()
2299                                 tok_state = tok_state_data
2300                         when '' # EOF
2301                                 parse_error()
2302                                 tok_state = tok_state_data
2303                                 return new_text_node '</'
2304                         else
2305                                 if uc_alpha.indexOf(c) > -1
2306                                         tok_cur_tag = new_end_tag c.toLowerCase()
2307                                         tok_state = tok_state_tag_name
2308                                 else if lc_alpha.indexOf(c) > -1
2309                                         tok_cur_tag = new_end_tag c
2310                                         tok_state = tok_state_tag_name
2311                                 else
2312                                         parse_error()
2313                                         tok_state = tok_state_bogus_comment
2314                 return null
2315
2316         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2317         tok_state_tag_name = ->
2318                 switch c = txt.charAt(cur++)
2319                         when "\t", "\n", "\u000c", ' '
2320                                 tok_state = tok_state_before_attribute_name
2321                         when '/'
2322                                 tok_state = tok_state_self_closing_start_tag
2323                         when '>'
2324                                 tok_state = tok_state_data
2325                                 tmp = tok_cur_tag
2326                                 tok_cur_tag = null
2327                                 return tmp
2328                         when "\u0000"
2329                                 parse_error()
2330                                 tok_cur_tag.name += "\ufffd"
2331                         when '' # EOF
2332                                 parse_error()
2333                                 tok_state = tok_state_data
2334                         else
2335                                 if uc_alpha.indexOf(c) > -1
2336                                         tok_cur_tag.name += c.toLowerCase()
2337                                 else
2338                                         tok_cur_tag.name += c
2339                 return null
2340
2341         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2342         tok_state_rcdata_less_than_sign = ->
2343                 c = txt.charAt(cur++)
2344                 if c is '/'
2345                         temporary_buffer = ''
2346                         tok_state = tok_state_rcdata_end_tag_open
2347                         return null
2348                 # Anything else
2349                 tok_state = tok_state_rcdata
2350                 cur -= 1 # reconsume the input character
2351                 return new_character_token '<'
2352
2353         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2354         tok_state_rcdata_end_tag_open = ->
2355                 c = txt.charAt(cur++)
2356                 if uc_alpha.indexOf(c) > -1
2357                         tok_cur_tag = new_end_tag c.toLowerCase()
2358                         temporary_buffer += c
2359                         tok_state = tok_state_rcdata_end_tag_name
2360                         return null
2361                 if lc_alpha.indexOf(c) > -1
2362                         tok_cur_tag = new_end_tag c
2363                         temporary_buffer += c
2364                         tok_state = tok_state_rcdata_end_tag_name
2365                         return null
2366                 # Anything else
2367                 tok_state = tok_state_rcdata
2368                 cur -= 1 # reconsume the input character
2369                 return new_character_token "</" # fixfull separate these
2370
2371         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2372         is_appropriate_end_tag = (t) ->
2373                 # spec says to check against "the tag name of the last start tag to
2374                 # have been emitted from this tokenizer", but this is only called from
2375                 # the various "raw" states, which I'm pretty sure all push the start
2376                 # token onto open_els. TODO: verify this after the script data states
2377                 # are implemented
2378                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2379                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2380
2381         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2382         tok_state_rcdata_end_tag_name = ->
2383                 c = txt.charAt(cur++)
2384                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2385                         if is_appropriate_end_tag tok_cur_tag
2386                                 tok_state = tok_state_before_attribute_name
2387                                 return
2388                         # else fall through to "Anything else"
2389                 if c is '/'
2390                         if is_appropriate_end_tag tok_cur_tag
2391                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2392                                 return
2393                         # else fall through to "Anything else"
2394                 if c is '>'
2395                         if is_appropriate_end_tag tok_cur_tag
2396                                 tok_state = tok_state_data
2397                                 return tok_cur_tag
2398                         # else fall through to "Anything else"
2399                 if uc_alpha.indexOf(c) > -1
2400                         tok_cur_tag.name += c.toLowerCase()
2401                         temporary_buffer += c
2402                         return null
2403                 if lc_alpha.indexOf(c) > -1
2404                         tok_cur_tag.name += c
2405                         temporary_buffer += c
2406                         return null
2407                 # Anything else
2408                 tok_state = tok_state_rcdata
2409                 cur -= 1 # reconsume the input character
2410                 return new_character_token '</' + temporary_buffer # fixfull separate these
2411
2412         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2413         tok_state_rawtext_less_than_sign = ->
2414                 c = txt.charAt(cur++)
2415                 if c is '/'
2416                         temporary_buffer = ''
2417                         tok_state = tok_state_rawtext_end_tag_open
2418                         return null
2419                 # Anything else
2420                 tok_state = tok_state_rawtext
2421                 cur -= 1 # reconsume the input character
2422                 return new_character_token '<'
2423
2424         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2425         tok_state_rawtext_end_tag_open = ->
2426                 c = txt.charAt(cur++)
2427                 if uc_alpha.indexOf(c) > -1
2428                         tok_cur_tag = new_end_tag c.toLowerCase()
2429                         temporary_buffer += c
2430                         tok_state = tok_state_rawtext_end_tag_name
2431                         return null
2432                 if lc_alpha.indexOf(c) > -1
2433                         tok_cur_tag = new_end_tag c
2434                         temporary_buffer += c
2435                         tok_state = tok_state_rawtext_end_tag_name
2436                         return null
2437                 # Anything else
2438                 tok_state = tok_state_rawtext
2439                 cur -= 1 # reconsume the input character
2440                 return new_character_token "</" # fixfull separate these
2441
2442         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2443         tok_state_rawtext_end_tag_name = ->
2444                 c = txt.charAt(cur++)
2445                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2446                         if is_appropriate_end_tag tok_cur_tag
2447                                 tok_state = tok_state_before_attribute_name
2448                                 return
2449                         # else fall through to "Anything else"
2450                 if c is '/'
2451                         if is_appropriate_end_tag tok_cur_tag
2452                                 tok_state = tok_state_self_closing_start_tag
2453                                 return
2454                         # else fall through to "Anything else"
2455                 if c is '>'
2456                         if is_appropriate_end_tag tok_cur_tag
2457                                 tok_state = tok_state_data
2458                                 return tok_cur_tag
2459                         # else fall through to "Anything else"
2460                 if uc_alpha.indexOf(c) > -1
2461                         tok_cur_tag.name += c.toLowerCase()
2462                         temporary_buffer += c
2463                         return null
2464                 if lc_alpha.indexOf(c) > -1
2465                         tok_cur_tag.name += c
2466                         temporary_buffer += c
2467                         return null
2468                 # Anything else
2469                 tok_state = tok_state_rawtext
2470                 cur -= 1 # reconsume the input character
2471                 return new_character_token '</' + temporary_buffer # fixfull separate these
2472
2473         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2474
2475         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2476         tok_state_before_attribute_name = ->
2477                 attr_name = null
2478                 switch c = txt.charAt(cur++)
2479                         when "\t", "\n", "\u000c", ' '
2480                                 return null
2481                         when '/'
2482                                 tok_state = tok_state_self_closing_start_tag
2483                                 return null
2484                         when '>'
2485                                 tok_state = tok_state_data
2486                                 tmp = tok_cur_tag
2487                                 tok_cur_tag = null
2488                                 return tmp
2489                         when "\u0000"
2490                                 parse_error()
2491                                 attr_name = "\ufffd"
2492                         when '"', "'", '<', '='
2493                                 parse_error()
2494                                 attr_name = c
2495                         when '' # EOF
2496                                 parse_error()
2497                                 tok_state = tok_state_data
2498                         else
2499                                 if uc_alpha.indexOf(c) > -1
2500                                         attr_name = c.toLowerCase()
2501                                 else
2502                                         attr_name = c
2503                 if attr_name?
2504                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2505                         tok_state = tok_state_attribute_name
2506                 return null
2507
2508         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2509         tok_state_attribute_name = ->
2510                 switch c = txt.charAt(cur++)
2511                         when "\t", "\n", "\u000c", ' '
2512                                 tok_state = tok_state_after_attribute_name
2513                         when '/'
2514                                 tok_state = tok_state_self_closing_start_tag
2515                         when '='
2516                                 tok_state = tok_state_before_attribute_value
2517                         when '>'
2518                                 tok_state = tok_state_data
2519                                 tmp = tok_cur_tag
2520                                 tok_cur_tag = null
2521                                 return tmp
2522                         when "\u0000"
2523                                 parse_error()
2524                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2525                         when '"', "'", '<'
2526                                 parse_error()
2527                                 tok_cur_tag.attrs_a[0][0] = c
2528                         when '' # EOF
2529                                 parse_error()
2530                                 tok_state = tok_state_data
2531                         else
2532                                 if uc_alpha.indexOf(c) > -1
2533                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2534                                 else
2535                                         tok_cur_tag.attrs_a[0][0] += c
2536                 return null
2537
2538         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2539         tok_state_after_attribute_name = ->
2540                 c = txt.charAt(cur++)
2541                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2542                         return
2543                 if c is '/'
2544                         tok_state = tok_state_self_closing_start_tag
2545                         return
2546                 if c is '='
2547                         tok_state = tok_state_before_attribute_value
2548                         return
2549                 if c is '>'
2550                         tok_state = tok_state_data
2551                         return
2552                 if uc_alpha.indexOf(c) > -1
2553                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2554                         tok_state = tok_state_attribute_name
2555                         return
2556                 if c is "\u0000"
2557                         parse_error()
2558                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2559                         tok_state = tok_state_attribute_name
2560                         return
2561                 if c is '' # EOF
2562                         parse_error()
2563                         tok_state = tok_state_data
2564                         cur -= 1 # reconsume
2565                         return
2566                 if c is '"' or c is "'" or c is '<'
2567                         parse_error()
2568                         # fall through to Anything else
2569                 # Anything else
2570                 tok_cur_tag.attrs_a.unshift [c, '']
2571                 tok_state = tok_state_attribute_name
2572
2573         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2574         tok_state_before_attribute_value = ->
2575                 switch c = txt.charAt(cur++)
2576                         when "\t", "\n", "\u000c", ' '
2577                                 return null
2578                         when '"'
2579                                 tok_state = tok_state_attribute_value_double_quoted
2580                         when '&'
2581                                 tok_state = tok_state_attribute_value_unquoted
2582                                 cur -= 1
2583                         when "'"
2584                                 tok_state = tok_state_attribute_value_single_quoted
2585                         when "\u0000"
2586                                 # Parse error
2587                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2588                                 tok_state = tok_state_attribute_value_unquoted
2589                         when '>'
2590                                 # Parse error
2591                                 tok_state = tok_state_data
2592                                 tmp = tok_cur_tag
2593                                 tok_cur_tag = null
2594                                 return tmp
2595                         when '' # EOF
2596                                 parse_error()
2597                                 tok_state = tok_state_data
2598                         else
2599                                 tok_cur_tag.attrs_a[0][1] += c
2600                                 tok_state = tok_state_attribute_value_unquoted
2601                 return null
2602
2603         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2604         tok_state_attribute_value_double_quoted = ->
2605                 switch c = txt.charAt(cur++)
2606                         when '"'
2607                                 tok_state = tok_state_after_attribute_value_quoted
2608                         when '&'
2609                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2610                         when "\u0000"
2611                                 # Parse error
2612                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2613                         when '' # EOF
2614                                 parse_error()
2615                                 tok_state = tok_state_data
2616                         else
2617                                 tok_cur_tag.attrs_a[0][1] += c
2618                 return null
2619
2620         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2621         tok_state_attribute_value_single_quoted = ->
2622                 switch c = txt.charAt(cur++)
2623                         when "'"
2624                                 tok_state = tok_state_after_attribute_value_quoted
2625                         when '&'
2626                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2627                         when "\u0000"
2628                                 # Parse error
2629                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2630                         when '' # EOF
2631                                 parse_error()
2632                                 tok_state = tok_state_data
2633                         else
2634                                 tok_cur_tag.attrs_a[0][1] += c
2635                 return null
2636
2637         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2638         tok_state_attribute_value_unquoted = ->
2639                 switch c = txt.charAt(cur++)
2640                         when "\t", "\n", "\u000c", ' '
2641                                 tok_state = tok_state_before_attribute_name
2642                         when '&'
2643                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2644                         when '>'
2645                                 tok_state = tok_state_data
2646                                 tmp = tok_cur_tag
2647                                 tok_cur_tag = null
2648                                 return tmp
2649                         when "\u0000"
2650                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2651                         when '' # EOF
2652                                 parse_error()
2653                                 tok_state = tok_state_data
2654                         else
2655                                 # Parse Error if ', <, = or ` (backtick)
2656                                 tok_cur_tag.attrs_a[0][1] += c
2657                 return null
2658
2659         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2660         tok_state_after_attribute_value_quoted = ->
2661                 switch c = txt.charAt(cur++)
2662                         when "\t", "\n", "\u000c", ' '
2663                                 tok_state = tok_state_before_attribute_name
2664                         when '/'
2665                                 tok_state = tok_state_self_closing_start_tag
2666                         when '>'
2667                                 tok_state = tok_state_data
2668                                 tmp = tok_cur_tag
2669                                 tok_cur_tag = null
2670                                 return tmp
2671                         when '' # EOF
2672                                 parse_error()
2673                                 tok_state = tok_state_data
2674                         else
2675                                 # Parse Error
2676                                 tok_state = tok_state_before_attribute_name
2677                                 cur -= 1 # we didn't handle that char
2678                 return null
2679
2680         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2681         # Don't set this as a state, just call it
2682         # returns a string (NOT a text node)
2683         parse_character_reference = (allowed_char = null, in_attr = false) ->
2684                 if cur >= txt.length
2685                         return '&'
2686                 switch c = txt.charAt(cur)
2687                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2688                                 # explicitly not a parse error
2689                                 return '&'
2690                         when ';'
2691                                 # there has to be "one or more" alnums between & and ; to be a parse error
2692                                 return '&'
2693                         when '#'
2694                                 if cur + 1 >= txt.length
2695                                         return '&'
2696                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2697                                         prefix = '#x'
2698                                         charset = hex_chars
2699                                         start = cur + 2
2700                                 else
2701                                         charset = digits
2702                                         start = cur + 1
2703                                         prefix = '#'
2704                                 i = 0
2705                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2706                                         i += 1
2707                                 if i is 0
2708                                         return '&'
2709                                 if txt.charAt(start + i) is ';'
2710                                         i += 1
2711                                 # FIXME This is supposed to generate parse errors for some chars
2712                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2713                                 if decoded?
2714                                         cur = start + i
2715                                         return decoded
2716                                 return '&'
2717                         else
2718                                 for i in [0...31]
2719                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2720                                                 break
2721                                 if i is 0
2722                                         # exit early, because parse_error() below needs at least one alnum
2723                                         return '&'
2724                                 if txt.charAt(cur + i) is ';'
2725                                         i += 1 # include ';' terminator in value
2726                                         decoded = decode_named_char_ref txt.substr(cur, i)
2727                                         if decoded?
2728                                                 cur += i
2729                                                 return decoded
2730                                         parse_error()
2731                                         return '&'
2732                                 else
2733                                         # no ';' terminator (only legacy char refs)
2734                                         max = i
2735                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2736                                                 c = legacy_char_refs[txt.substr(cur, i)]
2737                                                 if c?
2738                                                         if in_attr
2739                                                                 if txt.charAt(cur + i) is '='
2740                                                                         # "because some legacy user agents will
2741                                                                         # misinterpret the markup in those cases"
2742                                                                         parse_error()
2743                                                                         return '&'
2744                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2745                                                                         # this makes attributes forgiving about url args
2746                                                                         return '&'
2747                                                         # ok, and besides the weird exceptions for attributes...
2748                                                         # return the matching char
2749                                                         cur += i # consume entity chars
2750                                                         parse_error() # because no terminating ";"
2751                                                         return c
2752                                         parse_error()
2753                                         return '&'
2754                 return # never reached
2755
2756         # tree constructor initialization
2757         # see comments on TYPE_TAG/etc for the structure of this data
2758         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2759         open_els = [doc]
2760         afe = [] # active formatting elements
2761         template_insertion_modes = []
2762         insertion_mode = ins_mode_initial
2763         original_insertion_mode = insertion_mode # TODO check spec
2764         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2765         flag_frameset_ok = true
2766         flag_parsing = true
2767         flag_foster_parenting = false
2768         form_element_pointer = null
2769         temporary_buffer = null
2770         pending_table_character_tokens = []
2771         head_element_pointer = null
2772         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
2773
2774         # tokenizer initialization
2775         tok_state = tok_state_data
2776
2777         # proccess input
2778         while flag_parsing
2779                 t = tok_state()
2780                 if t?
2781                         insertion_mode t
2782                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
2783         return doc.children
2784
2785 serialize_els = (els, shallow, show_ids) ->
2786         serialized = ''
2787         sep = ''
2788         for t in els
2789                 serialized += sep
2790                 sep = ','
2791                 serialized += t.serialize shallow, show_ids
2792         return serialized
2793
2794 # TODO export TYPE_*
2795 module.exports.parse_html = parse_html
2796 module.exports.debug_log_reset = debug_log_reset
2797 module.exports.debug_log_each = debug_log_each
2798 module.exports.TYPE_TAG = TYPE_TAG
2799 module.exports.TYPE_TEXT = TYPE_TEXT
2800 module.exports.TYPE_COMMENT = TYPE_COMMENT
2801 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE