JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implemented more of ins_mode_in_body
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
# the spec uses many different words to indicate which ends of lists/stacks
# it is talking about (and relative movement within the lists/stacks). This
# section explains them. I'm implementing "lists" (afe and open_els) the same
# way (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
# Browser fallback: when no `module.exports` is available, create an empty
# object, publish it as window.wheic and point module.exports at it.
# NOTE(review): CoffeeScript declares every assigned variable with `var` at the
# top of its scope, so the assignment below probably shadows a CommonJS
# `module` -- confirm before relying on the non-browser path.
if not module?.exports?
        module = exports: (window.wheic = {})
56
# Each node is an object of the Node class. Node types, numbered from 0:
[
        TYPE_TAG          # 0: name, {attributes}, [children]
        TYPE_TEXT         # 1: "text"
        TYPE_COMMENT      # 2
        TYPE_DOCTYPE      # 3
        # the following types are emitted by the tokenizer, but shouldn't end up
        # in the tree:
        TYPE_START_TAG    # 4: name, [attributes ([key,value]...) in reverse order], [children]
        TYPE_END_TAG      # 5: name
        TYPE_EOF          # 6
        TYPE_AFE_MARKER   # 7: http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
        TYPE_AAA_BOOKMARK # 8: http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
] = [0..8]

# namespace constants
[NS_HTML, NS_MATHML, NS_SVG] = [1, 2, 3]
73
# In-memory debug log: debug_log() appends a line, debug_log_reset() replaces
# the log with an empty one, debug_log_each() passes every line to a callback.
g_debug_log = []

debug_log_reset = -> g_debug_log = []

debug_log = (str) -> g_debug_log.push str

debug_log_each = (cb) ->
        (cb(str) for str in g_debug_log)
82
# counter behind the auto-assigned Node ids
prev_node_id = 0

# One class for tree nodes, tokenizer tokens and list markers; @type (one of
# the TYPE_* constants) tells them apart.
#
# constructor args (all optional): name, text, attrs, attrs_a (or attr_k),
# children, namespace, parent, token, id.
class Node
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_START_TAG only. Accepted under the
                # property's own name (attrs_a) as well as the attr_k key, so
                # that passing attrs_a is not silently ignored.
                @attrs_a = args.attrs_a ? args.attr_k ? []
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                # an explicit id gets a "+" suffix; otherwise take the next
                # value of the file-wide counter
                if args.id?
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"

        # Records that a self-closing flag was acknowledged: on the token this
        # node was built from when there is one, otherwise on the node itself.
        acknowledge_self_closing: ->
                if @token?
                        @token.flag 'did_self_close', true
                else
                        @flag 'did_self_close', true

        # placeholder: flags are not stored yet
        flag: ->
                # fixfull

        # Returns a compact text form of this node (for unit tests).
        #   shallow:  for tags, stop after the name (no attributes/children)
        #   show_ids: for tags, include "#<id>," after the name
        serialize: (shallow = false, show_ids = false) ->
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                if show_ids
                                        ret += "##{@id},"
                                unless shallow
                                        # attributes are emitted in sorted key order
                                        attr_keys = (k for k of @attrs)
                                        attr_keys.sort()
                                        ret += '{'
                                        ret += ("#{JSON.stringify k}:#{JSON.stringify @attrs[k]}" for k in attr_keys).join ','
                                        ret += '},['
                                        ret += (c.serialize(shallow, show_ids) for c in @children).join ','
                                        ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
                        when TYPE_AFE_MARKER
                                ret += 'marker'
                        when TYPE_AAA_BOOKMARK
                                ret += 'aaa_bookmark'
                        else
                                ret += 'unknown:'
                                # Log only scalar fields: JSON.stringify on the whole
                                # node throws a TypeError if @parent/@children/@token
                                # links form a cycle.
                                console.log "unknown: type=#{@type} name=#{JSON.stringify @name}"
                return ret
150
151 # helpers: (only take args that are normally known when parser creates nodes)
# one-line factories, one per node/token type
new_open_tag = (name) -> new Node TYPE_START_TAG, {name}
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}
new_element = (name) -> new Node TYPE_TAG, {name}
new_text_node = (txt) -> new Node TYPE_TEXT, text: txt
# character tokens and text nodes share one representation
new_character_token = new_text_node
new_comment_token = (txt) -> new Node TYPE_COMMENT, text: txt
new_doctype_token = (name) -> new Node TYPE_DOCTYPE, {name}
new_eof_token = -> new Node TYPE_EOF
new_afe_marker = -> new Node TYPE_AFE_MARKER
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
171
# character classes, as strings to be searched with indexOf
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# true only for a single upper-case ASCII letter
is_uc_alpha = (str) ->
        return false unless str.length is 1
        uc_alpha.indexOf(str) isnt -1

# true only for a single lower-case ASCII letter
is_lc_alpha = (str) ->
        return false unless str.length is 1
        lc_alpha.indexOf(str) isnt -1

# some SVG elements have dashes in them
tag_name_chars = "#{alnum}-"
185
186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
# tab, line feed, form feed, carriage return, space
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# true only for a one-character string drawn from space_chars
is_space = (txt) ->
        return false unless txt.length is 1
        space_chars.indexOf(txt) isnt -1
# true when token t is a text token holding exactly one space character
is_space_tok = (t) ->
        return false unless t.type is TYPE_TEXT
        is_space t.text
192
# Reports whether start-tag token t has a "type" attribute whose value is
# "hidden" (compared case-insensitively). Only the first "type" entry found in
# t.attrs_a decides; any other kind of token yields false.
is_input_hidden_tok = (t) ->
        return false unless t.type is TYPE_START_TAG
        # attrs_a is an array of [key, value] pairs, so iterate its elements
        # with `in` (`of` would walk the index keys instead)
        for a in t.attrs_a
                if a[0] is 'type'
                        return a[1].toLowerCase() is 'hidden'
        return false
201
202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# first line: ASCII range; second: U+0085..U+200A; third: the rest
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020" +
        "\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a" +
        "\u2028\u2029\u202f\u205f\u3000"
204
205 # These are the character references that don't need a terminating semicolon
206 # min length: 2, max: 6, none are a prefix of any other.
# name -> replacement text, one line per initial letter
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä',
        brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©', curren: '¤',
        deg: '°', divide: '÷',
        Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë',
        frac12: '½', frac14: '¼', frac34: '¾',
        GT: '>', gt: '>',
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï',
        laquo: '«', LT: '<', lt: '<',
        macr: '¯', micro: 'µ', middot: '·',
        nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ',
        Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø', Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö',
        para: '¶', plusmn: '±', pound: '£',
        QUOT: '"', quot: '"',
        raquo: '»', REG: '®', reg: '®',
        # shy is the (invisible) soft hyphen, written as an escape
        sect: '§', shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß',
        THORN: 'Þ', thorn: 'þ', times: '×',
        Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù', ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü',
        Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ'
}
227
# element-kind lists, written as space-separated names
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# SVG element names, grouped by prefix
svg_elements = [
        'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem',
        'animate', 'animateColor', 'animateMotion', 'animateTransform',
        'circle', 'clipPath', 'color-profile', 'cursor',
        'defs', 'desc', 'ellipse',
        'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite',
        'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap',
        'feDistantLight', 'feFlood', 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR',
        'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology',
        'feOffset', 'fePointLight', 'feSpecularLighting', 'feSpotLight', 'feTile',
        'feTurbulence', 'filter',
        'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
        'font-face-uri', 'foreignObject',
        'g', 'glyph', 'glyphRef', 'hkern', 'image',
        'line', 'linearGradient',
        'marker', 'mask', 'metadata', 'missing-glyph', 'mpath',
        'path', 'pattern', 'polygon', 'polyline',
        'radialGradient', 'rect',
        'script', 'set', 'stop', 'style', 'svg', 'switch', 'symbol',
        'text', 'textPath', 'title', 'tref', 'tspan',
        'use', 'view', 'vkern'
]
249
250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# MathML element names (each name listed once)
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
281 # foreign_elements = [svg_elements..., mathml_elements...]
282 #normal_elements = All other allowed HTML elements are normal elements.
283
# name -> namespace of the "special" elements.
#
# NOTE: a name can map to only one namespace here, and "title" is wanted in
# both the HTML and SVG groups. The literal previously listed it twice and the
# later (SVG) entry silently won; it is now listed once, with the same
# effective value. Code that needs to recognize the HTML <title> must check for
# it explicitly.
special_elements = {
        # HTML:
        address: NS_HTML, applet: NS_HTML, area: NS_HTML, article: NS_HTML,
        aside: NS_HTML, base: NS_HTML, basefont: NS_HTML, bgsound: NS_HTML,
        blockquote: NS_HTML, body: NS_HTML, br: NS_HTML, button: NS_HTML,
        caption: NS_HTML, center: NS_HTML, col: NS_HTML, colgroup: NS_HTML,
        dd: NS_HTML, details: NS_HTML, dir: NS_HTML, div: NS_HTML, dl: NS_HTML,
        dt: NS_HTML, embed: NS_HTML, fieldset: NS_HTML, figcaption: NS_HTML,
        figure: NS_HTML, footer: NS_HTML, form: NS_HTML, frame: NS_HTML,
        frameset: NS_HTML, h1: NS_HTML, h2: NS_HTML, h3: NS_HTML, h4: NS_HTML,
        h5: NS_HTML, h6: NS_HTML, head: NS_HTML, header: NS_HTML, hgroup: NS_HTML,
        hr: NS_HTML, html: NS_HTML, iframe: NS_HTML, img: NS_HTML, input: NS_HTML,
        isindex: NS_HTML, li: NS_HTML, link: NS_HTML, listing: NS_HTML,
        main: NS_HTML, marquee: NS_HTML, meta: NS_HTML, nav: NS_HTML,
        noembed: NS_HTML, noframes: NS_HTML, noscript: NS_HTML, object: NS_HTML,
        ol: NS_HTML, p: NS_HTML, param: NS_HTML, plaintext: NS_HTML, pre: NS_HTML,
        script: NS_HTML, section: NS_HTML, select: NS_HTML, source: NS_HTML,
        style: NS_HTML, summary: NS_HTML, table: NS_HTML, tbody: NS_HTML,
        td: NS_HTML, template: NS_HTML, textarea: NS_HTML, tfoot: NS_HTML,
        th: NS_HTML, thead: NS_HTML, tr: NS_HTML, track: NS_HTML, ul: NS_HTML,
        wbr: NS_HTML, xmp: NS_HTML,

        # MathML:
        mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
        mtext: NS_MATHML, 'annotation-xml': NS_MATHML,

        # SVG:
        foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
}
312
# set of formatting element names (name -> true)
formatting_elements =
        a: true
        b: true
        big: true
        code: true
        em: true
        font: true
        i: true
        nobr: true
        s: true
        small: true
        strike: true
        strong: true
        tt: true
        u: true
318
# heading element names -> namespace
h_tags =
        h1: NS_HTML
        h2: NS_HTML
        h3: NS_HTML
        h4: NS_HTML
        h5: NS_HTML
        h6: NS_HTML
322
# set of table-related names used as foster parenting targets (name -> true)
foster_parenting_targets = { table: true, tbody: true, tfoot: true, thead: true, tr: true }
330
331 # all html I presume
# set of names for which an end tag is implied (name -> true)
end_tag_implied = {
        dd: true, dt: true, li: true,
        option: true, optgroup: true,
        p: true,
        rb: true, rp: true, rt: true, rtc: true
}
344
# Reports whether element e belongs to the "special" category, i.e. its
# name/namespace pair is listed in special_elements.
el_is_special = (e) ->
        # special_elements holds one namespace per name, so it cannot list
        # "title" for both HTML and SVG; test that name explicitly.
        if e.name is 'title'
                return e.namespace is NS_HTML or e.namespace is NS_SVG
        return special_elements[e.name] is e.namespace

# address, div and p: the special elements excluded by el_is_special_not_adp
adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }

# Like el_is_special(), but false for HTML address, div and p elements.
el_is_special_not_adp = (el) ->
        return el_is_special(el) and adp_els[el.name] isnt el.namespace
351
352 # decode_named_char_ref()
353 #
354 # The list of named character references is _huge_ so ask the browser to decode
355 # for us instead of wasting bandwidth/space on including the table here.
356 #
357 # Pass without the "&" but with the ";" examples:
358 #    for "&amp" pass "amp;"
359 #    for "&#x2032" pass "x2032;"
# state for decode_named_char_ref(): results already decoded, and the scratch
# <textarea> that does the decoding
g_dncr =
        cache: {}
        textarea: document.createElement('textarea')

# TODO test this in IE8
# Decodes one character reference, given without the leading "&". Returns the
# decoded text, or null when the textarea hands the text back unchanged.
# Successful results are cached.
decode_named_char_ref = (txt) ->
        txt = "&#{txt}"
        decoded = g_dncr.cache[txt]
        unless decoded?
                # assigning innerHTML and reading value back makes the browser
                # do the decoding
                g_dncr.textarea.innerHTML = txt
                decoded = g_dncr.textarea.value
                return null if decoded is txt
                g_dncr.cache[txt] = decoded
        return decoded
373
374 parse_html = (txt, parse_error_cb = null) ->
375         cur = 0 # index of next char in txt to be parsed
376         # declare doc and tokenizer variables so they're in scope below
377         doc = null
378         open_els = null # stack of open elements
379         afe = null # active formatting elements
380         template_ins_modes = null
381         ins_mode = null
382         original_ins_mode = null
383         tok_state = null
384         tok_cur_tag = null # partially parsed tag
385         flag_scripting = null
386         flag_frameset_ok = null
387         flag_parsing = null
388         flag_foster_parenting = null
389         form_element_pointer = null
390         temporary_buffer = null
391         pending_table_character_tokens = null
392         head_element_pointer = null
393         flag_fragment_parsing = null
394         context_element = null
395
396         stop_parsing = ->
397                 flag_parsing = false
398
399         parse_error = ->
400                 if parse_error_cb?
401                         parse_error_cb cur
402                 else
403                         console.log "Parse error at character #{cur} of #{txt.length}"
404
405         afe_push = (new_el) ->
406                 matches = 0
407                 for el, i in afe
408                         if el.name is new_el.name and el.namespace is new_el.namespace
409                                 for k, v of el.attrs
410                                         continue unless new_el.attrs[k] is v
411                                 for k, v of new_el.attrs
412                                         continue unless el.attrs[k] is v
413                                 matches += 1
414                                 if matches is 3
415                                         afe.splice i, 1
416                                         break
417                 afe.unshift new_el
418         afe_push_marker = ->
419                 afe.unshift new_afe_marker()
420
        # the functions below implement the Tree Construction algorithm
422         # http://www.w3.org/TR/html5/syntax.html#tree-construction
423
424         # But first... the helpers
425         template_tag_is_open = ->
426                 for t in open_els
427                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
428                                 return true
429                 return false
430         is_in_scope_x = (tag_name, scope, namespace) ->
431                 for t in open_els
432                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
433                                 return true
434                         if scope[t.name] is t.namespace
435                                 return false
436                 return false
437         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
438                 for t in open_els
439                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
440                                 return true
441                         if scope[t.name] is t.namespace
442                                 return false
443                         if scope2[t.name] is t.namespace
444                                 return false
445                 return false
446         standard_scopers = {
447                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
448                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
449                 template: NS_HTML, mi: NS_MATHML,
450
451                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
452                 'annotation-xml': NS_MATHML,
453
454                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
455         }
456         button_scopers = button: NS_HTML
457         li_scopers = ol: NS_HTML, ul: NS_HTML
458         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
459         is_in_scope = (tag_name, namespace = null) ->
460                 return is_in_scope_x tag_name, standard_scopers, namespace
461         is_in_button_scope = (tag_name, namespace = null) ->
462                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
463         is_in_table_scope = (tag_name, namespace = null) ->
464                 return is_in_scope_x tag_name, table_scopers, namespace
465         is_in_select_scope = (tag_name, namespace = null) ->
466                 for t in open_els
467                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
468                                 return true
469                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
470                                 return false
471                 return false
472         # this checks for a particular element, not by name
473         el_is_in_scope = (el) ->
474                 for t in open_els
475                         if t is el
476                                 return true
477                         if standard_scopers[t.name] is t.namespace
478                                 return false
479                 return false
480
481         clear_to_table_stopers = {
482                 'table': true
483                 'template': true
484                 'html': true
485         }
486         clear_stack_to_table_context = ->
487                 loop
488                         if clear_to_table_stopers[open_els[0].name]?
489                                 break
490                         open_els.shift()
491                 return
492         clear_to_table_body_stopers = {
493                 'tbody': true
494                 'tfoot': true
495                 'thead': true
496                 'template': true
497                 'html': true
498         }
499         clear_stack_to_table_body_context = ->
500                 loop
501                         if clear_to_table_body_stopers[open_els[0].name]?
502                                 break
503                         open_els.shift()
504                 return
505         clear_to_table_row_stopers = {
506                 'tr': true
507                 'template': true
508                 'html': true
509         }
510         clear_stack_to_table_row_context = ->
511                 loop
512                         if clear_to_table_row_stopers[open_els[0].name]?
513                                 break
514                         open_els.shift()
515                 return
516         clear_afe_to_marker = ->
517                 loop
518                         return unless afe.length > 0 # this happens in fragment case, ?spec error
519                         el = afe.shift()
520                         if el.type is TYPE_AFE_MARKER
521                                 return
522                 return
523
524         # 8.2.3.1 ...
525         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
526         reset_ins_mode = ->
527                 # 1. Let last be false.
528                 last = false
529                 # 2. Let node be the last node in the stack of open elements.
530                 node_i = 0
531                 node = open_els[node_i]
532                 # 3. Loop: If node is the first node in the stack of open elements,
533                 # then set last to true, and, if the parser was originally created as
534                 # part of the HTML fragment parsing algorithm (fragment case) set node
535                 # to the context element.
536                 loop
537                         if node_i is open_els.length - 1
538                                 last = true
539                                 # fixfull (fragment case)
540
541                         # 4. If node is a select element, run these substeps:
542                         if node.name is 'select'
543                                 # 1. If last is true, jump to the step below labeled done.
544                                 unless last
545                                         # 2. Let ancestor be node.
546                                         ancestor_i = node_i
547                                         ancestor = node
548                                         # 3. Loop: If ancestor is the first node in the stack of
549                                         # open elements, jump to the step below labeled done.
550                                         loop
551                                                 if ancestor_i is open_els.length - 1
552                                                         break
553                                                 # 4. Let ancestor be the node before ancestor in the stack
554                                                 # of open elements.
555                                                 ancestor_i += 1
556                                                 ancestor = open_els[ancestor_i]
557                                                 # 5. If ancestor is a template node, jump to the step below
558                                                 # labeled done.
559                                                 if ancestor.name is 'template'
560                                                         break
561                                                 # 6. If ancestor is a table node, switch the insertion mode
562                                                 # to "in select in table" and abort these steps.
563                                                 if ancestor.name is 'table'
564                                                         ins_mode = ins_mode_in_select_in_table
565                                                         return
566                                                 # 7. Jump back to the step labeled loop.
567                                 # 8. Done: Switch the insertion mode to "in select" and abort
568                                 # these steps.
569                                 ins_mode = ins_mode_in_select
570                                 return
571                         # 5. If node is a td or th element and last is false, then switch
572                         # the insertion mode to "in cell" and abort these steps.
573                         if (node.name is 'td' or node.name is 'th') and last is false
574                                 ins_mode = ins_mode_in_cell
575                                 return
576                         # 6. If node is a tr element, then switch the insertion mode to "in
577                         # row" and abort these steps.
578                         if node.name is 'tr'
579                                 ins_mode = ins_mode_in_row
580                                 return
581                         # 7. If node is a tbody, thead, or tfoot element, then switch the
582                         # insertion mode to "in table body" and abort these steps.
583                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
584                                 ins_mode = ins_mode_in_table_body
585                                 return
586                         # 8. If node is a caption element, then switch the insertion mode
587                         # to "in caption" and abort these steps.
588                         if node.name is 'caption'
589                                 ins_mode = ins_mode_in_caption
590                                 return
591                         # 9. If node is a colgroup element, then switch the insertion mode
592                         # to "in column group" and abort these steps.
593                         if node.name is 'colgroup'
594                                 ins_mode = ins_mode_in_column_group
595                                 return
596                         # 10. If node is a table element, then switch the insertion mode to
597                         # "in table" and abort these steps.
598                         if node.name is 'table'
599                                 ins_mode = ins_mode_in_table
600                                 return
601                         # 11. If node is a template element, then switch the insertion mode
602                         # to the current template insertion mode and abort these steps.
603                         # fixfull (template insertion mode stack)
604
605                         # 12. If node is a head element and last is true, then switch the
606                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
607                         # these steps. (fragment case)
608                         if node.name is 'head' and last
609                                 ins_mode = ins_mode_in_body
610                                 return
611                         # 13. If node is a head element and last is false, then switch the
612                         # insertion mode to "in head" and abort these steps.
613                         if node.name is 'head' and last is false
614                                 ins_mode = ins_mode_in_head
615                                 return
616                         # 14. If node is a body element, then switch the insertion mode to
617                         # "in body" and abort these steps.
618                         if node.name is 'body'
619                                 ins_mode = ins_mode_in_body
620                                 return
621                         # 15. If node is a frameset element, then switch the insertion mode
622                         # to "in frameset" and abort these steps. (fragment case)
623                         if node.name is 'frameset'
624                                 ins_mode = ins_mode_in_frameset
625                                 return
626                         # 16. If node is an html element, run these substeps:
627                         if node.name is 'html'
628                                 # 1. If the head element pointer is null, switch the insertion
629                                 # mode to "before head" and abort these steps. (fragment case)
630                                 if head_element_pointer is null
631                                         ins_mode = ins_mode_before_head
632                                 else
633                                         # 2. Otherwise, the head element pointer is not null,
634                                         # switch the insertion mode to "after head" and abort these
635                                         # steps.
636                                         ins_mode = ins_mode_after_head
637                                 return
638                         # 17. If last is true, then switch the insertion mode to "in body"
639                         # and abort these steps. (fragment case)
640                         if last
641                                 ins_mode = ins_mode_in_body
642                                 return
643                         # 18. Let node now be the node before node in the stack of open
644                         # elements.
645                         node_i += 1
646                         node = open_els[node_i]
647                         # 19. Return to the step labeled loop.
648
649         # 8.2.3.2
650
651         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
652         adjusted_current_node = ->
653                 # fragment parsing with exactly one open element: the context element stands in
654                 lone_open_el_in_fragment = flag_fragment_parsing and open_els.length is 1
655                 if lone_open_el_in_fragment then context_element else open_els[0]
656
657         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
658         # this implementation is structured (mostly) as described at the link above.
659         # capitalized comments are the "labels" described at the link above.
660         reconstruct_active_formatting_elements = ->
661                 return if afe.length is 0
662                 # an entry needs no work when it is a marker or is still in open_els
663                 settled = (entry) -> entry.type is TYPE_AFE_MARKER or entry in open_els
664                 return if settled afe[0]
665                 # Rewind
666                 # step toward higher indexes for as long as the next entry up is
667                 # not settled, stopping at the last index at the latest
668                 i = 0
669                 i += 1 while i < afe.length - 1 and not settled afe[i + 1]
670                 # Create
671                 # insert an element for the token of entry i and store it in that
672                 # slot, then do the same for each lower index down to 0
673                 while i >= 0
674                         afe[i] = insert_html_element afe[i].token
675                         i -= 1 # Advance
676                 # the loops above run for their side effects; there is no
677                 # meaningful return value
678                 return
679
680         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
681         # adoption agency algorithm. subject is a tag name (it is compared to element .name below)
682         # overview here:
683         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
684         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
685         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
686         adoption_agency = (subject) ->
687                 debug_log "adoption_agency()"
688                 debug_log "tree: #{serialize_els doc.children, false, true}"
689                 debug_log "open_els: #{serialize_els open_els, true, true}"
690                 debug_log "afe: #{serialize_els afe, true, true}"
691                 if open_els[0].name is subject
692                         el = open_els[0]
693                         open_els.shift()
694                         # remove it from the list of active formatting elements (if found)
695                         for t, i in afe
696                                 if t is el
697                                         afe.splice i, 1
698                                         break
699                         debug_log "aaa: starting off with subject on top of stack, exiting"
700                         return
701                 outer = 0 # outer loop counter: the loop below returns once 8 passes have run
702                 loop
703                         if outer >= 8
704                                 return
705                         outer += 1
706                         # 5. Let formatting element be the last element in the list of
707                         # active formatting elements that: is between the end of the list
708                         # and the last scope marker in the list, if any, or the start of
709                         # the list otherwise, and has the tag name subject.
710                         fe = null
711                         for t, fe_of_afe in afe
712                                 if t.type is TYPE_AFE_MARKER
713                                         break
714                                 if t.name is subject
715                                         fe = t
716                                         break
717                         # If there is no such element, then abort these steps and instead
718                         # act as described in the "any other end tag" entry above.
719                         if fe is null
720                                 debug_log "aaa: fe not found in afe"
721                                 in_body_any_other_end_tag subject
722                                 return
723                         # 6. If formatting element is not in the stack of open elements,
724                         # then this is a parse error; remove the element from the list, and
725                         # abort these steps.
726                         in_open_els = false
727                         for t, fe_of_open_els in open_els
728                                 if t is fe
729                                         in_open_els = true
730                                         break
731                         unless in_open_els
732                                 debug_log "aaa: fe not found in open_els"
733                                 parse_error()
734                                 # "remove it from the list" must mean afe, since it's not in open_els
735                                 afe.splice fe_of_afe, 1
736                                 return
737                         # 7. If formatting element is in the stack of open elements, but
738                         # the element is not in scope, then this is a parse error; abort
739                         # these steps.
740                         unless el_is_in_scope fe
741                                 debug_log "aaa: fe not in scope"
742                                 parse_error()
743                                 return
744                         # 8. If formatting element is not the current node, this is a parse
745                         # error. (But do not abort these steps.)
746                         unless open_els[0] is fe
747                                 parse_error()
748                                 # continue
749                         # 9. Let furthest block be the topmost node in the stack of open
750                         # elements that is lower in the stack than formatting element, and
751                         # is an element in the special category. There might not be one.
752                         fb = null
753                         fb_of_open_els = null
754                         for t, i in open_els
755                                 if t is fe
756                                         break
757                                 if el_is_special t
758                                         fb = t
759                                         fb_of_open_els = i
760                                         # and continue, to see if there's one that's more "topmost"
761                         # 10. If there is no furthest block, then the UA must first pop all
762                         # the nodes from the bottom of the stack of open elements, from the
763                         # current node up to and including formatting element, then remove
764                         # formatting element from the list of active formatting elements,
765                         # and finally abort these steps.
766                         if fb is null
767                                 debug_log "aaa: no fb"
768                                 loop
769                                         t = open_els.shift()
770                                         if t is fe
771                                                 afe.splice fe_of_afe, 1
772                                                 return
773                         # 11. Let common ancestor be the element immediately above
774                         # formatting element in the stack of open elements.
775                         ca = open_els[fe_of_open_els + 1] # common ancestor
776
777                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
778                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
779                         bookmark = new_aaa_bookmark()
780                         for t, i in afe
781                                 if t is fe
782                                         afe.splice i, 0, bookmark
783                                         break
784                         node = last_node = fb
785                         inner = 0
786                         loop
787                                 inner += 1
788                                 # 3. Let node be the element immediately above node in the
789                                 # stack of open elements, or if node is no longer in the stack
790                                 # of open elements (e.g. because it got removed by this
791                                 # algorithm), the element that was immediately above node in
792                                 # the stack of open elements before node was removed.
793                                 node_next = null
794                                 for t, i in open_els
795                                         if t is node
796                                                 node_next = open_els[i + 1]
797                                                 break
798                                 node = node_next ? node_above
799                                 debug_log "inner loop #{inner}"
800                                 debug_log "tree: #{serialize_els doc.children, false, true}"
801                                 debug_log "open_els: #{serialize_els open_els, true, true}"
802                                 debug_log "afe: #{serialize_els afe, true, true}"
803                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
804                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
805                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
806                                 debug_log "node: #{node.serialize true, true}"
807                                 # TODO make sure node_above gets re-set if/when node is removed from open_els (steps 6 and 7 below do re-set it)
808
809                                 # 4. If node is formatting element, then go to the next step in
810                                 # the overall algorithm.
811                                 if node is fe
812                                         break
813                                 debug_log "the meat"
814                                 # 5. If inner loop counter is greater than three and node is in
815                                 # the list of active formatting elements, then remove node from
816                                 # the list of active formatting elements.
817                                 node_in_afe = false
818                                 for t, i in afe
819                                         if t is node
820                                                 if inner > 3
821                                                         afe.splice i, 1
822                                                         debug_log "max out inner"
823                                                 else
824                                                         node_in_afe = true
825                                                         debug_log "in afe"
826                                                 break
827                                 # 6. If node is not in the list of active formatting elements,
828                                 # then remove node from the stack of open elements and then go
829                                 # back to the step labeled inner loop.
830                                 unless node_in_afe
831                                         debug_log "not in afe"
832                                         for t, i in open_els
833                                                 if t is node
834                                                         node_above = open_els[i + 1]
835                                                         open_els.splice i, 1
836                                                         break
837                                         continue
838                                 debug_log "the bones"
839                                 # 7. create an element for the token for which the element node
840                                 # was created, in the HTML namespace, with common ancestor as
841                                 # the intended parent; replace the entry for node in the list
842                                 # of active formatting elements with an entry for the new
843                                 # element, replace the entry for node in the stack of open
844                                 # elements with an entry for the new element, and let node be
845                                 # the new element.
846                                 new_node = token_to_element node.token, NS_HTML, ca
847                                 for t, i in afe
848                                         if t is node
849                                                 afe[i] = new_node
850                                                 debug_log "replaced in afe"
851                                                 break
852                                 for t, i in open_els
853                                         if t is node
854                                                 node_above = open_els[i + 1]
855                                                 open_els[i] = new_node
856                                                 debug_log "replaced in open_els"
857                                                 break
858                                 node = new_node
859                                 # 8. If last node is furthest block, then move the
860                                 # aforementioned bookmark to be immediately after the new node
861                                 # in the list of active formatting elements.
862                                 if last_node is fb
863                                         for t, i in afe
864                                                 if t is bookmark
865                                                         afe.splice i, 1
866                                                         debug_log "removed bookmark"
867                                                         break
868                                         for t, i in afe
869                                                 if t is node
870                                                         # "after" means lower
871                                                         afe.splice i, 0, bookmark # inserting at i leaves bookmark at index i and node at i + 1
872                                                         debug_log "placed bookmark after node"
873                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
874                                                         break
875                                 # 9. Insert last node into node, first removing it from its
876                                 # previous parent node if any.
877                                 if last_node.parent?
878                                         debug_log "last_node has parent"
879                                         for c, i in last_node.parent.children
880                                                 if c is last_node
881                                                         debug_log "removing last_node from parent"
882                                                         last_node.parent.children.splice i, 1
883                                                         break
884                                 node.children.push last_node
885                                 last_node.parent = node
886                                 # 10. Let last node be node.
887                                 last_node = node
888                                 debug_log "at last"
889                                 # 11. Return to the step labeled inner loop.
890                         # 14. Insert whatever last node ended up being in the previous step
891                         # at the appropriate place for inserting a node, but using common
892                         # ancestor as the override target.
893
894                         # In the case where fe is immediately followed by fb:
895                         #   * inner loop exits out early (node==fe)
896                         #   * last_node is fb
897                         #   * last_node is still in the tree (not a duplicate)
898                         if last_node.parent?
899                                 debug_log "FEFIRST? last_node has parent"
900                                 for c, i in last_node.parent.children
901                                         if c is last_node
902                                                 debug_log "removing last_node from parent"
903                                                 last_node.parent.children.splice i, 1
904                                                 break
905
906                         debug_log "after aaa inner loop"
907                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
908                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
909                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
910                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
911                         debug_log "tree: #{serialize_els doc.children, false, true}"
912
913                         debug_log "insert"
914
915
916                         # can't use standard insert token thing, because it's already in
917                         # open_els and must stay at its current position in open_els
918                         dest = adjusted_insertion_location ca
919                         dest[0].children.splice dest[1], 0, last_node
920                         last_node.parent = dest[0]
921
922
923                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
924                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
925                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
926                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
927                         debug_log "tree: #{serialize_els doc.children, false, true}"
928
929                         # 15. Create an element for the token for which formatting element
930                         # was created, in the HTML namespace, with furthest block as the
931                         # intended parent.
932                         new_element = token_to_element fe.token, NS_HTML, fb
933                         # 16. Take all of the child nodes of furthest block and append them
934                         # to the element created in the last step.
935                         while fb.children.length
936                                 t = fb.children.shift()
937                                 t.parent = new_element
938                                 new_element.children.push t
939                         # 17. Append that new element to furthest block.
940                         new_element.parent = fb
941                         fb.children.push new_element
942                         # 18. Remove formatting element from the list of active formatting
943                         # elements, and insert the new element into the list of active
944                         # formatting elements at the position of the aforementioned
945                         # bookmark.
946                         for t, i in afe
947                                 if t is fe
948                                         afe.splice i, 1
949                                         break
950                         for t, i in afe
951                                 if t is bookmark
952                                         afe[i] = new_element
953                                         break
954                         # 19. Remove formatting element from the stack of open elements,
955                         # and insert the new element into the stack of open elements
956                         # immediately below the position of furthest block in that stack.
957                         for t, i in open_els
958                                 if t is fe
959                                         open_els.splice i, 1
960                                         break
961                         for t, i in open_els
962                                 if t is fb
963                                         open_els.splice i, 0, new_element
964                                         break
965                         # 20. Jump back to the step labeled outer loop.
966                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
967                         debug_log "tree: #{serialize_els doc.children, false, true}"
968                         debug_log "open_els: #{serialize_els open_els, true, true}"
969                         debug_log "afe: #{serialize_els afe, true, true}"
970                 debug_log "AAA DONE"
971
972         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
973         close_p_element = ->
974                 generate_implied_end_tags 'p' # the argument names the exception
975                 parse_error() unless open_els[0].name is 'p'
976                 # shift entries off open_els until a p has been shifted off; the
977                 # loop never runs when only one entry is left (just in case)
978                 until open_els.length <= 1
979                         closed_el = open_els.shift()
980                         return if closed_el.name is 'p'
981         close_p_if_in_button_scope = ->
982                 # nothing happens unless is_in_button_scope reports true for 'p'
983                 close_p_element() if is_in_button_scope 'p'
984
985         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
986         # aka insert_a_character = (t) ->
987         insert_character = (t) ->
988                 [ins_parent, ins_index] = adjusted_insertion_location()
989                 # fixfull check for Document node
990                 node_before = ins_parent.children[ins_index - 1] if ins_index > 0
991                 if ins_index > 0 and node_before.type is TYPE_TEXT
992                         # a text node sits just before the insertion point: extend it
993                         node_before.text += t.text
994                         return
995                 ins_parent.children.splice ins_index, 0, t
996
997         # 8.2.5.1 http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
998         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
999         # returns [target, target_i]: the node to insert into and the index within its children
1000         adjusted_insertion_location = (override_target = null) ->
1001                 # 1. If there was an override target specified, then let target be the
1002                 # override target.
1003                 if override_target?
1004                         target = override_target
1005                 else # Otherwise, let target be the current node.
1006                         target = open_els[0]
1007                 # 2. Determine the adjusted insertion location using the first matching
1008                 # steps from the following list:
1009                 #
1010                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1011                 # thead, or tr element: (Foster parenting happens when content is
1012                 # misnested in tables.)
1013                 if flag_foster_parenting and foster_parenting_targets[target.name]
1014                         loop # once. this is here so we can ``break`` to "abort these substeps"
1015                                 # 1. Let last template be the last template element in the
1016                                 # stack of open elements, if any.
1017                                 last_template = null
1018                                 last_template_i = null
1019                                 for el, i in open_els
1020                                         if el.name is 'template'
1021                                                 last_template = el
1022                                                 last_template_i = i
1023                                                 break
1024                                 # 2. Let last table be the last table element in the stack of
1025                                 # open elements, if any.
1026                                 last_table = null
1027                                 last_table_i = null
1028                                 for el, i in open_els
1029                                         if el.name is 'table'
1030                                                 last_table = el
1031                                                 last_table_i = i
1032                                                 break
1033                                 # 3. If there is a last template and either there is no last
1034                                 # table, or there is one, but last template is lower (more
1035                                 # recently added) than last table in the stack of open
1036                                 # elements, then: let adjusted insertion location be inside
1037                                 # last template's template contents, after its last child (if
1038                                 # any), and abort these substeps.
1039                                 if last_template and (last_table is null or last_template_i < last_table_i)
1040                                         target = last_template # fixfull should be its contents
1041                                         target_i = target.children.length
1042                                         break
1043                                 # 4. If there is no last table, then let adjusted insertion
1044                                 # location be inside the first element in the stack of open
1045                                 # elements (the html element), after its last child (if any),
1046                                 # and abort these substeps. (fragment case)
1047                                 if last_table is null
1048                                         target = open_els[open_els.length - 1] # "first" in the stack is the highest index
1049                                         target_i = target.children.length
1050                                         break # abort here: the substeps below dereference last_table, which is null
1051                                 # 5. If last table has a parent element, then let adjusted
1052                                 # insertion location be inside last table's parent element,
1053                                 # immediately before last table, and abort these substeps.
1054                                 if last_table.parent?
1055                                         for c, i in last_table.parent.children
1056                                                 if c is last_table
1057                                                         target = last_table.parent
1058                                                         target_i = i
1059                                                         break
1060                                         break
1061                                 # 6. Let previous element be the element immediately above last
1062                                 # table in the stack of open elements.
1063                                 #
1064                                 # huh? how could it not have a parent?
1065                                 previous_element = open_els[last_table_i + 1]
1066                                 # 7. Let adjusted insertion location be inside previous
1067                                 # element, after its last child (if any).
1068                                 target = previous_element
1069                                 target_i = target.children.length
1070                                 # Note: These steps are involved in part because it's possible
1071                                 # for elements, the table element in this case in particular,
1072                                 # to have been moved by a script around in the DOM, or indeed
1073                                 # removed from the DOM entirely, after the element was inserted
1074                                 # by the parser.
1075                                 break # don't really loop
1076                 else
1077                         # Otherwise Let adjusted insertion location be inside target, after
1078                         # its last child (if any).
1079                         target_i = target.children.length
1080
1081                 # 3. If the adjusted insertion location is inside a template element,
1082                 # let it instead be inside the template element's template contents,
1083                 # after its last child (if any).
1084                 # fixfull (template)
1085
1086                 # 4. Return the adjusted insertion location.
1087                 return [target, target_i]
1088
1089         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1090         # aka create_an_element_for_token. builds a Node for token t and returns it without inserting it
1091         token_to_element = (t, namespace, intended_parent) ->
1092                 # convert attributes into a hash (each entry of t.attrs_a is read as [name, value])
1093                 attrs = {}
1094                 for a in t.attrs_a
1095                         attrs[a[0]] = a[1] unless Object::hasOwnProperty.call attrs, a[0] # duplicate names: the first one wins
1096                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1097
1098                 # TODO 2. If the newly created element has an xmlns attribute in the
1099                 # XMLNS namespace whose value is not exactly the same as the element's
1100                 # namespace, that is a parse error. Similarly, if the newly created
1101                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1102                 # value is not the XLink Namespace, that is a parse error.
1103
1104                 # fixfull: the spec says stuff about form pointers and ownerDocument (intended_parent is unused so far)
1105
1106                 return el
1107
1108         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1109         insert_foreign_element = (token, namespace) ->
1110                 ail = adjusted_insertion_location()
1111                 ail_el = ail[0]
1112                 ail_i = ail[1]
1113                 el = token_to_element token, namespace, ail_el
1114                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1115                 el.parent = ail_el
1116                 ail_el.children.splice ail_i, 0, el
1117                 open_els.unshift el
1118                 return el
1119         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1120         insert_html_element = insert_foreign_element # (token, namespace) ->
1121
1122         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1123         # position should be [node, index_within_children]
1124         insert_comment = (t, position = null) ->
1125                 position ?= adjusted_insertion_location()
1126                 position[0].children.splice position[1], 0, t
1127
1128         # 8.2.5.2
1129         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1130         parse_generic_raw_text = (t) ->
1131                 insert_html_element t
1132                 tok_state = tok_state_rawtext
1133                 original_ins_mode = ins_mode
1134                 ins_mode = ins_mode_text
1135         parse_generic_rcdata_text = (t) ->
1136                 insert_html_element t
1137                 tok_state = tok_state_rcdata
1138                 original_ins_mode = ins_mode
1139                 ins_mode = ins_mode_text
1140
1141         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1142         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1143         generate_implied_end_tags = (except = null) ->
1144                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1145                         open_els.shift()
1146
1147         # 8.2.5.4 The rules for parsing tokens in HTML content
1148         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1149
1150         # 8.2.5.4.1 The "initial" insertion mode
1151         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1152         ins_mode_initial = (t) ->
1153                 if is_space_tok t
1154                         return
1155                 if t.type is TYPE_COMMENT
1156                         # ?fixfull
1157                         doc.children.push t
1158                         return
1159                 if t.type is TYPE_DOCTYPE
1160                         # FIXME check identifiers, set quirks, etc
1161                         # fixfull
1162                         doc.children.push t
1163                         ins_mode = ins_mode_before_html
1164                         return
1165                 # Anything else
1166                 #fixfull (iframe, quirks)
1167                 ins_mode = ins_mode_before_html
1168                 ins_mode t # reprocess the token
1169                 return
1170
1171         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1172         ins_mode_before_html = (t) ->
1173                 if t.type is TYPE_DOCTYPE
1174                         parse_error()
1175                         return
1176                 if t.type is TYPE_COMMENT
1177                         doc.children.push t
1178                         return
1179                 if is_space_tok t
1180                         return
1181                 if t.type is TYPE_START_TAG and t.name is 'html'
1182                         el = token_to_element t, NS_HTML, doc
1183                         doc.children.push el
1184                         open_els.unshift(el)
1185                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1186                         ins_mode = ins_mode_before_head
1187                         return
1188                 if t.type is TYPE_END_TAG
1189                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1190                                 # fall through to "anything else"
1191                         else
1192                                 parse_error()
1193                                 return
1194                 # Anything else
1195                 html_tok = new_open_tag 'html'
1196                 el = token_to_element html_tok, NS_HTML, doc
1197                 doc.children.push el
1198                 open_els.unshift el
1199                 # ?fixfull browsing context
1200                 ins_mode = ins_mode_before_head
1201                 ins_mode t
1202                 return
1203
1204         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1205         ins_mode_before_head = (t) ->
1206                 if is_space_tok t
1207                         return
1208                 if t.type is TYPE_COMMENT
1209                         insert_comment t
1210                         return
1211                 if t.type is TYPE_DOCTYPE
1212                         parse_error()
1213                         return
1214                 if t.type is TYPE_START_TAG and t.name is 'html'
1215                         ins_mode_in_body t
1216                         return
1217                 if t.type is TYPE_START_TAG and t.name is 'head'
1218                         el = insert_html_element t
1219                         head_element_pointer = el
1220                         ins_mode = ins_mode_in_head
1221                 if t.type is TYPE_END_TAG
1222                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1223                                 # fall through to Anything else below
1224                         else
1225                                 parse_error()
1226                                 return
1227                 # Anything else
1228                 head_tok = new_open_tag 'head'
1229                 el = insert_html_element head_tok
1230                 head_element_pointer = el
1231                 ins_mode = ins_mode_in_head
1232                 ins_mode t # reprocess current token
1233
1234         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1235         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1236                 open_els.shift() # spec says this will be a 'head' node
1237                 ins_mode = ins_mode_after_head
1238                 ins_mode t
1239         ins_mode_in_head = (t) ->
1240                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1241                         insert_character t
1242                         return
1243                 if t.type is TYPE_COMMENT
1244                         insert_comment t
1245                         return
1246                 if t.type is TYPE_DOCTYPE
1247                         parse_error()
1248                         return
1249                 if t.type is TYPE_START_TAG and t.name is 'html'
1250                         ins_mode_in_body t
1251                         return
1252                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1253                         el = insert_html_element t
1254                         open_els.shift()
1255                         t.acknowledge_self_closing()
1256                         return
1257                 if t.type is TYPE_START_TAG and t.name is 'meta'
1258                         el = insert_html_element t
1259                         open_els.shift()
1260                         t.acknowledge_self_closing()
1261                         # fixfull encoding stuff
1262                         return
1263                 if t.type is TYPE_START_TAG and t.name is 'title'
1264                         parse_generic_rcdata_text t
1265                         return
1266                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1267                         parse_generic_raw_text t
1268                         return
1269                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1270                         insert_html_element t
1271                         ins_mode = ins_mode_in_head_noscript
1272                         return
1273                 if t.type is TYPE_START_TAG and t.name is 'script'
1274                         ail = adjusted_insertion_location()
1275                         el = token_to_element t, NS_HTML, ail
1276                         el.flag 'parser-inserted', true
1277                         # fixfull frament case
1278                         ail[0].children.splice ail[1], 0, el
1279                         open_els.unshift el
1280                         tok_state = tok_state_script_data
1281                         original_ins_mode = ins_mode # make sure orig... is defined
1282                         ins_mode = ins_mode_text
1283                         return
1284                 if t.type is TYPE_END_TAG and t.name is 'head'
1285                         open_els.shift() # will be a head element... spec says so
1286                         ins_mode = ins_mode_after_head
1287                         return
1288                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1289                         ins_mode_in_head_else t
1290                         return
1291                 if t.type is TYPE_START_TAG and t.name is 'template'
1292                         insert_html_element t
1293                         afe_push_marker()
1294                         flag_frameset_ok = false
1295                         ins_mode = ins_mode_in_template
1296                         template_ins_modes.unshift ins_mode_in_template
1297                         return
1298                 if t.type is TYPE_END_TAG and t.name is 'template'
1299                         if template_tag_is_open()
1300                                 generate_implied_end_tags
1301                                 if open_els[0].name isnt 'template'
1302                                         parse_error()
1303                                 loop
1304                                         el = open_els.shift()
1305                                         if el.name is 'template'
1306                                                 break
1307                                 clear_afe_to_marker()
1308                                 template_ins_modes.shift()
1309                                 reset_ins_mode()
1310                         else
1311                                 parse_error()
1312                         return
1313                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1314                         parse_error()
1315                         return
1316                 ins_mode_in_head_else t
1317
1318         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1319         ins_mode_in_head_noscript_else = (t) ->
1320                 parse_error()
1321                 open_els.shift()
1322                 ins_mode = ins_mode_in_head
1323                 ins_mode t
1324         ins_mode_in_head_noscript = (t) ->
1325                 if t.type is TYPE_DOCTYPE
1326                         parse_error()
1327                         return
1328                 if t.type is TYPE_START_TAG
1329                         ins_mode_in_body t
1330                         return
1331                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1332                         open_els.shift()
1333                         ins_mode = ins_mode_in_head
1334                         return
1335                 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1336                         ins_mode_in_head t
1337                         return
1338                 if t.type is TYPE_END_TAG and t.name is 'br'
1339                         ins_mode_in_head_noscript_else t
1340                         return
1341                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1342                         parse_error()
1343                         return
1344                 # Anything else
1345                 ins_mode_in_head_noscript_else t
1346                 return
1347
1348
1349
1350         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1351         ins_mode_after_head_else = (t) ->
1352                 body_tok = new_open_tag 'body'
1353                 insert_html_element body_tok
1354                 ins_mode = ins_mode_in_body
1355                 ins_mode t # reprocess token
1356                 return
1357         ins_mode_after_head = (t) ->
1358                 if is_space_tok t
1359                         insert_character t
1360                         return
1361                 if t.type is TYPE_COMMENT
1362                         insert_comment t
1363                         return
1364                 if t.type is TYPE_DOCTYPE
1365                         parse_error()
1366                         return
1367                 if t.type is TYPE_START_TAG and t.name is 'html'
1368                         ins_mode_in_body t
1369                         return
1370                 if t.type is TYPE_START_TAG and t.name is 'body'
1371                         insert_html_element t
1372                         flag_frameset_ok = false
1373                         ins_mode = ins_mode_in_body
1374                         return
1375                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1376                         insert_html_element t
1377                         ins_mode = ins_mode_in_frameset
1378                         return
1379                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1380                         parse_error()
1381                         open_els.unshift head_element_pointer
1382                         ins_mode_in_head t
1383                         for el, i of open_els
1384                                 if el is head_element_pointer
1385                                         open_els.splice i, 1
1386                                         return
1387                         console.log "warning: 23904 couldn't find head element in open_els"
1388                         return
1389                 if t.type is TYPE_END_TAG and t.name is 'template'
1390                         ins_mode_in_head t
1391                         return
1392                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1393                         ins_mode_after_head_else t
1394                         return
1395                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1396                         parse_error()
1397                         return
1398                 # Anything else
1399                 ins_mode_after_head_else t
1400
1401         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1402         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1403                 for el, i in open_els
1404                         if el.namespace is NS_HTML and el.name is name
1405                                 generate_implied_end_tags name # arg is exception
1406                                 parse_error() unless i is 0
1407                                 while i >= 0
1408                                         open_els.shift()
1409                                         i -= 1
1410                                 return
1411                         if special_elements[el.name] is el.namespace
1412                                 parse_error()
1413                                 return
1414                 return
1415         ins_mode_in_body = (t) ->
1416                 if t.type is TYPE_TEXT and t.text is "\u0000"
1417                         parse_error()
1418                         return
1419                 if is_space_tok t
1420                         reconstruct_active_formatting_elements()
1421                         insert_character t
1422                         return
1423                 if t.type is TYPE_TEXT
1424                         reconstruct_active_formatting_elements()
1425                         insert_character t
1426                         flag_frameset_ok = false
1427                         return
1428                 if t.type is TYPE_COMMENT
1429                         insert_comment t
1430                         return
1431                 if t.type is TYPE_DOCTYPE
1432                         parse_error()
1433                         return
1434                 if t.type is TYPE_START_TAG and t.name is 'html'
1435                         parse_error()
1436                         return if template_tag_is_open()
1437                         root_attrs = open_els[open_els.length - 1].attrs
1438                         for a of t.attrs_a
1439                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1440                         return
1441
1442                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1443                         ins_mode_in_head t
1444                         return
1445                 if t.type is TYPE_START_TAG and t.name is 'body'
1446                         parse_error()
1447                         return if open_els.length < 2
1448                         second = open_els[open_els.length - 2]
1449                         return unless second.ns is NS_HTML
1450                         return unless second.name is 'body'
1451                         return if template_tag_is_open()
1452                         frameset_ok_flag = false
1453                         for a of t.attrs_a
1454                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1455                         return
1456                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1457                         parse_error()
1458                         return if open_els.length < 2
1459                         second_i = open_els.length - 2
1460                         second = open_els[second_i]
1461                         return unless second.ns is NS_HTML
1462                         return unless second.name is 'body'
1463                         flag_frameset_ok = false
1464                         if second.parent?
1465                                 for el, i in second.parent.children
1466                                         if el is second
1467                                                 second.parent.children.splice i, 1
1468                                                 break
1469                         open_els.splice second_i, 1
1470                         # pop everything except the "root html element"
1471                         while open_els.length > 1
1472                                 open_els.shift()
1473                         insert_html_element t
1474                         ins_mode = ins_mode_in_frameset
1475                         return
1476                 if t.type is TYPE_EOF
1477                         ok_tags = {
1478                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1479                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1480                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1481                         }
1482                         for el in open_els
1483                                 unless ok_tags[t.name] is el.namespace
1484                                         parse_error()
1485                                         break
1486                         if template_ins_modes.length > 0
1487                                 ins_mode_in_template t
1488                         else
1489                                 stop_parsing()
1490                         return
1491                 if t.type is TYPE_END_TAG and t.name is 'body'
1492                         unless is_in_scope 'body'
1493                                 parse_error()
1494                                 return
1495                         ok_tags = {
1496                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1497                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1498                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1499                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1500                                 html:NS_HTML
1501                         }
1502                         for el in open_els
1503                                 unless ok_tags[t.name] is el.namespace
1504                                         parse_error()
1505                                         break
1506                         ins_mode = ins_mode_after_body
1507                         return
1508                 if t.type is TYPE_END_TAG and t.name is 'html'
1509                         unless is_in_scope 'body'
1510                                 parse_error()
1511                                 return
1512                         ok_tags = {
1513                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1514                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1515                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1516                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1517                                 html:NS_HTML
1518                         }
1519                         for el in open_els
1520                                 unless ok_tags[t.name] is el.namespace
1521                                         parse_error()
1522                                         break
1523                         ins_mode = ins_mode_after_body
1524                         ins_mode t
1525                         return
1526                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1527                         close_p_if_in_button_scope()
1528                         insert_html_element t
1529                         return
1530                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1531                         close_p_if_in_button_scope()
1532                         if h_tags[open_els[0]] is NS_HTML
1533                                 parse_error()
1534                                 open_els.shift()
1535                         insert_html_element t
1536                         return
1537                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1538                         close_p_if_in_button_scope()
1539                         insert_html_element t
1540                         # spec: If the next token is a "LF" (U+000A) character token, then
1541                         # ignore that token and move on to the next one. (Newlines at the
1542                         # start of pre blocks are ignored as an authoring convenience.)
1543                         if txt.charAt(cur) is "\u000a"
1544                                 cur += 1
1545                         flag_frameset_ok = false
1546                         return
1547                 if t.type is TYPE_START_TAG and t.name is 'form'
1548                         unless form_element_pointer is null or template_tag_is_open()
1549                                 parse_error()
1550                                 return
1551                         close_p_if_in_button_scope()
1552                         el = insert_html_element t
1553                         unless template_tag_is_open()
1554                                 form_element_pointer = el
1555                         return
1556                 if t.type is TYPE_START_TAG and t.name is 'li'
1557                         flag_frameset_ok = false
1558                         for node in open_els
1559                                 if node.name is 'li' and node.namespace is NS_HTML
1560                                         generate_implied_end_tags 'li' # arg is exception
1561                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1562                                                 parse_error()
1563                                         loop
1564                                                 el = open_els.shift()
1565                                                 if el.name is 'li' and el.namespace is NS_HTML
1566                                                         break
1567                                         break
1568                                 if el_is_special_not_adp node
1569                                                 break
1570                         close_p_if_in_button_scope()
1571                         insert_html_element t
1572                         return
1573                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1574                         flag_frameset_ok = false
1575                         for node in open_els
1576                                 if node.name is 'dd' and node.namespace is NS_HTML
1577                                         generate_implied_end_tags 'dd' # arg is exception
1578                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1579                                                 parse_error()
1580                                         loop
1581                                                 el = open_els.shift()
1582                                                 if el.name is 'dd' and el.namespace is NS_HTML
1583                                                         break
1584                                         break
1585                                 if node.name is 'dt' and node.namespace is NS_HTML
1586                                         generate_implied_end_tags 'dt' # arg is exception
1587                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1588                                                 parse_error()
1589                                         loop
1590                                                 el = open_els.shift()
1591                                                 if el.name is 'dt' and el.namespace is NS_HTML
1592                                                         break
1593                                         break
1594                                 if el_is_special_not_adp node
1595                                         break
1596                         close_p_if_in_button_scope()
1597                         insert_html_element t
1598                         return
1599                 # FIXME CONTINUE
1600
1601                 if t.type is TYPE_START_TAG and t.name is 'a'
1602                         # If the list of active formatting elements contains an a element
1603                         # between the end of the list and the last marker on the list (or
1604                         # the start of the list if there is no marker on the list), then
1605                         # this is a parse error; run the adoption agency algorithm for the
1606                         # tag name "a", then remove that element from the list of active
1607                         # formatting elements and the stack of open elements if the
1608                         # adoption agency algorithm didn't already remove it (it might not
1609                         # have if the element is not in table scope).
1610                         found = false
1611                         for el in afe
1612                                 if el.type is TYPE_AFE_MARKER
1613                                         break
1614                                 if el.name is 'a'
1615                                         found = el
1616                         if found?
1617                                 parse_error()
1618                                 adoption_agency 'a'
1619                                 for el, i in afe
1620                                         if el is found
1621                                                 afe.splice i, 1
1622                                 for el, i in open_els
1623                                         if el is found
1624                                                 open_els.splice i, 1
1625                         reconstruct_active_formatting_elements()
1626                         el = insert_html_element t
1627                         afe_push el
1628                         return
1629                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1630                         reconstruct_active_formatting_elements()
1631                         el = insert_html_element t
1632                         afe_push el
1633                         return
1634                 if t.type is TYPE_START_TAG and t.name is 'table'
1635                         # fixfull quirksmode thing
1636                         close_p_if_in_button_scope()
1637                         insert_html_element t
1638                         ins_mode = ins_mode_in_table
1639                         return
1640                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1641                         unless is_in_scope t.name, NS_HTML
1642                                 parse_error()
1643                                 return
1644                         generate_implied_end_tags()
1645                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1646                                 parse_error()
1647                         loop
1648                                 el = open_els.shift()
1649                                 if el.name is t.name and el.namespace is NS_HTML
1650                                         return
1651                         return
1652                 if t.type is TYPE_END_TAG and t.name is 'p'
1653                         unless is_in_button_scope 'p'
1654                                 parse_error()
1655                                 insert_html_element new_open_tag 'p'
1656                         close_p_element()
1657                         return
1658                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1659                         adoption_agency t.name
1660                         return
1661                 if t.type is TYPE_START_TAG # any other start tag
1662                         reconstruct_active_formatting_elements()
1663                         insert_html_element t
1664                         return
1665                 if t.type is TYPE_END_TAG # any other end tag
1666                         in_body_any_other_end_tag t.name
1667                 return
1668
ins_mode_in_table_else = (t) ->
        # Fallback used by ins_mode_in_table for tokens it has no rule for:
        # record a parse error, then let ins_mode_in_body process the token
        # while the foster-parenting flag is raised.
        parse_error()
        flag_foster_parenting = yes # FIXME
        ins_mode_in_body t
        # lower the flag again (this assignment is the implicit return value)
        flag_foster_parenting = no
# Lookup table of element names, indexed by the TYPE_TEXT branch of
# ins_mode_in_table to decide whether to switch to ins_mode_in_table_text.
# FIXME do this inline like everywhere else
can_in_table =
        table: true
        tbody: true
        tfoot: true
        thead: true
        tr: true
1681
1682         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
ins_mode_text = (t) ->
        # "Text" insertion mode. Text is inserted as-is; EOF and end tags pop
        # the current node and restore original_ins_mode (EOF additionally
        # reprocesses the token in the restored mode). Any other token type
        # only logs a warning.
        switch t.type
                when TYPE_TEXT
                        insert_character t
                when TYPE_EOF
                        parse_error()
                        open_els[0].flag 'already started', true if open_els[0].name is 'script'
                        open_els.shift()
                        ins_mode = original_ins_mode
                        ins_mode t
                when TYPE_END_TAG
                        # </script> and every other end tag are treated the same here
                        # fixfull the spec seems to assume that I'm going to run the script
                        # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
                        open_els.shift()
                        ins_mode = original_ins_mode
                else
                        console.log 'warning: end of ins_mode_text reached'
        return
1706
1707         # the functions below implement the tokenizer states described here:
1708         # http://www.w3.org/TR/html5/syntax.html#tokenization
1709
1710         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
ins_mode_in_table = (t) ->
        # "In table" insertion mode: dispatches on the token type and, for
        # tags, on the tag name. Tokens with no dedicated rule are handed to
        # ins_mode_in_table_else. The switch is the last expression, so its
        # value remains the implicit return value.
        switch t.type
                when TYPE_TEXT
                        # A text token has no tag name, so the lookup must be keyed on
                        # the current node (open_els[0]), not on the token.
                        if can_in_table[open_els[0].name]
                                # the spec starts the text mode with an empty pending list
                                pending_table_character_tokens = []
                                original_ins_mode = ins_mode
                                ins_mode = ins_mode_in_table_text
                                ins_mode t
                        else
                                ins_mode_in_table_else t
                when TYPE_COMMENT
                        insert_comment t
                when TYPE_DOCTYPE
                        parse_error()
                when TYPE_START_TAG
                        switch t.name
                                when 'caption'
                                        clear_stack_to_table_context()
                                        afe_push_marker()
                                        insert_html_element t
                                        ins_mode = ins_mode_in_caption
                                when 'colgroup'
                                        clear_stack_to_table_context()
                                        insert_html_element t
                                        ins_mode = ins_mode_in_column_group
                                when 'col'
                                        # insert an implied <colgroup>, then reprocess the token
                                        clear_stack_to_table_context()
                                        insert_html_element new_open_tag 'colgroup'
                                        ins_mode = ins_mode_in_column_group
                                        ins_mode t
                                when 'tbody', 'tfoot', 'thead'
                                        clear_stack_to_table_context()
                                        insert_html_element t
                                        ins_mode = ins_mode_in_table_body
                                when 'td', 'th', 'tr'
                                        # insert an implied <tbody>, then reprocess the token
                                        clear_stack_to_table_context()
                                        insert_html_element new_open_tag 'tbody'
                                        ins_mode = ins_mode_in_table_body
                                        ins_mode t
                                when 'table'
                                        # nested <table>: close the open table and reprocess;
                                        # ignored when no table is in table scope
                                        parse_error()
                                        if is_in_table_scope 'table'
                                                loop
                                                        el = open_els.shift()
                                                        if el.name is 'table'
                                                                break
                                                reset_ins_mode()
                                                ins_mode t
                                when 'style', 'script', 'template'
                                        ins_mode_in_head t
                                when 'input'
                                        # Only <input type=hidden> is inserted directly (and popped
                                        # right away); every other input takes the "anything else"
                                        # path. NOTE(review): assumes is_input_hidden_tok returns
                                        # true for type=hidden -- confirm against its definition.
                                        if is_input_hidden_tok t
                                                parse_error()
                                                insert_html_element t
                                                open_els.shift()
                                                t.acknowledge_self_closing()
                                        else
                                                ins_mode_in_table_else t
                                when 'form'
                                        parse_error()
                                        # ignored when a form is already tracked or a template is open
                                        if form_element_pointer?
                                                return
                                        if template_tag_is_open()
                                                return
                                        form_element_pointer = insert_html_element t
                                        open_els.shift()
                                else
                                        ins_mode_in_table_else t
                when TYPE_END_TAG
                        switch t.name
                                when 'table'
                                        if is_in_table_scope 'table'
                                                loop
                                                        el = open_els.shift()
                                                        if el.name is 'table'
                                                                break
                                                reset_ins_mode()
                                        else
                                                # must be a call; a bare reference does nothing
                                                parse_error()
                                when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
                                        parse_error()
                                when 'template'
                                        ins_mode_in_head t
                                else
                                        ins_mode_in_table_else t
                when TYPE_EOF
                        ins_mode_in_body t
                else
                        ins_mode_in_table_else t
1799
1800
1801         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
        # "In table text" insertion mode. Text tokens are buffered in
        # pending_table_character_tokens (a lone U+0000 is a parse error and is
        # dropped). The first non-text token flushes the buffer, restores
        # original_ins_mode and is then reprocessed in that mode.
        if t.type is TYPE_TEXT and t.text is "\u0000"
                # huh? I thought the tokenizer didn't emit these
                parse_error()
                return
        if t.type is TYPE_TEXT
                pending_table_character_tokens.push t
                return
        # Anything else
        all_space = true
        for old in pending_table_character_tokens
                unless is_space_tok old
                        all_space = false
                        break
        if all_space
                # whitespace-only runs are inserted normally
                for old in pending_table_character_tokens
                        insert_character old
        else
                # otherwise each buffered token takes the in-table
                # "anything else" path (previously called an undefined name)
                for old in pending_table_character_tokens
                        ins_mode_in_table_else old
        pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
        ins_mode = original_ins_mode
        ins_mode t
1825
1826         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
ins_mode_in_caption = (t) ->
        # "In caption" insertion mode. </caption> closes the caption; tags
        # that cannot live inside a caption (and </table>) close it and are
        # reprocessed in ins_mode_in_table; a fixed set of stray end tags is a
        # parse error; everything else is delegated to ins_mode_in_body.
        if t.type is TYPE_END_TAG and t.name is 'caption'
                unless is_in_table_scope 'caption'
                        parse_error()
                        # fragment case
                        return
                generate_implied_end_tags()
                parse_error() unless open_els[0].name is 'caption'
                loop
                        break if open_els.shift().name is 'caption'
                clear_afe_to_marker()
                ins_mode = ins_mode_in_table
                return
        if (t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'])) or (t.type is TYPE_END_TAG and t.name is 'table')
                parse_error()
                unless is_in_table_scope 'caption'
                        # fragment case: ignore the token
                        return
                loop
                        break if open_els.shift().name is 'caption'
                clear_afe_to_marker()
                ins_mode = ins_mode_in_table
                ins_mode t
                return
        if t.type is TYPE_END_TAG and (t.name in ['body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'])
                parse_error()
                return
        # Anything else
        ins_mode_in_body t
1860
1861         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
ins_mode_in_column_group = (t) ->
        # "In column group" insertion mode. Tokens with a dedicated rule are
        # handled inside the switch and return; whatever falls out of the
        # switch takes the "anything else" path at the bottom.
        if is_space_tok t
                insert_character t
                return
        switch t.type
                when TYPE_COMMENT
                        insert_comment t
                        return
                when TYPE_DOCTYPE
                        parse_error()
                        return
                when TYPE_EOF
                        ins_mode_in_body t
                        return
                when TYPE_START_TAG
                        switch t.name
                                when 'html'
                                        ins_mode_in_body t
                                        return
                                when 'col'
                                        # inserted and immediately popped again
                                        insert_html_element t
                                        open_els.shift()
                                        t.acknowledge_self_closing()
                                        return
                                when 'template'
                                        ins_mode_in_head t
                                        return
                when TYPE_END_TAG
                        switch t.name
                                when 'colgroup'
                                        if open_els[0].name is 'colgroup'
                                                open_els.shift()
                                                ins_mode = ins_mode_in_table
                                        else
                                                parse_error()
                                        return
                                when 'col'
                                        parse_error()
                                        return
                                when 'template'
                                        ins_mode_in_head t
                                        return
        # Anything else: leave the colgroup (if it is the current node) and
        # reprocess the token in ins_mode_in_table
        unless open_els[0].name is 'colgroup'
                parse_error()
                return
        open_els.shift()
        ins_mode = ins_mode_in_table
        ins_mode t
        return
1904
1905         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
ins_mode_in_table_body = (t) ->
        # "In table body" insertion mode. Opens rows (implying a <tr> for a
        # bare <td>/<th>), closes the current section on its end tag, closes it
        # and reprocesses for tokens that belong to the table level, and
        # defers everything else to ins_mode_in_table.
        if t.type is TYPE_START_TAG
                switch t.name
                        when 'tr'
                                clear_stack_to_table_body_context()
                                insert_html_element t
                                ins_mode = ins_mode_in_row
                                return
                        when 'th', 'td'
                                parse_error()
                                clear_stack_to_table_body_context()
                                insert_html_element new_open_tag 'tr'
                                ins_mode = ins_mode_in_row
                                ins_mode t
                                return
        if t.type is TYPE_END_TAG and (t.name in ['tbody', 'tfoot', 'thead'])
                if is_in_table_scope t.name # fixfull check namespace
                        clear_stack_to_table_body_context()
                        open_els.shift()
                        ins_mode = ins_mode_in_table
                else
                        parse_error()
                return
        if (t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'])) or (t.type is TYPE_END_TAG and t.name is 'table')
                # walk the stack from the current node until a section element
                # or a table-scope boundary is found
                has = false
                for el in open_els
                        if el.name in ['tbody', 'tfoot', 'thead']
                                has = true
                                break
                        break if table_scopers[el.name]
                unless has
                        parse_error()
                        return
                clear_stack_to_table_body_context()
                open_els.shift()
                ins_mode = ins_mode_in_table
                ins_mode t
                return
        if t.type is TYPE_END_TAG and (t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'])
                parse_error()
                return
        # Anything else
        ins_mode_in_table t
1948
1949         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
ins_mode_in_row = (t) ->
        # "In row" insertion mode. Opens cells, closes the row on </tr>, closes
        # the row and reprocesses for tokens that belong further up the table,
        # and defers everything else to ins_mode_in_table.
        if t.type is TYPE_START_TAG and (t.name in ['th', 'td'])
                clear_stack_to_table_row_context()
                insert_html_element t
                ins_mode = ins_mode_in_cell
                afe_push_marker()
                return
        if t.type is TYPE_END_TAG and t.name is 'tr'
                unless is_in_table_scope 'tr'
                        parse_error()
                        return
                clear_stack_to_table_row_context()
                open_els.shift()
                ins_mode = ins_mode_in_table_body
                return
        if (t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'])) or (t.type is TYPE_END_TAG and t.name is 'table')
                unless is_in_table_scope 'tr'
                        parse_error()
                        return
                clear_stack_to_table_row_context()
                open_els.shift()
                ins_mode = ins_mode_in_table_body
                ins_mode t
                return
        if t.type is TYPE_END_TAG and (t.name in ['tbody', 'tfoot', 'thead'])
                unless is_in_table_scope t.name # fixfull namespace
                        parse_error()
                        return
                # no row in table scope: ignore the token without a parse error
                unless is_in_table_scope 'tr'
                        return
                clear_stack_to_table_row_context()
                open_els.shift()
                ins_mode = ins_mode_in_table_body
                ins_mode t
                return
        if t.type is TYPE_END_TAG and (t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'])
                parse_error()
                return
        # Anything else
        ins_mode_in_table t
1989
1990         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
close_the_cell = ->
        # Closes the open table cell: generates implied end tags, reports a
        # parse error if the current node is not then a td/th, pops the stack
        # up to and including the nearest td/th, clears the active formatting
        # elements back to the last marker and switches to ins_mode_in_row.
        generate_implied_end_tags()
        # compare the node's name in both halves (comparing the node object
        # itself to 'th' can never match)
        unless open_els[0].name is 'td' or open_els[0].name is 'th'
                parse_error()
        loop
                el = open_els.shift()
                if el.name is 'td' or el.name is 'th'
                        break
        clear_afe_to_marker()
        ins_mode = ins_mode_in_row
2001
2002         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
ins_mode_in_cell = (t) ->
        # "In cell" insertion mode. </td> and </th> close the cell; table
        # structure tags close the cell via close_the_cell and are then
        # reprocessed; a fixed set of stray end tags is a parse error;
        # everything else is delegated to ins_mode_in_body.
        if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
                if is_in_table_scope t.name
                        generate_implied_end_tags()
                        if open_els[0].name isnt t.name
                                # must be a call; a bare reference does nothing
                                parse_error()
                        loop
                                el = open_els.shift()
                                if el.name is t.name
                                        break
                        clear_afe_to_marker()
                        ins_mode = ins_mode_in_row
                else
                        parse_error()
                return
        if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
                # look for a td/th on the stack, stopping at a table-scope boundary
                has = false
                for el in open_els
                        if el.name is 'td' or el.name is 'th'
                                has = true
                                break
                        if table_scopers[el.name]
                                break
                if !has
                        parse_error()
                        return
                close_the_cell()
                ins_mode t
                return
        if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
                parse_error()
                return
        if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
                if is_in_table_scope t.name # fixfull namespace
                        close_the_cell()
                        ins_mode t
                else
                        parse_error()
                return
        # Anything Else
        ins_mode_in_body t
2044
2045         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
ins_mode_in_select = (t) ->
        # "In select" insertion mode. Handles text, comments, option/optgroup
        # nesting and the tags that close the select; any token without a rule
        # is a parse error and is ignored.
        if t.type is TYPE_TEXT and t.text is "\u0000"
                parse_error()
                return
        if t.type is TYPE_TEXT
                insert_character t
                return
        if t.type is TYPE_COMMENT
                insert_comment t
                return
        if t.type is TYPE_DOCTYPE
                parse_error()
                return
        if t.type is TYPE_START_TAG and t.name is 'html'
                ins_mode_in_body t
                return
        if t.type is TYPE_START_TAG and t.name is 'option'
                # a new option implicitly closes the previous one
                if open_els[0].name is 'option'
                        open_els.shift()
                insert_html_element t
                return
        if t.type is TYPE_START_TAG and t.name is 'optgroup'
                # a new optgroup implicitly closes an open option and optgroup
                if open_els[0].name is 'option'
                        open_els.shift()
                if open_els[0].name is 'optgroup'
                        open_els.shift()
                insert_html_element t
                return
        if t.type is TYPE_END_TAG and t.name is 'optgroup'
                if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
                        open_els.shift()
                if open_els[0].name is 'optgroup'
                        open_els.shift()
                else
                        parse_error()
                return
        if t.type is TYPE_END_TAG and t.name is 'option'
                if open_els[0].name is 'option'
                        open_els.shift()
                else
                        parse_error()
                return
        if t.type is TYPE_END_TAG and t.name is 'select'
                if is_in_select_scope 'select'
                        loop
                                el = open_els.shift()
                                if el.name is 'select'
                                        break
                        reset_ins_mode()
                else
                        parse_error()
                return
        if t.type is TYPE_START_TAG and t.name is 'select'
                parse_error()
                loop
                        el = open_els.shift()
                        if el.name is 'select'
                                break
                reset_ins_mode()
                # spec says that this is the same as </select> but it doesn't say
                # to check scope first
                return
        if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
                parse_error()
                # Ignore the token only when there is NO select in select scope;
                # otherwise close the select and reprocess the token.
                unless is_in_select_scope 'select'
                        return
                loop
                        el = open_els.shift()
                        if el.name is 'select'
                                break
                reset_ins_mode()
                ins_mode t
                return
        # <script>, <template> and </template> are all processed as "in head"
        if (t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')) or (t.type is TYPE_END_TAG and t.name is 'template')
                ins_mode_in_head t
                return
        if t.type is TYPE_EOF
                ins_mode_in_body t
                return
        # Anything else
        parse_error()
        return
2128
2129         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
ins_mode_in_select_in_table = (t) ->
        # "In select in table" insertion mode. Table-structure start and end
        # tags are a parse error that closes the select and reprocesses the
        # token (an end tag only when its element is in table scope);
        # everything else is delegated to ins_mode_in_select.
        if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and (t.name in ['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'])
                parse_error()
                if t.type is TYPE_END_TAG
                        unless is_in_table_scope t.name, NS_HTML
                                return
                loop
                        break if open_els.shift().name is 'select'
                reset_ins_mode()
                ins_mode t
                return
        # Anything else
        ins_mode_in_select t
        return
2154
2155         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
ins_mode_in_template = (t) ->
        # "In template" insertion mode. Most start tags pick the insertion
        # mode that fits them, replace the head entry of template_ins_modes
        # with it, switch to it and reprocess the token there.
        switch t.type
                when TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE
                        ins_mode_in_body t
                        return
                when TYPE_START_TAG
                        switch t.name
                                when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
                                        ins_mode_in_head t
                                        return
                                when 'caption', 'colgroup', 'tbody', 'tfoot', 'thead'
                                        next_mode = ins_mode_in_table
                                when 'col'
                                        next_mode = ins_mode_in_column_group
                                when 'tr'
                                        next_mode = ins_mode_in_table_body
                                when 'td', 'th'
                                        next_mode = ins_mode_in_row
                                else
                                        next_mode = ins_mode_in_body
                        template_ins_modes.shift()
                        template_ins_modes.unshift next_mode
                        ins_mode = next_mode
                        ins_mode t
                        return
                when TYPE_END_TAG
                        if t.name is 'template'
                                ins_mode_in_head t
                        else
                                parse_error()
                        return
                when TYPE_EOF
                        unless template_tag_is_open()
                                stop_parsing()
                                return
                        parse_error()
                        loop
                                break if open_els.shift().name is 'template' # fixfull check namespace
                        clear_afe_to_marker()
                        template_ins_modes.shift()
                        reset_ins_mode()
                        ins_mode t
2209
2210         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2211         ins_mode_after_body = (t) ->
2212                 if is_space_tok t
2213                         ins_mode_in_body t
2214                         return
2215                 if t.type is TYPE_COMMENT
2216                         insert_comment t, [open_els[0], open_els[0].children.length]
2217                         return
2218                 if t.type is TYPE_DOCTYPE
2219                         parse_error()
2220                         return
2221                 if t.type is TYPE_START_TAG and t.name is 'html'
2222                         ins_mode_in_body t
2223                         return
2224                 if t.type is TYPE_END_TAG and t.name is 'html'
2225                         # fixfull fragment case
2226                         ins_mode = ins_mode_after_after_body
2227                         return
2228                 if t.type is TYPE_EOF
2229                         stop_parsing()
2230                         return
2231                 # Anything ELse
2232                 parse_error()
2233                 ins_mode = ins_mode_in_body
2234                 ins_mode t
2235
2236         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2237         ins_mode_in_frameset = (t) ->
2238                 if is_space_tok t
2239                         insert_character t
2240                         return
2241                 if t.type is TYPE_COMMENT
2242                         insert_comment t
2243                         return
2244                 if t.type is TYPE_DOCTYPE
2245                         parse_error()
2246                         return
2247                 if t.type is TYPE_START_TAG and t.name is 'html'
2248                         ins_mode_in_body t
2249                         return
2250                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2251                         insert_html_element t
2252                         return
2253                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2254                         # TODO ?correct for: "if the current node is the root html element"
2255                         if open_els.length is 1
2256                                 parse_error()
2257                                 return # fragment case
2258                         open_els.shift()
2259                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2260                                 ins_mode = ins_mode_after_frameset
2261                         return
2262                 if t.type is TYPE_START_TAG and t.name is 'frame'
2263                         insert_html_element t
2264                         open_els.shift()
2265                         t.acknowledge_self_closing()
2266                         return
2267                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2268                         ins_mode_in_head t
2269                         return
2270                 if t.type is TYPE_EOF
2271                         # TODO ?correct for: "if the current node is not the root html element"
2272                         if open_els.length isnt 1
2273                                 parse_error()
2274                         stop_parsing()
2275                         return
2276                 # Anything else
2277                 parse_error()
2278                 return
2279
2280         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2281         ins_mode_after_frameset = (t) ->
2282                 if is_space_tok t
2283                         insert_character t
2284                         return
2285                 if t.type is TYPE_COMMENT
2286                         insert_comment t
2287                         return
2288                 if t.type is TYPE_DOCTYPE
2289                         parse_error()
2290                         return
2291                 if t.type is TYPE_START_TAG and t.name is 'html'
2292                         ins_mode_in_body t
2293                         return
2294                 if t.type is TYPE_END_TAG and t.name is 'html'
2295                         insert_mode = ins_mode_after_after_frameset
2296                         return
2297                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2298                         ins_mode_in_head t
2299                         return
2300                 if t.type is TYPE_EOF
2301                         stop_parsing()
2302                         return
2303                 # Anything else
2304                 parse_error()
2305                 return
2306
2307         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2308         ins_mode_after_after_body = (t) ->
2309                 if t.type is TYPE_COMMENT
2310                         insert_comment t, [doc, doc.children.length]
2311                         return
2312                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2313                         ins_mode_in_body t
2314                         return
2315                 if t.type is TYPE_EOF
2316                         stop_parsing()
2317                         return
2318                 # Anything else
2319                 parse_error()
2320                 ins_mode = ins_mode_in_body
2321                 return
2322
2323         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2324         ins_mode_after_after_frameset = (t) ->
2325                 if t.type is TYPE_COMMENT
2326                         insert_comment t, [doc, doc.children.length]
2327                         return
2328                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2329                         ins_mode_in_body t
2330                         return
2331                 if t.type is TYPE_EOF
2332                         stop_parsing()
2333                         return
2334                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2335                         ins_mode_in_head t
2336                         return
2337                 # Anything else
2338                 parse_error()
2339                 return
2340
2341
2342
2343
2344
2345         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2346         tok_state_data = ->
2347                 switch c = txt.charAt(cur++)
2348                         when '&'
2349                                 return new_text_node parse_character_reference()
2350                         when '<'
2351                                 tok_state = tok_state_tag_open
2352                         when "\u0000"
2353                                 parse_error()
2354                                 return new_text_node c
2355                         when '' # EOF
2356                                 return new_eof_token()
2357                         else
2358                                 return new_text_node c
2359                 return null
2360
2361         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2362         # not needed: tok_state_character_reference_in_data = ->
2363         # just call parse_character_reference()
2364
2365         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2366         tok_state_rcdata = ->
2367                 switch c = txt.charAt(cur++)
2368                         when '&'
2369                                 return new_text_node parse_character_reference()
2370                         when '<'
2371                                 tok_state = tok_state_rcdata_less_than_sign
2372                         when "\u0000"
2373                                 parse_error()
2374                                 return new_character_token "\ufffd"
2375                         when '' # EOF
2376                                 return new_eof_token()
2377                         else
2378                                 return new_character_token c
2379                 return null
2380
2381         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2382         # not needed: tok_state_character_reference_in_rcdata = ->
2383         # just call parse_character_reference()
2384
2385         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2386         tok_state_rawtext = ->
2387                 switch c = txt.charAt(cur++)
2388                         when '<'
2389                                 tok_state = tok_state_rawtext_less_than_sign
2390                         when "\u0000"
2391                                 parse_error()
2392                                 return new_character_token "\ufffd"
2393                         when '' # EOF
2394                                 return new_eof_token()
2395                         else
2396                                 return new_character_token c
2397                 return null
2398
2399         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2400         tok_state_script_data = ->
2401                 switch c = txt.charAt(cur++)
2402                         when '<'
2403                                 tok_state = tok_state_script_data_less_than_sign
2404                         when "\u0000"
2405                                 parse_error()
2406                                 return new_character_token "\ufffd"
2407                         when '' # EOF
2408                                 return new_eof_token()
2409                         else
2410                                 return new_character_token c
2411                 return null
2412
2413         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2414         tok_state_plaintext = ->
2415                 switch c = txt.charAt(cur++)
2416                         when "\u0000"
2417                                 parse_error()
2418                                 return new_character_token "\ufffd"
2419                         when '' # EOF
2420                                 return new_eof_token()
2421                         else
2422                                 return new_character_token c
2423                 return null
2424
2425
2426         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2427         tok_state_tag_open = ->
2428                 switch c = txt.charAt(cur++)
2429                         when '!'
2430                                 tok_state = tok_state_markup_declaration_open
2431                         when '/'
2432                                 tok_state = tok_state_end_tag_open
2433                         when '?'
2434                                 parse_error()
2435                                 tok_cur_tag = new_comment_token '?'
2436                                 tok_state = tok_state_bogus_comment
2437                         else
2438                                 if is_lc_alpha(c)
2439                                         tok_cur_tag = new_open_tag c
2440                                         tok_state = tok_state_tag_name
2441                                 else if is_uc_alpha(c)
2442                                         tok_cur_tag = new_open_tag c.toLowerCase()
2443                                         tok_state = tok_state_tag_name
2444                                 else
2445                                         parse_error()
2446                                         tok_state = tok_state_data
2447                                         cur -= 1 # we didn't parse/handle the char after <
2448                                         return new_text_node '<'
2449                 return null
2450
2451         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2452         tok_state_end_tag_open = ->
2453                 switch c = txt.charAt(cur++)
2454                         when '>'
2455                                 parse_error()
2456                                 tok_state = tok_state_data
2457                         when '' # EOF
2458                                 parse_error()
2459                                 tok_state = tok_state_data
2460                                 return new_text_node '</'
2461                         else
2462                                 if is_uc_alpha(c)
2463                                         tok_cur_tag = new_end_tag c.toLowerCase()
2464                                         tok_state = tok_state_tag_name
2465                                 else if is_lc_alpha(c)
2466                                         tok_cur_tag = new_end_tag c
2467                                         tok_state = tok_state_tag_name
2468                                 else
2469                                         parse_error()
2470                                         tok_cur_tag = new_comment_token '/'
2471                                         tok_state = tok_state_bogus_comment
2472                 return null
2473
2474         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2475         tok_state_tag_name = ->
2476                 switch c = txt.charAt(cur++)
2477                         when "\t", "\n", "\u000c", ' '
2478                                 tok_state = tok_state_before_attribute_name
2479                         when '/'
2480                                 tok_state = tok_state_self_closing_start_tag
2481                         when '>'
2482                                 tok_state = tok_state_data
2483                                 tmp = tok_cur_tag
2484                                 tok_cur_tag = null
2485                                 return tmp
2486                         when "\u0000"
2487                                 parse_error()
2488                                 tok_cur_tag.name += "\ufffd"
2489                         when '' # EOF
2490                                 parse_error()
2491                                 tok_state = tok_state_data
2492                         else
2493                                 if is_uc_alpha(c)
2494                                         tok_cur_tag.name += c.toLowerCase()
2495                                 else
2496                                         tok_cur_tag.name += c
2497                 return null
2498
2499         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2500         tok_state_rcdata_less_than_sign = ->
2501                 c = txt.charAt(cur++)
2502                 if c is '/'
2503                         temporary_buffer = ''
2504                         tok_state = tok_state_rcdata_end_tag_open
2505                         return null
2506                 # Anything else
2507                 tok_state = tok_state_rcdata
2508                 cur -= 1 # reconsume the input character
2509                 return new_character_token '<'
2510
2511         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2512         tok_state_rcdata_end_tag_open = ->
2513                 c = txt.charAt(cur++)
2514                 if is_uc_alpha(c)
2515                         tok_cur_tag = new_end_tag c.toLowerCase()
2516                         temporary_buffer += c
2517                         tok_state = tok_state_rcdata_end_tag_name
2518                         return null
2519                 if is_lc_alpha(c)
2520                         tok_cur_tag = new_end_tag c
2521                         temporary_buffer += c
2522                         tok_state = tok_state_rcdata_end_tag_name
2523                         return null
2524                 # Anything else
2525                 tok_state = tok_state_rcdata
2526                 cur -= 1 # reconsume the input character
2527                 return new_character_token "</" # fixfull separate these
2528
2529         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2530         is_appropriate_end_tag = (t) ->
2531                 # spec says to check against "the tag name of the last start tag to
2532                 # have been emitted from this tokenizer", but this is only called from
2533                 # the various "raw" states, which I'm pretty sure all push the start
2534                 # token onto open_els. TODO: verify this after the script data states
2535                 # are implemented
2536                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2537                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2538
2539         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2540         tok_state_rcdata_end_tag_name = ->
2541                 c = txt.charAt(cur++)
2542                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2543                         if is_appropriate_end_tag tok_cur_tag
2544                                 tok_state = tok_state_before_attribute_name
2545                                 return
2546                         # else fall through to "Anything else"
2547                 if c is '/'
2548                         if is_appropriate_end_tag tok_cur_tag
2549                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2550                                 return
2551                         # else fall through to "Anything else"
2552                 if c is '>'
2553                         if is_appropriate_end_tag tok_cur_tag
2554                                 tok_state = tok_state_data
2555                                 return tok_cur_tag
2556                         # else fall through to "Anything else"
2557                 if is_uc_alpha(c)
2558                         tok_cur_tag.name += c.toLowerCase()
2559                         temporary_buffer += c
2560                         return null
2561                 if is_lc_alpha(c)
2562                         tok_cur_tag.name += c
2563                         temporary_buffer += c
2564                         return null
2565                 # Anything else
2566                 tok_state = tok_state_rcdata
2567                 cur -= 1 # reconsume the input character
2568                 return new_character_token '</' + temporary_buffer # fixfull separate these
2569
2570         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2571         tok_state_rawtext_less_than_sign = ->
2572                 c = txt.charAt(cur++)
2573                 if c is '/'
2574                         temporary_buffer = ''
2575                         tok_state = tok_state_rawtext_end_tag_open
2576                         return null
2577                 # Anything else
2578                 tok_state = tok_state_rawtext
2579                 cur -= 1 # reconsume the input character
2580                 return new_character_token '<'
2581
2582         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2583         tok_state_rawtext_end_tag_open = ->
2584                 c = txt.charAt(cur++)
2585                 if is_uc_alpha(c)
2586                         tok_cur_tag = new_end_tag c.toLowerCase()
2587                         temporary_buffer += c
2588                         tok_state = tok_state_rawtext_end_tag_name
2589                         return null
2590                 if is_lc_alpha(c)
2591                         tok_cur_tag = new_end_tag c
2592                         temporary_buffer += c
2593                         tok_state = tok_state_rawtext_end_tag_name
2594                         return null
2595                 # Anything else
2596                 tok_state = tok_state_rawtext
2597                 cur -= 1 # reconsume the input character
2598                 return new_character_token "</" # fixfull separate these
2599
2600         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2601         tok_state_rawtext_end_tag_name = ->
2602                 c = txt.charAt(cur++)
2603                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2604                         if is_appropriate_end_tag tok_cur_tag
2605                                 tok_state = tok_state_before_attribute_name
2606                                 return
2607                         # else fall through to "Anything else"
2608                 if c is '/'
2609                         if is_appropriate_end_tag tok_cur_tag
2610                                 tok_state = tok_state_self_closing_start_tag
2611                                 return
2612                         # else fall through to "Anything else"
2613                 if c is '>'
2614                         if is_appropriate_end_tag tok_cur_tag
2615                                 tok_state = tok_state_data
2616                                 return tok_cur_tag
2617                         # else fall through to "Anything else"
2618                 if is_uc_alpha(c)
2619                         tok_cur_tag.name += c.toLowerCase()
2620                         temporary_buffer += c
2621                         return null
2622                 if is_lc_alpha(c)
2623                         tok_cur_tag.name += c
2624                         temporary_buffer += c
2625                         return null
2626                 # Anything else
2627                 tok_state = tok_state_rawtext
2628                 cur -= 1 # reconsume the input character
2629                 return new_character_token '</' + temporary_buffer # fixfull separate these
2630
2631         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2632         tok_state_script_data_less_than_sign = ->
2633                 c = txt.charAt(cur++)
2634                 if c is '/'
2635                         temporary_buffer = ''
2636                         tok_state = tok_state_script_data_end_tag_open
2637                         return
2638                 if c is '!'
2639                         tok_state = tok_state_script_data_escape_start
2640                         return new_character_token '<!' # fixfull split
2641                 # Anything else
2642                 tok_state = tok_state_script_data
2643                 cur -= 1 # Reconsume
2644                 return new_character_token '<'
2645
2646         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2647         tok_state_script_data_end_tag_open = ->
2648                 c = txt.charAt(cur++)
2649                 if is_uc_alpha(c)
2650                         tok_cur_tag = new_end_tag c.toLowerCase()
2651                         temporary_buffer += c
2652                         tok_state = tok_state_script_data_end_tag_name
2653                         return
2654                 if is_lc_alpha(c)
2655                         tok_cur_tag = new_end_tag c
2656                         temporary_buffer += c
2657                         tok_state = tok_state_script_data_end_tag_name
2658                         return
2659                 # Anything else
2660                 tok_state = tok_state_script_data
2661                 cur -= 1 # Reconsume
2662                 return new_character_token '</'
2663
2664         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2665         tok_state_script_data_end_tag_name = ->
2666                 c = txt.charAt(cur++)
2667                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2668                         if is_appropriate_end_tag tok_cur_tag
2669                                 tok_state = tok_state_before_attribute_name
2670                                 return
2671                         # fall through
2672                 if c is '/'
2673                         if is_appropriate_end_tag tok_cur_tag
2674                                 tok_state = tok_state_self_closing_start_tag
2675                                 return
2676                         # fall through
2677                 if is_uc_alpha(c)
2678                         tok_cur_tag.name += c.toLowerCase()
2679                         temporary_buffer += c
2680                         return
2681                 if is_lc_alpha(c)
2682                         tok_cur_tag.name += c
2683                         temporary_buffer += c
2684                         return
2685                 # Anything else
2686                 tok_state = tok_state_script_data
2687                 cur -= 1 # Reconsume
2688                 return new_character_token "</#{temporary_buffer}" # fixfull split
2689
2690         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
2691         tok_state_script_data_escape_start = ->
2692                 c = txt.charAt(cur++)
2693                 if c is '-'
2694                         tok_state = tok_state_script_data_escape_start_dash
2695                         return new_character_token '-'
2696                 # Anything else
2697                 tok_state = tok_state_script_data
2698                 cur -= 1 # Reconsume
2699                 return
2700
2701         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
2702         tok_state_script_data_escape_start_dash = ->
2703                 c = txt.charAt(cur++)
2704                 if c is '-'
2705                         tok_state = tok_state_script_data_escaped_dash_dash
2706                         return new_character_token '-'
2707                 # Anything else
2708                 tok_state = tok_state_script_data
2709                 cur -= 1 # Reconsume
2710                 return
2711
2712         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
2713         tok_state_script_data_escaped = ->
2714                 c = txt.charAt(cur++)
2715                 if c is '-'
2716                         tok_state = tok_state_script_data_escaped_dash
2717                         return new_character_token '-'
2718                 if c is '<'
2719                         tok_state = tok_state_script_data_escaped_less_than_sign
2720                         return
2721                 if c is "\u0000"
2722                         parse_error()
2723                         return new_character_token "\ufffd"
2724                 if c is '' # EOF
2725                         tok_state = tok_state_data
2726                         parse_error()
2727                         cur -= 1 # Reconsume
2728                         return
2729                 # Anything else
2730                 return new_character_token c
2731
2732         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
2733         tok_state_script_data_escaped_dash = ->
2734                 c = txt.charAt(cur++)
2735                 if c is '-'
2736                         tok_state = tok_state_script_data_escaped_dash_dash
2737                         return new_character_token '-'
2738                 if c is '<'
2739                         tok_state = tok_state_script_data_escaped_less_than_sign
2740                         return
2741                 if c is "\u0000"
2742                         parse_error()
2743                         tok_state = tok_state_script_data_escaped
2744                         return new_character_token "\ufffd"
2745                 if c is '' # EOF
2746                         tok_state = tok_state_data
2747                         parse_error()
2748                         cur -= 1 # Reconsume
2749                         return
2750                 # Anything else
2751                 tok_state = tok_state_script_data_escaped
2752                 return new_character_token c
2753
2754         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
2755         tok_state_script_data_escaped_dash_dash = ->
2756                 c = txt.charAt(cur++)
2757                 if c is '-'
2758                         return new_character_token '-'
2759                 if c is '<'
2760                         tok_state = tok_state_script_data_escaped_less_than_sign
2761                         return
2762                 if c is '>'
2763                         tok_state = tok_state_script_data
2764                         return new_character_token '>'
2765                 if c is "\u0000"
2766                         parse_error()
2767                         tok_state = tok_state_script_data_escaped
2768                         return new_character_token "\ufffd"
2769                 if c is '' # EOF
2770                         parse_error()
2771                         tok_state = tok_state_data
2772                         cur -= 1 # Reconsume
2773                         return
2774                 # Anything else
2775                 tok_state = tok_state_script_data_escaped
2776                 return new_character_token c
2777
2778         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
2779         tok_state_script_data_escaped_less_than_sign = ->
2780                 c = txt.charAt(cur++)
2781                 if c is '/'
2782                         temporary_buffer = ''
2783                         tok_state = tok_state_script_data_escaped_end_tag_open
2784                         return
2785                 if is_uc_alpha(c)
2786                         temporary_buffer = c.toLowerCase() # yes, really
2787                         tok_state = tok_state_script_data_double_escape_start
2788                         return new_character_token "<#{c}" # fixfull split
2789                 if is_lc_alpha(c)
2790                         temporary_buffer = c
2791                         tok_state = tok_state_script_data_double_escape_start
2792                         return new_character_token "<#{c}" # fixfull split
2793                 # Anything else
2794                 tok_state = tok_state_script_data_escaped
2795                 cur -= 1 # Reconsume
2796                 return new_character_token c
2797
2798         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
2799         tok_state_script_data_escaped_end_tag_open = ->
                     # Script data escaped end tag open state: a letter starts a new end
                     # tag token in tok_cur_tag (name lower-cased) and is recorded in
                     # temporary_buffer as typed. Anything else is stepped back over
                     # and '</' is returned as a character token.
                     c = txt.charAt(cur++)
                     if is_uc_alpha(c)
                             tok_cur_tag = new_end_tag c.toLowerCase()
                     else if is_lc_alpha(c)
                             tok_cur_tag = new_end_tag c
                     else
                             # Anything else
                             tok_state = tok_state_script_data_escaped
                             cur -= 1 # Reconsume
                             return new_character_token '</' # fixfull split
                     # shared tail of both letter branches
                     temporary_buffer += c
                     tok_state = tok_state_script_data_escaped_end_tag_name
                     return
2815
2816         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
2817         tok_state_script_data_escaped_end_tag_name = ->
                     # Script data escaped end tag name state: accumulates the name of
                     # an end tag seen inside escaped script data.
                     #
                     # Whitespace, '/' and '>' only terminate the tag when
                     # is_appropriate_end_tag accepts tok_cur_tag; otherwise they fall
                     # through to "anything else", which returns the buffered text
                     # ("</" + temporary_buffer) as a character token.
                     #
                     # Returns the tag token on '>', a character token on fallback,
                     # otherwise nothing.
                     c = txt.charAt(cur++)
                     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
                             if is_appropriate_end_tag tok_cur_tag
                                     tok_state = tok_state_before_attribute_name
                                     return
                             # fall through
                     if c is '/'
                             if is_appropriate_end_tag tok_cur_tag
                                     tok_state = tok_state_self_closing_start_tag
                                     return
                             # fall through
                     if c is '>'
                             # Without this branch an end tag could never be completed
                             # from this state.
                             if is_appropriate_end_tag tok_cur_tag
                                     tok_state = tok_state_data
                                     tmp = tok_cur_tag
                                     tok_cur_tag = null
                                     return tmp
                             # fall through
                     if is_uc_alpha(c)
                             tok_cur_tag.name += c.toLowerCase()
                             # The buffer keeps the character as typed: it is what gets
                             # re-emitted as text if this turns out not to be an end tag.
                             temporary_buffer += c
                             return
                     if is_lc_alpha(c)
                             tok_cur_tag.name += c
                             temporary_buffer += c
                             return
                     # Anything else
                     tok_state = tok_state_script_data_escaped
                     cur -= 1 # Reconsume
                     return new_character_token "</#{temporary_buffer}" # fixfull split
2841
2842         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
2843         tok_state_script_data_double_escape_start = ->
                     # Script data double escape start state: letters are collected
                     # (lower-cased) in temporary_buffer and returned as character
                     # tokens. A terminator (whitespace, '/' or '>') picks the next
                     # state depending on whether the buffer spells 'script'.
                     switch c = txt.charAt(cur++)
                             when "\t", "\u000a", "\u000c", ' ', '/', '>'
                                     tok_state = if temporary_buffer is 'script' then tok_state_script_data_double_escaped else tok_state_script_data_escaped
                                     return new_character_token c
                     if is_uc_alpha(c)
                             temporary_buffer += c.toLowerCase() # yes, really lowercase
                     else if is_lc_alpha(c)
                             temporary_buffer += c
                     else
                             # Anything else
                             tok_state = tok_state_script_data_escaped
                             cur -= 1 # Reconsume
                             return
                     # both letter branches return the character as typed
                     return new_character_token c
2861
2862         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
2863         tok_state_script_data_double_escaped = ->
                     # Script data double escaped state: every character is returned as
                     # a character token (NUL as U+FFFD); '-' and '<' also move to
                     # their follow-up states. EOF is stepped back over, switches to
                     # the data state and returns nothing.
                     switch c = txt.charAt(cur++)
                             when '-'
                                     tok_state = tok_state_script_data_double_escaped_dash
                             when '<'
                                     tok_state = tok_state_script_data_double_escaped_less_than_sign
                             when "\u0000"
                                     parse_error()
                                     return new_character_token "\ufffd"
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     cur -= 1 # Reconsume
                                     return
                     # '-', '<' and anything else: the character itself is the token
                     return new_character_token c
2881
2882         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
2883         tok_state_script_data_double_escaped_dash = ->
                     # Script data double escaped dash state (one '-' already seen).
                     # Returns a character token for every input except EOF, which is
                     # stepped back over and switches to the data state.
                     switch c = txt.charAt(cur++)
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     cur -= 1 # Reconsume
                                     return
                             when "\u0000"
                                     parse_error()
                                     tok_state = tok_state_script_data_double_escaped
                                     return new_character_token "\ufffd"
                             when '-'
                                     tok_state = tok_state_script_data_double_escaped_dash_dash
                             when '<'
                                     tok_state = tok_state_script_data_double_escaped_less_than_sign
                             else
                                     # Anything else
                                     tok_state = tok_state_script_data_double_escaped
                     return new_character_token c
2903
2904         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
2905         tok_state_script_data_double_escaped_dash_dash = ->
                     # Script data double escaped dash dash state ("--" already seen).
                     # A further '-' stays in this state; '>' switches to
                     # tok_state_script_data. Returns a character token for every
                     # input except EOF.
                     switch c = txt.charAt(cur++)
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     cur -= 1 # Reconsume
                                     return
                             when "\u0000"
                                     parse_error()
                                     tok_state = tok_state_script_data_double_escaped
                                     return new_character_token "\ufffd"
                             when '-'
                                     # state unchanged
                                     return new_character_token '-'
                             when '<'
                                     tok_state = tok_state_script_data_double_escaped_less_than_sign
                             when '>'
                                     tok_state = tok_state_script_data
                             else
                                     # Anything else
                                     tok_state = tok_state_script_data_double_escaped
                     return new_character_token c
2927
2928         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
2929         tok_state_script_data_double_escaped_less_than_sign = ->
                     # Script data double escaped less-than sign state: only '/' is of
                     # interest. It resets temporary_buffer and is returned as a
                     # character token; any other character is stepped back over.
                     c = txt.charAt(cur++)
                     unless c is '/'
                             # Anything else
                             tok_state = tok_state_script_data_double_escaped
                             cur -= 1 # Reconsume
                             return
                     temporary_buffer = ''
                     tok_state = tok_state_script_data_double_escape_end
                     return new_character_token '/'
2939
2940         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
2941         tok_state_script_data_double_escape_end = ->
                     # Script data double escape end state: letters are collected
                     # (lower-cased) in temporary_buffer and returned as character
                     # tokens. On a terminator (whitespace, '/' or '>') a buffer equal
                     # to 'script' drops back to the escaped state; otherwise the
                     # double escaped state resumes.
                     switch c = txt.charAt(cur++)
                             when "\t", "\u000a", "\u000c", ' ', '/', '>'
                                     tok_state = if temporary_buffer is 'script' then tok_state_script_data_escaped else tok_state_script_data_double_escaped
                                     return new_character_token c
                     if is_uc_alpha(c)
                             temporary_buffer += c.toLowerCase() # yes, really lowercase
                     else if is_lc_alpha(c)
                             temporary_buffer += c
                     else
                             # Anything else
                             tok_state = tok_state_script_data_double_escaped
                             cur -= 1 # Reconsume
                             return
                     return new_character_token c
2959
2960         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2961         tok_state_before_attribute_name = ->
                     # Before attribute name state: skips whitespace, then either ends
                     # the tag or starts a new [name, value] pair at the front of
                     # tok_cur_tag.attrs_a.
                     #
                     # Returns the finished tag token on '>', otherwise null.
                     attr_name = null
                     c = txt.charAt(cur++)
                     if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
                             return null
                     if c is '/'
                             tok_state = tok_state_self_closing_start_tag
                             return null
                     if c is '>'
                             tok_state = tok_state_data
                             tmp = tok_cur_tag
                             tok_cur_tag = null
                             return tmp
                     if c is '' # EOF
                             # note: cur is not moved back in this branch
                             parse_error()
                             tok_state = tok_state_data
                             return null
                     # every remaining character becomes the first character of a name
                     if c is "\u0000"
                             parse_error()
                             attr_name = "\ufffd"
                     else if c is '"' or c is "'" or c is '<' or c is '='
                             parse_error()
                             attr_name = c
                     else if is_uc_alpha(c)
                             attr_name = c.toLowerCase()
                     else
                             attr_name = c
                     tok_cur_tag.attrs_a.unshift [attr_name, '']
                     tok_state = tok_state_attribute_name
                     return null
2992
2993         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2994         tok_state_attribute_name = ->
                     # Attribute name state: appends characters to the name of the
                     # attribute currently being built, which is the first entry of
                     # tok_cur_tag.attrs_a (index [0][0] is its name).
                     #
                     # Returns the finished tag token on '>', otherwise null.
                     switch c = txt.charAt(cur++)
                             when "\t", "\n", "\u000c", ' '
                                     tok_state = tok_state_after_attribute_name
                             when '/'
                                     tok_state = tok_state_self_closing_start_tag
                             when '='
                                     tok_state = tok_state_before_attribute_value
                             when '>'
                                     tok_state = tok_state_data
                                     tmp = tok_cur_tag
                                     tok_cur_tag = null
                                     return tmp
                             when "\u0000"
                                     parse_error()
                                     # append: plain assignment would discard the name so far
                                     tok_cur_tag.attrs_a[0][0] += "\ufffd"
                             when '"', "'", '<'
                                     parse_error()
                                     tok_cur_tag.attrs_a[0][0] += c
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     cur -= 1 # Reconsume
                             else
                                     if is_uc_alpha(c)
                                             # e.g. "onClick" must become "onclick", not "click"
                                             tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
                                     else
                                             tok_cur_tag.attrs_a[0][0] += c
                     return null
3022
3023         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3024         tok_state_after_attribute_name = ->
                     # After attribute name state: whitespace followed an attribute
                     # name. Decides whether a value follows ('='), the tag ends
                     # ('/' or '>'), or a new attribute starts.
                     #
                     # Returns the finished tag token on '>', otherwise nothing.
                     c = txt.charAt(cur++)
                     if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
                             return
                     if c is '/'
                             tok_state = tok_state_self_closing_start_tag
                             return
                     if c is '='
                             tok_state = tok_state_before_attribute_value
                             return
                     if c is '>'
                             tok_state = tok_state_data
                             # Emit the tag; without this, input such as "<a href >" is
                             # dropped from the token stream.
                             tmp = tok_cur_tag
                             tok_cur_tag = null
                             return tmp
                     if is_uc_alpha(c)
                             tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
                             tok_state = tok_state_attribute_name
                             return
                     if c is "\u0000"
                             parse_error()
                             tok_cur_tag.attrs_a.unshift ["\ufffd", '']
                             tok_state = tok_state_attribute_name
                             return
                     if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                             cur -= 1 # reconsume
                             return
                     if c is '"' or c is "'" or c is '<'
                             parse_error()
                             # fall through to Anything else
                     # Anything else
                     tok_cur_tag.attrs_a.unshift [c, '']
                     tok_state = tok_state_attribute_name
                     # Explicit return: otherwise the assignment above is the last
                     # expression and CoffeeScript returns the state function itself.
                     return
3057
3058         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3059         tok_state_before_attribute_value = ->
                     # Before attribute value state: skips whitespace after '=' and
                     # chooses the quoted or unquoted value state. Values are
                     # accumulated in tok_cur_tag.attrs_a[0][1].
                     #
                     # Returns the tag token on '>', otherwise null.
                     c = txt.charAt(cur++)
                     if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
                             return null
                     if c is '>'
                             # Parse error
                             tok_state = tok_state_data
                             tmp = tok_cur_tag
                             tok_cur_tag = null
                             return tmp
                     if c is '"'
                             tok_state = tok_state_attribute_value_double_quoted
                     else if c is "'"
                             tok_state = tok_state_attribute_value_single_quoted
                     else if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                     else
                             # everything else begins an unquoted value
                             if c is '&'
                                     cur -= 1 # let the unquoted state see the '&'
                             else if c is "\u0000"
                                     # Parse error
                                     tok_cur_tag.attrs_a[0][1] += "\ufffd"
                             else
                                     tok_cur_tag.attrs_a[0][1] += c
                             tok_state = tok_state_attribute_value_unquoted
                     return null
3087
3088         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3089         tok_state_attribute_value_double_quoted = ->
                     # Attribute value (double-quoted) state: appends to the current
                     # attribute's value (tok_cur_tag.attrs_a[0][1]) until the closing
                     # '"'. Always returns null.
                     c = txt.charAt(cur++)
                     if c is '"'
                             tok_state = tok_state_after_attribute_value_quoted
                     else if c is '&'
                             tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
                     else if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                     else
                             # NUL (a parse error) is stored as U+FFFD, anything else as-is
                             tok_cur_tag.attrs_a[0][1] += if c is "\u0000" then "\ufffd" else c
                     return null
3104
3105         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3106         tok_state_attribute_value_single_quoted = ->
                     # Attribute value (single-quoted) state: appends to the current
                     # attribute's value (tok_cur_tag.attrs_a[0][1]) until the closing
                     # "'". Always returns null.
                     c = txt.charAt(cur++)
                     if c is "'"
                             tok_state = tok_state_after_attribute_value_quoted
                     else if c is '&'
                             tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
                     else if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                     else
                             # NUL (a parse error) is stored as U+FFFD, anything else as-is
                             tok_cur_tag.attrs_a[0][1] += if c is "\u0000" then "\ufffd" else c
                     return null
3121
3122         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3123         tok_state_attribute_value_unquoted = ->
                     # Attribute value (unquoted) state: appends to the current
                     # attribute's value (tok_cur_tag.attrs_a[0][1]) until whitespace
                     # or '>'.
                     #
                     # Returns the finished tag token on '>', otherwise null.
                     c = txt.charAt(cur++)
                     if c is '>'
                             tok_state = tok_state_data
                             tmp = tok_cur_tag
                             tok_cur_tag = null
                             return tmp
                     if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
                             tok_state = tok_state_before_attribute_name
                     else if c is '&'
                             tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
                     else if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                     else
                             # Parse Error if ', <, = or ` (backtick)
                             # NUL is stored as U+FFFD, anything else as-is
                             tok_cur_tag.attrs_a[0][1] += if c is "\u0000" then "\ufffd" else c
                     return null
3143
3144         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3145         tok_state_after_attribute_value_quoted = ->
                     # After attribute value (quoted) state: the closing quote was just
                     # consumed.
                     #
                     # Returns the finished tag token on '>', otherwise null.
                     c = txt.charAt(cur++)
                     if c is '>'
                             tok_state = tok_state_data
                             tmp = tok_cur_tag
                             tok_cur_tag = null
                             return tmp
                     if c is '/'
                             tok_state = tok_state_self_closing_start_tag
                     else if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                     else
                             tok_state = tok_state_before_attribute_name
                             unless c is "\t" or c is "\n" or c is "\u000c" or c is ' '
                                     # Parse Error
                                     cur -= 1 # we didn't handle that char
                     return null
3164
3165         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3166         tok_state_self_closing_start_tag = ->
                     # Self-closing start tag state: a '/' was seen inside a tag.
                     #
                     # Returns tok_cur_tag when '>' follows; otherwise reports a parse
                     # error, steps back over the character and returns nothing.
                     c = txt.charAt(cur++)
                     if c is '>'
                             # NOTE(review): other callers in this file pass a second
                             # argument (flag 'force-quirks', true); confirm that the
                             # one-argument form sets the flag.
                             tok_cur_tag.flag 'self-closing'
                             tok_state = tok_state_data
                             return tok_cur_tag
                     parse_error()
                     # EOF goes back to the data state, anything else is treated as
                     # the start of another attribute
                     tok_state = if c is '' then tok_state_data else tok_state_before_attribute_name
                     cur -= 1 # Reconsume
                     return
3182
3183         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3184         # WARNING: put a comment token in tok_cur_tag before setting this state
3185         tok_state_bogus_comment = ->
                     # Bogus comment state: consumes everything up to the next '>' (or
                     # to the end of the input when there is none) and appends it to
                     # the comment token already stored in tok_cur_tag.
                     #
                     # Returns tok_cur_tag.
                     next_gt = txt.indexOf '>', cur
                     if next_gt is -1
                             val = txt.substr cur
                             cur = txt.length
                     else
                             val = txt.substr cur, (next_gt - cur)
                             cur = next_gt + 1 # also skip the '>' itself
                     # Global regex: a string pattern would replace only the first NUL.
                     val = val.replace /\u0000/g, "\ufffd"
                     tok_cur_tag.text += val
                     tok_state = tok_state_data
                     return tok_cur_tag
3197
3198         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3199         tok_state_markup_declaration_open = ->
                     # Markup declaration open state: "<!" was just consumed. Looks
                     # ahead without consuming to tell comments ("--"), doctypes
                     # ("doctype", any case) and CDATA sections ("[CDATA[", only when
                     # the adjusted current node is outside the HTML namespace) apart.
                     # Anything else becomes a bogus comment.
                     #
                     # Never returns a token itself; it only selects the next state.
                     if txt.substr(cur, 2) is '--'
                             cur += 2
                             tok_cur_tag = new_comment_token ''
                             tok_state = tok_state_comment_start
                             return
                     if txt.substr(cur, 7).toLowerCase() is 'doctype'
                             cur += 7
                             tok_state = tok_state_doctype
                             return
                     acn = adjusted_current_node()
                     if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
                             cur += 7
                             tok_state = tok_state_cdata_section
                             return
                     # Otherwise
                     parse_error()
                     # The comment starts empty: per the spec its data begins with the
                     # next character consumed, so "<!foo>" yields "foo", not "!foo".
                     tok_cur_tag = new_comment_token ''
                     tok_state = tok_state_bogus_comment
                     return
3219
3220         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3221         tok_state_comment_start = ->
                     # Comment start state: handles the first character after "<!--".
                     # The comment token being built is tok_cur_tag; its data lives
                     # in .text.
                     #
                     # Returns tok_cur_tag when the comment ends here ('>' or EOF),
                     # otherwise null.
                     switch c = txt.charAt(cur++)
                             when '-'
                                     tok_state = tok_state_comment_start_dash
                             when "\u0000"
                                     parse_error()
                                     # NUL becomes part of the comment, not a character token
                                     tok_cur_tag.text += "\ufffd"
                                     tok_state = tok_state_comment
                             when '>'
                                     parse_error()
                                     tok_state = tok_state_data
                                     return tok_cur_tag
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     cur -= 1 # Reconsume
                                     return tok_cur_tag
                             else
                                     tok_cur_tag.text += c
                                     # Leave the start state: only the very first character
                                     # may end the comment with a bare '>'; otherwise
                                     # "<!--a>b-->" would end at the first '>'.
                                     tok_state = tok_state_comment
                     return null
3240
3241         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3242         tok_state_comment_start_dash = ->
                     # Comment start dash state ("<!--" followed by one '-').
                     #
                     # Returns tok_cur_tag when the comment ends here ('>' or EOF),
                     # otherwise null.
                     c = txt.charAt(cur++)
                     if c is '-'
                             tok_state = tok_state_comment_end
                             return null
                     if c is '>' or c is ''
                             parse_error()
                             tok_state = tok_state_data
                             cur -= 1 if c is '' # Reconsume (EOF only)
                             return tok_cur_tag
                     # the pending '-' turns out to be comment data
                     if c is "\u0000"
                             parse_error()
                             tok_cur_tag.text += "-\ufffd"
                     else
                             tok_cur_tag.text += "-#{c}"
                     tok_state = tok_state_comment
                     return null
3263
3264         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3265         tok_state_comment = ->
                     # Comment state: appends characters to tok_cur_tag.text until a
                     # '-' is seen.
                     #
                     # Returns tok_cur_tag at EOF, otherwise null.
                     c = txt.charAt(cur++)
                     if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                             cur -= 1 # Reconsume
                             return tok_cur_tag
                     if c is '-'
                             tok_state = tok_state_comment_end_dash
                     else if c is "\u0000"
                             parse_error()
                             tok_cur_tag.text += "\ufffd"
                     else
                             tok_cur_tag.text += c
                     return null
3280
3281         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3282         tok_state_comment_end_dash = ->
                     # Comment end dash state (one '-' seen inside a comment).
                     #
                     # Returns tok_cur_tag at EOF, otherwise null.
                     c = txt.charAt(cur++)
                     if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                             cur -= 1 # Reconsume
                             return tok_cur_tag
                     if c is '-'
                             tok_state = tok_state_comment_end
                             return null
                     # the pending '-' turns out to be comment data
                     if c is "\u0000"
                             parse_error()
                             tok_cur_tag.text += "-\ufffd"
                     else
                             tok_cur_tag.text += "-#{c}"
                     tok_state = tok_state_comment
                     return null
3299
3300         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3301         tok_state_comment_end = ->
                     # Comment end state ("--" seen inside a comment).
                     #
                     # Returns tok_cur_tag when the comment is finished ('>' or EOF),
                     # otherwise null.
                     c = txt.charAt(cur++)
                     if c is '>'
                             tok_state = tok_state_data
                             return tok_cur_tag
                     # every other input is reported as a parse error first
                     parse_error()
                     if c is '' # EOF
                             tok_state = tok_state_data
                             cur -= 1 # Reconsume
                             return tok_cur_tag
                     if c is '!'
                             tok_state = tok_state_comment_end_bang
                     else if c is '-'
                             # still two pending dashes; the extra one is comment data
                             tok_cur_tag.text += '-'
                     else
                             # the pending "--" turns out to be comment data
                             tok_cur_tag.text += if c is "\u0000" then "--\ufffd" else "--#{c}"
                             tok_state = tok_state_comment
                     return null
3326
3327         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3328         tok_state_comment_end_bang = ->
                     # Comment end bang state ("--!" seen inside a comment).
                     #
                     # Returns tok_cur_tag when the comment is finished ('>' or EOF),
                     # otherwise null.
                     switch c = txt.charAt(cur++)
                             when '-'
                                     # Append only "--!": the '-' just read is still pending
                                     # and tok_state_comment_end_dash adds it later, so
                                     # appending it here too would double it.
                                     tok_cur_tag.text += "--!"
                                     tok_state = tok_state_comment_end_dash
                             when '>'
                                     tok_state = tok_state_data
                                     return tok_cur_tag
                             when "\u0000"
                                     parse_error()
                                     tok_cur_tag.text += "--!\ufffd"
                                     tok_state = tok_state_comment
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     cur -= 1 # Reconsume
                                     return tok_cur_tag
                             else
                                     tok_cur_tag.text += "--!#{c}"
                                     tok_state = tok_state_comment
                     return null
3349
3350         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3351         tok_state_doctype = ->
                     # DOCTYPE state: expects whitespace after the "doctype" keyword.
                     #
                     # Returns a new doctype token flagged 'force-quirks' at EOF,
                     # otherwise null.
                     c = txt.charAt(cur++)
                     if c is '' # EOF
                             parse_error()
                             tok_state = tok_state_data
                             el = new_doctype_token ''
                             el.flag 'force-quirks', true
                             cur -= 1 # Reconsume
                             return el
                     unless c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
                             # not whitespace: report it and let the next state see it again
                             parse_error()
                             cur -= 1 # Reconsume
                     tok_state = tok_state_before_doctype_name
                     return null
3367
3368         # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3369         tok_state_before_doctype_name = ->
                     # Before DOCTYPE name state: skips whitespace, then starts a
                     # doctype token in tok_cur_tag from the first name character
                     # (uppercase letters lower-cased, NUL replaced by U+FFFD).
                     #
                     # Returns a doctype token flagged 'force-quirks' on '>' or EOF,
                     # null after an ordinary name character, otherwise nothing.
                     switch c = txt.charAt(cur++)
                             when "\t", "\u000a", "\u000c", ' '
                                     return
                             when "\u0000"
                                     parse_error()
                                     tok_cur_tag = new_doctype_token "\ufffd"
                                     tok_state = tok_state_doctype_name
                                     return
                             when '>'
                                     parse_error()
                                     el = new_doctype_token ''
                                     el.flag 'force-quirks', true
                                     tok_state = tok_state_data
                                     return el
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     el = new_doctype_token ''
                                     el.flag 'force-quirks', true
                                     cur -= 1 # Reconsume
                                     return el
                     if is_uc_alpha(c)
                             tok_cur_tag = new_doctype_token c.toLowerCase()
                             tok_state = tok_state_doctype_name
                             return
                     # Anything else
                     tok_cur_tag = new_doctype_token c
                     tok_state = tok_state_doctype_name
                     return null
3399
3400         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3401         tok_state_doctype_name = ->
                     # DOCTYPE name state: appends characters to tok_cur_tag.name
                     # (uppercase letters lower-cased, NUL replaced by U+FFFD) until
                     # whitespace or '>'.
                     #
                     # Returns tok_cur_tag on '>' or EOF (at EOF also flagged
                     # 'force-quirks'), null after an ordinary character, otherwise
                     # nothing.
                     switch c = txt.charAt(cur++)
                             when "\t", "\u000a", "\u000c", ' '
                                     tok_state = tok_state_after_doctype_name
                                     return
                             when '>'
                                     tok_state = tok_state_data
                                     return tok_cur_tag
                             when "\u0000"
                                     parse_error()
                                     tok_cur_tag.name += "\ufffd"
                                     return
                             when '' # EOF
                                     parse_error()
                                     tok_state = tok_state_data
                                     tok_cur_tag.flag 'force-quirks', true
                                     cur -= 1 # Reconsume
                                     return tok_cur_tag
                     if is_uc_alpha(c)
                             tok_cur_tag.name += c.toLowerCase()
                             return
                     # Anything else
                     tok_cur_tag.name += c
                     return null
3425
3426         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3427         tok_state_after_doctype_name = ->
3428                 c = txt.charAt(cur++)
3429                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3430                         return
3431                 if c is '>'
3432                         tok_state = tok_state_data
3433                         return tok_cur_tag
3434                 if c is '' # EOF
3435                         parse_error()
3436                         tok_state = tok_state_data
3437                         tok_cur_tag.flag 'force-quirks', true
3438                         cur -= 1 # Reconsume
3439                         return tok_cur_tag
3440                 # Anything else
3441                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3442                         cur += 5
3443                         tok_state = tok_state_after_doctype_public_keyword
3444                         return
3445                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3446                         cur += 5
3447                         tok_state = tok_state_after_doctype_system_keyword
3448                         return
3449                 parse_error()
3450                 tok_cur_tag.flag 'force-quirks', true
3451                 tok_state = tok_state_bogus_doctype
3452                 return null
3453
3454         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3455         tok_state_after_doctype_public_keyword = ->
3456                 c = txt.charAt(cur++)
3457                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3458                         tok_state = tok_state_before_doctype_public_identifier
3459                         return
3460                 if c is '"'
3461                         parse_error()
3462                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3463                         tok_state = tok_state_doctype_public_identifier_double_quoted
3464                         return
3465                 if c is "'"
3466                         parse_error()
3467                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3468                         tok_state = tok_state_doctype_public_identifier_single_quoted
3469                         return
3470                 if c is '>'
3471                         parse_error()
3472                         tok_cur_tag.flag 'force-quirks', true
3473                         tok_state = tok_state_data
3474                         return tok_cur_tag
3475                 if c is '' # EOF
3476                         parse_error()
3477                         tok_state = tok_state_data
3478                         tok_cur_tag.flag 'force-quirks', true
3479                         cur -= 1 # Reconsume
3480                         return tok_cur_tag
3481                 # Anything else
3482                 parse_error()
3483                 tok_cur_tag.flag 'force-quirks', true
3484                 tok_state = tok_state_bogus_doctype
3485                 return null
3486
3487         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3488         tok_state_before_doctype_public_identifier = ->
3489                 c = txt.charAt(cur++)
3490                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3491                         return
3492                 if c is '"'
3493                         parse_error()
3494                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3495                         tok_state = tok_state_doctype_public_identifier_double_quoted
3496                         return
3497                 if c is "'"
3498                         parse_error()
3499                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3500                         tok_state = tok_state_doctype_public_identifier_single_quoted
3501                         return
3502                 if c is '>'
3503                         parse_error()
3504                         tok_cur_tag.flag 'force-quirks', true
3505                         tok_state = tok_state_data
3506                         return tok_cur_tag
3507                 if c is '' # EOF
3508                         parse_error()
3509                         tok_state = tok_state_data
3510                         tok_cur_tag.flag 'force-quirks', true
3511                         cur -= 1 # Reconsume
3512                         return tok_cur_tag
3513                 # Anything else
3514                 parse_error()
3515                 tok_cur_tag.flag 'force-quirks', true
3516                 tok_state = tok_state_bogus_doctype
3517                 return null
3518
3519
3520         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3521         tok_state_doctype_public_identifier_double_quoted = ->
3522                 c = txt.charAt(cur++)
3523                 if c is '"'
3524                         tok_state = tok_state_after_doctype_public_identifier
3525                         return
3526                 if c is "\u0000"
3527                         parse_error()
3528                         tok_cur_tag.public_identifier += "\ufffd"
3529                         return
3530                 if c is '>'
3531                         parse_error()
3532                         tok_cur_tag.flag 'force-quirks', true
3533                         tok_state = tok_state_data
3534                         return tok_cur_tag
3535                 if c is '' # EOF
3536                         parse_error()
3537                         tok_state = tok_state_data
3538                         tok_cur_tag.flag 'force-quirks', true
3539                         cur -= 1 # Reconsume
3540                         return tok_cur_tag
3541                 # Anything else
3542                 tok_cur_tag.public_identifier += c
3543                 return null
3544
3545         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3546         tok_state_doctype_public_identifier_single_quoted = ->
3547                 c = txt.charAt(cur++)
3548                 if c is "'"
3549                         tok_state = tok_state_after_doctype_public_identifier
3550                         return
3551                 if c is "\u0000"
3552                         parse_error()
3553                         tok_cur_tag.public_identifier += "\ufffd"
3554                         return
3555                 if c is '>'
3556                         parse_error()
3557                         tok_cur_tag.flag 'force-quirks', true
3558                         tok_state = tok_state_data
3559                         return tok_cur_tag
3560                 if c is '' # EOF
3561                         parse_error()
3562                         tok_state = tok_state_data
3563                         tok_cur_tag.flag 'force-quirks', true
3564                         cur -= 1 # Reconsume
3565                         return tok_cur_tag
3566                 # Anything else
3567                 tok_cur_tag.public_identifier += c
3568                 return null
3569
3570         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3571         tok_state_after_doctype_public_identifier = ->
3572                 c = txt.charAt(cur++)
3573                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3574                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3575                         return
3576                 if c is '>'
3577                         tok_state = tok_state_data
3578                         return tok_cur_tag
3579                 if c is '"'
3580                         parse_error()
3581                         tok_cur_tag.system_identifier = ''
3582                         tok_state = tok_state_doctype_system_identifier_double_quoted
3583                         return
3584                 if c is "'"
3585                         parse_error()
3586                         tok_cur_tag.system_identifier = ''
3587                         tok_state = tok_state_doctype_system_identifier_single_quoted
3588                         return
3589                 if c is '' # EOF
3590                         parse_error()
3591                         tok_state = tok_state_data
3592                         tok_cur_tag.flag 'force-quirks', true
3593                         cur -= 1 # Reconsume
3594                         return tok_cur_tag
3595                 # Anything else
3596                 parse_error()
3597                 tok_cur_tag.flag 'force-quirks', true
3598                 tok_state = tok_state_bogus_doctype
3599                 return null
3600
3601         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3602         tok_state_between_doctype_public_and_system_identifiers = ->
3603                 c = txt.charAt(cur++)
3604                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3605                         return
3606                 if c is '>'
3607                         tok_state = tok_state_data
3608                         return tok_cur_tag
3609                 if c is '"'
3610                         parse_error()
3611                         tok_cur_tag.system_identifier = ''
3612                         tok_state = tok_state_doctype_system_identifier_double_quoted
3613                         return
3614                 if c is "'"
3615                         parse_error()
3616                         tok_cur_tag.system_identifier = ''
3617                         tok_state = tok_state_doctype_system_identifier_single_quoted
3618                         return
3619                 if c is '' # EOF
3620                         parse_error()
3621                         tok_state = tok_state_data
3622                         tok_cur_tag.flag 'force-quirks', true
3623                         cur -= 1 # Reconsume
3624                         return tok_cur_tag
3625                 # Anything else
3626                 parse_error()
3627                 tok_cur_tag.flag 'force-quirks', true
3628                 tok_state = tok_state_bogus_doctype
3629                 return null
3630
3631         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3632         tok_state_after_doctype_system_keyword = ->
3633                 c = txt.charAt(cur++)
3634                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3635                         tok_state = tok_state_before_doctype_system_identifier
3636                         return
3637                 if c is '"'
3638                         parse_error()
3639                         tok_cur_tag.system_identifier = ''
3640                         tok_state = tok_state_doctype_system_identifier_double_quoted
3641                         return
3642                 if c is "'"
3643                         parse_error()
3644                         tok_cur_tag.system_identifier = ''
3645                         tok_state = tok_state_doctype_system_identifier_single_quoted
3646                         return
3647                 if c is '>'
3648                         parse_error()
3649                         tok_cur_tag.flag 'force-quirks', true
3650                         tok_state = tok_state_data
3651                         return tok_cur_tag
3652                 if c is '' # EOF
3653                         parse_error()
3654                         tok_state = tok_state_data
3655                         tok_cur_tag.flag 'force-quirks', true
3656                         cur -= 1 # Reconsume
3657                         return tok_cur_tag
3658                 # Anything else
3659                 parse_error()
3660                 tok_cur_tag.flag 'force-quirks', true
3661                 tok_state = tok_state_bogus_doctype
3662                 return null
3663
3664         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3665         tok_state_before_doctype_system_identifier = ->
3666                 c = txt.charAt(cur++)
3667                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3668                         return
3669                 if c is '"'
3670                         tok_cur_tag.system_identifier = ''
3671                         tok_state = tok_state_doctype_system_identifier_double_quoted
3672                         return
3673                 if c is "'"
3674                         tok_cur_tag.system_identifier = ''
3675                         tok_state = tok_state_doctype_system_identifier_single_quoted
3676                         return
3677                 if c is '>'
3678                         parse_error()
3679                         tok_cur_tag.flag 'force-quirks', true
3680                         tok_state = tok_state_data
3681                         return tok_cur_tag
3682                 if c is '' # EOF
3683                         parse_error()
3684                         tok_state = tok_state_data
3685                         tok_cur_tag.flag 'force-quirks', true
3686                         cur -= 1 # Reconsume
3687                         return tok_cur_tag
3688                 # Anything else
3689                 parse_error()
3690                 tok_cur_tag.flag 'force-quirks', true
3691                 tok_state = tok_state_bogus_doctype
3692                 return null
3693
3694         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3695         tok_state_doctype_system_identifier_double_quoted = ->
3696                 c = txt.charAt(cur++)
3697                 if c is '"'
3698                         tok_state = tok_state_after_doctype_system_identifier
3699                         return
3700                 if c is "\u0000"
3701                         parse_error()
3702                         tok_cur_tag.system_identifier += "\ufffd"
3703                         return
3704                 if c is '>'
3705                         parse_error()
3706                         tok_cur_tag.flag 'force-quirks', true
3707                         tok_state = tok_state_data
3708                         return tok_cur_tag
3709                 if c is '' # EOF
3710                         parse_error()
3711                         tok_state = tok_state_data
3712                         tok_cur_tag.flag 'force-quirks', true
3713                         cur -= 1 # Reconsume
3714                         return tok_cur_tag
3715                 # Anything else
3716                 tok_cur_tag.system_identifier += c
3717                 return null
3718
3719         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3720         tok_state_doctype_system_identifier_single_quoted = ->
3721                 c = txt.charAt(cur++)
3722                 if c is "'"
3723                         tok_state = tok_state_after_doctype_system_identifier
3724                         return
3725                 if c is "\u0000"
3726                         parse_error()
3727                         tok_cur_tag.system_identifier += "\ufffd"
3728                         return
3729                 if c is '>'
3730                         parse_error()
3731                         tok_cur_tag.flag 'force-quirks', true
3732                         tok_state = tok_state_data
3733                         return tok_cur_tag
3734                 if c is '' # EOF
3735                         parse_error()
3736                         tok_state = tok_state_data
3737                         tok_cur_tag.flag 'force-quirks', true
3738                         cur -= 1 # Reconsume
3739                         return tok_cur_tag
3740                 # Anything else
3741                 tok_cur_tag.system_identifier += c
3742                 return null
3743
3744         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3745         tok_state_after_doctype_system_identifier = ->
3746                 c = txt.charAt(cur++)
3747                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3748                         return
3749                 if c is '>'
3750                         tok_state = tok_state_data
3751                         return tok_cur_tag
3752                 if c is '' # EOF
3753                         parse_error()
3754                         tok_state = tok_state_data
3755                         tok_cur_tag.flag 'force-quirks', true
3756                         cur -= 1 # Reconsume
3757                         return tok_cur_tag
3758                 # Anything else
3759                 parse_error()
3760                 # do _not_ tok_cur_tag.flag 'force-quirks', true
3761                 tok_state = tok_state_bogus_doctype
3762                 return null
3763
3764         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3765         tok_state_bogus_doctype = ->
3766                 c = txt.charAt(cur++)
3767                 if c is '>'
3768                         tok_state = tok_state_data
3769                         return tok_cur_tag
3770                 if c is '' # EOF
3771                         tok_state = tok_state_data
3772                         cur -= 1 # Reconsume
3773                         return tok_cur_tag
3774                 # Anything else
3775                 return null
3776
3777
3778         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3779         # Don't set this as a state, just call it
3780         # returns a string (NOT a text node)
3781         parse_character_reference = (allowed_char = null, in_attr = false) ->
3782                 if cur >= txt.length
3783                         return '&'
3784                 switch c = txt.charAt(cur)
3785                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3786                                 # explicitly not a parse error
3787                                 return '&'
3788                         when ';'
3789                                 # there has to be "one or more" alnums between & and ; to be a parse error
3790                                 return '&'
3791                         when '#'
3792                                 if cur + 1 >= txt.length
3793                                         return '&'
3794                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
3795                                         prefix = '#x'
3796                                         charset = hex_chars
3797                                         start = cur + 2
3798                                 else
3799                                         charset = digits
3800                                         start = cur + 1
3801                                         prefix = '#'
3802                                 i = 0
3803                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3804                                         i += 1
3805                                 if i is 0
3806                                         return '&'
3807                                 if txt.charAt(start + i) is ';'
3808                                         i += 1
3809                                 # FIXME This is supposed to generate parse errors for some chars
3810                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3811                                 if decoded?
3812                                         cur = start + i
3813                                         return decoded
3814                                 return '&'
3815                         else
3816                                 for i in [0...31]
3817                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
3818                                                 break
3819                                 if i is 0
3820                                         # exit early, because parse_error() below needs at least one alnum
3821                                         return '&'
3822                                 if txt.charAt(cur + i) is ';'
3823                                         i += 1 # include ';' terminator in value
3824                                         decoded = decode_named_char_ref txt.substr(cur, i)
3825                                         if decoded?
3826                                                 cur += i
3827                                                 return decoded
3828                                         parse_error()
3829                                         return '&'
3830                                 else
3831                                         # no ';' terminator (only legacy char refs)
3832                                         max = i
3833                                         for i in [2..max] # no prefix matches, so ok to check shortest first
3834                                                 c = legacy_char_refs[txt.substr(cur, i)]
3835                                                 if c?
3836                                                         if in_attr
3837                                                                 if txt.charAt(cur + i) is '='
3838                                                                         # "because some legacy user agents will
3839                                                                         # misinterpret the markup in those cases"
3840                                                                         parse_error()
3841                                                                         return '&'
3842                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
3843                                                                         # this makes attributes forgiving about url args
3844                                                                         return '&'
3845                                                         # ok, and besides the weird exceptions for attributes...
3846                                                         # return the matching char
3847                                                         cur += i # consume entity chars
3848                                                         parse_error() # because no terminating ";"
3849                                                         return c
3850                                         parse_error()
3851                                         return '&'
3852                 return # never reached
3853
3854         # tree constructor initialization
3855         # see comments on TYPE_TAG/etc for the structure of this data
3856         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3857         open_els = []
3858         afe = [] # active formatting elements
3859         template_ins_modes = []
3860         ins_mode = ins_mode_initial
3861         original_ins_mode = ins_mode # TODO check spec
3862         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3863         flag_frameset_ok = true
3864         flag_parsing = true
3865         flag_foster_parenting = false
3866         form_element_pointer = null
3867         temporary_buffer = null
3868         pending_table_character_tokens = []
3869         head_element_pointer = null
3870         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3871         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3872
3873         # tokenizer initialization
3874         tok_state = tok_state_data
3875
3876         # process input
3877         while flag_parsing
3878                 t = tok_state()
3879                 if t?
3880                         ins_mode t
3881                         # fixfull parse error if the token has its self-closing flag set, but it wasn't acknowledged
3882         return doc.children
3883
# Builds one comma-separated string out of `els` by calling
# serialize(shallow, show_ids) on every element, in order.
# Returns '' when `els` is empty.
serialize_els = (els, shallow, show_ids) ->
        # the interpolation coerces each result to a string, exactly like
        # appending it to a string with += would
        pieces = ("#{el.serialize(shallow, show_ids)}" for el in els)
        return pieces.join ','
3892
3893 # TODO export TYPE_*
3894 module.exports.parse_html = parse_html
3895 module.exports.debug_log_reset = debug_log_reset
3896 module.exports.debug_log_each = debug_log_each
3897 module.exports.TYPE_TAG = TYPE_TAG
3898 module.exports.TYPE_TEXT = TYPE_TEXT
3899 module.exports.TYPE_COMMENT = TYPE_COMMENT
3900 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE