JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
tests pass running under node.js
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a thorough parser for html5, meant to be used by a
19 # WYSIWYG editor.
20
21 # The implementation is a pretty direct implementation of the parsing algorithm
22 # described here:
23 #
24 #     http://www.w3.org/TR/html5/syntax.html
25 #
26 # except for some places marked "WHATWG" that are implemented as described here:
27 #
28 #     https://html.spec.whatwg.org/multipage/syntax.html
29 #
30 # This code passes all of the tests in the .dat files at:
31 #
32 #     https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
33
34
35 ##################################
36 ## how to use this code
37 ##################################
38 #
39 # See README.md for how to pre-compile this file, or compile it in the browser.
40 #
41 # This file exports a single useful function: parse_html
42 #
43 # Once you include this file in a page (see index.html for an example) you'll
44 # have window.wheic
45 #
46 # Call it like this:
47 #
48 #     wheic.parse_html({html: "<p><b>hi</p>"})
49 #
50 # Or, if you don't want <html><head><body>/etc, do this:
51 #
52 #     wheic.parse_html({fragment: "body", html: "<p><b>hi</p>"})
53 #
54 # This code can _almost_ run outside the browser (eg under node.js). To get it
55 # to run without the browser would require native implementation of
56 # decode_named_char_ref(). The current implementation of that function uses the
57 # browser's DOM api, to save space (the list of valid named characters is
58 # massive.)
59
60 # This code is a work in progress; e.g. try searching this file for "fixfull",
61 # "TODO" and "FIXME"
62
63
64 # Notes:  stacks/lists
65 #
66 # Jason was frequently confused by the terminology used to refer to different
67 # parts of the stacks and lists in the spec, so he made this chart to help keep
68 # his head straight:
69 #
70 # stacks grow downward (current element is index=0)
71 #
72 # example: open_els = [a, b, c, d, e, f, g]
73 #
74 # "grows downwards" means it's visualized like this: (index: el, names)
75 #
76 #   6: g "start of the list", "topmost", "first"
77 #   5: f
78 #   4: e "previous" (to d), "above", "before"
79 #   3: d   (previous/next are relative to this element)
80 #   2: c "next", "after", "lower", "below"
81 #   1: b
82 #   0: a "end of the list", "current node", "bottommost", "last"
83
# Decide where the public API goes: module.exports when a CommonJS-style
# `module` object is available, otherwise a fresh window.wheic namespace.
if (typeof module) is 'undefined' or not module.exports?
        context = 'browser'
        window.wheic = {}
        exports = window.wheic
else
        context = 'module'
        exports = module.exports
91
# Turn a numeric code point into a string.
# Uses String.fromCodePoint when the runtime has it; otherwise encodes by hand,
# emitting a UTF-16 surrogate pair for code points above 0xffff.
from_code_point = (x) ->
        return String.fromCodePoint x if String.fromCodePoint?
        return String.fromCharCode x if x <= 0xffff
        offset = x - 0x10000
        high_surrogate = 0xd800 + (offset >> 10)
        low_surrogate = 0xdc00 + (offset % 0x400)
        return String.fromCharCode high_surrogate, low_surrogate
100
# Each node is an object of the Node class. Node types that end up in the tree:
#   TYPE_TAG      name, {attributes}, [children]
#   TYPE_TEXT     "text"
#   TYPE_COMMENT
#   TYPE_DOCTYPE
[TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE] = [0, 1, 2, 3]
# The following types are emitted by the tokenizer, but shouldn't end up in the tree:
#   TYPE_START_TAG  name, [attributes ([key,value]...) in reverse order], [children]
#   TYPE_END_TAG    name
#   TYPE_EOF
[TYPE_START_TAG, TYPE_END_TAG, TYPE_EOF] = [4, 5, 6]
# TYPE_AFE_MARKER:   http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
# TYPE_AAA_BOOKMARK: http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
[TYPE_AFE_MARKER, TYPE_AAA_BOOKMARK] = [7, 8]

# namespace constants
[NS_HTML, NS_MATHML, NS_SVG] = [1, 2, 3]

# quirks mode constants
[QUIRKS_NO, QUIRKS_LIMITED, QUIRKS_YES] = [1, 2, 3]
122
# Debug messages are buffered rather than printed, so that e.g. a test runner
# can show them only for the tests that fail.
g_debug_log = []

# Drop every buffered message (starts a fresh buffer).
debug_log_reset = ->
        g_debug_log = []
        undefined

# Append one message to the buffer.
debug_log = (str) ->
        g_debug_log.push str
        undefined

# Call cb once for each buffered message, in the order they were logged.
debug_log_each = (cb) ->
        cb message for message in g_debug_log
        return
135
# Counter used to hand out ids to nodes that are not given one explicitly.
prev_node_id = 0

# A Node is used both for tree nodes and for tokenizer tokens.
class Node
        # type: one of the TYPE_* constants
        # args: optional fields, all defaulted when absent:
        #   name, text, attrs, attrs_a, children, namespace, parent, token,
        #   flags, id
        # The in-progress attribute list is stored as @attrs_a. It is read from
        # args.attrs_a; the historical spelling args.attr_k is still accepted so
        # existing callers keep working.
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                @attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                @flags = args.flags ? {}
                if args.id?
                        # an id derived from another node's id gets a "+" suffix
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"
        # Record that the self-closing flag was acknowledged: on the originating
        # token when this node has one, otherwise on this node itself.
        acknowledge_self_closing: ->
                if @token?
                        @token.flag 'did_self_close', true
                else
                        @flag 'did_self_close', true
                return
        # flag(key) returns the stored value for key (undefined if never set).
        # flag(key, value) stores a non-null value and returns nothing.
        flag: (key, value = null) ->
                if value?
                        @flags[key] = value
                else
                        return @flags[key]
                return
165
# Factory helpers. Each accepts only the arguments that are normally known at
# the moment the parser creates that kind of node or token.
new_open_tag = (name) -> new Node TYPE_START_TAG, {name}
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}
new_element = (name) -> new Node TYPE_TAG, {name}
new_text_node = (txt) -> new Node TYPE_TEXT, text: txt
# character tokens are represented exactly like text nodes
new_character_token = new_text_node
new_comment_token = (txt) -> new Node TYPE_COMMENT, text: txt
new_doctype_token = (name) -> new Node TYPE_DOCTYPE, {name}
new_eof_token = -> new Node TYPE_EOF
new_afe_marker = -> new Node TYPE_AFE_MARKER
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
186
# Character classes, each expressed as a string containing the member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# true only for a one-character string that is an ASCII uppercase letter
is_uc_alpha = (str) ->
        return false unless str.length is 1
        return uc_alpha.indexOf(str) isnt -1
# true only for a one-character string that is an ASCII lowercase letter
is_lc_alpha = (str) ->
        return false unless str.length is 1
        return lc_alpha.indexOf(str) isnt -1

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
200
# "space characters" as defined at
# http://www.w3.org/TR/html5/infrastructure.html#space-character
# (tab, line feed, form feed, carriage return, space)
space_chars = "\u0009\u000a\u000c\u000d\u0020"
# true only for a one-character string holding one of the space characters
is_space = (txt) ->
        return false unless txt.length is 1
        return space_chars.indexOf(txt) isnt -1
# true only for a text token whose text is a single space character
is_space_tok = (t) ->
        return t.type is TYPE_TEXT and is_space t.text
207
# Does this start tag token carry a `type` attribute whose value is "hidden"
# (compared case-insensitively)? Only the first `type` entry found in
# t.attrs_a is considered. Anything that is not a start tag token gives false.
is_input_hidden_tok = (t) ->
        return false if t.type isnt TYPE_START_TAG
        for [attr_name, attr_value] in t.attrs_a when attr_name is 'type'
                return attr_value.toLowerCase() is 'hidden'
        return false
216
# Unicode whitespace characters, per
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"

# Map from code point number to replacement character.
# NOTE(review): presumably the HTML5 override table for numeric character
# references (NUL and the 0x80-0x9F range) -- confirm against the tokenizer.
unicode_fixes = do ->
        replacements = [
                [0x00, "\uFFFD"]
                [0x80, "\u20AC"]
                [0x82, "\u201A"]
                [0x83, "\u0192"]
                [0x84, "\u201E"]
                [0x85, "\u2026"]
                [0x86, "\u2020"]
                [0x87, "\u2021"]
                [0x88, "\u02C6"]
                [0x89, "\u2030"]
                [0x8A, "\u0160"]
                [0x8B, "\u2039"]
                [0x8C, "\u0152"]
                [0x8E, "\u017D"]
                [0x91, "\u2018"]
                [0x92, "\u2019"]
                [0x93, "\u201C"]
                [0x94, "\u201D"]
                [0x95, "\u2022"]
                [0x96, "\u2013"]
                [0x97, "\u2014"]
                [0x98, "\u02DC"]
                [0x99, "\u2122"]
                [0x9A, "\u0161"]
                [0x9B, "\u203A"]
                [0x9C, "\u0153"]
                [0x9E, "\u017E"]
                [0x9F, "\u0178"]
        ]
        fixes = {}
        for pair in replacements
                fixes[pair[0]] = pair[1]
        fixes
249
# Lowercased identifier prefixes.
# NOTE(review): judging by the name these are the doctype public-identifier
# prefixes that select full quirks mode -- confirm against the doctype handling.
quirks_yes_pi_prefixes = [
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//",
]
307
# Named character references that are recognised even without a terminating
# semicolon. Keys are the bare names (no "&", no ";").
# Name lengths range from 2 to 6, and no name is a prefix of another.
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´'
        AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&'
        Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä'
        brvbar: '¦'
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©'
        curren: '¤'
        deg: '°', divide: '÷'
        Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È', egrave: 'è'
        ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë'
        frac12: '½', frac14: '¼', frac34: '¾'
        GT: '>', gt: '>'
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡'
        Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï'
        laquo: '«', LT: '<', lt: '<'
        macr: '¯', micro: 'µ', middot: '·'
        nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ'
        Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò', ograve: 'ò'
        ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø', Otilde: 'Õ', otilde: 'õ'
        Ouml: 'Ö', ouml: 'ö'
        para: '¶', plusmn: '±', pound: '£'
        QUOT: '"', quot: '"'
        raquo: '»', REG: '®', reg: '®'
        # shy is the (invisible) soft hyphen, written as an escape for clarity
        sect: '§', shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß'
        THORN: 'Þ', thorn: 'þ', times: '×'
        Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù', ugrave: 'ù'
        uml: '¨', Uuml: 'Ü', uuml: 'ü'
        Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ'
}
330
# elements that never have content or an end tag
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
# raw text elements, and escapable raw text elements
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
# SVG element names, from http://www.w3.org/TR/SVG/ 1.1 (Second Edition).
# Written as space-separated rows and split into an array of names.
svg_elements = [
        'a altGlyph altGlyphDef altGlyphItem animate animateColor'
        'animateMotion animateTransform circle clipPath color-profile'
        'cursor defs desc ellipse feBlend feColorMatrix'
        'feComponentTransfer feComposite feConvolveMatrix'
        'feDiffuseLighting feDisplacementMap feDistantLight feFlood'
        'feFuncA feFuncB feFuncG feFuncR feGaussianBlur feImage'
        'feMerge feMergeNode feMorphology feOffset fePointLight'
        'feSpecularLighting feSpotLight feTile feTurbulence filter'
        'font font-face font-face-format font-face-name font-face-src'
        'font-face-uri foreignObject g glyph glyphRef hkern'
        'image line linearGradient marker mask metadata'
        'missing-glyph mpath path pattern polygon polyline'
        'radialGradient rect script set stop style svg'
        'switch symbol text textPath title tref tspan use'
        'view vkern'
].join(' ').split(' ')
352
# MathML element names, from http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition.
# Written as space-separated rows and split into an array of names.
# NOTE(review): 'mi' is listed twice in a row; kept as-is so the array is unchanged.
mathml_elements = [
        'abs and annotation annotation-xml apply approx arccos'
        'arccosh arccot arccoth arccsc arccsch arcsec arcsech'
        'arcsin arcsinh arctan arctanh arg bind bvar card'
        'cartesianproduct cbytes ceiling cerror ci cn codomain'
        'complexes compose condition conjugate cos cosh cot'
        'coth cs csc csch csymbol curl declare degree'
        'determinant diff divergence divide domain'
        'domainofapplication emptyset eq equivalent eulergamma'
        'exists exp exponentiale factorial factorof false floor'
        'fn forall gcd geq grad gt ident image imaginary'
        'imaginaryi implies in infinity int integers intersect'
        'interval inverse lambda laplacian lcm leq limit'
        'list ln log logbase lowlimit lt maction maligngroup'
        'malignmark math matrix matrixrow max mean median'
        'menclose merror mfenced mfrac mglyph mi mi min'
        'minus mlabeledtr mlongdiv mmultiscripts mn mo mode'
        'moment momentabout mover mpadded mphantom mprescripts'
        'mroot mrow ms mscarries mscarry msgroup msline'
        'mspace msqrt msrow mstack mstyle msub msubsup msup'
        'mtable mtd mtext mtr munder munderover naturalnumbers'
        'neq none not notanumber notin notprsubset notsubset'
        'or otherwise outerproduct partialdiff pi piece'
        'piecewise plus power primes product prsubset quotient'
        'rationals real reals reln rem root scalarproduct'
        'sdev sec sech selector semantics sep set setdiff'
        'share sin sinh span subset sum tan tanh tendsto'
        'times transpose true union uplimit variance vector'
        'vectorproduct xor'
].join(' ').split(' ')
384 # foreign_elements = [svg_elements..., mathml_elements...]
385 #normal_elements = All other allowed HTML elements are normal elements.
386
# Maps element name -> the namespace in which that name is a "special" element.
# Built group by group; a later group overwrites an earlier entry of the same name.
#
# NOTE(review): `title` is listed under both HTML and SVG. Because each name
# holds a single namespace, the SVG entry wins and an HTML <title> is NOT
# reported as special by lookups of the form
# `special_elements[name] is namespace`. This matches what the original object
# literal (which repeated the `title` key) produced; supporting both would
# require changing the lookups too.
special_elements = do ->
        table = {}
        mark = (ns, names) ->
                for name in names.split ' '
                        table[name] = ns
                return

        mark NS_HTML, [
                'address applet area article aside base basefont bgsound'
                'blockquote body br button caption center col colgroup dd'
                'details dir div dl dt embed fieldset figcaption figure'
                'footer form frame frameset h1 h2 h3 h4 h5 h6 head'
                'header hgroup hr html iframe img input isindex li link'
                'listing main marquee'
                'menu menuitem' # WHATWG adds these
                'meta nav noembed noframes noscript object ol p param'
                'plaintext pre script section select source style summary'
                'table tbody td template textarea tfoot th thead title'
                'tr track ul wbr xmp'
        ].join ' '

        mark NS_MATHML, 'mi mo mn ms mtext annotation-xml'

        mark NS_SVG, 'foreignObject desc title'

        table
418
# set (name -> true) of the formatting element names
formatting_elements =
        a: true
        b: true
        big: true
        code: true
        em: true
        font: true
        i: true
        nobr: true
        s: true
        small: true
        strike: true
        strong: true
        tt: true
        u: true
424
# MathML element names (mapped to their namespace) that are text integration points
mathml_text_integration =
        mi: NS_MATHML
        mo: NS_MATHML
        mn: NS_MATHML
        ms: NS_MATHML
        mtext: NS_MATHML
# true when el's name is in the table above and el is in the matching namespace
is_mathml_text_integration_point = (el) ->
        expected_namespace = mathml_text_integration[el.name]
        return expected_namespace is el.namespace
# Is el an HTML integration point?
#   MathML: only annotation-xml whose `encoding` attribute is (ignoring case)
#           "text/html" or "application/xhtml+xml"
#   SVG:    foreignObject, desc and title
# Reads el.attrs, so this must be given an element, not a token.
is_html_integration = (el) -> # DON'T PASS A TOKEN
        switch el.namespace
                when NS_MATHML
                        return false unless el.name is 'annotation-xml' and el.attrs.encoding?
                        encoding = el.attrs.encoding.toLowerCase()
                        return encoding is 'text/html' or encoding is 'application/xhtml+xml'
                when NS_SVG
                        return el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
        return false
443
# heading elements
h_tags = {
        h1: NS_HTML, h2: NS_HTML, h3: NS_HTML
        h4: NS_HTML, h5: NS_HTML, h6: NS_HTML
}

# table-related elements (name -> namespace) used as foster parenting targets
foster_parenting_targets = {
        table: NS_HTML, tbody: NS_HTML, tfoot: NS_HTML, thead: NS_HTML, tr: NS_HTML
}

# elements (name -> namespace) whose end tag may be implied
end_tag_implied = {
        dd: NS_HTML, dt: NS_HTML, li: NS_HTML
        option: NS_HTML, optgroup: NS_HTML
        p: NS_HTML
        rb: NS_HTML, rp: NS_HTML, rt: NS_HTML, rtc: NS_HTML
}
468
# true when e's name is listed in special_elements under e's namespace
el_is_special = (e) -> special_elements[e.name] is e.namespace

# the three elements excluded by el_is_special_not_adp
adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
# like el_is_special, except that HTML address, div and p give false
el_is_special_not_adp = (el) ->
        return false if adp_els[el.name] is el.namespace
        return special_elements[el.name] is el.namespace
475
# Maps an all-lowercase SVG tag name to its properly cased form.
# Built from the list of proper names; each key is the lowercased name.
svg_name_fixes = do ->
        proper_names = [
                'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animateColor',
                'animateMotion', 'animateTransform', 'clipPath', 'feBlend',
                'feColorMatrix', 'feComponentTransfer', 'feComposite',
                'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap',
                'feDistantLight', 'feDropShadow', 'feFlood', 'feFuncA', 'feFuncB',
                'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage', 'feMerge',
                'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
                'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence',
                'foreignObject', 'glyphRef', 'linearGradient', 'radialGradient',
                'textPath'
        ]
        fixes = {}
        for proper in proper_names
                fixes[proper.toLowerCase()] = proper
        fixes
# Maps an all-lowercase SVG attribute name to its properly cased form.
# Built from the list of proper names; each key is the lowercased name.
svg_attribute_fixes = do ->
        proper_names = [
                'attributeName', 'attributeType', 'baseFrequency', 'baseProfile',
                'calcMode', 'clipPathUnits', 'contentScriptType',
                'contentStyleType', 'diffuseConstant', 'edgeMode',
                'externalResourcesRequired',
                # WHATWG removes this: filterres: 'filterRes'
                'filterUnits', 'glyphRef', 'gradientTransform', 'gradientUnits',
                'kernelMatrix', 'kernelUnitLength', 'keyPoints', 'keySplines',
                'keyTimes', 'lengthAdjust', 'limitingConeAngle', 'markerHeight',
                'markerUnits', 'markerWidth', 'maskContentUnits', 'maskUnits',
                'numOctaves', 'pathLength', 'patternContentUnits',
                'patternTransform', 'patternUnits', 'pointsAtX', 'pointsAtY',
                'pointsAtZ', 'preserveAlpha', 'preserveAspectRatio',
                'primitiveUnits', 'refX', 'refY', 'repeatCount', 'repeatDur',
                'requiredExtensions', 'requiredFeatures', 'specularConstant',
                'specularExponent', 'spreadMethod', 'startOffset', 'stdDeviation',
                'stitchTiles', 'surfaceScale', 'systemLanguage', 'tableValues',
                'targetX', 'targetY', 'textLength', 'viewBox', 'viewTarget',
                'xChannelSelector', 'yChannelSelector', 'zoomAndPan'
        ]
        fixes = {}
        for proper in proper_names
                fixes[proper.toLowerCase()] = proper
        fixes
# Maps a foreign attribute's qualified name to its adjusted name, in which the
# colon between prefix and local name is replaced by a space ("xmlns" has no
# colon and maps to itself).
foreign_attr_fixes = do ->
        qualified_names = [
                'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
                'xlink:show', 'xlink:title', 'xlink:type',
                'xml:base', 'xml:lang', 'xml:space',
                'xmlns', 'xmlns:xlink'
        ]
        fixes = {}
        for qname in qualified_names
                fixes[qname] = qname.replace ':', ' '
        fixes
# Rename (in place) any attribute called "definitionurl" in t.attrs_a to
# "definitionURL". t.attrs_a is a list of [name, value] pairs. Returns nothing.
adjust_mathml_attributes = (t) ->
        for attr in t.attrs_a when attr[0] is 'definitionurl'
                attr[0] = 'definitionURL'
        return
# Restore (in place) the proper mixed-case spelling of SVG attribute names in
# t.attrs_a, a list of [name, value] pairs, using svg_attribute_fixes.
# Returns nothing.
adjust_svg_attributes = (t) ->
        for a in t.attrs_a
                # Only the table's own entries count. A bare `table[name]?` test is
                # also true for inherited members such as "constructor", which
                # would replace the attribute name with a function.
                if Object::hasOwnProperty.call svg_attribute_fixes, a[0]
                        a[0] = svg_attribute_fixes[a[0]]
        return
# Rewrite (in place) the names of namespaced attributes (xlink:*, xml:*,
# xmlns, xmlns:xlink) in t.attrs_a, a list of [name, value] pairs, using
# foreign_attr_fixes. Returns nothing.
adjust_foreign_attributes = (t) ->
        # fixfull
        for a in t.attrs_a
                # Only the table's own entries count. A bare `table[name]?` test is
                # also true for inherited members such as "constructor", which
                # would replace the attribute name with a function.
                if Object::hasOwnProperty.call foreign_attr_fixes, a[0]
                        a[0] = foreign_attr_fixes[a[0]]
        return
609
# _decode_named_char_ref()
#
# The table of named character references is _huge_. In a browser we avoid
# shipping it by letting the browser do the decoding; when loaded as a module
# the decoder comes from ./html5-named-entities.coffee instead.
if context isnt 'module'
        # TODO test this in IE8
        decode_named_char_ref_el = document.createElement('textarea')
        # Write "&name;" into the textarea as HTML and read back its value.
        # If it comes back unchanged the name was not recognised: return null.
        _decode_named_char_ref = (txt) ->
                entity = "&#{txt};"
                decode_named_char_ref_el.innerHTML = entity
                decoded = decode_named_char_ref_el.value
                return if decoded is entity then null else decoded
else
        _decode_named_char_ref = require './html5-named-entities.coffee'
# Pass the name of a named entity _that has a terminating semicolon_
# Entities without terminating semicolons should use legacy_char_refs[]
# Do not include the "&" or ";" in your argument, eg pass "alpha"
#
# Results are memoised in decode_named_char_ref_cache, including the result
# for unrecognised names. The cache is consulted with an own-property test so
# that names matching inherited Object members (eg "constructor") are decoded
# like any other name instead of returning that inherited member.
decode_named_char_ref_cache = {}
decode_named_char_ref = (txt) ->
        if Object::hasOwnProperty.call decode_named_char_ref_cache, txt
                return decode_named_char_ref_cache[txt]
        decoded = _decode_named_char_ref txt
        return decode_named_char_ref_cache[txt] = decoded
635
636 parse_html = (args) ->
637         txt = null
638         cur = null # index of next char in txt to be parsed
639         # declare doc and tokenizer variables so they're in scope below
640         doc = null
641         open_els = null # stack of open elements
642         afe = null # active formatting elements
643         template_ins_modes = null
644         ins_mode = null
645         original_ins_mode = null
646         tok_state = null
647         tok_cur_tag = null # partially parsed tag
648         flag_scripting = null
649         flag_frameset_ok = null
650         flag_parsing = null
651         flag_foster_parenting = null
652         form_element_pointer = null
653         temporary_buffer = null
654         pending_table_character_tokens = null
655         head_element_pointer = null
656         flag_fragment_parsing = null
657         context_element = null
658
659         stop_parsing = ->
660                 flag_parsing = false
661                 return
662
663         parse_error = ->
664                 if args.error_cb?
665                         args.error_cb cur
666                 else
667                         console.log "Parse error at character #{cur} of #{txt.length}"
668                 return
669
670         # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
671         # "Noah's Ark clause" but with three
672         afe_push = (new_el) ->
673                 matches = 0
674                 for el, i in afe
675                         if el.type is TYPE_AFE_MARKER
676                                 break
677                         if el.name is new_el.name and el.namespace is new_el.namespace
678                                 attrs_match = true
679                                 for k, v of el.attrs
680                                         unless new_el.attrs[k] is v
681                                                 attrs_match = false
682                                                 break
683                                 if attrs_match
684                                         for k, v of new_el.attrs
685                                                 unless el.attrs[k] is v
686                                                         attrs_match = false
687                                                         break
688                                 if attrs_match
689                                         matches += 1
690                                         if matches is 3
691                                                 afe.splice i, 1
692                                                 break
693                 afe.unshift new_el
694                 return
695
696         afe_push_marker = ->
697                 afe.unshift new_afe_marker()
698                 return
699
700         # the functions below implement the Tree Construction algorithm
701         # http://www.w3.org/TR/html5/syntax.html#tree-construction
702
703         # But first... the helpers
704         template_tag_is_open = ->
705                 for el in open_els
706                         if el.name is 'template' and el.namespace is NS_HTML
707                                 return true
708                 return false
709         is_in_scope_x = (tag_name, scope, namespace) ->
710                 for el in open_els
711                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
712                                 return true
713                         if scope[el.name] is el.namespace
714                                 return false
715                 return false
716         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
717                 for el in open_els
718                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
719                                 return true
720                         if scope[el.name] is el.namespace
721                                 return false
722                         if scope2[el.name] is el.namespace
723                                 return false
724                 return false
725         standard_scopers = {
726                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
727                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
728                 template: NS_HTML,
729
730                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
731                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
732
733                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
734         }
735         button_scopers = button: NS_HTML
736         li_scopers = ol: NS_HTML, ul: NS_HTML
737         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
738         is_in_scope = (tag_name, namespace = null) ->
739                 return is_in_scope_x tag_name, standard_scopers, namespace
740         is_in_button_scope = (tag_name, namespace = null) ->
741                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
742         is_in_table_scope = (tag_name, namespace = null) ->
743                 return is_in_scope_x tag_name, table_scopers, namespace
744         # aka is_in_list_item_scope
745         is_in_li_scope = (tag_name, namespace = null) ->
746                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
747         is_in_select_scope = (tag_name, namespace = null) ->
748                 for t in open_els
749                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
750                                 return true
751                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
752                                 return false
753                 return false
754         # this checks for a particular element, not by name
755         # this requires a namespace match
756         el_is_in_scope = (needle) ->
757                 for el in open_els
758                         if el is needle
759                                 return true
760                         if standard_scopers[el.name] is el.namespace
761                                 return false
762                 return false
763
764         clear_to_table_stopers = {
765                 'table': true
766                 'template': true
767                 'html': true
768         }
769         clear_stack_to_table_context = ->
770                 loop
771                         if clear_to_table_stopers[open_els[0].name]?
772                                 break
773                         open_els.shift()
774                 return
775         clear_to_table_body_stopers = {
776                 tbody: NS_HTML
777                 tfoot: NS_HTML
778                 thead: NS_HTML
779                 template: NS_HTML
780                 html: NS_HTML
781         }
782         clear_stack_to_table_body_context = ->
783                 loop
784                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
785                                 break
786                         open_els.shift()
787                 return
788         clear_to_table_row_stopers = {
789                 'tr': true
790                 'template': true
791                 'html': true
792         }
793         clear_stack_to_table_row_context = ->
794                 loop
795                         if clear_to_table_row_stopers[open_els[0].name]?
796                                 break
797                         open_els.shift()
798                 return
799         clear_afe_to_marker = ->
800                 loop
801                         return unless afe.length > 0 # this happens in fragment case, ?spec error
802                         el = afe.shift()
803                         if el.type is TYPE_AFE_MARKER
804                                 return
805                 return
806
        # 8.2.3.1 ...
        # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
        #
        # Picks the insertion mode (assigned to ins_mode) from the current
        # contents of the stack of open elements. Takes no arguments and returns
        # nothing.
        #
        # Note on orientation: open_els[0] is what the spec calls the "last"
        # (current) node, and open_els[open_els.length - 1] is the "first" node.
        # So "the node before node in the stack" means the next higher index.
        #
        # The numbered comments below are the steps from the spec, in order.
        reset_ins_mode = ->
                # 1. Let last be false.
                last = false
                # 2. Let node be the last node in the stack of open elements.
                node_i = 0
                node = open_els[node_i]
                # 3. Loop: If node is the first node in the stack of open elements,
                # then set last to true, and, if the parser was originally created as
                # part of the HTML fragment parsing algorithm (fragment case) set node
                # to the context element.
                loop
                        if node_i is open_els.length - 1
                                last = true
                                if flag_fragment_parsing
                                        node = context_element
                        # 4. If node is a select element, run these substeps:
                        if node.name is 'select' and node.namespace is NS_HTML
                                # 1. If last is true, jump to the step below labeled done.
                                unless last
                                        # 2. Let ancestor be node.
                                        ancestor_i = node_i
                                        ancestor = node
                                        # 3. Loop: If ancestor is the first node in the stack of
                                        # open elements, jump to the step below labeled done.
                                        loop
                                                if ancestor_i is open_els.length - 1
                                                        break
                                                # 4. Let ancestor be the node before ancestor in the stack
                                                # of open elements.
                                                ancestor_i += 1
                                                ancestor = open_els[ancestor_i]
                                                # 5. If ancestor is a template node, jump to the step below
                                                # labeled done.
                                                if ancestor.name is 'template' and ancestor.namespace is NS_HTML
                                                        break
                                                # 6. If ancestor is a table node, switch the insertion mode
                                                # to "in select in table" and abort these steps.
                                                if ancestor.name is 'table' and ancestor.namespace is NS_HTML
                                                        ins_mode = ins_mode_in_select_in_table
                                                        return
                                                # 7. Jump back to the step labeled loop.
                                # 8. Done: Switch the insertion mode to "in select" and abort
                                # these steps.
                                ins_mode = ins_mode_in_select
                                return
                        # 5. If node is a td or th element and last is false, then switch
                        # the insertion mode to "in cell" and abort these steps.
                        if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
                                ins_mode = ins_mode_in_cell
                                return
                        # 6. If node is a tr element, then switch the insertion mode to "in
                        # row" and abort these steps.
                        if node.name is 'tr' and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_row
                                return
                        # 7. If node is a tbody, thead, or tfoot element, then switch the
                        # insertion mode to "in table body" and abort these steps.
                        if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_table_body
                                return
                        # 8. If node is a caption element, then switch the insertion mode
                        # to "in caption" and abort these steps.
                        if node.name is 'caption' and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_caption
                                return
                        # 9. If node is a colgroup element, then switch the insertion mode
                        # to "in column group" and abort these steps.
                        if node.name is 'colgroup' and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_column_group
                                return
                        # 10. If node is a table element, then switch the insertion mode to
                        # "in table" and abort these steps.
                        if node.name is 'table' and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_table
                                return
                        # 11. If node is a template element, then switch the insertion mode
                        # to the current template insertion mode and abort these steps.
                        # (the current template insertion mode is kept at index 0 of
                        # template_ins_modes)
                        if node.name is 'template' and node.namespace is NS_HTML
                                ins_mode = template_ins_modes[0]
                                return
                        # 12. If node is a head element and last is true, then switch the
                        # insertion mode to "in body" ("in body"! not "in head"!) and abort
                        # these steps. (fragment case)
                        if node.name is 'head' and node.namespace is NS_HTML and last
                                ins_mode = ins_mode_in_body
                                return
                        # 13. If node is a head element and last is false, then switch the
                        # insertion mode to "in head" and abort these steps.
                        if node.name is 'head' and node.namespace is NS_HTML and last is false
                                ins_mode = ins_mode_in_head
                                return
                        # 14. If node is a body element, then switch the insertion mode to
                        # "in body" and abort these steps.
                        if node.name is 'body' and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_body
                                return
                        # 15. If node is a frameset element, then switch the insertion mode
                        # to "in frameset" and abort these steps. (fragment case)
                        if node.name is 'frameset' and node.namespace is NS_HTML
                                ins_mode = ins_mode_in_frameset
                                return
                        # 16. If node is an html element, run these substeps:
                        if node.name is 'html' and node.namespace is NS_HTML
                                # 1. If the head element pointer is null, switch the insertion
                                # mode to "before head" and abort these steps. (fragment case)
                                if head_element_pointer is null
                                        ins_mode = ins_mode_before_head
                                else
                                        # 2. Otherwise, the head element pointer is not null,
                                        # switch the insertion mode to "after head" and abort these
                                        # steps.
                                        ins_mode = ins_mode_after_head
                                return
                        # 17. If last is true, then switch the insertion mode to "in body"
                        # and abort these steps. (fragment case)
                        if last
                                ins_mode = ins_mode_in_body
                                return
                        # 18. Let node now be the node before node in the stack of open
                        # elements.
                        node_i += 1
                        node = open_els[node_i]
                        # 19. Return to the step labeled loop.
                return
933
934         # 8.2.3.2
935
936         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
937         adjusted_current_node = ->
938                 if open_els.length is 1 and flag_fragment_parsing
939                         return context_element
940                 return open_els[0]
941
942         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
943         # this implementation is structured (mostly) as described at the link above.
944         # capitalized comments are the "labels" described at the link above.
945         reconstruct_afe = ->
946                 return if afe.length is 0
947                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
948                         return
949                 # Rewind
950                 i = 0
951                 loop
952                         if i is afe.length - 1
953                                 break
954                         i += 1
955                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
956                                 i -= 1 # Advance
957                                 break
958                 # Create
959                 loop
960                         el = insert_html_element afe[i].token
961                         afe[i] = el
962                         break if i is 0
963                         i -= 1 # Advance
964                 return
965
        # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
        # adoption agency algorithm
        # overview here:
        #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
        #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
        #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
        #
        # subject: the tag name the algorithm is being run for (it is compared
        # against element names below).
        # Returns nothing; works by mutating open_els, afe and the
        # parent/children links of the affected elements.
        #
        # Orientation reminder: index 0 of open_els is the current node, so
        # "above"/"before" in the spec text means a higher index here, and
        # "below" means a lower index. afe is searched from index 0 as well.
        adoption_agency = (subject) ->
# this block implements the W3C spec (kept for reference, disabled in favor of
# the WHATWG version below)
#               # 1. If the current node is an HTML element whose tag name is subject,
#               # then run these substeps:
#               #
#               # 1. Let element be the current node.
#               #
#               # 2. Pop element off the stack of open elements.
#               #
#               # 3. If element is also in the list of active formatting elements,
#               # remove the element from the list.
#               #
#               # 4. Abort the adoption agency algorithm.
#               if open_els[0].name is subject and open_els[0].namespace is NS_HTML
#                       el = open_els.shift()
#                       # remove it from the list of active formatting elements (if found)
#                       for t, i in afe
#                               if t is el
#                                       afe.splice i, 1
#                                       break
#                       return
# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
                # If the current node is an HTML element whose tag name is subject, and
                # the current node is not in the list of active formatting elements,
                # then pop the current node off the stack of open elements, and abort
                # these steps.
                if open_els[0].name is subject and open_els[0].namespace is NS_HTML
                        # check whether the current node is in the list of active
                        # formatting elements (nothing is removed from the list here)
                        in_afe = false
                        for el, i in afe
                                if el is open_els[0]
                                        in_afe = true
                                        break
                        unless in_afe
                                open_els.shift()
                                return
                        # fall through
# END WHATWG
                # outer loop: runs at most 8 times
                outer = 0
                loop
                        if outer >= 8
                                return
                        outer += 1
                        # 5. Let formatting element be the last element in the list of
                        # active formatting elements that: is between the end of the list
                        # and the last scope marker in the list, if any, or the start of
                        # the list otherwise, and  has the tag name subject.
                        fe = null
                        # fe_of_afe is left holding the index of fe within afe
                        for t, fe_of_afe in afe
                                if t.type is TYPE_AFE_MARKER
                                        break
                                if t.name is subject
                                        fe = t
                                        break
                        # If there is no such element, then abort these steps and instead
                        # act as described in the "any other end tag" entry above.
                        if fe is null
                                in_body_any_other_end_tag subject
                                return
                        # 6. If formatting element is not in the stack of open elements,
                        # then this is a parse error; remove the element from the list, and
                        # abort these steps.
                        in_open_els = false
                        # fe_of_open_els is left holding the index of fe within open_els
                        for t, fe_of_open_els in open_els
                                if t is fe
                                        in_open_els = true
                                        break
                        unless in_open_els
                                parse_error()
                                # "remove it from the list" must mean afe, since it's not in open_els
                                afe.splice fe_of_afe, 1
                                return
                        # 7. If formatting element is in the stack of open elements, but
                        # the element is not in scope, then this is a parse error; abort
                        # these steps.
                        unless el_is_in_scope fe
                                parse_error()
                                return
                        # 8. If formatting element is not the current node, this is a parse
                        # error. (But do not abort these steps.)
                        unless open_els[0] is fe
                                parse_error()
                                # continue
                        # 9. Let furthest block be the topmost node in the stack of open
                        # elements that is lower in the stack than formatting element, and
                        # is an element in the special category. There might not be one.
                        fb = null
                        fb_of_open_els = null
                        for t, i in open_els
                                if t is fe
                                        break
                                if el_is_special t
                                        fb = t
                                        fb_of_open_els = i
                                        # and continue, to see if there's one that's more "topmost"
                        # 10. If there is no furthest block, then the UA must first pop all
                        # the nodes from the bottom of the stack of open elements, from the
                        # current node up to and including formatting element, then remove
                        # formatting element from the list of active formatting elements,
                        # and finally abort these steps.
                        if fb is null
                                loop
                                        t = open_els.shift()
                                        if t is fe
                                                afe.splice fe_of_afe, 1
                                                return
                        # 11. Let common ancestor be the element immediately above
                        # formatting element in the stack of open elements.
                        ca = open_els[fe_of_open_els + 1] # common ancestor

                        node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
                        # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
                        bookmark = new_aaa_bookmark()
                        # the bookmark is inserted at fe's index, which pushes fe one
                        # slot further along
                        for t, i in afe
                                if t is fe
                                        afe.splice i, 0, bookmark
                                        break
                        # (the numbered comments inside the loop below are the substeps
                        # of the spec's inner loop)
                        node = last_node = fb
                        inner = 0
                        loop
                                inner += 1
                                # 3. Let node be the element immediately above node in the
                                # stack of open elements, or if node is no longer in the stack
                                # of open elements (e.g. because it got removed by this
                                # algorithm), the element that was immediately above node in
                                # the stack of open elements before node was removed.
                                node_next = null
                                for t, i in open_els
                                        if t is node
                                                node_next = open_els[i + 1]
                                                break
                                node = node_next ? node_above
                                # TODO make sure node_above gets re-set if/when node is removed from open_els

                                # 4. If node is formatting element, then go to the next step in
                                # the overall algorithm.
                                if node is fe
                                        break
                                # 5. If inner loop counter is greater than three and node is in
                                # the list of active formatting elements, then remove node from
                                # the list of active formatting elements.
                                # (node_in_afe stays false when node was just removed here, so
                                # step 6 below then also removes it from open_els)
                                node_in_afe = false
                                for t, i in afe
                                        if t is node
                                                if inner > 3
                                                        afe.splice i, 1
                                                else
                                                        node_in_afe = true
                                                break
                                # 6. If node is not in the list of active formatting elements,
                                # then remove node from the stack of open elements and then go
                                # back to the step labeled inner loop.
                                unless node_in_afe
                                        for t, i in open_els
                                                if t is node
                                                        # remember what was above node before removing it
                                                        node_above = open_els[i + 1]
                                                        open_els.splice i, 1
                                                        break
                                        continue
                                # 7. create an element for the token for which the element node
                                # was created, in the HTML namespace, with common ancestor as
                                # the intended parent; replace the entry for node in the list
                                # of active formatting elements with an entry for the new
                                # element, replace the entry for node in the stack of open
                                # elements with an entry for the new element, and let node be
                                # the new element.
                                new_node = token_to_element node.token, NS_HTML, ca
                                for t, i in afe
                                        if t is node
                                                afe[i] = new_node
                                                break
                                for t, i in open_els
                                        if t is node
                                                node_above = open_els[i + 1]
                                                open_els[i] = new_node
                                                break
                                node = new_node
                                # 8. If last node is furthest block, then move the
                                # aforementioned bookmark to be immediately after the new node
                                # in the list of active formatting elements.
                                if last_node is fb
                                        # take the bookmark out of its old position...
                                        for t, i in afe
                                                if t is bookmark
                                                        afe.splice i, 1
                                                        break
                                        # ...and put it back next to node
                                        for t, i in afe
                                                if t is node
                                                        # "after" means lower
                                                        # inserting at i puts the bookmark at the index just below node's
                                                        afe.splice i, 0, bookmark
                                                        break
                                # 9. Insert last node into node, first removing it from its
                                # previous parent node if any.
                                if last_node.parent?
                                        for c, i in last_node.parent.children
                                                if c is last_node
                                                        last_node.parent.children.splice i, 1
                                                        break
                                node.children.push last_node
                                last_node.parent = node
                                # 10. Let last node be node.
                                last_node = node
                                # 11. Return to the step labeled inner loop.
                        # 14. Insert whatever last node ended up being in the previous step
                        # at the appropriate place for inserting a node, but using common
                        # ancestor as the override target.

                        # In the case where fe is immediately followed by fb:
                        #   * inner loop exits out early (node==fe)
                        #   * last_node is fb
                        #   * last_node is still in the tree (not a duplicate)
                        if last_node.parent?
                                for c, i in last_node.parent.children
                                        if c is last_node
                                                last_node.parent.children.splice i, 1
                                                break
                        # can't use standard insert token thing, because it's already in
                        # open_els and must stay at its current position in open_els
                        # (dest is used as a pair: dest[0] is the parent to insert into,
                        # dest[1] the index within its children)
                        dest = adjusted_insertion_location ca
                        dest[0].children.splice dest[1], 0, last_node
                        last_node.parent = dest[0]
                        # 15. Create an element for the token for which formatting element
                        # was created, in the HTML namespace, with furthest block as the
                        # intended parent.
                        new_element = token_to_element fe.token, NS_HTML, fb
                        # 16. Take all of the child nodes of furthest block and append them
                        # to the element created in the last step.
                        while fb.children.length
                                t = fb.children.shift()
                                t.parent = new_element
                                new_element.children.push t
                        # 17. Append that new element to furthest block.
                        new_element.parent = fb
                        fb.children.push new_element
                        # 18. Remove formatting element from the list of active formatting
                        # elements, and insert the new element into the list of active
                        # formatting elements at the position of the aforementioned
                        # bookmark.
                        for t, i in afe
                                if t is fe
                                        afe.splice i, 1
                                        break
                        # the bookmark's slot is overwritten, which also removes the
                        # bookmark from afe
                        for t, i in afe
                                if t is bookmark
                                        afe[i] = new_element
                                        break
                        # 19. Remove formatting element from the stack of open elements,
                        # and insert the new element into the stack of open elements
                        # immediately below the position of furthest block in that stack.
                        for t, i in open_els
                                if t is fe
                                        open_els.splice i, 1
                                        break
                        for t, i in open_els
                                if t is fb
                                        # inserting at fb's index places new_element one index
                                        # below fb (fb moves up by one)
                                        open_els.splice i, 0, new_element
                                        break
                        # 20. Jump back to the step labeled outer loop.
                return
1230
1231         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1232         close_p_element = ->
1233                 generate_implied_end_tags 'p' # arg is exception
1234                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1235                         parse_error()
1236                 while open_els.length > 1 # just in case
1237                         el = open_els.shift()
1238                         if el.name is 'p' and el.namespace is NS_HTML
1239                                 return
1240                 return
1241         close_p_if_in_button_scope = ->
1242                 if is_in_button_scope 'p', NS_HTML
1243                         close_p_element()
1244                 return
1245
1246         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1247         # aka insert_a_character = (t) ->
1248         insert_character = (t) ->
1249                 dest = adjusted_insertion_location()
1250                 # fixfull check for Document node
1251                 if dest[1] > 0
1252                         prev = dest[0].children[dest[1] - 1]
1253                         if prev.type is TYPE_TEXT
1254                                 prev.text += t.text
1255                                 return
1256                 dest[0].children.splice dest[1], 0, t
1257                 return
1258
1259         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1260         process_token = (t) ->
1261                 acn = adjusted_current_node()
1262                 unless acn?
1263                         ins_mode t
1264                         return
1265                 if acn.namespace is NS_HTML
1266                         ins_mode t
1267                         return
1268                 if is_mathml_text_integration_point(acn)
1269                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1270                                 ins_mode t
1271                                 return
1272                         if t.type is TYPE_TEXT
1273                                 ins_mode t
1274                                 return
1275                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1276                         ins_mode t
1277                         return
1278                 if is_html_integration acn
1279                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1280                                 ins_mode t
1281                                 return
1282                 if t.type is TYPE_EOF
1283                         ins_mode t
1284                         return
1285                 in_foreign_content t
1286                 return
1287
1288         # 8.2.5.1
1289         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1290         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1291         adjusted_insertion_location = (override_target = null) ->
1292                 # 1. If there was an override target specified, then let target be the
1293                 # override target.
1294                 if override_target?
1295                         target = override_target
1296                 else # Otherwise, let target be the current node.
1297                         target = open_els[0]
1298                 # 2. Determine the adjusted insertion location using the first matching
1299                 # steps from the following list:
1300                 #
1301                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1302                 # thead, or tr element Foster parenting happens when content is
1303                 # misnested in tables.
1304                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1305                         loop # once. this is here so we can ``break`` to "abort these substeps"
1306                                 # 1. Let last template be the last template element in the
1307                                 # stack of open elements, if any.
1308                                 last_template = null
1309                                 last_template_i = null
1310                                 for el, i in open_els
1311                                         if el.name is 'template' and el.namespace is NS_HTML
1312                                                 last_template = el
1313                                                 last_template_i = i
1314                                                 break
1315                                 # 2. Let last table be the last table element in the stack of
1316                                 # open elements, if any.
1317                                 last_table = null
1318                                 last_table_i
1319                                 for el, i in open_els
1320                                         if el.name is 'table' and el.namespace is NS_HTML
1321                                                 last_table = el
1322                                                 last_table_i = i
1323                                                 break
1324                                 # 3. If there is a last template and either there is no last
1325                                 # table, or there is one, but last template is lower (more
1326                                 # recently added) than last table in the stack of open
1327                                 # elements, then: let adjusted insertion location be inside
1328                                 # last template's template contents, after its last child (if
1329                                 # any), and abort these substeps.
1330                                 if last_template and (last_table is null or last_template_i < last_table_i)
1331                                         target = last_template # fixfull should be it's contents
1332                                         target_i = target.children.length
1333                                         break
1334                                 # 4. If there is no last table, then let adjusted insertion
1335                                 # location be inside the first element in the stack of open
1336                                 # elements (the html element), after its last child (if any),
1337                                 # and abort these substeps. (fragment case)
1338                                 if last_table is null
1339                                         # this is odd
1340                                         target = open_els[open_els.length - 1]
1341                                         target_i = target.children.length
1342                                         break
1343                                 # 5. If last table has a parent element, then let adjusted
1344                                 # insertion location be inside last table's parent element,
1345                                 # immediately before last table, and abort these substeps.
1346                                 if last_table.parent?
1347                                         for c, i in last_table.parent.children
1348                                                 if c is last_table
1349                                                         target = last_table.parent
1350                                                         target_i = i
1351                                                         break
1352                                         break
1353                                 # 6. Let previous element be the element immediately above last
1354                                 # table in the stack of open elements.
1355                                 #
1356                                 # huh? how could it not have a parent?
1357                                 previous_element = open_els[last_table_i + 1]
1358                                 # 7. Let adjusted insertion location be inside previous
1359                                 # element, after its last child (if any).
1360                                 target = previous_element
1361                                 target_i = target.children.length
1362                                 # Note: These steps are involved in part because it's possible
1363                                 # for elements, the table element in this case in particular,
1364                                 # to have been moved by a script around in the DOM, or indeed
1365                                 # removed from the DOM entirely, after the element was inserted
1366                                 # by the parser.
1367                                 break # don't really loop
1368                 else
1369                         # Otherwise Let adjusted insertion location be inside target, after
1370                         # its last child (if any).
1371                         target_i = target.children.length
1372
1373                 # 3. If the adjusted insertion location is inside a template element,
1374                 # let it instead be inside the template element's template contents,
1375                 # after its last child (if any).
1376                 # fixfull (template)
1377
1378                 # 4. Return the adjusted insertion location.
1379                 return [target, target_i]
1380
1381         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1382         # aka create_an_element_for_token
1383         token_to_element = (t, namespace, intended_parent) ->
1384                 # convert attributes into a hash
1385                 attrs = {}
1386                 for a in t.attrs_a
1387                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1388                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1389
1390                 # TODO 2. If the newly created element has an xmlns attribute in the
1391                 # XMLNS namespace whose value is not exactly the same as the element's
1392                 # namespace, that is a parse error. Similarly, if the newly created
1393                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1394                 # value is not the XLink Namespace, that is a parse error.
1395
1396                 # fixfull: the spec says stuff about form pointers and ownerDocument
1397
1398                 return el
1399
1400         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1401         insert_foreign_element = (token, namespace) ->
1402                 ail = adjusted_insertion_location()
1403                 ail_el = ail[0]
1404                 ail_i = ail[1]
1405                 el = token_to_element token, namespace, ail_el
1406                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1407                 el.parent = ail_el
1408                 ail_el.children.splice ail_i, 0, el
1409                 open_els.unshift el
1410                 return el
1411         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1412         insert_html_element = (token) ->
1413                 return insert_foreign_element token, NS_HTML
1414
1415         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1416         # position should be [node, index_within_children]
1417         insert_comment = (t, position = null) ->
1418                 position ?= adjusted_insertion_location()
1419                 position[0].children.splice position[1], 0, t
1420                 return
1421
1422         # 8.2.5.2
1423         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1424         parse_generic_raw_text = (t) ->
1425                 insert_html_element t
1426                 tok_state = tok_state_rawtext
1427                 original_ins_mode = ins_mode
1428                 ins_mode = ins_mode_text
1429                 return
1430         parse_generic_rcdata_text = (t) ->
1431                 insert_html_element t
1432                 tok_state = tok_state_rcdata
1433                 original_ins_mode = ins_mode
1434                 ins_mode = ins_mode_text
1435                 return
1436
1437         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1438         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1439         generate_implied_end_tags = (except = null) ->
1440                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1441                         open_els.shift()
1442                 return
1443
1444         # 8.2.5.4 The rules for parsing tokens in HTML content
1445         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1446
1447         # 8.2.5.4.1 The "initial" insertion mode
1448         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1449         is_quirks_yes_doctype = (t) ->
1450                 if t.flag 'force-quirks'
1451                         return true
1452                 if t.name isnt 'html'
1453                         return true
1454                 if t.public_identifier?
1455                         pi = t.public_identifier.toLowerCase()
1456                         for p in quirks_yes_pi_prefixes
1457                                 if pi.substr(0, p.length) is p
1458                                         return true
1459                         if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1460                                 return true
1461                 if t.system_identifier?
1462                         if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1463                                 return true
1464                 else if t.public_identifier?
1465                         # already did this: pi = t.public_identifier.toLowerCase()
1466                         if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1467                                 return true
1468                 return false
1469         is_quirks_limited_doctype = (t) ->
1470                 if t.public_identifier?
1471                         pi = t.public_identifier.toLowerCase()
1472                         if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1473                                 return true
1474                         if t.system_identifier?
1475                                 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1476                                         return true
1477                 return false
1478         ins_mode_initial = (t) ->
1479                 if is_space_tok t
1480                         return
1481                 if t.type is TYPE_COMMENT
1482                         # ?fixfull
1483                         doc.children.push t
1484                         return
1485                 if t.type is TYPE_DOCTYPE
1486                         # fixfull syntax error from first paragraph and following bullets
1487                         # fixfull set doc.doctype
1488                         # fixfull is the "not an iframe srcdoc" thing relevant?
1489                         if is_quirks_yes_doctype t
1490                                 doc.flag 'quirks mode', QUIRKS_YES
1491                         else if is_quirks_limited_doctype t
1492                                 doc.flag 'quirks mode', QUIRKS_LIMITED
1493                         doc.children.push t
1494                         ins_mode = ins_mode_before_html
1495                         return
1496                 # Anything else
1497                 # fixfull not iframe srcdoc?
1498                 parse_error()
1499                 doc.flag 'quirks mode', QUIRKS_YES
1500                 ins_mode = ins_mode_before_html
1501                 process_token t
1502                 return
1503
1504         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1505         ins_mode_before_html = (t) ->
1506                 if t.type is TYPE_DOCTYPE
1507                         parse_error()
1508                         return
1509                 if t.type is TYPE_COMMENT
1510                         doc.children.push t
1511                         return
1512                 if is_space_tok t
1513                         return
1514                 if t.type is TYPE_START_TAG and t.name is 'html'
1515                         el = token_to_element t, NS_HTML, doc
1516                         doc.children.push el
1517                         el.document = doc
1518                         open_els.unshift(el)
1519                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1520                         ins_mode = ins_mode_before_head
1521                         return
1522                 if t.type is TYPE_END_TAG
1523                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1524                                 # fall through to "anything else"
1525                         else
1526                                 parse_error()
1527                                 return
1528                 # Anything else
1529                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1530                 doc.children.push el
1531                 el.document = doc
1532                 open_els.unshift el
1533                 # ?fixfull browsing context
1534                 ins_mode = ins_mode_before_head
1535                 process_token t
1536                 return
1537
1538         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1539         ins_mode_before_head = (t) ->
1540                 if is_space_tok t
1541                         return
1542                 if t.type is TYPE_COMMENT
1543                         insert_comment t
1544                         return
1545                 if t.type is TYPE_DOCTYPE
1546                         parse_error()
1547                         return
1548                 if t.type is TYPE_START_TAG and t.name is 'html'
1549                         ins_mode_in_body t
1550                         return
1551                 if t.type is TYPE_START_TAG and t.name is 'head'
1552                         el = insert_html_element t
1553                         head_element_pointer = el
1554                         ins_mode = ins_mode_in_head
1555                         return
1556                 if t.type is TYPE_END_TAG
1557                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1558                                 # fall through to Anything else below
1559                         else
1560                                 parse_error()
1561                                 return
1562                 # Anything else
1563                 el = insert_html_element new_open_tag 'head'
1564                 head_element_pointer = el
1565                 ins_mode = ins_mode_in_head
1566                 process_token t
1567                 return
1568
1569         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1570         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1571                 open_els.shift() # spec says this will be a 'head' node
1572                 ins_mode = ins_mode_after_head
1573                 process_token t
1574                 return
1575         ins_mode_in_head = (t) ->
1576                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1577                         insert_character t
1578                         return
1579                 if t.type is TYPE_COMMENT
1580                         insert_comment t
1581                         return
1582                 if t.type is TYPE_DOCTYPE
1583                         parse_error()
1584                         return
1585                 if t.type is TYPE_START_TAG and t.name is 'html'
1586                         ins_mode_in_body t
1587                         return
1588                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1589                         el = insert_html_element t
1590                         open_els.shift()
1591                         t.acknowledge_self_closing()
1592                         return
1593                 if t.type is TYPE_START_TAG and t.name is 'meta'
1594                         el = insert_html_element t
1595                         open_els.shift()
1596                         t.acknowledge_self_closing()
1597                         # fixfull encoding stuff
1598                         return
1599                 if t.type is TYPE_START_TAG and t.name is 'title'
1600                         parse_generic_rcdata_text t
1601                         return
1602                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1603                         parse_generic_raw_text t
1604                         return
1605                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1606                         insert_html_element t
1607                         ins_mode = ins_mode_in_head_noscript
1608                         return
1609                 if t.type is TYPE_START_TAG and t.name is 'script'
1610                         ail = adjusted_insertion_location()
1611                         el = token_to_element t, NS_HTML, ail
1612                         el.flag 'parser-inserted', true
1613                         # fixfull frament case
1614                         ail[0].children.splice ail[1], 0, el
1615                         open_els.unshift el
1616                         tok_state = tok_state_script_data
1617                         original_ins_mode = ins_mode # make sure orig... is defined
1618                         ins_mode = ins_mode_text
1619                         return
1620                 if t.type is TYPE_END_TAG and t.name is 'head'
1621                         open_els.shift() # will be a head element... spec says so
1622                         ins_mode = ins_mode_after_head
1623                         return
1624                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1625                         ins_mode_in_head_else t
1626                         return
1627                 if t.type is TYPE_START_TAG and t.name is 'template'
1628                         insert_html_element t
1629                         afe_push_marker()
1630                         flag_frameset_ok = false
1631                         ins_mode = ins_mode_in_template
1632                         template_ins_modes.unshift ins_mode_in_template
1633                         return
1634                 if t.type is TYPE_END_TAG and t.name is 'template'
1635                         if template_tag_is_open()
1636                                 generate_implied_end_tags
1637                                 if open_els[0].name isnt 'template'
1638                                         parse_error()
1639                                 loop
1640                                         el = open_els.shift()
1641                                         if el.name is 'template' and el.namespace is NS_HTML
1642                                                 break
1643                                 clear_afe_to_marker()
1644                                 template_ins_modes.shift()
1645                                 reset_ins_mode()
1646                         else
1647                                 parse_error()
1648                         return
1649                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1650                         parse_error()
1651                         return
1652                 ins_mode_in_head_else t
1653                 return
1654
1655         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1656         ins_mode_in_head_noscript_else = (t) ->
1657                 parse_error()
1658                 open_els.shift()
1659                 ins_mode = ins_mode_in_head
1660                 process_token t
1661                 return
1662         ins_mode_in_head_noscript = (t) ->
1663                 if t.type is TYPE_DOCTYPE
1664                         parse_error()
1665                         return
1666                 if t.type is TYPE_START_TAG and t.name is 'html'
1667                         ins_mode_in_body t
1668                         return
1669                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1670                         open_els.shift()
1671                         ins_mode = ins_mode_in_head
1672                         return
1673                 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1674                         ins_mode_in_head t
1675                         return
1676                 if t.type is TYPE_END_TAG and t.name is 'br'
1677                         ins_mode_in_head_noscript_else t
1678                         return
1679                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1680                         parse_error()
1681                         return
1682                 # Anything else
1683                 ins_mode_in_head_noscript_else t
1684                 return
1685
1686         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1687         ins_mode_after_head_else = (t) ->
1688                 body_tok = new_open_tag 'body'
1689                 insert_html_element body_tok
1690                 ins_mode = ins_mode_in_body
1691                 process_token t
1692                 return
1693         ins_mode_after_head = (t) ->
1694                 if is_space_tok t
1695                         insert_character t
1696                         return
1697                 if t.type is TYPE_COMMENT
1698                         insert_comment t
1699                         return
1700                 if t.type is TYPE_DOCTYPE
1701                         parse_error()
1702                         return
1703                 if t.type is TYPE_START_TAG and t.name is 'html'
1704                         ins_mode_in_body t
1705                         return
1706                 if t.type is TYPE_START_TAG and t.name is 'body'
1707                         insert_html_element t
1708                         flag_frameset_ok = false
1709                         ins_mode = ins_mode_in_body
1710                         return
1711                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1712                         insert_html_element t
1713                         ins_mode = ins_mode_in_frameset
1714                         return
1715                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1716                         parse_error()
1717                         open_els.unshift head_element_pointer
1718                         ins_mode_in_head t
1719                         for el, i in open_els
1720                                 if el is head_element_pointer
1721                                         open_els.splice i, 1
1722                                         return
1723                         return
1724                 if t.type is TYPE_END_TAG and t.name is 'template'
1725                         ins_mode_in_head t
1726                         return
1727                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1728                         ins_mode_after_head_else t
1729                         return
1730                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1731                         parse_error()
1732                         return
1733                 # Anything else
1734                 ins_mode_after_head_else t
1735                 return
1736
1737         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1738         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1739                 node = open_els[0]
1740                 loop
1741                         if node.name is name and node.namespace is NS_HTML
1742                                 generate_implied_end_tags name # arg is exception
1743                                 unless node is open_els[0]
1744                                         parse_error()
1745                                 loop
1746                                         el = open_els.shift()
1747                                         if el is node
1748                                                 return
1749                         if special_elements[node.name] is node.namespace
1750                                 parse_error()
1751                                 return
1752                         for el, i in open_els
1753                                 if node is el
1754                                         node = open_els[i + 1]
1755                                         break
1756                 return
1757         ins_mode_in_body = (t) ->
1758                 if t.type is TYPE_TEXT and t.text is "\u0000"
1759                         parse_error()
1760                         return
1761                 if is_space_tok t
1762                         reconstruct_afe()
1763                         insert_character t
1764                         return
1765                 if t.type is TYPE_TEXT
1766                         reconstruct_afe()
1767                         insert_character t
1768                         flag_frameset_ok = false
1769                         return
1770                 if t.type is TYPE_COMMENT
1771                         insert_comment t
1772                         return
1773                 if t.type is TYPE_DOCTYPE
1774                         parse_error()
1775                         return
1776                 if t.type is TYPE_START_TAG and t.name is 'html'
1777                         parse_error()
1778                         return if template_tag_is_open()
1779                         root_attrs = open_els[open_els.length - 1].attrs
1780                         for a in t.attrs_a
1781                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1782                         return
1783
1784                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1785                         ins_mode_in_head t
1786                         return
1787                 if t.type is TYPE_START_TAG and t.name is 'body'
1788                         parse_error()
1789                         return if open_els.length < 2
1790                         second = open_els[open_els.length - 2]
1791                         return unless second.namespace is NS_HTML
1792                         return unless second.name is 'body'
1793                         return if template_tag_is_open()
1794                         flag_frameset_ok = false
1795                         for a in t.attrs_a
1796                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1797                         return
1798                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1799                         parse_error()
1800                         return if open_els.length < 2
1801                         second_i = open_els.length - 2
1802                         second = open_els[second_i]
1803                         return unless second.namespace is NS_HTML
1804                         return unless second.name is 'body'
1805                         if flag_frameset_ok is false
1806                                 return
1807                         if second.parent?
1808                                 for el, i in second.parent.children
1809                                         if el is second
1810                                                 second.parent.children.splice i, 1
1811                                                 break
1812                         open_els.splice second_i, 1
1813                         # pop everything except the "root html element"
1814                         while open_els.length > 1
1815                                 open_els.shift()
1816                         insert_html_element t
1817                         ins_mode = ins_mode_in_frameset
1818                         return
1819                 if t.type is TYPE_EOF
1820                         ok_tags = {
1821                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1822                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1823                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1824                         }
1825                         for el in open_els
1826                                 unless ok_tags[t.name] is el.namespace
1827                                         parse_error()
1828                                         break
1829                         if template_ins_modes.length > 0
1830                                 ins_mode_in_template t
1831                         else
1832                                 stop_parsing()
1833                         return
1834                 if t.type is TYPE_END_TAG and t.name is 'body'
1835                         unless is_in_scope 'body', NS_HTML
1836                                 parse_error()
1837                                 return
1838                         ok_tags = {
1839                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1840                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1841                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1842                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1843                                 html:NS_HTML
1844                         }
1845                         for el in open_els
1846                                 unless ok_tags[t.name] is el.namespace
1847                                         parse_error()
1848                                         break
1849                         ins_mode = ins_mode_after_body
1850                         return
1851                 if t.type is TYPE_END_TAG and t.name is 'html'
1852                         unless is_in_scope 'body', NS_HTML
1853                                 parse_error()
1854                                 return
1855                         ok_tags = {
1856                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1857                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1858                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1859                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1860                                 html:NS_HTML
1861                         }
1862                         for el in open_els
1863                                 unless ok_tags[t.name] is el.namespace
1864                                         parse_error()
1865                                         break
1866                         ins_mode = ins_mode_after_body
1867                         process_token t
1868                         return
1869                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1870                         close_p_if_in_button_scope()
1871                         insert_html_element t
1872                         return
1873                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1874                         close_p_if_in_button_scope()
1875                         if h_tags[open_els[0].name] is open_els[0].namespace
1876                                 parse_error()
1877                                 open_els.shift()
1878                         insert_html_element t
1879                         return
1880                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1881                         close_p_if_in_button_scope()
1882                         insert_html_element t
1883                         eat_next_token_if_newline()
1884                         flag_frameset_ok = false
1885                         return
1886                 if t.type is TYPE_START_TAG and t.name is 'form'
1887                         unless form_element_pointer is null or template_tag_is_open()
1888                                 parse_error()
1889                                 return
1890                         close_p_if_in_button_scope()
1891                         el = insert_html_element t
1892                         unless template_tag_is_open()
1893                                 form_element_pointer = el
1894                         return
1895                 if t.type is TYPE_START_TAG and t.name is 'li'
1896                         flag_frameset_ok = false
1897                         for node in open_els
1898                                 if node.name is 'li' and node.namespace is NS_HTML
1899                                         generate_implied_end_tags 'li' # arg is exception
1900                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1901                                                 parse_error()
1902                                         loop
1903                                                 el = open_els.shift()
1904                                                 if el.name is 'li' and el.namespace is NS_HTML
1905                                                         break
1906                                         break
1907                                 if el_is_special_not_adp node
1908                                                 break
1909                         close_p_if_in_button_scope()
1910                         insert_html_element t
1911                         return
1912                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1913                         flag_frameset_ok = false
1914                         for node in open_els
1915                                 if node.name is 'dd' and node.namespace is NS_HTML
1916                                         generate_implied_end_tags 'dd' # arg is exception
1917                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1918                                                 parse_error()
1919                                         loop
1920                                                 el = open_els.shift()
1921                                                 if el.name is 'dd' and el.namespace is NS_HTML
1922                                                         break
1923                                         break
1924                                 if node.name is 'dt' and node.namespace is NS_HTML
1925                                         generate_implied_end_tags 'dt' # arg is exception
1926                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1927                                                 parse_error()
1928                                         loop
1929                                                 el = open_els.shift()
1930                                                 if el.name is 'dt' and el.namespace is NS_HTML
1931                                                         break
1932                                         break
1933                                 if el_is_special_not_adp node
1934                                         break
1935                         close_p_if_in_button_scope()
1936                         insert_html_element t
1937                         return
1938                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1939                         close_p_if_in_button_scope()
1940                         insert_html_element t
1941                         tok_state = tok_state_plaintext
1942                         return
1943                 if t.type is TYPE_START_TAG and t.name is 'button'
1944                         if is_in_scope 'button', NS_HTML
1945                                 parse_error()
1946                                 generate_implied_end_tags()
1947                                 loop
1948                                         el = open_els.shift()
1949                                         if el.name is 'button' and el.namespace is NS_HTML
1950                                                 break
1951                         reconstruct_afe()
1952                         insert_html_element t
1953                         flag_frameset_ok = false
1954                         return
1955                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1956                         unless is_in_scope t.name, NS_HTML
1957                                 parse_error()
1958                                 return
1959                         generate_implied_end_tags()
1960                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1961                                 parse_error()
1962                         loop
1963                                 el = open_els.shift()
1964                                 if el.name is t.name and el.namespace is NS_HTML
1965                                         return
1966                         return
1967                 if t.type is TYPE_END_TAG and t.name is 'form'
1968                         unless template_tag_is_open()
1969                                 node = form_element_pointer
1970                                 form_element_pointer = null
1971                                 if node is null or not el_is_in_scope node
1972                                         parse_error()
1973                                         return
1974                                 generate_implied_end_tags()
1975                                 if open_els[0] isnt node
1976                                         parse_error()
1977                                 for el, i in open_els
1978                                         if el is node
1979                                                 open_els.splice i, 1
1980                                                 break
1981                         else
1982                                 unless is_in_scope 'form', NS_HTML
1983                                         parse_error()
1984                                         return
1985                                 generate_implied_end_tags()
1986                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1987                                         parse_error()
1988                                 loop
1989                                         el = open_els.shift()
1990                                         if el.name is 'form' and el.namespace is NS_HTML
1991                                                 break
1992                         return
1993                 if t.type is TYPE_END_TAG and t.name is 'p'
1994                         unless is_in_button_scope 'p', NS_HTML
1995                                 parse_error()
1996                                 insert_html_element new_open_tag 'p'
1997                         close_p_element()
1998                         return
1999                 if t.type is TYPE_END_TAG and t.name is 'li'
2000                         unless is_in_li_scope 'li', NS_HTML
2001                                 parse_error()
2002                                 return
2003                         generate_implied_end_tags 'li' # arg is exception
2004                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2005                                 parse_error()
2006                         loop
2007                                 el = open_els.shift()
2008                                 if el.name is 'li' and el.namespace is NS_HTML
2009                                         break
2010                         return
2011                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2012                         unless is_in_scope t.name, NS_HTML
2013                                 parse_error()
2014                                 return
2015                         generate_implied_end_tags t.name # arg is exception
2016                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2017                                 parse_error()
2018                         loop
2019                                 el = open_els.shift()
2020                                 if el.name is t.name and el.namespace is NS_HTML
2021                                         break
2022                         return
2023                 if t.type is TYPE_END_TAG and h_tags[t.name]?
2024                         h_in_scope = false
2025                         for el in open_els
2026                                 if h_tags[el.name] is el.namespace
2027                                         h_in_scope = true
2028                                         break
2029                                 if standard_scopers[el.name] is el.namespace
2030                                         break
2031                         unless h_in_scope
2032                                 parse_error()
2033                                 return
2034                         generate_implied_end_tags()
2035                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2036                                 parse_error()
2037                         loop
2038                                 el = open_els.shift()
2039                                 if h_tags[el.name] is el.namespace
2040                                         break
2041                         return
2042                 # deep breath!
2043                 if t.type is TYPE_START_TAG and t.name is 'a'
2044                         # If the list of active formatting elements contains an a element
2045                         # between the end of the list and the last marker on the list (or
2046                         # the start of the list if there is no marker on the list), then
2047                         # this is a parse error; run the adoption agency algorithm for the
2048                         # tag name "a", then remove that element from the list of active
2049                         # formatting elements and the stack of open elements if the
2050                         # adoption agency algorithm didn't already remove it (it might not
2051                         # have if the element is not in table scope).
2052                         found = false
2053                         for el in afe
2054                                 if el.type is TYPE_AFE_MARKER
2055                                         break
2056                                 if el.name is 'a' and el.namespace is NS_HTML
2057                                         found = el
2058                         if found?
2059                                 parse_error()
2060                                 adoption_agency 'a'
2061                                 for el, i in afe
2062                                         if el is found
2063                                                 afe.splice i, 1
2064                                 for el, i in open_els
2065                                         if el is found
2066                                                 open_els.splice i, 1
2067                         reconstruct_afe()
2068                         el = insert_html_element t
2069                         afe_push el
2070                         return
2071                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2072                         reconstruct_afe()
2073                         el = insert_html_element t
2074                         afe_push el
2075                         return
2076                 if t.type is TYPE_START_TAG and t.name is 'nobr'
2077                         reconstruct_afe()
2078                         if is_in_scope 'nobr', NS_HTML
2079                                 parse_error()
2080                                 adoption_agency 'nobr'
2081                                 reconstruct_afe()
2082                         el = insert_html_element t
2083                         afe_push el
2084                         return
2085                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2086                         adoption_agency t.name
2087                         return
2088                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2089                         reconstruct_afe()
2090                         insert_html_element t
2091                         afe_push_marker()
2092                         flag_frameset_ok = false
2093                         return
2094                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2095                         unless is_in_scope t.name, NS_HTML
2096                                 parse_error()
2097                                 return
2098                         generate_implied_end_tags()
2099                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2100                                 parse_error()
2101                         loop
2102                                 el = open_els.shift()
2103                                 if el.name is t.name and el.namespace is NS_HTML
2104                                         break
2105                         clear_afe_to_marker()
2106                         return
2107                 if t.type is TYPE_START_TAG and t.name is 'table'
2108                         unless doc.flag('quirks mode') is QUIRKS_YES
2109                                 close_p_if_in_button_scope() # test
2110                         insert_html_element t
2111                         flag_frameset_ok = false
2112                         ins_mode = ins_mode_in_table
2113                         return
2114                 if t.type is TYPE_END_TAG and t.name is 'br'
2115                         parse_error()
2116                         # W3C: t.type = TYPE_START_TAG
2117                         t = new_open_tag 'br' # WHATWG
2118                         # fall through
2119                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2120                         reconstruct_afe()
2121                         insert_html_element t
2122                         open_els.shift()
2123                         t.acknowledge_self_closing()
2124                         flag_frameset_ok = false
2125                         return
2126                 if t.type is TYPE_START_TAG and t.name is 'input'
2127                         reconstruct_afe()
2128                         insert_html_element t
2129                         open_els.shift()
2130                         t.acknowledge_self_closing()
2131                         unless is_input_hidden_tok t
2132                                 flag_frameset_ok = false
2133                         return
2134                 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2135                         # WHATWG adds 'menuitem' for this block
2136                         insert_html_element t
2137                         open_els.shift()
2138                         t.acknowledge_self_closing()
2139                         return
2140                 if t.type is TYPE_START_TAG and t.name is 'hr'
2141                         close_p_if_in_button_scope()
2142                         insert_html_element t
2143                         open_els.shift()
2144                         t.acknowledge_self_closing()
2145                         flag_frameset_ok = false
2146                         return
2147                 if t.type is TYPE_START_TAG and t.name is 'image'
2148                         parse_error()
2149                         t.name = 'img'
2150                         process_token t
2151                         return
2152                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2153                         parse_error()
2154                         if template_tag_is_open() is false and form_element_pointer isnt null
2155                                 return
2156                         t.acknowledge_self_closing()
2157                         flag_frameset_ok = false
2158                         close_p_if_in_button_scope()
2159                         el = insert_html_element new_open_tag 'form'
2160                         unless template_tag_is_open()
2161                                 form_element_pointer = el
2162                         for a in t.attrs_a
2163                                 if a[0] is 'action'
2164                                         el.attrs['action'] = a[1]
2165                                         break
2166                         insert_html_element new_open_tag 'hr'
2167                         open_els.shift()
2168                         reconstruct_afe()
2169                         insert_html_element new_open_tag 'label'
2170                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2171                         input_el = new_open_tag 'input'
2172                         prompt = null
2173                         for a in t.attrs_a
2174                                 if a[0] is 'prompt'
2175                                         prompt = a[1]
2176                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2177                                         input_el.attrs_a.push [a[0], a[1]]
2178                         input_el.attrs_a.push ['name', 'isindex']
2179                         # fixfull this next bit is in english... internationalize?
2180                         prompt ?= "This is a searchable index. Enter search keywords: "
2181                         insert_character new_character_token prompt # fixfull split
2182                         # TODO submit typo "balue" in spec
2183                         insert_html_element input_el
2184                         open_els.shift()
2185                         # insert_character '' # you can put chars here if promt attr missing
2186                         open_els.shift()
2187                         insert_html_element new_open_tag 'hr'
2188                         open_els.shift()
2189                         open_els.shift()
2190                         unless template_tag_is_open()
2191                                 form_element_pointer = null
2192                         return
2193                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2194                         insert_html_element t
2195                         eat_next_token_if_newline()
2196                         tok_state = tok_state_rcdata
2197                         original_ins_mode = ins_mode
2198                         flag_frameset_ok = false
2199                         ins_mode = ins_mode_text
2200                         return
2201                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2202                         close_p_if_in_button_scope()
2203                         reconstruct_afe()
2204                         flag_frameset_ok = false
2205                         parse_generic_raw_text t
2206                         return
2207                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2208                         flag_frameset_ok = false
2209                         parse_generic_raw_text t
2210                         return
2211                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2212                         parse_generic_raw_text t
2213                         return
2214                 if t.type is TYPE_START_TAG and t.name is 'select'
2215                         reconstruct_afe()
2216                         insert_html_element t
2217                         flag_frameset_ok = false
2218                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2219                                 ins_mode = ins_mode_in_select_in_table
2220                         else
2221                                 ins_mode = ins_mode_in_select
2222                         return
2223                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2224                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2225                                 open_els.shift()
2226                         reconstruct_afe()
2227                         insert_html_element t
2228                         return
2229 # this comment block implements the W3C spec
2230 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2231 #                       if is_in_scope 'ruby', NS_HTML
2232 #                               generate_implied_end_tags()
2233 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2234 #                                       parse_error()
2235 #                       insert_html_element t
2236 #                       return
2237 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2238 #                       if is_in_scope 'ruby', NS_HTML
2239 #                               generate_implied_end_tags 'rtc' # arg is exception
2240 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2241 #                                       parse_error()
2242 #                       insert_html_element t
2243 #                       return
2244 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2245                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2246                         if is_in_scope 'ruby', NS_HTML
2247                                 generate_implied_end_tags()
2248                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2249                                         parse_error()
2250                         insert_html_element t
2251                         return
2252                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2253                         if is_in_scope 'ruby', NS_HTML
2254                                 generate_implied_end_tags 'rtc'
2255                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2256                                         parse_error()
2257                         insert_html_element t
2258                         return
2259 # end WHATWG chunk
2260                 if t.type is TYPE_START_TAG and t.name is 'math'
2261                         reconstruct_afe()
2262                         adjust_mathml_attributes t
2263                         adjust_foreign_attributes t
2264                         insert_foreign_element t, NS_MATHML
2265                         if t.flag 'self-closing'
2266                                 open_els.shift()
2267                                 t.acknowledge_self_closing()
2268                         return
2269                 if t.type is TYPE_START_TAG and t.name is 'svg'
2270                         reconstruct_afe()
2271                         adjust_svg_attributes t
2272                         adjust_foreign_attributes t
2273                         insert_foreign_element t, NS_SVG
2274                         if t.flag 'self-closing'
2275                                 open_els.shift()
2276                                 t.acknowledge_self_closing()
2277                         return
2278                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2279                         parse_error()
2280                         return
2281                 if t.type is TYPE_START_TAG # any other start tag
2282                         reconstruct_afe()
2283                         insert_html_element t
2284                         return
2285                 if t.type is TYPE_END_TAG # any other end tag
2286                         in_body_any_other_end_tag t.name
2287                         return
2288                 return
2289
2290         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
ins_mode_text = (t) ->
        # Insertion mode "text": handles one token while the text content of
        # an element (for instance <script>) is being collected.
        #   t - the token to process
        # Tokens of any type not listed below are ignored.
        switch t.type
                when TYPE_TEXT
                        insert_character t
                when TYPE_EOF
                        # input ended inside the element: report it, drop the
                        # current node and hand the EOF to the saved mode
                        parse_error()
                        if open_els[0].namespace is NS_HTML and open_els[0].name is 'script'
                                open_els[0].flag 'already started', true
                        open_els.shift()
                        ins_mode = original_ins_mode
                        process_token t
                when TYPE_END_TAG
                        # </script> and every other end tag are handled alike:
                        # pop the current node and restore the saved mode.
                        # fixfull the spec seems to assume that I'm going to run the script
                        # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
                        open_els.shift()
                        ins_mode = original_ins_mode
        return
2314
2315         # the functions below implement the tokenizer states described here:
2316         # http://www.w3.org/TR/html5/syntax.html#tokenization
2317
2318         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
ins_mode_in_table_else = (t) ->
        # "Anything else" rule of the "in table" insertion mode: report a
        # parse error, then run the token through the "in body" rules with
        # the foster-parenting flag raised for the duration of that call.
        #   t - the token to process
        parse_error()
        flag_foster_parenting = true
        ins_mode_in_body t
        flag_foster_parenting = false
        return
ins_mode_in_table = (t) ->
        # Insertion mode "in table": dispatches on the token type, and for
        # tags on the tag name. Anything without a dedicated rule goes to
        # ins_mode_in_table_else.
        #   t - the token to process
        if t.type is TYPE_TEXT
                if open_els[0].namespace is NS_HTML and (open_els[0].name in ['table', 'tbody', 'tfoot', 'thead', 'tr'])
                        # start a fresh buffer and let the "in table text"
                        # mode collect this and the following characters
                        pending_table_character_tokens = []
                        original_ins_mode = ins_mode
                        ins_mode = ins_mode_in_table_text
                        process_token t
                else
                        ins_mode_in_table_else t
                return
        if t.type is TYPE_COMMENT
                insert_comment t
                return
        if t.type is TYPE_DOCTYPE
                parse_error()
                return
        if t.type is TYPE_START_TAG
                if t.name is 'caption'
                        clear_stack_to_table_context()
                        afe_push_marker()
                        insert_html_element t
                        ins_mode = ins_mode_in_caption
                else if t.name is 'colgroup'
                        clear_stack_to_table_context()
                        insert_html_element t
                        ins_mode = ins_mode_in_column_group
                else if t.name is 'col'
                        # insert the implied <colgroup>, then retry the token
                        clear_stack_to_table_context()
                        insert_html_element new_open_tag 'colgroup'
                        ins_mode = ins_mode_in_column_group
                        process_token t
                else if t.name in ['tbody', 'tfoot', 'thead']
                        clear_stack_to_table_context()
                        insert_html_element t
                        ins_mode = ins_mode_in_table_body
                else if t.name in ['td', 'th', 'tr']
                        # insert the implied <tbody>, then retry the token
                        clear_stack_to_table_context()
                        insert_html_element new_open_tag 'tbody'
                        ins_mode = ins_mode_in_table_body
                        process_token t
                else if t.name is 'table'
                        # nested <table>: close the open table first, then
                        # reprocess the token in the resulting mode
                        parse_error()
                        if is_in_table_scope 'table', NS_HTML
                                loop
                                        el = open_els.shift()
                                        break if el.namespace is NS_HTML and el.name is 'table'
                                reset_ins_mode()
                                process_token t
                else if t.name in ['style', 'script', 'template']
                        ins_mode_in_head t
                else if t.name is 'input'
                        if is_input_hidden_tok t
                                # inserted and immediately popped again
                                parse_error()
                                el = insert_html_element t
                                open_els.shift()
                                t.acknowledge_self_closing()
                        else
                                ins_mode_in_table_else t
                else if t.name is 'form'
                        parse_error()
                        # ignored when a form is already being tracked or a
                        # template is open
                        unless form_element_pointer? or template_tag_is_open()
                                form_element_pointer = insert_html_element t
                                open_els.shift()
                else
                        ins_mode_in_table_else t
                return
        if t.type is TYPE_END_TAG
                if t.name is 'table'
                        if is_in_table_scope 'table', NS_HTML
                                loop
                                        el = open_els.shift()
                                        break if el.namespace is NS_HTML and el.name is 'table'
                                reset_ins_mode()
                        else
                                parse_error()
                else if t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']
                        parse_error()
                else if t.name is 'template'
                        ins_mode_in_head t
                else
                        ins_mode_in_table_else t
                return
        if t.type is TYPE_EOF
                ins_mode_in_body t
                return
        ins_mode_in_table_else t
        return
2415
2416
2417         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
        # Insertion mode "in table text": buffers character tokens in
        # pending_table_character_tokens and flushes the buffer as soon as
        # any other kind of token arrives.
        #   t - the token to process
        if t.type is TYPE_TEXT
                if t.text is "\u0000"
                        # from javascript?
                        parse_error()
                else
                        pending_table_character_tokens.push t
                return
        # Anything else: decide how to flush by checking whether every
        # buffered token is whitespace
        all_space = true
        for old in pending_table_character_tokens when not is_space_tok old
                all_space = false
                break
        for old in pending_table_character_tokens
                if all_space
                        insert_character old
                else
                        # at least one non-space token: the whole buffer goes
                        # through the "anything else" rule of "in table"
                        ins_mode_in_table_else old
        pending_table_character_tokens = []
        ins_mode = original_ins_mode
        process_token t
        return
2442
2443         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
ins_mode_in_caption = (t) ->
        # Insertion mode "in caption".
        #   t - the token to process
        # </caption> closes the caption. Table-structure start tags and
        # </table> close it and are then reprocessed. A fixed list of stray
        # end tags is a parse error. Everything else uses the "in body" rules.
        if t.type is TYPE_END_TAG and t.name is 'caption'
                unless is_in_table_scope 'caption', NS_HTML
                        parse_error()
                        # fragment case
                        return
                generate_implied_end_tags()
                unless open_els[0].name is 'caption'
                        parse_error()
                loop
                        el = open_els.shift()
                        break if el.namespace is NS_HTML and el.name is 'caption'
                clear_afe_to_marker()
                ins_mode = ins_mode_in_table
                return
        if (t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'])) or (t.type is TYPE_END_TAG and t.name is 'table')
                parse_error()
                # when no caption is in table scope (fragment case) the token
                # is dropped after the parse error
                if is_in_table_scope 'caption', NS_HTML
                        loop
                                el = open_els.shift()
                                break if el.namespace is NS_HTML and el.name is 'caption'
                        clear_afe_to_marker()
                        ins_mode = ins_mode_in_table
                        process_token t
                return
        if t.type is TYPE_END_TAG and (t.name in ['body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'])
                parse_error()
                return
        # Anything else
        ins_mode_in_body t
        return
2478
2479         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
ins_mode_in_column_group = (t) ->
        # Insertion mode "in column group".
        #   t - the token to process
        # Whitespace and comments are inserted, <col> is inserted and popped
        # at once, </colgroup> closes the group. Any token without its own
        # rule closes the group (when the current node is a colgroup) and is
        # then reprocessed in the "in table" mode.
        if is_space_tok t
                insert_character t
                return
        if t.type is TYPE_COMMENT
                insert_comment t
                return
        if t.type is TYPE_DOCTYPE
                parse_error()
                return
        if t.type is TYPE_START_TAG and t.name is 'html'
                ins_mode_in_body t
                return
        if t.type is TYPE_START_TAG and t.name is 'col'
                el = insert_html_element t
                open_els.shift()
                t.acknowledge_self_closing()
                return
        if t.type is TYPE_END_TAG and t.name is 'colgroup'
                # Both tests must look at the current node, open_els[0].
                # Testing `open_els.namespace` (a property of the stack array
                # itself) is always undefined, which made every </colgroup>
                # a parse error and left the colgroup open.
                if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
                        open_els.shift()
                        ins_mode = ins_mode_in_table
                else
                        parse_error()
                return
        if t.type is TYPE_END_TAG and t.name is 'col'
                parse_error()
                return
        if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
                ins_mode_in_head t
                return
        if t.type is TYPE_EOF
                ins_mode_in_body t
                return
        # Anything else
        if open_els[0].name isnt 'colgroup'
                parse_error()
                return
        open_els.shift()
        ins_mode = ins_mode_in_table
        process_token t
        return
2522
2523         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
ins_mode_in_table_body = (t) ->
        # Insertion mode "in table body".
        #   t - the token to process
        # Tokens without a rule here are handled by ins_mode_in_table.
        if t.type is TYPE_START_TAG
                if t.name is 'tr'
                        clear_stack_to_table_body_context()
                        insert_html_element t
                        ins_mode = ins_mode_in_row
                        return
                if t.name in ['th', 'td']
                        # a cell without a row: insert the implied <tr> and
                        # retry the token in the "in row" mode
                        parse_error()
                        clear_stack_to_table_body_context()
                        insert_html_element new_open_tag 'tr'
                        ins_mode = ins_mode_in_row
                        process_token t
                        return
        if t.type is TYPE_END_TAG and (t.name in ['tbody', 'tfoot', 'thead'])
                if is_in_table_scope t.name, NS_HTML
                        clear_stack_to_table_body_context()
                        open_els.shift()
                        ins_mode = ins_mode_in_table
                else
                        parse_error()
                return
        if (t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'])) or (t.type is TYPE_END_TAG and t.name is 'table')
                # scan the stack from the current node for a tbody/tfoot/thead,
                # stopping at the first element listed in table_scopers
                has = false
                for el in open_els
                        if el.namespace is NS_HTML and (el.name in ['tbody', 'tfoot', 'thead'])
                                has = true
                                break
                        break if table_scopers[el.name] is el.namespace
                unless has
                        parse_error()
                        return
                # close the section and reprocess the token "in table"
                clear_stack_to_table_body_context()
                open_els.shift()
                ins_mode = ins_mode_in_table
                process_token t
                return
        if t.type is TYPE_END_TAG and (t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'])
                parse_error()
                return
        # Anything else
        ins_mode_in_table t
        return
2567
2568         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
ins_mode_in_row = (t) ->
        # Insertion mode "in row".
        #   t - the token to process
        # Tokens without a rule here are handled by ins_mode_in_table.
        if t.type is TYPE_START_TAG and (t.name in ['th', 'td'])
                clear_stack_to_table_row_context()
                insert_html_element t
                ins_mode = ins_mode_in_cell
                afe_push_marker()
                return
        if t.type is TYPE_END_TAG and t.name is 'tr'
                unless is_in_table_scope 'tr', NS_HTML
                        parse_error()
                        return
                clear_stack_to_table_row_context()
                open_els.shift()
                ins_mode = ins_mode_in_table_body
                return
        if (t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'])) or (t.type is TYPE_END_TAG and t.name is 'table')
                unless is_in_table_scope 'tr', NS_HTML
                        parse_error()
                        return
                # close the row, then retry the token "in table body"
                clear_stack_to_table_row_context()
                open_els.shift()
                ins_mode = ins_mode_in_table_body
                process_token t
                return
        if t.type is TYPE_END_TAG and (t.name in ['tbody', 'tfoot', 'thead'])
                unless is_in_table_scope t.name, NS_HTML
                        parse_error()
                        return
                # the section is open but no <tr> is in table scope: the
                # token is dropped without a parse error
                return unless is_in_table_scope 'tr', NS_HTML
                clear_stack_to_table_row_context()
                open_els.shift()
                ins_mode = ins_mode_in_table_body
                process_token t
                return
        if t.type is TYPE_END_TAG and (t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'])
                parse_error()
                return
        # Anything else
        ins_mode_in_table t
        return
2609
2610         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
close_the_cell = ->
        # "Close the cell": generates implied end tags, pops the stack of open
        # elements up to and including the nearest HTML td/th, clears the
        # active formatting elements back to the last marker and switches to
        # the "in row" insertion mode.
        generate_implied_end_tags()
        # A parse error is reported when the current node is not the cell
        # itself. The `th` test has to compare the element's name: comparing
        # the element object to the string 'th' is never true, so every <th>
        # used to be reported as an error.
        unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
                parse_error()
        loop
                el = open_els.shift()
                if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
                        break
        clear_afe_to_marker()
        ins_mode = ins_mode_in_row
        return
2622
2623         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
ins_mode_in_cell = (t) ->
        # Insertion mode "in cell".
        #   t - the token to process
        # Tokens without a rule here are handled by the "in body" rules.
        if t.type is TYPE_END_TAG and (t.name in ['td', 'th'])
                unless is_in_table_scope t.name, NS_HTML
                        parse_error()
                        return
                generate_implied_end_tags()
                unless open_els[0].namespace is NS_HTML and open_els[0].name is t.name
                        parse_error()
                # pop up to and including the cell named by the end tag
                loop
                        el = open_els.shift()
                        break if el.namespace is NS_HTML and el.name is t.name
                clear_afe_to_marker()
                ins_mode = ins_mode_in_row
                return
        if t.type is TYPE_START_TAG and (t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'])
                # scan the stack from the current node for a td/th, stopping
                # at the first element listed in table_scopers
                has = false
                for el in open_els
                        if el.namespace is NS_HTML and (el.name in ['td', 'th'])
                                has = true
                                break
                        break if table_scopers[el.name] is el.namespace
                if has
                        close_the_cell()
                        process_token t
                else
                        parse_error()
                return
        if t.type is TYPE_END_TAG
                if t.name in ['body', 'caption', 'col', 'colgroup', 'html']
                        parse_error()
                        return
                if t.name in ['table', 'tbody', 'tfoot', 'thead', 'tr']
                        if is_in_table_scope t.name, NS_HTML
                                # close the cell, then retry the end tag
                                close_the_cell()
                                process_token t
                        else
                                parse_error()
                        return
        # Anything Else
        ins_mode_in_body t
        return
2666
2667         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
ins_mode_in_select = (t) ->
        # Insertion mode "in select".
        #   t - the token to process
        # Tokens without a rule here are a parse error and are ignored.
        if t.type is TYPE_TEXT and t.text is "\u0000"
                parse_error()
                return
        if t.type is TYPE_TEXT
                insert_character t
                return
        if t.type is TYPE_COMMENT
                insert_comment t
                return
        if t.type is TYPE_DOCTYPE
                parse_error()
                return
        if t.type is TYPE_START_TAG and t.name is 'html'
                ins_mode_in_body t
                return
        if t.type is TYPE_START_TAG and t.name is 'option'
                # a new option implicitly closes an open one
                if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                        open_els.shift()
                insert_html_element t
                return
        if t.type is TYPE_START_TAG and t.name is 'optgroup'
                # a new optgroup implicitly closes an open option and optgroup
                if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                        open_els.shift()
                if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
                        open_els.shift()
                insert_html_element t
                return
        if t.type is TYPE_END_TAG and t.name is 'optgroup'
                # An option directly inside an optgroup is closed first. Both
                # the name and the namespace test must look at the node below
                # the current one (open_els[1]); the namespace used to be
                # read from open_els[0] by mistake.
                if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                        if open_els[1]? and open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
                                open_els.shift()
                if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
                        open_els.shift()
                else
                        parse_error()
                return
        if t.type is TYPE_END_TAG and t.name is 'option'
                if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                        open_els.shift()
                else
                        parse_error()
                return
        if t.type is TYPE_END_TAG and t.name is 'select'
                if is_in_select_scope 'select', NS_HTML
                        loop
                                el = open_els.shift()
                                if el.name is 'select' and el.namespace is NS_HTML
                                        break
                        reset_ins_mode()
                else
                        parse_error()
                return
        if t.type is TYPE_START_TAG and t.name is 'select'
                parse_error()
                # WHATWG: ignore the token when no select element is in select
                # scope (fragment case). Without this check the loop below
                # empties open_els and then fails on an undefined element.
                unless is_in_select_scope 'select', NS_HTML
                        return
                loop
                        el = open_els.shift()
                        if el.name is 'select' and el.namespace is NS_HTML
                                break
                reset_ins_mode()
                return
        if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
                parse_error()
                unless is_in_select_scope 'select', NS_HTML
                        return
                # close the select, then retry the token in the new mode
                loop
                        el = open_els.shift()
                        if el.name is 'select' and el.namespace is NS_HTML
                                break
                reset_ins_mode()
                process_token t
                return
        if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
                ins_mode_in_head t
                return
        if t.type is TYPE_EOF
                ins_mode_in_body t
                return
        # Anything else
        parse_error()
        return
2751
2752         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
ins_mode_in_select_in_table = (t) ->
        # Insertion mode "in select in table".
        #   t - the token to process
        # Table-structure start and end tags are a parse error that closes
        # the select; the token is then reprocessed. All other tokens use the
        # "in select" rules.
        if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and (t.name in ['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'])
                parse_error()
                if t.type is TYPE_END_TAG
                        # an end tag with no matching element in table scope
                        # is ignored after the parse error
                        return unless is_in_table_scope t.name, NS_HTML
                loop
                        el = open_els.shift()
                        break if el.namespace is NS_HTML and el.name is 'select'
                reset_ins_mode()
                process_token t
                return
        # Anything else
        ins_mode_in_select t
        return
2777
2778         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
ins_mode_in_template = (t) ->
        # Insertion mode "in template".
        #   t - the token to process
        # Token types not covered below are ignored.
        if t.type in [TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE]
                ins_mode_in_body t
                return
        if t.type is TYPE_START_TAG
                if t.name in ['base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title']
                        ins_mode_in_head t
                        return
                # Every other start tag replaces the first entry of
                # template_ins_modes with the mode chosen by the tag name,
                # switches to that mode and reprocesses the token.
                template_ins_modes.shift()
                ins_mode = switch t.name
                        when 'caption', 'colgroup', 'tbody', 'tfoot', 'thead' then ins_mode_in_table
                        when 'col' then ins_mode_in_column_group
                        when 'tr' then ins_mode_in_table_body
                        when 'td', 'th' then ins_mode_in_row
                        else ins_mode_in_body
                template_ins_modes.unshift ins_mode
                process_token t
                return
        if t.type is TYPE_END_TAG
                if t.name is 'template'
                        ins_mode_in_head t
                else
                        parse_error()
                return
        if t.type is TYPE_EOF
                if template_tag_is_open()
                        # unclosed template at end of input: pop it, clean up
                        # and let the resulting mode see the EOF
                        parse_error()
                        loop
                                el = open_els.shift()
                                break if el.namespace is NS_HTML and el.name is 'template'
                        clear_afe_to_marker()
                        template_ins_modes.shift()
                        reset_ins_mode()
                        process_token t
                else
                        stop_parsing()
        return
2833
2834         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2835         ins_mode_after_body = (t) ->
2836                 if is_space_tok t
2837                         ins_mode_in_body t
2838                         return
2839                 if t.type is TYPE_COMMENT
2840                         first = open_els[open_els.length - 1]
2841                         insert_comment t, [first, first.children.length]
2842                         return
2843                 if t.type is TYPE_DOCTYPE
2844                         parse_error()
2845                         return
2846                 if t.type is TYPE_START_TAG and t.name is 'html'
2847                         ins_mode_in_body t
2848                         return
2849                 if t.type is TYPE_END_TAG and t.name is 'html'
2850                         if flag_fragment_parsing
2851                                 parse_error()
2852                                 return
2853                         ins_mode = ins_mode_after_after_body
2854                         return
2855                 if t.type is TYPE_EOF
2856                         stop_parsing()
2857                         return
2858                 # Anything ELse
2859                 parse_error()
2860                 ins_mode = ins_mode_in_body
2861                 process_token t
2862                 return
2863
2864         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2865         ins_mode_in_frameset = (t) ->
2866                 if is_space_tok t
2867                         insert_character t
2868                         return
2869                 if t.type is TYPE_COMMENT
2870                         insert_comment t
2871                         return
2872                 if t.type is TYPE_DOCTYPE
2873                         parse_error()
2874                         return
2875                 if t.type is TYPE_START_TAG and t.name is 'html'
2876                         ins_mode_in_body t
2877                         return
2878                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2879                         insert_html_element t
2880                         return
2881                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2882                         if open_els.length is 1
2883                                 parse_error()
2884                                 return # fragment case
2885                         open_els.shift()
2886                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2887                                 ins_mode = ins_mode_after_frameset
2888                         return
2889                 if t.type is TYPE_START_TAG and t.name is 'frame'
2890                         insert_html_element t
2891                         open_els.shift()
2892                         t.acknowledge_self_closing()
2893                         return
2894                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2895                         ins_mode_in_head t
2896                         return
2897                 if t.type is TYPE_EOF
2898                         if open_els.length isnt 1
2899                                 parse_error()
2900                         stop_parsing()
2901                         return
2902                 # Anything else
2903                 parse_error()
2904                 return
2905
2906         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2907         ins_mode_after_frameset = (t) ->
2908                 if is_space_tok t
2909                         insert_character t
2910                         return
2911                 if t.type is TYPE_COMMENT
2912                         insert_comment t
2913                         return
2914                 if t.type is TYPE_DOCTYPE
2915                         parse_error()
2916                         return
2917                 if t.type is TYPE_START_TAG and t.name is 'html'
2918                         ins_mode_in_body t
2919                         return
2920                 if t.type is TYPE_END_TAG and t.name is 'html'
2921                         ins_mode = ins_mode_after_after_frameset
2922                         return
2923                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2924                         ins_mode_in_head t
2925                         return
2926                 if t.type is TYPE_EOF
2927                         stop_parsing()
2928                         return
2929                 # Anything else
2930                 parse_error()
2931                 return
2932
2933         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2934         ins_mode_after_after_body = (t) ->
2935                 if t.type is TYPE_COMMENT
2936                         insert_comment t, [doc, doc.children.length]
2937                         return
2938                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2939                         ins_mode_in_body t
2940                         return
2941                 if t.type is TYPE_EOF
2942                         stop_parsing()
2943                         return
2944                 # Anything else
2945                 parse_error()
2946                 ins_mode = ins_mode_in_body
2947                 process_token t
2948                 return
2949
2950         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2951         ins_mode_after_after_frameset = (t) ->
2952                 if t.type is TYPE_COMMENT
2953                         insert_comment t, [doc, doc.children.length]
2954                         return
2955                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2956                         ins_mode_in_body t
2957                         return
2958                 if t.type is TYPE_EOF
2959                         stop_parsing()
2960                         return
2961                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2962                         ins_mode_in_head t
2963                         return
2964                 # Anything else
2965                 parse_error()
2966                 return
2967
2968         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2969         has_color_face_or_size = (t) ->
2970                 for a in t.attrs_a
2971                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2972                                 return true
2973                 return false
2974         in_foreign_content_end_script = ->
2975                 open_els.shift()
2976                 # fixfull
2977                 return
2978         in_foreign_content_other_start = (t) ->
2979                 acn = adjusted_current_node()
2980                 if acn.namespace is NS_MATHML
2981                         adjust_mathml_attributes t
2982                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2983                         t.name = svg_name_fixes[t.name]
2984                 if acn.namespace is NS_SVG
2985                         adjust_svg_attributes t
2986                 adjust_foreign_attributes t
2987                 insert_foreign_element t, acn.namespace
2988                 if t.flag 'self-closing'
2989                         if t.name is 'script'
2990                                 t.acknowledge_self_closing()
2991                                 in_foreign_content_end_script()
2992                                 # fixfull
2993                         else
2994                                 open_els.shift()
2995                                 t.acknowledge_self_closing()
2996                 return
2997         in_foreign_content = (t) ->
2998                 if t.type is TYPE_TEXT and t.text is "\u0000"
2999                         parse_error()
3000                         insert_character new_character_token "\ufffd"
3001                         return
3002                 if is_space_tok t
3003                         insert_character t
3004                         return
3005                 if t.type is TYPE_TEXT
3006                         flag_frameset_ok = false
3007                         insert_character t
3008                         return
3009                 if t.type is TYPE_COMMENT
3010                         insert_comment t
3011                         return
3012                 if t.type is TYPE_DOCTYPE
3013                         parse_error()
3014                         return
3015                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3016                         parse_error()
3017                         if flag_fragment_parsing
3018                                 in_foreign_content_other_start t
3019                                 return
3020                         loop # is this safe?
3021                                 open_els.shift()
3022                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3023                                         break
3024                         process_token t
3025                         return
3026                 if t.type is TYPE_START_TAG
3027                         in_foreign_content_other_start t
3028                         return
3029                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3030                         in_foreign_content_end_script()
3031                         return
3032                 if t.type is TYPE_END_TAG
3033                         i = 0
3034                         node = open_els[i]
3035                         if node.name.toLowerCase() isnt t.name
3036                                 parse_error()
3037                         loop
3038                                 if node is open_els[open_els.length - 1]
3039                                         return
3040                                 if node.name.toLowerCase() is t.name
3041                                         loop
3042                                                 el = open_els.shift()
3043                                                 if el is node
3044                                                         return
3045                                 i += 1
3046                                 node = open_els[i]
3047                                 if node.namespace is NS_HTML
3048                                         break
3049                         ins_mode t # explicitly call HTML insertion mode
3050                 return
3051
3052
3053         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3054         tok_state_data = ->
3055                 switch c = txt.charAt(cur++)
3056                         when '&'
3057                                 return new_text_node parse_character_reference()
3058                         when '<'
3059                                 tok_state = tok_state_tag_open
3060                         when "\u0000"
3061                                 parse_error()
3062                                 return new_text_node c
3063                         when '' # EOF
3064                                 return new_eof_token()
3065                         else
3066                                 return new_text_node c
3067                 return null
3068
3069         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3070         # not needed: tok_state_character_reference_in_data = ->
3071         # just call parse_character_reference()
3072
3073         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3074         tok_state_rcdata = ->
3075                 switch c = txt.charAt(cur++)
3076                         when '&'
3077                                 return new_text_node parse_character_reference()
3078                         when '<'
3079                                 tok_state = tok_state_rcdata_less_than_sign
3080                         when "\u0000"
3081                                 parse_error()
3082                                 return new_character_token "\ufffd"
3083                         when '' # EOF
3084                                 return new_eof_token()
3085                         else
3086                                 return new_character_token c
3087                 return null
3088
3089         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3090         # not needed: tok_state_character_reference_in_rcdata = ->
3091         # just call parse_character_reference()
3092
3093         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3094         tok_state_rawtext = ->
3095                 switch c = txt.charAt(cur++)
3096                         when '<'
3097                                 tok_state = tok_state_rawtext_less_than_sign
3098                         when "\u0000"
3099                                 parse_error()
3100                                 return new_character_token "\ufffd"
3101                         when '' # EOF
3102                                 return new_eof_token()
3103                         else
3104                                 return new_character_token c
3105                 return null
3106
3107         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3108         tok_state_script_data = ->
3109                 switch c = txt.charAt(cur++)
3110                         when '<'
3111                                 tok_state = tok_state_script_data_less_than_sign
3112                         when "\u0000"
3113                                 parse_error()
3114                                 return new_character_token "\ufffd"
3115                         when '' # EOF
3116                                 return new_eof_token()
3117                         else
3118                                 return new_character_token c
3119                 return null
3120
3121         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3122         tok_state_plaintext = ->
3123                 switch c = txt.charAt(cur++)
3124                         when "\u0000"
3125                                 parse_error()
3126                                 return new_character_token "\ufffd"
3127                         when '' # EOF
3128                                 return new_eof_token()
3129                         else
3130                                 return new_character_token c
3131                 return null
3132
3133
3134         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3135         tok_state_tag_open = ->
3136                 c = txt.charAt(cur++)
3137                 if c is '!'
3138                         tok_state = tok_state_markup_declaration_open
3139                         return
3140                 if c is '/'
3141                         tok_state = tok_state_end_tag_open
3142                         return
3143                 if is_uc_alpha(c)
3144                         tok_cur_tag = new_open_tag c.toLowerCase()
3145                         tok_state = tok_state_tag_name
3146                         return
3147                 if is_lc_alpha(c)
3148                         tok_cur_tag = new_open_tag c
3149                         tok_state = tok_state_tag_name
3150                         return
3151                 if c is '?'
3152                         parse_error()
3153                         tok_cur_tag = new_comment_token '?' # FIXME right?
3154                         tok_state = tok_state_bogus_comment
3155                         return
3156                 # Anything else
3157                 parse_error()
3158                 tok_state = tok_state_data
3159                 cur -= 1 # we didn't parse/handle the char after <
3160                 return new_text_node '<'
3161
3162         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3163         tok_state_end_tag_open = ->
3164                 c = txt.charAt(cur++)
3165                 if is_uc_alpha(c)
3166                         tok_cur_tag = new_end_tag c.toLowerCase()
3167                         tok_state = tok_state_tag_name
3168                         return
3169                 if is_lc_alpha(c)
3170                         tok_cur_tag = new_end_tag c
3171                         tok_state = tok_state_tag_name
3172                         return
3173                 if c is '>'
3174                         parse_error()
3175                         tok_state = tok_state_data
3176                         return
3177                 if c is '' # EOF
3178                         parse_error()
3179                         tok_state = tok_state_data
3180                         return new_text_node '</'
3181                 # Anything else
3182                 parse_error()
3183                 tok_cur_tag = new_comment_token c
3184                 tok_state = tok_state_bogus_comment
3185                 return null
3186
3187         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3188         tok_state_tag_name = ->
3189                 switch c = txt.charAt(cur++)
3190                         when "\t", "\n", "\u000c", ' '
3191                                 tok_state = tok_state_before_attribute_name
3192                         when '/'
3193                                 tok_state = tok_state_self_closing_start_tag
3194                         when '>'
3195                                 tok_state = tok_state_data
3196                                 tmp = tok_cur_tag
3197                                 tok_cur_tag = null
3198                                 return tmp
3199                         when "\u0000"
3200                                 parse_error()
3201                                 tok_cur_tag.name += "\ufffd"
3202                         when '' # EOF
3203                                 parse_error()
3204                                 tok_state = tok_state_data
3205                         else
3206                                 if is_uc_alpha(c)
3207                                         tok_cur_tag.name += c.toLowerCase()
3208                                 else
3209                                         tok_cur_tag.name += c
3210                 return null
3211
3212         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3213         tok_state_rcdata_less_than_sign = ->
3214                 c = txt.charAt(cur++)
3215                 if c is '/'
3216                         temporary_buffer = ''
3217                         tok_state = tok_state_rcdata_end_tag_open
3218                         return null
3219                 # Anything else
3220                 tok_state = tok_state_rcdata
3221                 cur -= 1 # reconsume the input character
3222                 return new_character_token '<'
3223
3224         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3225         tok_state_rcdata_end_tag_open = ->
3226                 c = txt.charAt(cur++)
3227                 if is_uc_alpha(c)
3228                         tok_cur_tag = new_end_tag c.toLowerCase()
3229                         temporary_buffer += c
3230                         tok_state = tok_state_rcdata_end_tag_name
3231                         return null
3232                 if is_lc_alpha(c)
3233                         tok_cur_tag = new_end_tag c
3234                         temporary_buffer += c
3235                         tok_state = tok_state_rcdata_end_tag_name
3236                         return null
3237                 # Anything else
3238                 tok_state = tok_state_rcdata
3239                 cur -= 1 # reconsume the input character
3240                 return new_character_token "</" # fixfull separate these
3241
3242         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3243         is_appropriate_end_tag = (t) ->
3244                 # fixfull: this assumes that open_els[0].name is "the tag name of the last
3245                 # start tag to have been emitted from this tokenizer"
3246                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3247
3248         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3249         tok_state_rcdata_end_tag_name = ->
3250                 c = txt.charAt(cur++)
3251                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3252                         if is_appropriate_end_tag tok_cur_tag
3253                                 tok_state = tok_state_before_attribute_name
3254                                 return
3255                         # else fall through to "Anything else"
3256                 if c is '/'
3257                         if is_appropriate_end_tag tok_cur_tag
3258                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3259                                 return
3260                         # else fall through to "Anything else"
3261                 if c is '>'
3262                         if is_appropriate_end_tag tok_cur_tag
3263                                 tok_state = tok_state_data
3264                                 return tok_cur_tag
3265                         # else fall through to "Anything else"
3266                 if is_uc_alpha(c)
3267                         tok_cur_tag.name += c.toLowerCase()
3268                         temporary_buffer += c
3269                         return null
3270                 if is_lc_alpha(c)
3271                         tok_cur_tag.name += c
3272                         temporary_buffer += c
3273                         return null
3274                 # Anything else
3275                 tok_state = tok_state_rcdata
3276                 cur -= 1 # reconsume the input character
3277                 return new_character_token '</' + temporary_buffer # fixfull separate these
3278
3279         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3280         tok_state_rawtext_less_than_sign = ->
3281                 c = txt.charAt(cur++)
3282                 if c is '/'
3283                         temporary_buffer = ''
3284                         tok_state = tok_state_rawtext_end_tag_open
3285                         return null
3286                 # Anything else
3287                 tok_state = tok_state_rawtext
3288                 cur -= 1 # reconsume the input character
3289                 return new_character_token '<'
3290
3291         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3292         tok_state_rawtext_end_tag_open = ->
3293                 c = txt.charAt(cur++)
3294                 if is_uc_alpha(c)
3295                         tok_cur_tag = new_end_tag c.toLowerCase()
3296                         temporary_buffer += c
3297                         tok_state = tok_state_rawtext_end_tag_name
3298                         return null
3299                 if is_lc_alpha(c)
3300                         tok_cur_tag = new_end_tag c
3301                         temporary_buffer += c
3302                         tok_state = tok_state_rawtext_end_tag_name
3303                         return null
3304                 # Anything else
3305                 tok_state = tok_state_rawtext
3306                 cur -= 1 # reconsume the input character
3307                 return new_character_token "</" # fixfull separate these
3308
3309         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3310         tok_state_rawtext_end_tag_name = ->
3311                 c = txt.charAt(cur++)
3312                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3313                         if is_appropriate_end_tag tok_cur_tag
3314                                 tok_state = tok_state_before_attribute_name
3315                                 return
3316                         # else fall through to "Anything else"
3317                 if c is '/'
3318                         if is_appropriate_end_tag tok_cur_tag
3319                                 tok_state = tok_state_self_closing_start_tag
3320                                 return
3321                         # else fall through to "Anything else"
3322                 if c is '>'
3323                         if is_appropriate_end_tag tok_cur_tag
3324                                 tok_state = tok_state_data
3325                                 return tok_cur_tag
3326                         # else fall through to "Anything else"
3327                 if is_uc_alpha(c)
3328                         tok_cur_tag.name += c.toLowerCase()
3329                         temporary_buffer += c
3330                         return null
3331                 if is_lc_alpha(c)
3332                         tok_cur_tag.name += c
3333                         temporary_buffer += c
3334                         return null
3335                 # Anything else
3336                 tok_state = tok_state_rawtext
3337                 cur -= 1 # reconsume the input character
3338                 return new_character_token '</' + temporary_buffer # fixfull separate these
3339
3340         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3341         tok_state_script_data_less_than_sign = ->
3342                 c = txt.charAt(cur++)
3343                 if c is '/'
3344                         temporary_buffer = ''
3345                         tok_state = tok_state_script_data_end_tag_open
3346                         return
3347                 if c is '!'
3348                         tok_state = tok_state_script_data_escape_start
3349                         return new_character_token '<!' # fixfull split
3350                 # Anything else
3351                 tok_state = tok_state_script_data
3352                 cur -= 1 # Reconsume
3353                 return new_character_token '<'
3354
3355         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3356         tok_state_script_data_end_tag_open = ->
3357                 c = txt.charAt(cur++)
3358                 if is_uc_alpha(c)
3359                         tok_cur_tag = new_end_tag c.toLowerCase()
3360                         temporary_buffer += c
3361                         tok_state = tok_state_script_data_end_tag_name
3362                         return
3363                 if is_lc_alpha(c)
3364                         tok_cur_tag = new_end_tag c
3365                         temporary_buffer += c
3366                         tok_state = tok_state_script_data_end_tag_name
3367                         return
3368                 # Anything else
3369                 tok_state = tok_state_script_data
3370                 cur -= 1 # Reconsume
3371                 return new_character_token '</'
3372
3373         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3374         tok_state_script_data_end_tag_name = ->
3375                 c = txt.charAt(cur++)
3376                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3377                         if is_appropriate_end_tag tok_cur_tag
3378                                 tok_state = tok_state_before_attribute_name
3379                                 return
3380                         # fall through
3381                 if c is '/'
3382                         if is_appropriate_end_tag tok_cur_tag
3383                                 tok_state = tok_state_self_closing_start_tag
3384                                 return
3385                         # fall through
3386                 if c is '>'
3387                         if is_appropriate_end_tag tok_cur_tag
3388                                 tok_state = tok_state_data
3389                                 return tok_cur_tag
3390                         # fall through
3391                 if is_uc_alpha(c)
3392                         tok_cur_tag.name += c.toLowerCase()
3393                         temporary_buffer += c
3394                         return
3395                 if is_lc_alpha(c)
3396                         tok_cur_tag.name += c
3397                         temporary_buffer += c
3398                         return
3399                 # Anything else
3400                 tok_state = tok_state_script_data
3401                 cur -= 1 # Reconsume
3402                 return new_character_token "</#{temporary_buffer}" # fixfull split
3403
3404         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3405         tok_state_script_data_escape_start = ->
3406                 c = txt.charAt(cur++)
3407                 if c is '-'
3408                         tok_state = tok_state_script_data_escape_start_dash
3409                         return new_character_token '-'
3410                 # Anything else
3411                 tok_state = tok_state_script_data
3412                 cur -= 1 # Reconsume
3413                 return
3414
3415         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3416         tok_state_script_data_escape_start_dash = ->
3417                 c = txt.charAt(cur++)
3418                 if c is '-'
3419                         tok_state = tok_state_script_data_escaped_dash_dash
3420                         return new_character_token '-'
3421                 # Anything else
3422                 tok_state = tok_state_script_data
3423                 cur -= 1 # Reconsume
3424                 return
3425
3426         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3427         tok_state_script_data_escaped = ->
3428                 c = txt.charAt(cur++)
3429                 if c is '-'
3430                         tok_state = tok_state_script_data_escaped_dash
3431                         return new_character_token '-'
3432                 if c is '<'
3433                         tok_state = tok_state_script_data_escaped_less_than_sign
3434                         return
3435                 if c is "\u0000"
3436                         parse_error()
3437                         return new_character_token "\ufffd"
3438                 if c is '' # EOF
3439                         tok_state = tok_state_data
3440                         parse_error()
3441                         cur -= 1 # Reconsume
3442                         return
3443                 # Anything else
3444                 return new_character_token c
3445
3446         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3447         tok_state_script_data_escaped_dash = ->
3448                 c = txt.charAt(cur++)
3449                 if c is '-'
3450                         tok_state = tok_state_script_data_escaped_dash_dash
3451                         return new_character_token '-'
3452                 if c is '<'
3453                         tok_state = tok_state_script_data_escaped_less_than_sign
3454                         return
3455                 if c is "\u0000"
3456                         parse_error()
3457                         tok_state = tok_state_script_data_escaped
3458                         return new_character_token "\ufffd"
3459                 if c is '' # EOF
3460                         tok_state = tok_state_data
3461                         parse_error()
3462                         cur -= 1 # Reconsume
3463                         return
3464                 # Anything else
3465                 tok_state = tok_state_script_data_escaped
3466                 return new_character_token c
3467
3468         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3469         tok_state_script_data_escaped_dash_dash = ->
3470                 c = txt.charAt(cur++)
3471                 if c is '-'
3472                         return new_character_token '-'
3473                 if c is '<'
3474                         tok_state = tok_state_script_data_escaped_less_than_sign
3475                         return
3476                 if c is '>'
3477                         tok_state = tok_state_script_data
3478                         return new_character_token '>'
3479                 if c is "\u0000"
3480                         parse_error()
3481                         tok_state = tok_state_script_data_escaped
3482                         return new_character_token "\ufffd"
3483                 if c is '' # EOF
3484                         parse_error()
3485                         tok_state = tok_state_data
3486                         cur -= 1 # Reconsume
3487                         return
3488                 # Anything else
3489                 tok_state = tok_state_script_data_escaped
3490                 return new_character_token c
3491
3492         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3493         tok_state_script_data_escaped_less_than_sign = ->
3494                 c = txt.charAt(cur++)
3495                 if c is '/'
3496                         temporary_buffer = ''
3497                         tok_state = tok_state_script_data_escaped_end_tag_open
3498                         return
3499                 if is_uc_alpha(c)
3500                         temporary_buffer = c.toLowerCase() # yes, really
3501                         tok_state = tok_state_script_data_double_escape_start
3502                         return new_character_token "<#{c}" # fixfull split
3503                 if is_lc_alpha(c)
3504                         temporary_buffer = c
3505                         tok_state = tok_state_script_data_double_escape_start
3506                         return new_character_token "<#{c}" # fixfull split
3507                 # Anything else
3508                 tok_state = tok_state_script_data_escaped
3509                 cur -= 1 # Reconsume
3510                 return new_character_token '<'
3511
3512         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3513         tok_state_script_data_escaped_end_tag_open = ->
3514                 c = txt.charAt(cur++)
3515                 if is_uc_alpha(c)
3516                         tok_cur_tag = new_end_tag c.toLowerCase()
3517                         temporary_buffer += c
3518                         tok_state = tok_state_script_data_escaped_end_tag_name
3519                         return
3520                 if is_lc_alpha(c)
3521                         tok_cur_tag = new_end_tag c
3522                         temporary_buffer += c
3523                         tok_state = tok_state_script_data_escaped_end_tag_name
3524                         return
3525                 # Anything else
3526                 tok_state = tok_state_script_data_escaped
3527                 cur -= 1 # Reconsume
3528                 return new_character_token '</' # fixfull split
3529
3530         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3531         tok_state_script_data_escaped_end_tag_name = ->
3532                 c = txt.charAt(cur++)
3533                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3534                         if is_appropriate_end_tag tok_cur_tag
3535                                 tok_state = tok_state_before_attribute_name
3536                                 return
3537                         # fall through
3538                 if c is '/'
3539                         if is_appropriate_end_tag tok_cur_tag
3540                                 tok_state = tok_state_self_closing_start_tag
3541                                 return
3542                         # fall through
3543                 if c is '>'
3544                         if is_appropriate_end_tag tok_cur_tag
3545                                 tok_state = tok_state_data
3546                                 return tok_cur_tag
3547                         # fall through
3548                 if is_uc_alpha(c)
3549                         tok_cur_tag.name += c.toLowerCase()
3550                         temporary_buffer += c.toLowerCase()
3551                         return
3552                 if is_lc_alpha(c)
3553                         tok_cur_tag.name += c
3554                         temporary_buffer += c.toLowerCase()
3555                         return
3556                 # Anything else
3557                 tok_state = tok_state_script_data_escaped
3558                 cur -= 1 # Reconsume
3559                 return new_character_token "</#{temporary_buffer}" # fixfull split
3560
3561         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3562         tok_state_script_data_double_escape_start = ->
3563                 c = txt.charAt(cur++)
3564                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3565                         if temporary_buffer is 'script'
3566                                 tok_state = tok_state_script_data_double_escaped
3567                         else
3568                                 tok_state = tok_state_script_data_escaped
3569                         return new_character_token c
3570                 if is_uc_alpha(c)
3571                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3572                         return new_character_token c
3573                 if is_lc_alpha(c)
3574                         temporary_buffer += c
3575                         return new_character_token c
3576                 # Anything else
3577                 tok_state = tok_state_script_data_escaped
3578                 cur -= 1 # Reconsume
3579                 return
3580
3581         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3582         tok_state_script_data_double_escaped = ->
3583                 c = txt.charAt(cur++)
3584                 if c is '-'
3585                         tok_state = tok_state_script_data_double_escaped_dash
3586                         return new_character_token '-'
3587                 if c is '<'
3588                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3589                         return new_character_token '<'
3590                 if c is "\u0000"
3591                         parse_error()
3592                         return new_character_token "\ufffd"
3593                 if c is '' # EOF
3594                         parse_error()
3595                         tok_state = tok_state_data
3596                         cur -= 1 # Reconsume
3597                         return
3598                 # Anything else
3599                 return new_character_token c
3600
3601         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3602         tok_state_script_data_double_escaped_dash = ->
3603                 c = txt.charAt(cur++)
3604                 if c is '-'
3605                         tok_state = tok_state_script_data_double_escaped_dash_dash
3606                         return new_character_token '-'
3607                 if c is '<'
3608                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3609                         return new_character_token '<'
3610                 if c is "\u0000"
3611                         parse_error()
3612                         tok_state = tok_state_script_data_double_escaped
3613                         return new_character_token "\ufffd"
3614                 if c is '' # EOF
3615                         parse_error()
3616                         tok_state = tok_state_data
3617                         cur -= 1 # Reconsume
3618                         return
3619                 # Anything else
3620                 tok_state = tok_state_script_data_double_escaped
3621                 return new_character_token c
3622
3623         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3624         tok_state_script_data_double_escaped_dash_dash = ->
3625                 c = txt.charAt(cur++)
3626                 if c is '-'
3627                         return new_character_token '-'
3628                 if c is '<'
3629                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3630                         return new_character_token '<'
3631                 if c is '>'
3632                         tok_state = tok_state_script_data
3633                         return new_character_token '>'
3634                 if c is "\u0000"
3635                         parse_error()
3636                         tok_state = tok_state_script_data_double_escaped
3637                         return new_character_token "\ufffd"
3638                 if c is '' # EOF
3639                         parse_error()
3640                         tok_state = tok_state_data
3641                         cur -= 1 # Reconsume
3642                         return
3643                 # Anything else
3644                 tok_state = tok_state_script_data_double_escaped
3645                 return new_character_token c
3646
3647         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3648         tok_state_script_data_double_escaped_less_than_sign = ->
3649                 c = txt.charAt(cur++)
3650                 if c is '/'
3651                         temporary_buffer = ''
3652                         tok_state = tok_state_script_data_double_escape_end
3653                         return new_character_token '/'
3654                 # Anything else
3655                 tok_state = tok_state_script_data_double_escaped
3656                 cur -= 1 # Reconsume
3657                 return
3658
3659         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3660         tok_state_script_data_double_escape_end = ->
3661                 c = txt.charAt(cur++)
3662                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3663                         if temporary_buffer is 'script'
3664                                 tok_state = tok_state_script_data_escaped
3665                         else
3666                                 tok_state = tok_state_script_data_double_escaped
3667                         return new_character_token c
3668                 if is_uc_alpha(c)
3669                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3670                         return new_character_token c
3671                 if is_lc_alpha(c)
3672                         temporary_buffer += c
3673                         return new_character_token c
3674                 # Anything else
3675                 tok_state = tok_state_script_data_double_escaped
3676                 cur -= 1 # Reconsume
3677                 return
3678
3679         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3680         tok_state_before_attribute_name = ->
3681                 attr_name = null
3682                 switch c = txt.charAt(cur++)
3683                         when "\t", "\n", "\u000c", ' '
3684                                 return null
3685                         when '/'
3686                                 tok_state = tok_state_self_closing_start_tag
3687                                 return null
3688                         when '>'
3689                                 tok_state = tok_state_data
3690                                 tmp = tok_cur_tag
3691                                 tok_cur_tag = null
3692                                 return tmp
3693                         when "\u0000"
3694                                 parse_error()
3695                                 attr_name = "\ufffd"
3696                         when '"', "'", '<', '='
3697                                 parse_error()
3698                                 attr_name = c
3699                         when '' # EOF
3700                                 parse_error()
3701                                 tok_state = tok_state_data
3702                         else
3703                                 if is_uc_alpha(c)
3704                                         attr_name = c.toLowerCase()
3705                                 else
3706                                         attr_name = c
3707                 if attr_name?
3708                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3709                         tok_state = tok_state_attribute_name
3710                 return null
3711
3712         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3713         tok_state_attribute_name = ->
3714                 switch c = txt.charAt(cur++)
3715                         when "\t", "\n", "\u000c", ' '
3716                                 tok_state = tok_state_after_attribute_name
3717                         when '/'
3718                                 tok_state = tok_state_self_closing_start_tag
3719                         when '='
3720                                 tok_state = tok_state_before_attribute_value
3721                         when '>'
3722                                 tok_state = tok_state_data
3723                                 tmp = tok_cur_tag
3724                                 tok_cur_tag = null
3725                                 return tmp
3726                         when "\u0000"
3727                                 parse_error()
3728                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3729                         when '"', "'", '<'
3730                                 parse_error()
3731                                 tok_cur_tag.attrs_a[0][0] += c
3732                         when '' # EOF
3733                                 parse_error()
3734                                 tok_state = tok_state_data
3735                         else
3736                                 if is_uc_alpha(c)
3737                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3738                                 else
3739                                         tok_cur_tag.attrs_a[0][0] += c
3740                 return null
3741
3742         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3743         tok_state_after_attribute_name = ->
3744                 c = txt.charAt(cur++)
3745                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3746                         return
3747                 if c is '/'
3748                         tok_state = tok_state_self_closing_start_tag
3749                         return
3750                 if c is '='
3751                         tok_state = tok_state_before_attribute_value
3752                         return
3753                 if c is '>'
3754                         tok_state = tok_state_data
3755                         return tok_cur_tag
3756                 if is_uc_alpha(c)
3757                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3758                         tok_state = tok_state_attribute_name
3759                         return
3760                 if c is "\u0000"
3761                         parse_error()
3762                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3763                         tok_state = tok_state_attribute_name
3764                         return
3765                 if c is '' # EOF
3766                         parse_error()
3767                         tok_state = tok_state_data
3768                         cur -= 1 # reconsume
3769                         return
3770                 if c is '"' or c is "'" or c is '<'
3771                         parse_error()
3772                         # fall through to Anything else
3773                 # Anything else
3774                 tok_cur_tag.attrs_a.unshift [c, '']
3775                 tok_state = tok_state_attribute_name
3776                 return
3777
3778         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3779         tok_state_before_attribute_value = ->
3780                 switch c = txt.charAt(cur++)
3781                         when "\t", "\n", "\u000c", ' '
3782                                 return null
3783                         when '"'
3784                                 tok_state = tok_state_attribute_value_double_quoted
3785                         when '&'
3786                                 tok_state = tok_state_attribute_value_unquoted
3787                                 cur -= 1
3788                         when "'"
3789                                 tok_state = tok_state_attribute_value_single_quoted
3790                         when "\u0000"
3791                                 # Parse error
3792                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3793                                 tok_state = tok_state_attribute_value_unquoted
3794                         when '>'
3795                                 # Parse error
3796                                 tok_state = tok_state_data
3797                                 tmp = tok_cur_tag
3798                                 tok_cur_tag = null
3799                                 return tmp
3800                         when '' # EOF
3801                                 parse_error()
3802                                 tok_state = tok_state_data
3803                         else
3804                                 tok_cur_tag.attrs_a[0][1] += c
3805                                 tok_state = tok_state_attribute_value_unquoted
3806                 return null
3807
3808         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3809         tok_state_attribute_value_double_quoted = ->
3810                 switch c = txt.charAt(cur++)
3811                         when '"'
3812                                 tok_state = tok_state_after_attribute_value_quoted
3813                         when '&'
3814                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3815                         when "\u0000"
3816                                 # Parse error
3817                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3818                         when '' # EOF
3819                                 parse_error()
3820                                 tok_state = tok_state_data
3821                         else
3822                                 tok_cur_tag.attrs_a[0][1] += c
3823                 return null
3824
3825         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3826         tok_state_attribute_value_single_quoted = ->
3827                 switch c = txt.charAt(cur++)
3828                         when "'"
3829                                 tok_state = tok_state_after_attribute_value_quoted
3830                         when '&'
3831                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3832                         when "\u0000"
3833                                 # Parse error
3834                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3835                         when '' # EOF
3836                                 parse_error()
3837                                 tok_state = tok_state_data
3838                         else
3839                                 tok_cur_tag.attrs_a[0][1] += c
3840                 return null
3841
3842         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3843         tok_state_attribute_value_unquoted = ->
3844                 switch c = txt.charAt(cur++)
3845                         when "\t", "\n", "\u000c", ' '
3846                                 tok_state = tok_state_before_attribute_name
3847                         when '&'
3848                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3849                         when '>'
3850                                 tok_state = tok_state_data
3851                                 tmp = tok_cur_tag
3852                                 tok_cur_tag = null
3853                                 return tmp
3854                         when "\u0000"
3855                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3856                         when '' # EOF
3857                                 parse_error()
3858                                 tok_state = tok_state_data
3859                         else
3860                                 # Parse Error if ', <, = or ` (backtick)
3861                                 tok_cur_tag.attrs_a[0][1] += c
3862                 return null
3863
3864         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3865         tok_state_after_attribute_value_quoted = ->
3866                 switch c = txt.charAt(cur++)
3867                         when "\t", "\n", "\u000c", ' '
3868                                 tok_state = tok_state_before_attribute_name
3869                         when '/'
3870                                 tok_state = tok_state_self_closing_start_tag
3871                         when '>'
3872                                 tok_state = tok_state_data
3873                                 tmp = tok_cur_tag
3874                                 tok_cur_tag = null
3875                                 return tmp
3876                         when '' # EOF
3877                                 parse_error()
3878                                 tok_state = tok_state_data
3879                         else
3880                                 # Parse Error
3881                                 tok_state = tok_state_before_attribute_name
3882                                 cur -= 1 # we didn't handle that char
3883                 return null
3884
3885         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3886         tok_state_self_closing_start_tag = ->
3887                 c = txt.charAt(cur++)
3888                 if c is '>'
3889                         tok_cur_tag.flag 'self-closing', true
3890                         tok_state = tok_state_data
3891                         return tok_cur_tag
3892                 if c is ''
3893                         parse_error()
3894                         tok_state = tok_state_data
3895                         cur -= 1 # Reconsume
3896                         return
3897                 # Anything else
3898                 parse_error()
3899                 tok_state = tok_state_before_attribute_name
3900                 cur -= 1 # Reconsume
3901                 return
3902
3903         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3904         # WARNING: put a comment token in tok_cur_tag before setting this state
3905         tok_state_bogus_comment = ->
3906                 next_gt = txt.indexOf '>', cur
3907                 if next_gt is -1
3908                         val = txt.substr cur
3909                         cur = txt.length
3910                 else
3911                         val = txt.substr cur, (next_gt - cur)
3912                         cur = next_gt + 1
3913                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3914                 tok_cur_tag.text += val
3915                 tok_state = tok_state_data
3916                 return tok_cur_tag
3917
3918         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3919         tok_state_markup_declaration_open = ->
3920                 if txt.substr(cur, 2) is '--'
3921                         cur += 2
3922                         tok_cur_tag = new_comment_token ''
3923                         tok_state = tok_state_comment_start
3924                         return
3925                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3926                         cur += 7
3927                         tok_state = tok_state_doctype
3928                         return
3929                 acn = adjusted_current_node()
3930                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3931                         cur += 7
3932                         tok_state = tok_state_cdata_section
3933                         return
3934                 # Otherwise
3935                 parse_error()
3936                 tok_cur_tag = new_comment_token ''
3937                 tok_state = tok_state_bogus_comment
3938                 return
3939
3940         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3941         tok_state_comment_start = ->
3942                 switch c = txt.charAt(cur++)
3943                         when '-'
3944                                 tok_state = tok_state_comment_start_dash
3945                         when "\u0000"
3946                                 parse_error()
3947                                 tok_state = tok_state_comment
3948                                 return new_character_token "\ufffd"
3949                         when '>'
3950                                 parse_error()
3951                                 tok_state = tok_state_data
3952                                 return tok_cur_tag
3953                         when '' # EOF
3954                                 parse_error()
3955                                 tok_state = tok_state_data
3956                                 cur -= 1 # Reconsume
3957                                 return tok_cur_tag
3958                         else
3959                                 tok_cur_tag.text += c
3960                                 tok_state = tok_state_comment
3961                 return null
3962
3963         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3964         tok_state_comment_start_dash = ->
3965                 switch c = txt.charAt(cur++)
3966                         when '-'
3967                                 tok_state = tok_state_comment_end
3968                         when "\u0000"
3969                                 parse_error()
3970                                 tok_cur_tag.text += "-\ufffd"
3971                                 tok_state = tok_state_comment
3972                         when '>'
3973                                 parse_error()
3974                                 tok_state = tok_state_data
3975                                 return tok_cur_tag
3976                         when '' # EOF
3977                                 parse_error()
3978                                 tok_state = tok_state_data
3979                                 cur -= 1 # Reconsume
3980                                 return tok_cur_tag
3981                         else
3982                                 tok_cur_tag.text += "-#{c}"
3983                                 tok_state = tok_state_comment
3984                 return null
3985
3986         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3987         tok_state_comment = ->
3988                 switch c = txt.charAt(cur++)
3989                         when '-'
3990                                 tok_state = tok_state_comment_end_dash
3991                         when "\u0000"
3992                                 parse_error()
3993                                 tok_cur_tag.text += "\ufffd"
3994                         when '' # EOF
3995                                 parse_error()
3996                                 tok_state = tok_state_data
3997                                 cur -= 1 # Reconsume
3998                                 return tok_cur_tag
3999                         else
4000                                 tok_cur_tag.text += c
4001                 return null
4002
4003         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4004         tok_state_comment_end_dash = ->
4005                 switch c = txt.charAt(cur++)
4006                         when '-'
4007                                 tok_state = tok_state_comment_end
4008                         when "\u0000"
4009                                 parse_error()
4010                                 tok_cur_tag.text += "-\ufffd"
4011                                 tok_state = tok_state_comment
4012                         when '' # EOF
4013                                 parse_error()
4014                                 tok_state = tok_state_data
4015                                 cur -= 1 # Reconsume
4016                                 return tok_cur_tag
4017                         else
4018                                 tok_cur_tag.text += "-#{c}"
4019                                 tok_state = tok_state_comment
4020                 return null
4021
4022         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4023         tok_state_comment_end = ->
                     # "--" has been seen inside a comment. '>' finishes the comment; every
                     # other input except '>' is reported through parse_error().
                     # Returns the comment token on '>' or EOF, null otherwise.
4024                 c = txt.charAt(cur++)
4025                 if c is '>'
4026                         tok_state = tok_state_data
4027                         return tok_cur_tag
4028                 if c is '' # EOF
4029                         parse_error()
4030                         tok_state = tok_state_data
4031                         cur -= 1 # Reconsume
4032                         return tok_cur_tag
                     # all remaining inputs start with the same error report
4033                 parse_error()
4034                 if c is '!'
4035                         tok_state = tok_state_comment_end_bang
4036                 else if c is '-'
                             # one more dash: keep waiting for '>' in this state
4037                         tok_cur_tag.text += '-'
4038                 else
4039                         tok_cur_tag.text += if c is "\u0000" then "--\ufffd" else "--#{c}"
4040                         tok_state = tok_state_comment
4041                 return null
4048
4049         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4050         tok_state_comment_end_bang = ->
                     # "--!" has been seen inside a comment. '>' finishes the comment; any
                     # other character means "--!" was ordinary comment text after all.
                     # Returns the comment token on '>' or EOF, null otherwise.
4051                 switch c = txt.charAt(cur++)
4052                         when '-'
                                     # Append only "--!" here. The '-' just read stays pending:
                                     # tok_state_comment_end_dash appends it itself ("-" plus
                                     # the next character) or counts it toward a closing "--".
                                     # Appending it here as well would double the dash.
4053                                 tok_cur_tag.text += "--!"
4054                                 tok_state = tok_state_comment_end_dash
4055                         when '>'
4056                                 tok_state = tok_state_data
4057                                 return tok_cur_tag
4058                         when "\u0000"
4059                                 parse_error()
4060                                 tok_cur_tag.text += "--!\ufffd"
4061                                 tok_state = tok_state_comment
4062                         when '' # EOF
4063                                 parse_error()
4064                                 tok_state = tok_state_data
4065                                 cur -= 1 # Reconsume
4066                                 return tok_cur_tag
4067                         else
4068                                 tok_cur_tag.text += "--!#{c}"
4069                                 tok_state = tok_state_comment
4070                 return null
4071
4072         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4073         tok_state_doctype = ->
                     # DOCTYPE state: expects whitespace before the doctype name.
                     # Returns a force-quirks doctype token with an empty name on EOF,
                     # null otherwise.
4074                 c = txt.charAt(cur++)
4075                 if c in ["\t", "\u000a", "\u000c", ' ']
4076                         tok_state = tok_state_before_doctype_name
4077                         return null
4078                 if c isnt '' # anything but EOF
                             # missing whitespace: report it and let the next state see c
4079                         parse_error()
4080                         tok_state = tok_state_before_doctype_name
4081                         cur -= 1 # Reconsume
4082                         return null
                     # EOF
4083                 parse_error()
4084                 tok_state = tok_state_data
4085                 el = new_doctype_token ''
4086                 el.flag 'force-quirks', true
4087                 cur -= 1 # Reconsume
4088                 return el
4089
4090         # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
4091         tok_state_before_doctype_name = ->
                     # Before DOCTYPE name state: skips whitespace, then creates the doctype
                     # token (tok_cur_tag) from the first character of the name.
                     # Returns a force-quirks doctype token with an empty name on '>' or
                     # EOF; otherwise returns nothing useful (undefined or null).
4092                 c = txt.charAt(cur++)
4093                 return if c in ["\t", "\u000a", "\u000c", ' '] # skip whitespace
4094                 if is_uc_alpha(c)
4095                         tok_cur_tag = new_doctype_token c.toLowerCase()
4096                         tok_state = tok_state_doctype_name
4097                         return
4098                 switch c
4099                         when "\u0000"
4100                                 parse_error()
4101                                 tok_cur_tag = new_doctype_token "\ufffd"
4102                                 tok_state = tok_state_doctype_name
4103                                 return
4104                         when '>'
4105                                 parse_error()
4106                                 el = new_doctype_token ''
4107                                 el.flag 'force-quirks', true
4108                                 tok_state = tok_state_data
4109                                 return el
4110                         when '' # EOF
4111                                 parse_error()
4112                                 tok_state = tok_state_data
4113                                 el = new_doctype_token ''
4114                                 el.flag 'force-quirks', true
4115                                 cur -= 1 # Reconsume
4116                                 return el
4117                         else
4118                                 tok_cur_tag = new_doctype_token c
4119                                 tok_state = tok_state_doctype_name
4120                 return null
4121
4122         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4123         tok_state_doctype_name = ->
                     # DOCTYPE name state: appends characters (upper case ones lowercased)
                     # to the name of the doctype token until whitespace or '>' ends it.
                     # Returns the doctype token on '>' or EOF (EOF also sets force-quirks);
                     # otherwise returns nothing useful (undefined or null).
4124                 c = txt.charAt(cur++)
4125                 if c in ["\t", "\u000a", "\u000c", ' ']
4126                         tok_state = tok_state_after_doctype_name
4127                 else if c is '>'
4128                         tok_state = tok_state_data
4129                         return tok_cur_tag
4130                 else if is_uc_alpha(c)
4131                         tok_cur_tag.name += c.toLowerCase()
4132                 else if c is "\u0000"
4133                         parse_error()
4134                         tok_cur_tag.name += "\ufffd"
4135                 else if c is '' # EOF
4136                         parse_error()
4137                         tok_state = tok_state_data
4138                         tok_cur_tag.flag 'force-quirks', true
4139                         cur -= 1 # Reconsume
4140                         return tok_cur_tag
4141                 else
4142                         tok_cur_tag.name += c
4143                         return null
4144                 return
4147
4148         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4149         tok_state_after_doctype_name = ->
                     # After DOCTYPE name state: skips whitespace, then expects '>' or one of
                     # the keywords "public" / "system" (matched case-insensitively).
                     # Returns the doctype token on '>' or EOF (EOF also sets force-quirks);
                     # otherwise returns nothing useful (undefined or null).
4150                 switch c = txt.charAt(cur++)
4151                         when "\t", "\u000a", "\u000c", ' '
4152                                 return
4153                         when '>'
4154                                 tok_state = tok_state_data
4155                                 return tok_cur_tag
4156                         when '' # EOF
4157                                 parse_error()
4158                                 tok_state = tok_state_data
4159                                 tok_cur_tag.flag 'force-quirks', true
4160                                 cur -= 1 # Reconsume
4161                                 return tok_cur_tag
                     # Anything else: c (at cur - 1) must start a six-letter keyword. The
                     # first letter is already consumed, so a match skips the other five.
4162                 switch txt.substr(cur - 1, 6).toLowerCase()
4163                         when 'public'
4164                                 cur += 5
4165                                 tok_state = tok_state_after_doctype_public_keyword
4166                                 return
4167                         when 'system'
4168                                 cur += 5
4169                                 tok_state = tok_state_after_doctype_system_keyword
4170                                 return
4171                 parse_error()
4172                 tok_cur_tag.flag 'force-quirks', true
4173                 tok_state = tok_state_bogus_doctype
4174                 return null
4175
4176         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4177         tok_state_after_doctype_public_keyword = ->
                     # The "public" keyword was just consumed. Whitespace is the only input
                     # that is not reported through parse_error().
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4178                 c = txt.charAt(cur++)
4179                 if c in ["\t", "\u000a", "\u000c", ' ']
4180                         tok_state = tok_state_before_doctype_public_identifier
4181                         return
                     # every other input starts with the same error report
4182                 parse_error()
4183                 switch c
4184                         when '"'
4185                                 tok_cur_tag.public_identifier = ''
4186                                 tok_state = tok_state_doctype_public_identifier_double_quoted
4187                                 return
4188                         when "'"
4189                                 tok_cur_tag.public_identifier = ''
4190                                 tok_state = tok_state_doctype_public_identifier_single_quoted
4191                                 return
4192                         when '>'
4193                                 tok_cur_tag.flag 'force-quirks', true
4194                                 tok_state = tok_state_data
4195                                 return tok_cur_tag
4196                         when '' # EOF
4197                                 tok_state = tok_state_data
4198                                 tok_cur_tag.flag 'force-quirks', true
4199                                 cur -= 1 # Reconsume
4200                                 return tok_cur_tag
                     # Anything else
4201                 tok_cur_tag.flag 'force-quirks', true
4202                 tok_state = tok_state_bogus_doctype
4203                 return null
4208
4209         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4210         tok_state_before_doctype_public_identifier = ->
                     # Whitespace has followed the "public" keyword: skip any further
                     # whitespace and expect the opening quote of the public identifier.
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4211                 c = txt.charAt(cur++)
4212                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4213                         return
4214                 if c is '"'
                             # A quote after whitespace is the well-formed case, so no
                             # parse_error() here (same as in
                             # tok_state_before_doctype_system_identifier).
4216                         tok_cur_tag.public_identifier = ''
4217                         tok_state = tok_state_doctype_public_identifier_double_quoted
4218                         return
4219                 if c is "'"
                             # also well-formed: no parse_error()
4221                         tok_cur_tag.public_identifier = ''
4222                         tok_state = tok_state_doctype_public_identifier_single_quoted
4223                         return
4224                 if c is '>'
4225                         parse_error()
4226                         tok_cur_tag.flag 'force-quirks', true
4227                         tok_state = tok_state_data
4228                         return tok_cur_tag
4229                 if c is '' # EOF
4230                         parse_error()
4231                         tok_state = tok_state_data
4232                         tok_cur_tag.flag 'force-quirks', true
4233                         cur -= 1 # Reconsume
4234                         return tok_cur_tag
4235                 # Anything else
4236                 parse_error()
4237                 tok_cur_tag.flag 'force-quirks', true
4238                 tok_state = tok_state_bogus_doctype
4239                 return null
4240
4241
4242         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4243         tok_state_doctype_public_identifier_double_quoted = ->
                     # Inside a double-quoted public identifier: collects characters into
                     # tok_cur_tag.public_identifier until the closing '"'.
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4244                 switch c = txt.charAt(cur++)
4245                         when '"'
4246                                 tok_state = tok_state_after_doctype_public_identifier
4247                                 return
4248                         when "\u0000"
4249                                 parse_error()
4250                                 tok_cur_tag.public_identifier += "\ufffd"
4251                                 return
4252                         when '>'
4253                                 parse_error()
4254                                 tok_cur_tag.flag 'force-quirks', true
4255                                 tok_state = tok_state_data
4256                                 return tok_cur_tag
4257                         when '' # EOF
4258                                 parse_error()
4259                                 tok_state = tok_state_data
4260                                 tok_cur_tag.flag 'force-quirks', true
4261                                 cur -= 1 # Reconsume
4262                                 return tok_cur_tag
4263                         else
4264                                 tok_cur_tag.public_identifier += c
4265                 return null
4266
4267         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4268         tok_state_doctype_public_identifier_single_quoted = ->
                     # Inside a single-quoted public identifier: collects characters into
                     # tok_cur_tag.public_identifier until the closing "'".
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4269                 switch c = txt.charAt(cur++)
4270                         when "'"
4271                                 tok_state = tok_state_after_doctype_public_identifier
4272                                 return
4273                         when "\u0000"
4274                                 parse_error()
4275                                 tok_cur_tag.public_identifier += "\ufffd"
4276                                 return
4277                         when '>'
4278                                 parse_error()
4279                                 tok_cur_tag.flag 'force-quirks', true
4280                                 tok_state = tok_state_data
4281                                 return tok_cur_tag
4282                         when '' # EOF
4283                                 parse_error()
4284                                 tok_state = tok_state_data
4285                                 tok_cur_tag.flag 'force-quirks', true
4286                                 cur -= 1 # Reconsume
4287                                 return tok_cur_tag
4288                         else
4289                                 tok_cur_tag.public_identifier += c
4290                 return null
4291
4292         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4293         tok_state_after_doctype_public_identifier = ->
                     # The public identifier's closing quote was just consumed. Expects
                     # whitespace or '>'; a system identifier quote right here is accepted
                     # but reported through parse_error().
                     # Returns the doctype token on '>' or EOF (EOF also sets force-quirks);
                     # otherwise returns nothing useful (undefined or null).
4294                 switch c = txt.charAt(cur++)
4295                         when "\t", "\u000a", "\u000c", ' '
4296                                 tok_state = tok_state_between_doctype_public_and_system_identifiers
4297                                 return
4298                         when '>'
4299                                 tok_state = tok_state_data
4300                                 return tok_cur_tag
4301                         when '"'
4302                                 parse_error()
4303                                 tok_cur_tag.system_identifier = ''
4304                                 tok_state = tok_state_doctype_system_identifier_double_quoted
4305                                 return
4306                         when "'"
4307                                 parse_error()
4308                                 tok_cur_tag.system_identifier = ''
4309                                 tok_state = tok_state_doctype_system_identifier_single_quoted
4310                                 return
4311                         when '' # EOF
4312                                 parse_error()
4313                                 tok_state = tok_state_data
4314                                 tok_cur_tag.flag 'force-quirks', true
4315                                 cur -= 1 # Reconsume
4316                                 return tok_cur_tag
4317                         else
4318                                 parse_error()
4319                                 tok_cur_tag.flag 'force-quirks', true
4320                                 tok_state = tok_state_bogus_doctype
4321                 return null
4322
4323         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4324         tok_state_between_doctype_public_and_system_identifiers = ->
                     # Whitespace has followed the public identifier: skip any further
                     # whitespace and expect '>' or the opening quote of a system
                     # identifier.
                     # Returns the doctype token on '>' or EOF (EOF also sets force-quirks);
                     # otherwise returns nothing useful (undefined or null).
4325                 c = txt.charAt(cur++)
4326                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4327                         return
4328                 if c is '>'
4329                         tok_state = tok_state_data
4330                         return tok_cur_tag
4331                 if c is '"'
                             # A quote after whitespace is the well-formed case, so no
                             # parse_error() here (same as in
                             # tok_state_before_doctype_system_identifier).
4333                         tok_cur_tag.system_identifier = ''
4334                         tok_state = tok_state_doctype_system_identifier_double_quoted
4335                         return
4336                 if c is "'"
                             # also well-formed: no parse_error()
4338                         tok_cur_tag.system_identifier = ''
4339                         tok_state = tok_state_doctype_system_identifier_single_quoted
4340                         return
4341                 if c is '' # EOF
4342                         parse_error()
4343                         tok_state = tok_state_data
4344                         tok_cur_tag.flag 'force-quirks', true
4345                         cur -= 1 # Reconsume
4346                         return tok_cur_tag
4347                 # Anything else
4348                 parse_error()
4349                 tok_cur_tag.flag 'force-quirks', true
4350                 tok_state = tok_state_bogus_doctype
4351                 return null
4352
4353         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4354         tok_state_after_doctype_system_keyword = ->
                     # The "system" keyword was just consumed. Whitespace is the only input
                     # that is not reported through parse_error().
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4355                 c = txt.charAt(cur++)
4356                 if c in ["\t", "\u000a", "\u000c", ' ']
4357                         tok_state = tok_state_before_doctype_system_identifier
4358                         return
                     # every other input starts with the same error report
4359                 parse_error()
4360                 switch c
4361                         when '"'
4362                                 tok_cur_tag.system_identifier = ''
4363                                 tok_state = tok_state_doctype_system_identifier_double_quoted
4364                                 return
4365                         when "'"
4366                                 tok_cur_tag.system_identifier = ''
4367                                 tok_state = tok_state_doctype_system_identifier_single_quoted
4368                                 return
4369                         when '>'
4370                                 tok_cur_tag.flag 'force-quirks', true
4371                                 tok_state = tok_state_data
4372                                 return tok_cur_tag
4373                         when '' # EOF
4374                                 tok_state = tok_state_data
4375                                 tok_cur_tag.flag 'force-quirks', true
4376                                 cur -= 1 # Reconsume
4377                                 return tok_cur_tag
                     # Anything else
4378                 tok_cur_tag.flag 'force-quirks', true
4379                 tok_state = tok_state_bogus_doctype
4380                 return null
4385
4386         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4387         tok_state_before_doctype_system_identifier = ->
                     # Whitespace has followed the "system" keyword: skip any further
                     # whitespace and expect the opening quote of the system identifier.
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4388                 switch c = txt.charAt(cur++)
4389                         when "\t", "\u000a", "\u000c", ' '
4390                                 return
4391                         when '"'
4392                                 tok_cur_tag.system_identifier = ''
4393                                 tok_state = tok_state_doctype_system_identifier_double_quoted
4394                                 return
4395                         when "'"
4396                                 tok_cur_tag.system_identifier = ''
4397                                 tok_state = tok_state_doctype_system_identifier_single_quoted
4398                                 return
4399                         when '>'
4400                                 parse_error()
4401                                 tok_cur_tag.flag 'force-quirks', true
4402                                 tok_state = tok_state_data
4403                                 return tok_cur_tag
4404                         when '' # EOF
4405                                 parse_error()
4406                                 tok_state = tok_state_data
4407                                 tok_cur_tag.flag 'force-quirks', true
4408                                 cur -= 1 # Reconsume
4409                                 return tok_cur_tag
4410                         else
4411                                 parse_error()
4412                                 tok_cur_tag.flag 'force-quirks', true
4413                                 tok_state = tok_state_bogus_doctype
4414                 return null
4415
4416         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4417         tok_state_doctype_system_identifier_double_quoted = ->
                     # Inside a double-quoted system identifier: collects characters into
                     # tok_cur_tag.system_identifier until the closing '"'.
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4418                 switch c = txt.charAt(cur++)
4419                         when '"'
4420                                 tok_state = tok_state_after_doctype_system_identifier
4421                                 return
4422                         when "\u0000"
4423                                 parse_error()
4424                                 tok_cur_tag.system_identifier += "\ufffd"
4425                                 return
4426                         when '>'
4427                                 parse_error()
4428                                 tok_cur_tag.flag 'force-quirks', true
4429                                 tok_state = tok_state_data
4430                                 return tok_cur_tag
4431                         when '' # EOF
4432                                 parse_error()
4433                                 tok_state = tok_state_data
4434                                 tok_cur_tag.flag 'force-quirks', true
4435                                 cur -= 1 # Reconsume
4436                                 return tok_cur_tag
4437                         else
4438                                 tok_cur_tag.system_identifier += c
4439                 return null
4440
4441         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4442         tok_state_doctype_system_identifier_single_quoted = ->
                     # Inside a single-quoted system identifier: collects characters into
                     # tok_cur_tag.system_identifier until the closing "'".
                     # Returns the doctype token (with force-quirks set) on '>' or EOF;
                     # otherwise returns nothing useful (undefined or null).
4443                 switch c = txt.charAt(cur++)
4444                         when "'"
4445                                 tok_state = tok_state_after_doctype_system_identifier
4446                                 return
4447                         when "\u0000"
4448                                 parse_error()
4449                                 tok_cur_tag.system_identifier += "\ufffd"
4450                                 return
4451                         when '>'
4452                                 parse_error()
4453                                 tok_cur_tag.flag 'force-quirks', true
4454                                 tok_state = tok_state_data
4455                                 return tok_cur_tag
4456                         when '' # EOF
4457                                 parse_error()
4458                                 tok_state = tok_state_data
4459                                 tok_cur_tag.flag 'force-quirks', true
4460                                 cur -= 1 # Reconsume
4461                                 return tok_cur_tag
4462                         else
4463                                 tok_cur_tag.system_identifier += c
4464                 return null
4465
4466         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4467         tok_state_after_doctype_system_identifier = ->
                     # The system identifier's closing quote was consumed: skip whitespace
                     # and expect '>'.
                     # Returns the doctype token on '>' or EOF (EOF also sets force-quirks);
                     # otherwise returns nothing useful (undefined or null).
4468                 switch c = txt.charAt(cur++)
4469                         when "\t", "\u000a", "\u000c", ' '
4470                                 return
4471                         when '>'
4472                                 tok_state = tok_state_data
4473                                 return tok_cur_tag
4474                         when '' # EOF
4475                                 parse_error()
4476                                 tok_state = tok_state_data
4477                                 tok_cur_tag.flag 'force-quirks', true
4478                                 cur -= 1 # Reconsume
4479                                 return tok_cur_tag
4480                         else
4481                                 parse_error()
4482                                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4483                                 tok_state = tok_state_bogus_doctype
4484                 return null
4485
4486         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4487         tok_state_bogus_doctype = ->
                     # Bogus DOCTYPE state: discards characters until '>' or EOF, then
                     # returns the doctype token as it stands. Returns null while skipping.
4488                 c = txt.charAt(cur++)
4489                 return null unless c is '>' or c is '' # keep skipping
4490                 tok_state = tok_state_data
4491                 cur -= 1 if c is '' # Reconsume EOF
4492                 return tok_cur_tag
4498
4499         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4500         tok_state_cdata_section = ->
                     # CDATA section state: consumes everything up to the next "]]>" (or to
                     # the end of the input when there is none) in a single call and goes
                     # back to the data state.
                     # Returns one character token holding the consumed text with each
                     # U+0000 replaced by U+FFFD, or null when the section is empty.
4501                 tok_state = tok_state_data
4502                 next_gt = txt.indexOf ']]>', cur
4503                 if next_gt >= 0
4504                         val = txt.slice cur, next_gt
4505                         cur = next_gt + 3 # skip the "]]>" itself
4506                 else
4507                         val = txt.slice cur
4508                         cur = txt.length
4509                 val = val.replace(/\u0000/g, "\ufffd")
4510                 return null if val.length is 0
4511                 return new_character_token val # fixfull split
4513
4514         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4515         # Don't set this as a state, just call it
4516         # returns a string (NOT a text node)
4517         parse_character_reference = (allowed_char = null, in_attr = false) ->
                     # Tries to consume a character reference. It is called with cur pointing
                     # at the character that follows the '&'.
                     #
                     # allowed_char: extra character which, when it is the next character,
                     #               means this is not a character reference (null for none)
                     # in_attr:      true enables the special rules for legacy (no ";")
                     #               names inside attribute values
                     #
                     # Returns the decoded text and advances cur past the reference, or
                     # returns '&' and leaves cur alone when no reference was recognised.
4518                 if cur >= txt.length
4519                         return '&'
4520                 switch c = txt.charAt(cur)
4521                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4522                                 # explicitly not a parse error
4523                                 return '&'
4524                         when ';'
4525                                 # there has to be "one or more" alnums between & and ; to be a parse error
4526                                 return '&'
4527                         when '#'
4528                                 if cur + 1 >= txt.length
                                             # "&#" with no digits after it is a parse error
                                             parse_error()
4529                                         return '&'
4530                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4531                                         base = 16
4532                                         charset = hex_chars
4533                                         start = cur + 2
4534                                 else
4535                                         charset = digits
4536                                         start = cur + 1
4537                                         base = 10
                                     # count the digits that follow
4538                                 i = 0
4539                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4540                                         i += 1
4541                                 if i is 0
                                             # "&#" / "&#x" with no digits after it is a parse error
                                             parse_error()
4542                                         return '&'
4543                                 cur = start + i
4544                                 if txt.charAt(start + i) is ';'
4545                                         cur += 1
4546                                 else
4547                                         parse_error()
4548                                 code_point = txt.substr(start, i)
4549                                 while code_point.charAt(0) is '0' and code_point.length > 1
4550                                         code_point = code_point.substr 1
4551                                 code_point = parseInt(code_point, base)
4552                                 if unicode_fixes[code_point]?
4553                                         parse_error()
4554                                         return unicode_fixes[code_point]
4555                                 else
4556                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4557                                                 parse_error()
4558                                                 return "\ufffd"
4559                                         else
4560                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4561                                                         parse_error()
4562                                                 return from_code_point code_point
4563                                 return
4564                         else
                                     # named reference: measure the run of alnums (at most 31)
4565                                 for i in [0...31]
4566                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4567                                                 break
4568                                 if i is 0
4569                                         # exit early, because parse_error() below needs at least one alnum
4570                                         return '&'
4571                                 if txt.charAt(cur + i) is ';'
4572                                         decoded = decode_named_char_ref txt.substr(cur, i)
4573                                         i += 1 # scan past the ';' (after, so we don't pass it to decode)
4574                                         if decoded?
4575                                                 cur += i
4576                                                 return decoded
4577                                         # else FALL THROUGH (check for match without last char(s) or ";")
4578                                 # no ';' terminator (only legacy char refs)
4579                                 max = i
4580                                 for i in [2..max] # no prefix matches, so ok to check shortest first
4581                                         c = legacy_char_refs[txt.substr(cur, i)]
4582                                         if c?
4583                                                 if in_attr
4584                                                         if txt.charAt(cur + i) is '='
4585                                                                 # "because some legacy user agents will
4586                                                                 # misinterpret the markup in those cases"
4587                                                                 parse_error()
4588                                                                 return '&'
4589                                                         if alnum.indexOf(txt.charAt(cur + i)) > -1
4590                                                                 # this makes attributes forgiving about url args
4591                                                                 return '&'
4592                                                 # ok, and besides the weird exceptions for attributes...
4593                                                 # return the matching char
4594                                                 cur += i # consume entity chars
4595                                                 parse_error() # because no terminating ";"
4596                                                 return c
                                     # Nothing matched. That is only a parse error when the alnum
                                     # run was terminated by ';'. max was bumped past the ';' above
                                     # exactly when one was found, so in that case the character at
                                     # cur + max - 1 is the ';' (otherwise it is the last alnum).
4597                                 parse_error() if txt.charAt(cur + max - 1) is ';'
4598                                 return '&'
4599                 return # never reached
4600
4601         eat_next_token_if_newline = ->
                     # Runs the tokenizer until it produces one token. If that token is a
                     # newline it stays consumed; otherwise cur is put back where it was so
                     # the same input gets tokenized again later.
4602                 old_cur = cur
4603                 t = null
4604                 t = tok_state() until t?
4605                 if t.type is TYPE_TEXT
                             # A line feed always counts. A carriage return only counts when
                             # exactly one character was consumed, i.e. it was not produced
                             # by a character reference.
4606                         return if t.text is "\u000a"
4607                         return if t.text is "\u000d" and cur - old_cur is 1
                     # not a "newline"
4608                 cur = old_cur
4609                 return
4618
4619         # tree constructor initialization
4620         # see comments on TYPE_TAG/etc for the structure of this data
             # (these are plain assignments in the enclosing function's scope, so the
             # nested functions below, e.g. parse_init and parse_main_loop, share them)
4621         txt = args.html # input text; parse_init normalizes its line endings
4622         cur = 0 # read position within txt
4623         doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML # replaced by parse_init when parsing a fragment
4624         doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4625         fragment_root = null # fragment parsing algorithm returns children of this
4626         open_els = [] # presumably the spec's "stack of open elements"; parse_init seeds it in fragment mode
4627         afe = [] # active formatting elements
4628         template_ins_modes = [] # parse_init adds to the front (unshift) for a template context
4629         ins_mode = ins_mode_initial
4630         original_ins_mode = ins_mode # TODO check spec
4631         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4632         flag_frameset_ok = true
4633         flag_parsing = true # parse_main_loop keeps running while this is true
4634         flag_foster_parenting = false
4635         form_element_pointer = null # parse_init may set this in fragment mode
4636         temporary_buffer = null
4637         pending_table_character_tokens = []
4638         head_element_pointer = null
4639         flag_fragment_parsing = false # set by parse_init once a context element is known
4640         context_element = null # fragment context; filled in by parse_init
4641         prev_node_id = 0 # just for debugging
4642
4643         # tokenizer initialization
4644         tok_state = tok_state_data # current tokenizer state function; parse_init may pick another for fragments
4645
4646         parse_init = ->
                     # One-time setup, called once before parse_main_loop. It resolves the
                     # fragment context (if any), switches the parser into fragment mode,
                     # and normalizes line endings in `txt`.
                     #
                     # Reads:  args.fragment, args.context, flag_scripting
                     # Writes: context_element, flag_fragment_parsing, doc, fragment_root,
                     #         open_els, tok_state, template_ins_modes,
                     #         form_element_pointer, txt
                     #         (reset_ins_mode() presumably also sets ins_mode -- it is
                     #         defined elsewhere, confirm there)
4647                 # fragment parsing (text arg)
4648                 if args.fragment?
4649                         # this handles the fragment from the tests in the format described here:
4650                         # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
4651                         f = args.fragment
4652                         ns = NS_HTML
                             # a "math " or "svg " prefix selects the namespace; what is left
                             # of the string is used as the tag name
4653                         if f.substr(0, 5) is 'math '
4654                                 f = f.substr 5
4655                                 ns = NS_MATHML
4656                         else if f.substr(0, 4) is 'svg '
4657                                 f = f.substr 4
4658                                 ns = NS_SVG
4659                         t = new_open_tag f
4660                         context_element = token_to_element t, ns
                             # give the context its own document node (no-quirks) so that the
                             # search for a document further down finds one
4661                         context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4662                         context_element.document.flag 'quirks mode', QUIRKS_NO
4663                 # fragment parsing (Node arg)
                     # checked second, so args.context wins when both args are supplied
4664                 if args.context?
4665                         context_element = args.context
4666
4667                 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4668                 # fragment parsing algorithm
4669                 if context_element?
4670                         flag_fragment_parsing = true
                             # replaces the `doc` node created during initialization
4671                         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4672                         # search up the tree from context, to try to find its document,
4673                         # because this file only puts a "document" property on the root
4674                         # element.
4675                         old_doc = null
4676                         el = context_element
4677                         loop
4678                                 if el.document?
4679                                         old_doc = el.document
4680                                         break
4681                                 if el.parent
4682                                         el = el.parent
4683                                 else
4684                                         break
                             # inherit the quirks mode of the context's document, when found
4685                         if old_doc
4686                                 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
4687                         # set tok_state
                             # (only HTML-namespace contexts change the tokenizer's start
                             # state; every other context keeps the current tok_state)
4688                         if context_element.namespace is NS_HTML
4689                                 switch context_element.name
4690                                         when 'title', 'textarea'
4691                                                 tok_state = tok_state_rcdata
4692                                         when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4693                                                 tok_state = tok_state_rawtext
4694                                         when 'script'
4695                                                 tok_state = tok_state_script_data
4696                                         when 'noscript'
                                                     # rawtext only when scripting is enabled
4697                                                 if flag_scripting
4698                                                         tok_state = tok_state_rawtext
4699                                         when 'plaintext'
4700                                                 tok_state = tok_state_plaintext
                             # synthetic <html> root: child of the new doc and the only entry
                             # in open_els
4701                         fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4702                         doc.children.push fragment_root
4703                         fragment_root.document = doc
4704                         open_els = [fragment_root]
4705                         if context_element.name is 'template' and context_element.namespace is NS_HTML
4706                                 template_ins_modes.unshift ins_mode_in_template
4707                         # fixfull create token for context (it should have its original one already)
4708                         reset_ins_mode()
4709                         # set form_element pointer... in the foreign doc?!
                             # (nearest HTML <form>, starting at the context element itself and
                             # walking up through .parent)
4710                         el = context_element
4711                         loop
4712                                 if el.name is 'form' and el.namespace is NS_HTML
4713                                         form_element_pointer = el
4714                                         break
4715                                 if el.parent
4716                                         el = el.parent
4717                                 else
4718                                         break
4719
4720                 # text pre-processing
4721                 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
                     # CRLF pairs first, then any remaining lone CR, both become LF
4722                 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4723                 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4724
4725                 return
4726
4727         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4728         parse_main_loop = ->
4729                 # each pass steps the tokenizer once; steps that emit nothing are skipped
4730                 while flag_parsing
4731                         t = tok_state()
4732                         continue unless t?
4733                         process_token t # fixfull parse error if self-closing flag wasn't acknowledged
4734                 return
4735         parse_init()
4736         parse_main_loop()
4737
4738         # result: the top-level nodes -- children of the synthetic root when parsing
4739         # a fragment, children of the document node otherwise
4740         return (if flag_fragment_parsing then fragment_root else doc).children
4741
4742 exports.parse_html = parse_html # the parser entry point
     # debug logging helpers (defined elsewhere in this file)
4743 exports.debug_log_reset = debug_log_reset
4744 exports.debug_log_each = debug_log_each
     # type tags, as passed to `new Node` and compared against `.type`
4745 exports.TYPE_TAG = TYPE_TAG
4746 exports.TYPE_TEXT = TYPE_TEXT
4747 exports.TYPE_COMMENT = TYPE_COMMENT
4748 exports.TYPE_DOCTYPE = TYPE_DOCTYPE
     # values used for a node's `namespace`
4749 exports.NS_HTML = NS_HTML
4750 exports.NS_MATHML = NS_MATHML
4751 exports.NS_SVG = NS_SVG
     # values used for a document's 'quirks mode' flag
4752 exports.QUIRKS_NO = QUIRKS_NO
4753 exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4754 exports.QUIRKS_YES = QUIRKS_YES