JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
code cleanup, remove debug logs
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor.
20
21 # The implementation is a pretty direct implementation of the parsing algorithm
22 # described here:
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
24 #
25 # Deviations from that spec:
26 #
27 #   Purposeful: search this file for "WHATWG"
28 #
29 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
30
31
32 # stacks/lists
33 #
34 # the spec uses a many different words do indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section splains. I'm implementing "lists" (afe and open_els) the same way
37 # (both as stacks)
38 #
39 # stacks grow downward (current element is index=0)
40 #
41 # example: open_els = [a, b, c, d, e, f, g]
42 #
43 # "grows downwards" means it's visualized like this: (index: el, names)
44 #
45 #   6: g "start of the list", "topmost", "first"
46 #   5: f
47 #   4: e "previous" (to d), "above", "before"
48 #   3: d   (previous/next are relative to this element)
49 #   2: c "next", "after", "lower", "below"
50 #   1: b
51 #   0: a "end of the list", "current node", "bottommost", "last"
52
53
54 # browser
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
58         window.wheic = {}
59         module = exports: window.wheic
60
61 from_code_point = (x) ->
62         if String.fromCodePoint?
63                 return String.fromCodePoint x
64         else
65                 if x <= 0xffff
66                         return String.fromCharCode x
67                 x -= 0x10000
68                 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
69
70 # Each node is an obect of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
73 TYPE_COMMENT = 2
74 TYPE_DOCTYPE = 3
75 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
78 TYPE_EOF = 6
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
81
82 # namespace constants
83 NS_HTML = 1
84 NS_MATHML = 2
85 NS_SVG = 3
86
87 # quirks mode constants
88 QUIRKS_NO = 1
89 QUIRKS_LIMITED = 2
90 QUIRKS_YES = 3
91
92 # queue up debug logs, so eg they can be shown only for tests that fail
93 g_debug_log = []
94 debug_log_reset = ->
95         g_debug_log = []
96         return
97 debug_log = (str) ->
98         g_debug_log.push str
99         return
100 debug_log_each = (cb) ->
101         for str in g_debug_log
102                 cb str
103         return
104
105 prev_node_id = 0
106 class Node
107         constructor: (type, args = {}) ->
108                 @type = type # one of the TYPE_* constants above
109                 @name = args.name ? '' # tag name
110                 @text = args.text ? '' # contents for text/comment nodes
111                 @attrs = args.attrs ? {}
112                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
113                 @children = args.children ? []
114                 @namespace = args.namespace ? NS_HTML
115                 @parent = args.parent ? null
116                 @token = args.token ? null
117                 @flags = args.flags ? {}
118                 if args.id?
119                         @id = "#{args.id}+"
120                 else
121                         @id = "#{++prev_node_id}"
122         acknowledge_self_closing: ->
123                 if @token?
124                         @token.flag 'did_self_close', true
125                 else
126                         @flag 'did_self_close', true
127                 return
128         flag: (key, value = null) ->
129                 if value?
130                         @flags[key] = value
131                 else
132                         return @flags[key]
133                 return
134
135 # helpers: (only take args that are normally known when parser creates nodes)
136 new_open_tag = (name) ->
137         return new Node TYPE_START_TAG, name: name
138 new_end_tag = (name) ->
139         return new Node TYPE_END_TAG, name: name
140 new_element = (name) ->
141         return new Node TYPE_TAG, name: name
142 new_text_node = (txt) ->
143         return new Node TYPE_TEXT, text: txt
144 new_character_token = new_text_node
145 new_comment_token = (txt) ->
146         return new Node TYPE_COMMENT, text: txt
147 new_doctype_token = (name) ->
148         return new Node TYPE_DOCTYPE, name: name
149 new_eof_token = ->
150         return new Node TYPE_EOF
151 new_afe_marker = ->
152         return new Node TYPE_AFE_MARKER
153 new_aaa_bookmark = ->
154         return new Node TYPE_AAA_BOOKMARK
155
156 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
157 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
158 digits = "0123456789"
159 alnum = lc_alpha + uc_alpha + digits
160 hex_chars = digits + "abcdefABCDEF"
161
162 is_uc_alpha = (str) ->
163         return str.length is 1 and uc_alpha.indexOf(str) > -1
164 is_lc_alpha = (str) ->
165         return str.length is 1 and lc_alpha.indexOf(str) > -1
166
167 # some SVG elements have dashes in them
168 tag_name_chars = alnum + "-"
169
170 # http://www.w3.org/TR/html5/infrastructure.html#space-character
171 space_chars = "\u0009\u000a\u000c\u000d\u0020"
172 is_space = (txt) ->
173         return txt.length is 1 and space_chars.indexOf(txt) > -1
174 is_space_tok = (t) ->
175         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
176
177 is_input_hidden_tok = (t) ->
178         return false unless t.type is TYPE_START_TAG
179         for a in t.attrs_a
180                 if a[0] is 'type'
181                         if a[1].toLowerCase() is 'hidden'
182                                 return true
183                         return false
184         return false
185
186 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
187 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
188
189 unicode_fixes = {}
190 unicode_fixes[0x00] = "\uFFFD"
191 unicode_fixes[0x80] = "\u20AC"
192 unicode_fixes[0x82] = "\u201A"
193 unicode_fixes[0x83] = "\u0192"
194 unicode_fixes[0x84] = "\u201E"
195 unicode_fixes[0x85] = "\u2026"
196 unicode_fixes[0x86] = "\u2020"
197 unicode_fixes[0x87] = "\u2021"
198 unicode_fixes[0x88] = "\u02C6"
199 unicode_fixes[0x89] = "\u2030"
200 unicode_fixes[0x8A] = "\u0160"
201 unicode_fixes[0x8B] = "\u2039"
202 unicode_fixes[0x8C] = "\u0152"
203 unicode_fixes[0x8E] = "\u017D"
204 unicode_fixes[0x91] = "\u2018"
205 unicode_fixes[0x92] = "\u2019"
206 unicode_fixes[0x93] = "\u201C"
207 unicode_fixes[0x94] = "\u201D"
208 unicode_fixes[0x95] = "\u2022"
209 unicode_fixes[0x96] = "\u2013"
210 unicode_fixes[0x97] = "\u2014"
211 unicode_fixes[0x98] = "\u02DC"
212 unicode_fixes[0x99] = "\u2122"
213 unicode_fixes[0x9A] = "\u0161"
214 unicode_fixes[0x9B] = "\u203A"
215 unicode_fixes[0x9C] = "\u0153"
216 unicode_fixes[0x9E] = "\u017E"
217 unicode_fixes[0x9F] = "\u0178"
218
219 quirks_yes_pi_prefixes = [
220         "+//silmaril//dtd html pro v0r11 19970101//"
221         "-//as//dtd html 3.0 aswedit + extensions//"
222         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
223         "-//ietf//dtd html 2.0 level 1//"
224         "-//ietf//dtd html 2.0 level 2//"
225         "-//ietf//dtd html 2.0 strict level 1//"
226         "-//ietf//dtd html 2.0 strict level 2//"
227         "-//ietf//dtd html 2.0 strict//"
228         "-//ietf//dtd html 2.0//"
229         "-//ietf//dtd html 2.1e//"
230         "-//ietf//dtd html 3.0//"
231         "-//ietf//dtd html 3.2 final//"
232         "-//ietf//dtd html 3.2//"
233         "-//ietf//dtd html 3//"
234         "-//ietf//dtd html level 0//"
235         "-//ietf//dtd html level 1//"
236         "-//ietf//dtd html level 2//"
237         "-//ietf//dtd html level 3//"
238         "-//ietf//dtd html strict level 0//"
239         "-//ietf//dtd html strict level 1//"
240         "-//ietf//dtd html strict level 2//"
241         "-//ietf//dtd html strict level 3//"
242         "-//ietf//dtd html strict//"
243         "-//ietf//dtd html//"
244         "-//metrius//dtd metrius presentational//"
245         "-//microsoft//dtd internet explorer 2.0 html strict//"
246         "-//microsoft//dtd internet explorer 2.0 html//"
247         "-//microsoft//dtd internet explorer 2.0 tables//"
248         "-//microsoft//dtd internet explorer 3.0 html strict//"
249         "-//microsoft//dtd internet explorer 3.0 html//"
250         "-//microsoft//dtd internet explorer 3.0 tables//"
251         "-//netscape comm. corp.//dtd html//"
252         "-//netscape comm. corp.//dtd strict html//"
253         "-//o'reilly and associates//dtd html 2.0//"
254         "-//o'reilly and associates//dtd html extended 1.0//"
255         "-//o'reilly and associates//dtd html extended relaxed 1.0//"
256         "-//sq//dtd html 2.0 hotmetal + extensions//"
257         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
258         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
259         "-//spyglass//dtd html 2.0 extended//"
260         "-//sun microsystems corp.//dtd hotjava html//"
261         "-//sun microsystems corp.//dtd hotjava strict html//"
262         "-//w3c//dtd html 3 1995-03-24//"
263         "-//w3c//dtd html 3.2 draft//"
264         "-//w3c//dtd html 3.2 final//"
265         "-//w3c//dtd html 3.2//"
266         "-//w3c//dtd html 3.2s draft//"
267         "-//w3c//dtd html 4.0 frameset//"
268         "-//w3c//dtd html 4.0 transitional//"
269         "-//w3c//dtd html experimental 19960712//"
270         "-//w3c//dtd html experimental 970421//"
271         "-//w3c//dtd w3 html//"
272         "-//w3o//dtd w3 html 3.0//"
273         "-//webtechs//dtd mozilla html 2.0//"
274         "-//webtechs//dtd mozilla html//"
275 ]
276
277 # These are the character references that don't need a terminating semicolon
278 # min length: 2, max: 6, none are a prefix of any other.
279 legacy_char_refs = {
280         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
281         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
282         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
283         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
284         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
285         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
286         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
287         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
288         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
289         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
290         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
291         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
292         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
293         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
294         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
295         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
296         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
297         yen: '¥', yuml: 'ÿ'
298 }
299
300 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
301 raw_text_elements = ['script', 'style']
302 escapable_raw_text_elements = ['textarea', 'title']
303 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
304 svg_elements = [
305         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
306         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
307         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
308         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
309         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
310         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
311         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
312         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
313         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
314         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
315         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
316         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
317         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
318         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
319         'view', 'vkern'
320 ]
321
322 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
323 mathml_elements = [
324         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
325         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
326         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
327         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
328         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
329         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
330         'determinant', 'diff', 'divergence', 'divide', 'domain',
331         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
332         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
333         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
334         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
335         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
336         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
337         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
338         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
339         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
340         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
341         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
342         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
343         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
344         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
345         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
346         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
347         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
348         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
349         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
350         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
351         'vectorproduct', 'xor'
352 ]
353 # foreign_elements = [svg_elements..., mathml_elements...]
354 #normal_elements = All other allowed HTML elements are normal elements.
355
356 special_elements = {
357         # HTML:
358         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
359         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
360         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
361         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
362         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
363         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
364         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
365         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
366         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
367         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
368         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
369
370         menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
371
372         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
373         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
374         plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
375         select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
376         table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
377         textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
378         tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
379
380         # MathML:
381         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
382         'annotation-xml':NS_MATHML,
383
384         # SVG:
385         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
386 }
387
388 formatting_elements = {
389          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
390          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
391          u: true
392 }
393
394 mathml_text_integration = {
395         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
396 }
397 is_mathml_text_integration_point = (el) ->
398         return mathml_text_integration[el.name] is el.namespace
399 is_html_integration = (el) -> # DON'T PASS A TOKEN
400         if el.namespace is NS_MATHML
401                 if el.name is 'annotation-xml'
402                         if el.attrs.encoding?
403                                 if el.attrs.encoding.toLowerCase() is 'text/html'
404                                         return true
405                                 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
406                                         return true
407                 return false
408         if el.namespace is NS_SVG
409                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
410                         return true
411         return false
412
413 h_tags = {
414         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
415 }
416
417 foster_parenting_targets = {
418         table: NS_HTML
419         tbody: NS_HTML
420         tfoot: NS_HTML
421         thead: NS_HTML
422         tr: NS_HTML
423 }
424
425 end_tag_implied = {
426         dd: NS_HTML
427         dt: NS_HTML
428         li: NS_HTML
429         option: NS_HTML
430         optgroup: NS_HTML
431         p: NS_HTML
432         rb: NS_HTML
433         rp: NS_HTML
434         rt: NS_HTML
435         rtc: NS_HTML
436 }
437
438 el_is_special = (e) ->
439         return special_elements[e.name] is e.namespace
440
441 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
442 el_is_special_not_adp = (el) ->
443         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
444
445 svg_name_fixes = {
446         altglyph: 'altGlyph'
447         altglyphdef: 'altGlyphDef'
448         altglyphitem: 'altGlyphItem'
449         animatecolor: 'animateColor'
450         animatemotion: 'animateMotion'
451         animatetransform: 'animateTransform'
452         clippath: 'clipPath'
453         feblend: 'feBlend'
454         fecolormatrix: 'feColorMatrix'
455         fecomponenttransfer: 'feComponentTransfer'
456         fecomposite: 'feComposite'
457         feconvolvematrix: 'feConvolveMatrix'
458         fediffuselighting: 'feDiffuseLighting'
459         fedisplacementmap: 'feDisplacementMap'
460         fedistantlight: 'feDistantLight'
461         fedropshadow: 'feDropShadow'
462         feflood: 'feFlood'
463         fefunca: 'feFuncA'
464         fefuncb: 'feFuncB'
465         fefuncg: 'feFuncG'
466         fefuncr: 'feFuncR'
467         fegaussianblur: 'feGaussianBlur'
468         feimage: 'feImage'
469         femerge: 'feMerge'
470         femergenode: 'feMergeNode'
471         femorphology: 'feMorphology'
472         feoffset: 'feOffset'
473         fepointlight: 'fePointLight'
474         fespecularlighting: 'feSpecularLighting'
475         fespotlight: 'feSpotLight'
476         fetile: 'feTile'
477         feturbulence: 'feTurbulence'
478         foreignobject: 'foreignObject'
479         glyphref: 'glyphRef'
480         lineargradient: 'linearGradient'
481         radialgradient: 'radialGradient'
482         textpath: 'textPath'
483 }
484 svg_attribute_fixes = {
485         attributename: 'attributeName'
486         attributetype: 'attributeType'
487         basefrequency: 'baseFrequency'
488         baseprofile: 'baseProfile'
489         calcmode: 'calcMode'
490         clippathunits: 'clipPathUnits'
491         contentscripttype: 'contentScriptType'
492         contentstyletype: 'contentStyleType'
493         diffuseconstant: 'diffuseConstant'
494         edgemode: 'edgeMode'
495         externalresourcesrequired: 'externalResourcesRequired'
496         # WHATWG removes this: filterres: 'filterRes'
497         filterunits: 'filterUnits'
498         glyphref: 'glyphRef'
499         gradienttransform: 'gradientTransform'
500         gradientunits: 'gradientUnits'
501         kernelmatrix: 'kernelMatrix'
502         kernelunitlength: 'kernelUnitLength'
503         keypoints: 'keyPoints'
504         keysplines: 'keySplines'
505         keytimes: 'keyTimes'
506         lengthadjust: 'lengthAdjust'
507         limitingconeangle: 'limitingConeAngle'
508         markerheight: 'markerHeight'
509         markerunits: 'markerUnits'
510         markerwidth: 'markerWidth'
511         maskcontentunits: 'maskContentUnits'
512         maskunits: 'maskUnits'
513         numoctaves: 'numOctaves'
514         pathlength: 'pathLength'
515         patterncontentunits: 'patternContentUnits'
516         patterntransform: 'patternTransform'
517         patternunits: 'patternUnits'
518         pointsatx: 'pointsAtX'
519         pointsaty: 'pointsAtY'
520         pointsatz: 'pointsAtZ'
521         preservealpha: 'preserveAlpha'
522         preserveaspectratio: 'preserveAspectRatio'
523         primitiveunits: 'primitiveUnits'
524         refx: 'refX'
525         refy: 'refY'
526         repeatcount: 'repeatCount'
527         repeatdur: 'repeatDur'
528         requiredextensions: 'requiredExtensions'
529         requiredfeatures: 'requiredFeatures'
530         specularconstant: 'specularConstant'
531         specularexponent: 'specularExponent'
532         spreadmethod: 'spreadMethod'
533         startoffset: 'startOffset'
534         stddeviation: 'stdDeviation'
535         stitchtiles: 'stitchTiles'
536         surfacescale: 'surfaceScale'
537         systemlanguage: 'systemLanguage'
538         tablevalues: 'tableValues'
539         targetx: 'targetX'
540         targety: 'targetY'
541         textlength: 'textLength'
542         viewbox: 'viewBox'
543         viewtarget: 'viewTarget'
544         xchannelselector: 'xChannelSelector'
545         ychannelselector: 'yChannelSelector'
546         zoomandpan: 'zoomAndPan'
547 }
548 foreign_attr_fixes = {
549         'xlink:actuate': 'xlink actuate'
550         'xlink:arcrole': 'xlink arcrole'
551         'xlink:href': 'xlink href'
552         'xlink:role': 'xlink role'
553         'xlink:show': 'xlink show'
554         'xlink:title': 'xlink title'
555         'xlink:type': 'xlink type'
556         'xml:base': 'xml base'
557         'xml:lang': 'xml lang'
558         'xml:space': 'xml space'
559         'xmlns': 'xmlns'
560         'xmlns:xlink': 'xmlns xlink'
561 }
562 adjust_mathml_attributes = (t) ->
563         for a in t.attrs_a
564                 if a[0] is 'definitionurl'
565                         a[0] = 'definitionURL'
566         return
567 adjust_svg_attributes = (t) ->
568         for a in t.attrs_a
569                 if svg_attribute_fixes[a[0]]?
570                         a[0] = svg_attribute_fixes[a[0]]
571         return
572 adjust_foreign_attributes = (t) ->
573         # fixfull
574         for a in t.attrs_a
575                 if foreign_attr_fixes[a[0]]?
576                         a[0] = foreign_attr_fixes[a[0]]
577         return
578
579 # decode_named_char_ref()
580 #
581 # The list of named character references is _huge_ so ask the browser to decode
582 # for us instead of wasting bandwidth/space on including the table here.
583 #
584 # Pass without the "&" but with the ";" examples:
585 #    for "&amp" pass "amp;"
586 #    for "&#x2032" pass "x2032;"
587 g_dncr = {
588         cache: {}
589         textarea: document.createElement('textarea')
590 }
591 # TODO test this in IE8
592 decode_named_char_ref = (txt) ->
593         txt = "&#{txt}"
594         decoded = g_dncr.cache[txt]
595         return decoded if decoded?
596         g_dncr.textarea.innerHTML = txt
597         decoded = g_dncr.textarea.value
598         return null if decoded is txt
599         return g_dncr.cache[txt] = decoded
600
601 parse_html = (args) ->
602         txt = null
603         cur = null # index of next char in txt to be parsed
604         # declare doc and tokenizer variables so they're in scope below
605         doc = null
606         open_els = null # stack of open elements
607         afe = null # active formatting elements
608         template_ins_modes = null
609         ins_mode = null
610         original_ins_mode = null
611         tok_state = null
612         tok_cur_tag = null # partially parsed tag
613         flag_scripting = null
614         flag_frameset_ok = null
615         flag_parsing = null
616         flag_foster_parenting = null
617         form_element_pointer = null
618         temporary_buffer = null
619         pending_table_character_tokens = null
620         head_element_pointer = null
621         flag_fragment_parsing = null
622         context_element = null
623
624         stop_parsing = ->
625                 flag_parsing = false
626                 return
627
628         parse_error = ->
629                 if args.error_cb?
630                         args.error_cb cur
631                 else
632                         console.log "Parse error at character #{cur} of #{txt.length}"
633                 return
634
635         # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
636         # "Noah's Ark clause" but with three
637         afe_push = (new_el) ->
638                 matches = 0
639                 for el, i in afe
640                         if el.type is TYPE_AFE_MARKER
641                                 break
642                         if el.name is new_el.name and el.namespace is new_el.namespace
643                                 attrs_match = true
644                                 for k, v of el.attrs
645                                         unless new_el.attrs[k] is v
646                                                 attrs_match = false
647                                                 break
648                                 if attrs_match
649                                         for k, v of new_el.attrs
650                                                 unless el.attrs[k] is v
651                                                         attrs_match = false
652                                                         break
653                                 if attrs_match
654                                         matches += 1
655                                         if matches is 3
656                                                 afe.splice i, 1
657                                                 break
658                 afe.unshift new_el
659                 return
660
661         afe_push_marker = ->
662                 afe.unshift new_afe_marker()
663                 return
664
665         # the functions below impliment the Tree Contstruction algorithm
666         # http://www.w3.org/TR/html5/syntax.html#tree-construction
667
668         # But first... the helpers
669         template_tag_is_open = ->
670                 for el in open_els
671                         if el.name is 'template' and el.namespace is NS_HTML
672                                 return true
673                 return false
674         is_in_scope_x = (tag_name, scope, namespace) ->
675                 for el in open_els
676                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
677                                 return true
678                         if scope[el.name] is el.namespace
679                                 return false
680                 return false
681         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
682                 for el in open_els
683                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
684                                 return true
685                         if scope[el.name] is el.namespace
686                                 return false
687                         if scope2[el.name] is el.namespace
688                                 return false
689                 return false
690         standard_scopers = {
691                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
692                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
693                 template: NS_HTML,
694
695                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
696                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
697
698                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
699         }
700         button_scopers = button: NS_HTML
701         li_scopers = ol: NS_HTML, ul: NS_HTML
702         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
703         is_in_scope = (tag_name, namespace = null) ->
704                 return is_in_scope_x tag_name, standard_scopers, namespace
705         is_in_button_scope = (tag_name, namespace = null) ->
706                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
707         is_in_table_scope = (tag_name, namespace = null) ->
708                 return is_in_scope_x tag_name, table_scopers, namespace
709         # aka is_in_list_item_scope
710         is_in_li_scope = (tag_name, namespace = null) ->
711                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
712         is_in_select_scope = (tag_name, namespace = null) ->
713                 for t in open_els
714                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
715                                 return true
716                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
717                                 return false
718                 return false
719         # this checks for a particular element, not by name
720         # this requires a namespace match
721         el_is_in_scope = (needle) ->
722                 for el in open_els
723                         if el is needle
724                                 return true
725                         if standard_scopers[el.name] is el.namespace
726                                 return false
727                 return false
728
729         clear_to_table_stopers = {
730                 'table': true
731                 'template': true
732                 'html': true
733         }
734         clear_stack_to_table_context = ->
735                 loop
736                         if clear_to_table_stopers[open_els[0].name]?
737                                 break
738                         open_els.shift()
739                 return
740         clear_to_table_body_stopers = {
741                 tbody: NS_HTML
742                 tfoot: NS_HTML
743                 thead: NS_HTML
744                 template: NS_HTML
745                 html: NS_HTML
746         }
747         clear_stack_to_table_body_context = ->
748                 loop
749                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
750                                 break
751                         open_els.shift()
752                 return
753         clear_to_table_row_stopers = {
754                 'tr': true
755                 'template': true
756                 'html': true
757         }
758         clear_stack_to_table_row_context = ->
759                 loop
760                         if clear_to_table_row_stopers[open_els[0].name]?
761                                 break
762                         open_els.shift()
763                 return
764         clear_afe_to_marker = ->
765                 loop
766                         return unless afe.length > 0 # this happens in fragment case, ?spec error
767                         el = afe.shift()
768                         if el.type is TYPE_AFE_MARKER
769                                 return
770                 return
771
772         # 8.2.3.1 ...
773         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
774         reset_ins_mode = ->
775                 # 1. Let last be false.
776                 last = false
777                 # 2. Let node be the last node in the stack of open elements.
778                 node_i = 0
779                 node = open_els[node_i]
780                 # 3. Loop: If node is the first node in the stack of open elements,
781                 # then set last to true, and, if the parser was originally created as
782                 # part of the HTML fragment parsing algorithm (fragment case) set node
783                 # to the context element.
784                 loop
785                         if node_i is open_els.length - 1
786                                 last = true
787                                 if flag_fragment_parsing
788                                         node = context_element
789                         # 4. If node is a select element, run these substeps:
790                         if node.name is 'select' and node.namespace is NS_HTML
791                                 # 1. If last is true, jump to the step below labeled done.
792                                 unless last
793                                         # 2. Let ancestor be node.
794                                         ancestor_i = node_i
795                                         ancestor = node
796                                         # 3. Loop: If ancestor is the first node in the stack of
797                                         # open elements, jump to the step below labeled done.
798                                         loop
799                                                 if ancestor_i is open_els.length - 1
800                                                         break
801                                                 # 4. Let ancestor be the node before ancestor in the stack
802                                                 # of open elements.
803                                                 ancestor_i += 1
804                                                 ancestor = open_els[ancestor_i]
805                                                 # 5. If ancestor is a template node, jump to the step below
806                                                 # labeled done.
807                                                 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
808                                                         break
809                                                 # 6. If ancestor is a table node, switch the insertion mode
810                                                 # to "in select in table" and abort these steps.
811                                                 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
812                                                         ins_mode = ins_mode_in_select_in_table
813                                                         return
814                                                 # 7. Jump back to the step labeled loop.
815                                 # 8. Done: Switch the insertion mode to "in select" and abort
816                                 # these steps.
817                                 ins_mode = ins_mode_in_select
818                                 return
819                         # 5. If node is a td or th element and last is false, then switch
820                         # the insertion mode to "in cell" and abort these steps.
821                         if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
822                                 ins_mode = ins_mode_in_cell
823                                 return
824                         # 6. If node is a tr element, then switch the insertion mode to "in
825                         # row" and abort these steps.
826                         if node.name is 'tr' and node.namespace is NS_HTML
827                                 ins_mode = ins_mode_in_row
828                                 return
829                         # 7. If node is a tbody, thead, or tfoot element, then switch the
830                         # insertion mode to "in table body" and abort these steps.
831                         if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
832                                 ins_mode = ins_mode_in_table_body
833                                 return
834                         # 8. If node is a caption element, then switch the insertion mode
835                         # to "in caption" and abort these steps.
836                         if node.name is 'caption' and node.namespace is NS_HTML
837                                 ins_mode = ins_mode_in_caption
838                                 return
839                         # 9. If node is a colgroup element, then switch the insertion mode
840                         # to "in column group" and abort these steps.
841                         if node.name is 'colgroup' and node.namespace is NS_HTML
842                                 ins_mode = ins_mode_in_column_group
843                                 return
844                         # 10. If node is a table element, then switch the insertion mode to
845                         # "in table" and abort these steps.
846                         if node.name is 'table' and node.namespace is NS_HTML
847                                 ins_mode = ins_mode_in_table
848                                 return
849                         # 11. If node is a template element, then switch the insertion mode
850                         # to the current template insertion mode and abort these steps.
851                         if node.name is 'template' and node.namespace is NS_HTML
852                                 ins_mode = template_ins_modes[0]
853                                 return
854                         # 12. If node is a head element and last is true, then switch the
855                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
856                         # these steps. (fragment case)
857                         if node.name is 'head' and node.namespace is NS_HTML and last
858                                 ins_mode = ins_mode_in_body
859                                 return
860                         # 13. If node is a head element and last is false, then switch the
861                         # insertion mode to "in head" and abort these steps.
862                         if node.name is 'head' and node.namespace is NS_HTML and last is false
863                                 ins_mode = ins_mode_in_head
864                                 return
865                         # 14. If node is a body element, then switch the insertion mode to
866                         # "in body" and abort these steps.
867                         if node.name is 'body' and node.namespace is NS_HTML
868                                 ins_mode = ins_mode_in_body
869                                 return
870                         # 15. If node is a frameset element, then switch the insertion mode
871                         # to "in frameset" and abort these steps. (fragment case)
872                         if node.name is 'frameset' and node.namespace is NS_HTML
873                                 ins_mode = ins_mode_in_frameset
874                                 return
875                         # 16. If node is an html element, run these substeps:
876                         if node.name is 'html' and node.namespace is NS_HTML
877                                 # 1. If the head element pointer is null, switch the insertion
878                                 # mode to "before head" and abort these steps. (fragment case)
879                                 if head_element_pointer is null
880                                         ins_mode = ins_mode_before_head
881                                 else
882                                         # 2. Otherwise, the head element pointer is not null,
883                                         # switch the insertion mode to "after head" and abort these
884                                         # steps.
885                                         ins_mode = ins_mode_after_head
886                                 return
887                         # 17. If last is true, then switch the insertion mode to "in body"
888                         # and abort these steps. (fragment case)
889                         if last
890                                 ins_mode = ins_mode_in_body
891                                 return
892                         # 18. Let node now be the node before node in the stack of open
893                         # elements.
894                         node_i += 1
895                         node = open_els[node_i]
896                         # 19. Return to the step labeled loop.
897                 return
898
899         # 8.2.3.2
900
901         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
902         adjusted_current_node = ->
903                 if open_els.length is 1 and flag_fragment_parsing
904                         return context_element
905                 return open_els[0]
906
907         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
908         # this implementation is structured (mostly) as described at the link above.
909         # capitalized comments are the "labels" described at the link above.
910         reconstruct_afe = ->
911                 return if afe.length is 0
912                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
913                         return
914                 # Rewind
915                 i = 0
916                 loop
917                         if i is afe.length - 1
918                                 break
919                         i += 1
920                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
921                                 i -= 1 # Advance
922                                 break
923                 # Create
924                 loop
925                         el = insert_html_element afe[i].token
926                         afe[i] = el
927                         break if i is 0
928                         i -= 1 # Advance
929                 return
930
931         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
932         # adoption agency algorithm
933         # overview here:
934         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
935         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
936         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
937         adoption_agency = (subject) ->
938 # this block implements tha W3C spec
939 #               # 1. If the current node is an HTML element whose tag name is subject,
940 #               # then run these substeps:
941 #               #
942 #               # 1. Let element be the current node.
943 #               #
944 #               # 2. Pop element off the stack of open elements.
945 #               #
946 #               # 3. If element is also in the list of active formatting elements,
947 #               # remove the element from the list.
948 #               #
949 #               # 4. Abort the adoption agency algorithm.
950 #               if open_els[0].name is subject and open_els[0].namespace is NS_HTML
951 #                       el = open_els.shift()
952 #                       # remove it from the list of active formatting elements (if found)
953 #                       for t, i in afe
954 #                               if t is el
955 #                                       afe.splice i, 1
956 #                                       break
957 #                       return
958 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
959                 # If the current node is an HTML element whose tag name is subject, and
960                 # the current node is not in the list of active formatting elements,
961                 # then pop the current node off the stack of open elements, and abort
962                 # these steps.
963                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
964                         # remove it from the list of active formatting elements (if found)
965                         in_afe = false
966                         for el, i in afe
967                                 if el is open_els[0]
968                                         in_afe = true
969                                         break
970                         unless in_afe
971                                 open_els.shift()
972                                 return
973                         # fall through
974 # END WHATWG
975                 outer = 0
976                 loop
977                         if outer >= 8
978                                 return
979                         outer += 1
980                         # 5. Let formatting element be the last element in the list of
981                         # active formatting elements that: is between the end of the list
982                         # and the last scope marker in the list, if any, or the start of
983                         # the list otherwise, and  has the tag name subject.
984                         fe = null
985                         for t, fe_of_afe in afe
986                                 if t.type is TYPE_AFE_MARKER
987                                         break
988                                 if t.name is subject
989                                         fe = t
990                                         break
991                         # If there is no such element, then abort these steps and instead
992                         # act as described in the "any other end tag" entry above.
993                         if fe is null
994                                 in_body_any_other_end_tag subject
995                                 return
996                         # 6. If formatting element is not in the stack of open elements,
997                         # then this is a parse error; remove the element from the list, and
998                         # abort these steps.
999                         in_open_els = false
1000                         for t, fe_of_open_els in open_els
1001                                 if t is fe
1002                                         in_open_els = true
1003                                         break
1004                         unless in_open_els
1005                                 parse_error()
1006                                 # "remove it from the list" must mean afe, since it's not in open_els
1007                                 afe.splice fe_of_afe, 1
1008                                 return
1009                         # 7. If formatting element is in the stack of open elements, but
1010                         # the element is not in scope, then this is a parse error; abort
1011                         # these steps.
1012                         unless el_is_in_scope fe
1013                                 parse_error()
1014                                 return
1015                         # 8. If formatting element is not the current node, this is a parse
1016                         # error. (But do not abort these steps.)
1017                         unless open_els[0] is fe
1018                                 parse_error()
1019                                 # continue
1020                         # 9. Let furthest block be the topmost node in the stack of open
1021                         # elements that is lower in the stack than formatting element, and
1022                         # is an element in the special category. There might not be one.
1023                         fb = null
1024                         fb_of_open_els = null
1025                         for t, i in open_els
1026                                 if t is fe
1027                                         break
1028                                 if el_is_special t
1029                                         fb = t
1030                                         fb_of_open_els = i
1031                                         # and continue, to see if there's one that's more "topmost"
1032                         # 10. If there is no furthest block, then the UA must first pop all
1033                         # the nodes from the bottom of the stack of open elements, from the
1034                         # current node up to and including formatting element, then remove
1035                         # formatting element from the list of active formatting elements,
1036                         # and finally abort these steps.
1037                         if fb is null
1038                                 loop
1039                                         t = open_els.shift()
1040                                         if t is fe
1041                                                 afe.splice fe_of_afe, 1
1042                                                 return
1043                         # 11. Let common ancestor be the element immediately above
1044                         # formatting element in the stack of open elements.
1045                         ca = open_els[fe_of_open_els + 1] # common ancestor
1046
1047                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1048                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1049                         bookmark = new_aaa_bookmark()
1050                         for t, i in afe
1051                                 if t is fe
1052                                         afe.splice i, 0, bookmark
1053                                         break
1054                         node = last_node = fb
1055                         inner = 0
1056                         loop
1057                                 inner += 1
1058                                 # 3. Let node be the element immediately above node in the
1059                                 # stack of open elements, or if node is no longer in the stack
1060                                 # of open elements (e.g. because it got removed by this
1061                                 # algorithm), the element that was immediately above node in
1062                                 # the stack of open elements before node was removed.
1063                                 node_next = null
1064                                 for t, i in open_els
1065                                         if t is node
1066                                                 node_next = open_els[i + 1]
1067                                                 break
1068                                 node = node_next ? node_above
1069                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
1070
1071                                 # 4. If node is formatting element, then go to the next step in
1072                                 # the overall algorithm.
1073                                 if node is fe
1074                                         break
1075                                 # 5. If inner loop counter is greater than three and node is in
1076                                 # the list of active formatting elements, then remove node from
1077                                 # the list of active formatting elements.
1078                                 node_in_afe = false
1079                                 for t, i in afe
1080                                         if t is node
1081                                                 if inner > 3
1082                                                         afe.splice i, 1
1083                                                 else
1084                                                         node_in_afe = true
1085                                                 break
1086                                 # 6. If node is not in the list of active formatting elements,
1087                                 # then remove node from the stack of open elements and then go
1088                                 # back to the step labeled inner loop.
1089                                 unless node_in_afe
1090                                         for t, i in open_els
1091                                                 if t is node
1092                                                         node_above = open_els[i + 1]
1093                                                         open_els.splice i, 1
1094                                                         break
1095                                         continue
1096                                 # 7. create an element for the token for which the element node
1097                                 # was created, in the HTML namespace, with common ancestor as
1098                                 # the intended parent; replace the entry for node in the list
1099                                 # of active formatting elements with an entry for the new
1100                                 # element, replace the entry for node in the stack of open
1101                                 # elements with an entry for the new element, and let node be
1102                                 # the new element.
1103                                 new_node = token_to_element node.token, NS_HTML, ca
1104                                 for t, i in afe
1105                                         if t is node
1106                                                 afe[i] = new_node
1107                                                 break
1108                                 for t, i in open_els
1109                                         if t is node
1110                                                 node_above = open_els[i + 1]
1111                                                 open_els[i] = new_node
1112                                                 break
1113                                 node = new_node
1114                                 # 8. If last node is furthest block, then move the
1115                                 # aforementioned bookmark to be immediately after the new node
1116                                 # in the list of active formatting elements.
1117                                 if last_node is fb
1118                                         for t, i in afe
1119                                                 if t is bookmark
1120                                                         afe.splice i, 1
1121                                                         break
1122                                         for t, i in afe
1123                                                 if t is node
1124                                                         # "after" means lower
1125                                                         afe.splice i, 0, bookmark # "after as <-
1126                                                         break
1127                                 # 9. Insert last node into node, first removing it from its
1128                                 # previous parent node if any.
1129                                 if last_node.parent?
1130                                         for c, i in last_node.parent.children
1131                                                 if c is last_node
1132                                                         last_node.parent.children.splice i, 1
1133                                                         break
1134                                 node.children.push last_node
1135                                 last_node.parent = node
1136                                 # 10. Let last node be node.
1137                                 last_node = node
1138                                 # 11. Return to the step labeled inner loop.
1139                         # 14. Insert whatever last node ended up being in the previous step
1140                         # at the appropriate place for inserting a node, but using common
1141                         # ancestor as the override target.
1142
1143                         # In the case where fe is immediately followed by fb:
1144                         #   * inner loop exits out early (node==fe)
1145                         #   * last_node is fb
1146                         #   * last_node is still in the tree (not a duplicate)
1147                         if last_node.parent?
1148                                 for c, i in last_node.parent.children
1149                                         if c is last_node
1150                                                 last_node.parent.children.splice i, 1
1151                                                 break
1152                         # can't use standard insert token thing, because it's already in
1153                         # open_els and must stay at it's current position in open_els
1154                         dest = adjusted_insertion_location ca
1155                         dest[0].children.splice dest[1], 0, last_node
1156                         last_node.parent = dest[0]
1157                         # 15. Create an element for the token for which formatting element
1158                         # was created, in the HTML namespace, with furthest block as the
1159                         # intended parent.
1160                         new_element = token_to_element fe.token, NS_HTML, fb
1161                         # 16. Take all of the child nodes of furthest block and append them
1162                         # to the element created in the last step.
1163                         while fb.children.length
1164                                 t = fb.children.shift()
1165                                 t.parent = new_element
1166                                 new_element.children.push t
1167                         # 17. Append that new element to furthest block.
1168                         new_element.parent = fb
1169                         fb.children.push new_element
1170                         # 18. Remove formatting element from the list of active formatting
1171                         # elements, and insert the new element into the list of active
1172                         # formatting elements at the position of the aforementioned
1173                         # bookmark.
1174                         for t, i in afe
1175                                 if t is fe
1176                                         afe.splice i, 1
1177                                         break
1178                         for t, i in afe
1179                                 if t is bookmark
1180                                         afe[i] = new_element
1181                                         break
1182                         # 19. Remove formatting element from the stack of open elements,
1183                         # and insert the new element into the stack of open elements
1184                         # immediately below the position of furthest block in that stack.
1185                         for t, i in open_els
1186                                 if t is fe
1187                                         open_els.splice i, 1
1188                                         break
1189                         for t, i in open_els
1190                                 if t is fb
1191                                         open_els.splice i, 0, new_element
1192                                         break
1193                         # 20. Jump back to the step labeled outer loop.
1194                 return
1195
1196         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1197         close_p_element = ->
1198                 generate_implied_end_tags 'p' # arg is exception
1199                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1200                         parse_error()
1201                 while open_els.length > 1 # just in case
1202                         el = open_els.shift()
1203                         if el.name is 'p' and el.namespace is NS_HTML
1204                                 return
1205                 return
1206         close_p_if_in_button_scope = ->
1207                 if is_in_button_scope 'p', NS_HTML
1208                         close_p_element()
1209                 return
1210
1211         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1212         # aka insert_a_character = (t) ->
1213         insert_character = (t) ->
1214                 dest = adjusted_insertion_location()
1215                 # fixfull check for Document node
1216                 if dest[1] > 0
1217                         prev = dest[0].children[dest[1] - 1]
1218                         if prev.type is TYPE_TEXT
1219                                 prev.text += t.text
1220                                 return
1221                 dest[0].children.splice dest[1], 0, t
1222                 return
1223
1224         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1225         process_token = (t) ->
1226                 acn = adjusted_current_node()
1227                 unless acn?
1228                         ins_mode t
1229                         return
1230                 if acn.namespace is NS_HTML
1231                         ins_mode t
1232                         return
1233                 if is_mathml_text_integration_point(acn)
1234                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1235                                 ins_mode t
1236                                 return
1237                         if t.type is TYPE_TEXT
1238                                 ins_mode t
1239                                 return
1240                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1241                         ins_mode t
1242                         return
1243                 if is_html_integration acn
1244                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1245                                 ins_mode t
1246                                 return
1247                 if t.type is TYPE_EOF
1248                         ins_mode t
1249                         return
1250                 in_foreign_content t
1251                 return
1252
1253         # 8.2.5.1
1254         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1255         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1256         adjusted_insertion_location = (override_target = null) ->
1257                 # 1. If there was an override target specified, then let target be the
1258                 # override target.
1259                 if override_target?
1260                         target = override_target
1261                 else # Otherwise, let target be the current node.
1262                         target = open_els[0]
1263                 # 2. Determine the adjusted insertion location using the first matching
1264                 # steps from the following list:
1265                 #
1266                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1267                 # thead, or tr element Foster parenting happens when content is
1268                 # misnested in tables.
1269                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1270                         loop # once. this is here so we can ``break`` to "abort these substeps"
1271                                 # 1. Let last template be the last template element in the
1272                                 # stack of open elements, if any.
1273                                 last_template = null
1274                                 last_template_i = null
1275                                 for el, i in open_els
1276                                         if el.name is 'template' and el.namespace is NS_HTML
1277                                                 last_template = el
1278                                                 last_template_i = i
1279                                                 break
1280                                 # 2. Let last table be the last table element in the stack of
1281                                 # open elements, if any.
1282                                 last_table = null
1283                                 last_table_i
1284                                 for el, i in open_els
1285                                         if el.name is 'table' and el.namespace is NS_HTML
1286                                                 last_table = el
1287                                                 last_table_i = i
1288                                                 break
1289                                 # 3. If there is a last template and either there is no last
1290                                 # table, or there is one, but last template is lower (more
1291                                 # recently added) than last table in the stack of open
1292                                 # elements, then: let adjusted insertion location be inside
1293                                 # last template's template contents, after its last child (if
1294                                 # any), and abort these substeps.
1295                                 if last_template and (last_table is null or last_template_i < last_table_i)
1296                                         target = last_template # fixfull should be it's contents
1297                                         target_i = target.children.length
1298                                         break
1299                                 # 4. If there is no last table, then let adjusted insertion
1300                                 # location be inside the first element in the stack of open
1301                                 # elements (the html element), after its last child (if any),
1302                                 # and abort these substeps. (fragment case)
1303                                 if last_table is null
1304                                         # this is odd
1305                                         target = open_els[open_els.length - 1]
1306                                         target_i = target.children.length
1307                                         break
1308                                 # 5. If last table has a parent element, then let adjusted
1309                                 # insertion location be inside last table's parent element,
1310                                 # immediately before last table, and abort these substeps.
1311                                 if last_table.parent?
1312                                         for c, i in last_table.parent.children
1313                                                 if c is last_table
1314                                                         target = last_table.parent
1315                                                         target_i = i
1316                                                         break
1317                                         break
1318                                 # 6. Let previous element be the element immediately above last
1319                                 # table in the stack of open elements.
1320                                 #
1321                                 # huh? how could it not have a parent?
1322                                 previous_element = open_els[last_table_i + 1]
1323                                 # 7. Let adjusted insertion location be inside previous
1324                                 # element, after its last child (if any).
1325                                 target = previous_element
1326                                 target_i = target.children.length
1327                                 # Note: These steps are involved in part because it's possible
1328                                 # for elements, the table element in this case in particular,
1329                                 # to have been moved by a script around in the DOM, or indeed
1330                                 # removed from the DOM entirely, after the element was inserted
1331                                 # by the parser.
1332                                 break # don't really loop
1333                 else
1334                         # Otherwise Let adjusted insertion location be inside target, after
1335                         # its last child (if any).
1336                         target_i = target.children.length
1337
1338                 # 3. If the adjusted insertion location is inside a template element,
1339                 # let it instead be inside the template element's template contents,
1340                 # after its last child (if any).
1341                 # fixfull (template)
1342
1343                 # 4. Return the adjusted insertion location.
1344                 return [target, target_i]
1345
1346         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1347         # aka create_an_element_for_token
1348         token_to_element = (t, namespace, intended_parent) ->
1349                 # convert attributes into a hash
1350                 attrs = {}
1351                 for a in t.attrs_a
1352                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1353                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1354
1355                 # TODO 2. If the newly created element has an xmlns attribute in the
1356                 # XMLNS namespace whose value is not exactly the same as the element's
1357                 # namespace, that is a parse error. Similarly, if the newly created
1358                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1359                 # value is not the XLink Namespace, that is a parse error.
1360
1361                 # fixfull: the spec says stuff about form pointers and ownerDocument
1362
1363                 return el
1364
1365         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1366         insert_foreign_element = (token, namespace) ->
1367                 ail = adjusted_insertion_location()
1368                 ail_el = ail[0]
1369                 ail_i = ail[1]
1370                 el = token_to_element token, namespace, ail_el
1371                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1372                 el.parent = ail_el
1373                 ail_el.children.splice ail_i, 0, el
1374                 open_els.unshift el
1375                 return el
1376         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1377         insert_html_element = (token) ->
1378                 return insert_foreign_element token, NS_HTML
1379
1380         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1381         # position should be [node, index_within_children]
1382         insert_comment = (t, position = null) ->
1383                 position ?= adjusted_insertion_location()
1384                 position[0].children.splice position[1], 0, t
1385                 return
1386
1387         # 8.2.5.2
1388         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1389         parse_generic_raw_text = (t) ->
1390                 insert_html_element t
1391                 tok_state = tok_state_rawtext
1392                 original_ins_mode = ins_mode
1393                 ins_mode = ins_mode_text
1394                 return
1395         parse_generic_rcdata_text = (t) ->
1396                 insert_html_element t
1397                 tok_state = tok_state_rcdata
1398                 original_ins_mode = ins_mode
1399                 ins_mode = ins_mode_text
1400                 return
1401
1402         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1403         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1404         generate_implied_end_tags = (except = null) ->
1405                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1406                         open_els.shift()
1407                 return
1408
1409         # 8.2.5.4 The rules for parsing tokens in HTML content
1410         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1411
1412         # 8.2.5.4.1 The "initial" insertion mode
1413         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1414         is_quirks_yes_doctype = (t) ->
1415                 if t.flag 'force-quirks'
1416                         return true
1417                 if t.name isnt 'html'
1418                         return true
1419                 if t.public_identifier?
1420                         pi = t.public_identifier.toLowerCase()
1421                         for p in quirks_yes_pi_prefixes
1422                                 if pi.substr(0, p.length) is p
1423                                         return true
1424                         if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1425                                 return true
1426                 if t.system_identifier?
1427                         if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1428                                 return true
1429                 else if t.public_identifier?
1430                         # already did this: pi = t.public_identifier.toLowerCase()
1431                         if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1432                                 return true
1433                 return false
1434         is_quirks_limited_doctype = (t) ->
1435                 if t.public_identifier?
1436                         pi = t.public_identifier.toLowerCase()
1437                         if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1438                                 return true
1439                         if t.system_identifier?
1440                                 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1441                                         return true
1442                 return false
1443         ins_mode_initial = (t) ->
1444                 if is_space_tok t
1445                         return
1446                 if t.type is TYPE_COMMENT
1447                         # ?fixfull
1448                         doc.children.push t
1449                         return
1450                 if t.type is TYPE_DOCTYPE
1451                         # fixfull syntax error from first paragraph and following bullets
1452                         # fixfull set doc.doctype
1453                         # fixfull is the "not an iframe srcdoc" thing relevant?
1454                         if is_quirks_yes_doctype t
1455                                 doc.flag 'quirks mode', QUIRKS_YES
1456                         else if is_quirks_limited_doctype t
1457                                 doc.flag 'quirks mode', QUIRKS_LIMITED
1458                         doc.children.push t
1459                         ins_mode = ins_mode_before_html
1460                         return
1461                 # Anything else
1462                 # fixfull not iframe srcdoc?
1463                 parse_error()
1464                 doc.flag 'quirks mode', QUIRKS_YES
1465                 ins_mode = ins_mode_before_html
1466                 process_token t
1467                 return
1468
1469         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1470         ins_mode_before_html = (t) ->
1471                 if t.type is TYPE_DOCTYPE
1472                         parse_error()
1473                         return
1474                 if t.type is TYPE_COMMENT
1475                         doc.children.push t
1476                         return
1477                 if is_space_tok t
1478                         return
1479                 if t.type is TYPE_START_TAG and t.name is 'html'
1480                         el = token_to_element t, NS_HTML, doc
1481                         doc.children.push el
1482                         el.document = doc
1483                         open_els.unshift(el)
1484                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1485                         ins_mode = ins_mode_before_head
1486                         return
1487                 if t.type is TYPE_END_TAG
1488                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1489                                 # fall through to "anything else"
1490                         else
1491                                 parse_error()
1492                                 return
1493                 # Anything else
1494                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1495                 doc.children.push el
1496                 el.document = doc
1497                 open_els.unshift el
1498                 # ?fixfull browsing context
1499                 ins_mode = ins_mode_before_head
1500                 process_token t
1501                 return
1502
1503         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1504         ins_mode_before_head = (t) ->
1505                 if is_space_tok t
1506                         return
1507                 if t.type is TYPE_COMMENT
1508                         insert_comment t
1509                         return
1510                 if t.type is TYPE_DOCTYPE
1511                         parse_error()
1512                         return
1513                 if t.type is TYPE_START_TAG and t.name is 'html'
1514                         ins_mode_in_body t
1515                         return
1516                 if t.type is TYPE_START_TAG and t.name is 'head'
1517                         el = insert_html_element t
1518                         head_element_pointer = el
1519                         ins_mode = ins_mode_in_head
1520                         return
1521                 if t.type is TYPE_END_TAG
1522                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1523                                 # fall through to Anything else below
1524                         else
1525                                 parse_error()
1526                                 return
1527                 # Anything else
1528                 el = insert_html_element new_open_tag 'head'
1529                 head_element_pointer = el
1530                 ins_mode = ins_mode_in_head
1531                 process_token t
1532                 return
1533
1534         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1535         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1536                 open_els.shift() # spec says this will be a 'head' node
1537                 ins_mode = ins_mode_after_head
1538                 process_token t
1539                 return
1540         ins_mode_in_head = (t) ->
1541                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1542                         insert_character t
1543                         return
1544                 if t.type is TYPE_COMMENT
1545                         insert_comment t
1546                         return
1547                 if t.type is TYPE_DOCTYPE
1548                         parse_error()
1549                         return
1550                 if t.type is TYPE_START_TAG and t.name is 'html'
1551                         ins_mode_in_body t
1552                         return
1553                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1554                         el = insert_html_element t
1555                         open_els.shift()
1556                         t.acknowledge_self_closing()
1557                         return
1558                 if t.type is TYPE_START_TAG and t.name is 'meta'
1559                         el = insert_html_element t
1560                         open_els.shift()
1561                         t.acknowledge_self_closing()
1562                         # fixfull encoding stuff
1563                         return
1564                 if t.type is TYPE_START_TAG and t.name is 'title'
1565                         parse_generic_rcdata_text t
1566                         return
1567                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1568                         parse_generic_raw_text t
1569                         return
1570                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1571                         insert_html_element t
1572                         ins_mode = ins_mode_in_head_noscript
1573                         return
1574                 if t.type is TYPE_START_TAG and t.name is 'script'
1575                         ail = adjusted_insertion_location()
1576                         el = token_to_element t, NS_HTML, ail
1577                         el.flag 'parser-inserted', true
1578                         # fixfull frament case
1579                         ail[0].children.splice ail[1], 0, el
1580                         open_els.unshift el
1581                         tok_state = tok_state_script_data
1582                         original_ins_mode = ins_mode # make sure orig... is defined
1583                         ins_mode = ins_mode_text
1584                         return
1585                 if t.type is TYPE_END_TAG and t.name is 'head'
1586                         open_els.shift() # will be a head element... spec says so
1587                         ins_mode = ins_mode_after_head
1588                         return
1589                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1590                         ins_mode_in_head_else t
1591                         return
1592                 if t.type is TYPE_START_TAG and t.name is 'template'
1593                         insert_html_element t
1594                         afe_push_marker()
1595                         flag_frameset_ok = false
1596                         ins_mode = ins_mode_in_template
1597                         template_ins_modes.unshift ins_mode_in_template
1598                         return
1599                 if t.type is TYPE_END_TAG and t.name is 'template'
1600                         if template_tag_is_open()
1601                                 generate_implied_end_tags
1602                                 if open_els[0].name isnt 'template'
1603                                         parse_error()
1604                                 loop
1605                                         el = open_els.shift()
1606                                         if el.name is 'template' and el.namespace is NS_HTML
1607                                                 break
1608                                 clear_afe_to_marker()
1609                                 template_ins_modes.shift()
1610                                 reset_ins_mode()
1611                         else
1612                                 parse_error()
1613                         return
1614                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1615                         parse_error()
1616                         return
1617                 ins_mode_in_head_else t
1618                 return
1619
1620         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1621         ins_mode_in_head_noscript_else = (t) ->
1622                 parse_error()
1623                 open_els.shift()
1624                 ins_mode = ins_mode_in_head
1625                 process_token t
1626                 return
1627         ins_mode_in_head_noscript = (t) ->
1628                 if t.type is TYPE_DOCTYPE
1629                         parse_error()
1630                         return
1631                 if t.type is TYPE_START_TAG and t.name is 'html'
1632                         ins_mode_in_body t
1633                         return
1634                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1635                         open_els.shift()
1636                         ins_mode = ins_mode_in_head
1637                         return
1638                 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1639                         ins_mode_in_head t
1640                         return
1641                 if t.type is TYPE_END_TAG and t.name is 'br'
1642                         ins_mode_in_head_noscript_else t
1643                         return
1644                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1645                         parse_error()
1646                         return
1647                 # Anything else
1648                 ins_mode_in_head_noscript_else t
1649                 return
1650
1651         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1652         ins_mode_after_head_else = (t) ->
1653                 body_tok = new_open_tag 'body'
1654                 insert_html_element body_tok
1655                 ins_mode = ins_mode_in_body
1656                 process_token t
1657                 return
1658         ins_mode_after_head = (t) ->
1659                 if is_space_tok t
1660                         insert_character t
1661                         return
1662                 if t.type is TYPE_COMMENT
1663                         insert_comment t
1664                         return
1665                 if t.type is TYPE_DOCTYPE
1666                         parse_error()
1667                         return
1668                 if t.type is TYPE_START_TAG and t.name is 'html'
1669                         ins_mode_in_body t
1670                         return
1671                 if t.type is TYPE_START_TAG and t.name is 'body'
1672                         insert_html_element t
1673                         flag_frameset_ok = false
1674                         ins_mode = ins_mode_in_body
1675                         return
1676                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1677                         insert_html_element t
1678                         ins_mode = ins_mode_in_frameset
1679                         return
1680                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1681                         parse_error()
1682                         open_els.unshift head_element_pointer
1683                         ins_mode_in_head t
1684                         for el, i in open_els
1685                                 if el is head_element_pointer
1686                                         open_els.splice i, 1
1687                                         return
1688                         return
1689                 if t.type is TYPE_END_TAG and t.name is 'template'
1690                         ins_mode_in_head t
1691                         return
1692                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1693                         ins_mode_after_head_else t
1694                         return
1695                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1696                         parse_error()
1697                         return
1698                 # Anything else
1699                 ins_mode_after_head_else t
1700                 return
1701
1702         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1703         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1704                 node = open_els[0]
1705                 loop
1706                         if node.name is name and node.namespace is NS_HTML
1707                                 generate_implied_end_tags name # arg is exception
1708                                 unless node is open_els[0]
1709                                         parse_error()
1710                                 loop
1711                                         el = open_els.shift()
1712                                         if el is node
1713                                                 return
1714                         if special_elements[node.name] is node.namespace
1715                                 parse_error()
1716                                 return
1717                         for el, i in open_els
1718                                 if node is el
1719                                         node = open_els[i + 1]
1720                                         break
1721                 return
1722         ins_mode_in_body = (t) ->
1723                 if t.type is TYPE_TEXT and t.text is "\u0000"
1724                         parse_error()
1725                         return
1726                 if is_space_tok t
1727                         reconstruct_afe()
1728                         insert_character t
1729                         return
1730                 if t.type is TYPE_TEXT
1731                         reconstruct_afe()
1732                         insert_character t
1733                         flag_frameset_ok = false
1734                         return
1735                 if t.type is TYPE_COMMENT
1736                         insert_comment t
1737                         return
1738                 if t.type is TYPE_DOCTYPE
1739                         parse_error()
1740                         return
1741                 if t.type is TYPE_START_TAG and t.name is 'html'
1742                         parse_error()
1743                         return if template_tag_is_open()
1744                         root_attrs = open_els[open_els.length - 1].attrs
1745                         for a in t.attrs_a
1746                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1747                         return
1748
1749                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1750                         ins_mode_in_head t
1751                         return
1752                 if t.type is TYPE_START_TAG and t.name is 'body'
1753                         parse_error()
1754                         return if open_els.length < 2
1755                         second = open_els[open_els.length - 2]
1756                         return unless second.namespace is NS_HTML
1757                         return unless second.name is 'body'
1758                         return if template_tag_is_open()
1759                         flag_frameset_ok = false
1760                         for a in t.attrs_a
1761                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1762                         return
1763                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1764                         parse_error()
1765                         return if open_els.length < 2
1766                         second_i = open_els.length - 2
1767                         second = open_els[second_i]
1768                         return unless second.namespace is NS_HTML
1769                         return unless second.name is 'body'
1770                         if flag_frameset_ok is false
1771                                 return
1772                         if second.parent?
1773                                 for el, i in second.parent.children
1774                                         if el is second
1775                                                 second.parent.children.splice i, 1
1776                                                 break
1777                         open_els.splice second_i, 1
1778                         # pop everything except the "root html element"
1779                         while open_els.length > 1
1780                                 open_els.shift()
1781                         insert_html_element t
1782                         ins_mode = ins_mode_in_frameset
1783                         return
1784                 if t.type is TYPE_EOF
1785                         ok_tags = {
1786                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1787                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1788                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1789                         }
1790                         for el in open_els
1791                                 unless ok_tags[t.name] is el.namespace
1792                                         parse_error()
1793                                         break
1794                         if template_ins_modes.length > 0
1795                                 ins_mode_in_template t
1796                         else
1797                                 stop_parsing()
1798                         return
1799                 if t.type is TYPE_END_TAG and t.name is 'body'
1800                         unless is_in_scope 'body', NS_HTML
1801                                 parse_error()
1802                                 return
1803                         ok_tags = {
1804                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1805                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1806                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1807                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1808                                 html:NS_HTML
1809                         }
1810                         for el in open_els
1811                                 unless ok_tags[t.name] is el.namespace
1812                                         parse_error()
1813                                         break
1814                         ins_mode = ins_mode_after_body
1815                         return
1816                 if t.type is TYPE_END_TAG and t.name is 'html'
1817                         unless is_in_scope 'body', NS_HTML
1818                                 parse_error()
1819                                 return
1820                         ok_tags = {
1821                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1822                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1823                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1824                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1825                                 html:NS_HTML
1826                         }
1827                         for el in open_els
1828                                 unless ok_tags[t.name] is el.namespace
1829                                         parse_error()
1830                                         break
1831                         ins_mode = ins_mode_after_body
1832                         process_token t
1833                         return
1834                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1835                         close_p_if_in_button_scope()
1836                         insert_html_element t
1837                         return
1838                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1839                         close_p_if_in_button_scope()
1840                         if h_tags[open_els[0].name] is open_els[0].namespace
1841                                 parse_error()
1842                                 open_els.shift()
1843                         insert_html_element t
1844                         return
1845                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1846                         close_p_if_in_button_scope()
1847                         insert_html_element t
1848                         eat_next_token_if_newline()
1849                         flag_frameset_ok = false
1850                         return
1851                 if t.type is TYPE_START_TAG and t.name is 'form'
1852                         unless form_element_pointer is null or template_tag_is_open()
1853                                 parse_error()
1854                                 return
1855                         close_p_if_in_button_scope()
1856                         el = insert_html_element t
1857                         unless template_tag_is_open()
1858                                 form_element_pointer = el
1859                         return
1860                 if t.type is TYPE_START_TAG and t.name is 'li'
1861                         flag_frameset_ok = false
1862                         for node in open_els
1863                                 if node.name is 'li' and node.namespace is NS_HTML
1864                                         generate_implied_end_tags 'li' # arg is exception
1865                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1866                                                 parse_error()
1867                                         loop
1868                                                 el = open_els.shift()
1869                                                 if el.name is 'li' and el.namespace is NS_HTML
1870                                                         break
1871                                         break
1872                                 if el_is_special_not_adp node
1873                                                 break
1874                         close_p_if_in_button_scope()
1875                         insert_html_element t
1876                         return
1877                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1878                         flag_frameset_ok = false
1879                         for node in open_els
1880                                 if node.name is 'dd' and node.namespace is NS_HTML
1881                                         generate_implied_end_tags 'dd' # arg is exception
1882                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1883                                                 parse_error()
1884                                         loop
1885                                                 el = open_els.shift()
1886                                                 if el.name is 'dd' and el.namespace is NS_HTML
1887                                                         break
1888                                         break
1889                                 if node.name is 'dt' and node.namespace is NS_HTML
1890                                         generate_implied_end_tags 'dt' # arg is exception
1891                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1892                                                 parse_error()
1893                                         loop
1894                                                 el = open_els.shift()
1895                                                 if el.name is 'dt' and el.namespace is NS_HTML
1896                                                         break
1897                                         break
1898                                 if el_is_special_not_adp node
1899                                         break
1900                         close_p_if_in_button_scope()
1901                         insert_html_element t
1902                         return
1903                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1904                         close_p_if_in_button_scope()
1905                         insert_html_element t
1906                         tok_state = tok_state_plaintext
1907                         return
1908                 if t.type is TYPE_START_TAG and t.name is 'button'
1909                         if is_in_scope 'button', NS_HTML
1910                                 parse_error()
1911                                 generate_implied_end_tags()
1912                                 loop
1913                                         el = open_els.shift()
1914                                         if el.name is 'button' and el.namespace is NS_HTML
1915                                                 break
1916                         reconstruct_afe()
1917                         insert_html_element t
1918                         flag_frameset_ok = false
1919                         return
1920                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1921                         unless is_in_scope t.name, NS_HTML
1922                                 parse_error()
1923                                 return
1924                         generate_implied_end_tags()
1925                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1926                                 parse_error()
1927                         loop
1928                                 el = open_els.shift()
1929                                 if el.name is t.name and el.namespace is NS_HTML
1930                                         return
1931                         return
1932                 if t.type is TYPE_END_TAG and t.name is 'form'
1933                         unless template_tag_is_open()
1934                                 node = form_element_pointer
1935                                 form_element_pointer = null
1936                                 if node is null or not el_is_in_scope node
1937                                         parse_error()
1938                                         return
1939                                 generate_implied_end_tags()
1940                                 if open_els[0] isnt node
1941                                         parse_error()
1942                                 for el, i in open_els
1943                                         if el is node
1944                                                 open_els.splice i, 1
1945                                                 break
1946                         else
1947                                 unless is_in_scope 'form', NS_HTML
1948                                         parse_error()
1949                                         return
1950                                 generate_implied_end_tags()
1951                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1952                                         parse_error()
1953                                 loop
1954                                         el = open_els.shift()
1955                                         if el.name is 'form' and el.namespace is NS_HTML
1956                                                 break
1957                         return
1958                 if t.type is TYPE_END_TAG and t.name is 'p'
1959                         unless is_in_button_scope 'p', NS_HTML
1960                                 parse_error()
1961                                 insert_html_element new_open_tag 'p'
1962                         close_p_element()
1963                         return
1964                 if t.type is TYPE_END_TAG and t.name is 'li'
1965                         unless is_in_li_scope 'li', NS_HTML
1966                                 parse_error()
1967                                 return
1968                         generate_implied_end_tags 'li' # arg is exception
1969                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1970                                 parse_error()
1971                         loop
1972                                 el = open_els.shift()
1973                                 if el.name is 'li' and el.namespace is NS_HTML
1974                                         break
1975                         return
1976                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1977                         unless is_in_scope t.name, NS_HTML
1978                                 parse_error()
1979                                 return
1980                         generate_implied_end_tags t.name # arg is exception
1981                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1982                                 parse_error()
1983                         loop
1984                                 el = open_els.shift()
1985                                 if el.name is t.name and el.namespace is NS_HTML
1986                                         break
1987                         return
1988                 if t.type is TYPE_END_TAG and h_tags[t.name]?
1989                         h_in_scope = false
1990                         for el in open_els
1991                                 if h_tags[el.name] is el.namespace
1992                                         h_in_scope = true
1993                                         break
1994                                 if standard_scopers[el.name] is el.namespace
1995                                         break
1996                         unless h_in_scope
1997                                 parse_error()
1998                                 return
1999                         generate_implied_end_tags()
2000                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2001                                 parse_error()
2002                         loop
2003                                 el = open_els.shift()
2004                                 if h_tags[el.name] is el.namespace
2005                                         break
2006                         return
2007                 # deep breath!
2008                 if t.type is TYPE_START_TAG and t.name is 'a'
2009                         # If the list of active formatting elements contains an a element
2010                         # between the end of the list and the last marker on the list (or
2011                         # the start of the list if there is no marker on the list), then
2012                         # this is a parse error; run the adoption agency algorithm for the
2013                         # tag name "a", then remove that element from the list of active
2014                         # formatting elements and the stack of open elements if the
2015                         # adoption agency algorithm didn't already remove it (it might not
2016                         # have if the element is not in table scope).
2017                         found = false
2018                         for el in afe
2019                                 if el.type is TYPE_AFE_MARKER
2020                                         break
2021                                 if el.name is 'a' and el.namespace is NS_HTML
2022                                         found = el
2023                         if found?
2024                                 parse_error()
2025                                 adoption_agency 'a'
2026                                 for el, i in afe
2027                                         if el is found
2028                                                 afe.splice i, 1
2029                                 for el, i in open_els
2030                                         if el is found
2031                                                 open_els.splice i, 1
2032                         reconstruct_afe()
2033                         el = insert_html_element t
2034                         afe_push el
2035                         return
2036                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2037                         reconstruct_afe()
2038                         el = insert_html_element t
2039                         afe_push el
2040                         return
2041                 if t.type is TYPE_START_TAG and t.name is 'nobr'
2042                         reconstruct_afe()
2043                         if is_in_scope 'nobr', NS_HTML
2044                                 parse_error()
2045                                 adoption_agency 'nobr'
2046                                 reconstruct_afe()
2047                         el = insert_html_element t
2048                         afe_push el
2049                         return
2050                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2051                         adoption_agency t.name
2052                         return
2053                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2054                         reconstruct_afe()
2055                         insert_html_element t
2056                         afe_push_marker()
2057                         flag_frameset_ok = false
2058                         return
2059                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2060                         unless is_in_scope t.name, NS_HTML
2061                                 parse_error()
2062                                 return
2063                         generate_implied_end_tags()
2064                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2065                                 parse_error()
2066                         loop
2067                                 el = open_els.shift()
2068                                 if el.name is t.name and el.namespace is NS_HTML
2069                                         break
2070                         clear_afe_to_marker()
2071                         return
2072                 if t.type is TYPE_START_TAG and t.name is 'table'
2073                         unless doc.flag('quirks mode') is QUIRKS_YES
2074                                 close_p_if_in_button_scope() # test
2075                         insert_html_element t
2076                         flag_frameset_ok = false
2077                         ins_mode = ins_mode_in_table
2078                         return
2079                 if t.type is TYPE_END_TAG and t.name is 'br'
2080                         parse_error()
2081                         # W3C: t.type = TYPE_START_TAG
2082                         t = new_open_tag 'br' # WHATWG
2083                         # fall through
2084                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2085                         reconstruct_afe()
2086                         insert_html_element t
2087                         open_els.shift()
2088                         t.acknowledge_self_closing()
2089                         flag_frameset_ok = false
2090                         return
2091                 if t.type is TYPE_START_TAG and t.name is 'input'
2092                         reconstruct_afe()
2093                         insert_html_element t
2094                         open_els.shift()
2095                         t.acknowledge_self_closing()
2096                         unless is_input_hidden_tok t
2097                                 flag_frameset_ok = false
2098                         return
2099                 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2100                         # WHATWG adds 'menuitem' for this block
2101                         insert_html_element t
2102                         open_els.shift()
2103                         t.acknowledge_self_closing()
2104                         return
2105                 if t.type is TYPE_START_TAG and t.name is 'hr'
2106                         close_p_if_in_button_scope()
2107                         insert_html_element t
2108                         open_els.shift()
2109                         t.acknowledge_self_closing()
2110                         flag_frameset_ok = false
2111                         return
2112                 if t.type is TYPE_START_TAG and t.name is 'image'
2113                         parse_error()
2114                         t.name = 'img'
2115                         process_token t
2116                         return
2117                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2118                         parse_error()
2119                         if template_tag_is_open() is false and form_element_pointer isnt null
2120                                 return
2121                         t.acknowledge_self_closing()
2122                         flag_frameset_ok = false
2123                         close_p_if_in_button_scope()
2124                         el = insert_html_element new_open_tag 'form'
2125                         unless template_tag_is_open()
2126                                 form_element_pointer = el
2127                         for a in t.attrs_a
2128                                 if a[0] is 'action'
2129                                         el.attrs['action'] = a[1]
2130                                         break
2131                         insert_html_element new_open_tag 'hr'
2132                         open_els.shift()
2133                         reconstruct_afe()
2134                         insert_html_element new_open_tag 'label'
2135                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2136                         input_el = new_open_tag 'input'
2137                         prompt = null
2138                         for a in t.attrs_a
2139                                 if a[0] is 'prompt'
2140                                         prompt = a[1]
2141                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2142                                         input_el.attrs_a.push [a[0], a[1]]
2143                         input_el.attrs_a.push ['name', 'isindex']
2144                         # fixfull this next bit is in english... internationalize?
2145                         prompt ?= "This is a searchable index. Enter search keywords: "
2146                         insert_character new_character_token prompt # fixfull split
2147                         # TODO submit typo "balue" in spec
2148                         insert_html_element input_el
2149                         open_els.shift()
2150                         # insert_character '' # you can put chars here if promt attr missing
2151                         open_els.shift()
2152                         insert_html_element new_open_tag 'hr'
2153                         open_els.shift()
2154                         open_els.shift()
2155                         unless template_tag_is_open()
2156                                 form_element_pointer = null
2157                         return
2158                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2159                         insert_html_element t
2160                         eat_next_token_if_newline()
2161                         tok_state = tok_state_rcdata
2162                         original_ins_mode = ins_mode
2163                         flag_frameset_ok = false
2164                         ins_mode = ins_mode_text
2165                         return
2166                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2167                         close_p_if_in_button_scope()
2168                         reconstruct_afe()
2169                         flag_frameset_ok = false
2170                         parse_generic_raw_text t
2171                         return
2172                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2173                         flag_frameset_ok = false
2174                         parse_generic_raw_text t
2175                         return
2176                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2177                         parse_generic_raw_text t
2178                         return
2179                 if t.type is TYPE_START_TAG and t.name is 'select'
2180                         reconstruct_afe()
2181                         insert_html_element t
2182                         flag_frameset_ok = false
2183                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2184                                 ins_mode = ins_mode_in_select_in_table
2185                         else
2186                                 ins_mode = ins_mode_in_select
2187                         return
2188                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2189                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2190                                 open_els.shift()
2191                         reconstruct_afe()
2192                         insert_html_element t
2193                         return
2194 # this comment block implements the W3C spec
2195 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2196 #                       if is_in_scope 'ruby', NS_HTML
2197 #                               generate_implied_end_tags()
2198 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2199 #                                       parse_error()
2200 #                       insert_html_element t
2201 #                       return
2202 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2203 #                       if is_in_scope 'ruby', NS_HTML
2204 #                               generate_implied_end_tags 'rtc' # arg is exception
2205 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2206 #                                       parse_error()
2207 #                       insert_html_element t
2208 #                       return
2209 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2210                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2211                         if is_in_scope 'ruby', NS_HTML
2212                                 generate_implied_end_tags()
2213                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2214                                         parse_error()
2215                         insert_html_element t
2216                         return
2217                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2218                         if is_in_scope 'ruby', NS_HTML
2219                                 generate_implied_end_tags 'rtc'
2220                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2221                                         parse_error()
2222                         insert_html_element t
2223                         return
2224 # end WHATWG chunk
2225                 if t.type is TYPE_START_TAG and t.name is 'math'
2226                         reconstruct_afe()
2227                         adjust_mathml_attributes t
2228                         adjust_foreign_attributes t
2229                         insert_foreign_element t, NS_MATHML
2230                         if t.flag 'self-closing'
2231                                 open_els.shift()
2232                                 t.acknowledge_self_closing()
2233                         return
2234                 if t.type is TYPE_START_TAG and t.name is 'svg'
2235                         reconstruct_afe()
2236                         adjust_svg_attributes t
2237                         adjust_foreign_attributes t
2238                         insert_foreign_element t, NS_SVG
2239                         if t.flag 'self-closing'
2240                                 open_els.shift()
2241                                 t.acknowledge_self_closing()
2242                         return
2243                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2244                         parse_error()
2245                         return
2246                 if t.type is TYPE_START_TAG # any other start tag
2247                         reconstruct_afe()
2248                         insert_html_element t
2249                         return
2250                 if t.type is TYPE_END_TAG # any other end tag
2251                         in_body_any_other_end_tag t.name
2252                         return
2253                 return
2254
2255         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2256         ins_mode_text = (t) ->
2257                 if t.type is TYPE_TEXT
2258                         insert_character t
2259                         return
2260                 if t.type is TYPE_EOF
2261                         parse_error()
2262                         if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2263                                 open_els[0].flag 'already started', true
2264                         open_els.shift()
2265                         ins_mode = original_ins_mode
2266                         process_token t
2267                         return
2268                 if t.type is TYPE_END_TAG and t.name is 'script'
2269                         open_els.shift()
2270                         ins_mode = original_ins_mode
2271                         # fixfull the spec seems to assume that I'm going to run the script
2272                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2273                         return
2274                 if t.type is TYPE_END_TAG
2275                         open_els.shift()
2276                         ins_mode = original_ins_mode
2277                         return
2278                 return
2279
2280         # the functions below implement the tokenizer stats described here:
2281         # http://www.w3.org/TR/html5/syntax.html#tokenization
2282
2283         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2284         ins_mode_in_table_else = (t) ->
2285                 parse_error()
2286                 flag_foster_parenting = true
2287                 ins_mode_in_body t
2288                 flag_foster_parenting = false
2289                 return
2290         ins_mode_in_table = (t) ->
2291                 switch t.type
2292                         when TYPE_TEXT
2293                                 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2294                                         pending_table_character_tokens = []
2295                                         original_ins_mode = ins_mode
2296                                         ins_mode = ins_mode_in_table_text
2297                                         process_token t
2298                                 else
2299                                         ins_mode_in_table_else t
2300                         when TYPE_COMMENT
2301                                 insert_comment t
2302                         when TYPE_DOCTYPE
2303                                 parse_error()
2304                         when TYPE_START_TAG
2305                                 switch t.name
2306                                         when 'caption'
2307                                                 clear_stack_to_table_context()
2308                                                 afe_push_marker()
2309                                                 insert_html_element t
2310                                                 ins_mode = ins_mode_in_caption
2311                                         when 'colgroup'
2312                                                 clear_stack_to_table_context()
2313                                                 insert_html_element t
2314                                                 ins_mode = ins_mode_in_column_group
2315                                         when 'col'
2316                                                 clear_stack_to_table_context()
2317                                                 insert_html_element new_open_tag 'colgroup'
2318                                                 ins_mode = ins_mode_in_column_group
2319                                                 process_token t
2320                                         when 'tbody', 'tfoot', 'thead'
2321                                                 clear_stack_to_table_context()
2322                                                 insert_html_element t
2323                                                 ins_mode = ins_mode_in_table_body
2324                                         when 'td', 'th', 'tr'
2325                                                 clear_stack_to_table_context()
2326                                                 insert_html_element new_open_tag 'tbody'
2327                                                 ins_mode = ins_mode_in_table_body
2328                                                 process_token t
2329                                         when 'table'
2330                                                 parse_error()
2331                                                 if is_in_table_scope 'table', NS_HTML
2332                                                         loop
2333                                                                 el = open_els.shift()
2334                                                                 if el.name is 'table' and el.namespace is NS_HTML
2335                                                                         break
2336                                                         reset_ins_mode()
2337                                                         process_token t
2338                                         when 'style', 'script', 'template'
2339                                                 ins_mode_in_head t
2340                                         when 'input'
2341                                                 unless is_input_hidden_tok t
2342                                                         ins_mode_in_table_else t
2343                                                 else
2344                                                         parse_error()
2345                                                         el = insert_html_element t
2346                                                         open_els.shift()
2347                                                         t.acknowledge_self_closing()
2348                                         when 'form'
2349                                                 parse_error()
2350                                                 if form_element_pointer?
2351                                                         return
2352                                                 if template_tag_is_open()
2353                                                         return
2354                                                 form_element_pointer = insert_html_element t
2355                                                 open_els.shift()
2356                                         else
2357                                                 ins_mode_in_table_else t
2358                         when TYPE_END_TAG
2359                                 switch t.name
2360                                         when 'table'
2361                                                 if is_in_table_scope 'table', NS_HTML
2362                                                         loop
2363                                                                 el = open_els.shift()
2364                                                                 if el.name is 'table' and el.namespace is NS_HTML
2365                                                                         break
2366                                                         reset_ins_mode()
2367                                                 else
2368                                                         parse_error()
2369                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2370                                                 parse_error()
2371                                         when 'template'
2372                                                 ins_mode_in_head t
2373                                         else
2374                                                 ins_mode_in_table_else t
2375                         when TYPE_EOF
2376                                 ins_mode_in_body t
2377                         else
2378                                 ins_mode_in_table_else t
2379                 return
2380
2381
2382         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2383         ins_mode_in_table_text = (t) ->
2384                 if t.type is TYPE_TEXT and t.text is "\u0000"
2385                         # from javascript?
2386                         parse_error()
2387                         return
2388                 if t.type is TYPE_TEXT
2389                         pending_table_character_tokens.push t
2390                         return
2391                 # Anything else
2392                 all_space = true
2393                 for old in pending_table_character_tokens
2394                         unless is_space_tok old
2395                                 all_space = false
2396                                 break
2397                 if all_space
2398                         for old in pending_table_character_tokens
2399                                 insert_character old
2400                 else
2401                         for old in pending_table_character_tokens
2402                                 ins_mode_in_table_else old
2403                 pending_table_character_tokens = []
2404                 ins_mode = original_ins_mode
2405                 process_token t
2406                 return
2407
2408         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2409         ins_mode_in_caption = (t) ->
2410                 if t.type is TYPE_END_TAG and t.name is 'caption'
2411                         if is_in_table_scope 'caption', NS_HTML
2412                                 generate_implied_end_tags()
2413                                 if open_els[0].name isnt 'caption'
2414                                         parse_error()
2415                                 loop
2416                                         el = open_els.shift()
2417                                         if el.name is 'caption' and el.namespace is NS_HTML
2418                                                 break
2419                                 clear_afe_to_marker()
2420                                 ins_mode = ins_mode_in_table
2421                         else
2422                                 parse_error()
2423                                 # fragment case
2424                         return
2425                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2426                         parse_error()
2427                         if is_in_table_scope 'caption', NS_HTML
2428                                 loop
2429                                         el = open_els.shift()
2430                                         if el.name is 'caption' and el.namespace is NS_HTML
2431                                                 break
2432                                 clear_afe_to_marker()
2433                                 ins_mode = ins_mode_in_table
2434                                 process_token t
2435                         # else fragment case
2436                         return
2437                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2438                         parse_error()
2439                         return
2440                 # Anything else
2441                 ins_mode_in_body t
2442                 return
2443
2444         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2445         ins_mode_in_column_group = (t) ->
2446                 if is_space_tok t
2447                         insert_character t
2448                         return
2449                 if t.type is TYPE_COMMENT
2450                         insert_comment t
2451                         return
2452                 if t.type is TYPE_DOCTYPE
2453                         parse_error()
2454                         return
2455                 if t.type is TYPE_START_TAG and t.name is 'html'
2456                         ins_mode_in_body t
2457                         return
2458                 if t.type is TYPE_START_TAG and t.name is 'col'
2459                         el = insert_html_element t
2460                         open_els.shift()
2461                         t.acknowledge_self_closing()
2462                         return
2463                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2464                         if open_els[0].name is 'colgroup' and open_els.namespace is NS_HTML
2465                                 open_els.shift()
2466                                 ins_mode = ins_mode_in_table
2467                         else
2468                                 parse_error()
2469                         return
2470                 if t.type is TYPE_END_TAG and t.name is 'col'
2471                         parse_error()
2472                         return
2473                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2474                         ins_mode_in_head t
2475                         return
2476                 if t.type is TYPE_EOF
2477                         ins_mode_in_body t
2478                         return
2479                 # Anything else
2480                 if open_els[0].name isnt 'colgroup'
2481                         parse_error()
2482                         return
2483                 open_els.shift()
2484                 ins_mode = ins_mode_in_table
2485                 process_token t
2486                 return
2487
2488         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2489         ins_mode_in_table_body = (t) ->
2490                 if t.type is TYPE_START_TAG and t.name is 'tr'
2491                         clear_stack_to_table_body_context()
2492                         insert_html_element t
2493                         ins_mode = ins_mode_in_row
2494                         return
2495                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2496                         parse_error()
2497                         clear_stack_to_table_body_context()
2498                         insert_html_element new_open_tag 'tr'
2499                         ins_mode = ins_mode_in_row
2500                         process_token t
2501                         return
2502                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2503                         unless is_in_table_scope t.name, NS_HTML
2504                                 parse_error()
2505                                 return
2506                         clear_stack_to_table_body_context()
2507                         open_els.shift()
2508                         ins_mode = ins_mode_in_table
2509                         return
2510                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2511                         has = false
2512                         for el in open_els
2513                                 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2514                                         has = true
2515                                         break
2516                                 if table_scopers[el.name] is el.namespace
2517                                         break
2518                         if !has
2519                                 parse_error()
2520                                 return
2521                         clear_stack_to_table_body_context()
2522                         open_els.shift()
2523                         ins_mode = ins_mode_in_table
2524                         process_token t
2525                         return
2526                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2527                         parse_error()
2528                         return
2529                 # Anything else
2530                 ins_mode_in_table t
2531                 return
2532
2533         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2534         ins_mode_in_row = (t) ->
2535                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2536                         clear_stack_to_table_row_context()
2537                         insert_html_element t
2538                         ins_mode = ins_mode_in_cell
2539                         afe_push_marker()
2540                         return
2541                 if t.type is TYPE_END_TAG and t.name is 'tr'
2542                         if is_in_table_scope 'tr', NS_HTML
2543                                 clear_stack_to_table_row_context()
2544                                 open_els.shift()
2545                                 ins_mode = ins_mode_in_table_body
2546                         else
2547                                 parse_error()
2548                         return
2549                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2550                         if is_in_table_scope 'tr', NS_HTML
2551                                 clear_stack_to_table_row_context()
2552                                 open_els.shift()
2553                                 ins_mode = ins_mode_in_table_body
2554                                 process_token t
2555                         else
2556                                 parse_error()
2557                         return
2558                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2559                         if is_in_table_scope t.name, NS_HTML
2560                                 if is_in_table_scope 'tr', NS_HTML
2561                                         clear_stack_to_table_row_context()
2562                                         open_els.shift()
2563                                         ins_mode = ins_mode_in_table_body
2564                                         process_token t
2565                         else
2566                                 parse_error()
2567                         return
2568                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2569                         parse_error()
2570                         return
2571                 # Anything else
2572                 ins_mode_in_table t
2573                 return
2574
2575         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2576         close_the_cell = ->
2577                 generate_implied_end_tags()
2578                 unless (open_els[0].name is 'td' or open_els[0] is 'th') and open_els[0].namespace is NS_HTML
2579                         parse_error()
2580                 loop
2581                         el = open_els.shift()
2582                         if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2583                                 break
2584                 clear_afe_to_marker()
2585                 ins_mode = ins_mode_in_row
2586                 return
2587
2588         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2589         ins_mode_in_cell = (t) ->
2590                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2591                         if is_in_table_scope t.name, NS_HTML
2592                                 generate_implied_end_tags()
2593                                 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2594                                         parse_error()
2595                                 loop
2596                                         el = open_els.shift()
2597                                         if el.name is t.name and el.namespace is NS_HTML
2598                                                 break
2599                                 clear_afe_to_marker()
2600                                 ins_mode = ins_mode_in_row
2601                         else
2602                                 parse_error()
2603                         return
2604                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2605                         has = false
2606                         for el in open_els
2607                                 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2608                                         has = true
2609                                         break
2610                                 if table_scopers[el.name] is el.namespace
2611                                         break
2612                         if !has
2613                                 parse_error()
2614                                 return
2615                         close_the_cell()
2616                         process_token t
2617                         return
2618                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2619                         parse_error()
2620                         return
2621                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2622                         if is_in_table_scope t.name, NS_HTML
2623                                 close_the_cell()
2624                                 process_token t
2625                         else
2626                                 parse_error()
2627                         return
2628                 # Anything Else
2629                 ins_mode_in_body t
2630                 return
2631
2632         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2633         ins_mode_in_select = (t) ->
2634                 if t.type is TYPE_TEXT and t.text is "\u0000"
2635                         parse_error()
2636                         return
2637                 if t.type is TYPE_TEXT
2638                         insert_character t
2639                         return
2640                 if t.type is TYPE_COMMENT
2641                         insert_comment t
2642                         return
2643                 if t.type is TYPE_DOCTYPE
2644                         parse_error()
2645                         return
2646                 if t.type is TYPE_START_TAG and t.name is 'html'
2647                         ins_mode_in_body t
2648                         return
2649                 if t.type is TYPE_START_TAG and t.name is 'option'
2650                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2651                                 open_els.shift()
2652                         insert_html_element t
2653                         return
2654                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2655                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2656                                 open_els.shift()
2657                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2658                                 open_els.shift()
2659                         insert_html_element t
2660                         return
2661                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2662                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2663                                 if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
2664                                         open_els.shift()
2665                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2666                                 open_els.shift()
2667                         else
2668                                 parse_error()
2669                         return
2670                 if t.type is TYPE_END_TAG and t.name is 'option'
2671                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2672                                 open_els.shift()
2673                         else
2674                                 parse_error()
2675                         return
2676                 if t.type is TYPE_END_TAG and t.name is 'select'
2677                         if is_in_select_scope 'select', NS_HTML
2678                                 loop
2679                                         el = open_els.shift()
2680                                         if el.name is 'select' and el.namespace is NS_HTML
2681                                                 break
2682                                 reset_ins_mode()
2683                         else
2684                                 parse_error()
2685                         return
2686                 if t.type is TYPE_START_TAG and t.name is 'select'
2687                         parse_error()
2688                         loop
2689                                 el = open_els.shift()
2690                                 if el.name is 'select' and el.namespace is NS_HTML
2691                                         break
2692                         reset_ins_mode()
2693                         # spec says that this is the same as </select> but it doesn't say
2694                         # to check scope first
2695                         return
2696                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2697                         parse_error()
2698                         unless is_in_select_scope 'select', NS_HTML
2699                                 return
2700                         loop
2701                                 el = open_els.shift()
2702                                 if el.name is 'select' and el.namespace is NS_HTML
2703                                         break
2704                         reset_ins_mode()
2705                         process_token t
2706                         return
2707                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2708                         ins_mode_in_head t
2709                         return
2710                 if t.type is TYPE_EOF
2711                         ins_mode_in_body t
2712                         return
2713                 # Anything else
2714                 parse_error()
2715                 return
2716
2717         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2718         ins_mode_in_select_in_table = (t) ->
2719                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2720                         parse_error()
2721                         loop
2722                                 el = open_els.shift()
2723                                 if el.name is 'select' and el.namespace is NS_HTML
2724                                         break
2725                         reset_ins_mode()
2726                         process_token t
2727                         return
2728                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2729                         parse_error()
2730                         unless is_in_table_scope t.name, NS_HTML
2731                                 return
2732                         loop
2733                                 el = open_els.shift()
2734                                 if el.name is 'select' and el.namespace is NS_HTML
2735                                         break
2736                         reset_ins_mode()
2737                         process_token t
2738                         return
2739                 # Anything else
2740                 ins_mode_in_select t
2741                 return
2742
2743         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2744         ins_mode_in_template = (t) ->
2745                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2746                         ins_mode_in_body t
2747                         return
2748                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2749                         ins_mode_in_head t
2750                         return
2751                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2752                         template_ins_modes.shift()
2753                         template_ins_modes.unshift ins_mode_in_table
2754                         ins_mode = ins_mode_in_table
2755                         process_token t
2756                         return
2757                 if t.type is TYPE_START_TAG and t.name is 'col'
2758                         template_ins_modes.shift()
2759                         template_ins_modes.unshift ins_mode_in_column_group
2760                         ins_mode = ins_mode_in_column_group
2761                         process_token t
2762                         return
2763                 if t.type is TYPE_START_TAG and t.name is 'tr'
2764                         template_ins_modes.shift()
2765                         template_ins_modes.unshift ins_mode_in_table_body
2766                         ins_mode = ins_mode_in_table_body
2767                         process_token t
2768                         return
2769                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2770                         template_ins_modes.shift()
2771                         template_ins_modes.unshift ins_mode_in_row
2772                         ins_mode = ins_mode_in_row
2773                         process_token t
2774                         return
2775                 if t.type is TYPE_START_TAG
2776                         template_ins_modes.shift()
2777                         template_ins_modes.unshift ins_mode_in_body
2778                         ins_mode = ins_mode_in_body
2779                         process_token t
2780                         return
2781                 if t.type is TYPE_END_TAG
2782                         parse_error()
2783                         return
2784                 if t.type is TYPE_EOF
2785                         unless template_tag_is_open()
2786                                 stop_parsing()
2787                                 return
2788                         parse_error()
2789                         loop
2790                                 el = open_els.shift()
2791                                 if el.name is 'template' and el.namespace is NS_HTML
2792                                         break
2793                         clear_afe_to_marker()
2794                         template_ins_modes.shift()
2795                         reset_ins_mode()
2796                         process_token t
2797                 return
2798
2799         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2800         ins_mode_after_body = (t) ->
2801                 if is_space_tok t
2802                         ins_mode_in_body t
2803                         return
2804                 if t.type is TYPE_COMMENT
2805                         first = open_els[open_els.length - 1]
2806                         insert_comment t, [first, first.children.length]
2807                         return
2808                 if t.type is TYPE_DOCTYPE
2809                         parse_error()
2810                         return
2811                 if t.type is TYPE_START_TAG and t.name is 'html'
2812                         ins_mode_in_body t
2813                         return
2814                 if t.type is TYPE_END_TAG and t.name is 'html'
2815                         if flag_fragment_parsing
2816                                 parse_error()
2817                                 return
2818                         ins_mode = ins_mode_after_after_body
2819                         return
2820                 if t.type is TYPE_EOF
2821                         stop_parsing()
2822                         return
2823                 # Anything ELse
2824                 parse_error()
2825                 ins_mode = ins_mode_in_body
2826                 process_token t
2827                 return
2828
2829         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2830         ins_mode_in_frameset = (t) ->
2831                 if is_space_tok t
2832                         insert_character t
2833                         return
2834                 if t.type is TYPE_COMMENT
2835                         insert_comment t
2836                         return
2837                 if t.type is TYPE_DOCTYPE
2838                         parse_error()
2839                         return
2840                 if t.type is TYPE_START_TAG and t.name is 'html'
2841                         ins_mode_in_body t
2842                         return
2843                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2844                         insert_html_element t
2845                         return
2846                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2847                         if open_els.length is 1
2848                                 parse_error()
2849                                 return # fragment case
2850                         open_els.shift()
2851                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2852                                 ins_mode = ins_mode_after_frameset
2853                         return
2854                 if t.type is TYPE_START_TAG and t.name is 'frame'
2855                         insert_html_element t
2856                         open_els.shift()
2857                         t.acknowledge_self_closing()
2858                         return
2859                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2860                         ins_mode_in_head t
2861                         return
2862                 if t.type is TYPE_EOF
2863                         if open_els.length isnt 1
2864                                 parse_error()
2865                         stop_parsing()
2866                         return
2867                 # Anything else
2868                 parse_error()
2869                 return
2870
2871         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2872         ins_mode_after_frameset = (t) ->
2873                 if is_space_tok t
2874                         insert_character t
2875                         return
2876                 if t.type is TYPE_COMMENT
2877                         insert_comment t
2878                         return
2879                 if t.type is TYPE_DOCTYPE
2880                         parse_error()
2881                         return
2882                 if t.type is TYPE_START_TAG and t.name is 'html'
2883                         ins_mode_in_body t
2884                         return
2885                 if t.type is TYPE_END_TAG and t.name is 'html'
2886                         ins_mode = ins_mode_after_after_frameset
2887                         return
2888                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2889                         ins_mode_in_head t
2890                         return
2891                 if t.type is TYPE_EOF
2892                         stop_parsing()
2893                         return
2894                 # Anything else
2895                 parse_error()
2896                 return
2897
2898         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2899         ins_mode_after_after_body = (t) ->
2900                 if t.type is TYPE_COMMENT
2901                         insert_comment t, [doc, doc.children.length]
2902                         return
2903                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2904                         ins_mode_in_body t
2905                         return
2906                 if t.type is TYPE_EOF
2907                         stop_parsing()
2908                         return
2909                 # Anything else
2910                 parse_error()
2911                 ins_mode = ins_mode_in_body
2912                 process_token t
2913                 return
2914
2915         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2916         ins_mode_after_after_frameset = (t) ->
2917                 if t.type is TYPE_COMMENT
2918                         insert_comment t, [doc, doc.children.length]
2919                         return
2920                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2921                         ins_mode_in_body t
2922                         return
2923                 if t.type is TYPE_EOF
2924                         stop_parsing()
2925                         return
2926                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2927                         ins_mode_in_head t
2928                         return
2929                 # Anything else
2930                 parse_error()
2931                 return
2932
2933         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2934         has_color_face_or_size = (t) ->
2935                 for a in t.attrs_a
2936                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2937                                 return true
2938                 return false
2939         in_foreign_content_end_script = ->
2940                 open_els.shift()
2941                 # fixfull
2942                 return
2943         in_foreign_content_other_start = (t) ->
2944                 acn = adjusted_current_node()
2945                 if acn.namespace is NS_MATHML
2946                         adjust_mathml_attributes t
2947                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2948                         t.name = svg_name_fixes[t.name]
2949                 if acn.namespace is NS_SVG
2950                         adjust_svg_attributes t
2951                 adjust_foreign_attributes t
2952                 insert_foreign_element t, acn.namespace
2953                 if t.flag 'self-closing'
2954                         if t.name is 'script'
2955                                 t.acknowledge_self_closing()
2956                                 in_foreign_content_end_script()
2957                                 # fixfull
2958                         else
2959                                 open_els.shift()
2960                                 t.acknowledge_self_closing()
2961                 return
2962         in_foreign_content = (t) ->
2963                 if t.type is TYPE_TEXT and t.text is "\u0000"
2964                         parse_error()
2965                         insert_character new_character_token "\ufffd"
2966                         return
2967                 if is_space_tok t
2968                         insert_character t
2969                         return
2970                 if t.type is TYPE_TEXT
2971                         flag_frameset_ok = false
2972                         insert_character t
2973                         return
2974                 if t.type is TYPE_COMMENT
2975                         insert_comment t
2976                         return
2977                 if t.type is TYPE_DOCTYPE
2978                         parse_error()
2979                         return
2980                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2981                         parse_error()
2982                         if flag_fragment_parsing
2983                                 in_foreign_content_other_start t
2984                                 return
2985                         loop # is this safe?
2986                                 open_els.shift()
2987                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
2988                                         break
2989                         process_token t
2990                         return
2991                 if t.type is TYPE_START_TAG
2992                         in_foreign_content_other_start t
2993                         return
2994                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2995                         in_foreign_content_end_script()
2996                         return
2997                 if t.type is TYPE_END_TAG
2998                         i = 0
2999                         node = open_els[i]
3000                         if node.name.toLowerCase() isnt t.name
3001                                 parse_error()
3002                         loop
3003                                 if node is open_els[open_els.length - 1]
3004                                         return
3005                                 if node.name.toLowerCase() is t.name
3006                                         loop
3007                                                 el = open_els.shift()
3008                                                 if el is node
3009                                                         return
3010                                 i += 1
3011                                 node = open_els[i]
3012                                 if node.namespace is NS_HTML
3013                                         break
3014                         ins_mode t # explicitly call HTML insertion mode
3015                 return
3016
3017
3018         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3019         tok_state_data = ->
3020                 switch c = txt.charAt(cur++)
3021                         when '&'
3022                                 return new_text_node parse_character_reference()
3023                         when '<'
3024                                 tok_state = tok_state_tag_open
3025                         when "\u0000"
3026                                 parse_error()
3027                                 return new_text_node c
3028                         when '' # EOF
3029                                 return new_eof_token()
3030                         else
3031                                 return new_text_node c
3032                 return null
3033
3034         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3035         # not needed: tok_state_character_reference_in_data = ->
3036         # just call parse_character_reference()
3037
3038         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3039         tok_state_rcdata = ->
3040                 switch c = txt.charAt(cur++)
3041                         when '&'
3042                                 return new_text_node parse_character_reference()
3043                         when '<'
3044                                 tok_state = tok_state_rcdata_less_than_sign
3045                         when "\u0000"
3046                                 parse_error()
3047                                 return new_character_token "\ufffd"
3048                         when '' # EOF
3049                                 return new_eof_token()
3050                         else
3051                                 return new_character_token c
3052                 return null
3053
3054         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3055         # not needed: tok_state_character_reference_in_rcdata = ->
3056         # just call parse_character_reference()
3057
3058         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3059         tok_state_rawtext = ->
3060                 switch c = txt.charAt(cur++)
3061                         when '<'
3062                                 tok_state = tok_state_rawtext_less_than_sign
3063                         when "\u0000"
3064                                 parse_error()
3065                                 return new_character_token "\ufffd"
3066                         when '' # EOF
3067                                 return new_eof_token()
3068                         else
3069                                 return new_character_token c
3070                 return null
3071
3072         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3073         tok_state_script_data = ->
3074                 switch c = txt.charAt(cur++)
3075                         when '<'
3076                                 tok_state = tok_state_script_data_less_than_sign
3077                         when "\u0000"
3078                                 parse_error()
3079                                 return new_character_token "\ufffd"
3080                         when '' # EOF
3081                                 return new_eof_token()
3082                         else
3083                                 return new_character_token c
3084                 return null
3085
3086         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3087         tok_state_plaintext = ->
3088                 switch c = txt.charAt(cur++)
3089                         when "\u0000"
3090                                 parse_error()
3091                                 return new_character_token "\ufffd"
3092                         when '' # EOF
3093                                 return new_eof_token()
3094                         else
3095                                 return new_character_token c
3096                 return null
3097
3098
3099         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3100         tok_state_tag_open = ->
3101                 c = txt.charAt(cur++)
3102                 if c is '!'
3103                         tok_state = tok_state_markup_declaration_open
3104                         return
3105                 if c is '/'
3106                         tok_state = tok_state_end_tag_open
3107                         return
3108                 if is_uc_alpha(c)
3109                         tok_cur_tag = new_open_tag c.toLowerCase()
3110                         tok_state = tok_state_tag_name
3111                         return
3112                 if is_lc_alpha(c)
3113                         tok_cur_tag = new_open_tag c
3114                         tok_state = tok_state_tag_name
3115                         return
3116                 if c is '?'
3117                         parse_error()
3118                         tok_cur_tag = new_comment_token '?' # FIXME right?
3119                         tok_state = tok_state_bogus_comment
3120                         return
3121                 # Anything else
3122                 parse_error()
3123                 tok_state = tok_state_data
3124                 cur -= 1 # we didn't parse/handle the char after <
3125                 return new_text_node '<'
3126
3127         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3128         tok_state_end_tag_open = ->
3129                 c = txt.charAt(cur++)
3130                 if is_uc_alpha(c)
3131                         tok_cur_tag = new_end_tag c.toLowerCase()
3132                         tok_state = tok_state_tag_name
3133                         return
3134                 if is_lc_alpha(c)
3135                         tok_cur_tag = new_end_tag c
3136                         tok_state = tok_state_tag_name
3137                         return
3138                 if c is '>'
3139                         parse_error()
3140                         tok_state = tok_state_data
3141                         return
3142                 if c is '' # EOF
3143                         parse_error()
3144                         tok_state = tok_state_data
3145                         return new_text_node '</'
3146                 # Anything else
3147                 parse_error()
3148                 tok_cur_tag = new_comment_token c
3149                 tok_state = tok_state_bogus_comment
3150                 return null
3151
3152         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3153         tok_state_tag_name = ->
3154                 switch c = txt.charAt(cur++)
3155                         when "\t", "\n", "\u000c", ' '
3156                                 tok_state = tok_state_before_attribute_name
3157                         when '/'
3158                                 tok_state = tok_state_self_closing_start_tag
3159                         when '>'
3160                                 tok_state = tok_state_data
3161                                 tmp = tok_cur_tag
3162                                 tok_cur_tag = null
3163                                 return tmp
3164                         when "\u0000"
3165                                 parse_error()
3166                                 tok_cur_tag.name += "\ufffd"
3167                         when '' # EOF
3168                                 parse_error()
3169                                 tok_state = tok_state_data
3170                         else
3171                                 if is_uc_alpha(c)
3172                                         tok_cur_tag.name += c.toLowerCase()
3173                                 else
3174                                         tok_cur_tag.name += c
3175                 return null
3176
3177         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3178         tok_state_rcdata_less_than_sign = ->
3179                 c = txt.charAt(cur++)
3180                 if c is '/'
3181                         temporary_buffer = ''
3182                         tok_state = tok_state_rcdata_end_tag_open
3183                         return null
3184                 # Anything else
3185                 tok_state = tok_state_rcdata
3186                 cur -= 1 # reconsume the input character
3187                 return new_character_token '<'
3188
3189         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3190         tok_state_rcdata_end_tag_open = ->
3191                 c = txt.charAt(cur++)
3192                 if is_uc_alpha(c)
3193                         tok_cur_tag = new_end_tag c.toLowerCase()
3194                         temporary_buffer += c
3195                         tok_state = tok_state_rcdata_end_tag_name
3196                         return null
3197                 if is_lc_alpha(c)
3198                         tok_cur_tag = new_end_tag c
3199                         temporary_buffer += c
3200                         tok_state = tok_state_rcdata_end_tag_name
3201                         return null
3202                 # Anything else
3203                 tok_state = tok_state_rcdata
3204                 cur -= 1 # reconsume the input character
3205                 return new_character_token "</" # fixfull separate these
3206
3207         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3208         is_appropriate_end_tag = (t) ->
3209                 # fixfull: this assumes that open_els[0].name is "the tag name of the last
3210                 # start tag to have been emitted from this tokenizer"
3211                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3212
3213         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3214         tok_state_rcdata_end_tag_name = ->
3215                 c = txt.charAt(cur++)
3216                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3217                         if is_appropriate_end_tag tok_cur_tag
3218                                 tok_state = tok_state_before_attribute_name
3219                                 return
3220                         # else fall through to "Anything else"
3221                 if c is '/'
3222                         if is_appropriate_end_tag tok_cur_tag
3223                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3224                                 return
3225                         # else fall through to "Anything else"
3226                 if c is '>'
3227                         if is_appropriate_end_tag tok_cur_tag
3228                                 tok_state = tok_state_data
3229                                 return tok_cur_tag
3230                         # else fall through to "Anything else"
3231                 if is_uc_alpha(c)
3232                         tok_cur_tag.name += c.toLowerCase()
3233                         temporary_buffer += c
3234                         return null
3235                 if is_lc_alpha(c)
3236                         tok_cur_tag.name += c
3237                         temporary_buffer += c
3238                         return null
3239                 # Anything else
3240                 tok_state = tok_state_rcdata
3241                 cur -= 1 # reconsume the input character
3242                 return new_character_token '</' + temporary_buffer # fixfull separate these
3243
3244         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3245         tok_state_rawtext_less_than_sign = ->
3246                 c = txt.charAt(cur++)
3247                 if c is '/'
3248                         temporary_buffer = ''
3249                         tok_state = tok_state_rawtext_end_tag_open
3250                         return null
3251                 # Anything else
3252                 tok_state = tok_state_rawtext
3253                 cur -= 1 # reconsume the input character
3254                 return new_character_token '<'
3255
3256         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3257         tok_state_rawtext_end_tag_open = ->
3258                 c = txt.charAt(cur++)
3259                 if is_uc_alpha(c)
3260                         tok_cur_tag = new_end_tag c.toLowerCase()
3261                         temporary_buffer += c
3262                         tok_state = tok_state_rawtext_end_tag_name
3263                         return null
3264                 if is_lc_alpha(c)
3265                         tok_cur_tag = new_end_tag c
3266                         temporary_buffer += c
3267                         tok_state = tok_state_rawtext_end_tag_name
3268                         return null
3269                 # Anything else
3270                 tok_state = tok_state_rawtext
3271                 cur -= 1 # reconsume the input character
3272                 return new_character_token "</" # fixfull separate these
3273
3274         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3275         tok_state_rawtext_end_tag_name = ->
3276                 c = txt.charAt(cur++)
3277                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3278                         if is_appropriate_end_tag tok_cur_tag
3279                                 tok_state = tok_state_before_attribute_name
3280                                 return
3281                         # else fall through to "Anything else"
3282                 if c is '/'
3283                         if is_appropriate_end_tag tok_cur_tag
3284                                 tok_state = tok_state_self_closing_start_tag
3285                                 return
3286                         # else fall through to "Anything else"
3287                 if c is '>'
3288                         if is_appropriate_end_tag tok_cur_tag
3289                                 tok_state = tok_state_data
3290                                 return tok_cur_tag
3291                         # else fall through to "Anything else"
3292                 if is_uc_alpha(c)
3293                         tok_cur_tag.name += c.toLowerCase()
3294                         temporary_buffer += c
3295                         return null
3296                 if is_lc_alpha(c)
3297                         tok_cur_tag.name += c
3298                         temporary_buffer += c
3299                         return null
3300                 # Anything else
3301                 tok_state = tok_state_rawtext
3302                 cur -= 1 # reconsume the input character
3303                 return new_character_token '</' + temporary_buffer # fixfull separate these
3304
3305         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3306         tok_state_script_data_less_than_sign = ->
3307                 c = txt.charAt(cur++)
3308                 if c is '/'
3309                         temporary_buffer = ''
3310                         tok_state = tok_state_script_data_end_tag_open
3311                         return
3312                 if c is '!'
3313                         tok_state = tok_state_script_data_escape_start
3314                         return new_character_token '<!' # fixfull split
3315                 # Anything else
3316                 tok_state = tok_state_script_data
3317                 cur -= 1 # Reconsume
3318                 return new_character_token '<'
3319
3320         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3321         tok_state_script_data_end_tag_open = ->
3322                 c = txt.charAt(cur++)
3323                 if is_uc_alpha(c)
3324                         tok_cur_tag = new_end_tag c.toLowerCase()
3325                         temporary_buffer += c
3326                         tok_state = tok_state_script_data_end_tag_name
3327                         return
3328                 if is_lc_alpha(c)
3329                         tok_cur_tag = new_end_tag c
3330                         temporary_buffer += c
3331                         tok_state = tok_state_script_data_end_tag_name
3332                         return
3333                 # Anything else
3334                 tok_state = tok_state_script_data
3335                 cur -= 1 # Reconsume
3336                 return new_character_token '</'
3337
3338         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3339         tok_state_script_data_end_tag_name = ->
3340                 c = txt.charAt(cur++)
3341                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3342                         if is_appropriate_end_tag tok_cur_tag
3343                                 tok_state = tok_state_before_attribute_name
3344                                 return
3345                         # fall through
3346                 if c is '/'
3347                         if is_appropriate_end_tag tok_cur_tag
3348                                 tok_state = tok_state_self_closing_start_tag
3349                                 return
3350                         # fall through
3351                 if c is '>'
3352                         if is_appropriate_end_tag tok_cur_tag
3353                                 tok_state = tok_state_data
3354                                 return tok_cur_tag
3355                         # fall through
3356                 if is_uc_alpha(c)
3357                         tok_cur_tag.name += c.toLowerCase()
3358                         temporary_buffer += c
3359                         return
3360                 if is_lc_alpha(c)
3361                         tok_cur_tag.name += c
3362                         temporary_buffer += c
3363                         return
3364                 # Anything else
3365                 tok_state = tok_state_script_data
3366                 cur -= 1 # Reconsume
3367                 return new_character_token "</#{temporary_buffer}" # fixfull split
3368
3369         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3370         tok_state_script_data_escape_start = ->
3371                 c = txt.charAt(cur++)
3372                 if c is '-'
3373                         tok_state = tok_state_script_data_escape_start_dash
3374                         return new_character_token '-'
3375                 # Anything else
3376                 tok_state = tok_state_script_data
3377                 cur -= 1 # Reconsume
3378                 return
3379
3380         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3381         tok_state_script_data_escape_start_dash = ->
3382                 c = txt.charAt(cur++)
3383                 if c is '-'
3384                         tok_state = tok_state_script_data_escaped_dash_dash
3385                         return new_character_token '-'
3386                 # Anything else
3387                 tok_state = tok_state_script_data
3388                 cur -= 1 # Reconsume
3389                 return
3390
3391         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3392         tok_state_script_data_escaped = ->
3393                 c = txt.charAt(cur++)
3394                 if c is '-'
3395                         tok_state = tok_state_script_data_escaped_dash
3396                         return new_character_token '-'
3397                 if c is '<'
3398                         tok_state = tok_state_script_data_escaped_less_than_sign
3399                         return
3400                 if c is "\u0000"
3401                         parse_error()
3402                         return new_character_token "\ufffd"
3403                 if c is '' # EOF
3404                         tok_state = tok_state_data
3405                         parse_error()
3406                         cur -= 1 # Reconsume
3407                         return
3408                 # Anything else
3409                 return new_character_token c
3410
3411         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3412         tok_state_script_data_escaped_dash = ->
3413                 c = txt.charAt(cur++)
3414                 if c is '-'
3415                         tok_state = tok_state_script_data_escaped_dash_dash
3416                         return new_character_token '-'
3417                 if c is '<'
3418                         tok_state = tok_state_script_data_escaped_less_than_sign
3419                         return
3420                 if c is "\u0000"
3421                         parse_error()
3422                         tok_state = tok_state_script_data_escaped
3423                         return new_character_token "\ufffd"
3424                 if c is '' # EOF
3425                         tok_state = tok_state_data
3426                         parse_error()
3427                         cur -= 1 # Reconsume
3428                         return
3429                 # Anything else
3430                 tok_state = tok_state_script_data_escaped
3431                 return new_character_token c
3432
3433         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3434         tok_state_script_data_escaped_dash_dash = ->
3435                 c = txt.charAt(cur++)
3436                 if c is '-'
3437                         return new_character_token '-'
3438                 if c is '<'
3439                         tok_state = tok_state_script_data_escaped_less_than_sign
3440                         return
3441                 if c is '>'
3442                         tok_state = tok_state_script_data
3443                         return new_character_token '>'
3444                 if c is "\u0000"
3445                         parse_error()
3446                         tok_state = tok_state_script_data_escaped
3447                         return new_character_token "\ufffd"
3448                 if c is '' # EOF
3449                         parse_error()
3450                         tok_state = tok_state_data
3451                         cur -= 1 # Reconsume
3452                         return
3453                 # Anything else
3454                 tok_state = tok_state_script_data_escaped
3455                 return new_character_token c
3456
3457         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3458         tok_state_script_data_escaped_less_than_sign = ->
3459                 c = txt.charAt(cur++)
3460                 if c is '/'
3461                         temporary_buffer = ''
3462                         tok_state = tok_state_script_data_escaped_end_tag_open
3463                         return
3464                 if is_uc_alpha(c)
3465                         temporary_buffer = c.toLowerCase() # yes, really
3466                         tok_state = tok_state_script_data_double_escape_start
3467                         return new_character_token "<#{c}" # fixfull split
3468                 if is_lc_alpha(c)
3469                         temporary_buffer = c
3470                         tok_state = tok_state_script_data_double_escape_start
3471                         return new_character_token "<#{c}" # fixfull split
3472                 # Anything else
3473                 tok_state = tok_state_script_data_escaped
3474                 cur -= 1 # Reconsume
3475                 return new_character_token '<'
3476
3477         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3478         tok_state_script_data_escaped_end_tag_open = ->
3479                 c = txt.charAt(cur++)
3480                 if is_uc_alpha(c)
3481                         tok_cur_tag = new_end_tag c.toLowerCase()
3482                         temporary_buffer += c
3483                         tok_state = tok_state_script_data_escaped_end_tag_name
3484                         return
3485                 if is_lc_alpha(c)
3486                         tok_cur_tag = new_end_tag c
3487                         temporary_buffer += c
3488                         tok_state = tok_state_script_data_escaped_end_tag_name
3489                         return
3490                 # Anything else
3491                 tok_state = tok_state_script_data_escaped
3492                 cur -= 1 # Reconsume
3493                 return new_character_token '</' # fixfull split
3494
3495         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3496         tok_state_script_data_escaped_end_tag_name = ->
3497                 c = txt.charAt(cur++)
3498                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3499                         if is_appropriate_end_tag tok_cur_tag
3500                                 tok_state = tok_state_before_attribute_name
3501                                 return
3502                         # fall through
3503                 if c is '/'
3504                         if is_appropriate_end_tag tok_cur_tag
3505                                 tok_state = tok_state_self_closing_start_tag
3506                                 return
3507                         # fall through
3508                 if c is '>'
3509                         if is_appropriate_end_tag tok_cur_tag
3510                                 tok_state = tok_state_data
3511                                 return tok_cur_tag
3512                         # fall through
3513                 if is_uc_alpha(c)
3514                         tok_cur_tag.name += c.toLowerCase()
3515                         temporary_buffer += c.toLowerCase()
3516                         return
3517                 if is_lc_alpha(c)
3518                         tok_cur_tag.name += c
3519                         temporary_buffer += c.toLowerCase()
3520                         return
3521                 # Anything else
3522                 tok_state = tok_state_script_data_escaped
3523                 cur -= 1 # Reconsume
3524                 return new_character_token "</#{temporary_buffer}" # fixfull split
3525
3526         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3527         tok_state_script_data_double_escape_start = ->
3528                 c = txt.charAt(cur++)
3529                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3530                         if temporary_buffer is 'script'
3531                                 tok_state = tok_state_script_data_double_escaped
3532                         else
3533                                 tok_state = tok_state_script_data_escaped
3534                         return new_character_token c
3535                 if is_uc_alpha(c)
3536                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3537                         return new_character_token c
3538                 if is_lc_alpha(c)
3539                         temporary_buffer += c
3540                         return new_character_token c
3541                 # Anything else
3542                 tok_state = tok_state_script_data_escaped
3543                 cur -= 1 # Reconsume
3544                 return
3545
3546         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3547         tok_state_script_data_double_escaped = ->
3548                 c = txt.charAt(cur++)
3549                 if c is '-'
3550                         tok_state = tok_state_script_data_double_escaped_dash
3551                         return new_character_token '-'
3552                 if c is '<'
3553                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3554                         return new_character_token '<'
3555                 if c is "\u0000"
3556                         parse_error()
3557                         return new_character_token "\ufffd"
3558                 if c is '' # EOF
3559                         parse_error()
3560                         tok_state = tok_state_data
3561                         cur -= 1 # Reconsume
3562                         return
3563                 # Anything else
3564                 return new_character_token c
3565
3566         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3567         tok_state_script_data_double_escaped_dash = ->
3568                 c = txt.charAt(cur++)
3569                 if c is '-'
3570                         tok_state = tok_state_script_data_double_escaped_dash_dash
3571                         return new_character_token '-'
3572                 if c is '<'
3573                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3574                         return new_character_token '<'
3575                 if c is "\u0000"
3576                         parse_error()
3577                         tok_state = tok_state_script_data_double_escaped
3578                         return new_character_token "\ufffd"
3579                 if c is '' # EOF
3580                         parse_error()
3581                         tok_state = tok_state_data
3582                         cur -= 1 # Reconsume
3583                         return
3584                 # Anything else
3585                 tok_state = tok_state_script_data_double_escaped
3586                 return new_character_token c
3587
3588         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3589         tok_state_script_data_double_escaped_dash_dash = ->
3590                 c = txt.charAt(cur++)
3591                 if c is '-'
3592                         return new_character_token '-'
3593                 if c is '<'
3594                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3595                         return new_character_token '<'
3596                 if c is '>'
3597                         tok_state = tok_state_script_data
3598                         return new_character_token '>'
3599                 if c is "\u0000"
3600                         parse_error()
3601                         tok_state = tok_state_script_data_double_escaped
3602                         return new_character_token "\ufffd"
3603                 if c is '' # EOF
3604                         parse_error()
3605                         tok_state = tok_state_data
3606                         cur -= 1 # Reconsume
3607                         return
3608                 # Anything else
3609                 tok_state = tok_state_script_data_double_escaped
3610                 return new_character_token c
3611
3612         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3613         tok_state_script_data_double_escaped_less_than_sign = ->
3614                 c = txt.charAt(cur++)
3615                 if c is '/'
3616                         temporary_buffer = ''
3617                         tok_state = tok_state_script_data_double_escape_end
3618                         return new_character_token '/'
3619                 # Anything else
3620                 tok_state = tok_state_script_data_double_escaped
3621                 cur -= 1 # Reconsume
3622                 return
3623
3624         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3625         tok_state_script_data_double_escape_end = ->
3626                 c = txt.charAt(cur++)
3627                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3628                         if temporary_buffer is 'script'
3629                                 tok_state = tok_state_script_data_escaped
3630                         else
3631                                 tok_state = tok_state_script_data_double_escaped
3632                         return new_character_token c
3633                 if is_uc_alpha(c)
3634                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3635                         return new_character_token c
3636                 if is_lc_alpha(c)
3637                         temporary_buffer += c
3638                         return new_character_token c
3639                 # Anything else
3640                 tok_state = tok_state_script_data_double_escaped
3641                 cur -= 1 # Reconsume
3642                 return
3643
3644         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3645         tok_state_before_attribute_name = ->
3646                 attr_name = null
3647                 switch c = txt.charAt(cur++)
3648                         when "\t", "\n", "\u000c", ' '
3649                                 return null
3650                         when '/'
3651                                 tok_state = tok_state_self_closing_start_tag
3652                                 return null
3653                         when '>'
3654                                 tok_state = tok_state_data
3655                                 tmp = tok_cur_tag
3656                                 tok_cur_tag = null
3657                                 return tmp
3658                         when "\u0000"
3659                                 parse_error()
3660                                 attr_name = "\ufffd"
3661                         when '"', "'", '<', '='
3662                                 parse_error()
3663                                 attr_name = c
3664                         when '' # EOF
3665                                 parse_error()
3666                                 tok_state = tok_state_data
3667                         else
3668                                 if is_uc_alpha(c)
3669                                         attr_name = c.toLowerCase()
3670                                 else
3671                                         attr_name = c
3672                 if attr_name?
3673                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3674                         tok_state = tok_state_attribute_name
3675                 return null
3676
3677         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3678         tok_state_attribute_name = ->
3679                 switch c = txt.charAt(cur++)
3680                         when "\t", "\n", "\u000c", ' '
3681                                 tok_state = tok_state_after_attribute_name
3682                         when '/'
3683                                 tok_state = tok_state_self_closing_start_tag
3684                         when '='
3685                                 tok_state = tok_state_before_attribute_value
3686                         when '>'
3687                                 tok_state = tok_state_data
3688                                 tmp = tok_cur_tag
3689                                 tok_cur_tag = null
3690                                 return tmp
3691                         when "\u0000"
3692                                 parse_error()
3693                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3694                         when '"', "'", '<'
3695                                 parse_error()
3696                                 tok_cur_tag.attrs_a[0][0] += c
3697                         when '' # EOF
3698                                 parse_error()
3699                                 tok_state = tok_state_data
3700                         else
3701                                 if is_uc_alpha(c)
3702                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3703                                 else
3704                                         tok_cur_tag.attrs_a[0][0] += c
3705                 return null
3706
3707         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3708         tok_state_after_attribute_name = ->
3709                 c = txt.charAt(cur++)
3710                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3711                         return
3712                 if c is '/'
3713                         tok_state = tok_state_self_closing_start_tag
3714                         return
3715                 if c is '='
3716                         tok_state = tok_state_before_attribute_value
3717                         return
3718                 if c is '>'
3719                         tok_state = tok_state_data
3720                         return tok_cur_tag
3721                 if is_uc_alpha(c)
3722                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3723                         tok_state = tok_state_attribute_name
3724                         return
3725                 if c is "\u0000"
3726                         parse_error()
3727                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3728                         tok_state = tok_state_attribute_name
3729                         return
3730                 if c is '' # EOF
3731                         parse_error()
3732                         tok_state = tok_state_data
3733                         cur -= 1 # reconsume
3734                         return
3735                 if c is '"' or c is "'" or c is '<'
3736                         parse_error()
3737                         # fall through to Anything else
3738                 # Anything else
3739                 tok_cur_tag.attrs_a.unshift [c, '']
3740                 tok_state = tok_state_attribute_name
3741                 return
3742
3743         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3744         tok_state_before_attribute_value = ->
3745                 switch c = txt.charAt(cur++)
3746                         when "\t", "\n", "\u000c", ' '
3747                                 return null
3748                         when '"'
3749                                 tok_state = tok_state_attribute_value_double_quoted
3750                         when '&'
3751                                 tok_state = tok_state_attribute_value_unquoted
3752                                 cur -= 1
3753                         when "'"
3754                                 tok_state = tok_state_attribute_value_single_quoted
3755                         when "\u0000"
3756                                 # Parse error
3757                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3758                                 tok_state = tok_state_attribute_value_unquoted
3759                         when '>'
3760                                 # Parse error
3761                                 tok_state = tok_state_data
3762                                 tmp = tok_cur_tag
3763                                 tok_cur_tag = null
3764                                 return tmp
3765                         when '' # EOF
3766                                 parse_error()
3767                                 tok_state = tok_state_data
3768                         else
3769                                 tok_cur_tag.attrs_a[0][1] += c
3770                                 tok_state = tok_state_attribute_value_unquoted
3771                 return null
3772
3773         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3774         tok_state_attribute_value_double_quoted = ->
3775                 switch c = txt.charAt(cur++)
3776                         when '"'
3777                                 tok_state = tok_state_after_attribute_value_quoted
3778                         when '&'
3779                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3780                         when "\u0000"
3781                                 # Parse error
3782                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3783                         when '' # EOF
3784                                 parse_error()
3785                                 tok_state = tok_state_data
3786                         else
3787                                 tok_cur_tag.attrs_a[0][1] += c
3788                 return null
3789
3790         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3791         tok_state_attribute_value_single_quoted = ->
3792                 switch c = txt.charAt(cur++)
3793                         when "'"
3794                                 tok_state = tok_state_after_attribute_value_quoted
3795                         when '&'
3796                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3797                         when "\u0000"
3798                                 # Parse error
3799                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3800                         when '' # EOF
3801                                 parse_error()
3802                                 tok_state = tok_state_data
3803                         else
3804                                 tok_cur_tag.attrs_a[0][1] += c
3805                 return null
3806
3807         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3808         tok_state_attribute_value_unquoted = ->
3809                 switch c = txt.charAt(cur++)
3810                         when "\t", "\n", "\u000c", ' '
3811                                 tok_state = tok_state_before_attribute_name
3812                         when '&'
3813                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3814                         when '>'
3815                                 tok_state = tok_state_data
3816                                 tmp = tok_cur_tag
3817                                 tok_cur_tag = null
3818                                 return tmp
3819                         when "\u0000"
3820                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3821                         when '' # EOF
3822                                 parse_error()
3823                                 tok_state = tok_state_data
3824                         else
3825                                 # Parse Error if ', <, = or ` (backtick)
3826                                 tok_cur_tag.attrs_a[0][1] += c
3827                 return null
3828
3829         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3830         tok_state_after_attribute_value_quoted = ->
3831                 switch c = txt.charAt(cur++)
3832                         when "\t", "\n", "\u000c", ' '
3833                                 tok_state = tok_state_before_attribute_name
3834                         when '/'
3835                                 tok_state = tok_state_self_closing_start_tag
3836                         when '>'
3837                                 tok_state = tok_state_data
3838                                 tmp = tok_cur_tag
3839                                 tok_cur_tag = null
3840                                 return tmp
3841                         when '' # EOF
3842                                 parse_error()
3843                                 tok_state = tok_state_data
3844                         else
3845                                 # Parse Error
3846                                 tok_state = tok_state_before_attribute_name
3847                                 cur -= 1 # we didn't handle that char
3848                 return null
3849
3850         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3851         tok_state_self_closing_start_tag = ->
3852                 c = txt.charAt(cur++)
3853                 if c is '>'
3854                         tok_cur_tag.flag 'self-closing', true
3855                         tok_state = tok_state_data
3856                         return tok_cur_tag
3857                 if c is ''
3858                         parse_error()
3859                         tok_state = tok_state_data
3860                         cur -= 1 # Reconsume
3861                         return
3862                 # Anything else
3863                 parse_error()
3864                 tok_state = tok_state_before_attribute_name
3865                 cur -= 1 # Reconsume
3866                 return
3867
3868         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3869         # WARNING: put a comment token in tok_cur_tag before setting this state
3870         tok_state_bogus_comment = ->
3871                 next_gt = txt.indexOf '>', cur
3872                 if next_gt is -1
3873                         val = txt.substr cur
3874                         cur = txt.length
3875                 else
3876                         val = txt.substr cur, (next_gt - cur)
3877                         cur = next_gt + 1
3878                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3879                 tok_cur_tag.text += val
3880                 tok_state = tok_state_data
3881                 return tok_cur_tag
3882
3883         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3884         tok_state_markup_declaration_open = ->
3885                 if txt.substr(cur, 2) is '--'
3886                         cur += 2
3887                         tok_cur_tag = new_comment_token ''
3888                         tok_state = tok_state_comment_start
3889                         return
3890                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3891                         cur += 7
3892                         tok_state = tok_state_doctype
3893                         return
3894                 acn = adjusted_current_node()
3895                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3896                         cur += 7
3897                         tok_state = tok_state_cdata_section
3898                         return
3899                 # Otherwise
3900                 parse_error()
3901                 tok_cur_tag = new_comment_token ''
3902                 tok_state = tok_state_bogus_comment
3903                 return
3904
3905         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3906         tok_state_comment_start = ->
3907                 switch c = txt.charAt(cur++)
3908                         when '-'
3909                                 tok_state = tok_state_comment_start_dash
3910                         when "\u0000"
3911                                 parse_error()
3912                                 tok_state = tok_state_comment
3913                                 return new_character_token "\ufffd"
3914                         when '>'
3915                                 parse_error()
3916                                 tok_state = tok_state_data
3917                                 return tok_cur_tag
3918                         when '' # EOF
3919                                 parse_error()
3920                                 tok_state = tok_state_data
3921                                 cur -= 1 # Reconsume
3922                                 return tok_cur_tag
3923                         else
3924                                 tok_cur_tag.text += c
3925                                 tok_state = tok_state_comment
3926                 return null
3927
3928         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3929         tok_state_comment_start_dash = ->
3930                 switch c = txt.charAt(cur++)
3931                         when '-'
3932                                 tok_state = tok_state_comment_end
3933                         when "\u0000"
3934                                 parse_error()
3935                                 tok_cur_tag.text += "-\ufffd"
3936                                 tok_state = tok_state_comment
3937                         when '>'
3938                                 parse_error()
3939                                 tok_state = tok_state_data
3940                                 return tok_cur_tag
3941                         when '' # EOF
3942                                 parse_error()
3943                                 tok_state = tok_state_data
3944                                 cur -= 1 # Reconsume
3945                                 return tok_cur_tag
3946                         else
3947                                 tok_cur_tag.text += "-#{c}"
3948                                 tok_state = tok_state_comment
3949                 return null
3950
3951         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3952         tok_state_comment = ->
3953                 switch c = txt.charAt(cur++)
3954                         when '-'
3955                                 tok_state = tok_state_comment_end_dash
3956                         when "\u0000"
3957                                 parse_error()
3958                                 tok_cur_tag.text += "\ufffd"
3959                         when '' # EOF
3960                                 parse_error()
3961                                 tok_state = tok_state_data
3962                                 cur -= 1 # Reconsume
3963                                 return tok_cur_tag
3964                         else
3965                                 tok_cur_tag.text += c
3966                 return null
3967
3968         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3969         tok_state_comment_end_dash = ->
3970                 switch c = txt.charAt(cur++)
3971                         when '-'
3972                                 tok_state = tok_state_comment_end
3973                         when "\u0000"
3974                                 parse_error()
3975                                 tok_cur_tag.text += "-\ufffd"
3976                                 tok_state = tok_state_comment
3977                         when '' # EOF
3978                                 parse_error()
3979                                 tok_state = tok_state_data
3980                                 cur -= 1 # Reconsume
3981                                 return tok_cur_tag
3982                         else
3983                                 tok_cur_tag.text += "-#{c}"
3984                                 tok_state = tok_state_comment
3985                 return null
3986
3987         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3988         tok_state_comment_end = ->
3989                 switch c = txt.charAt(cur++)
3990                         when '>'
3991                                 tok_state = tok_state_data
3992                                 return tok_cur_tag
3993                         when "\u0000"
3994                                 parse_error()
3995                                 tok_cur_tag.text += "--\ufffd"
3996                                 tok_state = tok_state_comment
3997                         when '!'
3998                                 parse_error()
3999                                 tok_state = tok_state_comment_end_bang
4000                         when '-'
4001                                 parse_error()
4002                                 tok_cur_tag.text += '-'
4003                         when '' # EOF
4004                                 parse_error()
4005                                 tok_state = tok_state_data
4006                                 cur -= 1 # Reconsume
4007                                 return tok_cur_tag
4008                         else
4009                                 parse_error()
4010                                 tok_cur_tag.text += "--#{c}"
4011                                 tok_state = tok_state_comment
4012                 return null
4013
4014         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4015         tok_state_comment_end_bang = ->
4016                 switch c = txt.charAt(cur++)
4017                         when '-'
4018                                 tok_cur_tag.text += "--!#{c}"
4019                                 tok_state = tok_state_comment_end_dash
4020                         when '>'
4021                                 tok_state = tok_state_data
4022                                 return tok_cur_tag
4023                         when "\u0000"
4024                                 parse_error()
4025                                 tok_cur_tag.text += "--!\ufffd"
4026                                 tok_state = tok_state_comment
4027                         when '' # EOF
4028                                 parse_error()
4029                                 tok_state = tok_state_data
4030                                 cur -= 1 # Reconsume
4031                                 return tok_cur_tag
4032                         else
4033                                 tok_cur_tag.text += "--!#{c}"
4034                                 tok_state = tok_state_comment
4035                 return null
4036
4037         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4038         tok_state_doctype = ->
4039                 switch c = txt.charAt(cur++)
4040                         when "\t", "\u000a", "\u000c", ' '
4041                                 tok_state = tok_state_before_doctype_name
4042                         when '' # EOF
4043                                 parse_error()
4044                                 tok_state = tok_state_data
4045                                 el = new_doctype_token ''
4046                                 el.flag 'force-quirks', true
4047                                 cur -= 1 # Reconsume
4048                                 return el
4049                         else
4050                                 parse_error()
4051                                 tok_state = tok_state_before_doctype_name
4052                                 cur -= 1 # Reconsume
4053                 return null
4054
4055         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4056         tok_state_before_doctype_name = ->
4057                 c = txt.charAt(cur++)
4058                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4059                         return
4060                 if is_uc_alpha(c)
4061                         tok_cur_tag = new_doctype_token c.toLowerCase()
4062                         tok_state = tok_state_doctype_name
4063                         return
4064                 if c is "\u0000"
4065                         parse_error()
4066                         tok_cur_tag = new_doctype_token "\ufffd"
4067                         tok_state = tok_state_doctype_name
4068                         return
4069                 if c is '>'
4070                         parse_error()
4071                         el = new_doctype_token ''
4072                         el.flag 'force-quirks', true
4073                         tok_state = tok_state_data
4074                         return el
4075                 if c is '' # EOF
4076                         parse_error()
4077                         tok_state = tok_state_data
4078                         el = new_doctype_token ''
4079                         el.flag 'force-quirks', true
4080                         cur -= 1 # Reconsume
4081                         return el
4082                 # Anything else
4083                 tok_cur_tag = new_doctype_token c
4084                 tok_state = tok_state_doctype_name
4085                 return null
4086
4087         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4088         tok_state_doctype_name = ->
4089                 c = txt.charAt(cur++)
4090                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4091                         tok_state = tok_state_after_doctype_name
4092                         return
4093                 if c is '>'
4094                         tok_state = tok_state_data
4095                         return tok_cur_tag
4096                 if is_uc_alpha(c)
4097                         tok_cur_tag.name += c.toLowerCase()
4098                         return
4099                 if c is "\u0000"
4100                         parse_error()
4101                         tok_cur_tag.name += "\ufffd"
4102                         return
4103                 if c is '' # EOF
4104                         parse_error()
4105                         tok_state = tok_state_data
4106                         tok_cur_tag.flag 'force-quirks', true
4107                         cur -= 1 # Reconsume
4108                         return tok_cur_tag
4109                 # Anything else
4110                 tok_cur_tag.name += c
4111                 return null
4112
4113         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4114         tok_state_after_doctype_name = ->
4115                 c = txt.charAt(cur++)
4116                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4117                         return
4118                 if c is '>'
4119                         tok_state = tok_state_data
4120                         return tok_cur_tag
4121                 if c is '' # EOF
4122                         parse_error()
4123                         tok_state = tok_state_data
4124                         tok_cur_tag.flag 'force-quirks', true
4125                         cur -= 1 # Reconsume
4126                         return tok_cur_tag
4127                 # Anything else
4128                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4129                         cur += 5
4130                         tok_state = tok_state_after_doctype_public_keyword
4131                         return
4132                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4133                         cur += 5
4134                         tok_state = tok_state_after_doctype_system_keyword
4135                         return
4136                 parse_error()
4137                 tok_cur_tag.flag 'force-quirks', true
4138                 tok_state = tok_state_bogus_doctype
4139                 return null
4140
4141         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4142         tok_state_after_doctype_public_keyword = ->
4143                 c = txt.charAt(cur++)
4144                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4145                         tok_state = tok_state_before_doctype_public_identifier
4146                         return
4147                 if c is '"'
4148                         parse_error()
4149                         tok_cur_tag.public_identifier = ''
4150                         tok_state = tok_state_doctype_public_identifier_double_quoted
4151                         return
4152                 if c is "'"
4153                         parse_error()
4154                         tok_cur_tag.public_identifier = ''
4155                         tok_state = tok_state_doctype_public_identifier_single_quoted
4156                         return
4157                 if c is '>'
4158                         parse_error()
4159                         tok_cur_tag.flag 'force-quirks', true
4160                         tok_state = tok_state_data
4161                         return tok_cur_tag
4162                 if c is '' # EOF
4163                         parse_error()
4164                         tok_state = tok_state_data
4165                         tok_cur_tag.flag 'force-quirks', true
4166                         cur -= 1 # Reconsume
4167                         return tok_cur_tag
4168                 # Anything else
4169                 parse_error()
4170                 tok_cur_tag.flag 'force-quirks', true
4171                 tok_state = tok_state_bogus_doctype
4172                 return null
4173
4174         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4175         tok_state_before_doctype_public_identifier = ->
4176                 c = txt.charAt(cur++)
4177                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4178                         return
4179                 if c is '"'
4180                         parse_error()
4181                         tok_cur_tag.public_identifier = ''
4182                         tok_state = tok_state_doctype_public_identifier_double_quoted
4183                         return
4184                 if c is "'"
4185                         parse_error()
4186                         tok_cur_tag.public_identifier = ''
4187                         tok_state = tok_state_doctype_public_identifier_single_quoted
4188                         return
4189                 if c is '>'
4190                         parse_error()
4191                         tok_cur_tag.flag 'force-quirks', true
4192                         tok_state = tok_state_data
4193                         return tok_cur_tag
4194                 if c is '' # EOF
4195                         parse_error()
4196                         tok_state = tok_state_data
4197                         tok_cur_tag.flag 'force-quirks', true
4198                         cur -= 1 # Reconsume
4199                         return tok_cur_tag
4200                 # Anything else
4201                 parse_error()
4202                 tok_cur_tag.flag 'force-quirks', true
4203                 tok_state = tok_state_bogus_doctype
4204                 return null
4205
4206
4207         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4208         tok_state_doctype_public_identifier_double_quoted = ->
4209                 c = txt.charAt(cur++)
4210                 if c is '"'
4211                         tok_state = tok_state_after_doctype_public_identifier
4212                         return
4213                 if c is "\u0000"
4214                         parse_error()
4215                         tok_cur_tag.public_identifier += "\ufffd"
4216                         return
4217                 if c is '>'
4218                         parse_error()
4219                         tok_cur_tag.flag 'force-quirks', true
4220                         tok_state = tok_state_data
4221                         return tok_cur_tag
4222                 if c is '' # EOF
4223                         parse_error()
4224                         tok_state = tok_state_data
4225                         tok_cur_tag.flag 'force-quirks', true
4226                         cur -= 1 # Reconsume
4227                         return tok_cur_tag
4228                 # Anything else
4229                 tok_cur_tag.public_identifier += c
4230                 return null
4231
4232         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4233         tok_state_doctype_public_identifier_single_quoted = ->
4234                 c = txt.charAt(cur++)
4235                 if c is "'"
4236                         tok_state = tok_state_after_doctype_public_identifier
4237                         return
4238                 if c is "\u0000"
4239                         parse_error()
4240                         tok_cur_tag.public_identifier += "\ufffd"
4241                         return
4242                 if c is '>'
4243                         parse_error()
4244                         tok_cur_tag.flag 'force-quirks', true
4245                         tok_state = tok_state_data
4246                         return tok_cur_tag
4247                 if c is '' # EOF
4248                         parse_error()
4249                         tok_state = tok_state_data
4250                         tok_cur_tag.flag 'force-quirks', true
4251                         cur -= 1 # Reconsume
4252                         return tok_cur_tag
4253                 # Anything else
4254                 tok_cur_tag.public_identifier += c
4255                 return null
4256
4257         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4258         tok_state_after_doctype_public_identifier = ->
4259                 c = txt.charAt(cur++)
4260                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4261                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4262                         return
4263                 if c is '>'
4264                         tok_state = tok_state_data
4265                         return tok_cur_tag
4266                 if c is '"'
4267                         parse_error()
4268                         tok_cur_tag.system_identifier = ''
4269                         tok_state = tok_state_doctype_system_identifier_double_quoted
4270                         return
4271                 if c is "'"
4272                         parse_error()
4273                         tok_cur_tag.system_identifier = ''
4274                         tok_state = tok_state_doctype_system_identifier_single_quoted
4275                         return
4276                 if c is '' # EOF
4277                         parse_error()
4278                         tok_state = tok_state_data
4279                         tok_cur_tag.flag 'force-quirks', true
4280                         cur -= 1 # Reconsume
4281                         return tok_cur_tag
4282                 # Anything else
4283                 parse_error()
4284                 tok_cur_tag.flag 'force-quirks', true
4285                 tok_state = tok_state_bogus_doctype
4286                 return null
4287
4288         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4289         tok_state_between_doctype_public_and_system_identifiers = ->
4290                 c = txt.charAt(cur++)
4291                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4292                         return
4293                 if c is '>'
4294                         tok_state = tok_state_data
4295                         return tok_cur_tag
4296                 if c is '"'
4297                         parse_error()
4298                         tok_cur_tag.system_identifier = ''
4299                         tok_state = tok_state_doctype_system_identifier_double_quoted
4300                         return
4301                 if c is "'"
4302                         parse_error()
4303                         tok_cur_tag.system_identifier = ''
4304                         tok_state = tok_state_doctype_system_identifier_single_quoted
4305                         return
4306                 if c is '' # EOF
4307                         parse_error()
4308                         tok_state = tok_state_data
4309                         tok_cur_tag.flag 'force-quirks', true
4310                         cur -= 1 # Reconsume
4311                         return tok_cur_tag
4312                 # Anything else
4313                 parse_error()
4314                 tok_cur_tag.flag 'force-quirks', true
4315                 tok_state = tok_state_bogus_doctype
4316                 return null
4317
4318         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4319         tok_state_after_doctype_system_keyword = ->
4320                 c = txt.charAt(cur++)
4321                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4322                         tok_state = tok_state_before_doctype_system_identifier
4323                         return
4324                 if c is '"'
4325                         parse_error()
4326                         tok_cur_tag.system_identifier = ''
4327                         tok_state = tok_state_doctype_system_identifier_double_quoted
4328                         return
4329                 if c is "'"
4330                         parse_error()
4331                         tok_cur_tag.system_identifier = ''
4332                         tok_state = tok_state_doctype_system_identifier_single_quoted
4333                         return
4334                 if c is '>'
4335                         parse_error()
4336                         tok_cur_tag.flag 'force-quirks', true
4337                         tok_state = tok_state_data
4338                         return tok_cur_tag
4339                 if c is '' # EOF
4340                         parse_error()
4341                         tok_state = tok_state_data
4342                         tok_cur_tag.flag 'force-quirks', true
4343                         cur -= 1 # Reconsume
4344                         return tok_cur_tag
4345                 # Anything else
4346                 parse_error()
4347                 tok_cur_tag.flag 'force-quirks', true
4348                 tok_state = tok_state_bogus_doctype
4349                 return null
4350
4351         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4352         tok_state_before_doctype_system_identifier = ->
4353                 c = txt.charAt(cur++)
4354                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4355                         return
4356                 if c is '"'
4357                         tok_cur_tag.system_identifier = ''
4358                         tok_state = tok_state_doctype_system_identifier_double_quoted
4359                         return
4360                 if c is "'"
4361                         tok_cur_tag.system_identifier = ''
4362                         tok_state = tok_state_doctype_system_identifier_single_quoted
4363                         return
4364                 if c is '>'
4365                         parse_error()
4366                         tok_cur_tag.flag 'force-quirks', true
4367                         tok_state = tok_state_data
4368                         return tok_cur_tag
4369                 if c is '' # EOF
4370                         parse_error()
4371                         tok_state = tok_state_data
4372                         tok_cur_tag.flag 'force-quirks', true
4373                         cur -= 1 # Reconsume
4374                         return tok_cur_tag
4375                 # Anything else
4376                 parse_error()
4377                 tok_cur_tag.flag 'force-quirks', true
4378                 tok_state = tok_state_bogus_doctype
4379                 return null
4380
4381         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4382         tok_state_doctype_system_identifier_double_quoted = ->
4383                 c = txt.charAt(cur++)
4384                 if c is '"'
4385                         tok_state = tok_state_after_doctype_system_identifier
4386                         return
4387                 if c is "\u0000"
4388                         parse_error()
4389                         tok_cur_tag.system_identifier += "\ufffd"
4390                         return
4391                 if c is '>'
4392                         parse_error()
4393                         tok_cur_tag.flag 'force-quirks', true
4394                         tok_state = tok_state_data
4395                         return tok_cur_tag
4396                 if c is '' # EOF
4397                         parse_error()
4398                         tok_state = tok_state_data
4399                         tok_cur_tag.flag 'force-quirks', true
4400                         cur -= 1 # Reconsume
4401                         return tok_cur_tag
4402                 # Anything else
4403                 tok_cur_tag.system_identifier += c
4404                 return null
4405
4406         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4407         tok_state_doctype_system_identifier_single_quoted = ->
4408                 c = txt.charAt(cur++)
4409                 if c is "'"
4410                         tok_state = tok_state_after_doctype_system_identifier
4411                         return
4412                 if c is "\u0000"
4413                         parse_error()
4414                         tok_cur_tag.system_identifier += "\ufffd"
4415                         return
4416                 if c is '>'
4417                         parse_error()
4418                         tok_cur_tag.flag 'force-quirks', true
4419                         tok_state = tok_state_data
4420                         return tok_cur_tag
4421                 if c is '' # EOF
4422                         parse_error()
4423                         tok_state = tok_state_data
4424                         tok_cur_tag.flag 'force-quirks', true
4425                         cur -= 1 # Reconsume
4426                         return tok_cur_tag
4427                 # Anything else
4428                 tok_cur_tag.system_identifier += c
4429                 return null
4430
4431         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4432         tok_state_after_doctype_system_identifier = ->
4433                 c = txt.charAt(cur++)
4434                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4435                         return
4436                 if c is '>'
4437                         tok_state = tok_state_data
4438                         return tok_cur_tag
4439                 if c is '' # EOF
4440                         parse_error()
4441                         tok_state = tok_state_data
4442                         tok_cur_tag.flag 'force-quirks', true
4443                         cur -= 1 # Reconsume
4444                         return tok_cur_tag
4445                 # Anything else
4446                 parse_error()
4447                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4448                 tok_state = tok_state_bogus_doctype
4449                 return null
4450
4451         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4452         tok_state_bogus_doctype = ->
4453                 c = txt.charAt(cur++)
4454                 if c is '>'
4455                         tok_state = tok_state_data
4456                         return tok_cur_tag
4457                 if c is '' # EOF
4458                         tok_state = tok_state_data
4459                         cur -= 1 # Reconsume
4460                         return tok_cur_tag
4461                 # Anything else
4462                 return null
4463
4464         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4465         tok_state_cdata_section = ->
4466                 tok_state = tok_state_data
4467                 next_gt = txt.indexOf ']]>', cur
4468                 if next_gt is -1
4469                         val = txt.substr cur
4470                         cur = txt.length
4471                 else
4472                         val = txt.substr cur, (next_gt - cur)
4473                         cur = next_gt + 3
4474                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4475                 if val.length > 0
4476                         return new_character_token val # fixfull split
4477                 return null
4478
4479         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4480         # Don't set this as a state, just call it
4481         # returns a string (NOT a text node)
4482         parse_character_reference = (allowed_char = null, in_attr = false) ->
4483                 if cur >= txt.length
4484                         return '&'
4485                 switch c = txt.charAt(cur)
4486                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4487                                 # explicitly not a parse error
4488                                 return '&'
4489                         when ';'
4490                                 # there has to be "one or more" alnums between & and ; to be a parse error
4491                                 return '&'
4492                         when '#'
4493                                 if cur + 1 >= txt.length
4494                                         return '&'
4495                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4496                                         base = 16
4497                                         charset = hex_chars
4498                                         start = cur + 2
4499                                 else
4500                                         charset = digits
4501                                         start = cur + 1
4502                                         base = 10
4503                                 i = 0
4504                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4505                                         i += 1
4506                                 if i is 0
4507                                         return '&'
4508                                 cur = start + i
4509                                 if txt.charAt(start + i) is ';'
4510                                         cur += 1
4511                                 else
4512                                         parse_error()
4513                                 code_point = txt.substr(start, i)
4514                                 while code_point.charAt(0) is '0' and code_point.length > 1
4515                                         code_point = code_point.substr 1
4516                                 code_point = parseInt(code_point, base)
4517                                 if unicode_fixes[code_point]?
4518                                         parse_error()
4519                                         return unicode_fixes[code_point]
4520                                 else
4521                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4522                                                 parse_error()
4523                                                 return "\ufffd"
4524                                         else
4525                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4526                                                         parse_error()
4527                                                 return from_code_point code_point
4528                                 return
4529                         else
4530                                 for i in [0...31]
4531                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4532                                                 break
4533                                 if i is 0
4534                                         # exit early, because parse_error() below needs at least one alnum
4535                                         return '&'
4536                                 if txt.charAt(cur + i) is ';'
4537                                         i += 1 # include ';' terminator in value
4538                                         decoded = decode_named_char_ref txt.substr(cur, i)
4539                                         if decoded?
4540                                                 cur += i
4541                                                 return decoded
4542                                         parse_error()
4543                                         return '&'
4544                                 else
4545                                         # no ';' terminator (only legacy char refs)
4546                                         max = i
4547                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4548                                                 c = legacy_char_refs[txt.substr(cur, i)]
4549                                                 if c?
4550                                                         if in_attr
4551                                                                 if txt.charAt(cur + i) is '='
4552                                                                         # "because some legacy user agents will
4553                                                                         # misinterpret the markup in those cases"
4554                                                                         parse_error()
4555                                                                         return '&'
4556                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4557                                                                         # this makes attributes forgiving about url args
4558                                                                         return '&'
4559                                                         # ok, and besides the weird exceptions for attributes...
4560                                                         # return the matching char
4561                                                         cur += i # consume entity chars
4562                                                         parse_error() # because no terminating ";"
4563                                                         return c
4564                                         parse_error()
4565                                         return '&'
4566                 return # never reached
4567
4568         eat_next_token_if_newline = ->
4569                 old_cur = cur
4570                 t = null
4571                 until t?
4572                         t = tok_state()
4573                 if t.type is TYPE_TEXT
4574                         # definition of a newline depends on whether it was a character ref or not
4575                         if cur - old_cur is 1
4576                                 # not a character reference
4577                                 if t.text is "\u000d" or t.text is "\u000a"
4578                                         return
4579                         else
4580                                 if t.text is "\u000a"
4581                                         return
4582                 # not a "newline"
4583                 cur = old_cur
4584                 return
4585
4586         # tree constructor initialization
4587         # see comments on TYPE_TAG/etc for the structure of this data
4588         txt = args.html
4589         cur = 0
4590         doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4591         doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4592         fragment_root = null # fragment parsing algorithm returns children of this
4593         open_els = []
4594         afe = [] # active formatting elements
4595         template_ins_modes = []
4596         ins_mode = ins_mode_initial
4597         original_ins_mode = ins_mode # TODO check spec
4598         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4599         flag_frameset_ok = true
4600         flag_parsing = true
4601         flag_foster_parenting = false
4602         form_element_pointer = null
4603         temporary_buffer = null
4604         pending_table_character_tokens = []
4605         head_element_pointer = null
4606         flag_fragment_parsing = false
4607         context_element = null
4608         prev_node_id = 0 # just for debugging
4609
4610         # tokenizer initialization
4611         tok_state = tok_state_data
4612
4613         parse_init = ->
4614                 # fragment parsing (text arg)
4615                 if args.fragment?
4616                         # this handles the fragment from the tests in the format described here:
4617                         # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
4618                         f = args.fragment
4619                         ns = NS_HTML
4620                         if f.substr(0, 5) is 'math '
4621                                 f = f.substr 5
4622                                 ns = NS_MATHML
4623                         else if f.substr(0, 4) is 'svg '
4624                                 f = f.substr 4
4625                                 ns = NS_SVG
4626                         t = new_open_tag f
4627                         context_element = token_to_element t, ns
4628                         context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4629                         context_element.document.flag 'quirks mode', QUIRKS_NO
4630                 # fragment parsing (Node arg)
4631                 if args.context?
4632                         context_element = args.context
4633
4634                 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4635                 # fragment parsing algorithm
4636                 if context_element?
4637                         flag_fragment_parsing = true
4638                         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4639                         # search up the tree from context, to try to find it's document,
4640                         # because this file only puts a "document" property on the root
4641                         # element.
4642                         old_doc = null
4643                         el = context_element
4644                         loop
4645                                 if el.document?
4646                                         old_doc = el.document
4647                                         break
4648                                 if el.parent
4649                                         el = el.parent
4650                                 else
4651                                         break
4652                         if old_doc
4653                                 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
4654                         # set tok_state
4655                         if context_element.namespace is NS_HTML
4656                                 switch context_element.name
4657                                         when 'title', 'textarea'
4658                                                 tok_state = tok_state_rcdata
4659                                         when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4660                                                 tok_state = tok_state_rawtext
4661                                         when 'script'
4662                                                 tok_state = tok_state_script_data
4663                                         when 'noscript'
4664                                                 if flag_scripting
4665                                                         tok_state = tok_state_rawtext
4666                                         when 'plaintext'
4667                                                 tok_state = tok_state_plaintext
4668                         fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4669                         doc.children.push fragment_root
4670                         fragment_root.document = doc
4671                         open_els = [fragment_root]
4672                         if context_element.name is 'template' and context_element.namespace is NS_HTML
4673                                 template_ins_modes.unshift ins_mode_in_template
4674                         # fixfull create token for context (it should have it's original one already)
4675                         reset_ins_mode()
4676                         # set form_element pointer... in the foreign doc?!
4677                         el = context_element
4678                         loop
4679                                 if el.name is 'form' and el.namespace is NS_HTML
4680                                         form_element_pointer = el
4681                                         break
4682                                 if el.parent
4683                                         el = el.parent
4684                                 else
4685                                         break
4686
4687                 # text pre-processing
4688                 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4689                 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4690                 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4691
4692                 return
4693
4694         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4695         parse_main_loop = ->
4696                 while flag_parsing
4697                         t = tok_state()
4698                         if t?
4699                                 process_token t
4700                                 # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4701                 return
4702         parse_init()
4703         parse_main_loop()
4704
4705         if flag_fragment_parsing
4706                 return fragment_root.children
4707         return doc.children
4708
4709 module.exports.parse_html = parse_html
4710 module.exports.debug_log_reset = debug_log_reset
4711 module.exports.debug_log_each = debug_log_each
4712 module.exports.TYPE_TAG = TYPE_TAG
4713 module.exports.TYPE_TEXT = TYPE_TEXT
4714 module.exports.TYPE_COMMENT = TYPE_COMMENT
4715 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4716 module.exports.NS_HTML = NS_HTML
4717 module.exports.NS_MATHML = NS_MATHML
4718 module.exports.NS_SVG = NS_SVG
4719 module.exports.QUIRKS_NO = QUIRKS_NO
4720 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4721 module.exports.QUIRKS_YES = QUIRKS_YES