JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix space before > at end of tag
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor.
20
21 # The implementation is a pretty direct implementation of the parsing algorithm
22 # described here:
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
24 #
25 # Deviations from that spec:
26 #
27 #   Purposeful: search this file for "WHATWG"
28 #
29 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
30
31
32 # stacks/lists
33 #
34 # the spec uses a many different words do indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section splains. I'm implementing "lists" (afe and open_els) the same way
37 # (both as stacks)
38 #
39 # stacks grow downward (current element is index=0)
40 #
41 # example: open_els = [a, b, c, d, e, f, g]
42 #
43 # "grows downwards" means it's visualized like this: (index: el, names)
44 #
45 #   6: g "start of the list", "topmost", "first"
46 #   5: f
47 #   4: e "previous" (to d), "above", "before"
48 #   3: d   (previous/next are relative to this element)
49 #   2: c "next", "after", "lower", "below"
50 #   1: b
51 #   0: a "end of the list", "current node", "bottommost", "last"
52
53
54 # browser
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
58         window.wheic = {}
59         module = exports: window.wheic
60
61 from_code_point = (x) ->
62         if String.fromCodePoint?
63                 return String.fromCodePoint x
64         else
65                 if x <= 0xffff
66                         return String.fromCharCode x
67                 x -= 0x10000
68                 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
69
70 # Each node is an obect of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
73 TYPE_COMMENT = 2
74 TYPE_DOCTYPE = 3
75 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
78 TYPE_EOF = 6
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
81
82 # namespace constants
83 NS_HTML = 1
84 NS_MATHML = 2
85 NS_SVG = 3
86
87 # quirks mode constants
88 QUIRKS_NO = 1
89 QUIRKS_LIMITED = 2
90 QUIRKS_YES = 3
91
92 g_debug_log = []
93 debug_log_reset = ->
94         g_debug_log = []
95 debug_log = (str) ->
96         g_debug_log.push str
97 debug_log_each = (cb) ->
98         for str in g_debug_log
99                 cb str
100
101 prev_node_id = 0
102 class Node
103         constructor: (type, args = {}) ->
104                 @type = type # one of the TYPE_* constants above
105                 @name = args.name ? '' # tag name
106                 @text = args.text ? '' # contents for text/comment nodes
107                 @attrs = args.attrs ? {}
108                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109                 @children = args.children ? []
110                 @namespace = args.namespace ? NS_HTML
111                 @parent = args.parent ? null
112                 @token = args.token ? null
113                 @flags = args.flags ? {}
114                 if args.id?
115                         @id = "#{args.id}+"
116                 else
117                         @id = "#{++prev_node_id}"
118         acknowledge_self_closing: ->
119                 if @token?
120                         @token.flag 'did_self_close', true
121                 else
122                         @flag 'did_self_close', true
123         flag: (key, value = null) ->
124                 if value?
125                         @flags[key] = value
126                 else
127                         return @flags[key]
128         serialize: (shallow = false, show_ids = false) -> # for unit tests
129                 ret = ''
130                 switch @type
131                         when TYPE_TAG
132                                 ret += 'tag:'
133                                 ret += JSON.stringify @name
134                                 ret += ','
135                                 if show_ids
136                                         ret += "##{@id},"
137                                 if shallow
138                                         break
139                                 attr_keys = []
140                                 for k of @attrs
141                                         attr_keys.push k
142                                 attr_keys.sort()
143                                 ret += '{'
144                                 sep = ''
145                                 for k in attr_keys
146                                         ret += sep
147                                         sep = ','
148                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149                                 ret += '},['
150                                 sep = ''
151                                 for c in @children
152                                         ret += sep
153                                         sep = ','
154                                         ret += c.serialize shallow, show_ids
155                                 ret += ']'
156                         when TYPE_TEXT
157                                 ret += 'text:'
158                                 ret += JSON.stringify @text
159                         when TYPE_COMMENT
160                                 ret += 'comment:'
161                                 ret += JSON.stringify @text
162                         when TYPE_DOCTYPE
163                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
164                         when TYPE_AFE_MARKER
165                                 ret += 'marker'
166                         when TYPE_AAA_BOOKMARK
167                                 ret += 'aaa_bookmark'
168                         else
169                                 ret += 'unknown:'
170                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
171                 return ret
172
173 # helpers: (only take args that are normally known when parser creates nodes)
174 new_open_tag = (name) ->
175         return new Node TYPE_START_TAG, name: name
176 new_end_tag = (name) ->
177         return new Node TYPE_END_TAG, name: name
178 new_element = (name) ->
179         return new Node TYPE_TAG, name: name
180 new_text_node = (txt) ->
181         return new Node TYPE_TEXT, text: txt
182 new_character_token = new_text_node
183 new_comment_token = (txt) ->
184         return new Node TYPE_COMMENT, text: txt
185 new_doctype_token = (name) ->
186         return new Node TYPE_DOCTYPE, name: name
187 new_eof_token = ->
188         return new Node TYPE_EOF
189 new_afe_marker = ->
190         return new Node TYPE_AFE_MARKER
191 new_aaa_bookmark = ->
192         return new Node TYPE_AAA_BOOKMARK
193
194 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
195 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
196 digits = "0123456789"
197 alnum = lc_alpha + uc_alpha + digits
198 hex_chars = digits + "abcdefABCDEF"
199
200 is_uc_alpha = (str) ->
201         return str.length is 1 and uc_alpha.indexOf(str) > -1
202 is_lc_alpha = (str) ->
203         return str.length is 1 and lc_alpha.indexOf(str) > -1
204
205 # some SVG elements have dashes in them
206 tag_name_chars = alnum + "-"
207
208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
210 is_space = (txt) ->
211         return txt.length is 1 and space_chars.indexOf(txt) > -1
212 is_space_tok = (t) ->
213         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
214
215 is_input_hidden_tok = (t) ->
216         return false unless t.type is TYPE_START_TAG
217         for a in t.attrs_a
218                 if a[0] is 'type'
219                         if a[1].toLowerCase() is 'hidden'
220                                 return true
221                         return false
222         return false
223
224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
226
227 unicode_fixes = {}
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
256
257 quirks_yes_pi_prefixes = [
258         "+//silmaril//dtd html pro v0r11 19970101//"
259         "-//as//dtd html 3.0 aswedit + extensions//"
260         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261         "-//ietf//dtd html 2.0 level 1//"
262         "-//ietf//dtd html 2.0 level 2//"
263         "-//ietf//dtd html 2.0 strict level 1//"
264         "-//ietf//dtd html 2.0 strict level 2//"
265         "-//ietf//dtd html 2.0 strict//"
266         "-//ietf//dtd html 2.0//"
267         "-//ietf//dtd html 2.1e//"
268         "-//ietf//dtd html 3.0//"
269         "-//ietf//dtd html 3.2 final//"
270         "-//ietf//dtd html 3.2//"
271         "-//ietf//dtd html 3//"
272         "-//ietf//dtd html level 0//"
273         "-//ietf//dtd html level 1//"
274         "-//ietf//dtd html level 2//"
275         "-//ietf//dtd html level 3//"
276         "-//ietf//dtd html strict level 0//"
277         "-//ietf//dtd html strict level 1//"
278         "-//ietf//dtd html strict level 2//"
279         "-//ietf//dtd html strict level 3//"
280         "-//ietf//dtd html strict//"
281         "-//ietf//dtd html//"
282         "-//metrius//dtd metrius presentational//"
283         "-//microsoft//dtd internet explorer 2.0 html strict//"
284         "-//microsoft//dtd internet explorer 2.0 html//"
285         "-//microsoft//dtd internet explorer 2.0 tables//"
286         "-//microsoft//dtd internet explorer 3.0 html strict//"
287         "-//microsoft//dtd internet explorer 3.0 html//"
288         "-//microsoft//dtd internet explorer 3.0 tables//"
289         "-//netscape comm. corp.//dtd html//"
290         "-//netscape comm. corp.//dtd strict html//"
291         "-//o'reilly and associates//dtd html 2.0//"
292         "-//o'reilly and associates//dtd html extended 1.0//"
293         "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294         "-//sq//dtd html 2.0 hotmetal + extensions//"
295         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297         "-//spyglass//dtd html 2.0 extended//"
298         "-//sun microsystems corp.//dtd hotjava html//"
299         "-//sun microsystems corp.//dtd hotjava strict html//"
300         "-//w3c//dtd html 3 1995-03-24//"
301         "-//w3c//dtd html 3.2 draft//"
302         "-//w3c//dtd html 3.2 final//"
303         "-//w3c//dtd html 3.2//"
304         "-//w3c//dtd html 3.2s draft//"
305         "-//w3c//dtd html 4.0 frameset//"
306         "-//w3c//dtd html 4.0 transitional//"
307         "-//w3c//dtd html experimental 19960712//"
308         "-//w3c//dtd html experimental 970421//"
309         "-//w3c//dtd w3 html//"
310         "-//w3o//dtd w3 html 3.0//"
311         "-//webtechs//dtd mozilla html 2.0//"
312         "-//webtechs//dtd mozilla html//"
313 ]
314
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
317 legacy_char_refs = {
318         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
335         yen: '¥', yuml: 'ÿ'
336 }
337
338 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
339 raw_text_elements = ['script', 'style']
340 escapable_raw_text_elements = ['textarea', 'title']
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
342 svg_elements = [
343         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
357         'view', 'vkern'
358 ]
359
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
361 mathml_elements = [
362         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368         'determinant', 'diff', 'divergence', 'divide', 'domain',
369         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389         'vectorproduct', 'xor'
390 ]
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
393
394 special_elements = {
395         # HTML:
396         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
407
408         menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
409
410         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412         plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413         select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414         table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415         textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416         tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
417
418         # MathML:
419         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420         'annotation-xml':NS_MATHML,
421
422         # SVG:
423         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
424 }
425
426 formatting_elements = {
427          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
429          u: true
430 }
431
432 mathml_text_integration = {
433         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
434 }
435 is_mathml_text_integration_point = (el) ->
436         return mathml_text_integration[el.name] is el.namespace
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
438         if el.namespace is NS_MATHML
439                 if el.name is 'annotation-xml'
440                         if el.attrs.encoding?
441                                 if el.attrs.encoding.toLowerCase() is 'text/html'
442                                         return true
443                                 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
444                                         return true
445                 return false
446         if el.namespace is NS_SVG
447                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
448                         return true
449         return false
450
451 h_tags = {
452         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
453 }
454
455 foster_parenting_targets = {
456         table: NS_HTML
457         tbody: NS_HTML
458         tfoot: NS_HTML
459         thead: NS_HTML
460         tr: NS_HTML
461 }
462
463 end_tag_implied = {
464         dd: NS_HTML
465         dt: NS_HTML
466         li: NS_HTML
467         option: NS_HTML
468         optgroup: NS_HTML
469         p: NS_HTML
470         rb: NS_HTML
471         rp: NS_HTML
472         rt: NS_HTML
473         rtc: NS_HTML
474 }
475
476 el_is_special = (e) ->
477         return special_elements[e.name] is e.namespace
478
479 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
480 el_is_special_not_adp = (el) ->
481         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
482
483 svg_name_fixes = {
484         altglyph: 'altGlyph'
485         altglyphdef: 'altGlyphDef'
486         altglyphitem: 'altGlyphItem'
487         animatecolor: 'animateColor'
488         animatemotion: 'animateMotion'
489         animatetransform: 'animateTransform'
490         clippath: 'clipPath'
491         feblend: 'feBlend'
492         fecolormatrix: 'feColorMatrix'
493         fecomponenttransfer: 'feComponentTransfer'
494         fecomposite: 'feComposite'
495         feconvolvematrix: 'feConvolveMatrix'
496         fediffuselighting: 'feDiffuseLighting'
497         fedisplacementmap: 'feDisplacementMap'
498         fedistantlight: 'feDistantLight'
499         fedropshadow: 'feDropShadow'
500         feflood: 'feFlood'
501         fefunca: 'feFuncA'
502         fefuncb: 'feFuncB'
503         fefuncg: 'feFuncG'
504         fefuncr: 'feFuncR'
505         fegaussianblur: 'feGaussianBlur'
506         feimage: 'feImage'
507         femerge: 'feMerge'
508         femergenode: 'feMergeNode'
509         femorphology: 'feMorphology'
510         feoffset: 'feOffset'
511         fepointlight: 'fePointLight'
512         fespecularlighting: 'feSpecularLighting'
513         fespotlight: 'feSpotLight'
514         fetile: 'feTile'
515         feturbulence: 'feTurbulence'
516         foreignobject: 'foreignObject'
517         glyphref: 'glyphRef'
518         lineargradient: 'linearGradient'
519         radialgradient: 'radialGradient'
520         textpath: 'textPath'
521 }
522 svg_attribute_fixes = {
523         attributename: 'attributeName'
524         attributetype: 'attributeType'
525         basefrequency: 'baseFrequency'
526         baseprofile: 'baseProfile'
527         calcmode: 'calcMode'
528         clippathunits: 'clipPathUnits'
529         contentscripttype: 'contentScriptType'
530         contentstyletype: 'contentStyleType'
531         diffuseconstant: 'diffuseConstant'
532         edgemode: 'edgeMode'
533         externalresourcesrequired: 'externalResourcesRequired'
534         # WHATWG removes this: filterres: 'filterRes'
535         filterunits: 'filterUnits'
536         glyphref: 'glyphRef'
537         gradienttransform: 'gradientTransform'
538         gradientunits: 'gradientUnits'
539         kernelmatrix: 'kernelMatrix'
540         kernelunitlength: 'kernelUnitLength'
541         keypoints: 'keyPoints'
542         keysplines: 'keySplines'
543         keytimes: 'keyTimes'
544         lengthadjust: 'lengthAdjust'
545         limitingconeangle: 'limitingConeAngle'
546         markerheight: 'markerHeight'
547         markerunits: 'markerUnits'
548         markerwidth: 'markerWidth'
549         maskcontentunits: 'maskContentUnits'
550         maskunits: 'maskUnits'
551         numoctaves: 'numOctaves'
552         pathlength: 'pathLength'
553         patterncontentunits: 'patternContentUnits'
554         patterntransform: 'patternTransform'
555         patternunits: 'patternUnits'
556         pointsatx: 'pointsAtX'
557         pointsaty: 'pointsAtY'
558         pointsatz: 'pointsAtZ'
559         preservealpha: 'preserveAlpha'
560         preserveaspectratio: 'preserveAspectRatio'
561         primitiveunits: 'primitiveUnits'
562         refx: 'refX'
563         refy: 'refY'
564         repeatcount: 'repeatCount'
565         repeatdur: 'repeatDur'
566         requiredextensions: 'requiredExtensions'
567         requiredfeatures: 'requiredFeatures'
568         specularconstant: 'specularConstant'
569         specularexponent: 'specularExponent'
570         spreadmethod: 'spreadMethod'
571         startoffset: 'startOffset'
572         stddeviation: 'stdDeviation'
573         stitchtiles: 'stitchTiles'
574         surfacescale: 'surfaceScale'
575         systemlanguage: 'systemLanguage'
576         tablevalues: 'tableValues'
577         targetx: 'targetX'
578         targety: 'targetY'
579         textlength: 'textLength'
580         viewbox: 'viewBox'
581         viewtarget: 'viewTarget'
582         xchannelselector: 'xChannelSelector'
583         ychannelselector: 'yChannelSelector'
584         zoomandpan: 'zoomAndPan'
585 }
586 foreign_attr_fixes = {
587         'xlink:actuate': 'xlink actuate'
588         'xlink:arcrole': 'xlink arcrole'
589         'xlink:href': 'xlink href'
590         'xlink:role': 'xlink role'
591         'xlink:show': 'xlink show'
592         'xlink:title': 'xlink title'
593         'xlink:type': 'xlink type'
594         'xml:base': 'xml base'
595         'xml:lang': 'xml lang'
596         'xml:space': 'xml space'
597         'xmlns': 'xmlns'
598         'xmlns:xlink': 'xmlns xlink'
599 }
600 adjust_mathml_attributes = (t) ->
601         for a in t.attrs_a
602                 if a[0] is 'definitionurl'
603                         a[0] = 'definitionURL'
604         return
605 adjust_svg_attributes = (t) ->
606         for a in t.attrs_a
607                 if svg_attribute_fixes[a[0]]?
608                         a[0] = svg_attribute_fixes[a[0]]
609         return
610 adjust_foreign_attributes = (t) ->
611         # fixfull
612         for a in t.attrs_a
613                 if foreign_attr_fixes[a[0]]?
614                         a[0] = foreign_attr_fixes[a[0]]
615         return
616
617 # decode_named_char_ref()
618 #
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
621 #
622 # Pass without the "&" but with the ";" examples:
623 #    for "&amp" pass "amp;"
624 #    for "&#x2032" pass "x2032;"
625 g_dncr = {
626         cache: {}
627         textarea: document.createElement('textarea')
628 }
629 # TODO test this in IE8
630 decode_named_char_ref = (txt) ->
631         txt = "&#{txt}"
632         decoded = g_dncr.cache[txt]
633         return decoded if decoded?
634         g_dncr.textarea.innerHTML = txt
635         decoded = g_dncr.textarea.value
636         return null if decoded is txt
637         return g_dncr.cache[txt] = decoded
638
639 parse_html = (args) ->
640         txt = null
641         cur = null # index of next char in txt to be parsed
642         # declare doc and tokenizer variables so they're in scope below
643         doc = null
644         open_els = null # stack of open elements
645         afe = null # active formatting elements
646         template_ins_modes = null
647         ins_mode = null
648         original_ins_mode = null
649         tok_state = null
650         tok_cur_tag = null # partially parsed tag
651         flag_scripting = null
652         flag_frameset_ok = null
653         flag_parsing = null
654         flag_foster_parenting = null
655         form_element_pointer = null
656         temporary_buffer = null
657         pending_table_character_tokens = null
658         head_element_pointer = null
659         flag_fragment_parsing = null
660         context_element = null
661
662         stop_parsing = ->
663                 flag_parsing = false
664
665         parse_error = ->
666                 if args.error_cb?
667                         args.error_cb cur
668                 else
669                         console.log "Parse error at character #{cur} of #{txt.length}"
670
671         # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
672         # "Noah's Ark clause" but with three
673         afe_push = (new_el) ->
674                 matches = 0
675                 for el, i in afe
676                         if el.type is TYPE_AFE_MARKER
677                                 break
678                         if el.name is new_el.name and el.namespace is new_el.namespace
679                                 attrs_match = true
680                                 for k, v of el.attrs
681                                         unless new_el.attrs[k] is v
682                                                 attrs_match = false
683                                                 break
684                                 if attrs_match
685                                         for k, v of new_el.attrs
686                                                 unless el.attrs[k] is v
687                                                         attrs_match = false
688                                                         break
689                                 if attrs_match
690                                         matches += 1
691                                         if matches is 3
692                                                 afe.splice i, 1
693                                                 break
694                 afe.unshift new_el
695         afe_push_marker = ->
696                 afe.unshift new_afe_marker()
697
698         # the functions below impliment the Tree Contstruction algorithm
699         # http://www.w3.org/TR/html5/syntax.html#tree-construction
700
701         # But first... the helpers
702         template_tag_is_open = ->
703                 for el in open_els
704                         if el.name is 'template' and el.namespace is NS_HTML
705                                 return true
706                 return false
707         is_in_scope_x = (tag_name, scope, namespace) ->
708                 for el in open_els
709                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
710                                 return true
711                         if scope[el.name] is el.namespace
712                                 return false
713                 return false
714         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
715                 for el in open_els
716                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
717                                 return true
718                         if scope[el.name] is el.namespace
719                                 return false
720                         if scope2[el.name] is el.namespace
721                                 return false
722                 return false
723         standard_scopers = {
724                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
725                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
726                 template: NS_HTML,
727
728                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
729                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
730
731                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
732         }
733         button_scopers = button: NS_HTML
734         li_scopers = ol: NS_HTML, ul: NS_HTML
735         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
736         is_in_scope = (tag_name, namespace = null) ->
737                 return is_in_scope_x tag_name, standard_scopers, namespace
738         is_in_button_scope = (tag_name, namespace = null) ->
739                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
740         is_in_table_scope = (tag_name, namespace = null) ->
741                 return is_in_scope_x tag_name, table_scopers, namespace
742         # aka is_in_list_item_scope
743         is_in_li_scope = (tag_name, namespace = null) ->
744                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
745         is_in_select_scope = (tag_name, namespace = null) ->
746                 for t in open_els
747                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
748                                 return true
749                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
750                                 return false
751                 return false
752         # this checks for a particular element, not by name
753         # this requires a namespace match
754         el_is_in_scope = (needle) ->
755                 for el in open_els
756                         if el is needle
757                                 return true
758                         if standard_scopers[el.name] is el.namespace
759                                 return false
760                 return false
761
762         clear_to_table_stopers = {
763                 'table': true
764                 'template': true
765                 'html': true
766         }
767         clear_stack_to_table_context = ->
768                 loop
769                         if clear_to_table_stopers[open_els[0].name]?
770                                 break
771                         open_els.shift()
772                 return
773         clear_to_table_body_stopers = {
774                 tbody: NS_HTML
775                 tfoot: NS_HTML
776                 thead: NS_HTML
777                 template: NS_HTML
778                 html: NS_HTML
779         }
780         clear_stack_to_table_body_context = ->
781                 loop
782                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
783                                 break
784                         open_els.shift()
785                 return
786         clear_to_table_row_stopers = {
787                 'tr': true
788                 'template': true
789                 'html': true
790         }
791         clear_stack_to_table_row_context = ->
792                 loop
793                         if clear_to_table_row_stopers[open_els[0].name]?
794                                 break
795                         open_els.shift()
796                 return
797         clear_afe_to_marker = ->
798                 loop
799                         return unless afe.length > 0 # this happens in fragment case, ?spec error
800                         el = afe.shift()
801                         if el.type is TYPE_AFE_MARKER
802                                 return
803                 return
804
805         # 8.2.3.1 ...
806         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
807         reset_ins_mode = ->
808                 # 1. Let last be false.
809                 last = false
810                 # 2. Let node be the last node in the stack of open elements.
811                 node_i = 0
812                 node = open_els[node_i]
813                 # 3. Loop: If node is the first node in the stack of open elements,
814                 # then set last to true, and, if the parser was originally created as
815                 # part of the HTML fragment parsing algorithm (fragment case) set node
816                 # to the context element.
817                 loop
818                         if node_i is open_els.length - 1
819                                 last = true
820                                 # fixfull (fragment case)
821
822                         # 4. If node is a select element, run these substeps:
823                         if node.name is 'select' and node.namespace is NS_HTML
824                                 # 1. If last is true, jump to the step below labeled done.
825                                 unless last
826                                         # 2. Let ancestor be node.
827                                         ancestor_i = node_i
828                                         ancestor = node
829                                         # 3. Loop: If ancestor is the first node in the stack of
830                                         # open elements, jump to the step below labeled done.
831                                         loop
832                                                 if ancestor_i is open_els.length - 1
833                                                         break
834                                                 # 4. Let ancestor be the node before ancestor in the stack
835                                                 # of open elements.
836                                                 ancestor_i += 1
837                                                 ancestor = open_els[ancestor_i]
838                                                 # 5. If ancestor is a template node, jump to the step below
839                                                 # labeled done.
840                                                 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
841                                                         break
842                                                 # 6. If ancestor is a table node, switch the insertion mode
843                                                 # to "in select in table" and abort these steps.
844                                                 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
845                                                         ins_mode = ins_mode_in_select_in_table
846                                                         return
847                                                 # 7. Jump back to the step labeled loop.
848                                 # 8. Done: Switch the insertion mode to "in select" and abort
849                                 # these steps.
850                                 ins_mode = ins_mode_in_select
851                                 return
852                         # 5. If node is a td or th element and last is false, then switch
853                         # the insertion mode to "in cell" and abort these steps.
854                         if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
855                                 ins_mode = ins_mode_in_cell
856                                 return
857                         # 6. If node is a tr element, then switch the insertion mode to "in
858                         # row" and abort these steps.
859                         if node.name is 'tr' and node.namespace is NS_HTML
860                                 ins_mode = ins_mode_in_row
861                                 return
862                         # 7. If node is a tbody, thead, or tfoot element, then switch the
863                         # insertion mode to "in table body" and abort these steps.
864                         if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
865                                 ins_mode = ins_mode_in_table_body
866                                 return
867                         # 8. If node is a caption element, then switch the insertion mode
868                         # to "in caption" and abort these steps.
869                         if node.name is 'caption' and node.namespace is NS_HTML
870                                 ins_mode = ins_mode_in_caption
871                                 return
872                         # 9. If node is a colgroup element, then switch the insertion mode
873                         # to "in column group" and abort these steps.
874                         if node.name is 'colgroup' and node.namespace is NS_HTML
875                                 ins_mode = ins_mode_in_column_group
876                                 return
877                         # 10. If node is a table element, then switch the insertion mode to
878                         # "in table" and abort these steps.
879                         if node.name is 'table' and node.namespace is NS_HTML
880                                 ins_mode = ins_mode_in_table
881                                 return
882                         # 11. If node is a template element, then switch the insertion mode
883                         # to the current template insertion mode and abort these steps.
884                         if node.name is 'template' and node.namespace is NS_HTML
885                                 ins_mode = template_ins_modes[0]
886                                 return
887                         # 12. If node is a head element and last is true, then switch the
888                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
889                         # these steps. (fragment case)
890                         if node.name is 'head' and node.namespace is NS_HTML and last
891                                 ins_mode = ins_mode_in_body
892                                 return
893                         # 13. If node is a head element and last is false, then switch the
894                         # insertion mode to "in head" and abort these steps.
895                         if node.name is 'head' and node.namespace is NS_HTML and last is false
896                                 ins_mode = ins_mode_in_head
897                                 return
898                         # 14. If node is a body element, then switch the insertion mode to
899                         # "in body" and abort these steps.
900                         if node.name is 'body' and node.namespace is NS_HTML
901                                 ins_mode = ins_mode_in_body
902                                 return
903                         # 15. If node is a frameset element, then switch the insertion mode
904                         # to "in frameset" and abort these steps. (fragment case)
905                         if node.name is 'frameset' and node.namespace is NS_HTML
906                                 ins_mode = ins_mode_in_frameset
907                                 return
908                         # 16. If node is an html element, run these substeps:
909                         if node.name is 'html' and node.namespace is NS_HTML
910                                 # 1. If the head element pointer is null, switch the insertion
911                                 # mode to "before head" and abort these steps. (fragment case)
912                                 if head_element_pointer is null
913                                         ins_mode = ins_mode_before_head
914                                 else
915                                         # 2. Otherwise, the head element pointer is not null,
916                                         # switch the insertion mode to "after head" and abort these
917                                         # steps.
918                                         ins_mode = ins_mode_after_head
919                                 return
920                         # 17. If last is true, then switch the insertion mode to "in body"
921                         # and abort these steps. (fragment case)
922                         if last
923                                 ins_mode = ins_mode_in_body
924                                 return
925                         # 18. Let node now be the node before node in the stack of open
926                         # elements.
927                         node_i += 1
928                         node = open_els[node_i]
929                         # 19. Return to the step labeled loop.
930
931         # 8.2.3.2
932
933         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
934         adjusted_current_node = ->
935                 if open_els.length is 1 and flag_fragment_parsing
936                         return context_element
937                 return open_els[0]
938
939         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
940         # this implementation is structured (mostly) as described at the link above.
941         # capitalized comments are the "labels" described at the link above.
942         reconstruct_afe = ->
943                 return if afe.length is 0
944                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
945                         return
946                 # Rewind
947                 i = 0
948                 loop
949                         if i is afe.length - 1
950                                 break
951                         i += 1
952                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
953                                 i -= 1 # Advance
954                                 break
955                 # Create
956                 loop
957                         el = insert_html_element afe[i].token
958                         afe[i] = el
959                         break if i is 0
960                         i -= 1 # Advance
961
962         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
963         # adoption agency algorithm
964         # overview here:
965         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
966         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
967         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
968         adoption_agency = (subject) ->
969                 debug_log "adoption_agency()"
970                 debug_log "tree: #{serialize_els doc.children, false, true}"
971                 debug_log "open_els: #{serialize_els open_els, true, true}"
972                 debug_log "afe: #{serialize_els afe, true, true}"
973 # this block implements tha W3C spec
974 #               # 1. If the current node is an HTML element whose tag name is subject,
975 #               # then run these substeps:
976 #               #
977 #               # 1. Let element be the current node.
978 #               #
979 #               # 2. Pop element off the stack of open elements.
980 #               #
981 #               # 3. If element is also in the list of active formatting elements,
982 #               # remove the element from the list.
983 #               #
984 #               # 4. Abort the adoption agency algorithm.
985 #               if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 #                       el = open_els.shift()
987 #                       # remove it from the list of active formatting elements (if found)
988 #                       for t, i in afe
989 #                               if t is el
990 #                                       afe.splice i, 1
991 #                                       break
992 #                       debug_log "aaa: starting off with subject on top of stack, exiting"
993 #                       return
994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
995                 # If the current node is an HTML element whose tag name is subject, and
996                 # the current node is not in the list of active formatting elements,
997                 # then pop the current node off the stack of open elements, and abort
998                 # these steps.
999                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000                         debug_log "aaa: starting off with subject on top of stack, exiting"
1001                         # remove it from the list of active formatting elements (if found)
1002                         in_afe = false
1003                         for el, i in afe
1004                                 if el is open_els[0]
1005                                         in_afe = true
1006                                         break
1007                         unless in_afe
1008                                 debug_log "aaa: ...and not in afe, aaa done"
1009                                 open_els.shift()
1010                                 return
1011                         # fall through
1012 # END WHATWG
1013                 outer = 0
1014                 loop
1015                         if outer >= 8
1016                                 return
1017                         outer += 1
1018                         # 5. Let formatting element be the last element in the list of
1019                         # active formatting elements that: is between the end of the list
1020                         # and the last scope marker in the list, if any, or the start of
1021                         # the list otherwise, and  has the tag name subject.
1022                         fe = null
1023                         for t, fe_of_afe in afe
1024                                 if t.type is TYPE_AFE_MARKER
1025                                         break
1026                                 if t.name is subject
1027                                         fe = t
1028                                         break
1029                         # If there is no such element, then abort these steps and instead
1030                         # act as described in the "any other end tag" entry above.
1031                         if fe is null
1032                                 debug_log "aaa: fe not found in afe"
1033                                 in_body_any_other_end_tag subject
1034                                 return
1035                         # 6. If formatting element is not in the stack of open elements,
1036                         # then this is a parse error; remove the element from the list, and
1037                         # abort these steps.
1038                         in_open_els = false
1039                         for t, fe_of_open_els in open_els
1040                                 if t is fe
1041                                         in_open_els = true
1042                                         break
1043                         unless in_open_els
1044                                 debug_log "aaa: fe not found in open_els"
1045                                 parse_error()
1046                                 # "remove it from the list" must mean afe, since it's not in open_els
1047                                 afe.splice fe_of_afe, 1
1048                                 return
1049                         # 7. If formatting element is in the stack of open elements, but
1050                         # the element is not in scope, then this is a parse error; abort
1051                         # these steps.
1052                         unless el_is_in_scope fe
1053                                 debug_log "aaa: fe not in scope"
1054                                 parse_error()
1055                                 return
1056                         # 8. If formatting element is not the current node, this is a parse
1057                         # error. (But do not abort these steps.)
1058                         unless open_els[0] is fe
1059                                 parse_error()
1060                                 # continue
1061                         # 9. Let furthest block be the topmost node in the stack of open
1062                         # elements that is lower in the stack than formatting element, and
1063                         # is an element in the special category. There might not be one.
1064                         fb = null
1065                         fb_of_open_els = null
1066                         for t, i in open_els
1067                                 if t is fe
1068                                         break
1069                                 if el_is_special t
1070                                         fb = t
1071                                         fb_of_open_els = i
1072                                         # and continue, to see if there's one that's more "topmost"
1073                         # 10. If there is no furthest block, then the UA must first pop all
1074                         # the nodes from the bottom of the stack of open elements, from the
1075                         # current node up to and including formatting element, then remove
1076                         # formatting element from the list of active formatting elements,
1077                         # and finally abort these steps.
1078                         if fb is null
1079                                 debug_log "aaa: no fb"
1080                                 loop
1081                                         t = open_els.shift()
1082                                         if t is fe
1083                                                 afe.splice fe_of_afe, 1
1084                                                 return
1085                         # 11. Let common ancestor be the element immediately above
1086                         # formatting element in the stack of open elements.
1087                         ca = open_els[fe_of_open_els + 1] # common ancestor
1088
1089                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091                         bookmark = new_aaa_bookmark()
1092                         for t, i in afe
1093                                 if t is fe
1094                                         afe.splice i, 0, bookmark
1095                                         break
1096                         node = last_node = fb
1097                         inner = 0
1098                         loop
1099                                 inner += 1
1100                                 # 3. Let node be the element immediately above node in the
1101                                 # stack of open elements, or if node is no longer in the stack
1102                                 # of open elements (e.g. because it got removed by this
1103                                 # algorithm), the element that was immediately above node in
1104                                 # the stack of open elements before node was removed.
1105                                 node_next = null
1106                                 for t, i in open_els
1107                                         if t is node
1108                                                 node_next = open_els[i + 1]
1109                                                 break
1110                                 node = node_next ? node_above
1111                                 debug_log "inner loop #{inner}"
1112                                 debug_log "tree: #{serialize_els doc.children, false, true}"
1113                                 debug_log "open_els: #{serialize_els open_els, true, true}"
1114                                 debug_log "afe: #{serialize_els afe, true, true}"
1115                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118                                 debug_log "node: #{node.serialize true, true}"
1119                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
1120
1121                                 # 4. If node is formatting element, then go to the next step in
1122                                 # the overall algorithm.
1123                                 if node is fe
1124                                         break
1125                                 debug_log "the meat"
1126                                 # 5. If inner loop counter is greater than three and node is in
1127                                 # the list of active formatting elements, then remove node from
1128                                 # the list of active formatting elements.
1129                                 node_in_afe = false
1130                                 for t, i in afe
1131                                         if t is node
1132                                                 if inner > 3
1133                                                         afe.splice i, 1
1134                                                         debug_log "max out inner"
1135                                                 else
1136                                                         node_in_afe = true
1137                                                         debug_log "in afe"
1138                                                 break
1139                                 # 6. If node is not in the list of active formatting elements,
1140                                 # then remove node from the stack of open elements and then go
1141                                 # back to the step labeled inner loop.
1142                                 unless node_in_afe
1143                                         debug_log "not in afe"
1144                                         for t, i in open_els
1145                                                 if t is node
1146                                                         node_above = open_els[i + 1]
1147                                                         open_els.splice i, 1
1148                                                         break
1149                                         continue
1150                                 debug_log "the bones"
1151                                 # 7. create an element for the token for which the element node
1152                                 # was created, in the HTML namespace, with common ancestor as
1153                                 # the intended parent; replace the entry for node in the list
1154                                 # of active formatting elements with an entry for the new
1155                                 # element, replace the entry for node in the stack of open
1156                                 # elements with an entry for the new element, and let node be
1157                                 # the new element.
1158                                 new_node = token_to_element node.token, NS_HTML, ca
1159                                 for t, i in afe
1160                                         if t is node
1161                                                 afe[i] = new_node
1162                                                 debug_log "replaced in afe"
1163                                                 break
1164                                 for t, i in open_els
1165                                         if t is node
1166                                                 node_above = open_els[i + 1]
1167                                                 open_els[i] = new_node
1168                                                 debug_log "replaced in open_els"
1169                                                 break
1170                                 node = new_node
1171                                 # 8. If last node is furthest block, then move the
1172                                 # aforementioned bookmark to be immediately after the new node
1173                                 # in the list of active formatting elements.
1174                                 if last_node is fb
1175                                         for t, i in afe
1176                                                 if t is bookmark
1177                                                         afe.splice i, 1
1178                                                         debug_log "removed bookmark"
1179                                                         break
1180                                         for t, i in afe
1181                                                 if t is node
1182                                                         # "after" means lower
1183                                                         afe.splice i, 0, bookmark # "after as <-
1184                                                         debug_log "placed bookmark after node"
1185                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1186                                                         break
1187                                 # 9. Insert last node into node, first removing it from its
1188                                 # previous parent node if any.
1189                                 if last_node.parent?
1190                                         debug_log "last_node has parent"
1191                                         for c, i in last_node.parent.children
1192                                                 if c is last_node
1193                                                         debug_log "removing last_node from parent"
1194                                                         last_node.parent.children.splice i, 1
1195                                                         break
1196                                 node.children.push last_node
1197                                 last_node.parent = node
1198                                 # 10. Let last node be node.
1199                                 last_node = node
1200                                 debug_log "at last"
1201                                 # 11. Return to the step labeled inner loop.
1202                         # 14. Insert whatever last node ended up being in the previous step
1203                         # at the appropriate place for inserting a node, but using common
1204                         # ancestor as the override target.
1205
1206                         # In the case where fe is immediately followed by fb:
1207                         #   * inner loop exits out early (node==fe)
1208                         #   * last_node is fb
1209                         #   * last_node is still in the tree (not a duplicate)
1210                         if last_node.parent?
1211                                 debug_log "FEFIRST? last_node has parent"
1212                                 for c, i in last_node.parent.children
1213                                         if c is last_node
1214                                                 debug_log "removing last_node from parent"
1215                                                 last_node.parent.children.splice i, 1
1216                                                 break
1217
1218                         debug_log "after aaa inner loop"
1219                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223                         debug_log "tree: #{serialize_els doc.children, false, true}"
1224
1225                         debug_log "insert"
1226
1227
1228                         # can't use standard insert token thing, because it's already in
1229                         # open_els and must stay at it's current position in open_els
1230                         dest = adjusted_insertion_location ca
1231                         dest[0].children.splice dest[1], 0, last_node
1232                         last_node.parent = dest[0]
1233
1234
1235                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239                         debug_log "tree: #{serialize_els doc.children, false, true}"
1240
1241                         # 15. Create an element for the token for which formatting element
1242                         # was created, in the HTML namespace, with furthest block as the
1243                         # intended parent.
1244                         new_element = token_to_element fe.token, NS_HTML, fb
1245                         # 16. Take all of the child nodes of furthest block and append them
1246                         # to the element created in the last step.
1247                         while fb.children.length
1248                                 t = fb.children.shift()
1249                                 t.parent = new_element
1250                                 new_element.children.push t
1251                         # 17. Append that new element to furthest block.
1252                         new_element.parent = fb
1253                         fb.children.push new_element
1254                         # 18. Remove formatting element from the list of active formatting
1255                         # elements, and insert the new element into the list of active
1256                         # formatting elements at the position of the aforementioned
1257                         # bookmark.
1258                         for t, i in afe
1259                                 if t is fe
1260                                         afe.splice i, 1
1261                                         break
1262                         for t, i in afe
1263                                 if t is bookmark
1264                                         afe[i] = new_element
1265                                         break
1266                         # 19. Remove formatting element from the stack of open elements,
1267                         # and insert the new element into the stack of open elements
1268                         # immediately below the position of furthest block in that stack.
1269                         for t, i in open_els
1270                                 if t is fe
1271                                         open_els.splice i, 1
1272                                         break
1273                         for t, i in open_els
1274                                 if t is fb
1275                                         open_els.splice i, 0, new_element
1276                                         break
1277                         # 20. Jump back to the step labeled outer loop.
1278                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279                         debug_log "tree: #{serialize_els doc.children, false, true}"
1280                         debug_log "open_els: #{serialize_els open_els, true, true}"
1281                         debug_log "afe: #{serialize_els afe, true, true}"
1282                 debug_log "AAA DONE"
1283
1284         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1285         close_p_element = ->
1286                 generate_implied_end_tags 'p' # arg is exception
1287                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1288                         parse_error()
1289                 while open_els.length > 1 # just in case
1290                         el = open_els.shift()
1291                         if el.name is 'p' and el.namespace is NS_HTML
1292                                 return
1293         close_p_if_in_button_scope = ->
1294                 if is_in_button_scope 'p', NS_HTML
1295                         close_p_element()
1296
1297         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298         # aka insert_a_character = (t) ->
1299         insert_character = (t) ->
1300                 dest = adjusted_insertion_location()
1301                 # fixfull check for Document node
1302                 if dest[1] > 0
1303                         prev = dest[0].children[dest[1] - 1]
1304                         if prev.type is TYPE_TEXT
1305                                 prev.text += t.text
1306                                 return
1307                 dest[0].children.splice dest[1], 0, t
1308
1309
1310         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1311         process_token = (t) ->
1312                 acn = adjusted_current_node()
1313                 unless acn?
1314                         ins_mode t
1315                         return
1316                 if acn.namespace is NS_HTML
1317                         ins_mode t
1318                         return
1319                 if is_mathml_text_integration_point(acn)
1320                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1321                                 ins_mode t
1322                                 return
1323                         if t.type is TYPE_TEXT
1324                                 ins_mode t
1325                                 return
1326                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1327                         ins_mode t
1328                         return
1329                 if is_html_integration acn
1330                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1331                                 ins_mode t
1332                                 return
1333                 if t.type is TYPE_EOF
1334                         ins_mode t
1335                         return
1336                 in_foreign_content t
1337                 return
1338
1339         # 8.2.5.1
1340         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1342         adjusted_insertion_location = (override_target = null) ->
1343                 # 1. If there was an override target specified, then let target be the
1344                 # override target.
1345                 if override_target?
1346                         target = override_target
1347                 else # Otherwise, let target be the current node.
1348                         target = open_els[0]
1349                 # 2. Determine the adjusted insertion location using the first matching
1350                 # steps from the following list:
1351                 #
1352                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353                 # thead, or tr element Foster parenting happens when content is
1354                 # misnested in tables.
1355                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356                         loop # once. this is here so we can ``break`` to "abort these substeps"
1357                                 # 1. Let last template be the last template element in the
1358                                 # stack of open elements, if any.
1359                                 last_template = null
1360                                 last_template_i = null
1361                                 for el, i in open_els
1362                                         if el.name is 'template' and el.namespace is NS_HTML
1363                                                 last_template = el
1364                                                 last_template_i = i
1365                                                 break
1366                                 # 2. Let last table be the last table element in the stack of
1367                                 # open elements, if any.
1368                                 last_table = null
1369                                 last_table_i
1370                                 for el, i in open_els
1371                                         if el.name is 'table' and el.namespace is NS_HTML
1372                                                 last_table = el
1373                                                 last_table_i = i
1374                                                 break
1375                                 # 3. If there is a last template and either there is no last
1376                                 # table, or there is one, but last template is lower (more
1377                                 # recently added) than last table in the stack of open
1378                                 # elements, then: let adjusted insertion location be inside
1379                                 # last template's template contents, after its last child (if
1380                                 # any), and abort these substeps.
1381                                 if last_template and (last_table is null or last_template_i < last_table_i)
1382                                         target = last_template # fixfull should be it's contents
1383                                         target_i = target.children.length
1384                                         break
1385                                 # 4. If there is no last table, then let adjusted insertion
1386                                 # location be inside the first element in the stack of open
1387                                 # elements (the html element), after its last child (if any),
1388                                 # and abort these substeps. (fragment case)
1389                                 if last_table is null
1390                                         # this is odd
1391                                         target = open_els[open_els.length - 1]
1392                                         target_i = target.children.length
1393                                         break
1394                                 # 5. If last table has a parent element, then let adjusted
1395                                 # insertion location be inside last table's parent element,
1396                                 # immediately before last table, and abort these substeps.
1397                                 if last_table.parent?
1398                                         for c, i in last_table.parent.children
1399                                                 if c is last_table
1400                                                         target = last_table.parent
1401                                                         target_i = i
1402                                                         break
1403                                         break
1404                                 # 6. Let previous element be the element immediately above last
1405                                 # table in the stack of open elements.
1406                                 #
1407                                 # huh? how could it not have a parent?
1408                                 previous_element = open_els[last_table_i + 1]
1409                                 # 7. Let adjusted insertion location be inside previous
1410                                 # element, after its last child (if any).
1411                                 target = previous_element
1412                                 target_i = target.children.length
1413                                 # Note: These steps are involved in part because it's possible
1414                                 # for elements, the table element in this case in particular,
1415                                 # to have been moved by a script around in the DOM, or indeed
1416                                 # removed from the DOM entirely, after the element was inserted
1417                                 # by the parser.
1418                                 break # don't really loop
1419                 else
1420                         # Otherwise Let adjusted insertion location be inside target, after
1421                         # its last child (if any).
1422                         target_i = target.children.length
1423
1424                 # 3. If the adjusted insertion location is inside a template element,
1425                 # let it instead be inside the template element's template contents,
1426                 # after its last child (if any).
1427                 # fixfull (template)
1428
1429                 # 4. Return the adjusted insertion location.
1430                 return [target, target_i]
1431
1432         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433         # aka create_an_element_for_token
1434         token_to_element = (t, namespace, intended_parent) ->
1435                 # convert attributes into a hash
1436                 attrs = {}
1437                 for a in t.attrs_a
1438                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1439                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1440
1441                 # TODO 2. If the newly created element has an xmlns attribute in the
1442                 # XMLNS namespace whose value is not exactly the same as the element's
1443                 # namespace, that is a parse error. Similarly, if the newly created
1444                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445                 # value is not the XLink Namespace, that is a parse error.
1446
1447                 # fixfull: the spec says stuff about form pointers and ownerDocument
1448
1449                 return el
1450
1451         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1452         insert_foreign_element = (token, namespace) ->
1453                 ail = adjusted_insertion_location()
1454                 ail_el = ail[0]
1455                 ail_i = ail[1]
1456                 el = token_to_element token, namespace, ail_el
1457                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1458                 el.parent = ail_el
1459                 ail_el.children.splice ail_i, 0, el
1460                 open_els.unshift el
1461                 return el
1462         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1463         insert_html_element = (token) ->
1464                 insert_foreign_element token, NS_HTML
1465
1466         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467         # position should be [node, index_within_children]
1468         insert_comment = (t, position = null) ->
1469                 position ?= adjusted_insertion_location()
1470                 position[0].children.splice position[1], 0, t
1471
1472         # 8.2.5.2
1473         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1474         parse_generic_raw_text = (t) ->
1475                 insert_html_element t
1476                 tok_state = tok_state_rawtext
1477                 original_ins_mode = ins_mode
1478                 ins_mode = ins_mode_text
1479         parse_generic_rcdata_text = (t) ->
1480                 insert_html_element t
1481                 tok_state = tok_state_rcdata
1482                 original_ins_mode = ins_mode
1483                 ins_mode = ins_mode_text
1484
1485         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1487         generate_implied_end_tags = (except = null) ->
1488                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1489                         open_els.shift()
1490
1491         # 8.2.5.4 The rules for parsing tokens in HTML content
1492         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1493
1494         # 8.2.5.4.1 The "initial" insertion mode
1495         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1496         is_quirks_yes_doctype = (t) ->
1497                 if t.flag 'force-quirks'
1498                         return true
1499                 if t.name isnt 'html'
1500                         return true
1501                 if t.public_identifier?
1502                         pi = t.public_identifier.toLowerCase()
1503                         for p in quirks_yes_pi_prefixes
1504                                 if pi.substr(0, p.length) is p
1505                                         return true
1506                         if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1507                                 return true
1508                 if t.system_identifier?
1509                         if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1510                                 return true
1511                 else if t.public_identifier?
1512                         # already did this: pi = t.public_identifier.toLowerCase()
1513                         if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1514                                 return true
1515                 return false
1516         is_quirks_limited_doctype = (t) ->
1517                 if t.public_identifier?
1518                         pi = t.public_identifier.toLowerCase()
1519                         if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1520                                 return true
1521                         if t.system_identifier?
1522                                 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1523                                         return true
1524                 return false
1525         ins_mode_initial = (t) ->
1526                 if is_space_tok t
1527                         return
1528                 if t.type is TYPE_COMMENT
1529                         # ?fixfull
1530                         doc.children.push t
1531                         return
1532                 if t.type is TYPE_DOCTYPE
1533                         # fixfull syntax error from first paragraph and following bullets
1534                         # fixfull set doc.doctype
1535                         # fixfull is the "not an iframe srcdoc" thing relevant?
1536                         if is_quirks_yes_doctype t
1537                                 doc.flag 'quirks mode', QUIRKS_YES
1538                         else if is_quirks_limited_doctype t
1539                                 doc.flag 'quirks mode', QUIRKS_LIMITED
1540                         doc.children.push t
1541                         ins_mode = ins_mode_before_html
1542                         return
1543                 # Anything else
1544                 # fixfull not iframe srcdoc?
1545                 parse_error()
1546                 doc.flag 'quirks mode', QUIRKS_YES
1547                 ins_mode = ins_mode_before_html
1548                 process_token t
1549                 return
1550
1551         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1552         ins_mode_before_html = (t) ->
1553                 if t.type is TYPE_DOCTYPE
1554                         parse_error()
1555                         return
1556                 if t.type is TYPE_COMMENT
1557                         doc.children.push t
1558                         return
1559                 if is_space_tok t
1560                         return
1561                 if t.type is TYPE_START_TAG and t.name is 'html'
1562                         el = token_to_element t, NS_HTML, doc
1563                         doc.children.push el
1564                         open_els.unshift(el)
1565                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566                         ins_mode = ins_mode_before_head
1567                         return
1568                 if t.type is TYPE_END_TAG
1569                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570                                 # fall through to "anything else"
1571                         else
1572                                 parse_error()
1573                                 return
1574                 # Anything else
1575                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576                 doc.children.push el
1577                 el.parent = doc
1578                 open_els.unshift el
1579                 # ?fixfull browsing context
1580                 ins_mode = ins_mode_before_head
1581                 process_token t
1582                 return
1583
1584         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1585         ins_mode_before_head = (t) ->
1586                 if is_space_tok t
1587                         return
1588                 if t.type is TYPE_COMMENT
1589                         insert_comment t
1590                         return
1591                 if t.type is TYPE_DOCTYPE
1592                         parse_error()
1593                         return
1594                 if t.type is TYPE_START_TAG and t.name is 'html'
1595                         ins_mode_in_body t
1596                         return
1597                 if t.type is TYPE_START_TAG and t.name is 'head'
1598                         el = insert_html_element t
1599                         head_element_pointer = el
1600                         ins_mode = ins_mode_in_head
1601                         return
1602                 if t.type is TYPE_END_TAG
1603                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604                                 # fall through to Anything else below
1605                         else
1606                                 parse_error()
1607                                 return
1608                 # Anything else
1609                 el = insert_html_element new_open_tag 'head'
1610                 head_element_pointer = el
1611                 ins_mode = ins_mode_in_head
1612                 process_token t
1613
1614         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1615         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616                 open_els.shift() # spec says this will be a 'head' node
1617                 ins_mode = ins_mode_after_head
1618                 process_token t
1619         ins_mode_in_head = (t) ->
1620                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1621                         insert_character t
1622                         return
1623                 if t.type is TYPE_COMMENT
1624                         insert_comment t
1625                         return
1626                 if t.type is TYPE_DOCTYPE
1627                         parse_error()
1628                         return
1629                 if t.type is TYPE_START_TAG and t.name is 'html'
1630                         ins_mode_in_body t
1631                         return
1632                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633                         el = insert_html_element t
1634                         open_els.shift()
1635                         t.acknowledge_self_closing()
1636                         return
1637                 if t.type is TYPE_START_TAG and t.name is 'meta'
1638                         el = insert_html_element t
1639                         open_els.shift()
1640                         t.acknowledge_self_closing()
1641                         # fixfull encoding stuff
1642                         return
1643                 if t.type is TYPE_START_TAG and t.name is 'title'
1644                         parse_generic_rcdata_text t
1645                         return
1646                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647                         parse_generic_raw_text t
1648                         return
1649                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650                         insert_html_element t
1651                         ins_mode = ins_mode_in_head_noscript
1652                         return
1653                 if t.type is TYPE_START_TAG and t.name is 'script'
1654                         ail = adjusted_insertion_location()
1655                         el = token_to_element t, NS_HTML, ail
1656                         el.flag 'parser-inserted', true
1657                         # fixfull frament case
1658                         ail[0].children.splice ail[1], 0, el
1659                         open_els.unshift el
1660                         tok_state = tok_state_script_data
1661                         original_ins_mode = ins_mode # make sure orig... is defined
1662                         ins_mode = ins_mode_text
1663                         return
1664                 if t.type is TYPE_END_TAG and t.name is 'head'
1665                         open_els.shift() # will be a head element... spec says so
1666                         ins_mode = ins_mode_after_head
1667                         return
1668                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669                         ins_mode_in_head_else t
1670                         return
1671                 if t.type is TYPE_START_TAG and t.name is 'template'
1672                         insert_html_element t
1673                         afe_push_marker()
1674                         flag_frameset_ok = false
1675                         ins_mode = ins_mode_in_template
1676                         template_ins_modes.unshift ins_mode_in_template
1677                         return
1678                 if t.type is TYPE_END_TAG and t.name is 'template'
1679                         if template_tag_is_open()
1680                                 generate_implied_end_tags
1681                                 if open_els[0].name isnt 'template'
1682                                         parse_error()
1683                                 loop
1684                                         el = open_els.shift()
1685                                         if el.name is 'template' and el.namespace is NS_HTML
1686                                                 break
1687                                 clear_afe_to_marker()
1688                                 template_ins_modes.shift()
1689                                 reset_ins_mode()
1690                         else
1691                                 parse_error()
1692                         return
1693                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1694                         parse_error()
1695                         return
1696                 ins_mode_in_head_else t
1697
1698         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1699         ins_mode_in_head_noscript_else = (t) ->
1700                 parse_error()
1701                 open_els.shift()
1702                 ins_mode = ins_mode_in_head
1703                 process_token t
1704         ins_mode_in_head_noscript = (t) ->
1705                 if t.type is TYPE_DOCTYPE
1706                         parse_error()
1707                         return
1708                 if t.type is TYPE_START_TAG and t.name is 'html'
1709                         ins_mode_in_body t
1710                         return
1711                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1712                         open_els.shift()
1713                         ins_mode = ins_mode_in_head
1714                         return
1715                 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1716                         ins_mode_in_head t
1717                         return
1718                 if t.type is TYPE_END_TAG and t.name is 'br'
1719                         ins_mode_in_head_noscript_else t
1720                         return
1721                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1722                         parse_error()
1723                         return
1724                 # Anything else
1725                 ins_mode_in_head_noscript_else t
1726                 return
1727
1728
1729
1730         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1731         ins_mode_after_head_else = (t) ->
1732                 body_tok = new_open_tag 'body'
1733                 insert_html_element body_tok
1734                 ins_mode = ins_mode_in_body
1735                 process_token t
1736                 return
1737         ins_mode_after_head = (t) ->
1738                 if is_space_tok t
1739                         insert_character t
1740                         return
1741                 if t.type is TYPE_COMMENT
1742                         insert_comment t
1743                         return
1744                 if t.type is TYPE_DOCTYPE
1745                         parse_error()
1746                         return
1747                 if t.type is TYPE_START_TAG and t.name is 'html'
1748                         ins_mode_in_body t
1749                         return
1750                 if t.type is TYPE_START_TAG and t.name is 'body'
1751                         insert_html_element t
1752                         flag_frameset_ok = false
1753                         ins_mode = ins_mode_in_body
1754                         return
1755                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1756                         insert_html_element t
1757                         ins_mode = ins_mode_in_frameset
1758                         return
1759                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1760                         parse_error()
1761                         open_els.unshift head_element_pointer
1762                         ins_mode_in_head t
1763                         for el, i in open_els
1764                                 if el is head_element_pointer
1765                                         open_els.splice i, 1
1766                                         return
1767                         console.log "warning: 23904 couldn't find head element in open_els"
1768                         return
1769                 if t.type is TYPE_END_TAG and t.name is 'template'
1770                         ins_mode_in_head t
1771                         return
1772                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1773                         ins_mode_after_head_else t
1774                         return
1775                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1776                         parse_error()
1777                         return
1778                 # Anything else
1779                 ins_mode_after_head_else t
1780
1781         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1782         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1783                 node = open_els[0]
1784                 loop
1785                         if node.name is name and node.namespace is NS_HTML
1786                                 generate_implied_end_tags name # arg is exception
1787                                 unless node is open_els[0]
1788                                         parse_error()
1789                                 loop
1790                                         el = open_els.shift()
1791                                         if el is node
1792                                                 return
1793                         if special_elements[node.name] is node.namespace
1794                                 parse_error()
1795                                 return
1796                         for el, i in open_els
1797                                 if node is el
1798                                         node = open_els[i + 1]
1799                                         break
1800                 return
1801         ins_mode_in_body = (t) ->
1802                 if t.type is TYPE_TEXT and t.text is "\u0000"
1803                         parse_error()
1804                         return
1805                 if is_space_tok t
1806                         reconstruct_afe()
1807                         insert_character t
1808                         return
1809                 if t.type is TYPE_TEXT
1810                         reconstruct_afe()
1811                         insert_character t
1812                         flag_frameset_ok = false
1813                         return
1814                 if t.type is TYPE_COMMENT
1815                         insert_comment t
1816                         return
1817                 if t.type is TYPE_DOCTYPE
1818                         parse_error()
1819                         return
1820                 if t.type is TYPE_START_TAG and t.name is 'html'
1821                         parse_error()
1822                         return if template_tag_is_open()
1823                         root_attrs = open_els[open_els.length - 1].attrs
1824                         for a in t.attrs_a
1825                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1826                         return
1827
1828                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1829                         ins_mode_in_head t
1830                         return
1831                 if t.type is TYPE_START_TAG and t.name is 'body'
1832                         parse_error()
1833                         return if open_els.length < 2
1834                         second = open_els[open_els.length - 2]
1835                         return unless second.namespace is NS_HTML
1836                         return unless second.name is 'body'
1837                         return if template_tag_is_open()
1838                         flag_frameset_ok = false
1839                         for a in t.attrs_a
1840                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1841                         return
1842                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1843                         parse_error()
1844                         return if open_els.length < 2
1845                         second_i = open_els.length - 2
1846                         second = open_els[second_i]
1847                         return unless second.namespace is NS_HTML
1848                         return unless second.name is 'body'
1849                         if flag_frameset_ok is false
1850                                 return
1851                         if second.parent?
1852                                 for el, i in second.parent.children
1853                                         if el is second
1854                                                 second.parent.children.splice i, 1
1855                                                 break
1856                         open_els.splice second_i, 1
1857                         # pop everything except the "root html element"
1858                         while open_els.length > 1
1859                                 open_els.shift()
1860                         insert_html_element t
1861                         ins_mode = ins_mode_in_frameset
1862                         return
1863                 if t.type is TYPE_EOF
1864                         ok_tags = {
1865                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1866                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1867                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1868                         }
1869                         for el in open_els
1870                                 unless ok_tags[t.name] is el.namespace
1871                                         parse_error()
1872                                         break
1873                         if template_ins_modes.length > 0
1874                                 ins_mode_in_template t
1875                         else
1876                                 stop_parsing()
1877                         return
1878                 if t.type is TYPE_END_TAG and t.name is 'body'
1879                         unless is_in_scope 'body', NS_HTML
1880                                 parse_error()
1881                                 return
1882                         ok_tags = {
1883                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1887                                 html:NS_HTML
1888                         }
1889                         for el in open_els
1890                                 unless ok_tags[t.name] is el.namespace
1891                                         parse_error()
1892                                         break
1893                         ins_mode = ins_mode_after_body
1894                         return
1895                 if t.type is TYPE_END_TAG and t.name is 'html'
1896                         unless is_in_scope 'body', NS_HTML
1897                                 parse_error()
1898                                 return
1899                         ok_tags = {
1900                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1901                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1902                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1903                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1904                                 html:NS_HTML
1905                         }
1906                         for el in open_els
1907                                 unless ok_tags[t.name] is el.namespace
1908                                         parse_error()
1909                                         break
1910                         ins_mode = ins_mode_after_body
1911                         process_token t
1912                         return
1913                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1914                         close_p_if_in_button_scope()
1915                         insert_html_element t
1916                         return
1917                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1918                         close_p_if_in_button_scope()
1919                         if h_tags[open_els[0].name] is open_els[0].namespace
1920                                 parse_error()
1921                                 open_els.shift()
1922                         insert_html_element t
1923                         return
1924                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1925                         close_p_if_in_button_scope()
1926                         insert_html_element t
1927                         eat_next_token_if_newline()
1928                         flag_frameset_ok = false
1929                         return
1930                 if t.type is TYPE_START_TAG and t.name is 'form'
1931                         unless form_element_pointer is null or template_tag_is_open()
1932                                 parse_error()
1933                                 return
1934                         close_p_if_in_button_scope()
1935                         el = insert_html_element t
1936                         unless template_tag_is_open()
1937                                 form_element_pointer = el
1938                         return
1939                 if t.type is TYPE_START_TAG and t.name is 'li'
1940                         flag_frameset_ok = false
1941                         for node in open_els
1942                                 if node.name is 'li' and node.namespace is NS_HTML
1943                                         generate_implied_end_tags 'li' # arg is exception
1944                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1945                                                 parse_error()
1946                                         loop
1947                                                 el = open_els.shift()
1948                                                 if el.name is 'li' and el.namespace is NS_HTML
1949                                                         break
1950                                         break
1951                                 if el_is_special_not_adp node
1952                                                 break
1953                         close_p_if_in_button_scope()
1954                         insert_html_element t
1955                         return
1956                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1957                         flag_frameset_ok = false
1958                         for node in open_els
1959                                 if node.name is 'dd' and node.namespace is NS_HTML
1960                                         generate_implied_end_tags 'dd' # arg is exception
1961                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1962                                                 parse_error()
1963                                         loop
1964                                                 el = open_els.shift()
1965                                                 if el.name is 'dd' and el.namespace is NS_HTML
1966                                                         break
1967                                         break
1968                                 if node.name is 'dt' and node.namespace is NS_HTML
1969                                         generate_implied_end_tags 'dt' # arg is exception
1970                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1971                                                 parse_error()
1972                                         loop
1973                                                 el = open_els.shift()
1974                                                 if el.name is 'dt' and el.namespace is NS_HTML
1975                                                         break
1976                                         break
1977                                 if el_is_special_not_adp node
1978                                         break
1979                         close_p_if_in_button_scope()
1980                         insert_html_element t
1981                         return
1982                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1983                         close_p_if_in_button_scope()
1984                         insert_html_element t
1985                         tok_state = tok_state_plaintext
1986                         return
1987                 if t.type is TYPE_START_TAG and t.name is 'button'
1988                         if is_in_scope 'button', NS_HTML
1989                                 parse_error()
1990                                 generate_implied_end_tags()
1991                                 loop
1992                                         el = open_els.shift()
1993                                         if el.name is 'button' and el.namespace is NS_HTML
1994                                                 break
1995                         reconstruct_afe()
1996                         insert_html_element t
1997                         flag_frameset_ok = false
1998                         return
1999                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
2000                         unless is_in_scope t.name, NS_HTML
2001                                 parse_error()
2002                                 return
2003                         generate_implied_end_tags()
2004                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2005                                 parse_error()
2006                         loop
2007                                 el = open_els.shift()
2008                                 if el.name is t.name and el.namespace is NS_HTML
2009                                         return
2010                         return
2011                 if t.type is TYPE_END_TAG and t.name is 'form'
2012                         unless template_tag_is_open()
2013                                 node = form_element_pointer
2014                                 form_element_pointer = null
2015                                 if node is null or not el_is_in_scope node
2016                                         parse_error()
2017                                         return
2018                                 generate_implied_end_tags()
2019                                 if open_els[0] isnt node
2020                                         parse_error()
2021                                 for el, i in open_els
2022                                         if el is node
2023                                                 open_els.splice i, 1
2024                                                 break
2025                         else
2026                                 unless is_in_scope 'form', NS_HTML
2027                                         parse_error()
2028                                         return
2029                                 generate_implied_end_tags()
2030                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2031                                         parse_error()
2032                                 loop
2033                                         el = open_els.shift()
2034                                         if el.name is 'form' and el.namespace is NS_HTML
2035                                                 break
2036                         return
2037                 if t.type is TYPE_END_TAG and t.name is 'p'
2038                         unless is_in_button_scope 'p', NS_HTML
2039                                 parse_error()
2040                                 insert_html_element new_open_tag 'p'
2041                         close_p_element()
2042                         return
2043                 if t.type is TYPE_END_TAG and t.name is 'li'
2044                         unless is_in_li_scope 'li', NS_HTML
2045                                 parse_error()
2046                                 return
2047                         generate_implied_end_tags 'li' # arg is exception
2048                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2049                                 parse_error()
2050                         loop
2051                                 el = open_els.shift()
2052                                 if el.name is 'li' and el.namespace is NS_HTML
2053                                         break
2054                         return
2055                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2056                         unless is_in_scope t.name, NS_HTML
2057                                 parse_error()
2058                                 return
2059                         generate_implied_end_tags t.name # arg is exception
2060                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2061                                 parse_error()
2062                         loop
2063                                 el = open_els.shift()
2064                                 if el.name is t.name and el.namespace is NS_HTML
2065                                         break
2066                         return
2067                 if t.type is TYPE_END_TAG and h_tags[t.name]?
2068                         h_in_scope = false
2069                         for el in open_els
2070                                 if h_tags[el.name] is el.namespace
2071                                         h_in_scope = true
2072                                         break
2073                                 if standard_scopers[el.name] is el.namespace
2074                                         break
2075                         unless h_in_scope
2076                                 parse_error()
2077                                 return
2078                         generate_implied_end_tags()
2079                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2080                                 parse_error()
2081                         loop
2082                                 el = open_els.shift()
2083                                 if h_tags[el.name] is el.namespace
2084                                         break
2085                         return
2086                 # deep breath!
2087                 if t.type is TYPE_START_TAG and t.name is 'a'
2088                         # If the list of active formatting elements contains an a element
2089                         # between the end of the list and the last marker on the list (or
2090                         # the start of the list if there is no marker on the list), then
2091                         # this is a parse error; run the adoption agency algorithm for the
2092                         # tag name "a", then remove that element from the list of active
2093                         # formatting elements and the stack of open elements if the
2094                         # adoption agency algorithm didn't already remove it (it might not
2095                         # have if the element is not in table scope).
2096                         found = false
2097                         for el in afe
2098                                 if el.type is TYPE_AFE_MARKER
2099                                         break
2100                                 if el.name is 'a' and el.namespace is NS_HTML
2101                                         found = el
2102                         if found?
2103                                 parse_error()
2104                                 adoption_agency 'a'
2105                                 for el, i in afe
2106                                         if el is found
2107                                                 afe.splice i, 1
2108                                 for el, i in open_els
2109                                         if el is found
2110                                                 open_els.splice i, 1
2111                         reconstruct_afe()
2112                         el = insert_html_element t
2113                         afe_push el
2114                         return
2115                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2116                         reconstruct_afe()
2117                         el = insert_html_element t
2118                         afe_push el
2119                         return
2120                 if t.type is TYPE_START_TAG and t.name is 'nobr'
2121                         reconstruct_afe()
2122                         if is_in_scope 'nobr', NS_HTML
2123                                 parse_error()
2124                                 adoption_agency 'nobr'
2125                                 reconstruct_afe()
2126                         el = insert_html_element t
2127                         afe_push el
2128                         return
2129                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2130                         adoption_agency t.name
2131                         return
2132                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2133                         reconstruct_afe()
2134                         insert_html_element t
2135                         afe_push_marker()
2136                         flag_frameset_ok = false
2137                         return
2138                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2139                         unless is_in_scope t.name, NS_HTML
2140                                 parse_error()
2141                                 return
2142                         generate_implied_end_tags()
2143                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2144                                 parse_error()
2145                         loop
2146                                 el = open_els.shift()
2147                                 if el.name is t.name and el.namespace is NS_HTML
2148                                         break
2149                         clear_afe_to_marker()
2150                         return
2151                 if t.type is TYPE_START_TAG and t.name is 'table'
2152                         unless doc.flag('quirks mode') is QUIRKS_YES
2153                                 close_p_if_in_button_scope() # test
2154                         insert_html_element t
2155                         flag_frameset_ok = false
2156                         ins_mode = ins_mode_in_table
2157                         return
2158                 if t.type is TYPE_END_TAG and t.name is 'br'
2159                         parse_error()
2160                         t.type = TYPE_START_TAG
2161                         # fall through
2162                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2163                         reconstruct_afe()
2164                         insert_html_element t
2165                         open_els.shift()
2166                         t.acknowledge_self_closing()
2167                         flag_frameset_ok = false
2168                         return
2169                 if t.type is TYPE_START_TAG and t.name is 'input'
2170                         reconstruct_afe()
2171                         insert_html_element t
2172                         open_els.shift()
2173                         t.acknowledge_self_closing()
2174                         unless is_input_hidden_tok t
2175                                 flag_frameset_ok = false
2176                         return
2177                 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2178                         # WHATWG adds 'menuitem' for this block
2179                         insert_html_element t
2180                         open_els.shift()
2181                         t.acknowledge_self_closing()
2182                         return
2183                 if t.type is TYPE_START_TAG and t.name is 'hr'
2184                         close_p_if_in_button_scope()
2185                         insert_html_element t
2186                         open_els.shift()
2187                         t.acknowledge_self_closing()
2188                         flag_frameset_ok = false
2189                         return
2190                 if t.type is TYPE_START_TAG and t.name is 'image'
2191                         parse_error()
2192                         t.name = 'img'
2193                         process_token t
2194                         return
2195                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2196                         parse_error()
2197                         if template_tag_is_open() is false and form_element_pointer isnt null
2198                                 return
2199                         t.acknowledge_self_closing()
2200                         flag_frameset_ok = false
2201                         close_p_if_in_button_scope()
2202                         el = insert_html_element new_open_tag 'form'
2203                         unless template_tag_is_open()
2204                                 form_element_pointer = el
2205                         for a in t.attrs_a
2206                                 if a[0] is 'action'
2207                                         el.attrs['action'] = a[1]
2208                                         break
2209                         insert_html_element new_open_tag 'hr'
2210                         open_els.shift()
2211                         reconstruct_afe()
2212                         insert_html_element new_open_tag 'label'
2213                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2214                         input_el = new_open_tag 'input'
2215                         prompt = null
2216                         for a in t.attrs_a
2217                                 if a[0] is 'prompt'
2218                                         prompt = a[1]
2219                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2220                                         input_el.attrs_a.push [a[0], a[1]]
2221                         input_el.attrs_a.push ['name', 'isindex']
2222                         # fixfull this next bit is in english... internationalize?
2223                         prompt ?= "This is a searchable index. Enter search keywords: "
2224                         insert_character new_character_token prompt # fixfull split
2225                         # TODO submit typo "balue" in spec
2226                         insert_html_element input_el
2227                         open_els.shift()
2228                         # insert_character '' # you can put chars here if promt attr missing
2229                         open_els.shift()
2230                         insert_html_element new_open_tag 'hr'
2231                         open_els.shift()
2232                         open_els.shift()
2233                         unless template_tag_is_open()
2234                                 form_element_pointer = null
2235                         return
2236                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2237                         insert_html_element t
2238                         eat_next_token_if_newline()
2239                         tok_state = tok_state_rcdata
2240                         original_ins_mode = ins_mode
2241                         flag_frameset_ok = false
2242                         ins_mode = ins_mode_text
2243                         return
2244                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2245                         close_p_if_in_button_scope()
2246                         reconstruct_afe()
2247                         flag_frameset_ok = false
2248                         parse_generic_raw_text t
2249                         return
2250                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2251                         flag_frameset_ok = false
2252                         parse_generic_raw_text t
2253                         return
2254                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2255                         parse_generic_raw_text t
2256                         return
2257                 if t.type is TYPE_START_TAG and t.name is 'select'
2258                         reconstruct_afe()
2259                         insert_html_element t
2260                         flag_frameset_ok = false
2261                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2262                                 ins_mode = ins_mode_in_select_in_table
2263                         else
2264                                 ins_mode = ins_mode_in_select
2265                         return
2266                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2267                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2268                                 open_els.shift()
2269                         reconstruct_afe()
2270                         insert_html_element t
2271                         return
2272 # this comment block implements the W3C spec
2273 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2274 #                       if is_in_scope 'ruby', NS_HTML
2275 #                               generate_implied_end_tags()
2276 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2277 #                                       parse_error()
2278 #                       insert_html_element t
2279 #                       return
2280 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2281 #                       if is_in_scope 'ruby', NS_HTML
2282 #                               generate_implied_end_tags 'rtc' # arg is exception
2283 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2284 #                                       parse_error()
2285 #                       insert_html_element t
2286 #                       return
2287 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2288                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2289                         if is_in_scope 'ruby', NS_HTML
2290                                 generate_implied_end_tags()
2291                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2292                                         parse_error()
2293                         insert_html_element t
2294                         return
2295                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2296                         if is_in_scope 'ruby', NS_HTML
2297                                 generate_implied_end_tags 'rtc'
2298                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2299                                         parse_error()
2300                         insert_html_element t
2301                         return
2302 # end WHATWG chunk
2303                 if t.type is TYPE_START_TAG and t.name is 'math'
2304                         reconstruct_afe()
2305                         adjust_mathml_attributes t
2306                         adjust_foreign_attributes t
2307                         insert_foreign_element t, NS_MATHML
2308                         if t.flag 'self-closing'
2309                                 open_els.shift()
2310                                 t.acknowledge_self_closing()
2311                         return
2312                 if t.type is TYPE_START_TAG and t.name is 'svg'
2313                         reconstruct_afe()
2314                         adjust_svg_attributes t
2315                         adjust_foreign_attributes t
2316                         insert_foreign_element t, NS_SVG
2317                         if t.flag 'self-closing'
2318                                 open_els.shift()
2319                                 t.acknowledge_self_closing()
2320                         return
2321                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2322                         parse_error()
2323                         return
2324                 if t.type is TYPE_START_TAG # any other start tag
2325                         reconstruct_afe()
2326                         insert_html_element t
2327                         return
2328                 if t.type is TYPE_END_TAG # any other end tag
2329                         in_body_any_other_end_tag t.name
2330                         return
2331                 return
2332
2333         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2334         ins_mode_text = (t) ->
2335                 if t.type is TYPE_TEXT
2336                         insert_character t
2337                         return
2338                 if t.type is TYPE_EOF
2339                         parse_error()
2340                         if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2341                                 open_els[0].flag 'already started', true
2342                         open_els.shift()
2343                         ins_mode = original_ins_mode
2344                         process_token t
2345                         return
2346                 if t.type is TYPE_END_TAG and t.name is 'script'
2347                         open_els.shift()
2348                         ins_mode = original_ins_mode
2349                         # fixfull the spec seems to assume that I'm going to run the script
2350                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2351                         return
2352                 if t.type is TYPE_END_TAG
2353                         open_els.shift()
2354                         ins_mode = original_ins_mode
2355                         return
2356                 console.log 'warning: end of ins_mode_text reached'
2357
2358         # the functions below implement the tokenizer stats described here:
2359         # http://www.w3.org/TR/html5/syntax.html#tokenization
2360
2361         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2362         ins_mode_in_table_else = (t) ->
2363                 parse_error()
2364                 flag_foster_parenting = true
2365                 ins_mode_in_body t
2366                 flag_foster_parenting = false
2367                 return
2368         ins_mode_in_table = (t) ->
2369                 switch t.type
2370                         when TYPE_TEXT
2371                                 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2372                                         pending_table_character_tokens = []
2373                                         original_ins_mode = ins_mode
2374                                         ins_mode = ins_mode_in_table_text
2375                                         process_token t
2376                                 else
2377                                         ins_mode_in_table_else t
2378                         when TYPE_COMMENT
2379                                 insert_comment t
2380                         when TYPE_DOCTYPE
2381                                 parse_error()
2382                         when TYPE_START_TAG
2383                                 switch t.name
2384                                         when 'caption'
2385                                                 clear_stack_to_table_context()
2386                                                 afe_push_marker()
2387                                                 insert_html_element t
2388                                                 ins_mode = ins_mode_in_caption
2389                                         when 'colgroup'
2390                                                 clear_stack_to_table_context()
2391                                                 insert_html_element t
2392                                                 ins_mode = ins_mode_in_column_group
2393                                         when 'col'
2394                                                 clear_stack_to_table_context()
2395                                                 insert_html_element new_open_tag 'colgroup'
2396                                                 ins_mode = ins_mode_in_column_group
2397                                                 process_token t
2398                                         when 'tbody', 'tfoot', 'thead'
2399                                                 clear_stack_to_table_context()
2400                                                 insert_html_element t
2401                                                 ins_mode = ins_mode_in_table_body
2402                                         when 'td', 'th', 'tr'
2403                                                 clear_stack_to_table_context()
2404                                                 insert_html_element new_open_tag 'tbody'
2405                                                 ins_mode = ins_mode_in_table_body
2406                                                 process_token t
2407                                         when 'table'
2408                                                 parse_error()
2409                                                 if is_in_table_scope 'table', NS_HTML
2410                                                         loop
2411                                                                 el = open_els.shift()
2412                                                                 if el.name is 'table' and el.namespace is NS_HTML
2413                                                                         break
2414                                                         reset_ins_mode()
2415                                                         process_token t
2416                                         when 'style', 'script', 'template'
2417                                                 ins_mode_in_head t
2418                                         when 'input'
2419                                                 unless is_input_hidden_tok t
2420                                                         ins_mode_in_table_else t
2421                                                 else
2422                                                         parse_error()
2423                                                         el = insert_html_element t
2424                                                         open_els.shift()
2425                                                         t.acknowledge_self_closing()
2426                                         when 'form'
2427                                                 parse_error()
2428                                                 if form_element_pointer?
2429                                                         return
2430                                                 if template_tag_is_open()
2431                                                         return
2432                                                 form_element_pointer = insert_html_element t
2433                                                 open_els.shift()
2434                                         else
2435                                                 ins_mode_in_table_else t
2436                         when TYPE_END_TAG
2437                                 switch t.name
2438                                         when 'table'
2439                                                 if is_in_table_scope 'table', NS_HTML
2440                                                         loop
2441                                                                 el = open_els.shift()
2442                                                                 if el.name is 'table' and el.namespace is NS_HTML
2443                                                                         break
2444                                                         reset_ins_mode()
2445                                                 else
2446                                                         parse_error()
2447                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2448                                                 parse_error()
2449                                         when 'template'
2450                                                 ins_mode_in_head t
2451                                         else
2452                                                 ins_mode_in_table_else t
2453                         when TYPE_EOF
2454                                 ins_mode_in_body t
2455                         else
2456                                 ins_mode_in_table_else t
2457
2458
2459         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2460         ins_mode_in_table_text = (t) ->
2461                 if t.type is TYPE_TEXT and t.text is "\u0000"
2462                         # from javascript?
2463                         parse_error()
2464                         return
2465                 if t.type is TYPE_TEXT
2466                         pending_table_character_tokens.push t
2467                         return
2468                 # Anything else
2469                 all_space = true
2470                 for old in pending_table_character_tokens
2471                         unless is_space_tok old
2472                                 all_space = false
2473                                 break
2474                 if all_space
2475                         for old in pending_table_character_tokens
2476                                 insert_character old
2477                 else
2478                         for old in pending_table_character_tokens
2479                                 ins_mode_in_table_else old
2480                 pending_table_character_tokens = []
2481                 ins_mode = original_ins_mode
2482                 process_token t
2483
2484         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2485         ins_mode_in_caption = (t) ->
2486                 if t.type is TYPE_END_TAG and t.name is 'caption'
2487                         if is_in_table_scope 'caption', NS_HTML
2488                                 generate_implied_end_tags()
2489                                 if open_els[0].name isnt 'caption'
2490                                         parse_error()
2491                                 loop
2492                                         el = open_els.shift()
2493                                         if el.name is 'caption' and el.namespace is NS_HTML
2494                                                 break
2495                                 clear_afe_to_marker()
2496                                 ins_mode = ins_mode_in_table
2497                         else
2498                                 parse_error()
2499                                 # fragment case
2500                         return
2501                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2502                         parse_error()
2503                         if is_in_table_scope 'caption', NS_HTML
2504                                 loop
2505                                         el = open_els.shift()
2506                                         if el.name is 'caption' and el.namespace is NS_HTML
2507                                                 break
2508                                 clear_afe_to_marker()
2509                                 ins_mode = ins_mode_in_table
2510                                 process_token t
2511                         # else fragment case
2512                         return
2513                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2514                         parse_error()
2515                         return
2516                 # Anything else
2517                 ins_mode_in_body t
2518
2519         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2520         ins_mode_in_column_group = (t) ->
2521                 if is_space_tok t
2522                         insert_character t
2523                         return
2524                 if t.type is TYPE_COMMENT
2525                         insert_comment t
2526                         return
2527                 if t.type is TYPE_DOCTYPE
2528                         parse_error()
2529                         return
2530                 if t.type is TYPE_START_TAG and t.name is 'html'
2531                         ins_mode_in_body t
2532                         return
2533                 if t.type is TYPE_START_TAG and t.name is 'col'
2534                         el = insert_html_element t
2535                         open_els.shift()
2536                         t.acknowledge_self_closing()
2537                         return
2538                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2539                         if open_els[0].name is 'colgroup' and open_els.namespace is NS_HTML
2540                                 open_els.shift()
2541                                 ins_mode = ins_mode_in_table
2542                         else
2543                                 parse_error()
2544                         return
2545                 if t.type is TYPE_END_TAG and t.name is 'col'
2546                         parse_error()
2547                         return
2548                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2549                         ins_mode_in_head t
2550                         return
2551                 if t.type is TYPE_EOF
2552                         ins_mode_in_body t
2553                         return
2554                 # Anything else
2555                 if open_els[0].name isnt 'colgroup'
2556                         parse_error()
2557                         return
2558                 open_els.shift()
2559                 ins_mode = ins_mode_in_table
2560                 process_token t
2561                 return
2562
2563         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2564         ins_mode_in_table_body = (t) ->
2565                 if t.type is TYPE_START_TAG and t.name is 'tr'
2566                         clear_stack_to_table_body_context()
2567                         insert_html_element t
2568                         ins_mode = ins_mode_in_row
2569                         return
2570                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2571                         parse_error()
2572                         clear_stack_to_table_body_context()
2573                         insert_html_element new_open_tag 'tr'
2574                         ins_mode = ins_mode_in_row
2575                         process_token t
2576                         return
2577                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2578                         unless is_in_table_scope t.name, NS_HTML
2579                                 parse_error()
2580                                 return
2581                         clear_stack_to_table_body_context()
2582                         open_els.shift()
2583                         ins_mode = ins_mode_in_table
2584                         return
2585                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2586                         has = false
2587                         for el in open_els
2588                                 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2589                                         has = true
2590                                         break
2591                                 if table_scopers[el.name] is el.namespace
2592                                         break
2593                         if !has
2594                                 parse_error()
2595                                 return
2596                         clear_stack_to_table_body_context()
2597                         open_els.shift()
2598                         ins_mode = ins_mode_in_table
2599                         process_token t
2600                         return
2601                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2602                         parse_error()
2603                         return
2604                 # Anything else
2605                 ins_mode_in_table t
2606
2607         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2608         ins_mode_in_row = (t) ->
2609                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2610                         clear_stack_to_table_row_context()
2611                         insert_html_element t
2612                         ins_mode = ins_mode_in_cell
2613                         afe_push_marker()
2614                         return
2615                 if t.type is TYPE_END_TAG and t.name is 'tr'
2616                         if is_in_table_scope 'tr', NS_HTML
2617                                 clear_stack_to_table_row_context()
2618                                 open_els.shift()
2619                                 ins_mode = ins_mode_in_table_body
2620                         else
2621                                 parse_error()
2622                         return
2623                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2624                         if is_in_table_scope 'tr', NS_HTML
2625                                 clear_stack_to_table_row_context()
2626                                 open_els.shift()
2627                                 ins_mode = ins_mode_in_table_body
2628                                 process_token t
2629                         else
2630                                 parse_error()
2631                         return
2632                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2633                         if is_in_table_scope t.name, NS_HTML
2634                                 if is_in_table_scope 'tr', NS_HTML
2635                                         clear_stack_to_table_row_context()
2636                                         open_els.shift()
2637                                         ins_mode = ins_mode_in_table_body
2638                                         process_token t
2639                         else
2640                                 parse_error()
2641                         return
2642                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2643                         parse_error()
2644                         return
2645                 # Anything else
2646                 ins_mode_in_table t
2647
2648         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2649         close_the_cell = ->
2650                 generate_implied_end_tags()
2651                 unless (open_els[0].name is 'td' or open_els[0] is 'th') and open_els[0].namespace is NS_HTML
2652                         parse_error()
2653                 loop
2654                         el = open_els.shift()
2655                         if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2656                                 break
2657                 clear_afe_to_marker()
2658                 ins_mode = ins_mode_in_row
2659
2660         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2661         ins_mode_in_cell = (t) ->
2662                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2663                         if is_in_table_scope t.name, NS_HTML
2664                                 generate_implied_end_tags()
2665                                 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2666                                         parse_error()
2667                                 loop
2668                                         el = open_els.shift()
2669                                         if el.name is t.name and el.namespace is NS_HTML
2670                                                 break
2671                                 clear_afe_to_marker()
2672                                 ins_mode = ins_mode_in_row
2673                         else
2674                                 parse_error()
2675                         return
2676                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2677                         has = false
2678                         for el in open_els
2679                                 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2680                                         has = true
2681                                         break
2682                                 if table_scopers[el.name] is el.namespace
2683                                         break
2684                         if !has
2685                                 parse_error()
2686                                 return
2687                         close_the_cell()
2688                         process_token t
2689                         return
2690                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2691                         parse_error()
2692                         return
2693                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2694                         if is_in_table_scope t.name, NS_HTML
2695                                 close_the_cell()
2696                                 process_token t
2697                         else
2698                                 parse_error()
2699                         return
2700                 # Anything Else
2701                 ins_mode_in_body t
2702
2703         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2704         ins_mode_in_select = (t) ->
2705                 if t.type is TYPE_TEXT and t.text is "\u0000"
2706                         parse_error()
2707                         return
2708                 if t.type is TYPE_TEXT
2709                         insert_character t
2710                         return
2711                 if t.type is TYPE_COMMENT
2712                         insert_comment t
2713                         return
2714                 if t.type is TYPE_DOCTYPE
2715                         parse_error()
2716                         return
2717                 if t.type is TYPE_START_TAG and t.name is 'html'
2718                         ins_mode_in_body t
2719                         return
2720                 if t.type is TYPE_START_TAG and t.name is 'option'
2721                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2722                                 open_els.shift()
2723                         insert_html_element t
2724                         return
2725                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2726                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2727                                 open_els.shift()
2728                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2729                                 open_els.shift()
2730                         insert_html_element t
2731                         return
2732                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2733                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2734                                 if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
2735                                         open_els.shift()
2736                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2737                                 open_els.shift()
2738                         else
2739                                 parse_error()
2740                         return
2741                 if t.type is TYPE_END_TAG and t.name is 'option'
2742                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2743                                 open_els.shift()
2744                         else
2745                                 parse_error()
2746                         return
2747                 if t.type is TYPE_END_TAG and t.name is 'select'
2748                         if is_in_select_scope 'select', NS_HTML
2749                                 loop
2750                                         el = open_els.shift()
2751                                         if el.name is 'select' and el.namespace is NS_HTML
2752                                                 break
2753                                 reset_ins_mode()
2754                         else
2755                                 parse_error()
2756                         return
2757                 if t.type is TYPE_START_TAG and t.name is 'select'
2758                         parse_error()
2759                         loop
2760                                 el = open_els.shift()
2761                                 if el.name is 'select' and el.namespace is NS_HTML
2762                                         break
2763                         reset_ins_mode()
2764                         # spec says that this is the same as </select> but it doesn't say
2765                         # to check scope first
2766                         return
2767                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2768                         parse_error()
2769                         unless is_in_select_scope 'select', NS_HTML
2770                                 return
2771                         loop
2772                                 el = open_els.shift()
2773                                 if el.name is 'select' and el.namespace is NS_HTML
2774                                         break
2775                         reset_ins_mode()
2776                         process_token t
2777                         return
2778                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2779                         ins_mode_in_head t
2780                         return
2781                 if t.type is TYPE_EOF
2782                         ins_mode_in_body t
2783                         return
2784                 # Anything else
2785                 parse_error()
2786                 return
2787
2788         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2789         ins_mode_in_select_in_table = (t) ->
2790                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2791                         parse_error()
2792                         loop
2793                                 el = open_els.shift()
2794                                 if el.name is 'select' and el.namespace is NS_HTML
2795                                         break
2796                         reset_ins_mode()
2797                         process_token t
2798                         return
2799                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2800                         parse_error()
2801                         unless is_in_table_scope t.name, NS_HTML
2802                                 return
2803                         loop
2804                                 el = open_els.shift()
2805                                 if el.name is 'select' and el.namespace is NS_HTML
2806                                         break
2807                         reset_ins_mode()
2808                         process_token t
2809                         return
2810                 # Anything else
2811                 ins_mode_in_select t
2812                 return
2813
2814         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2815         ins_mode_in_template = (t) ->
2816                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2817                         ins_mode_in_body t
2818                         return
2819                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2820                         ins_mode_in_head t
2821                         return
2822                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2823                         template_ins_modes.shift()
2824                         template_ins_modes.unshift ins_mode_in_table
2825                         ins_mode = ins_mode_in_table
2826                         process_token t
2827                         return
2828                 if t.type is TYPE_START_TAG and t.name is 'col'
2829                         template_ins_modes.shift()
2830                         template_ins_modes.unshift ins_mode_in_column_group
2831                         ins_mode = ins_mode_in_column_group
2832                         process_token t
2833                         return
2834                 if t.type is TYPE_START_TAG and t.name is 'tr'
2835                         template_ins_modes.shift()
2836                         template_ins_modes.unshift ins_mode_in_table_body
2837                         ins_mode = ins_mode_in_table_body
2838                         process_token t
2839                         return
2840                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2841                         template_ins_modes.shift()
2842                         template_ins_modes.unshift ins_mode_in_row
2843                         ins_mode = ins_mode_in_row
2844                         process_token t
2845                         return
2846                 if t.type is TYPE_START_TAG
2847                         template_ins_modes.shift()
2848                         template_ins_modes.unshift ins_mode_in_body
2849                         ins_mode = ins_mode_in_body
2850                         process_token t
2851                         return
2852                 if t.type is TYPE_END_TAG
2853                         parse_error()
2854                         return
2855                 if t.type is TYPE_EOF
2856                         unless template_tag_is_open()
2857                                 stop_parsing()
2858                                 return
2859                         parse_error()
2860                         loop
2861                                 el = open_els.shift()
2862                                 if el.name is 'template' and el.namespace is NS_HTML
2863                                         break
2864                         clear_afe_to_marker()
2865                         template_ins_modes.shift()
2866                         reset_ins_mode()
2867                         process_token t
2868
2869         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2870         ins_mode_after_body = (t) ->
2871                 if is_space_tok t
2872                         ins_mode_in_body t
2873                         return
2874                 if t.type is TYPE_COMMENT
2875                         first = open_els[open_els.length - 1]
2876                         insert_comment t, [first, first.children.length]
2877                         return
2878                 if t.type is TYPE_DOCTYPE
2879                         parse_error()
2880                         return
2881                 if t.type is TYPE_START_TAG and t.name is 'html'
2882                         ins_mode_in_body t
2883                         return
2884                 if t.type is TYPE_END_TAG and t.name is 'html'
2885                         if flag_fragment_parsing
2886                                 parse_error()
2887                                 return
2888                         ins_mode = ins_mode_after_after_body
2889                         return
2890                 if t.type is TYPE_EOF
2891                         stop_parsing()
2892                         return
2893                 # Anything ELse
2894                 parse_error()
2895                 ins_mode = ins_mode_in_body
2896                 process_token t
2897
2898         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2899         ins_mode_in_frameset = (t) ->
2900                 if is_space_tok t
2901                         insert_character t
2902                         return
2903                 if t.type is TYPE_COMMENT
2904                         insert_comment t
2905                         return
2906                 if t.type is TYPE_DOCTYPE
2907                         parse_error()
2908                         return
2909                 if t.type is TYPE_START_TAG and t.name is 'html'
2910                         ins_mode_in_body t
2911                         return
2912                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2913                         insert_html_element t
2914                         return
2915                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2916                         if open_els.length is 1
2917                                 parse_error()
2918                                 return # fragment case
2919                         open_els.shift()
2920                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2921                                 ins_mode = ins_mode_after_frameset
2922                         return
2923                 if t.type is TYPE_START_TAG and t.name is 'frame'
2924                         insert_html_element t
2925                         open_els.shift()
2926                         t.acknowledge_self_closing()
2927                         return
2928                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2929                         ins_mode_in_head t
2930                         return
2931                 if t.type is TYPE_EOF
2932                         if open_els.length isnt 1
2933                                 parse_error()
2934                         stop_parsing()
2935                         return
2936                 # Anything else
2937                 parse_error()
2938                 return
2939
2940         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2941         ins_mode_after_frameset = (t) ->
2942                 if is_space_tok t
2943                         insert_character t
2944                         return
2945                 if t.type is TYPE_COMMENT
2946                         insert_comment t
2947                         return
2948                 if t.type is TYPE_DOCTYPE
2949                         parse_error()
2950                         return
2951                 if t.type is TYPE_START_TAG and t.name is 'html'
2952                         ins_mode_in_body t
2953                         return
2954                 if t.type is TYPE_END_TAG and t.name is 'html'
2955                         ins_mode = ins_mode_after_after_frameset
2956                         return
2957                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2958                         ins_mode_in_head t
2959                         return
2960                 if t.type is TYPE_EOF
2961                         stop_parsing()
2962                         return
2963                 # Anything else
2964                 parse_error()
2965                 return
2966
2967         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2968         ins_mode_after_after_body = (t) ->
2969                 if t.type is TYPE_COMMENT
2970                         insert_comment t, [doc, doc.children.length]
2971                         return
2972                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2973                         ins_mode_in_body t
2974                         return
2975                 if t.type is TYPE_EOF
2976                         stop_parsing()
2977                         return
2978                 # Anything else
2979                 parse_error()
2980                 ins_mode = ins_mode_in_body
2981                 process_token t
2982                 return
2983
2984         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2985         ins_mode_after_after_frameset = (t) ->
2986                 if t.type is TYPE_COMMENT
2987                         insert_comment t, [doc, doc.children.length]
2988                         return
2989                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2990                         ins_mode_in_body t
2991                         return
2992                 if t.type is TYPE_EOF
2993                         stop_parsing()
2994                         return
2995                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2996                         ins_mode_in_head t
2997                         return
2998                 # Anything else
2999                 parse_error()
3000                 return
3001
3002         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
3003         has_color_face_or_size = (t) ->
3004                 for a in t.attrs_a
3005                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
3006                                 return true
3007                 return false
3008         in_foreign_content_end_script = ->
3009                 open_els.shift()
3010                 # fixfull
3011                 return
3012         in_foreign_content_other_start = (t) ->
3013                 acn = adjusted_current_node()
3014                 if acn.namespace is NS_MATHML
3015                         adjust_mathml_attributes t
3016                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3017                         t.name = svg_name_fixes[t.name]
3018                 if acn.namespace is NS_SVG
3019                         adjust_svg_attributes t
3020                 adjust_foreign_attributes t
3021                 insert_foreign_element t, acn.namespace
3022                 if t.flag 'self-closing'
3023                         if t.name is 'script'
3024                                 t.acknowledge_self_closing()
3025                                 in_foreign_content_end_script()
3026                                 # fixfull
3027                         else
3028                                 open_els.shift()
3029                                 t.acknowledge_self_closing()
3030                 return
3031         in_foreign_content = (t) ->
3032                 if t.type is TYPE_TEXT and t.text is "\u0000"
3033                         parse_error()
3034                         insert_character new_character_token "\ufffd"
3035                         return
3036                 if is_space_tok t
3037                         insert_character t
3038                         return
3039                 if t.type is TYPE_TEXT
3040                         flag_frameset_ok = false
3041                         insert_character t
3042                         return
3043                 if t.type is TYPE_COMMENT
3044                         insert_comment t
3045                         return
3046                 if t.type is TYPE_DOCTYPE
3047                         parse_error()
3048                         return
3049                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3050                         parse_error()
3051                         if flag_fragment_parsing
3052                                 in_foreign_content_other_start t
3053                                 return
3054                         loop # is this safe?
3055                                 open_els.shift()
3056                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3057                                         break
3058                         process_token t
3059                         return
3060                 if t.type is TYPE_START_TAG
3061                         in_foreign_content_other_start t
3062                         return
3063                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3064                         in_foreign_content_end_script()
3065                         return
3066                 if t.type is TYPE_END_TAG
3067                         i = 0
3068                         node = open_els[i]
3069                         if node.name.toLowerCase() isnt t.name
3070                                 parse_error()
3071                         loop
3072                                 if node is open_els[open_els.length - 1]
3073                                         return
3074                                 if node.name.toLowerCase() is t.name
3075                                         loop
3076                                                 el = open_els.shift()
3077                                                 if el is node
3078                                                         return
3079                                 i += 1
3080                                 node = open_els[i]
3081                                 if node.namespace is NS_HTML
3082                                         break
3083                         ins_mode t # explicitly call HTML insertion mode
3084
3085
3086         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3087         tok_state_data = ->
3088                 switch c = txt.charAt(cur++)
3089                         when '&'
3090                                 return new_text_node parse_character_reference()
3091                         when '<'
3092                                 tok_state = tok_state_tag_open
3093                         when "\u0000"
3094                                 parse_error()
3095                                 return new_text_node "\ufffd"
3096                         when '' # EOF
3097                                 return new_eof_token()
3098                         else
3099                                 return new_text_node c
3100                 return null
3101
3102         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3103         # not needed: tok_state_character_reference_in_data = ->
3104         # just call parse_character_reference()
3105
3106         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3107         tok_state_rcdata = ->
3108                 switch c = txt.charAt(cur++)
3109                         when '&'
3110                                 return new_text_node parse_character_reference()
3111                         when '<'
3112                                 tok_state = tok_state_rcdata_less_than_sign
3113                         when "\u0000"
3114                                 parse_error()
3115                                 return new_character_token "\ufffd"
3116                         when '' # EOF
3117                                 return new_eof_token()
3118                         else
3119                                 return new_character_token c
3120                 return null
3121
3122         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3123         # not needed: tok_state_character_reference_in_rcdata = ->
3124         # just call parse_character_reference()
3125
3126         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3127         tok_state_rawtext = ->
3128                 switch c = txt.charAt(cur++)
3129                         when '<'
3130                                 tok_state = tok_state_rawtext_less_than_sign
3131                         when "\u0000"
3132                                 parse_error()
3133                                 return new_character_token "\ufffd"
3134                         when '' # EOF
3135                                 return new_eof_token()
3136                         else
3137                                 return new_character_token c
3138                 return null
3139
3140         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3141         tok_state_script_data = ->
3142                 switch c = txt.charAt(cur++)
3143                         when '<'
3144                                 tok_state = tok_state_script_data_less_than_sign
3145                         when "\u0000"
3146                                 parse_error()
3147                                 return new_character_token "\ufffd"
3148                         when '' # EOF
3149                                 return new_eof_token()
3150                         else
3151                                 return new_character_token c
3152                 return null
3153
3154         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3155         tok_state_plaintext = ->
3156                 switch c = txt.charAt(cur++)
3157                         when "\u0000"
3158                                 parse_error()
3159                                 return new_character_token "\ufffd"
3160                         when '' # EOF
3161                                 return new_eof_token()
3162                         else
3163                                 return new_character_token c
3164                 return null
3165
3166
3167         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3168         tok_state_tag_open = ->
3169                 c = txt.charAt(cur++)
3170                 if c is '!'
3171                         tok_state = tok_state_markup_declaration_open
3172                         return
3173                 if c is '/'
3174                         tok_state = tok_state_end_tag_open
3175                         return
3176                 if is_uc_alpha(c)
3177                         tok_cur_tag = new_open_tag c.toLowerCase()
3178                         tok_state = tok_state_tag_name
3179                         return
3180                 if is_lc_alpha(c)
3181                         tok_cur_tag = new_open_tag c
3182                         tok_state = tok_state_tag_name
3183                         return
3184                 if c is '?'
3185                         parse_error()
3186                         tok_cur_tag = new_comment_token '?' # FIXME right?
3187                         tok_state = tok_state_bogus_comment
3188                         return
3189                 # Anything else
3190                 parse_error()
3191                 tok_state = tok_state_data
3192                 cur -= 1 # we didn't parse/handle the char after <
3193                 return new_text_node '<'
3194
3195         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3196         tok_state_end_tag_open = ->
3197                 c = txt.charAt(cur++)
3198                 if is_uc_alpha(c)
3199                         tok_cur_tag = new_end_tag c.toLowerCase()
3200                         tok_state = tok_state_tag_name
3201                         return
3202                 if is_lc_alpha(c)
3203                         tok_cur_tag = new_end_tag c
3204                         tok_state = tok_state_tag_name
3205                         return
3206                 if c is '>'
3207                         parse_error()
3208                         tok_state = tok_state_data
3209                         return
3210                 if c is '' # EOF
3211                         parse_error()
3212                         tok_state = tok_state_data
3213                         return new_text_node '</'
3214                 # Anything else
3215                 parse_error()
3216                 tok_cur_tag = new_comment_token c
3217                 tok_state = tok_state_bogus_comment
3218                 return null
3219
3220         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3221         tok_state_tag_name = ->
3222                 switch c = txt.charAt(cur++)
3223                         when "\t", "\n", "\u000c", ' '
3224                                 tok_state = tok_state_before_attribute_name
3225                         when '/'
3226                                 tok_state = tok_state_self_closing_start_tag
3227                         when '>'
3228                                 tok_state = tok_state_data
3229                                 tmp = tok_cur_tag
3230                                 tok_cur_tag = null
3231                                 return tmp
3232                         when "\u0000"
3233                                 parse_error()
3234                                 tok_cur_tag.name += "\ufffd"
3235                         when '' # EOF
3236                                 parse_error()
3237                                 tok_state = tok_state_data
3238                         else
3239                                 if is_uc_alpha(c)
3240                                         tok_cur_tag.name += c.toLowerCase()
3241                                 else
3242                                         tok_cur_tag.name += c
3243                 return null
3244
3245         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3246         tok_state_rcdata_less_than_sign = ->
3247                 c = txt.charAt(cur++)
3248                 if c is '/'
3249                         temporary_buffer = ''
3250                         tok_state = tok_state_rcdata_end_tag_open
3251                         return null
3252                 # Anything else
3253                 tok_state = tok_state_rcdata
3254                 cur -= 1 # reconsume the input character
3255                 return new_character_token '<'
3256
3257         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3258         tok_state_rcdata_end_tag_open = ->
3259                 c = txt.charAt(cur++)
3260                 if is_uc_alpha(c)
3261                         tok_cur_tag = new_end_tag c.toLowerCase()
3262                         temporary_buffer += c
3263                         tok_state = tok_state_rcdata_end_tag_name
3264                         return null
3265                 if is_lc_alpha(c)
3266                         tok_cur_tag = new_end_tag c
3267                         temporary_buffer += c
3268                         tok_state = tok_state_rcdata_end_tag_name
3269                         return null
3270                 # Anything else
3271                 tok_state = tok_state_rcdata
3272                 cur -= 1 # reconsume the input character
3273                 return new_character_token "</" # fixfull separate these
3274
3275         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3276         is_appropriate_end_tag = (t) ->
3277                 # spec says to check against "the tag name of the last start tag to
3278                 # have been emitted from this tokenizer", but this is only called from
3279                 # the various "raw" states, so it's hopefully ok to assume that
3280                 # open_els[0].name will work instead TODO: verify this after the script
3281                 # data states are implemented
3282                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3283                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3284
3285         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3286         tok_state_rcdata_end_tag_name = ->
3287                 c = txt.charAt(cur++)
3288                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3289                         if is_appropriate_end_tag tok_cur_tag
3290                                 tok_state = tok_state_before_attribute_name
3291                                 return
3292                         # else fall through to "Anything else"
3293                 if c is '/'
3294                         if is_appropriate_end_tag tok_cur_tag
3295                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3296                                 return
3297                         # else fall through to "Anything else"
3298                 if c is '>'
3299                         if is_appropriate_end_tag tok_cur_tag
3300                                 tok_state = tok_state_data
3301                                 return tok_cur_tag
3302                         # else fall through to "Anything else"
3303                 if is_uc_alpha(c)
3304                         tok_cur_tag.name += c.toLowerCase()
3305                         temporary_buffer += c
3306                         return null
3307                 if is_lc_alpha(c)
3308                         tok_cur_tag.name += c
3309                         temporary_buffer += c
3310                         return null
3311                 # Anything else
3312                 tok_state = tok_state_rcdata
3313                 cur -= 1 # reconsume the input character
3314                 return new_character_token '</' + temporary_buffer # fixfull separate these
3315
3316         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3317         tok_state_rawtext_less_than_sign = ->
3318                 c = txt.charAt(cur++)
3319                 if c is '/'
3320                         temporary_buffer = ''
3321                         tok_state = tok_state_rawtext_end_tag_open
3322                         return null
3323                 # Anything else
3324                 tok_state = tok_state_rawtext
3325                 cur -= 1 # reconsume the input character
3326                 return new_character_token '<'
3327
3328         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3329         tok_state_rawtext_end_tag_open = ->
3330                 c = txt.charAt(cur++)
3331                 if is_uc_alpha(c)
3332                         tok_cur_tag = new_end_tag c.toLowerCase()
3333                         temporary_buffer += c
3334                         tok_state = tok_state_rawtext_end_tag_name
3335                         return null
3336                 if is_lc_alpha(c)
3337                         tok_cur_tag = new_end_tag c
3338                         temporary_buffer += c
3339                         tok_state = tok_state_rawtext_end_tag_name
3340                         return null
3341                 # Anything else
3342                 tok_state = tok_state_rawtext
3343                 cur -= 1 # reconsume the input character
3344                 return new_character_token "</" # fixfull separate these
3345
3346         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3347         tok_state_rawtext_end_tag_name = ->
3348                 c = txt.charAt(cur++)
3349                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3350                         if is_appropriate_end_tag tok_cur_tag
3351                                 tok_state = tok_state_before_attribute_name
3352                                 return
3353                         # else fall through to "Anything else"
3354                 if c is '/'
3355                         if is_appropriate_end_tag tok_cur_tag
3356                                 tok_state = tok_state_self_closing_start_tag
3357                                 return
3358                         # else fall through to "Anything else"
3359                 if c is '>'
3360                         if is_appropriate_end_tag tok_cur_tag
3361                                 tok_state = tok_state_data
3362                                 return tok_cur_tag
3363                         # else fall through to "Anything else"
3364                 if is_uc_alpha(c)
3365                         tok_cur_tag.name += c.toLowerCase()
3366                         temporary_buffer += c
3367                         return null
3368                 if is_lc_alpha(c)
3369                         tok_cur_tag.name += c
3370                         temporary_buffer += c
3371                         return null
3372                 # Anything else
3373                 tok_state = tok_state_rawtext
3374                 cur -= 1 # reconsume the input character
3375                 return new_character_token '</' + temporary_buffer # fixfull separate these
3376
3377         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3378         tok_state_script_data_less_than_sign = ->
3379                 c = txt.charAt(cur++)
3380                 if c is '/'
3381                         temporary_buffer = ''
3382                         tok_state = tok_state_script_data_end_tag_open
3383                         return
3384                 if c is '!'
3385                         tok_state = tok_state_script_data_escape_start
3386                         return new_character_token '<!' # fixfull split
3387                 # Anything else
3388                 tok_state = tok_state_script_data
3389                 cur -= 1 # Reconsume
3390                 return new_character_token '<'
3391
3392         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3393         tok_state_script_data_end_tag_open = ->
3394                 c = txt.charAt(cur++)
3395                 if is_uc_alpha(c)
3396                         tok_cur_tag = new_end_tag c.toLowerCase()
3397                         temporary_buffer += c
3398                         tok_state = tok_state_script_data_end_tag_name
3399                         return
3400                 if is_lc_alpha(c)
3401                         tok_cur_tag = new_end_tag c
3402                         temporary_buffer += c
3403                         tok_state = tok_state_script_data_end_tag_name
3404                         return
3405                 # Anything else
3406                 tok_state = tok_state_script_data
3407                 cur -= 1 # Reconsume
3408                 return new_character_token '</'
3409
3410         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3411         tok_state_script_data_end_tag_name = ->
3412                 c = txt.charAt(cur++)
3413                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3414                         if is_appropriate_end_tag tok_cur_tag
3415                                 tok_state = tok_state_before_attribute_name
3416                                 return
3417                         # fall through
3418                 if c is '/'
3419                         if is_appropriate_end_tag tok_cur_tag
3420                                 tok_state = tok_state_self_closing_start_tag
3421                                 return
3422                         # fall through
3423                 if c is '>'
3424                         if is_appropriate_end_tag tok_cur_tag
3425                                 tok_state = tok_state_data
3426                                 return tok_cur_tag
3427                         # fall through
3428                 if is_uc_alpha(c)
3429                         tok_cur_tag.name += c.toLowerCase()
3430                         temporary_buffer += c
3431                         return
3432                 if is_lc_alpha(c)
3433                         tok_cur_tag.name += c
3434                         temporary_buffer += c
3435                         return
3436                 # Anything else
3437                 tok_state = tok_state_script_data
3438                 cur -= 1 # Reconsume
3439                 return new_character_token "</#{temporary_buffer}" # fixfull split
3440
3441         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3442         tok_state_script_data_escape_start = ->
3443                 c = txt.charAt(cur++)
3444                 if c is '-'
3445                         tok_state = tok_state_script_data_escape_start_dash
3446                         return new_character_token '-'
3447                 # Anything else
3448                 tok_state = tok_state_script_data
3449                 cur -= 1 # Reconsume
3450                 return
3451
3452         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3453         tok_state_script_data_escape_start_dash = ->
3454                 c = txt.charAt(cur++)
3455                 if c is '-'
3456                         tok_state = tok_state_script_data_escaped_dash_dash
3457                         return new_character_token '-'
3458                 # Anything else
3459                 tok_state = tok_state_script_data
3460                 cur -= 1 # Reconsume
3461                 return
3462
3463         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3464         tok_state_script_data_escaped = ->
3465                 c = txt.charAt(cur++)
3466                 if c is '-'
3467                         tok_state = tok_state_script_data_escaped_dash
3468                         return new_character_token '-'
3469                 if c is '<'
3470                         tok_state = tok_state_script_data_escaped_less_than_sign
3471                         return
3472                 if c is "\u0000"
3473                         parse_error()
3474                         return new_character_token "\ufffd"
3475                 if c is '' # EOF
3476                         tok_state = tok_state_data
3477                         parse_error()
3478                         cur -= 1 # Reconsume
3479                         return
3480                 # Anything else
3481                 return new_character_token c
3482
3483         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3484         tok_state_script_data_escaped_dash = ->
3485                 c = txt.charAt(cur++)
3486                 if c is '-'
3487                         tok_state = tok_state_script_data_escaped_dash_dash
3488                         return new_character_token '-'
3489                 if c is '<'
3490                         tok_state = tok_state_script_data_escaped_less_than_sign
3491                         return
3492                 if c is "\u0000"
3493                         parse_error()
3494                         tok_state = tok_state_script_data_escaped
3495                         return new_character_token "\ufffd"
3496                 if c is '' # EOF
3497                         tok_state = tok_state_data
3498                         parse_error()
3499                         cur -= 1 # Reconsume
3500                         return
3501                 # Anything else
3502                 tok_state = tok_state_script_data_escaped
3503                 return new_character_token c
3504
3505         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3506         tok_state_script_data_escaped_dash_dash = ->
3507                 c = txt.charAt(cur++)
3508                 if c is '-'
3509                         return new_character_token '-'
3510                 if c is '<'
3511                         tok_state = tok_state_script_data_escaped_less_than_sign
3512                         return
3513                 if c is '>'
3514                         tok_state = tok_state_script_data
3515                         return new_character_token '>'
3516                 if c is "\u0000"
3517                         parse_error()
3518                         tok_state = tok_state_script_data_escaped
3519                         return new_character_token "\ufffd"
3520                 if c is '' # EOF
3521                         parse_error()
3522                         tok_state = tok_state_data
3523                         cur -= 1 # Reconsume
3524                         return
3525                 # Anything else
3526                 tok_state = tok_state_script_data_escaped
3527                 return new_character_token c
3528
3529         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3530         tok_state_script_data_escaped_less_than_sign = ->
3531                 c = txt.charAt(cur++)
3532                 if c is '/'
3533                         temporary_buffer = ''
3534                         tok_state = tok_state_script_data_escaped_end_tag_open
3535                         return
3536                 if is_uc_alpha(c)
3537                         temporary_buffer = c.toLowerCase() # yes, really
3538                         tok_state = tok_state_script_data_double_escape_start
3539                         return new_character_token "<#{c}" # fixfull split
3540                 if is_lc_alpha(c)
3541                         temporary_buffer = c
3542                         tok_state = tok_state_script_data_double_escape_start
3543                         return new_character_token "<#{c}" # fixfull split
3544                 # Anything else
3545                 tok_state = tok_state_script_data_escaped
3546                 cur -= 1 # Reconsume
3547                 return new_character_token '<'
3548
3549         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3550         tok_state_script_data_escaped_end_tag_open = ->
3551                 c = txt.charAt(cur++)
3552                 if is_uc_alpha(c)
3553                         tok_cur_tag = new_end_tag c.toLowerCase()
3554                         temporary_buffer += c
3555                         tok_state = tok_state_script_data_escaped_end_tag_name
3556                         return
3557                 if is_lc_alpha(c)
3558                         tok_cur_tag = new_end_tag c
3559                         temporary_buffer += c
3560                         tok_state = tok_state_script_data_escaped_end_tag_name
3561                         return
3562                 # Anything else
3563                 tok_state = tok_state_script_data_escaped
3564                 cur -= 1 # Reconsume
3565                 return new_character_token '</' # fixfull split
3566
3567         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3568         tok_state_script_data_escaped_end_tag_name = ->
3569                 c = txt.charAt(cur++)
3570                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3571                         if is_appropriate_end_tag tok_cur_tag
3572                                 tok_state = tok_state_before_attribute_name
3573                                 return
3574                         # fall through
3575                 if c is '/'
3576                         if is_appropriate_end_tag tok_cur_tag
3577                                 tok_state = tok_state_self_closing_start_tag
3578                                 return
3579                         # fall through
3580                 if c is '>'
3581                         if is_appropriate_end_tag tok_cur_tag
3582                                 tok_state = tok_state_data
3583                                 return tok_cur_tag
3584                         # fall through
3585                 if is_uc_alpha(c)
3586                         tok_cur_tag.name += c.toLowerCase()
3587                         temporary_buffer += c.toLowerCase()
3588                         return
3589                 if is_lc_alpha(c)
3590                         tok_cur_tag.name += c
3591                         temporary_buffer += c.toLowerCase()
3592                         return
3593                 # Anything else
3594                 tok_state = tok_state_script_data_escaped
3595                 cur -= 1 # Reconsume
3596                 return new_character_token "</#{temporary_buffer}" # fixfull split
3597
3598         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3599         tok_state_script_data_double_escape_start = ->
3600                 c = txt.charAt(cur++)
3601                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3602                         if temporary_buffer is 'script'
3603                                 tok_state = tok_state_script_data_double_escaped
3604                         else
3605                                 tok_state = tok_state_script_data_escaped
3606                         return new_character_token c
3607                 if is_uc_alpha(c)
3608                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3609                         return new_character_token c
3610                 if is_lc_alpha(c)
3611                         temporary_buffer += c
3612                         return new_character_token c
3613                 # Anything else
3614                 tok_state = tok_state_script_data_escaped
3615                 cur -= 1 # Reconsume
3616                 return
3617
3618         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3619         tok_state_script_data_double_escaped = ->
3620                 c = txt.charAt(cur++)
3621                 if c is '-'
3622                         tok_state = tok_state_script_data_double_escaped_dash
3623                         return new_character_token '-'
3624                 if c is '<'
3625                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3626                         return new_character_token '<'
3627                 if c is "\u0000"
3628                         parse_error()
3629                         return new_character_token "\ufffd"
3630                 if c is '' # EOF
3631                         parse_error()
3632                         tok_state = tok_state_data
3633                         cur -= 1 # Reconsume
3634                         return
3635                 # Anything else
3636                 return new_character_token c
3637
3638         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3639         tok_state_script_data_double_escaped_dash = ->
3640                 c = txt.charAt(cur++)
3641                 if c is '-'
3642                         tok_state = tok_state_script_data_double_escaped_dash_dash
3643                         return new_character_token '-'
3644                 if c is '<'
3645                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3646                         return new_character_token '<'
3647                 if c is "\u0000"
3648                         parse_error()
3649                         tok_state = tok_state_script_data_double_escaped
3650                         return new_character_token "\ufffd"
3651                 if c is '' # EOF
3652                         parse_error()
3653                         tok_state = tok_state_data
3654                         cur -= 1 # Reconsume
3655                         return
3656                 # Anything else
3657                 tok_state = tok_state_script_data_double_escaped
3658                 return new_character_token c
3659
3660         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3661         tok_state_script_data_double_escaped_dash_dash = ->
3662                 c = txt.charAt(cur++)
3663                 if c is '-'
3664                         return new_character_token '-'
3665                 if c is '<'
3666                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3667                         return new_character_token '<'
3668                 if c is '>'
3669                         tok_state = tok_state_script_data
3670                         return new_character_token '>'
3671                 if c is "\u0000"
3672                         parse_error()
3673                         tok_state = tok_state_script_data_double_escaped
3674                         return new_character_token "\ufffd"
3675                 if c is '' # EOF
3676                         parse_error()
3677                         tok_state = tok_state_data
3678                         cur -= 1 # Reconsume
3679                         return
3680                 # Anything else
3681                 tok_state = tok_state_script_data_double_escaped
3682                 return new_character_token c
3683
3684         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3685         tok_state_script_data_double_escaped_less_than_sign = ->
3686                 c = txt.charAt(cur++)
3687                 if c is '/'
3688                         temporary_buffer = ''
3689                         tok_state = tok_state_script_data_double_escape_end
3690                         return new_character_token '/'
3691                 # Anything else
3692                 tok_state = tok_state_script_data_double_escaped
3693                 cur -= 1 # Reconsume
3694                 return
3695
3696         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3697         tok_state_script_data_double_escape_end = ->
3698                 c = txt.charAt(cur++)
3699                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3700                         if temporary_buffer is 'script'
3701                                 tok_state = tok_state_script_data_escaped
3702                         else
3703                                 tok_state = tok_state_script_data_double_escaped
3704                         return new_character_token c
3705                 if is_uc_alpha(c)
3706                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3707                         return new_character_token c
3708                 if is_lc_alpha(c)
3709                         temporary_buffer += c
3710                         return new_character_token c
3711                 # Anything else
3712                 tok_state = tok_state_script_data_double_escaped
3713                 cur -= 1 # Reconsume
3714                 return
3715
3716         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3717         tok_state_before_attribute_name = ->
3718                 attr_name = null
3719                 switch c = txt.charAt(cur++)
3720                         when "\t", "\n", "\u000c", ' '
3721                                 return null
3722                         when '/'
3723                                 tok_state = tok_state_self_closing_start_tag
3724                                 return null
3725                         when '>'
3726                                 tok_state = tok_state_data
3727                                 tmp = tok_cur_tag
3728                                 tok_cur_tag = null
3729                                 return tmp
3730                         when "\u0000"
3731                                 parse_error()
3732                                 attr_name = "\ufffd"
3733                         when '"', "'", '<', '='
3734                                 parse_error()
3735                                 attr_name = c
3736                         when '' # EOF
3737                                 parse_error()
3738                                 tok_state = tok_state_data
3739                         else
3740                                 if is_uc_alpha(c)
3741                                         attr_name = c.toLowerCase()
3742                                 else
3743                                         attr_name = c
3744                 if attr_name?
3745                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3746                         tok_state = tok_state_attribute_name
3747                 return null
3748
3749         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3750         tok_state_attribute_name = ->
3751                 switch c = txt.charAt(cur++)
3752                         when "\t", "\n", "\u000c", ' '
3753                                 tok_state = tok_state_after_attribute_name
3754                         when '/'
3755                                 tok_state = tok_state_self_closing_start_tag
3756                         when '='
3757                                 tok_state = tok_state_before_attribute_value
3758                         when '>'
3759                                 tok_state = tok_state_data
3760                                 tmp = tok_cur_tag
3761                                 tok_cur_tag = null
3762                                 return tmp
3763                         when "\u0000"
3764                                 parse_error()
3765                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3766                         when '"', "'", '<'
3767                                 parse_error()
3768                                 tok_cur_tag.attrs_a[0][0] += c
3769                         when '' # EOF
3770                                 parse_error()
3771                                 tok_state = tok_state_data
3772                         else
3773                                 if is_uc_alpha(c)
3774                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3775                                 else
3776                                         tok_cur_tag.attrs_a[0][0] += c
3777                 return null
3778
3779         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3780         tok_state_after_attribute_name = ->
3781                 c = txt.charAt(cur++)
3782                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3783                         return
3784                 if c is '/'
3785                         tok_state = tok_state_self_closing_start_tag
3786                         return
3787                 if c is '='
3788                         tok_state = tok_state_before_attribute_value
3789                         return
3790                 if c is '>'
3791                         tok_state = tok_state_data
3792                         return tok_cur_tag
3793                 if is_uc_alpha(c)
3794                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3795                         tok_state = tok_state_attribute_name
3796                         return
3797                 if c is "\u0000"
3798                         parse_error()
3799                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3800                         tok_state = tok_state_attribute_name
3801                         return
3802                 if c is '' # EOF
3803                         parse_error()
3804                         tok_state = tok_state_data
3805                         cur -= 1 # reconsume
3806                         return
3807                 if c is '"' or c is "'" or c is '<'
3808                         parse_error()
3809                         # fall through to Anything else
3810                 # Anything else
3811                 tok_cur_tag.attrs_a.unshift [c, '']
3812                 tok_state = tok_state_attribute_name
3813
3814         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3815         tok_state_before_attribute_value = ->
3816                 switch c = txt.charAt(cur++)
3817                         when "\t", "\n", "\u000c", ' '
3818                                 return null
3819                         when '"'
3820                                 tok_state = tok_state_attribute_value_double_quoted
3821                         when '&'
3822                                 tok_state = tok_state_attribute_value_unquoted
3823                                 cur -= 1
3824                         when "'"
3825                                 tok_state = tok_state_attribute_value_single_quoted
3826                         when "\u0000"
3827                                 # Parse error
3828                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3829                                 tok_state = tok_state_attribute_value_unquoted
3830                         when '>'
3831                                 # Parse error
3832                                 tok_state = tok_state_data
3833                                 tmp = tok_cur_tag
3834                                 tok_cur_tag = null
3835                                 return tmp
3836                         when '' # EOF
3837                                 parse_error()
3838                                 tok_state = tok_state_data
3839                         else
3840                                 tok_cur_tag.attrs_a[0][1] += c
3841                                 tok_state = tok_state_attribute_value_unquoted
3842                 return null
3843
3844         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3845         tok_state_attribute_value_double_quoted = ->
3846                 switch c = txt.charAt(cur++)
3847                         when '"'
3848                                 tok_state = tok_state_after_attribute_value_quoted
3849                         when '&'
3850                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3851                         when "\u0000"
3852                                 # Parse error
3853                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3854                         when '' # EOF
3855                                 parse_error()
3856                                 tok_state = tok_state_data
3857                         else
3858                                 tok_cur_tag.attrs_a[0][1] += c
3859                 return null
3860
3861         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3862         tok_state_attribute_value_single_quoted = ->
3863                 switch c = txt.charAt(cur++)
3864                         when "'"
3865                                 tok_state = tok_state_after_attribute_value_quoted
3866                         when '&'
3867                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3868                         when "\u0000"
3869                                 # Parse error
3870                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3871                         when '' # EOF
3872                                 parse_error()
3873                                 tok_state = tok_state_data
3874                         else
3875                                 tok_cur_tag.attrs_a[0][1] += c
3876                 return null
3877
3878         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3879         tok_state_attribute_value_unquoted = ->
3880                 switch c = txt.charAt(cur++)
3881                         when "\t", "\n", "\u000c", ' '
3882                                 tok_state = tok_state_before_attribute_name
3883                         when '&'
3884                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3885                         when '>'
3886                                 tok_state = tok_state_data
3887                                 tmp = tok_cur_tag
3888                                 tok_cur_tag = null
3889                                 return tmp
3890                         when "\u0000"
3891                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3892                         when '' # EOF
3893                                 parse_error()
3894                                 tok_state = tok_state_data
3895                         else
3896                                 # Parse Error if ', <, = or ` (backtick)
3897                                 tok_cur_tag.attrs_a[0][1] += c
3898                 return null
3899
3900         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3901         tok_state_after_attribute_value_quoted = ->
3902                 switch c = txt.charAt(cur++)
3903                         when "\t", "\n", "\u000c", ' '
3904                                 tok_state = tok_state_before_attribute_name
3905                         when '/'
3906                                 tok_state = tok_state_self_closing_start_tag
3907                         when '>'
3908                                 tok_state = tok_state_data
3909                                 tmp = tok_cur_tag
3910                                 tok_cur_tag = null
3911                                 return tmp
3912                         when '' # EOF
3913                                 parse_error()
3914                                 tok_state = tok_state_data
3915                         else
3916                                 # Parse Error
3917                                 tok_state = tok_state_before_attribute_name
3918                                 cur -= 1 # we didn't handle that char
3919                 return null
3920
3921         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3922         tok_state_self_closing_start_tag = ->
3923                 c = txt.charAt(cur++)
3924                 if c is '>'
3925                         tok_cur_tag.flag 'self-closing', true
3926                         tok_state = tok_state_data
3927                         return tok_cur_tag
3928                 if c is ''
3929                         parse_error()
3930                         tok_state = tok_state_data
3931                         cur -= 1 # Reconsume
3932                         return
3933                 # Anything else
3934                 parse_error()
3935                 tok_state = tok_state_before_attribute_name
3936                 cur -= 1 # Reconsume
3937                 return
3938
3939         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3940         # WARNING: put a comment token in tok_cur_tag before setting this state
3941         tok_state_bogus_comment = ->
3942                 next_gt = txt.indexOf '>', cur
3943                 if next_gt is -1
3944                         val = txt.substr cur
3945                         cur = txt.length
3946                 else
3947                         val = txt.substr cur, (next_gt - cur)
3948                         cur = next_gt + 1
3949                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3950                 tok_cur_tag.text += val
3951                 tok_state = tok_state_data
3952                 return tok_cur_tag
3953
3954         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3955         tok_state_markup_declaration_open = ->
3956                 if txt.substr(cur, 2) is '--'
3957                         cur += 2
3958                         tok_cur_tag = new_comment_token ''
3959                         tok_state = tok_state_comment_start
3960                         return
3961                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3962                         cur += 7
3963                         tok_state = tok_state_doctype
3964                         return
3965                 acn = adjusted_current_node()
3966                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3967                         cur += 7
3968                         tok_state = tok_state_cdata_section
3969                         return
3970                 # Otherwise
3971                 parse_error()
3972                 tok_cur_tag = new_comment_token ''
3973                 tok_state = tok_state_bogus_comment
3974                 return
3975
3976         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3977         tok_state_comment_start = ->
3978                 switch c = txt.charAt(cur++)
3979                         when '-'
3980                                 tok_state = tok_state_comment_start_dash
3981                         when "\u0000"
3982                                 parse_error()
3983                                 tok_state = tok_state_comment
3984                                 return new_character_token "\ufffd"
3985                         when '>'
3986                                 parse_error()
3987                                 tok_state = tok_state_data
3988                                 return tok_cur_tag
3989                         when '' # EOF
3990                                 parse_error()
3991                                 tok_state = tok_state_data
3992                                 cur -= 1 # Reconsume
3993                                 return tok_cur_tag
3994                         else
3995                                 tok_cur_tag.text += c
3996                                 tok_state = tok_state_comment
3997                 return null
3998
3999         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
4000         tok_state_comment_start_dash = ->
4001                 switch c = txt.charAt(cur++)
4002                         when '-'
4003                                 tok_state = tok_state_comment_end
4004                         when "\u0000"
4005                                 parse_error()
4006                                 tok_cur_tag.text += "-\ufffd"
4007                                 tok_state = tok_state_comment
4008                         when '>'
4009                                 parse_error()
4010                                 tok_state = tok_state_data
4011                                 return tok_cur_tag
4012                         when '' # EOF
4013                                 parse_error()
4014                                 tok_state = tok_state_data
4015                                 cur -= 1 # Reconsume
4016                                 return tok_cur_tag
4017                         else
4018                                 tok_cur_tag.text += "-#{c}"
4019                                 tok_state = tok_state_comment
4020                 return null
4021
4022         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
4023         tok_state_comment = ->
4024                 switch c = txt.charAt(cur++)
4025                         when '-'
4026                                 tok_state = tok_state_comment_end_dash
4027                         when "\u0000"
4028                                 parse_error()
4029                                 tok_cur_tag.text += "\ufffd"
4030                         when '' # EOF
4031                                 parse_error()
4032                                 tok_state = tok_state_data
4033                                 cur -= 1 # Reconsume
4034                                 return tok_cur_tag
4035                         else
4036                                 tok_cur_tag.text += c
4037                 return null
4038
4039         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4040         tok_state_comment_end_dash = ->
4041                 switch c = txt.charAt(cur++)
4042                         when '-'
4043                                 tok_state = tok_state_comment_end
4044                         when "\u0000"
4045                                 parse_error()
4046                                 tok_cur_tag.text += "-\ufffd"
4047                                 tok_state = tok_state_comment
4048                         when '' # EOF
4049                                 parse_error()
4050                                 tok_state = tok_state_data
4051                                 cur -= 1 # Reconsume
4052                                 return tok_cur_tag
4053                         else
4054                                 tok_cur_tag.text += "-#{c}"
4055                                 tok_state = tok_state_comment
4056                 return null
4057
4058         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4059         tok_state_comment_end = ->
4060                 switch c = txt.charAt(cur++)
4061                         when '>'
4062                                 tok_state = tok_state_data
4063                                 return tok_cur_tag
4064                         when "\u0000"
4065                                 parse_error()
4066                                 tok_cur_tag.text += "--\ufffd"
4067                                 tok_state = tok_state_comment
4068                         when '!'
4069                                 parse_error()
4070                                 tok_state = tok_state_comment_end_bang
4071                         when '-'
4072                                 parse_error()
4073                                 tok_cur_tag.text += '-'
4074                         when '' # EOF
4075                                 parse_error()
4076                                 tok_state = tok_state_data
4077                                 cur -= 1 # Reconsume
4078                                 return tok_cur_tag
4079                         else
4080                                 parse_error()
4081                                 tok_cur_tag.text += "--#{c}"
4082                                 tok_state = tok_state_comment
4083                 return null
4084
4085         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4086         tok_state_comment_end_bang = ->
4087                 switch c = txt.charAt(cur++)
4088                         when '-'
4089                                 tok_cur_tag.text += "--!#{c}"
4090                                 tok_state = tok_state_comment_end_dash
4091                         when '>'
4092                                 tok_state = tok_state_data
4093                                 return tok_cur_tag
4094                         when "\u0000"
4095                                 parse_error()
4096                                 tok_cur_tag.text += "--!\ufffd"
4097                                 tok_state = tok_state_comment
4098                         when '' # EOF
4099                                 parse_error()
4100                                 tok_state = tok_state_data
4101                                 cur -= 1 # Reconsume
4102                                 return tok_cur_tag
4103                         else
4104                                 tok_cur_tag.text += "--!#{c}"
4105                                 tok_state = tok_state_comment
4106                 return null
4107
4108         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4109         tok_state_doctype = ->
4110                 switch c = txt.charAt(cur++)
4111                         when "\t", "\u000a", "\u000c", ' '
4112                                 tok_state = tok_state_before_doctype_name
4113                         when '' # EOF
4114                                 parse_error()
4115                                 tok_state = tok_state_data
4116                                 el = new_doctype_token ''
4117                                 el.flag 'force-quirks', true
4118                                 cur -= 1 # Reconsume
4119                                 return el
4120                         else
4121                                 parse_error()
4122                                 tok_state = tok_state_before_doctype_name
4123                                 cur -= 1 # Reconsume
4124                 return null
4125
4126         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4127         tok_state_before_doctype_name = ->
4128                 c = txt.charAt(cur++)
4129                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4130                         return
4131                 if is_uc_alpha(c)
4132                         tok_cur_tag = new_doctype_token c.toLowerCase()
4133                         tok_state = tok_state_doctype_name
4134                         return
4135                 if c is "\u0000"
4136                         parse_error()
4137                         tok_cur_tag = new_doctype_token "\ufffd"
4138                         tok_state = tok_state_doctype_name
4139                         return
4140                 if c is '>'
4141                         parse_error()
4142                         el = new_doctype_token ''
4143                         el.flag 'force-quirks', true
4144                         tok_state = tok_state_data
4145                         return el
4146                 if c is '' # EOF
4147                         parse_error()
4148                         tok_state = tok_state_data
4149                         el = new_doctype_token ''
4150                         el.flag 'force-quirks', true
4151                         cur -= 1 # Reconsume
4152                         return el
4153                 # Anything else
4154                 tok_cur_tag = new_doctype_token c
4155                 tok_state = tok_state_doctype_name
4156                 return null
4157
4158         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4159         tok_state_doctype_name = ->
4160                 c = txt.charAt(cur++)
4161                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4162                         tok_state = tok_state_after_doctype_name
4163                         return
4164                 if c is '>'
4165                         tok_state = tok_state_data
4166                         return tok_cur_tag
4167                 if is_uc_alpha(c)
4168                         tok_cur_tag.name += c.toLowerCase()
4169                         return
4170                 if c is "\u0000"
4171                         parse_error()
4172                         tok_cur_tag.name += "\ufffd"
4173                         return
4174                 if c is '' # EOF
4175                         parse_error()
4176                         tok_state = tok_state_data
4177                         tok_cur_tag.flag 'force-quirks', true
4178                         cur -= 1 # Reconsume
4179                         return tok_cur_tag
4180                 # Anything else
4181                 tok_cur_tag.name += c
4182                 return null
4183
4184         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4185         tok_state_after_doctype_name = ->
4186                 c = txt.charAt(cur++)
4187                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4188                         return
4189                 if c is '>'
4190                         tok_state = tok_state_data
4191                         return tok_cur_tag
4192                 if c is '' # EOF
4193                         parse_error()
4194                         tok_state = tok_state_data
4195                         tok_cur_tag.flag 'force-quirks', true
4196                         cur -= 1 # Reconsume
4197                         return tok_cur_tag
4198                 # Anything else
4199                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4200                         cur += 5
4201                         tok_state = tok_state_after_doctype_public_keyword
4202                         return
4203                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4204                         cur += 5
4205                         tok_state = tok_state_after_doctype_system_keyword
4206                         return
4207                 parse_error()
4208                 tok_cur_tag.flag 'force-quirks', true
4209                 tok_state = tok_state_bogus_doctype
4210                 return null
4211
4212         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4213         tok_state_after_doctype_public_keyword = ->
4214                 c = txt.charAt(cur++)
4215                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4216                         tok_state = tok_state_before_doctype_public_identifier
4217                         return
4218                 if c is '"'
4219                         parse_error()
4220                         tok_cur_tag.public_identifier = ''
4221                         tok_state = tok_state_doctype_public_identifier_double_quoted
4222                         return
4223                 if c is "'"
4224                         parse_error()
4225                         tok_cur_tag.public_identifier = ''
4226                         tok_state = tok_state_doctype_public_identifier_single_quoted
4227                         return
4228                 if c is '>'
4229                         parse_error()
4230                         tok_cur_tag.flag 'force-quirks', true
4231                         tok_state = tok_state_data
4232                         return tok_cur_tag
4233                 if c is '' # EOF
4234                         parse_error()
4235                         tok_state = tok_state_data
4236                         tok_cur_tag.flag 'force-quirks', true
4237                         cur -= 1 # Reconsume
4238                         return tok_cur_tag
4239                 # Anything else
4240                 parse_error()
4241                 tok_cur_tag.flag 'force-quirks', true
4242                 tok_state = tok_state_bogus_doctype
4243                 return null
4244
4245         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4246         tok_state_before_doctype_public_identifier = ->
4247                 c = txt.charAt(cur++)
4248                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4249                         return
4250                 if c is '"'
4251                         parse_error()
4252                         tok_cur_tag.public_identifier = ''
4253                         tok_state = tok_state_doctype_public_identifier_double_quoted
4254                         return
4255                 if c is "'"
4256                         parse_error()
4257                         tok_cur_tag.public_identifier = ''
4258                         tok_state = tok_state_doctype_public_identifier_single_quoted
4259                         return
4260                 if c is '>'
4261                         parse_error()
4262                         tok_cur_tag.flag 'force-quirks', true
4263                         tok_state = tok_state_data
4264                         return tok_cur_tag
4265                 if c is '' # EOF
4266                         parse_error()
4267                         tok_state = tok_state_data
4268                         tok_cur_tag.flag 'force-quirks', true
4269                         cur -= 1 # Reconsume
4270                         return tok_cur_tag
4271                 # Anything else
4272                 parse_error()
4273                 tok_cur_tag.flag 'force-quirks', true
4274                 tok_state = tok_state_bogus_doctype
4275                 return null
4276
4277
4278         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4279         tok_state_doctype_public_identifier_double_quoted = ->
4280                 c = txt.charAt(cur++)
4281                 if c is '"'
4282                         tok_state = tok_state_after_doctype_public_identifier
4283                         return
4284                 if c is "\u0000"
4285                         parse_error()
4286                         tok_cur_tag.public_identifier += "\ufffd"
4287                         return
4288                 if c is '>'
4289                         parse_error()
4290                         tok_cur_tag.flag 'force-quirks', true
4291                         tok_state = tok_state_data
4292                         return tok_cur_tag
4293                 if c is '' # EOF
4294                         parse_error()
4295                         tok_state = tok_state_data
4296                         tok_cur_tag.flag 'force-quirks', true
4297                         cur -= 1 # Reconsume
4298                         return tok_cur_tag
4299                 # Anything else
4300                 tok_cur_tag.public_identifier += c
4301                 return null
4302
4303         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4304         tok_state_doctype_public_identifier_single_quoted = ->
4305                 c = txt.charAt(cur++)
4306                 if c is "'"
4307                         tok_state = tok_state_after_doctype_public_identifier
4308                         return
4309                 if c is "\u0000"
4310                         parse_error()
4311                         tok_cur_tag.public_identifier += "\ufffd"
4312                         return
4313                 if c is '>'
4314                         parse_error()
4315                         tok_cur_tag.flag 'force-quirks', true
4316                         tok_state = tok_state_data
4317                         return tok_cur_tag
4318                 if c is '' # EOF
4319                         parse_error()
4320                         tok_state = tok_state_data
4321                         tok_cur_tag.flag 'force-quirks', true
4322                         cur -= 1 # Reconsume
4323                         return tok_cur_tag
4324                 # Anything else
4325                 tok_cur_tag.public_identifier += c
4326                 return null
4327
4328         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4329         tok_state_after_doctype_public_identifier = ->
4330                 c = txt.charAt(cur++)
4331                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4332                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4333                         return
4334                 if c is '>'
4335                         tok_state = tok_state_data
4336                         return tok_cur_tag
4337                 if c is '"'
4338                         parse_error()
4339                         tok_cur_tag.system_identifier = ''
4340                         tok_state = tok_state_doctype_system_identifier_double_quoted
4341                         return
4342                 if c is "'"
4343                         parse_error()
4344                         tok_cur_tag.system_identifier = ''
4345                         tok_state = tok_state_doctype_system_identifier_single_quoted
4346                         return
4347                 if c is '' # EOF
4348                         parse_error()
4349                         tok_state = tok_state_data
4350                         tok_cur_tag.flag 'force-quirks', true
4351                         cur -= 1 # Reconsume
4352                         return tok_cur_tag
4353                 # Anything else
4354                 parse_error()
4355                 tok_cur_tag.flag 'force-quirks', true
4356                 tok_state = tok_state_bogus_doctype
4357                 return null
4358
4359         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4360         tok_state_between_doctype_public_and_system_identifiers = ->
4361                 c = txt.charAt(cur++)
4362                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4363                         return
4364                 if c is '>'
4365                         tok_state = tok_state_data
4366                         return tok_cur_tag
4367                 if c is '"'
4368                         parse_error()
4369                         tok_cur_tag.system_identifier = ''
4370                         tok_state = tok_state_doctype_system_identifier_double_quoted
4371                         return
4372                 if c is "'"
4373                         parse_error()
4374                         tok_cur_tag.system_identifier = ''
4375                         tok_state = tok_state_doctype_system_identifier_single_quoted
4376                         return
4377                 if c is '' # EOF
4378                         parse_error()
4379                         tok_state = tok_state_data
4380                         tok_cur_tag.flag 'force-quirks', true
4381                         cur -= 1 # Reconsume
4382                         return tok_cur_tag
4383                 # Anything else
4384                 parse_error()
4385                 tok_cur_tag.flag 'force-quirks', true
4386                 tok_state = tok_state_bogus_doctype
4387                 return null
4388
4389         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4390         tok_state_after_doctype_system_keyword = ->
4391                 c = txt.charAt(cur++)
4392                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4393                         tok_state = tok_state_before_doctype_system_identifier
4394                         return
4395                 if c is '"'
4396                         parse_error()
4397                         tok_cur_tag.system_identifier = ''
4398                         tok_state = tok_state_doctype_system_identifier_double_quoted
4399                         return
4400                 if c is "'"
4401                         parse_error()
4402                         tok_cur_tag.system_identifier = ''
4403                         tok_state = tok_state_doctype_system_identifier_single_quoted
4404                         return
4405                 if c is '>'
4406                         parse_error()
4407                         tok_cur_tag.flag 'force-quirks', true
4408                         tok_state = tok_state_data
4409                         return tok_cur_tag
4410                 if c is '' # EOF
4411                         parse_error()
4412                         tok_state = tok_state_data
4413                         tok_cur_tag.flag 'force-quirks', true
4414                         cur -= 1 # Reconsume
4415                         return tok_cur_tag
4416                 # Anything else
4417                 parse_error()
4418                 tok_cur_tag.flag 'force-quirks', true
4419                 tok_state = tok_state_bogus_doctype
4420                 return null
4421
4422         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4423         tok_state_before_doctype_system_identifier = ->
4424                 c = txt.charAt(cur++)
4425                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4426                         return
4427                 if c is '"'
4428                         tok_cur_tag.system_identifier = ''
4429                         tok_state = tok_state_doctype_system_identifier_double_quoted
4430                         return
4431                 if c is "'"
4432                         tok_cur_tag.system_identifier = ''
4433                         tok_state = tok_state_doctype_system_identifier_single_quoted
4434                         return
4435                 if c is '>'
4436                         parse_error()
4437                         tok_cur_tag.flag 'force-quirks', true
4438                         tok_state = tok_state_data
4439                         return tok_cur_tag
4440                 if c is '' # EOF
4441                         parse_error()
4442                         tok_state = tok_state_data
4443                         tok_cur_tag.flag 'force-quirks', true
4444                         cur -= 1 # Reconsume
4445                         return tok_cur_tag
4446                 # Anything else
4447                 parse_error()
4448                 tok_cur_tag.flag 'force-quirks', true
4449                 tok_state = tok_state_bogus_doctype
4450                 return null
4451
4452         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4453         tok_state_doctype_system_identifier_double_quoted = ->
4454                 c = txt.charAt(cur++)
4455                 if c is '"'
4456                         tok_state = tok_state_after_doctype_system_identifier
4457                         return
4458                 if c is "\u0000"
4459                         parse_error()
4460                         tok_cur_tag.system_identifier += "\ufffd"
4461                         return
4462                 if c is '>'
4463                         parse_error()
4464                         tok_cur_tag.flag 'force-quirks', true
4465                         tok_state = tok_state_data
4466                         return tok_cur_tag
4467                 if c is '' # EOF
4468                         parse_error()
4469                         tok_state = tok_state_data
4470                         tok_cur_tag.flag 'force-quirks', true
4471                         cur -= 1 # Reconsume
4472                         return tok_cur_tag
4473                 # Anything else
4474                 tok_cur_tag.system_identifier += c
4475                 return null
4476
4477         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4478         tok_state_doctype_system_identifier_single_quoted = ->
4479                 c = txt.charAt(cur++)
4480                 if c is "'"
4481                         tok_state = tok_state_after_doctype_system_identifier
4482                         return
4483                 if c is "\u0000"
4484                         parse_error()
4485                         tok_cur_tag.system_identifier += "\ufffd"
4486                         return
4487                 if c is '>'
4488                         parse_error()
4489                         tok_cur_tag.flag 'force-quirks', true
4490                         tok_state = tok_state_data
4491                         return tok_cur_tag
4492                 if c is '' # EOF
4493                         parse_error()
4494                         tok_state = tok_state_data
4495                         tok_cur_tag.flag 'force-quirks', true
4496                         cur -= 1 # Reconsume
4497                         return tok_cur_tag
4498                 # Anything else
4499                 tok_cur_tag.system_identifier += c
4500                 return null
4501
4502         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4503         tok_state_after_doctype_system_identifier = ->
4504                 c = txt.charAt(cur++)
4505                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4506                         return
4507                 if c is '>'
4508                         tok_state = tok_state_data
4509                         return tok_cur_tag
4510                 if c is '' # EOF
4511                         parse_error()
4512                         tok_state = tok_state_data
4513                         tok_cur_tag.flag 'force-quirks', true
4514                         cur -= 1 # Reconsume
4515                         return tok_cur_tag
4516                 # Anything else
4517                 parse_error()
4518                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4519                 tok_state = tok_state_bogus_doctype
4520                 return null
4521
4522         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4523         tok_state_bogus_doctype = ->
4524                 c = txt.charAt(cur++)
4525                 if c is '>'
4526                         tok_state = tok_state_data
4527                         return tok_cur_tag
4528                 if c is '' # EOF
4529                         tok_state = tok_state_data
4530                         cur -= 1 # Reconsume
4531                         return tok_cur_tag
4532                 # Anything else
4533                 return null
4534
4535         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4536         tok_state_cdata_section = ->
4537                 tok_state = tok_state_data
4538                 next_gt = txt.indexOf ']]>', cur
4539                 if next_gt is -1
4540                         val = txt.substr cur
4541                         cur = txt.length
4542                 else
4543                         val = txt.substr cur, (next_gt - cur)
4544                         cur = next_gt + 3
4545                 if val.length > 0
4546                         return new_character_token val # fixfull split
4547                 return null
4548
4549         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4550         # Don't set this as a state, just call it
4551         # returns a string (NOT a text node)
4552         parse_character_reference = (allowed_char = null, in_attr = false) ->
4553                 if cur >= txt.length
4554                         return '&'
4555                 switch c = txt.charAt(cur)
4556                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4557                                 # explicitly not a parse error
4558                                 return '&'
4559                         when ';'
4560                                 # there has to be "one or more" alnums between & and ; to be a parse error
4561                                 return '&'
4562                         when '#'
4563                                 if cur + 1 >= txt.length
4564                                         return '&'
4565                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4566                                         base = 16
4567                                         charset = hex_chars
4568                                         start = cur + 2
4569                                 else
4570                                         charset = digits
4571                                         start = cur + 1
4572                                         base = 10
4573                                 i = 0
4574                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4575                                         i += 1
4576                                 if i is 0
4577                                         return '&'
4578                                 cur = start + i
4579                                 if txt.charAt(start + i) is ';'
4580                                         cur += 1
4581                                 else
4582                                         parse_error()
4583                                 code_point = txt.substr(start, i)
4584                                 while code_point.charAt(0) is '0' and code_point.length > 1
4585                                         code_point = code_point.substr 1
4586                                 code_point = parseInt(code_point, base)
4587                                 if unicode_fixes[code_point]?
4588                                         parse_error()
4589                                         return unicode_fixes[code_point]
4590                                 else
4591                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4592                                                 parse_error()
4593                                                 return "\ufffd"
4594                                         else
4595                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4596                                                         parse_error()
4597                                                 return from_code_point code_point
4598                                 return
4599                         else
4600                                 for i in [0...31]
4601                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4602                                                 break
4603                                 if i is 0
4604                                         # exit early, because parse_error() below needs at least one alnum
4605                                         return '&'
4606                                 if txt.charAt(cur + i) is ';'
4607                                         i += 1 # include ';' terminator in value
4608                                         decoded = decode_named_char_ref txt.substr(cur, i)
4609                                         if decoded?
4610                                                 cur += i
4611                                                 return decoded
4612                                         parse_error()
4613                                         return '&'
4614                                 else
4615                                         # no ';' terminator (only legacy char refs)
4616                                         max = i
4617                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4618                                                 c = legacy_char_refs[txt.substr(cur, i)]
4619                                                 if c?
4620                                                         if in_attr
4621                                                                 if txt.charAt(cur + i) is '='
4622                                                                         # "because some legacy user agents will
4623                                                                         # misinterpret the markup in those cases"
4624                                                                         parse_error()
4625                                                                         return '&'
4626                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4627                                                                         # this makes attributes forgiving about url args
4628                                                                         return '&'
4629                                                         # ok, and besides the weird exceptions for attributes...
4630                                                         # return the matching char
4631                                                         cur += i # consume entity chars
4632                                                         parse_error() # because no terminating ";"
4633                                                         return c
4634                                         parse_error()
4635                                         return '&'
4636                 return # never reached
4637
4638         eat_next_token_if_newline = ->
4639                 old_cur = cur
4640                 t = null
4641                 until t?
4642                         t = tok_state()
4643                 if t.type is TYPE_TEXT
4644                         # definition of a newline depends on whether it was a character ref or not
4645                         if cur - old_cur is 1
4646                                 # not a character reference
4647                                 if t.text is "\u000d" or t.text is "\u000a"
4648                                         return
4649                         else
4650                                 if t.text is "\u000a"
4651                                         return
4652                 # not a "newline"
4653                 cur = old_cur
4654                 return
4655
4656         # tree constructor initialization
4657         # see comments on TYPE_TAG/etc for the structure of this data
4658         txt = args.html
4659         cur = 0
4660         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4661         doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4662         open_els = []
4663         afe = [] # active formatting elements
4664         template_ins_modes = []
4665         ins_mode = ins_mode_initial
4666         original_ins_mode = ins_mode # TODO check spec
4667         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4668         flag_frameset_ok = true
4669         flag_parsing = true
4670         flag_foster_parenting = false
4671         form_element_pointer = null
4672         temporary_buffer = null
4673         pending_table_character_tokens = []
4674         head_element_pointer = null
4675         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4676         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4677         prev_node_id = 0 # just for debugging
4678
4679         # tokenizer initialization
4680         tok_state = tok_state_data
4681
4682         # text pre-processing
4683         # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4684         txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4685         txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4686         txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4687
4688         if args.name is "webkit01.dat #12"
4689                 console.log "hi"
4690         # proccess input
4691         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4692         parse_main_loop = ->
4693                 while flag_parsing
4694                         t = tok_state()
4695                         if t?
4696                                 process_token t
4697                                 # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4698         parse_main_loop()
4699         return doc.children
4700
4701 serialize_els = (els, shallow, show_ids) ->
4702         serialized = ''
4703         sep = ''
4704         for t in els
4705                 serialized += sep
4706                 sep = ','
4707                 serialized += t.serialize shallow, show_ids
4708         return serialized
4709
4710 module.exports.parse_html = parse_html
4711 module.exports.debug_log_reset = debug_log_reset
4712 module.exports.debug_log_each = debug_log_each
4713 module.exports.TYPE_TAG = TYPE_TAG
4714 module.exports.TYPE_TEXT = TYPE_TEXT
4715 module.exports.TYPE_COMMENT = TYPE_COMMENT
4716 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4717 module.exports.NS_HTML = NS_HTML
4718 module.exports.NS_MATHML = NS_MATHML
4719 module.exports.NS_SVG = NS_SVG
4720 module.exports.QUIRKS_NO = QUIRKS_NO
4721 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4722 module.exports.QUIRKS_YES = QUIRKS_YES