JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix minor parsing bugs
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor.
20
21 # The implementation is a pretty direct implementation of the parsing algorithm
22 # described here:
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
24 #
25 # Deviations from that spec:
26 #
27 #   Purposeful: search this file for "WTAG"
28 #
29 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
30
31
32 # stacks/lists
33 #
34 # the spec uses a many different words do indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section splains. I'm implementing "lists" (afe and open_els) the same way
37 # (both as stacks)
38 #
39 # stacks grow downward (current element is index=0)
40 #
41 # example: open_els = [a, b, c, d, e, f, g]
42 #
43 # "grows downwards" means it's visualized like this: (index: el, names)
44 #
45 #   6: g "start of the list", "topmost", "first"
46 #   5: f
47 #   4: e "previous" (to d), "above", "before"
48 #   3: d   (previous/next are relative to this element)
49 #   2: c "next", "after", "lower", "below"
50 #   1: b
51 #   0: a "end of the list", "current node", "bottommost", "last"
52
53
54 # browser
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
58         window.wheic = {}
59         module = exports: window.wheic
60
61 from_code_point = (x) ->
62         if String.fromCodePoint?
63                 return String.fromCodePoint x
64         else
65                 if x <= 0xffff
66                         return String.fromCharCode x
67                 x -= 0x10000
68                 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
69
70 # Each node is an obect of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
73 TYPE_COMMENT = 2
74 TYPE_DOCTYPE = 3
75 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
78 TYPE_EOF = 6
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
81
82 # namespace constants
83 NS_HTML = 1
84 NS_MATHML = 2
85 NS_SVG = 3
86
87 g_debug_log = []
88 debug_log_reset = ->
89         g_debug_log = []
90 debug_log = (str) ->
91         g_debug_log.push str
92 debug_log_each = (cb) ->
93         for str in g_debug_log
94                 cb str
95
96 prev_node_id = 0
97 class Node
98         constructor: (type, args = {}) ->
99                 @type = type # one of the TYPE_* constants above
100                 @name = args.name ? '' # tag name
101                 @text = args.text ? '' # contents for text/comment nodes
102                 @attrs = args.attrs ? {}
103                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104                 @children = args.children ? []
105                 @namespace = args.namespace ? NS_HTML
106                 @parent = args.parent ? null
107                 @token = args.token ? null
108                 @flags = args.flags ? {}
109                 if args.id?
110                         @id = "#{args.id}+"
111                 else
112                         @id = "#{++prev_node_id}"
113         acknowledge_self_closing: ->
114                 if @token?
115                         @token.flag 'did_self_close', true
116                 else
117                         @flag 'did_self_close', true
118         flag: (key, value = null) ->
119                 if value?
120                         @flags[key] = value
121                 else
122                         return @flags[key]
123         serialize: (shallow = false, show_ids = false) -> # for unit tests
124                 ret = ''
125                 switch @type
126                         when TYPE_TAG
127                                 ret += 'tag:'
128                                 ret += JSON.stringify @name
129                                 ret += ','
130                                 if show_ids
131                                         ret += "##{@id},"
132                                 if shallow
133                                         break
134                                 attr_keys = []
135                                 for k of @attrs
136                                         attr_keys.push k
137                                 attr_keys.sort()
138                                 ret += '{'
139                                 sep = ''
140                                 for k in attr_keys
141                                         ret += sep
142                                         sep = ','
143                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
144                                 ret += '},['
145                                 sep = ''
146                                 for c in @children
147                                         ret += sep
148                                         sep = ','
149                                         ret += c.serialize shallow, show_ids
150                                 ret += ']'
151                         when TYPE_TEXT
152                                 ret += 'text:'
153                                 ret += JSON.stringify @text
154                         when TYPE_COMMENT
155                                 ret += 'comment:'
156                                 ret += JSON.stringify @text
157                         when TYPE_DOCTYPE
158                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
159                         when TYPE_AFE_MARKER
160                                 ret += 'marker'
161                         when TYPE_AAA_BOOKMARK
162                                 ret += 'aaa_bookmark'
163                         else
164                                 ret += 'unknown:'
165                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
166                 return ret
167
168 # helpers: (only take args that are normally known when parser creates nodes)
169 new_open_tag = (name) ->
170         return new Node TYPE_START_TAG, name: name
171 new_end_tag = (name) ->
172         return new Node TYPE_END_TAG, name: name
173 new_element = (name) ->
174         return new Node TYPE_TAG, name: name
175 new_text_node = (txt) ->
176         return new Node TYPE_TEXT, text: txt
177 new_character_token = new_text_node
178 new_comment_token = (txt) ->
179         return new Node TYPE_COMMENT, text: txt
180 new_doctype_token = (name) ->
181         return new Node TYPE_DOCTYPE, name: name
182 new_eof_token = ->
183         return new Node TYPE_EOF
184 new_afe_marker = ->
185         return new Node TYPE_AFE_MARKER
186 new_aaa_bookmark = ->
187         return new Node TYPE_AAA_BOOKMARK
188
189 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
190 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
191 digits = "0123456789"
192 alnum = lc_alpha + uc_alpha + digits
193 hex_chars = digits + "abcdefABCDEF"
194
195 is_uc_alpha = (str) ->
196         return str.length is 1 and uc_alpha.indexOf(str) > -1
197 is_lc_alpha = (str) ->
198         return str.length is 1 and lc_alpha.indexOf(str) > -1
199
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
202
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
205 is_space = (txt) ->
206         return txt.length is 1 and space_chars.indexOf(txt) > -1
207 is_space_tok = (t) ->
208         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
209
210 is_input_hidden_tok = (t) ->
211         return false unless t.type is TYPE_START_TAG
212         for a in t.attrs_a
213                 if a[0] is 'type'
214                         if a[1].toLowerCase() is 'hidden'
215                                 return true
216                         return false
217         return false
218
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
221
222 unicode_fixes = {}
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
251
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
254 legacy_char_refs = {
255         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
272         yen: '¥', yuml: 'ÿ'
273 }
274
275 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
276 raw_text_elements = ['script', 'style']
277 escapable_raw_text_elements = ['textarea', 'title']
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
279 svg_elements = [
280         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
294         'view', 'vkern'
295 ]
296
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
298 mathml_elements = [
299         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305         'determinant', 'diff', 'divergence', 'divide', 'domain',
306         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326         'vectorproduct', 'xor'
327 ]
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
330
331 special_elements = {
332         # HTML:
333         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
344
345         menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
346
347         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349         plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350         select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351         table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352         textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353         tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
354
355         # MathML:
356         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357         'annotation-xml':NS_MATHML,
358
359         # SVG:
360         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
361 }
362
363 formatting_elements = {
364          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
366          u: true
367 }
368
369 mathml_text_integration = {
370         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
371 }
372 is_mathml_text_integration_point = (el) ->
373         return mathml_text_integration[el.name] is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375         if el.namespace is NS_MATHML
376                 if el.name is 'annotation-xml'
377                         if el.attrs.encoding?
378                                 if el.attrs.encoding.toLowerCase() is 'text/html'
379                                         return true
380                                 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
381                                         return true
382                 return false
383         if el.namespace is NS_SVG
384                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
385                         return true
386         return false
387
388 h_tags = {
389         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
390 }
391
392 foster_parenting_targets = {
393         table: NS_HTML
394         tbody: NS_HTML
395         tfoot: NS_HTML
396         thead: NS_HTML
397         tr: NS_HTML
398 }
399
400 end_tag_implied = {
401         dd: NS_HTML
402         dt: NS_HTML
403         li: NS_HTML
404         option: NS_HTML
405         optgroup: NS_HTML
406         p: NS_HTML
407         rb: NS_HTML
408         rp: NS_HTML
409         rt: NS_HTML
410         rtc: NS_HTML
411 }
412
413 el_is_special = (e) ->
414         return special_elements[e.name] is e.namespace
415
416 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
417 el_is_special_not_adp = (el) ->
418         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
419
420 svg_name_fixes = {
421         altglyph: 'altGlyph'
422         altglyphdef: 'altGlyphDef'
423         altglyphitem: 'altGlyphItem'
424         animatecolor: 'animateColor'
425         animatemotion: 'animateMotion'
426         animatetransform: 'animateTransform'
427         clippath: 'clipPath'
428         feblend: 'feBlend'
429         fecolormatrix: 'feColorMatrix'
430         fecomponenttransfer: 'feComponentTransfer'
431         fecomposite: 'feComposite'
432         feconvolvematrix: 'feConvolveMatrix'
433         fediffuselighting: 'feDiffuseLighting'
434         fedisplacementmap: 'feDisplacementMap'
435         fedistantlight: 'feDistantLight'
436         fedropshadow: 'feDropShadow'
437         feflood: 'feFlood'
438         fefunca: 'feFuncA'
439         fefuncb: 'feFuncB'
440         fefuncg: 'feFuncG'
441         fefuncr: 'feFuncR'
442         fegaussianblur: 'feGaussianBlur'
443         feimage: 'feImage'
444         femerge: 'feMerge'
445         femergenode: 'feMergeNode'
446         femorphology: 'feMorphology'
447         feoffset: 'feOffset'
448         fepointlight: 'fePointLight'
449         fespecularlighting: 'feSpecularLighting'
450         fespotlight: 'feSpotLight'
451         fetile: 'feTile'
452         feturbulence: 'feTurbulence'
453         foreignobject: 'foreignObject'
454         glyphref: 'glyphRef'
455         lineargradient: 'linearGradient'
456         radialgradient: 'radialGradient'
457         textpath: 'textPath'
458 }
459 svg_attribute_fixes = {
460         attributename: 'attributeName'
461         attributetype: 'attributeType'
462         basefrequency: 'baseFrequency'
463         baseprofile: 'baseProfile'
464         calcmode: 'calcMode'
465         clippathunits: 'clipPathUnits'
466         contentscripttype: 'contentScriptType'
467         contentstyletype: 'contentStyleType'
468         diffuseconstant: 'diffuseConstant'
469         edgemode: 'edgeMode'
470         externalresourcesrequired: 'externalResourcesRequired'
471         # WTAG removes this: filterres: 'filterRes'
472         filterunits: 'filterUnits'
473         glyphref: 'glyphRef'
474         gradienttransform: 'gradientTransform'
475         gradientunits: 'gradientUnits'
476         kernelmatrix: 'kernelMatrix'
477         kernelunitlength: 'kernelUnitLength'
478         keypoints: 'keyPoints'
479         keysplines: 'keySplines'
480         keytimes: 'keyTimes'
481         lengthadjust: 'lengthAdjust'
482         limitingconeangle: 'limitingConeAngle'
483         markerheight: 'markerHeight'
484         markerunits: 'markerUnits'
485         markerwidth: 'markerWidth'
486         maskcontentunits: 'maskContentUnits'
487         maskunits: 'maskUnits'
488         numoctaves: 'numOctaves'
489         pathlength: 'pathLength'
490         patterncontentunits: 'patternContentUnits'
491         patterntransform: 'patternTransform'
492         patternunits: 'patternUnits'
493         pointsatx: 'pointsAtX'
494         pointsaty: 'pointsAtY'
495         pointsatz: 'pointsAtZ'
496         preservealpha: 'preserveAlpha'
497         preserveaspectratio: 'preserveAspectRatio'
498         primitiveunits: 'primitiveUnits'
499         refx: 'refX'
500         refy: 'refY'
501         repeatcount: 'repeatCount'
502         repeatdur: 'repeatDur'
503         requiredextensions: 'requiredExtensions'
504         requiredfeatures: 'requiredFeatures'
505         specularconstant: 'specularConstant'
506         specularexponent: 'specularExponent'
507         spreadmethod: 'spreadMethod'
508         startoffset: 'startOffset'
509         stddeviation: 'stdDeviation'
510         stitchtiles: 'stitchTiles'
511         surfacescale: 'surfaceScale'
512         systemlanguage: 'systemLanguage'
513         tablevalues: 'tableValues'
514         targetx: 'targetX'
515         targety: 'targetY'
516         textlength: 'textLength'
517         viewbox: 'viewBox'
518         viewtarget: 'viewTarget'
519         xchannelselector: 'xChannelSelector'
520         ychannelselector: 'yChannelSelector'
521         zoomandpan: 'zoomAndPan'
522 }
523 foreign_attr_fixes = {
524         'xlink:actuate': 'xlink actuate'
525         'xlink:arcrole': 'xlink arcrole'
526         'xlink:href': 'xlink href'
527         'xlink:role': 'xlink role'
528         'xlink:show': 'xlink show'
529         'xlink:title': 'xlink title'
530         'xlink:type': 'xlink type'
531         'xml:base': 'xml base'
532         'xml:lang': 'xml lang'
533         'xml:space': 'xml space'
534         'xmlns': 'xmlns'
535         'xmlns:xlink': 'xmlns xlink'
536 }
537 adjust_mathml_attributes = (t) ->
538         for a in t.attrs_a
539                 if a[0] is 'definitionurl'
540                         a[0] = 'definitionURL'
541         return
542 adjust_svg_attributes = (t) ->
543         for a in t.attrs_a
544                 if svg_attribute_fixes[a[0]]?
545                         a[0] = svg_attribute_fixes[a[0]]
546         return
547 adjust_foreign_attributes = (t) ->
548         # fixfull
549         for a in t.attrs_a
550                 if foreign_attr_fixes[a[0]]?
551                         a[0] = foreign_attr_fixes[a[0]]
552         return
553
554 # decode_named_char_ref()
555 #
556 # The list of named character references is _huge_ so ask the browser to decode
557 # for us instead of wasting bandwidth/space on including the table here.
558 #
559 # Pass without the "&" but with the ";" examples:
560 #    for "&amp" pass "amp;"
561 #    for "&#x2032" pass "x2032;"
562 g_dncr = {
563         cache: {}
564         textarea: document.createElement('textarea')
565 }
566 # TODO test this in IE8
567 decode_named_char_ref = (txt) ->
568         txt = "&#{txt}"
569         decoded = g_dncr.cache[txt]
570         return decoded if decoded?
571         g_dncr.textarea.innerHTML = txt
572         decoded = g_dncr.textarea.value
573         return null if decoded is txt
574         return g_dncr.cache[txt] = decoded
575
576 parse_html = (args) ->
577         txt = null
578         cur = null # index of next char in txt to be parsed
579         # declare doc and tokenizer variables so they're in scope below
580         doc = null
581         open_els = null # stack of open elements
582         afe = null # active formatting elements
583         template_ins_modes = null
584         ins_mode = null
585         original_ins_mode = null
586         tok_state = null
587         tok_cur_tag = null # partially parsed tag
588         flag_scripting = null
589         flag_frameset_ok = null
590         flag_parsing = null
591         flag_foster_parenting = null
592         form_element_pointer = null
593         temporary_buffer = null
594         pending_table_character_tokens = null
595         head_element_pointer = null
596         flag_fragment_parsing = null
597         context_element = null
598
599         stop_parsing = ->
600                 flag_parsing = false
601
602         parse_error = ->
603                 if args.error_cb?
604                         args.error_cb cur
605                 else
606                         console.log "Parse error at character #{cur} of #{txt.length}"
607
608         afe_push = (new_el) ->
609                 matches = 0
610                 for el, i in afe
611                         if el.name is new_el.name and el.namespace is new_el.namespace
612                                 for k, v of el.attrs
613                                         continue unless new_el.attrs[k] is v
614                                 for k, v of new_el.attrs
615                                         continue unless el.attrs[k] is v
616                                 matches += 1
617                                 if matches is 3
618                                         afe.splice i, 1
619                                         break
620                 afe.unshift new_el
621         afe_push_marker = ->
622                 afe.unshift new_afe_marker()
623
624         # the functions below impliment the Tree Contstruction algorithm
625         # http://www.w3.org/TR/html5/syntax.html#tree-construction
626
627         # But first... the helpers
628         template_tag_is_open = ->
629                 for t in open_els
630                         if t.name is 'template' and t.namespace is NS_HTML
631                                 return true
632                 return false
633         is_in_scope_x = (tag_name, scope, namespace) ->
634                 for t in open_els
635                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
636                                 return true
637                         if scope[t.name] is t.namespace
638                                 return false
639                 return false
640         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
641                 for t in open_els
642                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
643                                 return true
644                         if scope[t.name] is t.namespace
645                                 return false
646                         if scope2[t.name] is t.namespace
647                                 return false
648                 return false
649         standard_scopers = {
650                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
651                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
652                 template: NS_HTML,
653
654                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
655                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
656
657                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
658         }
659         button_scopers = button: NS_HTML
660         li_scopers = ol: NS_HTML, ul: NS_HTML
661         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
662         is_in_scope = (tag_name, namespace = null) ->
663                 return is_in_scope_x tag_name, standard_scopers, namespace
664         is_in_button_scope = (tag_name, namespace = null) ->
665                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
666         is_in_table_scope = (tag_name, namespace = null) ->
667                 return is_in_scope_x tag_name, table_scopers, namespace
668         # aka is_in_list_item_scope
669         is_in_li_scope = (tag_name, namespace = null) ->
670                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
671         is_in_select_scope = (tag_name, namespace = null) ->
672                 for t in open_els
673                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
674                                 return true
675                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
676                                 return false
677                 return false
678         # this checks for a particular element, not by name
679         # this requires a namespace match
680         el_is_in_scope = (needle) ->
681                 for el in open_els
682                         if el is needle
683                                 return true
684                         if standard_scopers[el.name] is el.namespace
685                                 return false
686                 return false
687
688         clear_to_table_stopers = {
689                 'table': true
690                 'template': true
691                 'html': true
692         }
693         clear_stack_to_table_context = ->
694                 loop
695                         if clear_to_table_stopers[open_els[0].name]?
696                                 break
697                         open_els.shift()
698                 return
699         clear_to_table_body_stopers = {
700                 tbody: NS_HTML
701                 tfoot: NS_HTML
702                 thead: NS_HTML
703                 template: NS_HTML
704                 html: NS_HTML
705         }
706         clear_stack_to_table_body_context = ->
707                 loop
708                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
709                                 break
710                         open_els.shift()
711                 return
712         clear_to_table_row_stopers = {
713                 'tr': true
714                 'template': true
715                 'html': true
716         }
717         clear_stack_to_table_row_context = ->
718                 loop
719                         if clear_to_table_row_stopers[open_els[0].name]?
720                                 break
721                         open_els.shift()
722                 return
723         clear_afe_to_marker = ->
724                 loop
725                         return unless afe.length > 0 # this happens in fragment case, ?spec error
726                         el = afe.shift()
727                         if el.type is TYPE_AFE_MARKER
728                                 return
729                 return
730
731         # 8.2.3.1 ...
732         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
733         reset_ins_mode = ->
734                 # 1. Let last be false.
735                 last = false
736                 # 2. Let node be the last node in the stack of open elements.
737                 node_i = 0
738                 node = open_els[node_i]
739                 # 3. Loop: If node is the first node in the stack of open elements,
740                 # then set last to true, and, if the parser was originally created as
741                 # part of the HTML fragment parsing algorithm (fragment case) set node
742                 # to the context element.
743                 loop
744                         if node_i is open_els.length - 1
745                                 last = true
746                                 # fixfull (fragment case)
747
748                         # 4. If node is a select element, run these substeps:
749                         if node.name is 'select' and node.namespace is NS_HTML
750                                 # 1. If last is true, jump to the step below labeled done.
751                                 unless last
752                                         # 2. Let ancestor be node.
753                                         ancestor_i = node_i
754                                         ancestor = node
755                                         # 3. Loop: If ancestor is the first node in the stack of
756                                         # open elements, jump to the step below labeled done.
757                                         loop
758                                                 if ancestor_i is open_els.length - 1
759                                                         break
760                                                 # 4. Let ancestor be the node before ancestor in the stack
761                                                 # of open elements.
762                                                 ancestor_i += 1
763                                                 ancestor = open_els[ancestor_i]
764                                                 # 5. If ancestor is a template node, jump to the step below
765                                                 # labeled done.
766                                                 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
767                                                         break
768                                                 # 6. If ancestor is a table node, switch the insertion mode
769                                                 # to "in select in table" and abort these steps.
770                                                 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
771                                                         ins_mode = ins_mode_in_select_in_table
772                                                         return
773                                                 # 7. Jump back to the step labeled loop.
774                                 # 8. Done: Switch the insertion mode to "in select" and abort
775                                 # these steps.
776                                 ins_mode = ins_mode_in_select
777                                 return
778                         # 5. If node is a td or th element and last is false, then switch
779                         # the insertion mode to "in cell" and abort these steps.
780                         if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
781                                 ins_mode = ins_mode_in_cell
782                                 return
783                         # 6. If node is a tr element, then switch the insertion mode to "in
784                         # row" and abort these steps.
785                         if node.name is 'tr' and node.namespace is NS_HTML
786                                 ins_mode = ins_mode_in_row
787                                 return
788                         # 7. If node is a tbody, thead, or tfoot element, then switch the
789                         # insertion mode to "in table body" and abort these steps.
790                         if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
791                                 ins_mode = ins_mode_in_table_body
792                                 return
793                         # 8. If node is a caption element, then switch the insertion mode
794                         # to "in caption" and abort these steps.
795                         if node.name is 'caption' and node.namespace is NS_HTML
796                                 ins_mode = ins_mode_in_caption
797                                 return
798                         # 9. If node is a colgroup element, then switch the insertion mode
799                         # to "in column group" and abort these steps.
800                         if node.name is 'colgroup' and node.namespace is NS_HTML
801                                 ins_mode = ins_mode_in_column_group
802                                 return
803                         # 10. If node is a table element, then switch the insertion mode to
804                         # "in table" and abort these steps.
805                         if node.name is 'table' and node.namespace is NS_HTML
806                                 ins_mode = ins_mode_in_table
807                                 return
808                         # 11. If node is a template element, then switch the insertion mode
809                         # to the current template insertion mode and abort these steps.
810                         if node.name is 'template' and node.namespace is NS_HTML
811                                 ins_mode = template_ins_modes[0]
812                                 return
813                         # 12. If node is a head element and last is true, then switch the
814                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
815                         # these steps. (fragment case)
816                         if node.name is 'head' and node.namespace is NS_HTML and last
817                                 ins_mode = ins_mode_in_body
818                                 return
819                         # 13. If node is a head element and last is false, then switch the
820                         # insertion mode to "in head" and abort these steps.
821                         if node.name is 'head' and node.namespace is NS_HTML and last is false
822                                 ins_mode = ins_mode_in_head
823                                 return
824                         # 14. If node is a body element, then switch the insertion mode to
825                         # "in body" and abort these steps.
826                         if node.name is 'body' and node.namespace is NS_HTML
827                                 ins_mode = ins_mode_in_body
828                                 return
829                         # 15. If node is a frameset element, then switch the insertion mode
830                         # to "in frameset" and abort these steps. (fragment case)
831                         if node.name is 'frameset' and node.namespace is NS_HTML
832                                 ins_mode = ins_mode_in_frameset
833                                 return
834                         # 16. If node is an html element, run these substeps:
835                         if node.name is 'html' and node.namespace is NS_HTML
836                                 # 1. If the head element pointer is null, switch the insertion
837                                 # mode to "before head" and abort these steps. (fragment case)
838                                 if head_element_pointer is null
839                                         ins_mode = ins_mode_before_head
840                                 else
841                                         # 2. Otherwise, the head element pointer is not null,
842                                         # switch the insertion mode to "after head" and abort these
843                                         # steps.
844                                         ins_mode = ins_mode_after_head
845                                 return
846                         # 17. If last is true, then switch the insertion mode to "in body"
847                         # and abort these steps. (fragment case)
848                         if last
849                                 ins_mode = ins_mode_in_body
850                                 return
851                         # 18. Let node now be the node before node in the stack of open
852                         # elements.
853                         node_i += 1
854                         node = open_els[node_i]
855                         # 19. Return to the step labeled loop.
856
857         # 8.2.3.2
858
859         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
860         adjusted_current_node = ->
861                 if open_els.length is 1 and flag_fragment_parsing
862                         return context_element
863                 return open_els[0]
864
865         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
866         # this implementation is structured (mostly) as described at the link above.
867         # capitalized comments are the "labels" described at the link above.
868         reconstruct_afe = ->
869                 return if afe.length is 0
870                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
871                         return
872                 # Rewind
873                 i = 0
874                 loop
875                         if i is afe.length - 1
876                                 break
877                         i += 1
878                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
879                                 i -= 1 # Advance
880                                 break
881                 # Create
882                 loop
883                         el = insert_html_element afe[i].token
884                         afe[i] = el
885                         break if i is 0
886                         i -= 1 # Advance
887
888         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
889         # adoption agency algorithm
890         # overview here:
891         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
892         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
893         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
894         adoption_agency = (subject) ->
895                 debug_log "adoption_agency()"
896                 debug_log "tree: #{serialize_els doc.children, false, true}"
897                 debug_log "open_els: #{serialize_els open_els, true, true}"
898                 debug_log "afe: #{serialize_els afe, true, true}"
899                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
900                         el = open_els[0]
901                         open_els.shift()
902                         # remove it from the list of active formatting elements (if found)
903                         for t, i in afe
904                                 if t is el
905                                         afe.splice i, 1
906                                         break
907                         debug_log "aaa: starting off with subject on top of stack, exiting"
908                         return
909                 outer = 0
910                 loop
911                         if outer >= 8
912                                 return
913                         outer += 1
914                         # 5. Let formatting element be the last element in the list of
915                         # active formatting elements that: is between the end of the list
916                         # and the last scope marker in the list, if any, or the start of
917                         # the list otherwise, and  has the tag name subject.
918                         fe = null
919                         for t, fe_of_afe in afe
920                                 if t.type is TYPE_AFE_MARKER
921                                         break
922                                 if t.name is subject
923                                         fe = t
924                                         break
925                         # If there is no such element, then abort these steps and instead
926                         # act as described in the "any other end tag" entry above.
927                         if fe is null
928                                 debug_log "aaa: fe not found in afe"
929                                 in_body_any_other_end_tag subject
930                                 return
931                         # 6. If formatting element is not in the stack of open elements,
932                         # then this is a parse error; remove the element from the list, and
933                         # abort these steps.
934                         in_open_els = false
935                         for t, fe_of_open_els in open_els
936                                 if t is fe
937                                         in_open_els = true
938                                         break
939                         unless in_open_els
940                                 debug_log "aaa: fe not found in open_els"
941                                 parse_error()
942                                 # "remove it from the list" must mean afe, since it's not in open_els
943                                 afe.splice fe_of_afe, 1
944                                 return
945                         # 7. If formatting element is in the stack of open elements, but
946                         # the element is not in scope, then this is a parse error; abort
947                         # these steps.
948                         unless el_is_in_scope fe
949                                 debug_log "aaa: fe not in scope"
950                                 parse_error()
951                                 return
952                         # 8. If formatting element is not the current node, this is a parse
953                         # error. (But do not abort these steps.)
954                         unless open_els[0] is fe
955                                 parse_error()
956                                 # continue
957                         # 9. Let furthest block be the topmost node in the stack of open
958                         # elements that is lower in the stack than formatting element, and
959                         # is an element in the special category. There might not be one.
960                         fb = null
961                         fb_of_open_els = null
962                         for t, i in open_els
963                                 if t is fe
964                                         break
965                                 if el_is_special t
966                                         fb = t
967                                         fb_of_open_els = i
968                                         # and continue, to see if there's one that's more "topmost"
969                         # 10. If there is no furthest block, then the UA must first pop all
970                         # the nodes from the bottom of the stack of open elements, from the
971                         # current node up to and including formatting element, then remove
972                         # formatting element from the list of active formatting elements,
973                         # and finally abort these steps.
974                         if fb is null
975                                 debug_log "aaa: no fb"
976                                 loop
977                                         t = open_els.shift()
978                                         if t is fe
979                                                 afe.splice fe_of_afe, 1
980                                                 return
981                         # 11. Let common ancestor be the element immediately above
982                         # formatting element in the stack of open elements.
983                         ca = open_els[fe_of_open_els + 1] # common ancestor
984
985                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
986                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
987                         bookmark = new_aaa_bookmark()
988                         for t, i in afe
989                                 if t is fe
990                                         afe.splice i, 0, bookmark
991                                         break
992                         node = last_node = fb
993                         inner = 0
994                         loop
995                                 inner += 1
996                                 # 3. Let node be the element immediately above node in the
997                                 # stack of open elements, or if node is no longer in the stack
998                                 # of open elements (e.g. because it got removed by this
999                                 # algorithm), the element that was immediately above node in
1000                                 # the stack of open elements before node was removed.
1001                                 node_next = null
1002                                 for t, i in open_els
1003                                         if t is node
1004                                                 node_next = open_els[i + 1]
1005                                                 break
1006                                 node = node_next ? node_above
1007                                 debug_log "inner loop #{inner}"
1008                                 debug_log "tree: #{serialize_els doc.children, false, true}"
1009                                 debug_log "open_els: #{serialize_els open_els, true, true}"
1010                                 debug_log "afe: #{serialize_els afe, true, true}"
1011                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1012                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1013                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1014                                 debug_log "node: #{node.serialize true, true}"
1015                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
1016
1017                                 # 4. If node is formatting element, then go to the next step in
1018                                 # the overall algorithm.
1019                                 if node is fe
1020                                         break
1021                                 debug_log "the meat"
1022                                 # 5. If inner loop counter is greater than three and node is in
1023                                 # the list of active formatting elements, then remove node from
1024                                 # the list of active formatting elements.
1025                                 node_in_afe = false
1026                                 for t, i in afe
1027                                         if t is node
1028                                                 if inner > 3
1029                                                         afe.splice i, 1
1030                                                         debug_log "max out inner"
1031                                                 else
1032                                                         node_in_afe = true
1033                                                         debug_log "in afe"
1034                                                 break
1035                                 # 6. If node is not in the list of active formatting elements,
1036                                 # then remove node from the stack of open elements and then go
1037                                 # back to the step labeled inner loop.
1038                                 unless node_in_afe
1039                                         debug_log "not in afe"
1040                                         for t, i in open_els
1041                                                 if t is node
1042                                                         node_above = open_els[i + 1]
1043                                                         open_els.splice i, 1
1044                                                         break
1045                                         continue
1046                                 debug_log "the bones"
1047                                 # 7. create an element for the token for which the element node
1048                                 # was created, in the HTML namespace, with common ancestor as
1049                                 # the intended parent; replace the entry for node in the list
1050                                 # of active formatting elements with an entry for the new
1051                                 # element, replace the entry for node in the stack of open
1052                                 # elements with an entry for the new element, and let node be
1053                                 # the new element.
1054                                 new_node = token_to_element node.token, NS_HTML, ca
1055                                 for t, i in afe
1056                                         if t is node
1057                                                 afe[i] = new_node
1058                                                 debug_log "replaced in afe"
1059                                                 break
1060                                 for t, i in open_els
1061                                         if t is node
1062                                                 node_above = open_els[i + 1]
1063                                                 open_els[i] = new_node
1064                                                 debug_log "replaced in open_els"
1065                                                 break
1066                                 node = new_node
1067                                 # 8. If last node is furthest block, then move the
1068                                 # aforementioned bookmark to be immediately after the new node
1069                                 # in the list of active formatting elements.
1070                                 if last_node is fb
1071                                         for t, i in afe
1072                                                 if t is bookmark
1073                                                         afe.splice i, 1
1074                                                         debug_log "removed bookmark"
1075                                                         break
1076                                         for t, i in afe
1077                                                 if t is node
1078                                                         # "after" means lower
1079                                                         afe.splice i, 0, bookmark # "after as <-
1080                                                         debug_log "placed bookmark after node"
1081                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1082                                                         break
1083                                 # 9. Insert last node into node, first removing it from its
1084                                 # previous parent node if any.
1085                                 if last_node.parent?
1086                                         debug_log "last_node has parent"
1087                                         for c, i in last_node.parent.children
1088                                                 if c is last_node
1089                                                         debug_log "removing last_node from parent"
1090                                                         last_node.parent.children.splice i, 1
1091                                                         break
1092                                 node.children.push last_node
1093                                 last_node.parent = node
1094                                 # 10. Let last node be node.
1095                                 last_node = node
1096                                 debug_log "at last"
1097                                 # 11. Return to the step labeled inner loop.
1098                         # 14. Insert whatever last node ended up being in the previous step
1099                         # at the appropriate place for inserting a node, but using common
1100                         # ancestor as the override target.
1101
1102                         # In the case where fe is immediately followed by fb:
1103                         #   * inner loop exits out early (node==fe)
1104                         #   * last_node is fb
1105                         #   * last_node is still in the tree (not a duplicate)
1106                         if last_node.parent?
1107                                 debug_log "FEFIRST? last_node has parent"
1108                                 for c, i in last_node.parent.children
1109                                         if c is last_node
1110                                                 debug_log "removing last_node from parent"
1111                                                 last_node.parent.children.splice i, 1
1112                                                 break
1113
1114                         debug_log "after aaa inner loop"
1115                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1119                         debug_log "tree: #{serialize_els doc.children, false, true}"
1120
1121                         debug_log "insert"
1122
1123
1124                         # can't use standard insert token thing, because it's already in
1125                         # open_els and must stay at it's current position in open_els
1126                         dest = adjusted_insertion_location ca
1127                         dest[0].children.splice dest[1], 0, last_node
1128                         last_node.parent = dest[0]
1129
1130
1131                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1132                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1133                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1134                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1135                         debug_log "tree: #{serialize_els doc.children, false, true}"
1136
1137                         # 15. Create an element for the token for which formatting element
1138                         # was created, in the HTML namespace, with furthest block as the
1139                         # intended parent.
1140                         new_element = token_to_element fe.token, NS_HTML, fb
1141                         # 16. Take all of the child nodes of furthest block and append them
1142                         # to the element created in the last step.
1143                         while fb.children.length
1144                                 t = fb.children.shift()
1145                                 t.parent = new_element
1146                                 new_element.children.push t
1147                         # 17. Append that new element to furthest block.
1148                         new_element.parent = fb
1149                         fb.children.push new_element
1150                         # 18. Remove formatting element from the list of active formatting
1151                         # elements, and insert the new element into the list of active
1152                         # formatting elements at the position of the aforementioned
1153                         # bookmark.
1154                         for t, i in afe
1155                                 if t is fe
1156                                         afe.splice i, 1
1157                                         break
1158                         for t, i in afe
1159                                 if t is bookmark
1160                                         afe[i] = new_element
1161                                         break
1162                         # 19. Remove formatting element from the stack of open elements,
1163                         # and insert the new element into the stack of open elements
1164                         # immediately below the position of furthest block in that stack.
1165                         for t, i in open_els
1166                                 if t is fe
1167                                         open_els.splice i, 1
1168                                         break
1169                         for t, i in open_els
1170                                 if t is fb
1171                                         open_els.splice i, 0, new_element
1172                                         break
1173                         # 20. Jump back to the step labeled outer loop.
1174                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1175                         debug_log "tree: #{serialize_els doc.children, false, true}"
1176                         debug_log "open_els: #{serialize_els open_els, true, true}"
1177                         debug_log "afe: #{serialize_els afe, true, true}"
1178                 debug_log "AAA DONE"
1179
1180         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1181         close_p_element = ->
1182                 generate_implied_end_tags 'p' # arg is exception
1183                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1184                         parse_error()
1185                 while open_els.length > 1 # just in case
1186                         el = open_els.shift()
1187                         if el.name is 'p' and el.namespace is NS_HTML
1188                                 return
1189         close_p_if_in_button_scope = ->
1190                 if is_in_button_scope 'p', NS_HTML
1191                         close_p_element()
1192
1193         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1194         # aka insert_a_character = (t) ->
1195         insert_character = (t) ->
1196                 dest = adjusted_insertion_location()
1197                 # fixfull check for Document node
1198                 if dest[1] > 0
1199                         prev = dest[0].children[dest[1] - 1]
1200                         if prev.type is TYPE_TEXT
1201                                 prev.text += t.text
1202                                 return
1203                 dest[0].children.splice dest[1], 0, t
1204
1205
1206         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1207         process_token = (t) ->
1208                 acn = adjusted_current_node()
1209                 unless acn?
1210                         ins_mode t
1211                         return
1212                 if acn.namespace is NS_HTML
1213                         ins_mode t
1214                         return
1215                 if is_mathml_text_integration_point(acn)
1216                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1217                                 ins_mode t
1218                                 return
1219                         if t.type is TYPE_TEXT
1220                                 ins_mode t
1221                                 return
1222                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1223                         ins_mode t
1224                         return
1225                 if is_html_integration acn
1226                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1227                                 ins_mode t
1228                                 return
1229                 if t.type is TYPE_EOF
1230                         ins_mode t
1231                         return
1232                 in_foreign_content t
1233                 return
1234
1235         # 8.2.5.1
1236         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1237         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1238         adjusted_insertion_location = (override_target = null) ->
1239                 # 1. If there was an override target specified, then let target be the
1240                 # override target.
1241                 if override_target?
1242                         target = override_target
1243                 else # Otherwise, let target be the current node.
1244                         target = open_els[0]
1245                 # 2. Determine the adjusted insertion location using the first matching
1246                 # steps from the following list:
1247                 #
1248                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1249                 # thead, or tr element Foster parenting happens when content is
1250                 # misnested in tables.
1251                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1252                         loop # once. this is here so we can ``break`` to "abort these substeps"
1253                                 # 1. Let last template be the last template element in the
1254                                 # stack of open elements, if any.
1255                                 last_template = null
1256                                 last_template_i = null
1257                                 for el, i in open_els
1258                                         if el.name is 'template' and el.namespace is NS_HTML
1259                                                 last_template = el
1260                                                 last_template_i = i
1261                                                 break
1262                                 # 2. Let last table be the last table element in the stack of
1263                                 # open elements, if any.
1264                                 last_table = null
1265                                 last_table_i
1266                                 for el, i in open_els
1267                                         if el.name is 'table' and el.namespace is NS_HTML
1268                                                 last_table = el
1269                                                 last_table_i = i
1270                                                 break
1271                                 # 3. If there is a last template and either there is no last
1272                                 # table, or there is one, but last template is lower (more
1273                                 # recently added) than last table in the stack of open
1274                                 # elements, then: let adjusted insertion location be inside
1275                                 # last template's template contents, after its last child (if
1276                                 # any), and abort these substeps.
1277                                 if last_template and (last_table is null or last_template_i < last_table_i)
1278                                         target = last_template # fixfull should be it's contents
1279                                         target_i = target.children.length
1280                                         break
1281                                 # 4. If there is no last table, then let adjusted insertion
1282                                 # location be inside the first element in the stack of open
1283                                 # elements (the html element), after its last child (if any),
1284                                 # and abort these substeps. (fragment case)
1285                                 if last_table is null
1286                                         # this is odd
1287                                         target = open_els[open_els.length - 1]
1288                                         target_i = target.children.length
1289                                         break
1290                                 # 5. If last table has a parent element, then let adjusted
1291                                 # insertion location be inside last table's parent element,
1292                                 # immediately before last table, and abort these substeps.
1293                                 if last_table.parent?
1294                                         for c, i in last_table.parent.children
1295                                                 if c is last_table
1296                                                         target = last_table.parent
1297                                                         target_i = i
1298                                                         break
1299                                         break
1300                                 # 6. Let previous element be the element immediately above last
1301                                 # table in the stack of open elements.
1302                                 #
1303                                 # huh? how could it not have a parent?
1304                                 previous_element = open_els[last_table_i + 1]
1305                                 # 7. Let adjusted insertion location be inside previous
1306                                 # element, after its last child (if any).
1307                                 target = previous_element
1308                                 target_i = target.children.length
1309                                 # Note: These steps are involved in part because it's possible
1310                                 # for elements, the table element in this case in particular,
1311                                 # to have been moved by a script around in the DOM, or indeed
1312                                 # removed from the DOM entirely, after the element was inserted
1313                                 # by the parser.
1314                                 break # don't really loop
1315                 else
1316                         # Otherwise Let adjusted insertion location be inside target, after
1317                         # its last child (if any).
1318                         target_i = target.children.length
1319
1320                 # 3. If the adjusted insertion location is inside a template element,
1321                 # let it instead be inside the template element's template contents,
1322                 # after its last child (if any).
1323                 # fixfull (template)
1324
1325                 # 4. Return the adjusted insertion location.
1326                 return [target, target_i]
1327
1328         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1329         # aka create_an_element_for_token
1330         token_to_element = (t, namespace, intended_parent) ->
1331                 # convert attributes into a hash
1332                 attrs = {}
1333                 for a in t.attrs_a
1334                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1335                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1336
1337                 # TODO 2. If the newly created element has an xmlns attribute in the
1338                 # XMLNS namespace whose value is not exactly the same as the element's
1339                 # namespace, that is a parse error. Similarly, if the newly created
1340                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1341                 # value is not the XLink Namespace, that is a parse error.
1342
1343                 # fixfull: the spec says stuff about form pointers and ownerDocument
1344
1345                 return el
1346
1347         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1348         insert_foreign_element = (token, namespace) ->
1349                 ail = adjusted_insertion_location()
1350                 ail_el = ail[0]
1351                 ail_i = ail[1]
1352                 el = token_to_element token, namespace, ail_el
1353                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1354                 el.parent = ail_el
1355                 ail_el.children.splice ail_i, 0, el
1356                 open_els.unshift el
1357                 return el
1358         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1359         insert_html_element = (token) ->
1360                 insert_foreign_element token, NS_HTML
1361
1362         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1363         # position should be [node, index_within_children]
1364         insert_comment = (t, position = null) ->
1365                 position ?= adjusted_insertion_location()
1366                 position[0].children.splice position[1], 0, t
1367
1368         # 8.2.5.2
1369         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1370         parse_generic_raw_text = (t) ->
1371                 insert_html_element t
1372                 tok_state = tok_state_rawtext
1373                 original_ins_mode = ins_mode
1374                 ins_mode = ins_mode_text
1375         parse_generic_rcdata_text = (t) ->
1376                 insert_html_element t
1377                 tok_state = tok_state_rcdata
1378                 original_ins_mode = ins_mode
1379                 ins_mode = ins_mode_text
1380
1381         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1382         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1383         generate_implied_end_tags = (except = null) ->
1384                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1385                         open_els.shift()
1386
1387         # 8.2.5.4 The rules for parsing tokens in HTML content
1388         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1389
1390         # 8.2.5.4.1 The "initial" insertion mode
1391         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1392         ins_mode_initial = (t) ->
1393                 if is_space_tok t
1394                         return
1395                 if t.type is TYPE_COMMENT
1396                         # ?fixfull
1397                         doc.children.push t
1398                         return
1399                 if t.type is TYPE_DOCTYPE
1400                         # FIXME check identifiers, set quirks, etc
1401                         # fixfull
1402                         doc.children.push t
1403                         ins_mode = ins_mode_before_html
1404                         return
1405                 # Anything else
1406                 #fixfull (iframe, quirks)
1407                 ins_mode = ins_mode_before_html
1408                 process_token t
1409                 return
1410
1411         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1412         ins_mode_before_html = (t) ->
1413                 if t.type is TYPE_DOCTYPE
1414                         parse_error()
1415                         return
1416                 if t.type is TYPE_COMMENT
1417                         doc.children.push t
1418                         return
1419                 if is_space_tok t
1420                         return
1421                 if t.type is TYPE_START_TAG and t.name is 'html'
1422                         el = token_to_element t, NS_HTML, doc
1423                         doc.children.push el
1424                         open_els.unshift(el)
1425                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1426                         ins_mode = ins_mode_before_head
1427                         return
1428                 if t.type is TYPE_END_TAG
1429                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1430                                 # fall through to "anything else"
1431                         else
1432                                 parse_error()
1433                                 return
1434                 # Anything else
1435                 html_tok = new_open_tag 'html'
1436                 el = token_to_element html_tok, NS_HTML, doc
1437                 doc.children.push el
1438                 open_els.unshift el
1439                 # ?fixfull browsing context
1440                 ins_mode = ins_mode_before_head
1441                 process_token t
1442                 return
1443
1444         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1445         ins_mode_before_head = (t) ->
1446                 if is_space_tok t
1447                         return
1448                 if t.type is TYPE_COMMENT
1449                         insert_comment t
1450                         return
1451                 if t.type is TYPE_DOCTYPE
1452                         parse_error()
1453                         return
1454                 if t.type is TYPE_START_TAG and t.name is 'html'
1455                         ins_mode_in_body t
1456                         return
1457                 if t.type is TYPE_START_TAG and t.name is 'head'
1458                         el = insert_html_element t
1459                         head_element_pointer = el
1460                         ins_mode = ins_mode_in_head
1461                         return
1462                 if t.type is TYPE_END_TAG
1463                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1464                                 # fall through to Anything else below
1465                         else
1466                                 parse_error()
1467                                 return
1468                 # Anything else
1469                 head_tok = new_open_tag 'head'
1470                 el = insert_html_element head_tok
1471                 head_element_pointer = el
1472                 ins_mode = ins_mode_in_head
1473                 process_token t
1474
1475         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1476         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1477                 open_els.shift() # spec says this will be a 'head' node
1478                 ins_mode = ins_mode_after_head
1479                 process_token t
1480         ins_mode_in_head = (t) ->
1481                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1482                         insert_character t
1483                         return
1484                 if t.type is TYPE_COMMENT
1485                         insert_comment t
1486                         return
1487                 if t.type is TYPE_DOCTYPE
1488                         parse_error()
1489                         return
1490                 if t.type is TYPE_START_TAG and t.name is 'html'
1491                         ins_mode_in_body t
1492                         return
1493                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1494                         el = insert_html_element t
1495                         open_els.shift()
1496                         t.acknowledge_self_closing()
1497                         return
1498                 if t.type is TYPE_START_TAG and t.name is 'meta'
1499                         el = insert_html_element t
1500                         open_els.shift()
1501                         t.acknowledge_self_closing()
1502                         # fixfull encoding stuff
1503                         return
1504                 if t.type is TYPE_START_TAG and t.name is 'title'
1505                         parse_generic_rcdata_text t
1506                         return
1507                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1508                         parse_generic_raw_text t
1509                         return
1510                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1511                         insert_html_element t
1512                         ins_mode = ins_mode_in_head_noscript
1513                         return
1514                 if t.type is TYPE_START_TAG and t.name is 'script'
1515                         ail = adjusted_insertion_location()
1516                         el = token_to_element t, NS_HTML, ail
1517                         el.flag 'parser-inserted', true
1518                         # fixfull frament case
1519                         ail[0].children.splice ail[1], 0, el
1520                         open_els.unshift el
1521                         tok_state = tok_state_script_data
1522                         original_ins_mode = ins_mode # make sure orig... is defined
1523                         ins_mode = ins_mode_text
1524                         return
1525                 if t.type is TYPE_END_TAG and t.name is 'head'
1526                         open_els.shift() # will be a head element... spec says so
1527                         ins_mode = ins_mode_after_head
1528                         return
1529                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1530                         ins_mode_in_head_else t
1531                         return
1532                 if t.type is TYPE_START_TAG and t.name is 'template'
1533                         insert_html_element t
1534                         afe_push_marker()
1535                         flag_frameset_ok = false
1536                         ins_mode = ins_mode_in_template
1537                         template_ins_modes.unshift ins_mode_in_template
1538                         return
1539                 if t.type is TYPE_END_TAG and t.name is 'template'
1540                         if template_tag_is_open()
1541                                 generate_implied_end_tags
1542                                 if open_els[0].name isnt 'template'
1543                                         parse_error()
1544                                 loop
1545                                         el = open_els.shift()
1546                                         if el.name is 'template' and el.namespace is NS_HTML
1547                                                 break
1548                                 clear_afe_to_marker()
1549                                 template_ins_modes.shift()
1550                                 reset_ins_mode()
1551                         else
1552                                 parse_error()
1553                         return
1554                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1555                         parse_error()
1556                         return
1557                 ins_mode_in_head_else t
1558
1559         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1560         ins_mode_in_head_noscript_else = (t) ->
1561                 parse_error()
1562                 open_els.shift()
1563                 ins_mode = ins_mode_in_head
1564                 process_token t
1565         ins_mode_in_head_noscript = (t) ->
1566                 if t.type is TYPE_DOCTYPE
1567                         parse_error()
1568                         return
1569                 if t.type is TYPE_START_TAG and t.name is 'html'
1570                         ins_mode_in_body t
1571                         return
1572                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1573                         open_els.shift()
1574                         ins_mode = ins_mode_in_head
1575                         return
1576                 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1577                         ins_mode_in_head t
1578                         return
1579                 if t.type is TYPE_END_TAG and t.name is 'br'
1580                         ins_mode_in_head_noscript_else t
1581                         return
1582                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1583                         parse_error()
1584                         return
1585                 # Anything else
1586                 ins_mode_in_head_noscript_else t
1587                 return
1588
1589
1590
1591         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1592         ins_mode_after_head_else = (t) ->
1593                 body_tok = new_open_tag 'body'
1594                 insert_html_element body_tok
1595                 ins_mode = ins_mode_in_body
1596                 process_token t
1597                 return
1598         ins_mode_after_head = (t) ->
1599                 if is_space_tok t
1600                         insert_character t
1601                         return
1602                 if t.type is TYPE_COMMENT
1603                         insert_comment t
1604                         return
1605                 if t.type is TYPE_DOCTYPE
1606                         parse_error()
1607                         return
1608                 if t.type is TYPE_START_TAG and t.name is 'html'
1609                         ins_mode_in_body t
1610                         return
1611                 if t.type is TYPE_START_TAG and t.name is 'body'
1612                         insert_html_element t
1613                         flag_frameset_ok = false
1614                         ins_mode = ins_mode_in_body
1615                         return
1616                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1617                         insert_html_element t
1618                         ins_mode = ins_mode_in_frameset
1619                         return
1620                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1621                         parse_error()
1622                         open_els.unshift head_element_pointer
1623                         ins_mode_in_head t
1624                         for el, i of open_els
1625                                 if el is head_element_pointer
1626                                         open_els.splice i, 1
1627                                         return
1628                         console.log "warning: 23904 couldn't find head element in open_els"
1629                         return
1630                 if t.type is TYPE_END_TAG and t.name is 'template'
1631                         ins_mode_in_head t
1632                         return
1633                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1634                         ins_mode_after_head_else t
1635                         return
1636                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1637                         parse_error()
1638                         return
1639                 # Anything else
1640                 ins_mode_after_head_else t
1641
1642         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1643         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1644                 for el, i in open_els
1645                         if el.name is name and el.namespace is NS_HTML
1646                                 generate_implied_end_tags name # arg is exception
1647                                 parse_error() unless i is 0
1648                                 while i >= 0
1649                                         open_els.shift()
1650                                         i -= 1
1651                                 return
1652                         if special_elements[el.name] is el.namespace
1653                                 parse_error()
1654                                 return
1655                 return
1656         ins_mode_in_body = (t) ->
1657                 if t.type is TYPE_TEXT and t.text is "\u0000"
1658                         parse_error()
1659                         return
1660                 if is_space_tok t
1661                         reconstruct_afe()
1662                         insert_character t
1663                         return
1664                 if t.type is TYPE_TEXT
1665                         reconstruct_afe()
1666                         insert_character t
1667                         flag_frameset_ok = false
1668                         return
1669                 if t.type is TYPE_COMMENT
1670                         insert_comment t
1671                         return
1672                 if t.type is TYPE_DOCTYPE
1673                         parse_error()
1674                         return
1675                 if t.type is TYPE_START_TAG and t.name is 'html'
1676                         parse_error()
1677                         return if template_tag_is_open()
1678                         root_attrs = open_els[open_els.length - 1].attrs
1679                         for a in t.attrs_a
1680                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1681                         return
1682
1683                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1684                         ins_mode_in_head t
1685                         return
1686                 if t.type is TYPE_START_TAG and t.name is 'body'
1687                         parse_error()
1688                         return if open_els.length < 2
1689                         second = open_els[open_els.length - 2]
1690                         return unless second.namespace is NS_HTML
1691                         return unless second.name is 'body'
1692                         return if template_tag_is_open()
1693                         flag_frameset_ok = false
1694                         for a of t.attrs_a
1695                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1696                         return
1697                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1698                         parse_error()
1699                         return if open_els.length < 2
1700                         second_i = open_els.length - 2
1701                         second = open_els[second_i]
1702                         return unless second.namespace is NS_HTML
1703                         return unless second.name is 'body'
1704                         if flag_frameset_ok is false
1705                                 return
1706                         if second.parent?
1707                                 for el, i in second.parent.children
1708                                         if el is second
1709                                                 second.parent.children.splice i, 1
1710                                                 break
1711                         open_els.splice second_i, 1
1712                         # pop everything except the "root html element"
1713                         while open_els.length > 1
1714                                 open_els.shift()
1715                         insert_html_element t
1716                         ins_mode = ins_mode_in_frameset
1717                         return
1718                 if t.type is TYPE_EOF
1719                         ok_tags = {
1720                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1721                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1722                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1723                         }
1724                         for el in open_els
1725                                 unless ok_tags[t.name] is el.namespace
1726                                         parse_error()
1727                                         break
1728                         if template_ins_modes.length > 0
1729                                 ins_mode_in_template t
1730                         else
1731                                 stop_parsing()
1732                         return
1733                 if t.type is TYPE_END_TAG and t.name is 'body'
1734                         unless is_in_scope 'body', NS_HTML
1735                                 parse_error()
1736                                 return
1737                         ok_tags = {
1738                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1742                                 html:NS_HTML
1743                         }
1744                         for el in open_els
1745                                 unless ok_tags[t.name] is el.namespace
1746                                         parse_error()
1747                                         break
1748                         ins_mode = ins_mode_after_body
1749                         return
1750                 if t.type is TYPE_END_TAG and t.name is 'html'
1751                         unless is_in_scope 'body', NS_HTML
1752                                 parse_error()
1753                                 return
1754                         ok_tags = {
1755                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1756                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1757                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1758                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1759                                 html:NS_HTML
1760                         }
1761                         for el in open_els
1762                                 unless ok_tags[t.name] is el.namespace
1763                                         parse_error()
1764                                         break
1765                         ins_mode = ins_mode_after_body
1766                         process_token t
1767                         return
1768                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1769                         close_p_if_in_button_scope()
1770                         insert_html_element t
1771                         return
1772                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1773                         close_p_if_in_button_scope()
1774                         if h_tags[open_els[0].name] is open_els[0].namespace
1775                                 parse_error()
1776                                 open_els.shift()
1777                         insert_html_element t
1778                         return
1779                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1780                         close_p_if_in_button_scope()
1781                         insert_html_element t
1782                         # spec: If the next token is a "LF" (U+000A) character token, then
1783                         # ignore that token and move on to the next one. (Newlines at the
1784                         # start of pre blocks are ignored as an authoring convenience.)
1785                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1786                                 cur += 1
1787                         flag_frameset_ok = false
1788                         return
1789                 if t.type is TYPE_START_TAG and t.name is 'form'
1790                         unless form_element_pointer is null or template_tag_is_open()
1791                                 parse_error()
1792                                 return
1793                         close_p_if_in_button_scope()
1794                         el = insert_html_element t
1795                         unless template_tag_is_open()
1796                                 form_element_pointer = el
1797                         return
1798                 if t.type is TYPE_START_TAG and t.name is 'li'
1799                         flag_frameset_ok = false
1800                         for node in open_els
1801                                 if node.name is 'li' and node.namespace is NS_HTML
1802                                         generate_implied_end_tags 'li' # arg is exception
1803                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1804                                                 parse_error()
1805                                         loop
1806                                                 el = open_els.shift()
1807                                                 if el.name is 'li' and el.namespace is NS_HTML
1808                                                         break
1809                                         break
1810                                 if el_is_special_not_adp node
1811                                                 break
1812                         close_p_if_in_button_scope()
1813                         insert_html_element t
1814                         return
1815                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1816                         flag_frameset_ok = false
1817                         for node in open_els
1818                                 if node.name is 'dd' and node.namespace is NS_HTML
1819                                         generate_implied_end_tags 'dd' # arg is exception
1820                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1821                                                 parse_error()
1822                                         loop
1823                                                 el = open_els.shift()
1824                                                 if el.name is 'dd' and el.namespace is NS_HTML
1825                                                         break
1826                                         break
1827                                 if node.name is 'dt' and node.namespace is NS_HTML
1828                                         generate_implied_end_tags 'dt' # arg is exception
1829                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1830                                                 parse_error()
1831                                         loop
1832                                                 el = open_els.shift()
1833                                                 if el.name is 'dt' and el.namespace is NS_HTML
1834                                                         break
1835                                         break
1836                                 if el_is_special_not_adp node
1837                                         break
1838                         close_p_if_in_button_scope()
1839                         insert_html_element t
1840                         return
1841                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1842                         close_p_if_in_button_scope()
1843                         insert_html_element t
1844                         tok_state = tok_state_plaintext
1845                         return
1846                 if t.type is TYPE_START_TAG and t.name is 'button'
1847                         if is_in_scope 'button', NS_HTML
1848                                 parse_error()
1849                                 generate_implied_end_tags()
1850                                 loop
1851                                         el = open_els.shift()
1852                                         if el.name is 'button' and el.namespace is NS_HTML
1853                                                 break
1854                         reconstruct_afe()
1855                         insert_html_element t
1856                         flag_frameset_ok = false
1857                         return
1858                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1859                         unless is_in_scope t.name, NS_HTML
1860                                 parse_error()
1861                                 return
1862                         generate_implied_end_tags()
1863                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1864                                 parse_error()
1865                         loop
1866                                 el = open_els.shift()
1867                                 if el.name is t.name and el.namespace is NS_HTML
1868                                         return
1869                         return
1870                 if t.type is TYPE_END_TAG and t.name is 'form'
1871                         unless template_tag_is_open()
1872                                 node = form_element_pointer
1873                                 form_element_pointer = null
1874                                 if node is null or not el_is_in_scope node
1875                                         parse_error()
1876                                         return
1877                                 generate_implied_end_tags()
1878                                 if open_els[0] isnt node
1879                                         parse_error()
1880                                 for el, i in open_els
1881                                         if el is node
1882                                                 open_els.splice i, 1
1883                                                 break
1884                         else
1885                                 unless is_in_scope 'form', NS_HTML
1886                                         parse_error()
1887                                         return
1888                                 generate_implied_end_tags()
1889                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1890                                         parse_error()
1891                                 loop
1892                                         el = open_els.shift()
1893                                         if el.name is 'form' and el.namespace is NS_HTML
1894                                                 break
1895                         return
1896                 if t.type is TYPE_END_TAG and t.name is 'p'
1897                         unless is_in_button_scope 'p', NS_HTML
1898                                 parse_error()
1899                                 insert_html_element new_open_tag 'p'
1900                         close_p_element()
1901                         return
1902                 if t.type is TYPE_END_TAG and t.name is 'li'
1903                         unless is_in_li_scope 'li', NS_HTML
1904                                 parse_error()
1905                                 return
1906                         generate_implied_end_tags 'li' # arg is exception
1907                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1908                                 parse_error()
1909                         loop
1910                                 el = open_els.shift()
1911                                 if el.name is 'li' and el.namespace is NS_HTML
1912                                         break
1913                         return
1914                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1915                         unless is_in_scope t.name, NS_HTML
1916                                 parse_error()
1917                                 return
1918                         generate_implied_end_tags t.name # arg is exception
1919                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1920                                 parse_error()
1921                         loop
1922                                 el = open_els.shift()
1923                                 if el.name is t.name and el.namespace is NS_HTML
1924                                         break
1925                         return
1926                 if t.type is TYPE_END_TAG and h_tags[t.name]?
1927                         h_in_scope = false
1928                         for el in open_els
1929                                 if h_tags[el.name] is el.namespace
1930                                         h_in_scope = true
1931                                         break
1932                                 if standard_scopers[el.name] is el.namespace
1933                                         break
1934                         unless h_in_scope
1935                                 parse_error()
1936                                 return
1937                         generate_implied_end_tags()
1938                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1939                                 parse_error()
1940                         loop
1941                                 el = open_els.shift()
1942                                 if h_tags[el.name] is el.namespace
1943                                         break
1944                         return
1945                 # deep breath!
1946                 if t.type is TYPE_START_TAG and t.name is 'a'
1947                         # If the list of active formatting elements contains an a element
1948                         # between the end of the list and the last marker on the list (or
1949                         # the start of the list if there is no marker on the list), then
1950                         # this is a parse error; run the adoption agency algorithm for the
1951                         # tag name "a", then remove that element from the list of active
1952                         # formatting elements and the stack of open elements if the
1953                         # adoption agency algorithm didn't already remove it (it might not
1954                         # have if the element is not in table scope).
1955                         found = false
1956                         for el in afe
1957                                 if el.type is TYPE_AFE_MARKER
1958                                         break
1959                                 if el.name is 'a' and el.namespace is NS_HTML
1960                                         found = el
1961                         if found?
1962                                 parse_error()
1963                                 adoption_agency 'a'
1964                                 for el, i in afe
1965                                         if el is found
1966                                                 afe.splice i, 1
1967                                 for el, i in open_els
1968                                         if el is found
1969                                                 open_els.splice i, 1
1970                         reconstruct_afe()
1971                         el = insert_html_element t
1972                         afe_push el
1973                         return
1974                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1975                         reconstruct_afe()
1976                         el = insert_html_element t
1977                         afe_push el
1978                         return
1979                 if t.type is TYPE_START_TAG and t.name is 'nobr'
1980                         reconstruct_afe()
1981                         el = insert_html_element t
1982                         afe_push el
1983                         return
1984                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1985                         adoption_agency t.name
1986                         return
1987                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1988                         reconstruct_afe()
1989                         insert_html_element t
1990                         afe_push_marker()
1991                         flag_frameset_ok = false
1992                         return
1993                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1994                         unless is_in_scope t.name, NS_HTML
1995                                 parse_error()
1996                                 return
1997                         generate_implied_end_tags()
1998                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1999                                 parse_error()
2000                         loop
2001                                 el = open_els.shift()
2002                                 if el.name is t.name and el.namespace is NS_HTML
2003                                         break
2004                         clear_afe_to_marker()
2005                         return
2006                 if t.type is TYPE_START_TAG and t.name is 'table'
2007                         close_p_if_in_button_scope() # fixfull quirksmode thing
2008                         insert_html_element t
2009                         flag_frameset_ok = false
2010                         ins_mode = ins_mode_in_table
2011                         return
2012                 if t.type is TYPE_END_TAG and t.name is 'br'
2013                         parse_error()
2014                         t.type is TYPE_START_TAG
2015                         # fall through
2016                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2017                         reconstruct_afe()
2018                         insert_html_element t
2019                         open_els.shift()
2020                         t.acknowledge_self_closing()
2021                         flag_frameset_ok = false
2022                         return
2023                 if t.type is TYPE_START_TAG and t.name is 'input'
2024                         reconstruct_afe()
2025                         insert_html_element t
2026                         open_els.shift()
2027                         t.acknowledge_self_closing()
2028                         unless is_input_hidden_tok t
2029                                 flag_frameset_ok = false
2030                         return
2031                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2032                         insert_html_element t
2033                         open_els.shift()
2034                         t.acknowledge_self_closing()
2035                         return
2036                 if t.type is TYPE_START_TAG and t.name is 'hr'
2037                         close_p_if_in_button_scope()
2038                         insert_html_element t
2039                         open_els.shift()
2040                         t.acknowledge_self_closing()
2041                         flag_frameset_ok = false
2042                         return
2043                 if t.type is TYPE_START_TAG and t.name is 'image'
2044                         parse_error()
2045                         t.name = 'img'
2046                         process_token t
2047                         return
2048                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2049                         parse_error()
2050                         if template_tag_is_open() is false and form_element_pointer isnt null
2051                                 return
2052                         t.acknowledge_self_closing()
2053                         flag_frameset_ok = false
2054                         close_p_if_in_button_scope()
2055                         el = insert_html_element new_open_tag 'form'
2056                         unless template_tag_is_open()
2057                                 form_element_pointer = el
2058                         for a in t.attrs_a
2059                                 if a[0] is 'action'
2060                                         el.attrs['action'] = a[1]
2061                                         break
2062                         insert_html_element new_open_tag 'hr'
2063                         open_els.shift()
2064                         reconstruct_afe()
2065                         insert_html_element new_open_tag 'label'
2066                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2067                         input_el = new_open_tag 'input'
2068                         prompt = null
2069                         for a in t.attrs_a
2070                                 if a[0] is 'prompt'
2071                                         prompt = a[1]
2072                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2073                                         input_el.attrs_a.push [a[0], a[1]]
2074                         input_el.attrs_a.push ['name', 'isindex']
2075                         # fixfull this next bit is in english... internationalize?
2076                         prompt ?= "This is a searchable index. Enter search keywords: "
2077                         insert_character new_character_token prompt # fixfull split
2078                         # TODO submit typo "balue" in spec
2079                         insert_html_element input_el
2080                         open_els.shift()
2081                         # insert_character '' # you can put chars here if promt attr missing
2082                         open_els.shift()
2083                         insert_html_element new_open_tag 'hr'
2084                         open_els.shift()
2085                         open_els.shift()
2086                         unless template_tag_is_open()
2087                                 form_element_pointer = null
2088                         return
2089                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2090                         insert_html_element t
2091                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2092                                 cur += 1
2093                         tok_state = tok_state_rcdata
2094                         original_ins_mode = ins_mode
2095                         flag_frameset_ok = false
2096                         ins_mode = ins_mode_text
2097                         return
2098                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2099                         close_p_if_in_button_scope()
2100                         reconstruct_afe()
2101                         flag_frameset_ok = false
2102                         parse_generic_raw_text t
2103                         return
2104                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2105                         flag_frameset_ok = false
2106                         parse_generic_raw_text t
2107                         return
2108                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2109                         parse_generic_raw_text t
2110                         return
2111                 if t.type is TYPE_START_TAG and t.name is 'select'
2112                         reconstruct_afe()
2113                         insert_html_element t
2114                         flag_frameset_ok = false
2115                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2116                                 ins_mode = ins_mode_in_select_in_table
2117                         else
2118                                 ins_mode = ins_mode_in_select
2119                         return
2120                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2121                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2122                                 open_els.shift()
2123                         reconstruct_afe()
2124                         insert_html_element t
2125                         return
2126 # this comment block implements the W3C spec
2127 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2128 #                       if is_in_scope 'ruby', NS_HTML
2129 #                               generate_implied_end_tags()
2130 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2131 #                                       parse_error()
2132 #                       insert_html_element t
2133 #                       return
2134 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2135 #                       if is_in_scope 'ruby', NS_HTML
2136 #                               generate_implied_end_tags 'rtc' # arg is exception
2137 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2138 #                                       parse_error()
2139 #                       insert_html_element t
2140 #                       return
2141 # below implements the WATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2142                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2143                         if is_in_scope 'ruby', NS_HTML
2144                                 generate_implied_end_tags()
2145                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2146                                         parse_error()
2147                         insert_html_element t
2148                         return
2149                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2150                         if is_in_scope 'ruby', NS_HTML
2151                                 generate_implied_end_tags 'rtc'
2152                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2153                                         parse_error()
2154                         insert_html_element t
2155                         return
2156 # end WATWG chunk
2157                 if t.type is TYPE_START_TAG and t.name is 'math'
2158                         reconstruct_afe()
2159                         adjust_mathml_attributes t
2160                         adjust_foreign_attributes t
2161                         insert_foreign_element t, NS_MATHML
2162                         if t.flag 'self-closing'
2163                                 open_els.shift()
2164                                 t.acknowledge_self_closing()
2165                         return
2166                 if t.type is TYPE_START_TAG and t.name is 'svg'
2167                         reconstruct_afe()
2168                         adjust_svg_attributes t
2169                         adjust_foreign_attributes t
2170                         insert_foreign_element t, NS_SVG
2171                         if t.flag 'self-closing'
2172                                 open_els.shift()
2173                                 t.acknowledge_self_closing()
2174                         return
2175                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2176                         parse_error()
2177                         return
2178                 if t.type is TYPE_START_TAG # any other start tag
2179                         reconstruct_afe()
2180                         insert_html_element t
2181                         return
2182                 if t.type is TYPE_END_TAG # any other end tag
2183                         in_body_any_other_end_tag t.name
2184                         return
2185                 return
2186
2187         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2188         ins_mode_text = (t) ->
2189                 if t.type is TYPE_TEXT
2190                         insert_character t
2191                         return
2192                 if t.type is TYPE_EOF
2193                         parse_error()
2194                         if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2195                                 open_els[0].flag 'already started', true
2196                         open_els.shift()
2197                         ins_mode = original_ins_mode
2198                         process_token t
2199                         return
2200                 if t.type is TYPE_END_TAG and t.name is 'script'
2201                         open_els.shift()
2202                         ins_mode = original_ins_mode
2203                         # fixfull the spec seems to assume that I'm going to run the script
2204                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2205                         return
2206                 if t.type is TYPE_END_TAG
2207                         open_els.shift()
2208                         ins_mode = original_ins_mode
2209                         return
2210                 console.log 'warning: end of ins_mode_text reached'
2211
2212         # the functions below implement the tokenizer stats described here:
2213         # http://www.w3.org/TR/html5/syntax.html#tokenization
2214
2215         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2216         ins_mode_in_table_else = (t) ->
2217                 parse_error()
2218                 flag_foster_parenting = true
2219                 ins_mode_in_body t
2220                 flag_foster_parenting = false
2221                 return
2222         ins_mode_in_table = (t) ->
2223                 switch t.type
2224                         when TYPE_TEXT
2225                                 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2226                                         pending_table_character_tokens = []
2227                                         original_ins_mode = ins_mode
2228                                         ins_mode = ins_mode_in_table_text
2229                                         process_token t
2230                                 else
2231                                         ins_mode_in_table_else t
2232                         when TYPE_COMMENT
2233                                 insert_comment t
2234                         when TYPE_DOCTYPE
2235                                 parse_error()
2236                         when TYPE_START_TAG
2237                                 switch t.name
2238                                         when 'caption'
2239                                                 clear_stack_to_table_context()
2240                                                 afe_push_marker()
2241                                                 insert_html_element t
2242                                                 ins_mode = ins_mode_in_caption
2243                                         when 'colgroup'
2244                                                 clear_stack_to_table_context()
2245                                                 insert_html_element t
2246                                                 ins_mode = ins_mode_in_column_group
2247                                         when 'col'
2248                                                 clear_stack_to_table_context()
2249                                                 insert_html_element new_open_tag 'colgroup'
2250                                                 ins_mode = ins_mode_in_column_group
2251                                                 process_token t
2252                                         when 'tbody', 'tfoot', 'thead'
2253                                                 clear_stack_to_table_context()
2254                                                 insert_html_element t
2255                                                 ins_mode = ins_mode_in_table_body
2256                                         when 'td', 'th', 'tr'
2257                                                 clear_stack_to_table_context()
2258                                                 insert_html_element new_open_tag 'tbody'
2259                                                 ins_mode = ins_mode_in_table_body
2260                                                 process_token t
2261                                         when 'table'
2262                                                 parse_error()
2263                                                 if is_in_table_scope 'table', NS_HTML
2264                                                         loop
2265                                                                 el = open_els.shift()
2266                                                                 if el.name is 'table' and el.namespace is NS_HTML
2267                                                                         break
2268                                                         reset_ins_mode()
2269                                                         process_token t
2270                                         when 'style', 'script', 'template'
2271                                                 ins_mode_in_head t
2272                                         when 'input'
2273                                                 unless is_input_hidden_tok t
2274                                                         ins_mode_in_table_else t
2275                                                 else
2276                                                         parse_error()
2277                                                         el = insert_html_element t
2278                                                         open_els.shift()
2279                                                         t.acknowledge_self_closing()
2280                                         when 'form'
2281                                                 parse_error()
2282                                                 if form_element_pointer?
2283                                                         return
2284                                                 if template_tag_is_open()
2285                                                         return
2286                                                 form_element_pointer = insert_html_element t
2287                                                 open_els.shift()
2288                                         else
2289                                                 ins_mode_in_table_else t
2290                         when TYPE_END_TAG
2291                                 switch t.name
2292                                         when 'table'
2293                                                 if is_in_table_scope 'table', NS_HTML
2294                                                         loop
2295                                                                 el = open_els.shift()
2296                                                                 if el.name is 'table' and el.namespace is NS_HTML
2297                                                                         break
2298                                                         reset_ins_mode()
2299                                                 else
2300                                                         parse_error()
2301                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2302                                                 parse_error()
2303                                         when 'template'
2304                                                 ins_mode_in_head t
2305                                         else
2306                                                 ins_mode_in_table_else t
2307                         when TYPE_EOF
2308                                 ins_mode_in_body t
2309                         else
2310                                 ins_mode_in_table_else t
2311
2312
2313         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2314         ins_mode_in_table_text = (t) ->
2315                 if t.type is TYPE_TEXT and t.text is "\u0000"
2316                         # from javascript?
2317                         parse_error()
2318                         return
2319                 if t.type is TYPE_TEXT
2320                         pending_table_character_tokens.push t
2321                         return
2322                 # Anything else
2323                 all_space = true
2324                 for old in pending_table_character_tokens
2325                         unless is_space_tok old
2326                                 all_space = false
2327                                 break
2328                 if all_space
2329                         for old in pending_table_character_tokens
2330                                 insert_character old
2331                 else
2332                         for old in pending_table_character_tokens
2333                                 ins_mode_in_table_else old
2334                 pending_table_character_tokens = []
2335                 ins_mode = original_ins_mode
2336                 process_token t
2337
2338         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2339         ins_mode_in_caption = (t) ->
2340                 if t.type is TYPE_END_TAG and t.name is 'caption'
2341                         if is_in_table_scope 'caption', NS_HTML
2342                                 generate_implied_end_tags()
2343                                 if open_els[0].name isnt 'caption'
2344                                         parse_error()
2345                                 loop
2346                                         el = open_els.shift()
2347                                         if el.name is 'caption' and el.namespace is NS_HTML
2348                                                 break
2349                                 clear_afe_to_marker()
2350                                 ins_mode = ins_mode_in_table
2351                         else
2352                                 parse_error()
2353                                 # fragment case
2354                         return
2355                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2356                         parse_error()
2357                         if is_in_table_scope 'caption', NS_HTML
2358                                 loop
2359                                         el = open_els.shift()
2360                                         if el.name is 'caption' and el.namespace is NS_HTML
2361                                                 break
2362                                 clear_afe_to_marker()
2363                                 ins_mode = ins_mode_in_table
2364                                 process_token t
2365                         # else fragment case
2366                         return
2367                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2368                         parse_error()
2369                         return
2370                 # Anything else
2371                 ins_mode_in_body t
2372
2373         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2374         ins_mode_in_column_group = (t) ->
2375                 if is_space_tok t
2376                         insert_character t
2377                         return
2378                 if t.type is TYPE_COMMENT
2379                         insert_comment t
2380                         return
2381                 if t.type is TYPE_DOCTYPE
2382                         parse_error()
2383                         return
2384                 if t.type is TYPE_START_TAG and t.name is 'html'
2385                         ins_mode_in_body t
2386                         return
2387                 if t.type is TYPE_START_TAG and t.name is 'col'
2388                         el = insert_html_element t
2389                         open_els.shift()
2390                         t.acknowledge_self_closing()
2391                         return
2392                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2393                         if open_els[0].name is 'colgroup' and open_els.namespace is NS_HTML
2394                                 open_els.shift()
2395                                 ins_mode = ins_mode_in_table
2396                         else
2397                                 parse_error()
2398                         return
2399                 if t.type is TYPE_END_TAG and t.name is 'col'
2400                         parse_error()
2401                         return
2402                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2403                         ins_mode_in_head t
2404                         return
2405                 if t.type is TYPE_EOF
2406                         ins_mode_in_body t
2407                         return
2408                 # Anything else
2409                 if open_els[0].name isnt 'colgroup'
2410                         parse_error()
2411                         return
2412                 open_els.shift()
2413                 ins_mode = ins_mode_in_table
2414                 process_token t
2415                 return
2416
2417         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2418         ins_mode_in_table_body = (t) ->
2419                 if t.type is TYPE_START_TAG and t.name is 'tr'
2420                         clear_stack_to_table_body_context()
2421                         insert_html_element t
2422                         ins_mode = ins_mode_in_row
2423                         return
2424                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2425                         parse_error()
2426                         clear_stack_to_table_body_context()
2427                         insert_html_element new_open_tag 'tr'
2428                         ins_mode = ins_mode_in_row
2429                         process_token t
2430                         return
2431                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2432                         unless is_in_table_scope t.name, NS_HTML
2433                                 parse_error()
2434                                 return
2435                         clear_stack_to_table_body_context()
2436                         open_els.shift()
2437                         ins_mode = ins_mode_in_table
2438                         return
2439                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2440                         has = false
2441                         for el in open_els
2442                                 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2443                                         has = true
2444                                         break
2445                                 if table_scopers[el.name] is el.namespace
2446                                         break
2447                         if !has
2448                                 parse_error()
2449                                 return
2450                         clear_stack_to_table_body_context()
2451                         open_els.shift()
2452                         ins_mode = ins_mode_in_table
2453                         process_token t
2454                         return
2455                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2456                         parse_error()
2457                         return
2458                 # Anything else
2459                 ins_mode_in_table t
2460
2461         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2462         ins_mode_in_row = (t) ->
2463                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2464                         clear_stack_to_table_row_context()
2465                         insert_html_element t
2466                         ins_mode = ins_mode_in_cell
2467                         afe_push_marker()
2468                         return
2469                 if t.type is TYPE_END_TAG and t.name is 'tr'
2470                         if is_in_table_scope 'tr', NS_HTML
2471                                 clear_stack_to_table_row_context()
2472                                 open_els.shift()
2473                                 ins_mode = ins_mode_in_table_body
2474                         else
2475                                 parse_error()
2476                         return
2477                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2478                         if is_in_table_scope 'tr', NS_HTML
2479                                 clear_stack_to_table_row_context()
2480                                 open_els.shift()
2481                                 ins_mode = ins_mode_in_table_body
2482                                 process_token t
2483                         else
2484                                 parse_error()
2485                         return
2486                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2487                         if is_in_table_scope t.name, NS_HTML
2488                                 if is_in_table_scope 'tr', NS_HTML
2489                                         clear_stack_to_table_row_context()
2490                                         open_els.shift()
2491                                         ins_mode = ins_mode_in_table_body
2492                                         process_token t
2493                         else
2494                                 parse_error()
2495                         return
2496                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2497                         parse_error()
2498                         return
2499                 # Anything else
2500                 ins_mode_in_table t
2501
2502         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2503         close_the_cell = ->
2504                 generate_implied_end_tags()
2505                 unless (open_els[0].name is 'td' or open_els[0] is 'th') and open_els[0].namespace is NS_HTML
2506                         parse_error()
2507                 loop
2508                         el = open_els.shift()
2509                         if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2510                                 break
2511                 clear_afe_to_marker()
2512                 ins_mode = ins_mode_in_row
2513
2514         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2515         ins_mode_in_cell = (t) ->
2516                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2517                         if is_in_table_scope t.name, NS_HTML
2518                                 generate_implied_end_tags()
2519                                 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2520                                         parse_error()
2521                                 loop
2522                                         el = open_els.shift()
2523                                         if el.name is t.name and el.namespace is NS_HTML
2524                                                 break
2525                                 clear_afe_to_marker()
2526                                 ins_mode = ins_mode_in_row
2527                         else
2528                                 parse_error()
2529                         return
2530                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2531                         has = false
2532                         for el in open_els
2533                                 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2534                                         has = true
2535                                         break
2536                                 if table_scopers[el.name] is el.namespace
2537                                         break
2538                         if !has
2539                                 parse_error()
2540                                 return
2541                         close_the_cell()
2542                         process_token t
2543                         return
2544                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2545                         parse_error()
2546                         return
2547                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2548                         if is_in_table_scope t.name, NS_HTML
2549                                 close_the_cell()
2550                                 process_token t
2551                         else
2552                                 parse_error()
2553                         return
2554                 # Anything Else
2555                 ins_mode_in_body t
2556
2557         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2558         ins_mode_in_select = (t) ->
2559                 if t.type is TYPE_TEXT and t.text is "\u0000"
2560                         parse_error()
2561                         return
2562                 if t.type is TYPE_TEXT
2563                         insert_character t
2564                         return
2565                 if t.type is TYPE_COMMENT
2566                         insert_comment t
2567                         return
2568                 if t.type is TYPE_DOCTYPE
2569                         parse_error()
2570                         return
2571                 if t.type is TYPE_START_TAG and t.name is 'html'
2572                         ins_mode_in_body t
2573                         return
2574                 if t.type is TYPE_START_TAG and t.name is 'option'
2575                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2576                                 open_els.shift()
2577                         insert_html_element t
2578                         return
2579                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2580                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2581                                 open_els.shift()
2582                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2583                                 open_els.shift()
2584                         insert_html_element t
2585                         return
2586                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2587                         if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
2588                                 if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
2589                                         open_els.shift()
2590                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2591                                 open_els.shift()
2592                         else
2593                                 parse_error()
2594                         return
2595                 if t.type is TYPE_END_TAG and t.name is 'option'
2596                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2597                                 open_els.shift()
2598                         else
2599                                 parse_error()
2600                         return
2601                 if t.type is TYPE_END_TAG and t.name is 'select'
2602                         if is_in_select_scope 'select', NS_HTML
2603                                 loop
2604                                         el = open_els.shift()
2605                                         if el.name is 'select' and el.namespace is NS_HTML
2606                                                 break
2607                                 reset_ins_mode()
2608                         else
2609                                 parse_error()
2610                         return
2611                 if t.type is TYPE_START_TAG and t.name is 'select'
2612                         parse_error()
2613                         loop
2614                                 el = open_els.shift()
2615                                 if el.name is 'select' and el.namespace is NS_HTML
2616                                         break
2617                         reset_ins_mode()
2618                         # spec says that this is the same as </select> but it doesn't say
2619                         # to check scope first
2620                         return
2621                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2622                         parse_error()
2623                         if is_in_select_scope 'select', NS_HTML
2624                                 return
2625                         loop
2626                                 el = open_els.shift()
2627                                 if el.name is 'select' and el.namespace is NS_HTML
2628                                         break
2629                         reset_ins_mode()
2630                         process_token t
2631                         return
2632                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2633                         ins_mode_in_head t
2634                         return
2635                 if t.type is TYPE_EOF
2636                         ins_mode_in_body t
2637                         return
2638                 # Anything else
2639                 parse_error()
2640                 return
2641
2642         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2643         ins_mode_in_select_in_table = (t) ->
2644                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2645                         parse_error()
2646                         loop
2647                                 el = open_els.shift()
2648                                 if el.name is 'select' and el.namespace is NS_HTML
2649                                         break
2650                         reset_ins_mode()
2651                         process_token t
2652                         return
2653                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2654                         parse_error()
2655                         unless is_in_table_scope t.name, NS_HTML
2656                                 return
2657                         loop
2658                                 el = open_els.shift()
2659                                 if el.name is 'select' and el.namespace is NS_HTML
2660                                         break
2661                         reset_ins_mode()
2662                         process_token t
2663                         return
2664                 # Anything else
2665                 ins_mode_in_select t
2666                 return
2667
2668         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2669         ins_mode_in_template = (t) ->
2670                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2671                         ins_mode_in_body t
2672                         return
2673                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2674                         ins_mode_in_head t
2675                         return
2676                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2677                         template_ins_modes.shift()
2678                         template_ins_modes.unshift ins_mode_in_table
2679                         ins_mode = ins_mode_in_table
2680                         process_token t
2681                         return
2682                 if t.type is TYPE_START_TAG and t.name is 'col'
2683                         template_ins_modes.shift()
2684                         template_ins_modes.unshift ins_mode_in_column_group
2685                         ins_mode = ins_mode_in_column_group
2686                         process_token t
2687                         return
2688                 if t.type is TYPE_START_TAG and t.name is 'tr'
2689                         template_ins_modes.shift()
2690                         template_ins_modes.unshift ins_mode_in_table_body
2691                         ins_mode = ins_mode_in_table_body
2692                         process_token t
2693                         return
2694                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2695                         template_ins_modes.shift()
2696                         template_ins_modes.unshift ins_mode_in_row
2697                         ins_mode = ins_mode_in_row
2698                         process_token t
2699                         return
2700                 if t.type is TYPE_START_TAG
2701                         template_ins_modes.shift()
2702                         template_ins_modes.unshift ins_mode_in_body
2703                         ins_mode = ins_mode_in_body
2704                         process_token t
2705                         return
2706                 if t.type is TYPE_END_TAG
2707                         parse_error()
2708                         return
2709                 if t.type is TYPE_EOF
2710                         unless template_tag_is_open()
2711                                 stop_parsing()
2712                                 return
2713                         parse_error()
2714                         loop
2715                                 el = open_els.shift()
2716                                 if el.name is 'template' and el.namespace is NS_HTML
2717                                         break
2718                         clear_afe_to_marker()
2719                         template_ins_modes.shift()
2720                         reset_ins_mode()
2721                         process_token t
2722
2723         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2724         ins_mode_after_body = (t) ->
2725                 if is_space_tok t
2726                         ins_mode_in_body t
2727                         return
2728                 if t.type is TYPE_COMMENT
2729                         first = open_els[open_els.length - 1]
2730                         insert_comment t, [first, first.children.length]
2731                         return
2732                 if t.type is TYPE_DOCTYPE
2733                         parse_error()
2734                         return
2735                 if t.type is TYPE_START_TAG and t.name is 'html'
2736                         ins_mode_in_body t
2737                         return
2738                 if t.type is TYPE_END_TAG and t.name is 'html'
2739                         if flag_fragment_parsing
2740                                 parse_error()
2741                                 return
2742                         ins_mode = ins_mode_after_after_body
2743                         return
2744                 if t.type is TYPE_EOF
2745                         stop_parsing()
2746                         return
2747                 # Anything ELse
2748                 parse_error()
2749                 ins_mode = ins_mode_in_body
2750                 process_token t
2751
2752         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2753         ins_mode_in_frameset = (t) ->
2754                 if is_space_tok t
2755                         insert_character t
2756                         return
2757                 if t.type is TYPE_COMMENT
2758                         insert_comment t
2759                         return
2760                 if t.type is TYPE_DOCTYPE
2761                         parse_error()
2762                         return
2763                 if t.type is TYPE_START_TAG and t.name is 'html'
2764                         ins_mode_in_body t
2765                         return
2766                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2767                         insert_html_element t
2768                         return
2769                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2770                         if open_els.length is 1
2771                                 parse_error()
2772                                 return # fragment case
2773                         open_els.shift()
2774                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2775                                 ins_mode = ins_mode_after_frameset
2776                         return
2777                 if t.type is TYPE_START_TAG and t.name is 'frame'
2778                         insert_html_element t
2779                         open_els.shift()
2780                         t.acknowledge_self_closing()
2781                         return
2782                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2783                         ins_mode_in_head t
2784                         return
2785                 if t.type is TYPE_EOF
2786                         if open_els.length isnt 1
2787                                 parse_error()
2788                         stop_parsing()
2789                         return
2790                 # Anything else
2791                 parse_error()
2792                 return
2793
2794         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2795         ins_mode_after_frameset = (t) ->
2796                 if is_space_tok t
2797                         insert_character t
2798                         return
2799                 if t.type is TYPE_COMMENT
2800                         insert_comment t
2801                         return
2802                 if t.type is TYPE_DOCTYPE
2803                         parse_error()
2804                         return
2805                 if t.type is TYPE_START_TAG and t.name is 'html'
2806                         ins_mode_in_body t
2807                         return
2808                 if t.type is TYPE_END_TAG and t.name is 'html'
2809                         ins_mode = ins_mode_after_after_frameset
2810                         return
2811                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2812                         ins_mode_in_head t
2813                         return
2814                 if t.type is TYPE_EOF
2815                         stop_parsing()
2816                         return
2817                 # Anything else
2818                 parse_error()
2819                 return
2820
2821         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2822         ins_mode_after_after_body = (t) ->
2823                 if t.type is TYPE_COMMENT
2824                         insert_comment t, [doc, doc.children.length]
2825                         return
2826                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2827                         ins_mode_in_body t
2828                         return
2829                 if t.type is TYPE_EOF
2830                         stop_parsing()
2831                         return
2832                 # Anything else
2833                 parse_error()
2834                 ins_mode = ins_mode_in_body
2835                 process_token t
2836                 return
2837
2838         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2839         ins_mode_after_after_frameset = (t) ->
2840                 if t.type is TYPE_COMMENT
2841                         insert_comment t, [doc, doc.children.length]
2842                         return
2843                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2844                         ins_mode_in_body t
2845                         return
2846                 if t.type is TYPE_EOF
2847                         stop_parsing()
2848                         return
2849                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2850                         ins_mode_in_head t
2851                         return
2852                 # Anything else
2853                 parse_error()
2854                 return
2855
2856         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2857         has_color_face_or_size = (t) ->
2858                 for a in t.attrs_a
2859                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2860                                 return true
2861                 return false
2862         in_foreign_content_end_script = ->
2863                 open_els.shift()
2864                 # fixfull
2865                 return
2866         in_foreign_content_other_start = (t) ->
2867                 acn = adjusted_current_node()
2868                 if acn.namespace is NS_MATHML
2869                         adjust_mathml_attributes t
2870                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2871                         t.name = svg_name_fixes[t.name]
2872                 if acn.namespace is NS_SVG
2873                         adjust_svg_attributes t
2874                 adjust_foreign_attributes t
2875                 insert_foreign_element t, acn.namespace
2876                 if t.flag 'self-closing'
2877                         if t.name is 'script'
2878                                 t.acknowledge_self_closing()
2879                                 in_foreign_content_end_script()
2880                                 # fixfull
2881                         else
2882                                 open_els.shift()
2883                                 t.acknowledge_self_closing()
2884                 return
2885         in_foreign_content = (t) ->
2886                 if t.type is TYPE_TEXT and t.text is "\u0000"
2887                         parse_error()
2888                         insert_character new_character_token "\ufffd"
2889                         return
2890                 if is_space_tok t
2891                         insert_character t
2892                         return
2893                 if t.type is TYPE_TEXT
2894                         flag_frameset_ok = false
2895                         insert_character t
2896                         return
2897                 if t.type is TYPE_COMMENT
2898                         insert_comment t
2899                         return
2900                 if t.type is TYPE_DOCTYPE
2901                         parse_error()
2902                         return
2903                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2904                         parse_error()
2905                         if flag_fragment_parsing
2906                                 in_foreign_content_other_start t
2907                                 return
2908                         loop # is this safe?
2909                                 open_els.shift()
2910                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
2911                                         break
2912                         process_token t
2913                         return
2914                 if t.type is TYPE_START_TAG
2915                         in_foreign_content_other_start t
2916                         return
2917                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2918                         in_foreign_content_end_script()
2919                         return
2920                 if t.type is TYPE_END_TAG
2921                         i = 0
2922                         node = open_els[i]
2923                         if node.name.toLowerCase() isnt t.name
2924                                 parse_error()
2925                         loop
2926                                 if node is open_els[open_els.length - 1]
2927                                         return
2928                                 if node.name.toLowerCase() is t.name
2929                                         loop
2930                                                 el = open_els.shift()
2931                                                 if el is node
2932                                                         return
2933                                 i += 1
2934                                 node = open_els[i]
2935                                 if node.namespace is NS_HTML
2936                                         break
2937                         ins_mode t # explicitly call HTML insertion mode
2938
2939
2940         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2941         tok_state_data = ->
2942                 switch c = txt.charAt(cur++)
2943                         when '&'
2944                                 return new_text_node parse_character_reference()
2945                         when '<'
2946                                 tok_state = tok_state_tag_open
2947                         when "\u0000"
2948                                 parse_error()
2949                                 return new_text_node "\ufffd"
2950                         when '' # EOF
2951                                 return new_eof_token()
2952                         else
2953                                 return new_text_node c
2954                 return null
2955
2956         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2957         # not needed: tok_state_character_reference_in_data = ->
2958         # just call parse_character_reference()
2959
2960         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2961         tok_state_rcdata = ->
2962                 switch c = txt.charAt(cur++)
2963                         when '&'
2964                                 return new_text_node parse_character_reference()
2965                         when '<'
2966                                 tok_state = tok_state_rcdata_less_than_sign
2967                         when "\u0000"
2968                                 parse_error()
2969                                 return new_character_token "\ufffd"
2970                         when '' # EOF
2971                                 return new_eof_token()
2972                         else
2973                                 return new_character_token c
2974                 return null
2975
2976         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2977         # not needed: tok_state_character_reference_in_rcdata = ->
2978         # just call parse_character_reference()
2979
2980         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2981         tok_state_rawtext = ->
2982                 switch c = txt.charAt(cur++)
2983                         when '<'
2984                                 tok_state = tok_state_rawtext_less_than_sign
2985                         when "\u0000"
2986                                 parse_error()
2987                                 return new_character_token "\ufffd"
2988                         when '' # EOF
2989                                 return new_eof_token()
2990                         else
2991                                 return new_character_token c
2992                 return null
2993
2994         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2995         tok_state_script_data = ->
2996                 switch c = txt.charAt(cur++)
2997                         when '<'
2998                                 tok_state = tok_state_script_data_less_than_sign
2999                         when "\u0000"
3000                                 parse_error()
3001                                 return new_character_token "\ufffd"
3002                         when '' # EOF
3003                                 return new_eof_token()
3004                         else
3005                                 return new_character_token c
3006                 return null
3007
3008         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3009         tok_state_plaintext = ->
3010                 switch c = txt.charAt(cur++)
3011                         when "\u0000"
3012                                 parse_error()
3013                                 return new_character_token "\ufffd"
3014                         when '' # EOF
3015                                 return new_eof_token()
3016                         else
3017                                 return new_character_token c
3018                 return null
3019
3020
3021         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3022         tok_state_tag_open = ->
3023                 c = txt.charAt(cur++)
3024                 if c is '!'
3025                         tok_state = tok_state_markup_declaration_open
3026                         return
3027                 if c is '/'
3028                         tok_state = tok_state_end_tag_open
3029                         return
3030                 if is_uc_alpha(c)
3031                         tok_cur_tag = new_open_tag c.toLowerCase()
3032                         tok_state = tok_state_tag_name
3033                         return
3034                 if is_lc_alpha(c)
3035                         tok_cur_tag = new_open_tag c
3036                         tok_state = tok_state_tag_name
3037                         return
3038                 if c is '?'
3039                         parse_error()
3040                         tok_cur_tag = new_comment_token '?' # FIXME right?
3041                         tok_state = tok_state_bogus_comment
3042                         return
3043                 # Anything else
3044                 parse_error()
3045                 tok_state = tok_state_data
3046                 cur -= 1 # we didn't parse/handle the char after <
3047                 return new_text_node '<'
3048
3049         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3050         tok_state_end_tag_open = ->
3051                 switch c = txt.charAt(cur++)
3052                         when '>'
3053                                 parse_error()
3054                                 tok_state = tok_state_data
3055                         when '' # EOF
3056                                 parse_error()
3057                                 tok_state = tok_state_data
3058                                 return new_text_node '</'
3059                         else
3060                                 if is_uc_alpha(c)
3061                                         tok_cur_tag = new_end_tag c.toLowerCase()
3062                                         tok_state = tok_state_tag_name
3063                                 else if is_lc_alpha(c)
3064                                         tok_cur_tag = new_end_tag c
3065                                         tok_state = tok_state_tag_name
3066                                 else
3067                                         parse_error()
3068                                         tok_cur_tag = new_comment_token '/'
3069                                         tok_state = tok_state_bogus_comment
3070                 return null
3071
3072         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3073         tok_state_tag_name = ->
3074                 switch c = txt.charAt(cur++)
3075                         when "\t", "\n", "\u000c", ' '
3076                                 tok_state = tok_state_before_attribute_name
3077                         when '/'
3078                                 tok_state = tok_state_self_closing_start_tag
3079                         when '>'
3080                                 tok_state = tok_state_data
3081                                 tmp = tok_cur_tag
3082                                 tok_cur_tag = null
3083                                 return tmp
3084                         when "\u0000"
3085                                 parse_error()
3086                                 tok_cur_tag.name += "\ufffd"
3087                         when '' # EOF
3088                                 parse_error()
3089                                 tok_state = tok_state_data
3090                         else
3091                                 if is_uc_alpha(c)
3092                                         tok_cur_tag.name += c.toLowerCase()
3093                                 else
3094                                         tok_cur_tag.name += c
3095                 return null
3096
3097         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3098         tok_state_rcdata_less_than_sign = ->
3099                 c = txt.charAt(cur++)
3100                 if c is '/'
3101                         temporary_buffer = ''
3102                         tok_state = tok_state_rcdata_end_tag_open
3103                         return null
3104                 # Anything else
3105                 tok_state = tok_state_rcdata
3106                 cur -= 1 # reconsume the input character
3107                 return new_character_token '<'
3108
3109         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3110         tok_state_rcdata_end_tag_open = ->
3111                 c = txt.charAt(cur++)
3112                 if is_uc_alpha(c)
3113                         tok_cur_tag = new_end_tag c.toLowerCase()
3114                         temporary_buffer += c
3115                         tok_state = tok_state_rcdata_end_tag_name
3116                         return null
3117                 if is_lc_alpha(c)
3118                         tok_cur_tag = new_end_tag c
3119                         temporary_buffer += c
3120                         tok_state = tok_state_rcdata_end_tag_name
3121                         return null
3122                 # Anything else
3123                 tok_state = tok_state_rcdata
3124                 cur -= 1 # reconsume the input character
3125                 return new_character_token "</" # fixfull separate these
3126
3127         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3128         is_appropriate_end_tag = (t) ->
3129                 # spec says to check against "the tag name of the last start tag to
3130                 # have been emitted from this tokenizer", but this is only called from
3131                 # the various "raw" states, so it's hopefully ok to assume that
3132                 # open_els[0].name will work instead TODO: verify this after the script
3133                 # data states are implemented
3134                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3135                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3136
3137         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3138         tok_state_rcdata_end_tag_name = ->
3139                 c = txt.charAt(cur++)
3140                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3141                         if is_appropriate_end_tag tok_cur_tag
3142                                 tok_state = tok_state_before_attribute_name
3143                                 return
3144                         # else fall through to "Anything else"
3145                 if c is '/'
3146                         if is_appropriate_end_tag tok_cur_tag
3147                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3148                                 return
3149                         # else fall through to "Anything else"
3150                 if c is '>'
3151                         if is_appropriate_end_tag tok_cur_tag
3152                                 tok_state = tok_state_data
3153                                 return tok_cur_tag
3154                         # else fall through to "Anything else"
3155                 if is_uc_alpha(c)
3156                         tok_cur_tag.name += c.toLowerCase()
3157                         temporary_buffer += c
3158                         return null
3159                 if is_lc_alpha(c)
3160                         tok_cur_tag.name += c
3161                         temporary_buffer += c
3162                         return null
3163                 # Anything else
3164                 tok_state = tok_state_rcdata
3165                 cur -= 1 # reconsume the input character
3166                 return new_character_token '</' + temporary_buffer # fixfull separate these
3167
3168         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3169         tok_state_rawtext_less_than_sign = ->
3170                 c = txt.charAt(cur++)
3171                 if c is '/'
3172                         temporary_buffer = ''
3173                         tok_state = tok_state_rawtext_end_tag_open
3174                         return null
3175                 # Anything else
3176                 tok_state = tok_state_rawtext
3177                 cur -= 1 # reconsume the input character
3178                 return new_character_token '<'
3179
3180         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3181         tok_state_rawtext_end_tag_open = ->
3182                 c = txt.charAt(cur++)
3183                 if is_uc_alpha(c)
3184                         tok_cur_tag = new_end_tag c.toLowerCase()
3185                         temporary_buffer += c
3186                         tok_state = tok_state_rawtext_end_tag_name
3187                         return null
3188                 if is_lc_alpha(c)
3189                         tok_cur_tag = new_end_tag c
3190                         temporary_buffer += c
3191                         tok_state = tok_state_rawtext_end_tag_name
3192                         return null
3193                 # Anything else
3194                 tok_state = tok_state_rawtext
3195                 cur -= 1 # reconsume the input character
3196                 return new_character_token "</" # fixfull separate these
3197
3198         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3199         tok_state_rawtext_end_tag_name = ->
3200                 c = txt.charAt(cur++)
3201                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3202                         if is_appropriate_end_tag tok_cur_tag
3203                                 tok_state = tok_state_before_attribute_name
3204                                 return
3205                         # else fall through to "Anything else"
3206                 if c is '/'
3207                         if is_appropriate_end_tag tok_cur_tag
3208                                 tok_state = tok_state_self_closing_start_tag
3209                                 return
3210                         # else fall through to "Anything else"
3211                 if c is '>'
3212                         if is_appropriate_end_tag tok_cur_tag
3213                                 tok_state = tok_state_data
3214                                 return tok_cur_tag
3215                         # else fall through to "Anything else"
3216                 if is_uc_alpha(c)
3217                         tok_cur_tag.name += c.toLowerCase()
3218                         temporary_buffer += c
3219                         return null
3220                 if is_lc_alpha(c)
3221                         tok_cur_tag.name += c
3222                         temporary_buffer += c
3223                         return null
3224                 # Anything else
3225                 tok_state = tok_state_rawtext
3226                 cur -= 1 # reconsume the input character
3227                 return new_character_token '</' + temporary_buffer # fixfull separate these
3228
3229         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3230         tok_state_script_data_less_than_sign = ->
3231                 c = txt.charAt(cur++)
3232                 if c is '/'
3233                         temporary_buffer = ''
3234                         tok_state = tok_state_script_data_end_tag_open
3235                         return
3236                 if c is '!'
3237                         tok_state = tok_state_script_data_escape_start
3238                         return new_character_token '<!' # fixfull split
3239                 # Anything else
3240                 tok_state = tok_state_script_data
3241                 cur -= 1 # Reconsume
3242                 return new_character_token '<'
3243
3244         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3245         tok_state_script_data_end_tag_open = ->
3246                 c = txt.charAt(cur++)
3247                 if is_uc_alpha(c)
3248                         tok_cur_tag = new_end_tag c.toLowerCase()
3249                         temporary_buffer += c
3250                         tok_state = tok_state_script_data_end_tag_name
3251                         return
3252                 if is_lc_alpha(c)
3253                         tok_cur_tag = new_end_tag c
3254                         temporary_buffer += c
3255                         tok_state = tok_state_script_data_end_tag_name
3256                         return
3257                 # Anything else
3258                 tok_state = tok_state_script_data
3259                 cur -= 1 # Reconsume
3260                 return new_character_token '</'
3261
3262         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3263         tok_state_script_data_end_tag_name = ->
3264                 c = txt.charAt(cur++)
3265                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3266                         if is_appropriate_end_tag tok_cur_tag
3267                                 tok_state = tok_state_before_attribute_name
3268                                 return
3269                         # fall through
3270                 if c is '/'
3271                         if is_appropriate_end_tag tok_cur_tag
3272                                 tok_state = tok_state_self_closing_start_tag
3273                                 return
3274                         # fall through
3275                 if c is '>'
3276                         if is_appropriate_end_tag tok_cur_tag
3277                                 tok_state = tok_state_data
3278                                 return tok_cur_tag
3279                         # fall through
3280                 if is_uc_alpha(c)
3281                         tok_cur_tag.name += c.toLowerCase()
3282                         temporary_buffer += c
3283                         return
3284                 if is_lc_alpha(c)
3285                         tok_cur_tag.name += c
3286                         temporary_buffer += c
3287                         return
3288                 # Anything else
3289                 tok_state = tok_state_script_data
3290                 cur -= 1 # Reconsume
3291                 return new_character_token "</#{temporary_buffer}" # fixfull split
3292
3293         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3294         tok_state_script_data_escape_start = ->
3295                 c = txt.charAt(cur++)
3296                 if c is '-'
3297                         tok_state = tok_state_script_data_escape_start_dash
3298                         return new_character_token '-'
3299                 # Anything else
3300                 tok_state = tok_state_script_data
3301                 cur -= 1 # Reconsume
3302                 return
3303
3304         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3305         tok_state_script_data_escape_start_dash = ->
3306                 c = txt.charAt(cur++)
3307                 if c is '-'
3308                         tok_state = tok_state_script_data_escaped_dash_dash
3309                         return new_character_token '-'
3310                 # Anything else
3311                 tok_state = tok_state_script_data
3312                 cur -= 1 # Reconsume
3313                 return
3314
3315         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3316         tok_state_script_data_escaped = ->
3317                 c = txt.charAt(cur++)
3318                 if c is '-'
3319                         tok_state = tok_state_script_data_escaped_dash
3320                         return new_character_token '-'
3321                 if c is '<'
3322                         tok_state = tok_state_script_data_escaped_less_than_sign
3323                         return
3324                 if c is "\u0000"
3325                         parse_error()
3326                         return new_character_token "\ufffd"
3327                 if c is '' # EOF
3328                         tok_state = tok_state_data
3329                         parse_error()
3330                         cur -= 1 # Reconsume
3331                         return
3332                 # Anything else
3333                 return new_character_token c
3334
3335         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3336         tok_state_script_data_escaped_dash = ->
3337                 c = txt.charAt(cur++)
3338                 if c is '-'
3339                         tok_state = tok_state_script_data_escaped_dash_dash
3340                         return new_character_token '-'
3341                 if c is '<'
3342                         tok_state = tok_state_script_data_escaped_less_than_sign
3343                         return
3344                 if c is "\u0000"
3345                         parse_error()
3346                         tok_state = tok_state_script_data_escaped
3347                         return new_character_token "\ufffd"
3348                 if c is '' # EOF
3349                         tok_state = tok_state_data
3350                         parse_error()
3351                         cur -= 1 # Reconsume
3352                         return
3353                 # Anything else
3354                 tok_state = tok_state_script_data_escaped
3355                 return new_character_token c
3356
3357         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3358         tok_state_script_data_escaped_dash_dash = ->
3359                 c = txt.charAt(cur++)
3360                 if c is '-'
3361                         return new_character_token '-'
3362                 if c is '<'
3363                         tok_state = tok_state_script_data_escaped_less_than_sign
3364                         return
3365                 if c is '>'
3366                         tok_state = tok_state_script_data
3367                         return new_character_token '>'
3368                 if c is "\u0000"
3369                         parse_error()
3370                         tok_state = tok_state_script_data_escaped
3371                         return new_character_token "\ufffd"
3372                 if c is '' # EOF
3373                         parse_error()
3374                         tok_state = tok_state_data
3375                         cur -= 1 # Reconsume
3376                         return
3377                 # Anything else
3378                 tok_state = tok_state_script_data_escaped
3379                 return new_character_token c
3380
3381         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3382         tok_state_script_data_escaped_less_than_sign = ->
3383                 c = txt.charAt(cur++)
3384                 if c is '/'
3385                         temporary_buffer = ''
3386                         tok_state = tok_state_script_data_escaped_end_tag_open
3387                         return
3388                 if is_uc_alpha(c)
3389                         temporary_buffer = c.toLowerCase() # yes, really
3390                         tok_state = tok_state_script_data_double_escape_start
3391                         return new_character_token "<#{c}" # fixfull split
3392                 if is_lc_alpha(c)
3393                         temporary_buffer = c
3394                         tok_state = tok_state_script_data_double_escape_start
3395                         return new_character_token "<#{c}" # fixfull split
3396                 # Anything else
3397                 tok_state = tok_state_script_data_escaped
3398                 cur -= 1 # Reconsume
3399                 return new_character_token '<'
3400
3401         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3402         tok_state_script_data_escaped_end_tag_open = ->
3403                 c = txt.charAt(cur++)
3404                 if is_uc_alpha(c)
3405                         tok_cur_tag = new_end_tag c.toLowerCase()
3406                         temporary_buffer += c
3407                         tok_state = tok_state_script_data_escaped_end_tag_name
3408                         return
3409                 if is_lc_alpha(c)
3410                         tok_cur_tag = new_end_tag c
3411                         temporary_buffer += c
3412                         tok_state = tok_state_script_data_escaped_end_tag_name
3413                         return
3414                 # Anything else
3415                 tok_state = tok_state_script_data_escaped
3416                 cur -= 1 # Reconsume
3417                 return new_character_token '</' # fixfull split
3418
3419         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3420         tok_state_script_data_escaped_end_tag_name = ->
3421                 c = txt.charAt(cur++)
3422                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3423                         if is_appropriate_end_tag tok_cur_tag
3424                                 tok_state = tok_state_before_attribute_name
3425                                 return
3426                         # fall through
3427                 if c is '/'
3428                         if is_appropriate_end_tag tok_cur_tag
3429                                 tok_state = tok_state_self_closing_start_tag
3430                                 return
3431                         # fall through
3432                 if c is '>'
3433                         if is_appropriate_end_tag tok_cur_tag
3434                                 tok_state = tok_state_data
3435                                 return tok_cur_tag
3436                         # fall through
3437                 if is_uc_alpha(c)
3438                         tok_cur_tag.name += c.toLowerCase()
3439                         temporary_buffer += c.toLowerCase()
3440                         return
3441                 if is_lc_alpha(c)
3442                         tok_cur_tag.name += c
3443                         temporary_buffer += c.toLowerCase()
3444                         return
3445                 # Anything else
3446                 tok_state = tok_state_script_data_escaped
3447                 cur -= 1 # Reconsume
3448                 return new_character_token "</#{temporary_buffer}" # fixfull split
3449
3450         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3451         tok_state_script_data_double_escape_start = ->
3452                 c = txt.charAt(cur++)
3453                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3454                         if temporary_buffer is 'script'
3455                                 tok_state = tok_state_script_data_double_escaped
3456                         else
3457                                 tok_state = tok_state_script_data_escaped
3458                         return new_character_token c
3459                 if is_uc_alpha(c)
3460                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3461                         return new_character_token c
3462                 if is_lc_alpha(c)
3463                         temporary_buffer += c
3464                         return new_character_token c
3465                 # Anything else
3466                 tok_state = tok_state_script_data_escaped
3467                 cur -= 1 # Reconsume
3468                 return
3469
3470         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3471         tok_state_script_data_double_escaped = ->
3472                 c = txt.charAt(cur++)
3473                 if c is '-'
3474                         tok_state = tok_state_script_data_double_escaped_dash
3475                         return new_character_token '-'
3476                 if c is '<'
3477                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3478                         return new_character_token '<'
3479                 if c is "\u0000"
3480                         parse_error()
3481                         return new_character_token "\ufffd"
3482                 if c is '' # EOF
3483                         parse_error()
3484                         tok_state = tok_state_data
3485                         cur -= 1 # Reconsume
3486                         return
3487                 # Anything else
3488                 return new_character_token c
3489
3490         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3491         tok_state_script_data_double_escaped_dash = ->
3492                 c = txt.charAt(cur++)
3493                 if c is '-'
3494                         tok_state = tok_state_script_data_double_escaped_dash_dash
3495                         return new_character_token '-'
3496                 if c is '<'
3497                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3498                         return new_character_token '<'
3499                 if c is "\u0000"
3500                         parse_error()
3501                         tok_state = tok_state_script_data_double_escaped
3502                         return new_character_token "\ufffd"
3503                 if c is '' # EOF
3504                         parse_error()
3505                         tok_state = tok_state_data
3506                         cur -= 1 # Reconsume
3507                         return
3508                 # Anything else
3509                 tok_state = tok_state_script_data_double_escaped
3510                 return new_character_token c
3511
3512         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3513         tok_state_script_data_double_escaped_dash_dash = ->
3514                 c = txt.charAt(cur++)
3515                 if c is '-'
3516                         return new_character_token '-'
3517                 if c is '<'
3518                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3519                         return new_character_token '<'
3520                 if c is '>'
3521                         tok_state = tok_state_script_data
3522                         return new_character_token '>'
3523                 if c is "\u0000"
3524                         parse_error()
3525                         tok_state = tok_state_script_data_double_escaped
3526                         return new_character_token "\ufffd"
3527                 if c is '' # EOF
3528                         parse_error()
3529                         tok_state = tok_state_data
3530                         cur -= 1 # Reconsume
3531                         return
3532                 # Anything else
3533                 tok_state = tok_state_script_data_double_escaped
3534                 return new_character_token c
3535
3536         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3537         tok_state_script_data_double_escaped_less_than_sign = ->
3538                 c = txt.charAt(cur++)
3539                 if c is '/'
3540                         temporary_buffer = ''
3541                         tok_state = tok_state_script_data_double_escape_end
3542                         return new_character_token '/'
3543                 # Anything else
3544                 tok_state = tok_state_script_data_double_escaped
3545                 cur -= 1 # Reconsume
3546                 return
3547
3548         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3549         tok_state_script_data_double_escape_end = ->
3550                 c = txt.charAt(cur++)
3551                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3552                         if temporary_buffer is 'script'
3553                                 tok_state = tok_state_script_data_escaped
3554                         else
3555                                 tok_state = tok_state_script_data_double_escaped
3556                         return new_character_token c
3557                 if is_uc_alpha(c)
3558                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3559                         return new_character_token c
3560                 if is_lc_alpha(c)
3561                         temporary_buffer += c
3562                         return new_character_token c
3563                 # Anything else
3564                 tok_state = tok_state_script_data_double_escaped
3565                 cur -= 1 # Reconsume
3566                 return
3567
3568         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3569         tok_state_before_attribute_name = ->
3570                 attr_name = null
3571                 switch c = txt.charAt(cur++)
3572                         when "\t", "\n", "\u000c", ' '
3573                                 return null
3574                         when '/'
3575                                 tok_state = tok_state_self_closing_start_tag
3576                                 return null
3577                         when '>'
3578                                 tok_state = tok_state_data
3579                                 tmp = tok_cur_tag
3580                                 tok_cur_tag = null
3581                                 return tmp
3582                         when "\u0000"
3583                                 parse_error()
3584                                 attr_name = "\ufffd"
3585                         when '"', "'", '<', '='
3586                                 parse_error()
3587                                 attr_name = c
3588                         when '' # EOF
3589                                 parse_error()
3590                                 tok_state = tok_state_data
3591                         else
3592                                 if is_uc_alpha(c)
3593                                         attr_name = c.toLowerCase()
3594                                 else
3595                                         attr_name = c
3596                 if attr_name?
3597                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3598                         tok_state = tok_state_attribute_name
3599                 return null
3600
3601         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3602         tok_state_attribute_name = ->
3603                 switch c = txt.charAt(cur++)
3604                         when "\t", "\n", "\u000c", ' '
3605                                 tok_state = tok_state_after_attribute_name
3606                         when '/'
3607                                 tok_state = tok_state_self_closing_start_tag
3608                         when '='
3609                                 tok_state = tok_state_before_attribute_value
3610                         when '>'
3611                                 tok_state = tok_state_data
3612                                 tmp = tok_cur_tag
3613                                 tok_cur_tag = null
3614                                 return tmp
3615                         when "\u0000"
3616                                 parse_error()
3617                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3618                         when '"', "'", '<'
3619                                 parse_error()
3620                                 tok_cur_tag.attrs_a[0][0] += c
3621                         when '' # EOF
3622                                 parse_error()
3623                                 tok_state = tok_state_data
3624                         else
3625                                 if is_uc_alpha(c)
3626                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3627                                 else
3628                                         tok_cur_tag.attrs_a[0][0] += c
3629                 return null
3630
3631         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3632         tok_state_after_attribute_name = ->
3633                 c = txt.charAt(cur++)
3634                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3635                         return
3636                 if c is '/'
3637                         tok_state = tok_state_self_closing_start_tag
3638                         return
3639                 if c is '='
3640                         tok_state = tok_state_before_attribute_value
3641                         return
3642                 if c is '>'
3643                         tok_state = tok_state_data
3644                         return
3645                 if is_uc_alpha(c)
3646                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3647                         tok_state = tok_state_attribute_name
3648                         return
3649                 if c is "\u0000"
3650                         parse_error()
3651                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3652                         tok_state = tok_state_attribute_name
3653                         return
3654                 if c is '' # EOF
3655                         parse_error()
3656                         tok_state = tok_state_data
3657                         cur -= 1 # reconsume
3658                         return
3659                 if c is '"' or c is "'" or c is '<'
3660                         parse_error()
3661                         # fall through to Anything else
3662                 # Anything else
3663                 tok_cur_tag.attrs_a.unshift [c, '']
3664                 tok_state = tok_state_attribute_name
3665
3666         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3667         tok_state_before_attribute_value = ->
3668                 switch c = txt.charAt(cur++)
3669                         when "\t", "\n", "\u000c", ' '
3670                                 return null
3671                         when '"'
3672                                 tok_state = tok_state_attribute_value_double_quoted
3673                         when '&'
3674                                 tok_state = tok_state_attribute_value_unquoted
3675                                 cur -= 1
3676                         when "'"
3677                                 tok_state = tok_state_attribute_value_single_quoted
3678                         when "\u0000"
3679                                 # Parse error
3680                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3681                                 tok_state = tok_state_attribute_value_unquoted
3682                         when '>'
3683                                 # Parse error
3684                                 tok_state = tok_state_data
3685                                 tmp = tok_cur_tag
3686                                 tok_cur_tag = null
3687                                 return tmp
3688                         when '' # EOF
3689                                 parse_error()
3690                                 tok_state = tok_state_data
3691                         else
3692                                 tok_cur_tag.attrs_a[0][1] += c
3693                                 tok_state = tok_state_attribute_value_unquoted
3694                 return null
3695
3696         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3697         tok_state_attribute_value_double_quoted = ->
3698                 switch c = txt.charAt(cur++)
3699                         when '"'
3700                                 tok_state = tok_state_after_attribute_value_quoted
3701                         when '&'
3702                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3703                         when "\u0000"
3704                                 # Parse error
3705                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3706                         when '' # EOF
3707                                 parse_error()
3708                                 tok_state = tok_state_data
3709                         else
3710                                 tok_cur_tag.attrs_a[0][1] += c
3711                 return null
3712
3713         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3714         tok_state_attribute_value_single_quoted = ->
3715                 switch c = txt.charAt(cur++)
3716                         when "'"
3717                                 tok_state = tok_state_after_attribute_value_quoted
3718                         when '&'
3719                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3720                         when "\u0000"
3721                                 # Parse error
3722                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3723                         when '' # EOF
3724                                 parse_error()
3725                                 tok_state = tok_state_data
3726                         else
3727                                 tok_cur_tag.attrs_a[0][1] += c
3728                 return null
3729
3730         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3731         tok_state_attribute_value_unquoted = ->
3732                 switch c = txt.charAt(cur++)
3733                         when "\t", "\n", "\u000c", ' '
3734                                 tok_state = tok_state_before_attribute_name
3735                         when '&'
3736                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3737                         when '>'
3738                                 tok_state = tok_state_data
3739                                 tmp = tok_cur_tag
3740                                 tok_cur_tag = null
3741                                 return tmp
3742                         when "\u0000"
3743                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3744                         when '' # EOF
3745                                 parse_error()
3746                                 tok_state = tok_state_data
3747                         else
3748                                 # Parse Error if ', <, = or ` (backtick)
3749                                 tok_cur_tag.attrs_a[0][1] += c
3750                 return null
3751
3752         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3753         tok_state_after_attribute_value_quoted = ->
3754                 switch c = txt.charAt(cur++)
3755                         when "\t", "\n", "\u000c", ' '
3756                                 tok_state = tok_state_before_attribute_name
3757                         when '/'
3758                                 tok_state = tok_state_self_closing_start_tag
3759                         when '>'
3760                                 tok_state = tok_state_data
3761                                 tmp = tok_cur_tag
3762                                 tok_cur_tag = null
3763                                 return tmp
3764                         when '' # EOF
3765                                 parse_error()
3766                                 tok_state = tok_state_data
3767                         else
3768                                 # Parse Error
3769                                 tok_state = tok_state_before_attribute_name
3770                                 cur -= 1 # we didn't handle that char
3771                 return null
3772
3773         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3774         tok_state_self_closing_start_tag = ->
3775                 c = txt.charAt(cur++)
3776                 if c is '>'
3777                         tok_cur_tag.flag 'self-closing', true
3778                         tok_state = tok_state_data
3779                         return tok_cur_tag
3780                 if c is ''
3781                         parse_error()
3782                         tok_state = tok_state_data
3783                         cur -= 1 # Reconsume
3784                         return
3785                 # Anything else
3786                 parse_error()
3787                 tok_state = tok_state_before_attribute_name
3788                 cur -= 1 # Reconsume
3789                 return
3790
3791         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3792         # WARNING: put a comment token in tok_cur_tag before setting this state
3793         tok_state_bogus_comment = ->
3794                 next_gt = txt.indexOf '>', cur
3795                 if next_gt is -1
3796                         val = txt.substr cur
3797                         cur = txt.length
3798                 else
3799                         val = txt.substr cur, (next_gt - cur)
3800                         cur = next_gt + 1
3801                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3802                 tok_cur_tag.text += val
3803                 tok_state = tok_state_data
3804                 return tok_cur_tag
3805
3806         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3807         tok_state_markup_declaration_open = ->
3808                 if txt.substr(cur, 2) is '--'
3809                         cur += 2
3810                         tok_cur_tag = new_comment_token ''
3811                         tok_state = tok_state_comment_start
3812                         return
3813                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3814                         cur += 7
3815                         tok_state = tok_state_doctype
3816                         return
3817                 acn = adjusted_current_node()
3818                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3819                         cur += 7
3820                         tok_state = tok_state_cdata_section
3821                         return
3822                 # Otherwise
3823                 parse_error()
3824                 tok_cur_tag = new_comment_token ''
3825                 tok_state = tok_state_bogus_comment
3826                 return
3827
3828         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3829         tok_state_comment_start = ->
3830                 switch c = txt.charAt(cur++)
3831                         when '-'
3832                                 tok_state = tok_state_comment_start_dash
3833                         when "\u0000"
3834                                 parse_error()
3835                                 tok_state = tok_state_comment
3836                                 return new_character_token "\ufffd"
3837                         when '>'
3838                                 parse_error()
3839                                 tok_state = tok_state_data
3840                                 return tok_cur_tag
3841                         when '' # EOF
3842                                 parse_error()
3843                                 tok_state = tok_state_data
3844                                 cur -= 1 # Reconsume
3845                                 return tok_cur_tag
3846                         else
3847                                 tok_cur_tag.text += c
3848                                 tok_state = tok_state_comment
3849                 return null
3850
3851         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3852         tok_state_comment_start_dash = ->
3853                 switch c = txt.charAt(cur++)
3854                         when '-'
3855                                 tok_state = tok_state_comment_end
3856                         when "\u0000"
3857                                 parse_error()
3858                                 tok_cur_tag.text += "-\ufffd"
3859                                 tok_state = tok_state_comment
3860                         when '>'
3861                                 parse_error()
3862                                 tok_state = tok_state_data
3863                                 return tok_cur_tag
3864                         when '' # EOF
3865                                 parse_error()
3866                                 tok_state = tok_state_data
3867                                 cur -= 1 # Reconsume
3868                                 return tok_cur_tag
3869                         else
3870                                 tok_cur_tag.text += "-#{c}"
3871                                 tok_state = tok_state_comment
3872                 return null
3873
3874         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3875         tok_state_comment = ->
3876                 switch c = txt.charAt(cur++)
3877                         when '-'
3878                                 tok_state = tok_state_comment_end_dash
3879                         when "\u0000"
3880                                 parse_error()
3881                                 tok_cur_tag.text += "\ufffd"
3882                         when '' # EOF
3883                                 parse_error()
3884                                 tok_state = tok_state_data
3885                                 cur -= 1 # Reconsume
3886                                 return tok_cur_tag
3887                         else
3888                                 tok_cur_tag.text += c
3889                 return null
3890
3891         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3892         tok_state_comment_end_dash = ->
3893                 switch c = txt.charAt(cur++)
3894                         when '-'
3895                                 tok_state = tok_state_comment_end
3896                         when "\u0000"
3897                                 parse_error()
3898                                 tok_cur_tag.text += "-\ufffd"
3899                                 tok_state = tok_state_comment
3900                         when '' # EOF
3901                                 parse_error()
3902                                 tok_state = tok_state_data
3903                                 cur -= 1 # Reconsume
3904                                 return tok_cur_tag
3905                         else
3906                                 tok_cur_tag.text += "-#{c}"
3907                                 tok_state = tok_state_comment
3908                 return null
3909
3910         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3911         tok_state_comment_end = ->
3912                 switch c = txt.charAt(cur++)
3913                         when '>'
3914                                 tok_state = tok_state_data
3915                                 return tok_cur_tag
3916                         when "\u0000"
3917                                 parse_error()
3918                                 tok_cur_tag.text += "--\ufffd"
3919                                 tok_state = tok_state_comment
3920                         when '!'
3921                                 parse_error()
3922                                 tok_state = tok_state_comment_end_bang
3923                         when '-'
3924                                 parse_error()
3925                                 tok_cur_tag.text += '-'
3926                         when '' # EOF
3927                                 parse_error()
3928                                 tok_state = tok_state_data
3929                                 cur -= 1 # Reconsume
3930                                 return tok_cur_tag
3931                         else
3932                                 parse_error()
3933                                 tok_cur_tag.text += "--#{c}"
3934                                 tok_state = tok_state_comment
3935                 return null
3936
3937         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3938         tok_state_comment_end_bang = ->
3939                 switch c = txt.charAt(cur++)
3940                         when '-'
3941                                 tok_cur_tag.text += "--!#{c}"
3942                                 tok_state = tok_state_comment_end_dash
3943                         when '>'
3944                                 tok_state = tok_state_data
3945                                 return tok_cur_tag
3946                         when "\u0000"
3947                                 parse_error()
3948                                 tok_cur_tag.text += "--!\ufffd"
3949                                 tok_state = tok_state_comment
3950                         when '' # EOF
3951                                 parse_error()
3952                                 tok_state = tok_state_data
3953                                 cur -= 1 # Reconsume
3954                                 return tok_cur_tag
3955                         else
3956                                 tok_cur_tag.text += "--!#{c}"
3957                                 tok_state = tok_state_comment
3958                 return null
3959
3960         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3961         tok_state_doctype = ->
3962                 switch c = txt.charAt(cur++)
3963                         when "\t", "\u000a", "\u000c", ' '
3964                                 tok_state = tok_state_before_doctype_name
3965                         when '' # EOF
3966                                 parse_error()
3967                                 tok_state = tok_state_data
3968                                 el = new_doctype_token ''
3969                                 el.flag 'force-quirks', true
3970                                 cur -= 1 # Reconsume
3971                                 return el
3972                         else
3973                                 parse_error()
3974                                 tok_state = tok_state_before_doctype_name
3975                                 cur -= 1 # Reconsume
3976                 return null
3977
3978         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3979         tok_state_before_doctype_name = ->
3980                 c = txt.charAt(cur++)
3981                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3982                         return
3983                 if is_uc_alpha(c)
3984                         tok_cur_tag = new_doctype_token c.toLowerCase()
3985                         tok_state = tok_state_doctype_name
3986                         return
3987                 if c is "\u0000"
3988                         parse_error()
3989                         tok_cur_tag = new_doctype_token "\ufffd"
3990                         tok_state = tok_state_doctype_name
3991                         return
3992                 if c is '>'
3993                         parse_error()
3994                         el = new_doctype_token ''
3995                         el.flag 'force-quirks', true
3996                         tok_state = tok_state_data
3997                         return el
3998                 if c is '' # EOF
3999                         parse_error()
4000                         tok_state = tok_state_data
4001                         el = new_doctype_token ''
4002                         el.flag 'force-quirks', true
4003                         cur -= 1 # Reconsume
4004                         return el
4005                 # Anything else
4006                 tok_cur_tag = new_doctype_token c
4007                 tok_state = tok_state_doctype_name
4008                 return null
4009
4010         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4011         tok_state_doctype_name = ->
4012                 c = txt.charAt(cur++)
4013                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4014                         tok_state = tok_state_after_doctype_name
4015                         return
4016                 if c is '>'
4017                         tok_state = tok_state_data
4018                         return tok_cur_tag
4019                 if is_uc_alpha(c)
4020                         tok_cur_tag.name += c.toLowerCase()
4021                         return
4022                 if c is "\u0000"
4023                         parse_error()
4024                         tok_cur_tag.name += "\ufffd"
4025                         return
4026                 if c is '' # EOF
4027                         parse_error()
4028                         tok_state = tok_state_data
4029                         tok_cur_tag.flag 'force-quirks', true
4030                         cur -= 1 # Reconsume
4031                         return tok_cur_tag
4032                 # Anything else
4033                 tok_cur_tag.name += c
4034                 return null
4035
4036         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4037         tok_state_after_doctype_name = ->
4038                 c = txt.charAt(cur++)
4039                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4040                         return
4041                 if c is '>'
4042                         tok_state = tok_state_data
4043                         return tok_cur_tag
4044                 if c is '' # EOF
4045                         parse_error()
4046                         tok_state = tok_state_data
4047                         tok_cur_tag.flag 'force-quirks', true
4048                         cur -= 1 # Reconsume
4049                         return tok_cur_tag
4050                 # Anything else
4051                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4052                         cur += 5
4053                         tok_state = tok_state_after_doctype_public_keyword
4054                         return
4055                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4056                         cur += 5
4057                         tok_state = tok_state_after_doctype_system_keyword
4058                         return
4059                 parse_error()
4060                 tok_cur_tag.flag 'force-quirks', true
4061                 tok_state = tok_state_bogus_doctype
4062                 return null
4063
4064         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4065         tok_state_after_doctype_public_keyword = ->
4066                 c = txt.charAt(cur++)
4067                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4068                         tok_state = tok_state_before_doctype_public_identifier
4069                         return
4070                 if c is '"'
4071                         parse_error()
4072                         tok_cur_tag.public_identifier = ''
4073                         tok_state = tok_state_doctype_public_identifier_double_quoted
4074                         return
4075                 if c is "'"
4076                         parse_error()
4077                         tok_cur_tag.public_identifier = ''
4078                         tok_state = tok_state_doctype_public_identifier_single_quoted
4079                         return
4080                 if c is '>'
4081                         parse_error()
4082                         tok_cur_tag.flag 'force-quirks', true
4083                         tok_state = tok_state_data
4084                         return tok_cur_tag
4085                 if c is '' # EOF
4086                         parse_error()
4087                         tok_state = tok_state_data
4088                         tok_cur_tag.flag 'force-quirks', true
4089                         cur -= 1 # Reconsume
4090                         return tok_cur_tag
4091                 # Anything else
4092                 parse_error()
4093                 tok_cur_tag.flag 'force-quirks', true
4094                 tok_state = tok_state_bogus_doctype
4095                 return null
4096
4097         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4098         tok_state_before_doctype_public_identifier = ->
4099                 c = txt.charAt(cur++)
4100                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4101                         return
4102                 if c is '"'
4103                         parse_error()
4104                         tok_cur_tag.public_identifier = ''
4105                         tok_state = tok_state_doctype_public_identifier_double_quoted
4106                         return
4107                 if c is "'"
4108                         parse_error()
4109                         tok_cur_tag.public_identifier = ''
4110                         tok_state = tok_state_doctype_public_identifier_single_quoted
4111                         return
4112                 if c is '>'
4113                         parse_error()
4114                         tok_cur_tag.flag 'force-quirks', true
4115                         tok_state = tok_state_data
4116                         return tok_cur_tag
4117                 if c is '' # EOF
4118                         parse_error()
4119                         tok_state = tok_state_data
4120                         tok_cur_tag.flag 'force-quirks', true
4121                         cur -= 1 # Reconsume
4122                         return tok_cur_tag
4123                 # Anything else
4124                 parse_error()
4125                 tok_cur_tag.flag 'force-quirks', true
4126                 tok_state = tok_state_bogus_doctype
4127                 return null
4128
4129
4130         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4131         tok_state_doctype_public_identifier_double_quoted = ->
4132                 c = txt.charAt(cur++)
4133                 if c is '"'
4134                         tok_state = tok_state_after_doctype_public_identifier
4135                         return
4136                 if c is "\u0000"
4137                         parse_error()
4138                         tok_cur_tag.public_identifier += "\ufffd"
4139                         return
4140                 if c is '>'
4141                         parse_error()
4142                         tok_cur_tag.flag 'force-quirks', true
4143                         tok_state = tok_state_data
4144                         return tok_cur_tag
4145                 if c is '' # EOF
4146                         parse_error()
4147                         tok_state = tok_state_data
4148                         tok_cur_tag.flag 'force-quirks', true
4149                         cur -= 1 # Reconsume
4150                         return tok_cur_tag
4151                 # Anything else
4152                 tok_cur_tag.public_identifier += c
4153                 return null
4154
4155         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4156         tok_state_doctype_public_identifier_single_quoted = ->
4157                 c = txt.charAt(cur++)
4158                 if c is "'"
4159                         tok_state = tok_state_after_doctype_public_identifier
4160                         return
4161                 if c is "\u0000"
4162                         parse_error()
4163                         tok_cur_tag.public_identifier += "\ufffd"
4164                         return
4165                 if c is '>'
4166                         parse_error()
4167                         tok_cur_tag.flag 'force-quirks', true
4168                         tok_state = tok_state_data
4169                         return tok_cur_tag
4170                 if c is '' # EOF
4171                         parse_error()
4172                         tok_state = tok_state_data
4173                         tok_cur_tag.flag 'force-quirks', true
4174                         cur -= 1 # Reconsume
4175                         return tok_cur_tag
4176                 # Anything else
4177                 tok_cur_tag.public_identifier += c
4178                 return null
4179
4180         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4181         tok_state_after_doctype_public_identifier = ->
4182                 c = txt.charAt(cur++)
4183                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4184                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4185                         return
4186                 if c is '>'
4187                         tok_state = tok_state_data
4188                         return tok_cur_tag
4189                 if c is '"'
4190                         parse_error()
4191                         tok_cur_tag.system_identifier = ''
4192                         tok_state = tok_state_doctype_system_identifier_double_quoted
4193                         return
4194                 if c is "'"
4195                         parse_error()
4196                         tok_cur_tag.system_identifier = ''
4197                         tok_state = tok_state_doctype_system_identifier_single_quoted
4198                         return
4199                 if c is '' # EOF
4200                         parse_error()
4201                         tok_state = tok_state_data
4202                         tok_cur_tag.flag 'force-quirks', true
4203                         cur -= 1 # Reconsume
4204                         return tok_cur_tag
4205                 # Anything else
4206                 parse_error()
4207                 tok_cur_tag.flag 'force-quirks', true
4208                 tok_state = tok_state_bogus_doctype
4209                 return null
4210
4211         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4212         tok_state_between_doctype_public_and_system_identifiers = ->
4213                 c = txt.charAt(cur++)
4214                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4215                         return
4216                 if c is '>'
4217                         tok_state = tok_state_data
4218                         return tok_cur_tag
4219                 if c is '"'
4220                         parse_error()
4221                         tok_cur_tag.system_identifier = ''
4222                         tok_state = tok_state_doctype_system_identifier_double_quoted
4223                         return
4224                 if c is "'"
4225                         parse_error()
4226                         tok_cur_tag.system_identifier = ''
4227                         tok_state = tok_state_doctype_system_identifier_single_quoted
4228                         return
4229                 if c is '' # EOF
4230                         parse_error()
4231                         tok_state = tok_state_data
4232                         tok_cur_tag.flag 'force-quirks', true
4233                         cur -= 1 # Reconsume
4234                         return tok_cur_tag
4235                 # Anything else
4236                 parse_error()
4237                 tok_cur_tag.flag 'force-quirks', true
4238                 tok_state = tok_state_bogus_doctype
4239                 return null
4240
4241         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4242         tok_state_after_doctype_system_keyword = ->
4243                 c = txt.charAt(cur++)
4244                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4245                         tok_state = tok_state_before_doctype_system_identifier
4246                         return
4247                 if c is '"'
4248                         parse_error()
4249                         tok_cur_tag.system_identifier = ''
4250                         tok_state = tok_state_doctype_system_identifier_double_quoted
4251                         return
4252                 if c is "'"
4253                         parse_error()
4254                         tok_cur_tag.system_identifier = ''
4255                         tok_state = tok_state_doctype_system_identifier_single_quoted
4256                         return
4257                 if c is '>'
4258                         parse_error()
4259                         tok_cur_tag.flag 'force-quirks', true
4260                         tok_state = tok_state_data
4261                         return tok_cur_tag
4262                 if c is '' # EOF
4263                         parse_error()
4264                         tok_state = tok_state_data
4265                         tok_cur_tag.flag 'force-quirks', true
4266                         cur -= 1 # Reconsume
4267                         return tok_cur_tag
4268                 # Anything else
4269                 parse_error()
4270                 tok_cur_tag.flag 'force-quirks', true
4271                 tok_state = tok_state_bogus_doctype
4272                 return null
4273
4274         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4275         tok_state_before_doctype_system_identifier = ->
4276                 c = txt.charAt(cur++)
4277                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4278                         return
4279                 if c is '"'
4280                         tok_cur_tag.system_identifier = ''
4281                         tok_state = tok_state_doctype_system_identifier_double_quoted
4282                         return
4283                 if c is "'"
4284                         tok_cur_tag.system_identifier = ''
4285                         tok_state = tok_state_doctype_system_identifier_single_quoted
4286                         return
4287                 if c is '>'
4288                         parse_error()
4289                         tok_cur_tag.flag 'force-quirks', true
4290                         tok_state = tok_state_data
4291                         return tok_cur_tag
4292                 if c is '' # EOF
4293                         parse_error()
4294                         tok_state = tok_state_data
4295                         tok_cur_tag.flag 'force-quirks', true
4296                         cur -= 1 # Reconsume
4297                         return tok_cur_tag
4298                 # Anything else
4299                 parse_error()
4300                 tok_cur_tag.flag 'force-quirks', true
4301                 tok_state = tok_state_bogus_doctype
4302                 return null
4303
4304         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4305         tok_state_doctype_system_identifier_double_quoted = ->
4306                 c = txt.charAt(cur++)
4307                 if c is '"'
4308                         tok_state = tok_state_after_doctype_system_identifier
4309                         return
4310                 if c is "\u0000"
4311                         parse_error()
4312                         tok_cur_tag.system_identifier += "\ufffd"
4313                         return
4314                 if c is '>'
4315                         parse_error()
4316                         tok_cur_tag.flag 'force-quirks', true
4317                         tok_state = tok_state_data
4318                         return tok_cur_tag
4319                 if c is '' # EOF
4320                         parse_error()
4321                         tok_state = tok_state_data
4322                         tok_cur_tag.flag 'force-quirks', true
4323                         cur -= 1 # Reconsume
4324                         return tok_cur_tag
4325                 # Anything else
4326                 tok_cur_tag.system_identifier += c
4327                 return null
4328
4329         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4330         tok_state_doctype_system_identifier_single_quoted = ->
4331                 c = txt.charAt(cur++)
4332                 if c is "'"
4333                         tok_state = tok_state_after_doctype_system_identifier
4334                         return
4335                 if c is "\u0000"
4336                         parse_error()
4337                         tok_cur_tag.system_identifier += "\ufffd"
4338                         return
4339                 if c is '>'
4340                         parse_error()
4341                         tok_cur_tag.flag 'force-quirks', true
4342                         tok_state = tok_state_data
4343                         return tok_cur_tag
4344                 if c is '' # EOF
4345                         parse_error()
4346                         tok_state = tok_state_data
4347                         tok_cur_tag.flag 'force-quirks', true
4348                         cur -= 1 # Reconsume
4349                         return tok_cur_tag
4350                 # Anything else
4351                 tok_cur_tag.system_identifier += c
4352                 return null
4353
4354         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4355         tok_state_after_doctype_system_identifier = ->
4356                 c = txt.charAt(cur++)
4357                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4358                         return
4359                 if c is '>'
4360                         tok_state = tok_state_data
4361                         return tok_cur_tag
4362                 if c is '' # EOF
4363                         parse_error()
4364                         tok_state = tok_state_data
4365                         tok_cur_tag.flag 'force-quirks', true
4366                         cur -= 1 # Reconsume
4367                         return tok_cur_tag
4368                 # Anything else
4369                 parse_error()
4370                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4371                 tok_state = tok_state_bogus_doctype
4372                 return null
4373
4374         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4375         tok_state_bogus_doctype = ->
4376                 c = txt.charAt(cur++)
4377                 if c is '>'
4378                         tok_state = tok_state_data
4379                         return tok_cur_tag
4380                 if c is '' # EOF
4381                         tok_state = tok_state_data
4382                         cur -= 1 # Reconsume
4383                         return tok_cur_tag
4384                 # Anything else
4385                 return null
4386
4387         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4388         tok_state_cdata_section = ->
4389                 tok_state = tok_state_data
4390                 next_gt = txt.indexOf ']]>', cur
4391                 if next_gt is -1
4392                         val = txt.substr cur
4393                         cur = txt.length
4394                 else
4395                         val = txt.substr cur, (next_gt - cur)
4396                         cur = next_gt + 3
4397                 return new_character_token val # fixfull split
4398
4399         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4400         # Don't set this as a state, just call it
4401         # returns a string (NOT a text node)
4402         parse_character_reference = (allowed_char = null, in_attr = false) ->
4403                 if cur >= txt.length
4404                         return '&'
4405                 switch c = txt.charAt(cur)
4406                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4407                                 # explicitly not a parse error
4408                                 return '&'
4409                         when ';'
4410                                 # there has to be "one or more" alnums between & and ; to be a parse error
4411                                 return '&'
4412                         when '#'
4413                                 if cur + 1 >= txt.length
4414                                         return '&'
4415                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4416                                         base = 16
4417                                         charset = hex_chars
4418                                         start = cur + 2
4419                                 else
4420                                         charset = digits
4421                                         start = cur + 1
4422                                         base = 10
4423                                 i = 0
4424                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4425                                         i += 1
4426                                 if i is 0
4427                                         return '&'
4428                                 cur = start + i
4429                                 if txt.charAt(start + i) is ';'
4430                                         cur += 1
4431                                 else
4432                                         parse_error()
4433                                 code_point = txt.substr(start, i)
4434                                 while code_point.charAt(0) is '0' and code_point.length > 1
4435                                         code_point = code_point.substr 1
4436                                 code_point = parseInt(code_point, base)
4437                                 if unicode_fixes[code_point]?
4438                                         parse_error()
4439                                         return unicode_fixes[code_point]
4440                                 else
4441                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4442                                                 parse_error()
4443                                                 return "\ufffd"
4444                                         else
4445                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4446                                                         parse_error()
4447                                                 return from_code_point code_point
4448                                 return
4449                         else
4450                                 for i in [0...31]
4451                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4452                                                 break
4453                                 if i is 0
4454                                         # exit early, because parse_error() below needs at least one alnum
4455                                         return '&'
4456                                 if txt.charAt(cur + i) is ';'
4457                                         i += 1 # include ';' terminator in value
4458                                         decoded = decode_named_char_ref txt.substr(cur, i)
4459                                         if decoded?
4460                                                 cur += i
4461                                                 return decoded
4462                                         parse_error()
4463                                         return '&'
4464                                 else
4465                                         # no ';' terminator (only legacy char refs)
4466                                         max = i
4467                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4468                                                 c = legacy_char_refs[txt.substr(cur, i)]
4469                                                 if c?
4470                                                         if in_attr
4471                                                                 if txt.charAt(cur + i) is '='
4472                                                                         # "because some legacy user agents will
4473                                                                         # misinterpret the markup in those cases"
4474                                                                         parse_error()
4475                                                                         return '&'
4476                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4477                                                                         # this makes attributes forgiving about url args
4478                                                                         return '&'
4479                                                         # ok, and besides the weird exceptions for attributes...
4480                                                         # return the matching char
4481                                                         cur += i # consume entity chars
4482                                                         parse_error() # because no terminating ";"
4483                                                         return c
4484                                         parse_error()
4485                                         return '&'
4486                 return # never reached
4487
4488         # tree constructor initialization
4489         # see comments on TYPE_TAG/etc for the structure of this data
4490         txt = args.html
4491         cur = 0
4492         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4493         open_els = []
4494         afe = [] # active formatting elements
4495         template_ins_modes = []
4496         ins_mode = ins_mode_initial
4497         original_ins_mode = ins_mode # TODO check spec
4498         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4499         flag_frameset_ok = true
4500         flag_parsing = true
4501         flag_foster_parenting = false
4502         form_element_pointer = null
4503         temporary_buffer = null
4504         pending_table_character_tokens = []
4505         head_element_pointer = null
4506         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4507         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4508
4509         # tokenizer initialization
4510         tok_state = tok_state_data
4511
4512         # text pre-processing
4513         # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4514         txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4515         txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4516         txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4517
4518         if args.name is "tests18.dat #17"
4519                 console.log "hi"
4520         # proccess input
4521         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4522         while flag_parsing
4523                 t = tok_state()
4524                 if t?
4525                         process_token t
4526                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4527         return doc.children
4528
4529 serialize_els = (els, shallow, show_ids) ->
4530         serialized = ''
4531         sep = ''
4532         for t in els
4533                 serialized += sep
4534                 sep = ','
4535                 serialized += t.serialize shallow, show_ids
4536         return serialized
4537
4538 module.exports.parse_html = parse_html
4539 module.exports.debug_log_reset = debug_log_reset
4540 module.exports.debug_log_each = debug_log_each
4541 module.exports.TYPE_TAG = TYPE_TAG
4542 module.exports.TYPE_TEXT = TYPE_TEXT
4543 module.exports.TYPE_COMMENT = TYPE_COMMENT
4544 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4545 module.exports.NS_HTML = NS_HTML
4546 module.exports.NS_MATHML = NS_MATHML
4547 module.exports.NS_SVG = NS_SVG