JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix most fragment tests, fix tree bugs
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
# When there is no CommonJS-style `module.exports`, create a `window.wheic`
# object and point a local `module.exports` at it.
# NOTE(review): presumably later code (outside this view) attaches the public
# API to module.exports -- confirm.
unless module?.exports?
        window.wheic = {}
        module = exports: window.wheic
56
# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2
TYPE_DOCTYPE = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm

# namespace constants (stored in Node::namespace and used as the values of
# the element lookup tables in this file)
NS_HTML = 1
NS_MATHML = 2
NS_SVG = 3
73
# In-memory debug log: messages are buffered in g_debug_log until someone
# walks them with debug_log_each or discards them with debug_log_reset.
g_debug_log = []

# Throw away all buffered messages.
debug_log_reset = -> g_debug_log = []

# Append one message to the buffer.
debug_log = (str) -> g_debug_log.push str

# Invoke cb once per buffered message, oldest first.
debug_log_each = (cb) -> (cb msg for msg in g_debug_log)
82
# counter used to hand out ids to nodes that weren't given one
prev_node_id = 0

# Node is used both for nodes in the resulting tree and for the tokens the
# tokenizer emits (see the TYPE_* constants).
class Node
        # type: one of the TYPE_* constants
        # args: optional initial values. Recognised keys: name, text, attrs,
        #       attrs_a, children, namespace, parent, token, flags, id
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_START_TAG only. The key used to be
                # read as "attr_k" (which doesn't match the property name), so
                # accept the proper name and keep honouring the old one.
                @attrs_a = args.attrs_a ? (args.attr_k ? [])
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                @flags = args.flags ? {}
                if args.id?
                        # an explicitly supplied id gets a "+" suffix
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"

        # Record that the self-closing flag was acknowledged. The flag is
        # stored on the token this node was created from when there is one,
        # otherwise on the node itself.
        acknowledge_self_closing: ->
                if @token?
                        # must pass a value: flag() with one argument is a getter
                        @token.flag 'did_self_close', true
                else
                        @flag 'did_self_close', true

        # flag(key, value) sets a flag, flag(key) reads it.
        # Note that a null/undefined value always means "read".
        flag: (key, value = null) ->
                if value?
                        @flags[key] = value
                else
                        return @flags[key]

        # Returns a compact string describing this node (and, unless shallow,
        # its attributes and children).
        # shallow: only output the type and name of tags
        # show_ids: include "#id," after tag names
        serialize: (shallow = false, show_ids = false) -> # for unit tests
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                if show_ids
                                        ret += "##{@id},"
                                if shallow
                                        break
                                # sort attribute names so the output is stable
                                attr_keys = []
                                for k of @attrs
                                        attr_keys.push k
                                attr_keys.sort()
                                ret += '{'
                                sep = ''
                                for k in attr_keys
                                        ret += sep
                                        sep = ','
                                        ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
                                ret += '},['
                                sep = ''
                                for c in @children
                                        ret += sep
                                        sep = ','
                                        ret += c.serialize shallow, show_ids
                                ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                # public_identifier/system_identifier are not set by the
                                # constructor, hence the fallbacks to ''
                                ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
                        when TYPE_AFE_MARKER
                                ret += 'marker'
                        when TYPE_AAA_BOOKMARK
                                ret += 'aaa_bookmark'
                        else
                                ret += 'unknown:'
                                console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
                return ret
154
155 # helpers: (only take args that are normally known when parser creates nodes)
# Factory shortcuts for the node/token kinds the parser creates. Each one
# simply wraps the Node constructor with the matching TYPE_* constant.
new_open_tag = (name) -> new Node(TYPE_START_TAG, {name: name})
new_end_tag = (name) -> new Node(TYPE_END_TAG, {name: name})
new_element = (name) -> new Node(TYPE_TAG, {name: name})
new_text_node = (txt) -> new Node(TYPE_TEXT, {text: txt})
# character tokens and text nodes are the same thing
new_character_token = new_text_node
new_comment_token = (txt) -> new Node(TYPE_COMMENT, {text: txt})
new_doctype_token = (name) -> new Node(TYPE_DOCTYPE, {name: name})
new_eof_token = -> new Node(TYPE_EOF)
new_afe_marker = -> new Node(TYPE_AFE_MARKER)
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
175
# character classes, spelled out as strings so membership is an indexOf()
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# true only for a single ASCII upper-case letter
is_uc_alpha = (str) ->
        return false unless str.length is 1
        return uc_alpha.indexOf(str) >= 0

# true only for a single ASCII lower-case letter
is_lc_alpha = (str) ->
        return false unless str.length is 1
        return lc_alpha.indexOf(str) >= 0

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
189
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
# tab, line feed, form feed, carriage return, space
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# true when txt is exactly one space character
is_space = (txt) ->
        return false unless txt.length is 1
        return space_chars.indexOf(txt) >= 0

# true when t is a text token holding exactly one space character
is_space_tok = (t) ->
        return false unless t.type is TYPE_TEXT
        return false unless t.text.length is 1
        return space_chars.indexOf(t.text) >= 0
196
# Returns true when t is a start tag token with a "type" attribute whose
# value is "hidden" (compared case-insensitively), false otherwise.
# Only the token's in-progress attribute list (attrs_a, [key, value] pairs)
# is consulted; the tag name is not checked here.
is_input_hidden_tok = (t) ->
        return false unless t.type is TYPE_START_TAG
        # "in", not "of": we want the [key, value] pairs, not the array indexes
        for a in t.attrs_a
                if a[0] is 'type'
                        return a[1].toLowerCase() is 'hidden'
        return false
205
# Unicode whitespace characters (a superset of space_chars), taken from:
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
208
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
211 legacy_char_refs = {
212         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
229         yen: '¥', yuml: 'ÿ'
230 }
231
# Element name lists, grouped by the kind of content the element holds.
# NOTE(review): the users of these lists are outside this view.
void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
236 svg_elements = [
237         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
251         'view', 'vkern'
252 ]
253
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
255 mathml_elements = [
256         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262         'determinant', 'diff', 'divergence', 'divide', 'domain',
263         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283         'vectorproduct', 'xor'
284 ]
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
287
# Maps element name -> namespace constant. el_is_special() treats an element
# as special when the value stored under its name equals its namespace.
# NOTE(review): "title" is listed twice below (HTML and SVG). An object can
# only hold one value per key and the later entry wins, so the table itself
# only records title as NS_SVG.
special_elements = {
        # HTML:
        address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
        aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
        blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
        caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
        details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
        embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
        footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
        h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
        header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
        noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
        ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
        script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
        style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
        template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
        thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
        wbr:NS_HTML, xmp:NS_HTML,

        # MathML:
        mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
        'annotation-xml':NS_MATHML,

        # SVG:
        foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}
316
# Set (name -> true) of formatting element names.
# NOTE(review): presumably consulted for the list of active formatting
# elements; the lookups are outside this view.
formatting_elements = {
         a: true, b: true, big: true, code: true, em: true, font: true, i: true,
         nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
         u: true
}
322
# MathML text integration points: element name -> namespace it must be in
mathml_text_integration = {
        mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
}

# Returns true when el is one of the MathML text integration point elements
# (mi, mo, mn, ms, mtext in the MathML namespace).
is_mathml_text_integration_point = (el) ->
        # compare ("is"), don't assign: "=" would overwrite the table entry
        # and report every element as an integration point
        return mathml_text_integration[el.name] is el.namespace
# Returns true when el is an HTML integration point:
#  - a MathML annotation-xml element whose "encoding" attribute is text/html
#    or application/xhtml+xml (compared case-insensitively), or
#  - an SVG foreignObject, desc or title element.
# Reads el.attrs, so this needs a tree node rather than a token.
is_html_integration = (el) -> # DON'T PASS A TOKEN
        if el.namespace is NS_SVG
                return el.name in ['foreignObject', 'desc', 'title']
        unless el.namespace is NS_MATHML and el.name is 'annotation-xml'
                return false
        encoding = el.attrs.encoding
        return false unless encoding?
        return encoding.toLowerCase() in ['text/html', 'application/xhtml+xml']
340
# heading elements: name -> namespace
h_tags = {
        h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
}

# Set of element names keyed by name only (no namespace recorded yet).
# NOTE(review): presumably the elements that trigger foster parenting when
# they are the insertion target -- confirm against the users of this table.
# FIXME namespacify
foster_parenting_targets = {
        table: true
        tbody: true
        tfoot: true
        thead: true
        tr: true
}

# Set of element names keyed by name only (no namespace recorded yet).
# NOTE(review): presumably the elements closed by "generate implied end
# tags" -- confirm against the users of this table.
# FIXME namespacify
# all html I presume
end_tag_implied = {
        dd: true
        dt: true
        li: true
        option: true
        optgroup: true
        p: true
        rb: true
        rp: true
        rt: true
        rtc: true
}
368
# Returns true when element e is in the "special" category.
el_is_special = (e) ->
        # special_elements lists "title" for both HTML and SVG, but an object
        # holds only one value per key and the SVG entry (the later one) wins,
        # so the HTML title element has to be recognised explicitly.
        return true if e.name is 'title' and e.namespace is NS_HTML
        return special_elements[e.name] is e.namespace

# address, div and p (HTML): the special elements that el_is_special_not_adp
# deliberately excludes
adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }

# Returns true when el is special, but not an HTML address, div or p element.
el_is_special_not_adp = (el) ->
        return false if adp_els[el.name] is el.namespace
        return el_is_special el
375
376 svg_name_fixes = {
377         altglyph: 'altGlyph'
378         altglyphdef: 'altGlyphDef'
379         altglyphitem: 'altGlyphItem'
380         animatecolor: 'animateColor'
381         animatemotion: 'animateMotion'
382         animatetransform: 'animateTransform'
383         clippath: 'clipPath'
384         feblend: 'feBlend'
385         fecolormatrix: 'feColorMatrix'
386         fecomponenttransfer: 'feComponentTransfer'
387         fecomposite: 'feComposite'
388         feconvolvematrix: 'feConvolveMatrix'
389         fediffuselighting: 'feDiffuseLighting'
390         fedisplacementmap: 'feDisplacementMap'
391         fedistantlight: 'feDistantLight'
392         fedropshadow: 'feDropShadow'
393         feflood: 'feFlood'
394         fefunca: 'feFuncA'
395         fefuncb: 'feFuncB'
396         fefuncg: 'feFuncG'
397         fefuncr: 'feFuncR'
398         fegaussianblur: 'feGaussianBlur'
399         feimage: 'feImage'
400         femerge: 'feMerge'
401         femergenode: 'feMergeNode'
402         femorphology: 'feMorphology'
403         feoffset: 'feOffset'
404         fepointlight: 'fePointLight'
405         fespecularlighting: 'feSpecularLighting'
406         fespotlight: 'feSpotLight'
407         fetile: 'feTile'
408         feturbulence: 'feTurbulence'
409         foreignobject: 'foreignObject'
410         glyphref: 'glyphRef'
411         lineargradient: 'linearGradient'
412         radialgradient: 'radialGradient'
413         textpath: 'textPath'
414 }
415 svg_attribute_fixes = {
416         attributename: 'attributeName'
417         attributetype: 'attributeType'
418         basefrequency: 'baseFrequency'
419         baseprofile: 'baseProfile'
420         calcmode: 'calcMode'
421         clippathunits: 'clipPathUnits'
422         contentscripttype: 'contentScriptType'
423         contentstyletype: 'contentStyleType'
424         diffuseconstant: 'diffuseConstant'
425         edgemode: 'edgeMode'
426         externalresourcesrequired: 'externalResourcesRequired'
427         filterres: 'filterRes'
428         filterunits: 'filterUnits'
429         glyphref: 'glyphRef'
430         gradienttransform: 'gradientTransform'
431         gradientunits: 'gradientUnits'
432         kernelmatrix: 'kernelMatrix'
433         kernelunitlength: 'kernelUnitLength'
434         keypoints: 'keyPoints'
435         keysplines: 'keySplines'
436         keytimes: 'keyTimes'
437         lengthadjust: 'lengthAdjust'
438         limitingconeangle: 'limitingConeAngle'
439         markerheight: 'markerHeight'
440         markerunits: 'markerUnits'
441         markerwidth: 'markerWidth'
442         maskcontentunits: 'maskContentUnits'
443         maskunits: 'maskUnits'
444         numoctaves: 'numOctaves'
445         pathlength: 'pathLength'
446         patterncontentunits: 'patternContentUnits'
447         patterntransform: 'patternTransform'
448         patternunits: 'patternUnits'
449         pointsatx: 'pointsAtX'
450         pointsaty: 'pointsAtY'
451         pointsatz: 'pointsAtZ'
452         preservealpha: 'preserveAlpha'
453         preserveaspectratio: 'preserveAspectRatio'
454         primitiveunits: 'primitiveUnits'
455         refx: 'refX'
456         refy: 'refY'
457         repeatcount: 'repeatCount'
458         repeatdur: 'repeatDur'
459         requiredextensions: 'requiredExtensions'
460         requiredfeatures: 'requiredFeatures'
461         specularconstant: 'specularConstant'
462         specularexponent: 'specularExponent'
463         spreadmethod: 'spreadMethod'
464         startoffset: 'startOffset'
465         stddeviation: 'stdDeviation'
466         stitchtiles: 'stitchTiles'
467         surfacescale: 'surfaceScale'
468         systemlanguage: 'systemLanguage'
469         tablevalues: 'tableValues'
470         targetx: 'targetX'
471         targety: 'targetY'
472         textlength: 'textLength'
473         viewbox: 'viewBox'
474         viewtarget: 'viewTarget'
475         xchannelselector: 'xChannelSelector'
476         ychannelselector: 'yChannelSelector'
477         zoomandpan: 'zoomAndPan'
478 }
# Restore the mixed-case spelling of MathML's definitionURL attribute on
# token t. Renames in place inside t.attrs_a ([name, value] pairs) and
# returns nothing.
adjust_mathml_attributes = (t) ->
        for pair in t.attrs_a when pair[0] is 'definitionurl'
                pair[0] = 'definitionURL'
        return
# Restore the mixed-case spelling of SVG attribute names on token t, using
# the svg_attribute_fixes table. Renames in place inside t.attrs_a
# ([name, value] pairs) and returns nothing.
adjust_svg_attributes = (t) ->
        for a in t.attrs_a
                # Only use the table's own entries: a plain lookup would also
                # find inherited properties, so an attribute called
                # "constructor" would have its name replaced by a function.
                if Object::hasOwnProperty.call svg_attribute_fixes, a[0]
                        a[0] = svg_attribute_fixes[a[0]]
        return
# Placeholder: currently does nothing to token t and returns nothing.
adjust_foreign_attributes = (t) ->
        # fixfull
        return
492
493 # decode_named_char_ref()
494 #
495 # The list of named character references is _huge_ so ask the browser to decode
496 # for us instead of wasting bandwidth/space on including the table here.
497 #
498 # Pass without the "&" but with the ";" examples:
499 #    for "&amp" pass "amp;"
500 #    for "&#x2032" pass "x2032;"
# state for decode_named_char_ref(): results already looked up, plus the
# scratch <textarea> that does the actual decoding
g_dncr = {
        cache: {}
        textarea: document.createElement('textarea')
}
# TODO test this in IE8
# Decode one character reference by letting the browser parse it.
# txt: the reference without the leading "&" (e.g. "amp;")
# Returns the decoded text, or null when the browser left it unchanged.
# Successful results are cached.
decode_named_char_ref = (txt) ->
        reference = "&#{txt}"
        cached = g_dncr.cache[reference]
        if cached?
                return cached
        scratch = g_dncr.textarea
        scratch.innerHTML = reference
        result = scratch.value
        if result is reference
                return null
        g_dncr.cache[reference] = result
        return result
514
515 parse_html = (args) ->
        # Parser state. Everything is declared here, at the top of parse_html,
        # so that the nested functions below all share the same variables.
        txt = null # NOTE(review): presumably the html being parsed; assigned outside this view
        cur = null # index of next char in txt to be parsed
        # declare doc and tokenizer variables so they're in scope below
        doc = null
        open_els = null # stack of open elements
        afe = null # active formatting elements
        template_ins_modes = null
        ins_mode = null
        original_ins_mode = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag
        flag_scripting = null
        flag_frameset_ok = null
        flag_parsing = null # set to false by stop_parsing()
        flag_foster_parenting = null
        form_element_pointer = null
        temporary_buffer = null
        pending_table_character_tokens = null
        head_element_pointer = null
        flag_fragment_parsing = null
        context_element = null
534         head_element_pointer = null
535         flag_fragment_parsing = null
536         context_element = null
537
538         stop_parsing = ->
539                 flag_parsing = false
540
541         parse_error = ->
542                 if args.error_cb?
543                         args.error_cb cur
544                 else
545                         console.log "Parse error at character #{cur} of #{txt.length}"
546
547         afe_push = (new_el) ->
548                 matches = 0
549                 for el, i in afe
550                         if el.name is new_el.name and el.namespace is new_el.namespace
551                                 for k, v of el.attrs
552                                         continue unless new_el.attrs[k] is v
553                                 for k, v of new_el.attrs
554                                         continue unless el.attrs[k] is v
555                                 matches += 1
556                                 if matches is 3
557                                         afe.splice i, 1
558                                         break
559                 afe.unshift new_el
560         afe_push_marker = ->
561                 afe.unshift new_afe_marker()
562
563         # the functions below implement the Tree Construction algorithm
564         # http://www.w3.org/TR/html5/syntax.html#tree-construction
565
566         # But first... the helpers
567         template_tag_is_open = ->
568                 for t in open_els
569                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
570                                 return true
571                 return false
572         is_in_scope_x = (tag_name, scope, namespace) ->
573                 for t in open_els
574                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
575                                 return true
576                         if scope[t.name] is t.namespace
577                                 return false
578                 return false
579         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
580                 for t in open_els
581                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
582                                 return true
583                         if scope[t.name] is t.namespace
584                                 return false
585                         if scope2[t.name] is t.namespace
586                                 return false
587                 return false
        # Tables (element name -> namespace) of the elements that end a scope
        # search. They are handed to is_in_scope_x / is_in_scope_x_y by the
        # is_in_*_scope helpers.
        standard_scopers = {
                applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
                td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
                template: NS_HTML, mi: NS_MATHML,

                mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
                'annotation-xml': NS_MATHML,

                foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
        }
        # extra boundaries for button scope (used together with standard_scopers)
        button_scopers = button: NS_HTML
        # extra boundaries for list item scope (used together with standard_scopers)
        li_scopers = ol: NS_HTML, ul: NS_HTML
        # boundaries for table scope (used on their own)
        table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
601         is_in_scope = (tag_name, namespace = null) ->
602                 return is_in_scope_x tag_name, standard_scopers, namespace
603         is_in_button_scope = (tag_name, namespace = null) ->
604                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
605         is_in_table_scope = (tag_name, namespace = null) ->
606                 return is_in_scope_x tag_name, table_scopers, namespace
607         # aka is_in_list_item_scope
608         is_in_li_scope = (tag_name, namespace = null) ->
609                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
610         is_in_select_scope = (tag_name, namespace = null) ->
611                 for t in open_els
612                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
613                                 return true
614                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
615                                 return false
616                 return false
617         # this checks for a particular element, not by name
618         el_is_in_scope = (el) ->
619                 for t in open_els
620                         if t is el
621                                 return true
622                         if standard_scopers[t.name] is t.namespace
623                                 return false
624                 return false
625
626         clear_to_table_stopers = {
627                 'table': true
628                 'template': true
629                 'html': true
630         }
631         clear_stack_to_table_context = ->
632                 loop
633                         if clear_to_table_stopers[open_els[0].name]?
634                                 break
635                         open_els.shift()
636                 return
637         clear_to_table_body_stopers = {
638                 'tbody': true
639                 'tfoot': true
640                 'thead': true
641                 'template': true
642                 'html': true
643         }
644         clear_stack_to_table_body_context = ->
645                 loop
646                         if clear_to_table_body_stopers[open_els[0].name]?
647                                 break
648                         open_els.shift()
649                 return
650         clear_to_table_row_stopers = {
651                 'tr': true
652                 'template': true
653                 'html': true
654         }
655         clear_stack_to_table_row_context = ->
656                 loop
657                         if clear_to_table_row_stopers[open_els[0].name]?
658                                 break
659                         open_els.shift()
660                 return
661         clear_afe_to_marker = ->
662                 loop
663                         return unless afe.length > 0 # this happens in fragment case, ?spec error
664                         el = afe.shift()
665                         if el.type is TYPE_AFE_MARKER
666                                 return
667                 return
668
669         # 8.2.3.1 ...
670         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
            #
            # Walks open_els starting at index 0 (the current node) towards the end
            # of the array, and assigns ins_mode according to the first element name
            # that matches one of the numbered steps below. Every path out of the
            # loop is a return.
            #
            # Not implemented here (see the "fixfull" comments): replacing node with
            # the context element in the fragment case (step 3) and the template
            # insertion mode stack (step 11).
            # NOTE(review): assumes open_els is non-empty; node.name would throw on
            # an empty stack.
671         reset_ins_mode = ->
672                 # 1. Let last be false.
673                 last = false
674                 # 2. Let node be the last node in the stack of open elements.
675                 node_i = 0
676                 node = open_els[node_i]
677                 # 3. Loop: If node is the first node in the stack of open elements,
678                 # then set last to true, and, if the parser was originally created as
679                 # part of the HTML fragment parsing algorithm (fragment case) set node
680                 # to the context element.
681                 loop
682                         if node_i is open_els.length - 1
683                                 last = true
684                                 # fixfull (fragment case)
685
686                         # 4. If node is a select element, run these substeps:
687                         if node.name is 'select'
688                                 # 1. If last is true, jump to the step below labeled done.
689                                 unless last
690                                         # 2. Let ancestor be node.
691                                         ancestor_i = node_i
692                                         ancestor = node
693                                         # 3. Loop: If ancestor is the first node in the stack of
694                                         # open elements, jump to the step below labeled done.
695                                         loop
696                                                 if ancestor_i is open_els.length - 1
697                                                         break
698                                                 # 4. Let ancestor be the node before ancestor in the stack
699                                                 # of open elements.
700                                                 ancestor_i += 1
701                                                 ancestor = open_els[ancestor_i]
702                                                 # 5. If ancestor is a template node, jump to the step below
703                                                 # labeled done.
704                                                 if ancestor.name is 'template'
705                                                         break
706                                                 # 6. If ancestor is a table node, switch the insertion mode
707                                                 # to "in select in table" and abort these steps.
708                                                 if ancestor.name is 'table'
709                                                         ins_mode = ins_mode_in_select_in_table
710                                                         return
711                                                 # 7. Jump back to the step labeled loop.
712                                 # 8. Done: Switch the insertion mode to "in select" and abort
713                                 # these steps.
714                                 ins_mode = ins_mode_in_select
715                                 return
716                         # 5. If node is a td or th element and last is false, then switch
717                         # the insertion mode to "in cell" and abort these steps.
718                         if (node.name is 'td' or node.name is 'th') and last is false
719                                 ins_mode = ins_mode_in_cell
720                                 return
721                         # 6. If node is a tr element, then switch the insertion mode to "in
722                         # row" and abort these steps.
723                         if node.name is 'tr'
724                                 ins_mode = ins_mode_in_row
725                                 return
726                         # 7. If node is a tbody, thead, or tfoot element, then switch the
727                         # insertion mode to "in table body" and abort these steps.
728                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
729                                 ins_mode = ins_mode_in_table_body
730                                 return
731                         # 8. If node is a caption element, then switch the insertion mode
732                         # to "in caption" and abort these steps.
733                         if node.name is 'caption'
734                                 ins_mode = ins_mode_in_caption
735                                 return
736                         # 9. If node is a colgroup element, then switch the insertion mode
737                         # to "in column group" and abort these steps.
738                         if node.name is 'colgroup'
739                                 ins_mode = ins_mode_in_column_group
740                                 return
741                         # 10. If node is a table element, then switch the insertion mode to
742                         # "in table" and abort these steps.
743                         if node.name is 'table'
744                                 ins_mode = ins_mode_in_table
745                                 return
746                         # 11. If node is a template element, then switch the insertion mode
747                         # to the current template insertion mode and abort these steps.
748                         # fixfull (template insertion mode stack)
749
750                         # 12. If node is a head element and last is true, then switch the
751                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
752                         # these steps. (fragment case)
753                         if node.name is 'head' and last
754                                 ins_mode = ins_mode_in_body
755                                 return
756                         # 13. If node is a head element and last is false, then switch the
757                         # insertion mode to "in head" and abort these steps.
758                         if node.name is 'head' and last is false
759                                 ins_mode = ins_mode_in_head
760                                 return
761                         # 14. If node is a body element, then switch the insertion mode to
762                         # "in body" and abort these steps.
763                         if node.name is 'body'
764                                 ins_mode = ins_mode_in_body
765                                 return
766                         # 15. If node is a frameset element, then switch the insertion mode
767                         # to "in frameset" and abort these steps. (fragment case)
768                         if node.name is 'frameset'
769                                 ins_mode = ins_mode_in_frameset
770                                 return
771                         # 16. If node is an html element, run these substeps:
772                         if node.name is 'html'
773                                 # 1. If the head element pointer is null, switch the insertion
774                                 # mode to "before head" and abort these steps. (fragment case)
775                                 if head_element_pointer is null
776                                         ins_mode = ins_mode_before_head
777                                 else
778                                         # 2. Otherwise, the head element pointer is not null,
779                                         # switch the insertion mode to "after head" and abort these
780                                         # steps.
781                                         ins_mode = ins_mode_after_head
782                                 return
783                         # 17. If last is true, then switch the insertion mode to "in body"
784                         # and abort these steps. (fragment case)
785                         if last
786                                 ins_mode = ins_mode_in_body
787                                 return
788                         # 18. Let node now be the node before node in the stack of open
789                         # elements.
790                         node_i += 1
791                         node = open_els[node_i]
792                         # 19. Return to the step labeled loop.
793
794         # 8.2.3.2
795
796         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
            #
            # Returns context_element when flag_fragment_parsing is set and open_els
            # holds exactly one element; otherwise returns open_els[0] (the current
            # node), which is undefined when open_els is empty.
797         adjusted_current_node = ->
798                 if open_els.length is 1 and flag_fragment_parsing
799                         return context_element
800                 return open_els[0]
801
802         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
803         # this implementation is structured (mostly) as described at the link above.
804         # capitalized comments are the "labels" described at the link above.
            #
            # afe is scanned from index 0 upward. Nothing happens if afe is empty, or
            # if afe[0] is a marker or is already in open_els. Otherwise every entry
            # from the one just before the first marker/open element (or from the end
            # of afe) back down to index 0 is re-created with insert_html_element and
            # the afe slot is overwritten with the new element.
805         reconstruct_afe = ->
806                 return if afe.length is 0
807                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
808                         return
809                 # Rewind
810                 i = 0
811                 loop
                            # reached the far end of afe: start creating from here
812                         if i is afe.length - 1
813                                 break
814                         i += 1
815                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
                                    # step back to the last entry that still needs re-creating
816                                 i -= 1 # Advance
817                                 break
818                 # Create
819                 loop
820                         el = insert_html_element afe[i].token
821                         afe[i] = el
822                         break if i is 0
823                         i -= 1 # Advance
824
825         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
826         # adoption agency algorithm
827         # overview here:
828         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
829         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
830         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
            #
            # subject is compared against element .name values, i.e. it is the tag
            # name being closed. The function mutates open_els, afe and the
            # parent/children links of the affected nodes; it has no meaningful
            # return value.
            #
            # Abbreviations used below:
            #   fe = formatting element, fb = furthest block, ca = common ancestor
831         adoption_agency = (subject) ->
832                 debug_log "adoption_agency()"
833                 debug_log "tree: #{serialize_els doc.children, false, true}"
834                 debug_log "open_els: #{serialize_els open_els, true, true}"
835                 debug_log "afe: #{serialize_els afe, true, true}"
                    # Shortcut: the current node already has the subject's tag name, so
                    # just pop it (and drop it from afe if present).
836                 if open_els[0].name is subject
837                         el = open_els[0]
838                         open_els.shift()
839                         # remove it from the list of active formatting elements (if found)
840                         for t, i in afe
841                                 if t is el
842                                         afe.splice i, 1
843                                         break
844                         debug_log "aaa: starting off with subject on top of stack, exiting"
845                         return
                    # Outer loop: runs at most 8 times; every way out of it is a return.
846                 outer = 0
847                 loop
848                         if outer >= 8
849                                 return
850                         outer += 1
851                         # 5. Let formatting element be the last element in the list of
852                         # active formatting elements that: is between the end of the list
853                         # and the last scope marker in the list, if any, or the start of
854                         # the list otherwise, and  has the tag name subject.
855                         fe = null
856                         for t, fe_of_afe in afe
857                                 if t.type is TYPE_AFE_MARKER
858                                         break
859                                 if t.name is subject
860                                         fe = t
861                                         break
862                         # If there is no such element, then abort these steps and instead
863                         # act as described in the "any other end tag" entry above.
864                         if fe is null
865                                 debug_log "aaa: fe not found in afe"
866                                 in_body_any_other_end_tag subject
867                                 return
868                         # 6. If formatting element is not in the stack of open elements,
869                         # then this is a parse error; remove the element from the list, and
870                         # abort these steps.
871                         in_open_els = false
872                         for t, fe_of_open_els in open_els
873                                 if t is fe
874                                         in_open_els = true
875                                         break
876                         unless in_open_els
877                                 debug_log "aaa: fe not found in open_els"
878                                 parse_error()
879                                 # "remove it from the list" must mean afe, since it's not in open_els
880                                 afe.splice fe_of_afe, 1
881                                 return
882                         # 7. If formatting element is in the stack of open elements, but
883                         # the element is not in scope, then this is a parse error; abort
884                         # these steps.
885                         unless el_is_in_scope fe
886                                 debug_log "aaa: fe not in scope"
887                                 parse_error()
888                                 return
889                         # 8. If formatting element is not the current node, this is a parse
890                         # error. (But do not abort these steps.)
891                         unless open_els[0] is fe
892                                 parse_error()
893                                 # continue
894                         # 9. Let furthest block be the topmost node in the stack of open
895                         # elements that is lower in the stack than formatting element, and
896                         # is an element in the special category. There might not be one.
897                         fb = null
898                         fb_of_open_els = null
899                         for t, i in open_els
900                                 if t is fe
901                                         break
902                                 if el_is_special t
903                                         fb = t
904                                         fb_of_open_els = i
905                                         # and continue, to see if there's one that's more "topmost"
906                         # 10. If there is no furthest block, then the UA must first pop all
907                         # the nodes from the bottom of the stack of open elements, from the
908                         # current node up to and including formatting element, then remove
909                         # formatting element from the list of active formatting elements,
910                         # and finally abort these steps.
911                         if fb is null
912                                 debug_log "aaa: no fb"
913                                 loop
914                                         t = open_els.shift()
915                                         if t is fe
916                                                 afe.splice fe_of_afe, 1
917                                                 return
918                         # 11. Let common ancestor be the element immediately above
919                         # formatting element in the stack of open elements.
920                         ca = open_els[fe_of_open_els + 1] # common ancestor
921
922                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
923                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
924                         bookmark = new_aaa_bookmark()
925                         for t, i in afe
926                                 if t is fe
927                                         afe.splice i, 0, bookmark
928                                         break
                            # 13. Inner loop. The numbered comments inside it are the
                            # substeps of step 13.
929                         node = last_node = fb
930                         inner = 0
931                         loop
932                                 inner += 1
933                                 # 3. Let node be the element immediately above node in the
934                                 # stack of open elements, or if node is no longer in the stack
935                                 # of open elements (e.g. because it got removed by this
936                                 # algorithm), the element that was immediately above node in
937                                 # the stack of open elements before node was removed.
938                                 node_next = null
939                                 for t, i in open_els
940                                         if t is node
941                                                 node_next = open_els[i + 1]
942                                                 break
                                    # fall back to the remembered neighbour when node was not
                                    # found in open_els (or had nothing above it)
943                                 node = node_next ? node_above
944                                 debug_log "inner loop #{inner}"
945                                 debug_log "tree: #{serialize_els doc.children, false, true}"
946                                 debug_log "open_els: #{serialize_els open_els, true, true}"
947                                 debug_log "afe: #{serialize_els afe, true, true}"
948                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
949                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
950                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
951                                 debug_log "node: #{node.serialize true, true}"
952                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
953
954                                 # 4. If node is formatting element, then go to the next step in
955                                 # the overall algorithm.
956                                 if node is fe
957                                         break
958                                 debug_log "the meat"
959                                 # 5. If inner loop counter is greater than three and node is in
960                                 # the list of active formatting elements, then remove node from
961                                 # the list of active formatting elements.
962                                 node_in_afe = false
963                                 for t, i in afe
964                                         if t is node
965                                                 if inner > 3
966                                                         afe.splice i, 1
967                                                         debug_log "max out inner"
968                                                 else
969                                                         node_in_afe = true
970                                                         debug_log "in afe"
971                                                 break
972                                 # 6. If node is not in the list of active formatting elements,
973                                 # then remove node from the stack of open elements and then go
974                                 # back to the step labeled inner loop.
975                                 unless node_in_afe
976                                         debug_log "not in afe"
977                                         for t, i in open_els
978                                                 if t is node
979                                                         node_above = open_els[i + 1]
980                                                         open_els.splice i, 1
981                                                         break
982                                         continue
983                                 debug_log "the bones"
984                                 # 7. create an element for the token for which the element node
985                                 # was created, in the HTML namespace, with common ancestor as
986                                 # the intended parent; replace the entry for node in the list
987                                 # of active formatting elements with an entry for the new
988                                 # element, replace the entry for node in the stack of open
989                                 # elements with an entry for the new element, and let node be
990                                 # the new element.
991                                 new_node = token_to_element node.token, NS_HTML, ca
992                                 for t, i in afe
993                                         if t is node
994                                                 afe[i] = new_node
995                                                 debug_log "replaced in afe"
996                                                 break
997                                 for t, i in open_els
998                                         if t is node
999                                                 node_above = open_els[i + 1]
1000                                                 open_els[i] = new_node
1001                                                 debug_log "replaced in open_els"
1002                                                 break
1003                                 node = new_node
1004                                 # 8. If last node is furthest block, then move the
1005                                 # aforementioned bookmark to be immediately after the new node
1006                                 # in the list of active formatting elements.
1007                                 if last_node is fb
1008                                         for t, i in afe
1009                                                 if t is bookmark
1010                                                         afe.splice i, 1
1011                                                         debug_log "removed bookmark"
1012                                                         break
1013                                         for t, i in afe
1014                                                 if t is node
1015                                                         # "after" means lower
1016                                                         afe.splice i, 0, bookmark # bookmark takes index i, node moves to i + 1
1017                                                         debug_log "placed bookmark after node"
1018                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1019                                                         break
1020                                 # 9. Insert last node into node, first removing it from its
1021                                 # previous parent node if any.
1022                                 if last_node.parent?
1023                                         debug_log "last_node has parent"
1024                                         for c, i in last_node.parent.children
1025                                                 if c is last_node
1026                                                         debug_log "removing last_node from parent"
1027                                                         last_node.parent.children.splice i, 1
1028                                                         break
1029                                 node.children.push last_node
1030                                 last_node.parent = node
1031                                 # 10. Let last node be node.
1032                                 last_node = node
1033                                 debug_log "at last"
1034                                 # 11. Return to the step labeled inner loop.
1035                         # 14. Insert whatever last node ended up being in the previous step
1036                         # at the appropriate place for inserting a node, but using common
1037                         # ancestor as the override target.
1038
1039                         # In the case where fe is immediately followed by fb:
1040                         #   * inner loop exits out early (node==fe)
1041                         #   * last_node is fb
1042                         #   * last_node is still in the tree (not a duplicate)
1043                         if last_node.parent?
1044                                 debug_log "FEFIRST? last_node has parent"
1045                                 for c, i in last_node.parent.children
1046                                         if c is last_node
1047                                                 debug_log "removing last_node from parent"
1048                                                 last_node.parent.children.splice i, 1
1049                                                 break
1050
1051                         debug_log "after aaa inner loop"
1052                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1053                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1054                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1055                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1056                         debug_log "tree: #{serialize_els doc.children, false, true}"
1057
1058                         debug_log "insert"
1059
1060
1061                         # can't use the standard insert-token path here, because last_node
1062                         # is already in open_els and must stay at its current position there
1063                         dest = adjusted_insertion_location ca
1064                         dest[0].children.splice dest[1], 0, last_node
1065                         last_node.parent = dest[0]
1066
1067
1068                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1069                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1070                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1071                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1072                         debug_log "tree: #{serialize_els doc.children, false, true}"
1073
1074                         # 15. Create an element for the token for which formatting element
1075                         # was created, in the HTML namespace, with furthest block as the
1076                         # intended parent.
1077                         new_element = token_to_element fe.token, NS_HTML, fb
1078                         # 16. Take all of the child nodes of furthest block and append them
1079                         # to the element created in the last step.
1080                         while fb.children.length
1081                                 t = fb.children.shift()
1082                                 t.parent = new_element
1083                                 new_element.children.push t
1084                         # 17. Append that new element to furthest block.
1085                         new_element.parent = fb
1086                         fb.children.push new_element
1087                         # 18. Remove formatting element from the list of active formatting
1088                         # elements, and insert the new element into the list of active
1089                         # formatting elements at the position of the aforementioned
1090                         # bookmark.
1091                         for t, i in afe
1092                                 if t is fe
1093                                         afe.splice i, 1
1094                                         break
1095                         for t, i in afe
1096                                 if t is bookmark
1097                                         afe[i] = new_element
1098                                         break
1099                         # 19. Remove formatting element from the stack of open elements,
1100                         # and insert the new element into the stack of open elements
1101                         # immediately below the position of furthest block in that stack.
1102                         for t, i in open_els
1103                                 if t is fe
1104                                         open_els.splice i, 1
1105                                         break
1106                         for t, i in open_els
1107                                 if t is fb
1108                                         open_els.splice i, 0, new_element
1109                                         break
1110                         # 20. Jump back to the step labeled outer loop.
1111                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1112                         debug_log "tree: #{serialize_els doc.children, false, true}"
1113                         debug_log "open_els: #{serialize_els open_els, true, true}"
1114                         debug_log "afe: #{serialize_els afe, true, true}"
                     # NOTE(review): the line below is never reached; the outer loop has
                     # no break at its own level and only exits via return.
1115                 debug_log "AAA DONE"
1116
1117         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
             #
             # Generates implied end tags (except for 'p'), reports a parse error if
             # the current node is then not a 'p', and removes entries from the front
             # of open_els up to and including the first 'p'. The last remaining
             # entry of open_els is never removed.
1118         close_p_element = ->
1119                 generate_implied_end_tags 'p' # arg is exception
1120                 if open_els[0].name isnt 'p'
1121                         parse_error()
1122                 while open_els.length > 1 # just in case
1123                         el = open_els.shift()
1124                         if el.name is 'p'
1125                                 return
1126         close_p_if_in_button_scope = ->
                     # Calls close_p_element only when is_in_button_scope 'p' is truthy.
1127                 if is_in_button_scope 'p'
1128                         close_p_element()
1129
1130         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1131         # aka insert_a_character = (t) ->
             #
             # t is a token with a .text property. If the node just before the
             # insertion point is a text node, t.text is appended to it; otherwise t
             # itself is spliced into the destination's children.
             # dest is used as [parent node, index into parent.children].
             # NOTE(review): t.parent is not set here, unlike the insertions done in
             # adoption_agency -- confirm whether text nodes are expected to carry a
             # parent link.
1132         insert_character = (t) ->
1133                 dest = adjusted_insertion_location()
1134                 # fixfull check for Document node
1135                 if dest[1] > 0
1136                         prev = dest[0].children[dest[1] - 1]
1137                         if prev.type is TYPE_TEXT
                                     # merge into the adjacent text node instead of adding a sibling
1138                                 prev.text += t.text
1139                                 return
1140                 dest[0].children.splice dest[1], 0, t
1141
1142
1143         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
             #
             # Tree construction dispatcher: hands token t either to the current
             # insertion mode (ins_mode) or to in_foreign_content, depending on the
             # adjusted current node and the token type.
             #
             # t goes to ins_mode when any of these hold:
             #   * there is no adjusted current node (open_els is empty)
             #   * the adjusted current node is in the HTML namespace
             #   * it is a MathML text integration point and t is a start tag whose
             #     name is NEITHER "mglyph" NOR "malignmark", or t is a text token
             #   * it is a MathML annotation-xml element and t is an <svg> start tag
             #   * it is an HTML integration point and t is a start tag or text token
             #   * t is the end-of-file token
             # Everything else goes to in_foreign_content.
1144         process_token = (t) ->
1145                 acn = adjusted_current_node()
1146                 unless acn?
1147                         ins_mode t
1148                         return
1149                 if acn.namespace is NS_HTML
1150                         ins_mode t
1151                         return
1152                 if is_mathml_text_integration_point(acn)
                             # mglyph and malignmark start tags must stay in foreign content;
                             # every other start tag is handled as HTML content
1153                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1154                                 ins_mode t
1155                                 return
1156                         if t.type is TYPE_TEXT
1157                                 ins_mode t
1158                                 return
1159                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1160                         ins_mode t
1161                         return
1162                 if is_html_integration acn
1163                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1164                                 ins_mode t
1165                                 return
1166                 if t.type is TYPE_EOF
1167                         ins_mode t
1168                         return
1169                 in_foreign_content t
1170                 return
1171
1172         # 8.2.5.1
1173         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1174         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1175         adjusted_insertion_location = (override_target = null) ->
                     # Compute the "appropriate place for inserting a node".
                     #
                     # override_target: optional node to use instead of the current node
                     # (open_els[0]) as the starting target.
                     #
                     # Returns a two element array [node, index]: the node whose
                     # ``children`` array should receive the new child, and the index within
                     # that array at which to splice it in.
1176                 # 1. If there was an override target specified, then let target be the
1177                 # override target.
1178                 if override_target?
1179                         target = override_target
1180                 else # Otherwise, let target be the current node.
1181                         target = open_els[0]
1182                 # 2. Determine the adjusted insertion location using the first matching
1183                 # steps from the following list:
1184                 #
1185                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1186                 # thead, or tr element Foster parenting happens when content is
1187                 # misnested in tables.
1188                 if flag_foster_parenting and foster_parenting_targets[target.name]
1189                         loop # once. this is here so we can ``break`` to "abort these substeps"
1190                                 # 1. Let last template be the last template element in the
1191                                 # stack of open elements, if any.
                                     # (index 0 is the current node, so the first match while
                                     # scanning upward from 0 is the most recently added one)
1192                                 last_template = null
1193                                 last_template_i = null
1194                                 for el, i in open_els
1195                                         if el.name is 'template' and el.namespace is NS_HTML
1196                                                 last_template = el
1197                                                 last_template_i = i
1198                                                 break
1199                                 # 2. Let last table be the last table element in the stack of
1200                                 # open elements, if any.
1201                                 last_table = null
1202                                 last_table_i = null
1203                                 for el, i in open_els
1204                                         if el.name is 'table' and el.namespace is NS_HTML
1205                                                 last_table = el
1206                                                 last_table_i = i
1207                                                 break
1208                                 # 3. If there is a last template and either there is no last
1209                                 # table, or there is one, but last template is lower (more
1210                                 # recently added) than last table in the stack of open
1211                                 # elements, then: let adjusted insertion location be inside
1212                                 # last template's template contents, after its last child (if
1213                                 # any), and abort these substeps.
1214                                 if last_template and (last_table is null or last_template_i < last_table_i)
1215                                         target = last_template # fixfull should be its contents
1216                                         target_i = target.children.length
1217                                         break
1218                                 # 4. If there is no last table, then let adjusted insertion
1219                                 # location be inside the first element in the stack of open
1220                                 # elements (the html element), after its last child (if any),
1221                                 # and abort these substeps. (fragment case)
1222                                 if last_table is null
1223                                         # this is odd
1224                                         target = open_els[open_els.length - 1]
1225                                         target_i = target.children.length
1226                                         break
1227                                 # 5. If last table has a parent element, then let adjusted
1228                                 # insertion location be inside last table's parent element,
1229                                 # immediately before last table, and abort these substeps.
1230                                 if last_table.parent?
1231                                         for c, i in last_table.parent.children
1232                                                 if c is last_table
1233                                                         target = last_table.parent
1234                                                         target_i = i
1235                                                         break
1236                                         break
1237                                 # 6. Let previous element be the element immediately above last
1238                                 # table in the stack of open elements.
1239                                 #
1240                                 # huh? how could it not have a parent?
1241                                 previous_element = open_els[last_table_i + 1]
1242                                 # 7. Let adjusted insertion location be inside previous
1243                                 # element, after its last child (if any).
1244                                 target = previous_element
1245                                 target_i = target.children.length
1246                                 # Note: These steps are involved in part because it's possible
1247                                 # for elements, the table element in this case in particular,
1248                                 # to have been moved by a script around in the DOM, or indeed
1249                                 # removed from the DOM entirely, after the element was inserted
1250                                 # by the parser.
1251                                 break # don't really loop
1252                 else
1253                         # Otherwise Let adjusted insertion location be inside target, after
1254                         # its last child (if any).
1255                         target_i = target.children.length
1256
1257                 # 3. If the adjusted insertion location is inside a template element,
1258                 # let it instead be inside the template element's template contents,
1259                 # after its last child (if any).
1260                 # fixfull (template)
1261
1262                 # 4. Return the adjusted insertion location.
1263                 return [target, target_i]
1264
1265         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1266         # aka create_an_element_for_token
1267         token_to_element = (t, namespace, intended_parent) ->
                     # Build a TYPE_TAG Node for start tag token ``t`` in ``namespace``.
                     # ``intended_parent`` is accepted for parity with the spec but is not
                     # used here. The new node is returned; it is not attached to anything.
1268                 # flatten the token's [name, value] pairs into a hash
1269                 attrs = {}
1270                 attrs[a[0]] = a[1] for a in t.attrs_a # TODO check what to do with duplicate attrs
1271
1272                 # TODO 2. If the newly created element has an xmlns attribute in the
1273                 # XMLNS namespace whose value is not exactly the same as the element's
1274                 # namespace, that is a parse error. Similarly, if the newly created
1275                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1276                 # value is not the XLink Namespace, that is a parse error.
1277
1278                 # fixfull: the spec says stuff about form pointers and ownerDocument
1279
1280                 return new Node(TYPE_TAG, {name: t.name, namespace: namespace, attrs: attrs, token: t})
1283
1284         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1285         insert_foreign_element = (token, namespace) ->
                     # Create an element for ``token`` in ``namespace``, splice it into the
                     # tree at the adjusted insertion location, push it onto the stack of
                     # open elements (making it the current node) and return it.
1286                 ail = adjusted_insertion_location() # [parent node, index in parent.children]
1287                 el = token_to_element(token, namespace, ail[0])
1288                 # TODO skip this next step if it's broken (eg ail[0] is document with child already)
1289                 el.parent = ail[0]
1290                 ail[0].children.splice(ail[1], 0, el)
1291                 open_els.unshift(el)
1292                 el
1295         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1296         insert_html_element = (token) ->
                     # Same as insert_foreign_element, with the namespace fixed to NS_HTML.
                     # Returns the newly inserted element.
1297                 return insert_foreign_element(token, NS_HTML)
1298
1299         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1300         # position should be [node, index_within_children]
1301         insert_comment = (t, position = null) ->
                     # Splice comment token ``t`` into the tree at ``position``
                     # ([node, index_within_children]); when no position is given, the
                     # adjusted insertion location is used.
1302                 unless position?
1303                         position = adjusted_insertion_location()
                     return position[0].children.splice(position[1], 0, t)
1304
1305         # 8.2.5.2
1306         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1307         parse_generic_raw_text = (t) ->
                     # Generic raw text element parsing: insert an HTML element for ``t``,
                     # switch the tokenizer to the RAWTEXT state, and switch to the "text"
                     # insertion mode after saving the current mode in original_ins_mode.
1308                 original_ins_mode = ins_mode
1309                 insert_html_element(t)
1310                 tok_state = tok_state_rawtext
1311                 ins_mode = ins_mode_text
1312         parse_generic_rcdata_text = (t) ->
                     # Generic RCDATA element parsing: insert an HTML element for ``t``,
                     # switch the tokenizer to the RCDATA state, and switch to the "text"
                     # insertion mode after saving the current mode in original_ins_mode.
1313                 original_ins_mode = ins_mode
1314                 insert_html_element(t)
1315                 tok_state = tok_state_rcdata
1316                 ins_mode = ins_mode_text
1317
1318         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1319         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1320         generate_implied_end_tags = (except = null) ->
                     # Pop the current node off the stack of open elements for as long as
                     # its name is listed in end_tag_implied, stopping early at an element
                     # whose name equals ``except``.
1321                 until (not end_tag_implied[open_els[0].name]) or open_els[0].name is except
1322                         open_els.shift()
1323
1324         # 8.2.5.4 The rules for parsing tokens in HTML content
1325         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1326
1327         # 8.2.5.4.1 The "initial" insertion mode
1328         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1329         ins_mode_initial = (t) ->
                     # "initial" insertion mode: whitespace is dropped, comments and
                     # doctypes are appended to ``doc``, and everything else is reprocessed
                     # in the "before html" mode.
1330                 return if is_space_tok t
1331                 switch t.type
1332                         when TYPE_COMMENT
1333                                 # ?fixfull
1334                                 doc.children.push t
1335                         when TYPE_DOCTYPE
1336                                 # FIXME check identifiers, set quirks, etc
1337                                 # fixfull
1338                                 doc.children.push t
1339                                 ins_mode = ins_mode_before_html
1340                         else
1341                                 # Anything else
1342                                 #fixfull (iframe, quirks)
1343                                 ins_mode = ins_mode_before_html
1344                                 process_token t
1345                 return
1347
1348         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1349         ins_mode_before_html = (t) ->
                     # "before html" insertion mode: creates the root html element, either
                     # from an explicit <html> start tag or implicitly, then moves on to the
                     # "before head" mode.
1350                 switch t.type
1351                         when TYPE_DOCTYPE
1352                                 parse_error()
1353                                 return
1354                         when TYPE_COMMENT
1355                                 doc.children.push t
1356                                 return
1357                 return if is_space_tok t
1358                 if t.type is TYPE_START_TAG and t.name is 'html'
1359                         el = token_to_element t, NS_HTML, doc
1360                         doc.children.push el
1361                         open_els.unshift(el)
1362                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1363                         ins_mode = ins_mode_before_head
1364                         return
1365                 # end tags other than head/body/html/br are a parse error and are
1366                 # dropped; those four fall through to "anything else"
1367                 if t.type is TYPE_END_TAG and t.name not in ['head', 'body', 'html', 'br']
1368                         parse_error()
1369                         return
1370                 # Anything else: synthesize the html element, then reprocess the token
1371                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1372                 doc.children.push el
1373                 open_els.unshift el
1374                 # ?fixfull browsing context
1375                 ins_mode = ins_mode_before_head
1376                 process_token t
1377                 return
1380
1381         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1382         ins_mode_before_head = (t) ->
                     # "before head" insertion mode: inserts the head element (from an
                     # explicit <head> start tag or implicitly), records it in
                     # head_element_pointer and switches to the "in head" mode.
1383                 return if is_space_tok t
1384                 switch t.type
1385                         when TYPE_COMMENT
1386                                 insert_comment t
1387                                 return
1388                         when TYPE_DOCTYPE
1389                                 parse_error()
1390                                 return
1391                         when TYPE_START_TAG
1392                                 if t.name is 'html'
1393                                         ins_mode_in_body t
1394                                         return
1395                                 if t.name is 'head'
1396                                         head_element_pointer = insert_html_element t
1397                                         ins_mode = ins_mode_in_head
1398                                         return
1399                         when TYPE_END_TAG
1400                                 # only these four fall through to "Anything else" below
1401                                 unless t.name in ['head', 'body', 'html', 'br']
1402                                         parse_error()
1403                                         return
1404                 # Anything else: synthesize a head element, then reprocess the token
1405                 head_element_pointer = insert_html_element new_open_tag('head')
1406                 ins_mode = ins_mode_in_head
1407                 process_token t
1411
1412         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1413         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
                     # "Anything else" branch of the "in head" mode: pop the current node,
                     # switch to "after head" and reprocess the token there.
1414                 ins_mode = ins_mode_after_head
1415                 open_els.shift() # spec says this will be a 'head' node
1416                 return process_token(t)
1417         ins_mode_in_head = (t) ->
                     # "in head" insertion mode. Handles the metadata elements (base, link,
                     # meta, title, style, script, noscript, template, ...) and hands
                     # anything it does not recognise to ins_mode_in_head_else, which pops
                     # the current node and reprocesses the token in "after head".
                     # Also invoked directly by other insertion modes for these tokens.
1418                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1419                         insert_character t
1420                         return
1421                 if t.type is TYPE_COMMENT
1422                         insert_comment t
1423                         return
1424                 if t.type is TYPE_DOCTYPE
1425                         parse_error()
1426                         return
1427                 if t.type is TYPE_START_TAG and t.name is 'html'
1428                         ins_mode_in_body t
1429                         return
1430                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
                             # void element: insert it, then immediately pop it again
1431                         el = insert_html_element t
1432                         open_els.shift()
1433                         t.acknowledge_self_closing()
1434                         return
1435                 if t.type is TYPE_START_TAG and t.name is 'meta'
1436                         el = insert_html_element t
1437                         open_els.shift()
1438                         t.acknowledge_self_closing()
1439                         # fixfull encoding stuff
1440                         return
1441                 if t.type is TYPE_START_TAG and t.name is 'title'
1442                         parse_generic_rcdata_text t
1443                         return
1444                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1445                         parse_generic_raw_text t
1446                         return
1447                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1448                         insert_html_element t
1449                         ins_mode = ins_mode_in_head_noscript
1450                         return
1451                 if t.type is TYPE_START_TAG and t.name is 'script'
1452                         ail = adjusted_insertion_location() # [parent node, index in parent.children]
                             # the intended parent is the node, not the [node, index] pair
1453                         el = token_to_element t, NS_HTML, ail[0]
1454                         el.flag 'parser-inserted', true
1455                         # fixfull fragment case
                             # record the parent link, as insert_foreign_element does for
                             # every other inserted element
                             el.parent = ail[0]
1456                         ail[0].children.splice ail[1], 0, el
1457                         open_els.unshift el
1458                         tok_state = tok_state_script_data
1459                         original_ins_mode = ins_mode # make sure orig... is defined
1460                         ins_mode = ins_mode_text
1461                         return
1462                 if t.type is TYPE_END_TAG and t.name is 'head'
1463                         open_els.shift() # will be a head element... spec says so
1464                         ins_mode = ins_mode_after_head
1465                         return
1466                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1467                         ins_mode_in_head_else t
1468                         return
1469                 if t.type is TYPE_START_TAG and t.name is 'template'
1470                         insert_html_element t
1471                         afe_push_marker()
1472                         flag_frameset_ok = false
1473                         ins_mode = ins_mode_in_template
1474                         template_ins_modes.unshift ins_mode_in_template
1475                         return
1476                 if t.type is TYPE_END_TAG and t.name is 'template'
1477                         if template_tag_is_open()
                                     # a bare function name is only a reference in
                                     # CoffeeScript; the parentheses are needed to call it
1478                                 generate_implied_end_tags()
1479                                 if open_els[0].name isnt 'template'
1480                                         parse_error()
                                     # pop up to and including the template element
1481                                 loop
1482                                         el = open_els.shift()
1483                                         if el.name is 'template'
1484                                                 break
1485                                 clear_afe_to_marker()
1486                                 template_ins_modes.shift()
1487                                 reset_ins_mode()
1488                         else
1489                                 parse_error()
1490                         return
1491                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1492                         parse_error()
1493                         return
                     # Anything else
1494                 ins_mode_in_head_else t
1495
1496         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1497         ins_mode_in_head_noscript_else = (t) ->
                     # "Anything else" branch of the "in head noscript" mode: report a
                     # parse error, pop the current node, and reprocess the token in the
                     # "in head" mode.
1498                 parse_error()
1499                 ins_mode = ins_mode_in_head
1500                 open_els.shift()
1501                 return process_token(t)
1502         ins_mode_in_head_noscript = (t) ->
                     # "in head noscript" insertion mode, written as one first-match-wins
                     # chain in the same order as the spec's cases. Always returns undefined.
1503                 if t.type is TYPE_DOCTYPE
1504                         parse_error()
1505                 else if t.type is TYPE_START_TAG and t.name is 'html'
1506                         ins_mode_in_body t
1507                 else if t.type is TYPE_END_TAG and t.name is 'noscript'
1508                         open_els.shift()
1509                         ins_mode = ins_mode_in_head
1510                 else if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and t.name in ['basefont', 'bgsound', 'link', 'meta', 'noframes', 'style'])
1511                         ins_mode_in_head t
1512                 else if t.type is TYPE_END_TAG and t.name is 'br'
1513                         ins_mode_in_head_noscript_else t
1514                 else if (t.type is TYPE_START_TAG and t.name in ['head', 'noscript']) or t.type is TYPE_END_TAG
1515                         parse_error()
1516                 else
1517                         # Anything else
1518                         ins_mode_in_head_noscript_else t
1519                 return
1525
1526
1527
1528         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1529         ins_mode_after_head_else = (t) ->
                     # "Anything else" branch of the "after head" mode: insert a
                     # synthesized body element, switch to "in body" and reprocess the token.
1530                 insert_html_element new_open_tag('body')
1531                 ins_mode = ins_mode_in_body
1532                 process_token(t)
1533                 return
1535         ins_mode_after_head = (t) ->
                     # "after head" insertion mode: waits for <body> or <frameset>. Head-only
                     # elements seen here are a parse error and are processed with the head
                     # element temporarily pushed back onto the stack of open elements;
                     # anything unrecognised implies a body element
                     # (see ins_mode_after_head_else).
1536                 if is_space_tok t
1537                         insert_character t
1538                         return
1539                 if t.type is TYPE_COMMENT
1540                         insert_comment t
1541                         return
1542                 if t.type is TYPE_DOCTYPE
1543                         parse_error()
1544                         return
1545                 if t.type is TYPE_START_TAG and t.name is 'html'
1546                         ins_mode_in_body t
1547                         return
1548                 if t.type is TYPE_START_TAG and t.name is 'body'
1549                         insert_html_element t
1550                         flag_frameset_ok = false
1551                         ins_mode = ins_mode_in_body
1552                         return
1553                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1554                         insert_html_element t
1555                         ins_mode = ins_mode_in_frameset
1556                         return
1557                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1558                         parse_error()
1559                         open_els.unshift head_element_pointer
1560                         ins_mode_in_head t
                             # Remove the head element from the stack again. It might not be
                             # the current node any more, so search for it. This must be
                             # ``for ... in`` (array iteration: element, index); ``for ... of``
                             # would bind the index key to ``el`` and never match.
1561                         for el, i in open_els
1562                                 if el is head_element_pointer
1563                                         open_els.splice i, 1
1564                                         return
1565                         console.log "warning: 23904 couldn't find head element in open_els"
1566                         return
1567                 if t.type is TYPE_END_TAG and t.name is 'template'
1568                         ins_mode_in_head t
1569                         return
1570                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1571                         ins_mode_after_head_else t
1572                         return
1573                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1574                         parse_error()
1575                         return
1576                 # Anything else
1577                 ins_mode_after_head_else t
1578
1579         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1580         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
                     # "Any other end tag" steps of the "in body" mode for tag ``name``.
                     # Walks the stack of open elements from the current node upward. On
                     # the first HTML element called ``name`` it generates implied end tags
                     # and pops up to and including that element. If a special element is
                     # reached first, the end tag is a parse error and is ignored.
1581                 for el in open_els
1582                         if el.namespace is NS_HTML and el.name is name
1583                                 generate_implied_end_tags name # arg is exception
                                     # generate_implied_end_tags may have popped elements, so
                                     # the index at which ``el`` was found is stale; compare
                                     # and pop by identity instead
1584                                 parse_error() unless open_els[0] is el
1585                                 while open_els.length > 0
1586                                         break if open_els.shift() is el
1588                                 return
1589                         if special_elements[el.name] is el.namespace
1590                                 parse_error()
1591                                 return
1592                 return
1593         ins_mode_in_body = (t) ->
1594                 if t.type is TYPE_TEXT and t.text is "\u0000"
1595                         parse_error()
1596                         return
1597                 if is_space_tok t
1598                         reconstruct_afe()
1599                         insert_character t
1600                         return
1601                 if t.type is TYPE_TEXT
1602                         reconstruct_afe()
1603                         insert_character t
1604                         flag_frameset_ok = false
1605                         return
1606                 if t.type is TYPE_COMMENT
1607                         insert_comment t
1608                         return
1609                 if t.type is TYPE_DOCTYPE
1610                         parse_error()
1611                         return
1612                 if t.type is TYPE_START_TAG and t.name is 'html'
1613                         parse_error()
1614                         return if template_tag_is_open()
1615                         root_attrs = open_els[open_els.length - 1].attrs
1616                         for a of t.attrs_a
1617                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1618                         return
1619
1620                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1621                         ins_mode_in_head t
1622                         return
1623                 if t.type is TYPE_START_TAG and t.name is 'body'
1624                         parse_error()
1625                         return if open_els.length < 2
1626                         second = open_els[open_els.length - 2]
1627                         return unless second.ns is NS_HTML
1628                         return unless second.name is 'body'
1629                         return if template_tag_is_open()
1630                         frameset_ok_flag = false
1631                         for a of t.attrs_a
1632                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1633                         return
1634                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1635                         parse_error()
1636                         return if open_els.length < 2
1637                         second_i = open_els.length - 2
1638                         second = open_els[second_i]
1639                         return unless second.ns is NS_HTML
1640                         return unless second.name is 'body'
1641                         flag_frameset_ok = false
1642                         if second.parent?
1643                                 for el, i in second.parent.children
1644                                         if el is second
1645                                                 second.parent.children.splice i, 1
1646                                                 break
1647                         open_els.splice second_i, 1
1648                         # pop everything except the "root html element"
1649                         while open_els.length > 1
1650                                 open_els.shift()
1651                         insert_html_element t
1652                         ins_mode = ins_mode_in_frameset
1653                         return
1654                 if t.type is TYPE_EOF
1655                         ok_tags = {
1656                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1657                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1658                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1659                         }
1660                         for el in open_els
1661                                 unless ok_tags[t.name] is el.namespace
1662                                         parse_error()
1663                                         break
1664                         if template_ins_modes.length > 0
1665                                 ins_mode_in_template t
1666                         else
1667                                 stop_parsing()
1668                         return
1669                 if t.type is TYPE_END_TAG and t.name is 'body'
1670                         unless is_in_scope 'body'
1671                                 parse_error()
1672                                 return
1673                         ok_tags = {
1674                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1675                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1676                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1677                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1678                                 html:NS_HTML
1679                         }
1680                         for el in open_els
1681                                 unless ok_tags[t.name] is el.namespace
1682                                         parse_error()
1683                                         break
1684                         ins_mode = ins_mode_after_body
1685                         return
1686                 if t.type is TYPE_END_TAG and t.name is 'html'
1687                         unless is_in_scope 'body'
1688                                 parse_error()
1689                                 return
1690                         ok_tags = {
1691                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1692                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1693                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1694                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1695                                 html:NS_HTML
1696                         }
1697                         for el in open_els
1698                                 unless ok_tags[t.name] is el.namespace
1699                                         parse_error()
1700                                         break
1701                         ins_mode = ins_mode_after_body
1702                         process_token t
1703                         return
1704                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1705                         close_p_if_in_button_scope()
1706                         insert_html_element t
1707                         return
1708                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1709                         close_p_if_in_button_scope()
1710                         if h_tags[open_els[0]] is NS_HTML
1711                                 parse_error()
1712                                 open_els.shift()
1713                         insert_html_element t
1714                         return
1715                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1716                         close_p_if_in_button_scope()
1717                         insert_html_element t
1718                         # spec: If the next token is a "LF" (U+000A) character token, then
1719                         # ignore that token and move on to the next one. (Newlines at the
1720                         # start of pre blocks are ignored as an authoring convenience.)
1721                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1722                                 cur += 1
1723                         flag_frameset_ok = false
1724                         return
1725                 if t.type is TYPE_START_TAG and t.name is 'form'
1726                         unless form_element_pointer is null or template_tag_is_open()
1727                                 parse_error()
1728                                 return
1729                         close_p_if_in_button_scope()
1730                         el = insert_html_element t
1731                         unless template_tag_is_open()
1732                                 form_element_pointer = el
1733                         return
1734                 if t.type is TYPE_START_TAG and t.name is 'li'
1735                         flag_frameset_ok = false
1736                         for node in open_els
1737                                 if node.name is 'li' and node.namespace is NS_HTML
1738                                         generate_implied_end_tags 'li' # arg is exception
1739                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1740                                                 parse_error()
1741                                         loop
1742                                                 el = open_els.shift()
1743                                                 if el.name is 'li' and el.namespace is NS_HTML
1744                                                         break
1745                                         break
1746                                 if el_is_special_not_adp node
1747                                                 break
1748                         close_p_if_in_button_scope()
1749                         insert_html_element t
1750                         return
1751                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1752                         flag_frameset_ok = false
1753                         for node in open_els
1754                                 if node.name is 'dd' and node.namespace is NS_HTML
1755                                         generate_implied_end_tags 'dd' # arg is exception
1756                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1757                                                 parse_error()
1758                                         loop
1759                                                 el = open_els.shift()
1760                                                 if el.name is 'dd' and el.namespace is NS_HTML
1761                                                         break
1762                                         break
1763                                 if node.name is 'dt' and node.namespace is NS_HTML
1764                                         generate_implied_end_tags 'dt' # arg is exception
1765                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1766                                                 parse_error()
1767                                         loop
1768                                                 el = open_els.shift()
1769                                                 if el.name is 'dt' and el.namespace is NS_HTML
1770                                                         break
1771                                         break
1772                                 if el_is_special_not_adp node
1773                                         break
1774                         close_p_if_in_button_scope()
1775                         insert_html_element t
1776                         return
1777                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1778                         close_p_if_in_button_scope()
1779                         insert_html_element t
1780                         tok_state = tok_state_plaintext
1781                         return
1782                 if t.type is TYPE_START_TAG and t.name is 'button'
1783                         if is_in_scope 'button', NS_HTML
1784                                 parse_error()
1785                                 generate_implied_end_tags()
1786                                 loop
1787                                         el = open_els.shift()
1788                                         if el.name is 'button' and el.namespace is NS_HTML
1789                                                 break
1790                         reconstruct_afe()
1791                         insert_html_element t
1792                         flag_frameset_ok = false
1793                         return
1794                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1795                         unless is_in_scope t.name, NS_HTML
1796                                 parse_error()
1797                                 return
1798                         generate_implied_end_tags()
1799                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1800                                 parse_error()
1801                         loop
1802                                 el = open_els.shift()
1803                                 if el.name is t.name and el.namespace is NS_HTML
1804                                         return
1805                         return
1806                 if t.type is TYPE_END_TAG and t.name is 'form'
1807                         unless template_tag_is_open()
1808                                 node = form_element_pointer
1809                                 form_element_pointer = null
1810                                 if node is null or not el_is_in_scope node
1811                                         parse_error()
1812                                         return
1813                                 generate_implied_end_tags()
1814                                 if open_els[0] isnt node
1815                                         parse_error()
1816                                 for el, i in open_els
1817                                         if el is node
1818                                                 open_els.splice i, 1
1819                                                 break
1820                         else
1821                                 unless is_in_scope 'form', NS_HTML
1822                                         parse_error()
1823                                         return
1824                                 generate_implied_end_tags()
1825                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1826                                         parse_error()
1827                                 loop
1828                                         el = open_els.shift()
1829                                         if el.name is 'form' and el.namespace is NS_HTML
1830                                                 break
1831                         return
1832                 if t.type is TYPE_END_TAG and t.name is 'p'
1833                         unless is_in_button_scope 'p', NS_HTML
1834                                 parse_error()
1835                                 insert_html_element new_open_tag 'p'
1836                         close_p_element()
1837                         return
1838                 if t.type is TYPE_END_TAG and t.name is 'li'
1839                         unless is_in_li_scope 'li', NS_HTML
1840                                 parse_error()
1841                                 return
1842                         generate_implied_end_tags 'li' # arg is exception
1843                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1844                                 parse_error()
1845                         loop
1846                                 el = open_els.shift()
1847                                 if el.name is 'li' and el.namespace is NS_HTML
1848                                         break
1849                         return
1850                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1851                         unless is_in_scope t.name, NS_HTML
1852                                 parse_error()
1853                                 return
1854                         generate_implied_end_tags t.name # arg is exception
1855                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1856                                 parse_error()
1857                         loop
1858                                 el = open_els.shift()
1859                                 if el.name is t.name and el.namespace is NS_HTML
1860                                         break
1861                         return
1862                 if t.type is TYPE_END_TAG and h_tags[t.name]?
1863                         h_in_scope = false
1864                         for el in open_els
1865                                 if h_tags[el.name] is el.namespace
1866                                         h_in_scope = true
1867                                         break
1868                                 if standard_scopers[el.name] is el.namespace
1869                                         break
1870                         unless h_in_scope
1871                                 parse_error()
1872                                 return
1873                         generate_implied_end_tags()
1874                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1875                                 parse_error()
1876                         loop
1877                                 el = open_els.shift()
1878                                 if h_tags[el.name] is el.namespace
1879                                         break
1880                         return
1881                 # deep breath!
1882                 if t.type is TYPE_START_TAG and t.name is 'a'
1883                         # If the list of active formatting elements contains an a element
1884                         # between the end of the list and the last marker on the list (or
1885                         # the start of the list if there is no marker on the list), then
1886                         # this is a parse error; run the adoption agency algorithm for the
1887                         # tag name "a", then remove that element from the list of active
1888                         # formatting elements and the stack of open elements if the
1889                         # adoption agency algorithm didn't already remove it (it might not
1890                         # have if the element is not in table scope).
1891                         found = false
1892                         for el in afe
1893                                 if el.type is TYPE_AFE_MARKER
1894                                         break
1895                                 if el.name is 'a' and el.namespace is NS_HTML
1896                                         found = el
1897                         if found?
1898                                 parse_error()
1899                                 adoption_agency 'a'
1900                                 for el, i in afe
1901                                         if el is found
1902                                                 afe.splice i, 1
1903                                 for el, i in open_els
1904                                         if el is found
1905                                                 open_els.splice i, 1
1906                         reconstruct_afe()
1907                         el = insert_html_element t
1908                         afe_push el
1909                         return
1910                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1911                         reconstruct_afe()
1912                         el = insert_html_element t
1913                         afe_push el
1914                         return
1915                 if t.type is TYPE_START_TAG and t.name is 'nobr'
1916                         reconstruct_afe()
1917                         el = insert_html_element t
1918                         afe_push el
1919                         return
1920                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1921                         adoption_agency t.name
1922                         return
1923                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1924                         reconstruct_afe()
1925                         insert_html_element t
1926                         afe_push_marker()
1927                         flag_frameset_ok = false
1928                         return
1929                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1930                         unless is_in_scope t.name, NS_HTML
1931                                 parse_error()
1932                                 return
1933                         generate_implied_end_tags()
1934                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1935                                 parse_error()
1936                         loop
1937                                 el = open_els.shift()
1938                                 if el.name is t.name and el.namespace is NS_HTML
1939                                         break
1940                         clear_afe_to_marker()
1941                         return
1942                 if t.type is TYPE_START_TAG and t.name is 'table'
1943                         close_p_if_in_button_scope() # fixfull quirksmode thing
1944                         insert_html_element t
1945                         flag_frameset_ok = false
1946                         ins_mode = ins_mode_in_table
1947                         return
1948                 if t.type is TYPE_END_TAG and t.name is 'br'
1949                         parse_error()
1950                         t.type is TYPE_START_TAG
1951                         # fall through
1952                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1953                         reconstruct_afe()
1954                         insert_html_element t
1955                         open_els.shift()
1956                         t.acknowledge_self_closing()
1957                         flag_frameset_ok = false
1958                         return
1959                 if t.type is TYPE_START_TAG and t.name is 'input'
1960                         reconstruct_afe()
1961                         insert_html_element t
1962                         open_els.shift()
1963                         t.acknowledge_self_closing()
1964                         unless is_input_hidden_tok t
1965                                 flag_frameset_ok = false
1966                         return
1967                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1968                         insert_html_element t
1969                         open_els.shift()
1970                         t.acknowledge_self_closing()
1971                         return
1972                 if t.type is TYPE_START_TAG and t.name is 'hr'
1973                         close_p_if_in_button_scope()
1974                         insert_html_element t
1975                         open_els.shift()
1976                         t.acknowledge_self_closing()
1977                         flag_frameset_ok = false
1978                         return
1979                 if t.type is TYPE_START_TAG and t.name is 'image'
1980                         parse_error()
1981                         t.name = 'img'
1982                         process_token t
1983                         return
1984                 if t.type is TYPE_START_TAG and t.name is 'isindex'
1985                         parse_error()
1986                         if template_tag_is_open() is false and form_element_pointer isnt null
1987                                 return
1988                         t.acknowledge_self_closing()
1989                         flag_frameset_ok = false
1990                         close_p_if_in_button_scope()
1991                         el = insert_html_element new_open_tag 'form'
1992                         unless template_tag_is_open()
1993                                 form_element_pointer = el
1994                         for a in t.attrs_a
1995                                 if a[0] is 'action'
1996                                         el.attrs['action'] = a[1]
1997                                         break
1998                         insert_html_element new_open_tag 'hr'
1999                         open_els.shift()
2000                         reconstruct_afe()
2001                         insert_html_element new_open_tag 'label'
2002                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2003                         input_el = new_open_tag 'input'
2004                         prompt = null
2005                         for a in t.attrs_a
2006                                 if a[0] is 'prompt'
2007                                         prompt = a[1]
2008                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2009                                         input_el.attrs_a.push [a[0], a[1]]
2010                         input_el.attrs_a.push ['name', 'isindex']
2011                         # fixfull this next bit is in english... internationalize?
2012                         prompt ?= "This is a searchable index. Enter search keywords: "
2013                         insert_character new_character_token prompt # fixfull split
2014                         # TODO submit typo "balue" in spec
2015                         insert_html_element input_el
2016                         open_els.shift()
2017                         # insert_character '' # you can put chars here if promt attr missing
2018                         open_els.shift()
2019                         insert_html_element new_open_tag 'hr'
2020                         open_els.shift()
2021                         open_els.shift()
2022                         unless template_tag_is_open()
2023                                 form_element_pointer = null
2024                         return
2025                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2026                         insert_html_element t
2027                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2028                                 cur += 1
2029                         tok_state = tok_state_rcdata
2030                         original_ins_mode = ins_mode
2031                         flag_frameset_ok = false
2032                         ins_mode = ins_mode_text
2033                         return
2034                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2035                         close_p_if_in_button_scope()
2036                         reconstruct_afe()
2037                         flag_frameset_ok = false
2038                         parse_generic_raw_text t
2039                         return
2040                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2041                         flag_frameset_ok = false
2042                         parse_generic_raw_text t
2043                         return
2044                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2045                         parse_generic_raw_text t
2046                         return
2047                 if t.type is TYPE_START_TAG and t.name is 'select'
2048                         reconstruct_afe()
2049                         insert_html_element t
2050                         flag_frameset_ok = false
2051                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2052                                 ins_mode = ins_mode_in_select_in_table
2053                         else
2054                                 ins_mode = ins_mode_in_select
2055                         return
2056                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2057                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2058                                 open_els.shift()
2059                         reconstruct_afe()
2060                         insert_html_element t
2061                         return
2062                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2063                         if is_in_scope 'ruby', NS_HTML
2064                                 generate_implied_end_tags()
2065                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2066                                         parse_error()
2067                         insert_html_element t
2068                         return
2069                 if t.type is TYPE_START_TAG and t.name is 'rt'
2070                         if is_in_scope 'ruby', NS_HTML
2071                                 generate_implied_end_tags 'rtc' # arg is exception
2072                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2073                                         parse_error()
2074                         insert_html_element t
2075                         return
2076                 if t.type is TYPE_START_TAG and t.name is 'math'
2077                         reconstruct_afe()
2078                         adjust_mathml_attributes t
2079                         adjust_foreign_attributes t
2080                         insert_foreign_element t, NS_MATHML
2081                         if t.flag 'self-closing'
2082                                 open_els.shift()
2083                                 t.acknowledge_self_closing()
2084                         return
2085                 if t.type is TYPE_START_TAG and t.name is 'svg'
2086                         reconstruct_afe()
2087                         adjust_svg_attributes t
2088                         adjust_foreign_attributes t
2089                         insert_foreign_element t, NS_SVG
2090                         if t.flag 'self-closing'
2091                                 open_els.shift()
2092                                 t.acknowledge_self_closing()
2093                         return
2094                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2095                         parse_error()
2096                         return
2097                 if t.type is TYPE_START_TAG # any other start tag
2098                         reconstruct_afe()
2099                         insert_html_element t
2100                         return
2101                 if t.type is TYPE_END_TAG # any other end tag
2102                         in_body_any_other_end_tag t.name
2103                         return
2104                 return
2105
2106         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2107         # "text" insertion mode; leaves by restoring ins_mode from original_ins_mode
2108         ins_mode_text = (t) ->
2109                 switch t.type
2110                         when TYPE_TEXT
2111                                 insert_character t
2112                         when TYPE_EOF
2113                                 # input ran out: pop the element, restore the mode, re-dispatch EOF
2114                                 parse_error()
2115                                 if open_els[0].name is 'script'
2116                                         open_els[0].flag 'already started', true
2117                                 open_els.shift()
2118                                 ins_mode = original_ins_mode
2119                                 process_token t
2120                         when TYPE_END_TAG
2121                                 # </script> and every other end tag are treated the same here
2122                                 # fixfull the spec seems to assume that I'm going to run the script
2123                                 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2124                                 open_els.shift()
2125                                 ins_mode = original_ins_mode
2126                         else
2127                                 # no rule for this token type; the result of the log call is returned
2128                                 return console.log 'warning: end of ins_mode_text reached'
2129                 return
2130
2131         # the functions below implement the tokenizer states described here:
2132         # http://www.w3.org/TR/html5/syntax.html#tokenization
2133
2134         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
             # Fallback that ins_mode_in_table calls for any token it has no specific
             # rule for: report a parse error, then run the token through
             # ins_mode_in_body with flag_foster_parenting switched on.
             #   t: the token being processed
2135         ins_mode_in_table_else = (t) ->
2136                 parse_error()
                     # the flag is raised only for the duration of this one call and is
                     # unconditionally cleared afterwards (not restored to a saved value)
2137                 flag_foster_parenting = true
2138                 ins_mode_in_body t
2139                 flag_foster_parenting = false
2140                 return
2141         # Element-name lookup consulted by ins_mode_in_table when a character
2142         # token arrives; every listed name maps to true.
2143         # FIXME do this inline like everywhere else
2144         can_in_table = {
2145                 table: true, tbody: true, tfoot: true
2146                 thead: true, tr: true
2147         }
2148         ins_mode_in_table = (t) ->
                     # "in table" insertion mode (spec 8.2.5.4.9). t is the token to
                     # process. Dispatches on t.type, then on t.name for start and end
                     # tags; every token without a specific rule here is handed to
                     # ins_mode_in_table_else.
2149                 switch t.type
2150                         when TYPE_TEXT
                                     # The spec keys this rule on the *current node* (open_els[0])
                                     # being an HTML table/tbody/tfoot/thead/tr, not on the token.
                                     # "is true" keeps inherited Object properties (e.g. a node
                                     # named "constructor") from matching; "?." tolerates an
                                     # empty stack.
                                     # NOTE(review): the spec also empties the pending table
                                     # character tokens list at this point -- confirm that
                                     # ins_mode_in_table_text takes care of that.
2151                                 if can_in_table[open_els[0]?.name] is true and open_els[0].namespace is NS_HTML
2152                                         original_ins_mode = ins_mode
2153                                         ins_mode = ins_mode_in_table_text
2154                                         process_token t
2155                                 else
2156                                         ins_mode_in_table_else t
2157                         when TYPE_COMMENT
2158                                 insert_comment t
2159                         when TYPE_DOCTYPE
2160                                 parse_error()
2161                         when TYPE_START_TAG
2162                                 switch t.name
2163                                         when 'caption'
2164                                                 clear_stack_to_table_context()
2165                                                 afe_push_marker()
2166                                                 insert_html_element t
2167                                                 ins_mode = ins_mode_in_caption
2168                                         when 'colgroup'
2169                                                 clear_stack_to_table_context()
2170                                                 insert_html_element t
2171                                                 ins_mode = ins_mode_in_column_group
2172                                         when 'col'
                                                     # insert an implied <colgroup>, then reprocess the
                                                     # col token under the new mode
2173                                                 clear_stack_to_table_context()
2174                                                 insert_html_element new_open_tag 'colgroup'
2175                                                 ins_mode = ins_mode_in_column_group
2176                                                 process_token t
2177                                         when 'tbody', 'tfoot', 'thead'
2178                                                 clear_stack_to_table_context()
2179                                                 insert_html_element t
2180                                                 ins_mode = ins_mode_in_table_body
2181                                         when 'td', 'th', 'tr'
                                                     # insert an implied <tbody>, then reprocess the token
                                                     # under the new mode
2182                                                 clear_stack_to_table_context()
2183                                                 insert_html_element new_open_tag 'tbody'
2184                                                 ins_mode = ins_mode_in_table_body
2185                                                 process_token t
2186                                         when 'table'
                                                     # nested <table>: always an error; if a table is in
                                                     # table scope, pop through it, reset the mode and
                                                     # reprocess the token, otherwise ignore the token
2187                                                 parse_error()
2188                                                 if is_in_table_scope 'table'
2189                                                         loop
2190                                                                 el = open_els.shift()
2191                                                                 if el.name is 'table'
2192                                                                         break
2193                                                         reset_ins_mode()
2194                                                         process_token t
2195                                         when 'style', 'script', 'template'
2196                                                 ins_mode_in_head t
2197                                         when 'input'
                                                     # only inputs accepted by is_input_hidden_tok are
                                                     # inserted here (and popped again right away)
2198                                                 unless is_input_hidden_tok t
2199                                                         ins_mode_in_table_else t
2200                                                 else
2201                                                         parse_error()
2202                                                         el = insert_html_element t
2203                                                         open_els.shift()
2204                                                         t.acknowledge_self_closing()
2205                                         when 'form'
                                                     # always an error; the token is dropped when a form is
                                                     # already being tracked or a template is open,
                                                     # otherwise the form is inserted, remembered and
                                                     # immediately popped
2206                                                 parse_error()
2207                                                 if form_element_pointer?
2208                                                         return
2209                                                 if template_tag_is_open()
2210                                                         return
2211                                                 form_element_pointer = insert_html_element t
2212                                                 open_els.shift()
2213                                         else
2214                                                 ins_mode_in_table_else t
2215                         when TYPE_END_TAG
2216                                 switch t.name
2217                                         when 'table'
2218                                                 if is_in_table_scope 'table'
2219                                                         loop
2220                                                                 el = open_els.shift()
2221                                                                 if el.name is 'table'
2222                                                                         break
2223                                                         reset_ins_mode()
2224                                                 else
                                                             # the parentheses matter: a bare "parse_error"
                                                             # only references the function without calling it
2225                                                         parse_error()
2226                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2227                                                 parse_error()
2228                                         when 'template'
2229                                                 ins_mode_in_head t
2230                                         else
2231                                                 ins_mode_in_table_else t
2232                         when TYPE_EOF
2233                                 ins_mode_in_body t
2234                         else
2235                                 ins_mode_in_table_else t
2236
2237
2238         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2239         ins_mode_in_table_text = (t) ->
2240                 if t.type is TYPE_TEXT and t.text is "\u0000"
2241                         # huh? I thought the tokenizer didn't emit these
2242                         parse_error()
2243                         return
2244                 if t.type is TYPE_TEXT
2245                         pending_table_character_tokens.push t
2246                         return
2247                 # Anything else
2248                 all_space = true
2249                 for old in pending_table_character_tokens
2250                         unless is_space_tok old
2251                                 all_space = false
2252                                 break
2253                 if all_space
2254                         for old in pending_table_character_tokens
2255                                 insert_character old
2256                 else
2257                         for old in pending_table_character_tokens
2258                                 ins_mode_table_else old
2259                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2260                 ins_mode = original_ins_mode
2261                 process_token t
2262
2263         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2264         ins_mode_in_caption = (t) ->
2265                 if t.type is TYPE_END_TAG and t.name is 'caption'
2266                         if is_in_table_scope 'caption'
2267                                 generate_implied_end_tags()
2268                                 if open_els[0].name isnt 'caption'
2269                                         parse_error()
2270                                 loop
2271                                         el = open_els.shift()
2272                                         if el.name is 'caption'
2273                                                 break
2274                                 clear_afe_to_marker()
2275                                 ins_mode = ins_mode_in_table
2276                         else
2277                                 parse_error()
2278                                 # fragment case
2279                         return
2280                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2281                         parse_error()
2282                         if is_in_table_scope 'caption'
2283                                 loop
2284                                         el = open_els.shift()
2285                                         if el.name is 'caption'
2286                                                 break
2287                                 clear_afe_to_marker()
2288                                 ins_mode = ins_mode_in_table
2289                                 process_token t
2290                         # else fragment case
2291                         return
2292                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2293                         parse_error()
2294                         return
2295                 # Anything else
2296                 ins_mode_in_body t
2297
2298         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2299         ins_mode_in_column_group = (t) ->
2300                 if is_space_tok t
2301                         insert_character t
2302                         return
2303                 if t.type is TYPE_COMMENT
2304                         insert_comment t
2305                         return
2306                 if t.type is TYPE_DOCTYPE
2307                         parse_error()
2308                         return
2309                 if t.type is TYPE_START_TAG and t.name is 'html'
2310                         ins_mode_in_body t
2311                         return
2312                 if t.type is TYPE_START_TAG and t.name is 'col'
2313                         el = insert_html_element t
2314                         open_els.shift()
2315                         t.acknowledge_self_closing()
2316                         return
2317                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2318                         if open_els[0].name is 'colgroup'
2319                                 open_els.shift()
2320                                 ins_mode = ins_mode_in_table
2321                         else
2322                                 parse_error()
2323                         return
2324                 if t.type is TYPE_END_TAG and t.name is 'col'
2325                         parse_error()
2326                         return
2327                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2328                         ins_mode_in_head t
2329                         return
2330                 if t.type is TYPE_EOF
2331                         ins_mode_in_body t
2332                         return
2333                 # Anything else
2334                 if open_els[0].name isnt 'colgroup'
2335                         parse_error()
2336                         return
2337                 open_els.shift()
2338                 ins_mode = ins_mode_in_table
2339                 process_token t
2340                 return
2341
2342         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2343         ins_mode_in_table_body = (t) ->
2344                 if t.type is TYPE_START_TAG and t.name is 'tr'
2345                         clear_stack_to_table_body_context()
2346                         insert_html_element t
2347                         ins_mode = ins_mode_in_row
2348                         return
2349                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2350                         parse_error()
2351                         clear_stack_to_table_body_context()
2352                         insert_html_element new_open_tag 'tr'
2353                         ins_mode = ins_mode_in_row
2354                         process_token t
2355                         return
2356                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2357                         unless is_in_table_scope t.name # fixfull check namespace
2358                                 parse_error()
2359                                 return
2360                         clear_stack_to_table_body_context()
2361                         open_els.shift()
2362                         ins_mode = ins_mode_in_table
2363                         return
2364                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2365                         has = false
2366                         for el in open_els
2367                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2368                                         has = true
2369                                         break
2370                                 if table_scopers[el.name]
2371                                         break
2372                         if !has
2373                                 parse_error()
2374                                 return
2375                         clear_stack_to_table_body_context()
2376                         open_els.shift()
2377                         ins_mode = ins_mode_in_table
2378                         process_token t
2379                         return
2380                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2381                         parse_error()
2382                         return
2383                 # Anything else
2384                 ins_mode_in_table t
2385
2386         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2387         ins_mode_in_row = (t) ->
2388                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2389                         clear_stack_to_table_row_context()
2390                         insert_html_element t
2391                         ins_mode = ins_mode_in_cell
2392                         afe_push_marker()
2393                         return
2394                 if t.type is TYPE_END_TAG and t.name is 'tr'
2395                         if is_in_table_scope 'tr'
2396                                 clear_stack_to_table_row_context()
2397                                 open_els.shift()
2398                                 ins_mode = ins_mode_in_table_body
2399                         else
2400                                 parse_error()
2401                         return
2402                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2403                         if is_in_table_scope 'tr'
2404                                 clear_stack_to_table_row_context()
2405                                 open_els.shift()
2406                                 ins_mode = ins_mode_in_table_body
2407                                 process_token t
2408                         else
2409                                 parse_error()
2410                         return
2411                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2412                         if is_in_table_scope t.name # fixfull namespace
2413                                 if is_in_table_scope 'tr'
2414                                         clear_stack_to_table_row_context()
2415                                         open_els.shift()
2416                                         ins_mode = ins_mode_in_table_body
2417                                         process_token t
2418                         else
2419                                 parse_error()
2420                         return
2421                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2422                         parse_error()
2423                         return
2424                 # Anything else
2425                 ins_mode_in_table t
2426
2427         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2428         close_the_cell = ->
2429                 generate_implied_end_tags()
2430                 unless open_els[0].name is 'td' or open_els[0] is 'th'
2431                         parse_error()
2432                 loop
2433                         el = open_els.shift()
2434                         if el.name is 'td' or el.name is 'th'
2435                                 break
2436                 clear_afe_to_marker()
2437                 ins_mode = ins_mode_in_row
2438
2439         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2440         ins_mode_in_cell = (t) ->
2441                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2442                         if is_in_table_scope t.name
2443                                 generate_implied_end_tags()
2444                                 if open_els[0].name isnt t.name
2445                                         parse_error
2446                                 loop
2447                                         el = open_els.shift()
2448                                         if el.name is t.name
2449                                                 break
2450                                 clear_afe_to_marker()
2451                                 ins_mode = ins_mode_in_row
2452                         else
2453                                 parse_error()
2454                         return
2455                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2456                         has = false
2457                         for el in open_els
2458                                 if el.name is 'td' or el.name is 'th'
2459                                         has = true
2460                                         break
2461                                 if table_scopers[el.name]
2462                                         break
2463                         if !has
2464                                 parse_error()
2465                                 return
2466                         close_the_cell()
2467                         process_token t
2468                         return
2469                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2470                         parse_error()
2471                         return
2472                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2473                         if is_in_table_scope t.name # fixfull namespace
2474                                 close_the_cell()
2475                                 process_token t
2476                         else
2477                                 parse_error()
2478                         return
2479                 # Anything Else
2480                 ins_mode_in_body t
2481
2482         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2483         ins_mode_in_select = (t) ->
2484                 if t.type is TYPE_TEXT and t.text is "\u0000"
2485                         parse_error()
2486                         return
2487                 if t.type is TYPE_TEXT
2488                         insert_character t
2489                         return
2490                 if t.type is TYPE_COMMENT
2491                         insert_comment t
2492                         return
2493                 if t.type is TYPE_DOCTYPE
2494                         parse_error()
2495                         return
2496                 if t.type is TYPE_START_TAG and t.name is 'html'
2497                         ins_mode_in_body t
2498                         return
2499                 if t.type is TYPE_START_TAG and t.name is 'option'
2500                         if open_els[0].name is 'option'
2501                                 open_els.shift()
2502                         insert_html_element t
2503                         return
2504                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2505                         if open_els[0].name is 'option'
2506                                 open_els.shift()
2507                         if open_els[0].name is 'optgroup'
2508                                 open_els.shift()
2509                         insert_html_element t
2510                         return
2511                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2512                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2513                                 open_els.shift()
2514                         if open_els[0].name is 'optgroup'
2515                                 open_els.shift()
2516                         else
2517                                 parse_error()
2518                         return
2519                 if t.type is TYPE_END_TAG and t.name is 'option'
2520                         if open_els[0].name is 'option'
2521                                 open_els.shift()
2522                         else
2523                                 parse_error()
2524                         return
2525                 if t.type is TYPE_END_TAG and t.name is 'select'
2526                         if is_in_select_scope 'select'
2527                                 loop
2528                                         el = open_els.shift()
2529                                         if el.name is 'select'
2530                                                 break
2531                                 reset_ins_mode()
2532                         else
2533                                 parse_error()
2534                         return
2535                 if t.type is TYPE_START_TAG and t.name is 'select'
2536                         parse_error()
2537                         loop
2538                                 el = open_els.shift()
2539                                 if el.name is 'select'
2540                                         break
2541                         reset_ins_mode()
2542                         # spec says that this is the same as </select> but it doesn't say
2543                         # to check scope first
2544                         return
2545                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2546                         parse_error()
2547                         if is_in_select_scope 'select'
2548                                 return
2549                         loop
2550                                 el = open_els.shift()
2551                                 if el.name is 'select'
2552                                         break
2553                         reset_ins_mode()
2554                         process_token t
2555                         return
2556                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2557                         ins_mode_in_head t
2558                         return
2559                 if t.type is TYPE_EOF
2560                         ins_mode_in_body t
2561                         return
2562                 # Anything else
2563                 parse_error()
2564                 return
2565
2566         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2567         ins_mode_in_select_in_table = (t) ->
2568                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2569                         parse_error()
2570                         loop
2571                                 el = open_els.shift()
2572                                 if el.name is 'select'
2573                                         break
2574                         reset_ins_mode()
2575                         process_token t
2576                         return
2577                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2578                         parse_error()
2579                         unless is_in_table_scope t.name, NS_HTML
2580                                 return
2581                         loop
2582                                 el = open_els.shift()
2583                                 if el.name is 'select'
2584                                         break
2585                         reset_ins_mode()
2586                         process_token t
2587                         return
2588                 # Anything else
2589                 ins_mode_in_select t
2590                 return
2591
2592         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2593         ins_mode_in_template = (t) ->
2594                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2595                         ins_mode_in_body t
2596                         return
2597                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2598                         ins_mode_in_head t
2599                         return
2600                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2601                         template_ins_modes.shift()
2602                         template_ins_modes.unshift ins_mode_in_table
2603                         ins_mode = ins_mode_in_table
2604                         process_token t
2605                         return
2606                 if t.type is TYPE_START_TAG and t.name is 'col'
2607                         template_ins_modes.shift()
2608                         template_ins_modes.unshift ins_mode_in_column_group
2609                         ins_mode = ins_mode_in_column_group
2610                         process_token t
2611                         return
2612                 if t.type is TYPE_START_TAG and t.name is 'tr'
2613                         template_ins_modes.shift()
2614                         template_ins_modes.unshift ins_mode_in_table_body
2615                         ins_mode = ins_mode_in_table_body
2616                         process_token t
2617                         return
2618                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2619                         template_ins_modes.shift()
2620                         template_ins_modes.unshift ins_mode_in_row
2621                         ins_mode = ins_mode_in_row
2622                         process_token t
2623                         return
2624                 if t.type is TYPE_START_TAG
2625                         template_ins_modes.shift()
2626                         template_ins_modes.unshift ins_mode_in_body
2627                         ins_mode = ins_mode_in_body
2628                         process_token t
2629                         return
2630                 if t.type is TYPE_END_TAG
2631                         parse_error()
2632                         return
2633                 if t.type is TYPE_EOF
2634                         unless template_tag_is_open()
2635                                 stop_parsing()
2636                                 return
2637                         parse_error()
2638                         loop
2639                                 el = open_els.shift()
2640                                 if el.name is 'template' # fixfull check namespace
2641                                         break
2642                         clear_afe_to_marker()
2643                         template_ins_modes.shift()
2644                         reset_ins_mode()
2645                         process_token t
2646
2647         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2648         ins_mode_after_body = (t) ->
2649                 if is_space_tok t
2650                         ins_mode_in_body t
2651                         return
2652                 if t.type is TYPE_COMMENT
2653                         insert_comment t, [open_els[0], open_els[0].children.length]
2654                         return
2655                 if t.type is TYPE_DOCTYPE
2656                         parse_error()
2657                         return
2658                 if t.type is TYPE_START_TAG and t.name is 'html'
2659                         ins_mode_in_body t
2660                         return
2661                 if t.type is TYPE_END_TAG and t.name is 'html'
2662                         # fixfull fragment case
2663                         ins_mode = ins_mode_after_after_body
2664                         return
2665                 if t.type is TYPE_EOF
2666                         stop_parsing()
2667                         return
2668                 # Anything ELse
2669                 parse_error()
2670                 ins_mode = ins_mode_in_body
2671                 process_token t
2672
2673         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2674         ins_mode_in_frameset = (t) ->
2675                 if is_space_tok t
2676                         insert_character t
2677                         return
2678                 if t.type is TYPE_COMMENT
2679                         insert_comment t
2680                         return
2681                 if t.type is TYPE_DOCTYPE
2682                         parse_error()
2683                         return
2684                 if t.type is TYPE_START_TAG and t.name is 'html'
2685                         ins_mode_in_body t
2686                         return
2687                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2688                         insert_html_element t
2689                         return
2690                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2691                         # TODO ?correct for: "if the current node is the root html element"
2692                         if open_els.length is 1
2693                                 parse_error()
2694                                 return # fragment case
2695                         open_els.shift()
2696                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2697                                 ins_mode = ins_mode_after_frameset
2698                         return
2699                 if t.type is TYPE_START_TAG and t.name is 'frame'
2700                         insert_html_element t
2701                         open_els.shift()
2702                         t.acknowledge_self_closing()
2703                         return
2704                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2705                         ins_mode_in_head t
2706                         return
2707                 if t.type is TYPE_EOF
2708                         # TODO ?correct for: "if the current node is not the root html element"
2709                         if open_els.length isnt 1
2710                                 parse_error()
2711                         stop_parsing()
2712                         return
2713                 # Anything else
2714                 parse_error()
2715                 return
2716
2717         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2718         ins_mode_after_frameset = (t) ->
2719                 if is_space_tok t
2720                         insert_character t
2721                         return
2722                 if t.type is TYPE_COMMENT
2723                         insert_comment t
2724                         return
2725                 if t.type is TYPE_DOCTYPE
2726                         parse_error()
2727                         return
2728                 if t.type is TYPE_START_TAG and t.name is 'html'
2729                         ins_mode_in_body t
2730                         return
2731                 if t.type is TYPE_END_TAG and t.name is 'html'
2732                         insert_mode = ins_mode_after_after_frameset
2733                         return
2734                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2735                         ins_mode_in_head t
2736                         return
2737                 if t.type is TYPE_EOF
2738                         stop_parsing()
2739                         return
2740                 # Anything else
2741                 parse_error()
2742                 return
2743
2744         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2745         ins_mode_after_after_body = (t) ->
2746                 if t.type is TYPE_COMMENT
2747                         insert_comment t, [doc, doc.children.length]
2748                         return
2749                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2750                         ins_mode_in_body t
2751                         return
2752                 if t.type is TYPE_EOF
2753                         stop_parsing()
2754                         return
2755                 # Anything else
2756                 parse_error()
2757                 ins_mode = ins_mode_in_body
2758                 return
2759
2760         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2761         ins_mode_after_after_frameset = (t) ->
2762                 if t.type is TYPE_COMMENT
2763                         insert_comment t, [doc, doc.children.length]
2764                         return
2765                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2766                         ins_mode_in_body t
2767                         return
2768                 if t.type is TYPE_EOF
2769                         stop_parsing()
2770                         return
2771                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2772                         ins_mode_in_head t
2773                         return
2774                 # Anything else
2775                 parse_error()
2776                 return
2777
2778         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2779         has_color_face_or_size = (t) ->
2780                 for a in t.attrs_a
2781                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2782                                 return true
2783                 return false
2784         in_foreign_content_end_script = ->
2785                 open_els.shift()
2786                 # fixfull
2787                 return
2788         in_foreign_content_other_start = (t) ->
2789                 acn = adjusted_current_node()
2790                 if acn.namespace is NS_MATHML
2791                         adjust_mathml_attributes t
2792                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2793                         t.name = svg_name_fixes[t.name]
2794                 if acn.namespace is NS_SVG
2795                         adjust_svg_attributes t
2796                 adjust_foreign_attributes t
2797                 insert_foreign_element t, acn.namespace
2798                 if t.flag 'self-closing'
2799                         if t.name is 'script'
2800                                 t.acknowledge_self_closing()
2801                                 in_foreign_content_end_script()
2802                         else
2803                                 open_els.shift()
2804                                 t.acknowledge_self_closing()
2805                 return
2806         in_foreign_content = (t) ->
2807                 if t.type is TYPE_TEXT and t.text is "\u0000"
2808                         parse_error()
2809                         insert_character new_character_token "\ufffd"
2810                         return
2811                 if is_space_tok t
2812                         insert_character t
2813                         return
2814                 if t.type is TYPE_TEXT
2815                         flag_frameset_ok = false
2816                         insert_character t
2817                         return
2818                 if t.type is TYPE_COMMENT
2819                         insert_comment t
2820                         return
2821                 if t.type is TYPE_DOCTYPE
2822                         parse_error()
2823                         return
2824                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2825                         parse_error()
2826                         if flag_fragment_parsing
2827                                 in_foreign_content_other_start t
2828                                 return
2829                         loop # is this safe?
2830                                 open_els.shift()
2831                                 cn = open_els[0]
2832                                 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2833                                         break
2834                         process_token t
2835                         return
2836                 if t.type is TYPE_START_TAG
2837                         in_foreign_content_other_start t
2838                         return
2839                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2840                         in_foreign_content_end_script()
2841                         return
2842                 if t.type is TYPE_END_TAG
2843                         if open_els[0].name.toLowerCase() isnt t.name
2844                                 parse_error()
2845                         for node in open_els
2846                                 if node is open_els[open_els.length - 1]
2847                                         return
2848                                 if node.name.toLowerCase() is t.name
2849                                         loop
2850                                                 el = open_els.shift()
2851                                                 if el is node
2852                                                         return
2853                                 if node.namespace is NS_HTML
2854                                         break
2855                         ins_mode t # explicitly call HTML insertion mode
2856
2857
2858         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2859         tok_state_data = ->
2860                 switch c = txt.charAt(cur++)
2861                         when '&'
2862                                 return new_text_node parse_character_reference()
2863                         when '<'
2864                                 tok_state = tok_state_tag_open
2865                         when "\u0000"
2866                                 parse_error()
2867                                 return new_text_node c
2868                         when '' # EOF
2869                                 return new_eof_token()
2870                         else
2871                                 return new_text_node c
2872                 return null
2873
2874         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2875         # not needed: tok_state_character_reference_in_data = ->
2876         # just call parse_character_reference()
2877
2878         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2879         tok_state_rcdata = ->
2880                 switch c = txt.charAt(cur++)
2881                         when '&'
2882                                 return new_text_node parse_character_reference()
2883                         when '<'
2884                                 tok_state = tok_state_rcdata_less_than_sign
2885                         when "\u0000"
2886                                 parse_error()
2887                                 return new_character_token "\ufffd"
2888                         when '' # EOF
2889                                 return new_eof_token()
2890                         else
2891                                 return new_character_token c
2892                 return null
2893
2894         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2895         # not needed: tok_state_character_reference_in_rcdata = ->
2896         # just call parse_character_reference()
2897
2898         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2899         tok_state_rawtext = ->
2900                 switch c = txt.charAt(cur++)
2901                         when '<'
2902                                 tok_state = tok_state_rawtext_less_than_sign
2903                         when "\u0000"
2904                                 parse_error()
2905                                 return new_character_token "\ufffd"
2906                         when '' # EOF
2907                                 return new_eof_token()
2908                         else
2909                                 return new_character_token c
2910                 return null
2911
2912         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2913         tok_state_script_data = ->
2914                 switch c = txt.charAt(cur++)
2915                         when '<'
2916                                 tok_state = tok_state_script_data_less_than_sign
2917                         when "\u0000"
2918                                 parse_error()
2919                                 return new_character_token "\ufffd"
2920                         when '' # EOF
2921                                 return new_eof_token()
2922                         else
2923                                 return new_character_token c
2924                 return null
2925
2926         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2927         tok_state_plaintext = ->
2928                 switch c = txt.charAt(cur++)
2929                         when "\u0000"
2930                                 parse_error()
2931                                 return new_character_token "\ufffd"
2932                         when '' # EOF
2933                                 return new_eof_token()
2934                         else
2935                                 return new_character_token c
2936                 return null
2937
2938
2939         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2940         tok_state_tag_open = ->
2941                 switch c = txt.charAt(cur++)
2942                         when '!'
2943                                 tok_state = tok_state_markup_declaration_open
2944                         when '/'
2945                                 tok_state = tok_state_end_tag_open
2946                         when '?'
2947                                 parse_error()
2948                                 tok_cur_tag = new_comment_token '?'
2949                                 tok_state = tok_state_bogus_comment
2950                         else
2951                                 if is_lc_alpha(c)
2952                                         tok_cur_tag = new_open_tag c
2953                                         tok_state = tok_state_tag_name
2954                                 else if is_uc_alpha(c)
2955                                         tok_cur_tag = new_open_tag c.toLowerCase()
2956                                         tok_state = tok_state_tag_name
2957                                 else
2958                                         parse_error()
2959                                         tok_state = tok_state_data
2960                                         cur -= 1 # we didn't parse/handle the char after <
2961                                         return new_text_node '<'
2962                 return null
2963
2964         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2965         tok_state_end_tag_open = ->
2966                 switch c = txt.charAt(cur++)
2967                         when '>'
2968                                 parse_error()
2969                                 tok_state = tok_state_data
2970                         when '' # EOF
2971                                 parse_error()
2972                                 tok_state = tok_state_data
2973                                 return new_text_node '</'
2974                         else
2975                                 if is_uc_alpha(c)
2976                                         tok_cur_tag = new_end_tag c.toLowerCase()
2977                                         tok_state = tok_state_tag_name
2978                                 else if is_lc_alpha(c)
2979                                         tok_cur_tag = new_end_tag c
2980                                         tok_state = tok_state_tag_name
2981                                 else
2982                                         parse_error()
2983                                         tok_cur_tag = new_comment_token '/'
2984                                         tok_state = tok_state_bogus_comment
2985                 return null
2986
2987         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2988         tok_state_tag_name = ->
2989                 switch c = txt.charAt(cur++)
2990                         when "\t", "\n", "\u000c", ' '
2991                                 tok_state = tok_state_before_attribute_name
2992                         when '/'
2993                                 tok_state = tok_state_self_closing_start_tag
2994                         when '>'
2995                                 tok_state = tok_state_data
2996                                 tmp = tok_cur_tag
2997                                 tok_cur_tag = null
2998                                 return tmp
2999                         when "\u0000"
3000                                 parse_error()
3001                                 tok_cur_tag.name += "\ufffd"
3002                         when '' # EOF
3003                                 parse_error()
3004                                 tok_state = tok_state_data
3005                         else
3006                                 if is_uc_alpha(c)
3007                                         tok_cur_tag.name += c.toLowerCase()
3008                                 else
3009                                         tok_cur_tag.name += c
3010                 return null
3011
3012         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3013         tok_state_rcdata_less_than_sign = ->
3014                 c = txt.charAt(cur++)
3015                 if c is '/'
3016                         temporary_buffer = ''
3017                         tok_state = tok_state_rcdata_end_tag_open
3018                         return null
3019                 # Anything else
3020                 tok_state = tok_state_rcdata
3021                 cur -= 1 # reconsume the input character
3022                 return new_character_token '<'
3023
3024         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3025         tok_state_rcdata_end_tag_open = ->
3026                 c = txt.charAt(cur++)
3027                 if is_uc_alpha(c)
3028                         tok_cur_tag = new_end_tag c.toLowerCase()
3029                         temporary_buffer += c
3030                         tok_state = tok_state_rcdata_end_tag_name
3031                         return null
3032                 if is_lc_alpha(c)
3033                         tok_cur_tag = new_end_tag c
3034                         temporary_buffer += c
3035                         tok_state = tok_state_rcdata_end_tag_name
3036                         return null
3037                 # Anything else
3038                 tok_state = tok_state_rcdata
3039                 cur -= 1 # reconsume the input character
3040                 return new_character_token "</" # fixfull separate these
3041
3042         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3043         is_appropriate_end_tag = (t) ->
3044                 # spec says to check against "the tag name of the last start tag to
3045                 # have been emitted from this tokenizer", but this is only called from
3046                 # the various "raw" states, so it's hopefully ok to assume that
3047                 # open_els[0].name will work instead TODO: verify this after the script
3048                 # data states are implemented
3049                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3050                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3051
3052         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3053         tok_state_rcdata_end_tag_name = ->
3054                 c = txt.charAt(cur++)
3055                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3056                         if is_appropriate_end_tag tok_cur_tag
3057                                 tok_state = tok_state_before_attribute_name
3058                                 return
3059                         # else fall through to "Anything else"
3060                 if c is '/'
3061                         if is_appropriate_end_tag tok_cur_tag
3062                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3063                                 return
3064                         # else fall through to "Anything else"
3065                 if c is '>'
3066                         if is_appropriate_end_tag tok_cur_tag
3067                                 tok_state = tok_state_data
3068                                 return tok_cur_tag
3069                         # else fall through to "Anything else"
3070                 if is_uc_alpha(c)
3071                         tok_cur_tag.name += c.toLowerCase()
3072                         temporary_buffer += c
3073                         return null
3074                 if is_lc_alpha(c)
3075                         tok_cur_tag.name += c
3076                         temporary_buffer += c
3077                         return null
3078                 # Anything else
3079                 tok_state = tok_state_rcdata
3080                 cur -= 1 # reconsume the input character
3081                 return new_character_token '</' + temporary_buffer # fixfull separate these
3082
3083         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3084         tok_state_rawtext_less_than_sign = ->
3085                 c = txt.charAt(cur++)
3086                 if c is '/'
3087                         temporary_buffer = ''
3088                         tok_state = tok_state_rawtext_end_tag_open
3089                         return null
3090                 # Anything else
3091                 tok_state = tok_state_rawtext
3092                 cur -= 1 # reconsume the input character
3093                 return new_character_token '<'
3094
3095         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3096         tok_state_rawtext_end_tag_open = ->
3097                 c = txt.charAt(cur++)
3098                 if is_uc_alpha(c)
3099                         tok_cur_tag = new_end_tag c.toLowerCase()
3100                         temporary_buffer += c
3101                         tok_state = tok_state_rawtext_end_tag_name
3102                         return null
3103                 if is_lc_alpha(c)
3104                         tok_cur_tag = new_end_tag c
3105                         temporary_buffer += c
3106                         tok_state = tok_state_rawtext_end_tag_name
3107                         return null
3108                 # Anything else
3109                 tok_state = tok_state_rawtext
3110                 cur -= 1 # reconsume the input character
3111                 return new_character_token "</" # fixfull separate these
3112
3113         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3114         tok_state_rawtext_end_tag_name = ->
3115                 c = txt.charAt(cur++)
3116                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3117                         if is_appropriate_end_tag tok_cur_tag
3118                                 tok_state = tok_state_before_attribute_name
3119                                 return
3120                         # else fall through to "Anything else"
3121                 if c is '/'
3122                         if is_appropriate_end_tag tok_cur_tag
3123                                 tok_state = tok_state_self_closing_start_tag
3124                                 return
3125                         # else fall through to "Anything else"
3126                 if c is '>'
3127                         if is_appropriate_end_tag tok_cur_tag
3128                                 tok_state = tok_state_data
3129                                 return tok_cur_tag
3130                         # else fall through to "Anything else"
3131                 if is_uc_alpha(c)
3132                         tok_cur_tag.name += c.toLowerCase()
3133                         temporary_buffer += c
3134                         return null
3135                 if is_lc_alpha(c)
3136                         tok_cur_tag.name += c
3137                         temporary_buffer += c
3138                         return null
3139                 # Anything else
3140                 tok_state = tok_state_rawtext
3141                 cur -= 1 # reconsume the input character
3142                 return new_character_token '</' + temporary_buffer # fixfull separate these
3143
3144         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3145         tok_state_script_data_less_than_sign = ->
3146                 c = txt.charAt(cur++)
3147                 if c is '/'
3148                         temporary_buffer = ''
3149                         tok_state = tok_state_script_data_end_tag_open
3150                         return
3151                 if c is '!'
3152                         tok_state = tok_state_script_data_escape_start
3153                         return new_character_token '<!' # fixfull split
3154                 # Anything else
3155                 tok_state = tok_state_script_data
3156                 cur -= 1 # Reconsume
3157                 return new_character_token '<'
3158
3159         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3160         tok_state_script_data_end_tag_open = ->
3161                 c = txt.charAt(cur++)
3162                 if is_uc_alpha(c)
3163                         tok_cur_tag = new_end_tag c.toLowerCase()
3164                         temporary_buffer += c
3165                         tok_state = tok_state_script_data_end_tag_name
3166                         return
3167                 if is_lc_alpha(c)
3168                         tok_cur_tag = new_end_tag c
3169                         temporary_buffer += c
3170                         tok_state = tok_state_script_data_end_tag_name
3171                         return
3172                 # Anything else
3173                 tok_state = tok_state_script_data
3174                 cur -= 1 # Reconsume
3175                 return new_character_token '</'
3176
3177         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3178         tok_state_script_data_end_tag_name = ->
3179                 c = txt.charAt(cur++)
3180                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3181                         if is_appropriate_end_tag tok_cur_tag
3182                                 tok_state = tok_state_before_attribute_name
3183                                 return
3184                         # fall through
3185                 if c is '/'
3186                         if is_appropriate_end_tag tok_cur_tag
3187                                 tok_state = tok_state_self_closing_start_tag
3188                                 return
3189                         # fall through
3190                 if c is '>'
3191                         if is_appropriate_end_tag tok_cur_tag
3192                                 tok_state = tok_state_data
3193                                 return tok_cur_tag
3194                         # fall through
3195                 if is_uc_alpha(c)
3196                         tok_cur_tag.name += c.toLowerCase()
3197                         temporary_buffer += c
3198                         return
3199                 if is_lc_alpha(c)
3200                         tok_cur_tag.name += c
3201                         temporary_buffer += c
3202                         return
3203                 # Anything else
3204                 tok_state = tok_state_script_data
3205                 cur -= 1 # Reconsume
3206                 return new_character_token "</#{temporary_buffer}" # fixfull split
3207
3208         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3209         tok_state_script_data_escape_start = ->
3210                 c = txt.charAt(cur++)
3211                 if c is '-'
3212                         tok_state = tok_state_script_data_escape_start_dash
3213                         return new_character_token '-'
3214                 # Anything else
3215                 tok_state = tok_state_script_data
3216                 cur -= 1 # Reconsume
3217                 return
3218
3219         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3220         tok_state_script_data_escape_start_dash = ->
3221                 c = txt.charAt(cur++)
3222                 if c is '-'
3223                         tok_state = tok_state_script_data_escaped_dash_dash
3224                         return new_character_token '-'
3225                 # Anything else
3226                 tok_state = tok_state_script_data
3227                 cur -= 1 # Reconsume
3228                 return
3229
3230         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3231         tok_state_script_data_escaped = ->
3232                 c = txt.charAt(cur++)
3233                 if c is '-'
3234                         tok_state = tok_state_script_data_escaped_dash
3235                         return new_character_token '-'
3236                 if c is '<'
3237                         tok_state = tok_state_script_data_escaped_less_than_sign
3238                         return
3239                 if c is "\u0000"
3240                         parse_error()
3241                         return new_character_token "\ufffd"
3242                 if c is '' # EOF
3243                         tok_state = tok_state_data
3244                         parse_error()
3245                         cur -= 1 # Reconsume
3246                         return
3247                 # Anything else
3248                 return new_character_token c
3249
3250         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3251         tok_state_script_data_escaped_dash = ->
3252                 c = txt.charAt(cur++)
3253                 if c is '-'
3254                         tok_state = tok_state_script_data_escaped_dash_dash
3255                         return new_character_token '-'
3256                 if c is '<'
3257                         tok_state = tok_state_script_data_escaped_less_than_sign
3258                         return
3259                 if c is "\u0000"
3260                         parse_error()
3261                         tok_state = tok_state_script_data_escaped
3262                         return new_character_token "\ufffd"
3263                 if c is '' # EOF
3264                         tok_state = tok_state_data
3265                         parse_error()
3266                         cur -= 1 # Reconsume
3267                         return
3268                 # Anything else
3269                 tok_state = tok_state_script_data_escaped
3270                 return new_character_token c
3271
3272         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3273         tok_state_script_data_escaped_dash_dash = ->
3274                 c = txt.charAt(cur++)
3275                 if c is '-'
3276                         return new_character_token '-'
3277                 if c is '<'
3278                         tok_state = tok_state_script_data_escaped_less_than_sign
3279                         return
3280                 if c is '>'
3281                         tok_state = tok_state_script_data
3282                         return new_character_token '>'
3283                 if c is "\u0000"
3284                         parse_error()
3285                         tok_state = tok_state_script_data_escaped
3286                         return new_character_token "\ufffd"
3287                 if c is '' # EOF
3288                         parse_error()
3289                         tok_state = tok_state_data
3290                         cur -= 1 # Reconsume
3291                         return
3292                 # Anything else
3293                 tok_state = tok_state_script_data_escaped
3294                 return new_character_token c
3295
3296         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3297         tok_state_script_data_escaped_less_than_sign = ->
3298                 c = txt.charAt(cur++)
3299                 if c is '/'
3300                         temporary_buffer = ''
3301                         tok_state = tok_state_script_data_escaped_end_tag_open
3302                         return
3303                 if is_uc_alpha(c)
3304                         temporary_buffer = c.toLowerCase() # yes, really
3305                         tok_state = tok_state_script_data_double_escape_start
3306                         return new_character_token "<#{c}" # fixfull split
3307                 if is_lc_alpha(c)
3308                         temporary_buffer = c
3309                         tok_state = tok_state_script_data_double_escape_start
3310                         return new_character_token "<#{c}" # fixfull split
3311                 # Anything else
3312                 tok_state = tok_state_script_data_escaped
3313                 cur -= 1 # Reconsume
3314                 return new_character_token c
3315
3316         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3317         tok_state_script_data_escaped_end_tag_open = ->
3318                 c = txt.charAt(cur++)
3319                 if is_uc_alpha(c)
3320                         tok_cur_tag = new_end_tag c.toLowerCase()
3321                         temporary_buffer += c
3322                         tok_state = tok_state_script_data_escaped_end_tag_name
3323                         return
3324                 if is_lc_alpha(c)
3325                         tok_cur_tag = new_end_tag c
3326                         temporary_buffer += c
3327                         tok_state = tok_state_script_data_escaped_end_tag_name
3328                         return
3329                 # Anything else
3330                 tok_state = tok_state_script_data_escaped
3331                 cur -= 1 # Reconsume
3332                 return new_character_token '</' # fixfull split
3333
3334         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3335         tok_state_script_data_escaped_end_tag_name = ->
3336                 c = txt.charAt(cur++)
3337                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3338                         if is_appropriate_end_tag tok_cur_tag
3339                                 tok_state = tok_state_before_attribute_name
3340                                 return
3341                         # fall through
3342                 if c is '/'
3343                         if is_appropriate_end_tag tok_cur_tag
3344                                 tok_state = tok_state_self_closing_start_tag
3345                                 return
3346                         # fall through
3347                 if c is '>'
3348                         if is_appropriate_end_tag tok_cur_tag
3349                                 tok_state = tok_state_data
3350                                 return tok_cur_tag
3351                         # fall through
3352                 if is_uc_alpha(c)
3353                         tok_cur_tag.name += c.toLowerCase()
3354                         temporary_buffer += c.toLowerCase()
3355                         return
3356                 if is_lc_alpha(c)
3357                         tok_cur_tag.name += c
3358                         temporary_buffer += c.toLowerCase()
3359                         return
3360                 # Anything else
3361                 tok_state = tok_state_script_data_escaped
3362                 cur -= 1 # Reconsume
3363                 return new_character_token "</#{temporary_buffer}" # fixfull split
3364
3365         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3366         tok_state_script_data_double_escape_start = ->
3367                 c = txt.charAt(cur++)
3368                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3369                         if temporary_buffer is 'script'
3370                                 tok_state = tok_state_script_data_double_escaped
3371                         else
3372                                 tok_state = tok_state_script_data_escaped
3373                         return new_character_token c
3374                 if is_uc_alpha(c)
3375                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3376                         return new_character_token c
3377                 if is_lc_alpha(c)
3378                         temporary_buffer += c
3379                         return new_character_token c
3380                 # Anything else
3381                 tok_state = tok_state_script_data_escaped
3382                 cur -= 1 # Reconsume
3383                 return
3384
3385         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3386         tok_state_script_data_double_escaped = ->
3387                 c = txt.charAt(cur++)
3388                 if c is '-'
3389                         tok_state = tok_state_script_data_double_escaped_dash
3390                         return new_character_token '-'
3391                 if c is '<'
3392                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3393                         return new_character_token '<'
3394                 if c is "\u0000"
3395                         parse_error()
3396                         return new_character_token "\ufffd"
3397                 if c is '' # EOF
3398                         parse_error()
3399                         tok_state = tok_state_data
3400                         cur -= 1 # Reconsume
3401                         return
3402                 # Anything else
3403                 return new_character_token c
3404
3405         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3406         tok_state_script_data_double_escaped_dash = ->
3407                 c = txt.charAt(cur++)
3408                 if c is '-'
3409                         tok_state = tok_state_script_data_double_escaped_dash_dash
3410                         return new_character_token '-'
3411                 if c is '<'
3412                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3413                         return new_character_token '<'
3414                 if c is "\u0000"
3415                         parse_error()
3416                         tok_state = tok_state_script_data_double_escaped
3417                         return new_character_token "\ufffd"
3418                 if c is '' # EOF
3419                         parse_error()
3420                         tok_state = tok_state_data
3421                         cur -= 1 # Reconsume
3422                         return
3423                 # Anything else
3424                 tok_state = tok_state_script_data_double_escaped
3425                 return new_character_token c
3426
3427         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3428         tok_state_script_data_double_escaped_dash_dash = ->
3429                 c = txt.charAt(cur++)
3430                 if c is '-'
3431                         return new_character_token '-'
3432                 if c is '<'
3433                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3434                         return new_character_token '<'
3435                 if c is '>'
3436                         tok_state = tok_state_script_data
3437                         return new_character_token '>'
3438                 if c is "\u0000"
3439                         parse_error()
3440                         tok_state = tok_state_script_data_double_escaped
3441                         return new_character_token "\ufffd"
3442                 if c is '' # EOF
3443                         parse_error()
3444                         tok_state = tok_state_data
3445                         cur -= 1 # Reconsume
3446                         return
3447                 # Anything else
3448                 tok_state = tok_state_script_data_double_escaped
3449                 return new_character_token c
3450
3451         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3452         tok_state_script_data_double_escaped_less_than_sign = ->
3453                 c = txt.charAt(cur++)
3454                 if c is '/'
3455                         temporary_buffer = ''
3456                         tok_state = tok_state_script_data_double_escape_end
3457                         return new_character_token '/'
3458                 # Anything else
3459                 tok_state = tok_state_script_data_double_escaped
3460                 cur -= 1 # Reconsume
3461                 return
3462
3463         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3464         tok_state_script_data_double_escape_end = ->
3465                 c = txt.charAt(cur++)
3466                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3467                         if temporary_buffer is 'script'
3468                                 tok_state = tok_state_script_data_escaped
3469                         else
3470                                 tok_state = tok_state_script_data_double_escaped
3471                         return new_character_token c
3472                 if is_uc_alpha(c)
3473                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3474                         return new_character_token c
3475                 if is_lc_alpha(c)
3476                         temporary_buffer += c
3477                         return new_character_token c
3478                 # Anything else
3479                 tok_state = tok_state_script_data_double_escaped
3480                 cur -= 1 # Reconsume
3481                 return
3482
3483         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3484         tok_state_before_attribute_name = ->
3485                 attr_name = null
3486                 switch c = txt.charAt(cur++)
3487                         when "\t", "\n", "\u000c", ' '
3488                                 return null
3489                         when '/'
3490                                 tok_state = tok_state_self_closing_start_tag
3491                                 return null
3492                         when '>'
3493                                 tok_state = tok_state_data
3494                                 tmp = tok_cur_tag
3495                                 tok_cur_tag = null
3496                                 return tmp
3497                         when "\u0000"
3498                                 parse_error()
3499                                 attr_name = "\ufffd"
3500                         when '"', "'", '<', '='
3501                                 parse_error()
3502                                 attr_name = c
3503                         when '' # EOF
3504                                 parse_error()
3505                                 tok_state = tok_state_data
3506                         else
3507                                 if is_uc_alpha(c)
3508                                         attr_name = c.toLowerCase()
3509                                 else
3510                                         attr_name = c
3511                 if attr_name?
3512                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3513                         tok_state = tok_state_attribute_name
3514                 return null
3515
3516         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3517         tok_state_attribute_name = ->
3518                 switch c = txt.charAt(cur++)
3519                         when "\t", "\n", "\u000c", ' '
3520                                 tok_state = tok_state_after_attribute_name
3521                         when '/'
3522                                 tok_state = tok_state_self_closing_start_tag
3523                         when '='
3524                                 tok_state = tok_state_before_attribute_value
3525                         when '>'
3526                                 tok_state = tok_state_data
3527                                 tmp = tok_cur_tag
3528                                 tok_cur_tag = null
3529                                 return tmp
3530                         when "\u0000"
3531                                 parse_error()
3532                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3533                         when '"', "'", '<'
3534                                 parse_error()
3535                                 tok_cur_tag.attrs_a[0][0] += c
3536                         when '' # EOF
3537                                 parse_error()
3538                                 tok_state = tok_state_data
3539                         else
3540                                 if is_uc_alpha(c)
3541                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3542                                 else
3543                                         tok_cur_tag.attrs_a[0][0] += c
3544                 return null
3545
3546         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3547         tok_state_after_attribute_name = ->
3548                 c = txt.charAt(cur++)
3549                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3550                         return
3551                 if c is '/'
3552                         tok_state = tok_state_self_closing_start_tag
3553                         return
3554                 if c is '='
3555                         tok_state = tok_state_before_attribute_value
3556                         return
3557                 if c is '>'
3558                         tok_state = tok_state_data
3559                         return
3560                 if is_uc_alpha(c)
3561                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3562                         tok_state = tok_state_attribute_name
3563                         return
3564                 if c is "\u0000"
3565                         parse_error()
3566                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3567                         tok_state = tok_state_attribute_name
3568                         return
3569                 if c is '' # EOF
3570                         parse_error()
3571                         tok_state = tok_state_data
3572                         cur -= 1 # reconsume
3573                         return
3574                 if c is '"' or c is "'" or c is '<'
3575                         parse_error()
3576                         # fall through to Anything else
3577                 # Anything else
3578                 tok_cur_tag.attrs_a.unshift [c, '']
3579                 tok_state = tok_state_attribute_name
3580
3581         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3582         tok_state_before_attribute_value = ->
3583                 switch c = txt.charAt(cur++)
3584                         when "\t", "\n", "\u000c", ' '
3585                                 return null
3586                         when '"'
3587                                 tok_state = tok_state_attribute_value_double_quoted
3588                         when '&'
3589                                 tok_state = tok_state_attribute_value_unquoted
3590                                 cur -= 1
3591                         when "'"
3592                                 tok_state = tok_state_attribute_value_single_quoted
3593                         when "\u0000"
3594                                 # Parse error
3595                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3596                                 tok_state = tok_state_attribute_value_unquoted
3597                         when '>'
3598                                 # Parse error
3599                                 tok_state = tok_state_data
3600                                 tmp = tok_cur_tag
3601                                 tok_cur_tag = null
3602                                 return tmp
3603                         when '' # EOF
3604                                 parse_error()
3605                                 tok_state = tok_state_data
3606                         else
3607                                 tok_cur_tag.attrs_a[0][1] += c
3608                                 tok_state = tok_state_attribute_value_unquoted
3609                 return null
3610
3611         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3612         tok_state_attribute_value_double_quoted = ->
3613                 switch c = txt.charAt(cur++)
3614                         when '"'
3615                                 tok_state = tok_state_after_attribute_value_quoted
3616                         when '&'
3617                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3618                         when "\u0000"
3619                                 # Parse error
3620                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3621                         when '' # EOF
3622                                 parse_error()
3623                                 tok_state = tok_state_data
3624                         else
3625                                 tok_cur_tag.attrs_a[0][1] += c
3626                 return null
3627
3628         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3629         tok_state_attribute_value_single_quoted = ->
3630                 switch c = txt.charAt(cur++)
3631                         when "'"
3632                                 tok_state = tok_state_after_attribute_value_quoted
3633                         when '&'
3634                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3635                         when "\u0000"
3636                                 # Parse error
3637                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3638                         when '' # EOF
3639                                 parse_error()
3640                                 tok_state = tok_state_data
3641                         else
3642                                 tok_cur_tag.attrs_a[0][1] += c
3643                 return null
3644
3645         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3646         tok_state_attribute_value_unquoted = ->
3647                 switch c = txt.charAt(cur++)
3648                         when "\t", "\n", "\u000c", ' '
3649                                 tok_state = tok_state_before_attribute_name
3650                         when '&'
3651                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3652                         when '>'
3653                                 tok_state = tok_state_data
3654                                 tmp = tok_cur_tag
3655                                 tok_cur_tag = null
3656                                 return tmp
3657                         when "\u0000"
3658                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3659                         when '' # EOF
3660                                 parse_error()
3661                                 tok_state = tok_state_data
3662                         else
3663                                 # Parse Error if ', <, = or ` (backtick)
3664                                 tok_cur_tag.attrs_a[0][1] += c
3665                 return null
3666
3667         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3668         tok_state_after_attribute_value_quoted = ->
3669                 switch c = txt.charAt(cur++)
3670                         when "\t", "\n", "\u000c", ' '
3671                                 tok_state = tok_state_before_attribute_name
3672                         when '/'
3673                                 tok_state = tok_state_self_closing_start_tag
3674                         when '>'
3675                                 tok_state = tok_state_data
3676                                 tmp = tok_cur_tag
3677                                 tok_cur_tag = null
3678                                 return tmp
3679                         when '' # EOF
3680                                 parse_error()
3681                                 tok_state = tok_state_data
3682                         else
3683                                 # Parse Error
3684                                 tok_state = tok_state_before_attribute_name
3685                                 cur -= 1 # we didn't handle that char
3686                 return null
3687
3688         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3689         tok_state_self_closing_start_tag = ->
3690                 c = txt.charAt(cur++)
3691                 if c is '>'
3692                         tok_cur_tag.flag 'self-closing'
3693                         tok_state = tok_state_data
3694                         return tok_cur_tag
3695                 if c is ''
3696                         parse_error()
3697                         tok_state = tok_state_data
3698                         cur -= 1 # Reconsume
3699                         return
3700                 # Anything else
3701                 parse_error()
3702                 tok_state = tok_state_before_attribute_name
3703                 cur -= 1 # Reconsume
3704                 return
3705
3706         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3707         # WARNING: put a comment token in tok_cur_tag before setting this state
3708         tok_state_bogus_comment = ->
3709                 next_gt = txt.indexOf '>', cur
3710                 if next_gt is -1
3711                         val = txt.substr cur
3712                         cur = txt.length
3713                 else
3714                         val = txt.substr cur, (next_gt - cur)
3715                         cur = next_gt + 1
3716                 val = val.replace "\u0000", "\ufffd"
3717                 tok_cur_tag.text += val
3718                 tok_state = tok_state_data
3719                 return tok_cur_tag
3720
3721         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3722         tok_state_markup_declaration_open = ->
3723                 if txt.substr(cur, 2) is '--'
3724                         cur += 2
3725                         tok_cur_tag = new_comment_token ''
3726                         tok_state = tok_state_comment_start
3727                         return
3728                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3729                         cur += 7
3730                         tok_state = tok_state_doctype
3731                         return
3732                 acn = adjusted_current_node()
3733                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3734                         cur += 7
3735                         tok_state = tok_state_cdata_section
3736                         return
3737                 # Otherwise
3738                 parse_error()
3739                 tok_cur_tag = new_comment_token ''
3740                 tok_state = tok_state_bogus_comment
3741                 return
3742
3743         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3744         tok_state_comment_start = ->
3745                 switch c = txt.charAt(cur++)
3746                         when '-'
3747                                 tok_state = tok_state_comment_start_dash
3748                         when "\u0000"
3749                                 parse_error()
3750                                 tok_state = tok_state_comment
3751                                 return new_character_token "\ufffd"
3752                         when '>'
3753                                 parse_error()
3754                                 tok_state = tok_state_data
3755                                 return tok_cur_tag
3756                         when '' # EOF
3757                                 parse_error()
3758                                 tok_state = tok_state_data
3759                                 cur -= 1 # Reconsume
3760                                 return tok_cur_tag
3761                         else
3762                                 tok_cur_tag.text += c
3763                                 tok_state = tok_state_comment
3764                 return null
3765
3766         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3767         tok_state_comment_start_dash = ->
3768                 switch c = txt.charAt(cur++)
3769                         when '-'
3770                                 tok_state = tok_state_comment_end
3771                         when "\u0000"
3772                                 parse_error()
3773                                 tok_cur_tag.text += "-\ufffd"
3774                                 tok_state = tok_state_comment
3775                         when '>'
3776                                 parse_error()
3777                                 tok_state = tok_state_data
3778                                 return tok_cur_tag
3779                         when '' # EOF
3780                                 parse_error()
3781                                 tok_state = tok_state_data
3782                                 cur -= 1 # Reconsume
3783                                 return tok_cur_tag
3784                         else
3785                                 tok_cur_tag.text += "-#{c}"
3786                                 tok_state = tok_state_comment
3787                 return null
3788
3789         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3790         tok_state_comment = ->
3791                 switch c = txt.charAt(cur++)
3792                         when '-'
3793                                 tok_state = tok_state_comment_end_dash
3794                         when "\u0000"
3795                                 parse_error()
3796                                 tok_cur_tag.text += "\ufffd"
3797                         when '' # EOF
3798                                 parse_error()
3799                                 tok_state = tok_state_data
3800                                 cur -= 1 # Reconsume
3801                                 return tok_cur_tag
3802                         else
3803                                 tok_cur_tag.text += c
3804                 return null
3805
3806         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3807         tok_state_comment_end_dash = ->
3808                 switch c = txt.charAt(cur++)
3809                         when '-'
3810                                 tok_state = tok_state_comment_end
3811                         when "\u0000"
3812                                 parse_error()
3813                                 tok_cur_tag.text += "-\ufffd"
3814                                 tok_state = tok_state_comment
3815                         when '' # EOF
3816                                 parse_error()
3817                                 tok_state = tok_state_data
3818                                 cur -= 1 # Reconsume
3819                                 return tok_cur_tag
3820                         else
3821                                 tok_cur_tag.text += "-#{c}"
3822                                 tok_state = tok_state_comment
3823                 return null
3824
3825         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3826         tok_state_comment_end = ->
3827                 switch c = txt.charAt(cur++)
3828                         when '>'
3829                                 tok_state = tok_state_data
3830                                 return tok_cur_tag
3831                         when "\u0000"
3832                                 parse_error()
3833                                 tok_cur_tag.text += "--\ufffd"
3834                                 tok_state = tok_state_comment
3835                         when '!'
3836                                 parse_error()
3837                                 tok_state = tok_state_comment_end_bang
3838                         when '-'
3839                                 parse_error()
3840                                 tok_cur_tag.text += '-'
3841                         when '' # EOF
3842                                 parse_error()
3843                                 tok_state = tok_state_data
3844                                 cur -= 1 # Reconsume
3845                                 return tok_cur_tag
3846                         else
3847                                 parse_error()
3848                                 tok_cur_tag.text += "--#{c}"
3849                                 tok_state = tok_state_comment
3850                 return null
3851
3852         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3853         tok_state_comment_end_bang = ->
3854                 switch c = txt.charAt(cur++)
3855                         when '-'
3856                                 tok_cur_tag.text += "--!#{c}"
3857                                 tok_state = tok_state_comment_end_dash
3858                         when '>'
3859                                 tok_state = tok_state_data
3860                                 return tok_cur_tag
3861                         when "\u0000"
3862                                 parse_error()
3863                                 tok_cur_tag.text += "--!\ufffd"
3864                                 tok_state = tok_state_comment
3865                         when '' # EOF
3866                                 parse_error()
3867                                 tok_state = tok_state_data
3868                                 cur -= 1 # Reconsume
3869                                 return tok_cur_tag
3870                         else
3871                                 tok_cur_tag.text += "--!#{c}"
3872                                 tok_state = tok_state_comment
3873                 return null
3874
3875         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3876         tok_state_doctype = ->
3877                 switch c = txt.charAt(cur++)
3878                         when "\t", "\u000a", "\u000c", ' '
3879                                 tok_state = tok_state_before_doctype_name
3880                         when '' # EOF
3881                                 parse_error()
3882                                 tok_state = tok_state_data
3883                                 el = new_doctype_token ''
3884                                 el.flag 'force-quirks', true
3885                                 cur -= 1 # Reconsume
3886                                 return el
3887                         else
3888                                 parse_error()
3889                                 tok_state = tok_state_before_doctype_name
3890                                 cur -= 1 # Reconsume
3891                 return null
3892
        # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3894         tok_state_before_doctype_name = ->
3895                 c = txt.charAt(cur++)
3896                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3897                         return
3898                 if is_uc_alpha(c)
3899                         tok_cur_tag = new_doctype_token c.toLowerCase()
3900                         tok_state = tok_state_doctype_name
3901                         return
3902                 if c is "\u0000"
3903                         parse_error()
3904                         tok_cur_tag = new_doctype_token "\ufffd"
3905                         tok_state = tok_state_doctype_name
3906                         return
3907                 if c is '>'
3908                         parse_error()
3909                         el = new_doctype_token ''
3910                         el.flag 'force-quirks', true
3911                         tok_state = tok_state_data
3912                         return el
3913                 if c is '' # EOF
3914                         parse_error()
3915                         tok_state = tok_state_data
3916                         el = new_doctype_token ''
3917                         el.flag 'force-quirks', true
3918                         cur -= 1 # Reconsume
3919                         return el
3920                 # Anything else
3921                 tok_cur_tag = new_doctype_token c
3922                 tok_state = tok_state_doctype_name
3923                 return null
3924
3925         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3926         tok_state_doctype_name = ->
3927                 c = txt.charAt(cur++)
3928                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3929                         tok_state = tok_state_after_doctype_name
3930                         return
3931                 if c is '>'
3932                         tok_state = tok_state_data
3933                         return tok_cur_tag
3934                 if is_uc_alpha(c)
3935                         tok_cur_tag.name += c.toLowerCase()
3936                         return
3937                 if c is "\u0000"
3938                         parse_error()
3939                         tok_cur_tag.name += "\ufffd"
3940                         return
3941                 if c is '' # EOF
3942                         parse_error()
3943                         tok_state = tok_state_data
3944                         tok_cur_tag.flag 'force-quirks', true
3945                         cur -= 1 # Reconsume
3946                         return tok_cur_tag
3947                 # Anything else
3948                 tok_cur_tag.name += c
3949                 return null
3950
3951         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3952         tok_state_after_doctype_name = ->
3953                 c = txt.charAt(cur++)
3954                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3955                         return
3956                 if c is '>'
3957                         tok_state = tok_state_data
3958                         return tok_cur_tag
3959                 if c is '' # EOF
3960                         parse_error()
3961                         tok_state = tok_state_data
3962                         tok_cur_tag.flag 'force-quirks', true
3963                         cur -= 1 # Reconsume
3964                         return tok_cur_tag
3965                 # Anything else
3966                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3967                         cur += 5
3968                         tok_state = tok_state_after_doctype_public_keyword
3969                         return
3970                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3971                         cur += 5
3972                         tok_state = tok_state_after_doctype_system_keyword
3973                         return
3974                 parse_error()
3975                 tok_cur_tag.flag 'force-quirks', true
3976                 tok_state = tok_state_bogus_doctype
3977                 return null
3978
3979         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3980         tok_state_after_doctype_public_keyword = ->
3981                 c = txt.charAt(cur++)
3982                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3983                         tok_state = tok_state_before_doctype_public_identifier
3984                         return
3985                 if c is '"'
3986                         parse_error()
3987                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3988                         tok_state = tok_state_doctype_public_identifier_double_quoted
3989                         return
3990                 if c is "'"
3991                         parse_error()
3992                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3993                         tok_state = tok_state_doctype_public_identifier_single_quoted
3994                         return
3995                 if c is '>'
3996                         parse_error()
3997                         tok_cur_tag.flag 'force-quirks', true
3998                         tok_state = tok_state_data
3999                         return tok_cur_tag
4000                 if c is '' # EOF
4001                         parse_error()
4002                         tok_state = tok_state_data
4003                         tok_cur_tag.flag 'force-quirks', true
4004                         cur -= 1 # Reconsume
4005                         return tok_cur_tag
4006                 # Anything else
4007                 parse_error()
4008                 tok_cur_tag.flag 'force-quirks', true
4009                 tok_state = tok_state_bogus_doctype
4010                 return null
4011
4012         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4013         tok_state_before_doctype_public_identifier = ->
4014                 c = txt.charAt(cur++)
4015                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4016                         return
4017                 if c is '"'
4018                         parse_error()
4019                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4020                         tok_state = tok_state_doctype_public_identifier_double_quoted
4021                         return
4022                 if c is "'"
4023                         parse_error()
4024                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4025                         tok_state = tok_state_doctype_public_identifier_single_quoted
4026                         return
4027                 if c is '>'
4028                         parse_error()
4029                         tok_cur_tag.flag 'force-quirks', true
4030                         tok_state = tok_state_data
4031                         return tok_cur_tag
4032                 if c is '' # EOF
4033                         parse_error()
4034                         tok_state = tok_state_data
4035                         tok_cur_tag.flag 'force-quirks', true
4036                         cur -= 1 # Reconsume
4037                         return tok_cur_tag
4038                 # Anything else
4039                 parse_error()
4040                 tok_cur_tag.flag 'force-quirks', true
4041                 tok_state = tok_state_bogus_doctype
4042                 return null
4043
4044
4045         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4046         tok_state_doctype_public_identifier_double_quoted = ->
4047                 c = txt.charAt(cur++)
4048                 if c is '"'
4049                         tok_state = tok_state_after_doctype_public_identifier
4050                         return
4051                 if c is "\u0000"
4052                         parse_error()
4053                         tok_cur_tag.public_identifier += "\ufffd"
4054                         return
4055                 if c is '>'
4056                         parse_error()
4057                         tok_cur_tag.flag 'force-quirks', true
4058                         tok_state = tok_state_data
4059                         return tok_cur_tag
4060                 if c is '' # EOF
4061                         parse_error()
4062                         tok_state = tok_state_data
4063                         tok_cur_tag.flag 'force-quirks', true
4064                         cur -= 1 # Reconsume
4065                         return tok_cur_tag
4066                 # Anything else
4067                 tok_cur_tag.public_identifier += c
4068                 return null
4069
4070         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4071         tok_state_doctype_public_identifier_single_quoted = ->
4072                 c = txt.charAt(cur++)
4073                 if c is "'"
4074                         tok_state = tok_state_after_doctype_public_identifier
4075                         return
4076                 if c is "\u0000"
4077                         parse_error()
4078                         tok_cur_tag.public_identifier += "\ufffd"
4079                         return
4080                 if c is '>'
4081                         parse_error()
4082                         tok_cur_tag.flag 'force-quirks', true
4083                         tok_state = tok_state_data
4084                         return tok_cur_tag
4085                 if c is '' # EOF
4086                         parse_error()
4087                         tok_state = tok_state_data
4088                         tok_cur_tag.flag 'force-quirks', true
4089                         cur -= 1 # Reconsume
4090                         return tok_cur_tag
4091                 # Anything else
4092                 tok_cur_tag.public_identifier += c
4093                 return null
4094
4095         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4096         tok_state_after_doctype_public_identifier = ->
4097                 c = txt.charAt(cur++)
4098                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4099                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4100                         return
4101                 if c is '>'
4102                         tok_state = tok_state_data
4103                         return tok_cur_tag
4104                 if c is '"'
4105                         parse_error()
4106                         tok_cur_tag.system_identifier = ''
4107                         tok_state = tok_state_doctype_system_identifier_double_quoted
4108                         return
4109                 if c is "'"
4110                         parse_error()
4111                         tok_cur_tag.system_identifier = ''
4112                         tok_state = tok_state_doctype_system_identifier_single_quoted
4113                         return
4114                 if c is '' # EOF
4115                         parse_error()
4116                         tok_state = tok_state_data
4117                         tok_cur_tag.flag 'force-quirks', true
4118                         cur -= 1 # Reconsume
4119                         return tok_cur_tag
4120                 # Anything else
4121                 parse_error()
4122                 tok_cur_tag.flag 'force-quirks', true
4123                 tok_state = tok_state_bogus_doctype
4124                 return null
4125
4126         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4127         tok_state_between_doctype_public_and_system_identifiers = ->
4128                 c = txt.charAt(cur++)
4129                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4130                         return
4131                 if c is '>'
4132                         tok_state = tok_state_data
4133                         return tok_cur_tag
4134                 if c is '"'
4135                         parse_error()
4136                         tok_cur_tag.system_identifier = ''
4137                         tok_state = tok_state_doctype_system_identifier_double_quoted
4138                         return
4139                 if c is "'"
4140                         parse_error()
4141                         tok_cur_tag.system_identifier = ''
4142                         tok_state = tok_state_doctype_system_identifier_single_quoted
4143                         return
4144                 if c is '' # EOF
4145                         parse_error()
4146                         tok_state = tok_state_data
4147                         tok_cur_tag.flag 'force-quirks', true
4148                         cur -= 1 # Reconsume
4149                         return tok_cur_tag
4150                 # Anything else
4151                 parse_error()
4152                 tok_cur_tag.flag 'force-quirks', true
4153                 tok_state = tok_state_bogus_doctype
4154                 return null
4155
4156         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4157         tok_state_after_doctype_system_keyword = ->
4158                 c = txt.charAt(cur++)
4159                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4160                         tok_state = tok_state_before_doctype_system_identifier
4161                         return
4162                 if c is '"'
4163                         parse_error()
4164                         tok_cur_tag.system_identifier = ''
4165                         tok_state = tok_state_doctype_system_identifier_double_quoted
4166                         return
4167                 if c is "'"
4168                         parse_error()
4169                         tok_cur_tag.system_identifier = ''
4170                         tok_state = tok_state_doctype_system_identifier_single_quoted
4171                         return
4172                 if c is '>'
4173                         parse_error()
4174                         tok_cur_tag.flag 'force-quirks', true
4175                         tok_state = tok_state_data
4176                         return tok_cur_tag
4177                 if c is '' # EOF
4178                         parse_error()
4179                         tok_state = tok_state_data
4180                         tok_cur_tag.flag 'force-quirks', true
4181                         cur -= 1 # Reconsume
4182                         return tok_cur_tag
4183                 # Anything else
4184                 parse_error()
4185                 tok_cur_tag.flag 'force-quirks', true
4186                 tok_state = tok_state_bogus_doctype
4187                 return null
4188
4189         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4190         tok_state_before_doctype_system_identifier = ->
4191                 c = txt.charAt(cur++)
4192                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4193                         return
4194                 if c is '"'
4195                         tok_cur_tag.system_identifier = ''
4196                         tok_state = tok_state_doctype_system_identifier_double_quoted
4197                         return
4198                 if c is "'"
4199                         tok_cur_tag.system_identifier = ''
4200                         tok_state = tok_state_doctype_system_identifier_single_quoted
4201                         return
4202                 if c is '>'
4203                         parse_error()
4204                         tok_cur_tag.flag 'force-quirks', true
4205                         tok_state = tok_state_data
4206                         return tok_cur_tag
4207                 if c is '' # EOF
4208                         parse_error()
4209                         tok_state = tok_state_data
4210                         tok_cur_tag.flag 'force-quirks', true
4211                         cur -= 1 # Reconsume
4212                         return tok_cur_tag
4213                 # Anything else
4214                 parse_error()
4215                 tok_cur_tag.flag 'force-quirks', true
4216                 tok_state = tok_state_bogus_doctype
4217                 return null
4218
4219         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4220         tok_state_doctype_system_identifier_double_quoted = ->
4221                 c = txt.charAt(cur++)
4222                 if c is '"'
4223                         tok_state = tok_state_after_doctype_system_identifier
4224                         return
4225                 if c is "\u0000"
4226                         parse_error()
4227                         tok_cur_tag.system_identifier += "\ufffd"
4228                         return
4229                 if c is '>'
4230                         parse_error()
4231                         tok_cur_tag.flag 'force-quirks', true
4232                         tok_state = tok_state_data
4233                         return tok_cur_tag
4234                 if c is '' # EOF
4235                         parse_error()
4236                         tok_state = tok_state_data
4237                         tok_cur_tag.flag 'force-quirks', true
4238                         cur -= 1 # Reconsume
4239                         return tok_cur_tag
4240                 # Anything else
4241                 tok_cur_tag.system_identifier += c
4242                 return null
4243
4244         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4245         tok_state_doctype_system_identifier_single_quoted = ->
4246                 c = txt.charAt(cur++)
4247                 if c is "'"
4248                         tok_state = tok_state_after_doctype_system_identifier
4249                         return
4250                 if c is "\u0000"
4251                         parse_error()
4252                         tok_cur_tag.system_identifier += "\ufffd"
4253                         return
4254                 if c is '>'
4255                         parse_error()
4256                         tok_cur_tag.flag 'force-quirks', true
4257                         tok_state = tok_state_data
4258                         return tok_cur_tag
4259                 if c is '' # EOF
4260                         parse_error()
4261                         tok_state = tok_state_data
4262                         tok_cur_tag.flag 'force-quirks', true
4263                         cur -= 1 # Reconsume
4264                         return tok_cur_tag
4265                 # Anything else
4266                 tok_cur_tag.system_identifier += c
4267                 return null
4268
4269         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4270         tok_state_after_doctype_system_identifier = ->
4271                 c = txt.charAt(cur++)
4272                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4273                         return
4274                 if c is '>'
4275                         tok_state = tok_state_data
4276                         return tok_cur_tag
4277                 if c is '' # EOF
4278                         parse_error()
4279                         tok_state = tok_state_data
4280                         tok_cur_tag.flag 'force-quirks', true
4281                         cur -= 1 # Reconsume
4282                         return tok_cur_tag
4283                 # Anything else
4284                 parse_error()
4285                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4286                 tok_state = tok_state_bogus_doctype
4287                 return null
4288
4289         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4290         tok_state_bogus_doctype = ->
4291                 c = txt.charAt(cur++)
4292                 if c is '>'
4293                         tok_state = tok_state_data
4294                         return tok_cur_tag
4295                 if c is '' # EOF
4296                         tok_state = tok_state_data
4297                         cur -= 1 # Reconsume
4298                         return tok_cur_tag
4299                 # Anything else
4300                 return null
4301
4302         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4303         tok_state_cdata_section = ->
4304                 tok_state = tok_state_data
4305                 next_gt = txt.indexOf ']]>', cur
4306                 if next_gt is -1
4307                         val = txt.substr cur
4308                         cur = txt.length
4309                 else
4310                         val = txt.substr cur, (next_gt - cur)
4311                         cur = next_gt + 3
4312                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4313                 val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4314                 val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4315                 return new_character_token val # fixfull split
4316
4317         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4318         # Don't set this as a state, just call it
4319         # returns a string (NOT a text node)
4320         parse_character_reference = (allowed_char = null, in_attr = false) ->
4321                 if cur >= txt.length
4322                         return '&'
4323                 switch c = txt.charAt(cur)
4324                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4325                                 # explicitly not a parse error
4326                                 return '&'
4327                         when ';'
4328                                 # there has to be "one or more" alnums between & and ; to be a parse error
4329                                 return '&'
4330                         when '#'
4331                                 if cur + 1 >= txt.length
4332                                         return '&'
4333                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4334                                         prefix = '#x'
4335                                         charset = hex_chars
4336                                         start = cur + 2
4337                                 else
4338                                         charset = digits
4339                                         start = cur + 1
4340                                         prefix = '#'
4341                                 i = 0
4342                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4343                                         i += 1
4344                                 if i is 0
4345                                         return '&'
4346                                 if txt.charAt(start + i) is ';'
4347                                         i += 1
4348                                 # FIXME This is supposed to generate parse errors for some chars
4349                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4350                                 if decoded?
4351                                         cur = start + i
4352                                         return decoded
4353                                 return '&'
4354                         else
4355                                 for i in [0...31]
4356                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4357                                                 break
4358                                 if i is 0
4359                                         # exit early, because parse_error() below needs at least one alnum
4360                                         return '&'
4361                                 if txt.charAt(cur + i) is ';'
4362                                         i += 1 # include ';' terminator in value
4363                                         decoded = decode_named_char_ref txt.substr(cur, i)
4364                                         if decoded?
4365                                                 cur += i
4366                                                 return decoded
4367                                         parse_error()
4368                                         return '&'
4369                                 else
4370                                         # no ';' terminator (only legacy char refs)
4371                                         max = i
4372                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4373                                                 c = legacy_char_refs[txt.substr(cur, i)]
4374                                                 if c?
4375                                                         if in_attr
4376                                                                 if txt.charAt(cur + i) is '='
4377                                                                         # "because some legacy user agents will
4378                                                                         # misinterpret the markup in those cases"
4379                                                                         parse_error()
4380                                                                         return '&'
4381                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4382                                                                         # this makes attributes forgiving about url args
4383                                                                         return '&'
4384                                                         # ok, and besides the weird exceptions for attributes...
4385                                                         # return the matching char
4386                                                         cur += i # consume entity chars
4387                                                         parse_error() # because no terminating ";"
4388                                                         return c
4389                                         parse_error()
4390                                         return '&'
4391                 return # never reached
4392
4393         # tree constructor initialization
4394         # see comments on TYPE_TAG/etc for the structure of this data
4395         txt = args.html
4396         cur = 0
4397         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4398         open_els = []
4399         afe = [] # active formatting elements
4400         template_ins_modes = []
4401         ins_mode = ins_mode_initial
4402         original_ins_mode = ins_mode # TODO check spec
4403         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4404         flag_frameset_ok = true
4405         flag_parsing = true
4406         flag_foster_parenting = false
4407         form_element_pointer = null
4408         temporary_buffer = null
4409         pending_table_character_tokens = []
4410         head_element_pointer = null
4411         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4412         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4413
4414         # tokenizer initialization
4415         tok_state = tok_state_data
4416
4417         if args.name is "one_that_breaks #1"
4418                 throw "hi" # console.log "hi"
4419         # proccess input
4420         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4421         while flag_parsing
4422                 t = tok_state()
4423                 if t?
4424                         process_token t
4425                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4426         return doc.children
4427
serialize_els = (els, shallow, show_ids) ->
        # Serializes every element of els (by calling its serialize method with
        # the given shallow and show_ids arguments) and returns the results
        # joined by commas. Returns '' when els is empty.
        return ("#{t.serialize shallow, show_ids}" for t in els).join ','
4436
# Public interface of this module: the parser entry point, the debug log
# helpers, the node type constants and the namespace constants.
exported = {parse_html, debug_log_reset, debug_log_each, TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE, NS_HTML, NS_MATHML, NS_SVG}
for export_name, export_value of exported
        module.exports[export_name] = export_value