JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
31a46f413fc2447fd4e1d21b03531d9f5ae71c66
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
54         window.wheic = {}
55         module = exports: window.wheic
56
57 # Each node is an obect of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
60 TYPE_COMMENT = 2
61 TYPE_DOCTYPE = 3
62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
65 TYPE_EOF = 6
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
68
69 # namespace constants
70 NS_HTML = 1
71 NS_MATHML = 2
72 NS_SVG = 3
73
74 g_debug_log = []
75 debug_log_reset = ->
76         g_debug_log = []
77 debug_log = (str) ->
78         g_debug_log.push str
79 debug_log_each = (cb) ->
80         for str in g_debug_log
81                 cb str
82
83 prev_node_id = 0
84 class Node
85         constructor: (type, args = {}) ->
86                 @type = type # one of the TYPE_* constants above
87                 @name = args.name ? '' # tag name
88                 @text = args.text ? '' # contents for text/comment nodes
89                 @attrs = args.attrs ? {}
90                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91                 @children = args.children ? []
92                 @namespace = args.namespace ? NS_HTML
93                 @parent = args.parent ? null
94                 @token = args.token ? null
95                 @flags = args.flags ? {}
96                 if args.id?
97                         @id = "#{args.id}+"
98                 else
99                         @id = "#{++prev_node_id}"
100         acknowledge_self_closing: ->
101                 if @token?
102                         @token.flag 'did_self_close'
103                 else
104                         @flag 'did_self_close', true
105         flag: (key, value = null) ->
106                 if value?
107                         @flags[key] = value
108                 else
109                         return @flags[key]
110         serialize: (shallow = false, show_ids = false) -> # for unit tests
111                 ret = ''
112                 switch @type
113                         when TYPE_TAG
114                                 ret += 'tag:'
115                                 ret += JSON.stringify @name
116                                 ret += ','
117                                 if show_ids
118                                         ret += "##{@id},"
119                                 if shallow
120                                         break
121                                 attr_keys = []
122                                 for k of @attrs
123                                         attr_keys.push k
124                                 attr_keys.sort()
125                                 ret += '{'
126                                 sep = ''
127                                 for k in attr_keys
128                                         ret += sep
129                                         sep = ','
130                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
131                                 ret += '},['
132                                 sep = ''
133                                 for c in @children
134                                         ret += sep
135                                         sep = ','
136                                         ret += c.serialize shallow, show_ids
137                                 ret += ']'
138                         when TYPE_TEXT
139                                 ret += 'text:'
140                                 ret += JSON.stringify @text
141                         when TYPE_COMMENT
142                                 ret += 'comment:'
143                                 ret += JSON.stringify @text
144                         when TYPE_DOCTYPE
145                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
146                         when TYPE_AFE_MARKER
147                                 ret += 'marker'
148                         when TYPE_AAA_BOOKMARK
149                                 ret += 'aaa_bookmark'
150                         else
151                                 ret += 'unknown:'
152                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
153                 return ret
154
155 # helpers: (only take args that are normally known when parser creates nodes)
156 new_open_tag = (name) ->
157         return new Node TYPE_START_TAG, name: name
158 new_end_tag = (name) ->
159         return new Node TYPE_END_TAG, name: name
160 new_element = (name) ->
161         return new Node TYPE_TAG, name: name
162 new_text_node = (txt) ->
163         return new Node TYPE_TEXT, text: txt
164 new_character_token = new_text_node
165 new_comment_token = (txt) ->
166         return new Node TYPE_COMMENT, text: txt
167 new_doctype_token = (name) ->
168         return new Node TYPE_DOCTYPE, name: name
169 new_eof_token = ->
170         return new Node TYPE_EOF
171 new_afe_marker = ->
172         return new Node TYPE_AFE_MARKER
173 new_aaa_bookmark = ->
174         return new Node TYPE_AAA_BOOKMARK
175
176 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
177 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
178 digits = "0123456789"
179 alnum = lc_alpha + uc_alpha + digits
180 hex_chars = digits + "abcdefABCDEF"
181
182 is_uc_alpha = (str) ->
183         return str.length is 1 and uc_alpha.indexOf(str) > -1
184 is_lc_alpha = (str) ->
185         return str.length is 1 and lc_alpha.indexOf(str) > -1
186
187 # some SVG elements have dashes in them
188 tag_name_chars = alnum + "-"
189
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
192 is_space = (txt) ->
193         return txt.length is 1 and space_chars.indexOf(txt) > -1
194 is_space_tok = (t) ->
195         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
196
197 is_input_hidden_tok = (t) ->
198         return unless t.type is TYPE_START_TAG
199         for a of t.attrs_a
200                 if a[0] is 'type'
201                         if a[1].toLowerCase() is 'hidden'
202                                 return true
203                         return false
204         return false
205
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
208
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
211 legacy_char_refs = {
212         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
229         yen: '¥', yuml: 'ÿ'
230 }
231
232 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
233 raw_text_elements = ['script', 'style']
234 escapable_raw_text_elements = ['textarea', 'title']
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
236 svg_elements = [
237         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
251         'view', 'vkern'
252 ]
253
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
255 mathml_elements = [
256         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262         'determinant', 'diff', 'divergence', 'divide', 'domain',
263         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283         'vectorproduct', 'xor'
284 ]
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
287
288 special_elements = {
289         # HTML:
290         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307         wbr:NS_HTML, xmp:NS_HTML,
308
309         # MathML:
310         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311         'annotation-xml':NS_MATHML,
312
313         # SVG:
314         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
315 }
316
317 formatting_elements = {
318          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
320          u: true
321 }
322
323 mathml_text_integration = {
324         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
325 }
326 is_mathml_text_integration_point = (el) ->
327         return mathml_text_integration[el.name] = el.namespace
328 is_html_integration = (el) -> # DON'T PASS A TOKEN
329         if el.namespace is NS_MATHML and el.name is 'annotation-xml'
330                 if el.attrs.encoding?
331                         if el.attrs.encoding.toLowerCase() is 'text/html'
332                                 return true
333                         if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
334                                 return true
335                 return false
336         if el.namespace is NS_SVG
337                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
338                         return true
339         return false
340
341 h_tags = {
342         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
343 }
344
345 # FIXME namespacify
346 foster_parenting_targets = {
347         table: true
348         tbody: true
349         tfoot: true
350         thead: true
351         tr: true
352 }
353
354 # FIXME namespacify
355 # all html I presume
356 end_tag_implied = {
357         dd: true
358         dt: true
359         li: true
360         option: true
361         optgroup: true
362         p: true
363         rb: true
364         rp: true
365         rt: true
366         rtc: true
367 }
368
369 el_is_special = (e) ->
370         return special_elements[e.name] is e.namespace
371
372 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
373 el_is_special_not_adp = (el) ->
374         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
375
376 svg_name_fixes = {
377         altglyph: 'altGlyph'
378         altglyphdef: 'altGlyphDef'
379         altglyphitem: 'altGlyphItem'
380         animatecolor: 'animateColor'
381         animatemotion: 'animateMotion'
382         animatetransform: 'animateTransform'
383         clippath: 'clipPath'
384         feblend: 'feBlend'
385         fecolormatrix: 'feColorMatrix'
386         fecomponenttransfer: 'feComponentTransfer'
387         fecomposite: 'feComposite'
388         feconvolvematrix: 'feConvolveMatrix'
389         fediffuselighting: 'feDiffuseLighting'
390         fedisplacementmap: 'feDisplacementMap'
391         fedistantlight: 'feDistantLight'
392         fedropshadow: 'feDropShadow'
393         feflood: 'feFlood'
394         fefunca: 'feFuncA'
395         fefuncb: 'feFuncB'
396         fefuncg: 'feFuncG'
397         fefuncr: 'feFuncR'
398         fegaussianblur: 'feGaussianBlur'
399         feimage: 'feImage'
400         femerge: 'feMerge'
401         femergenode: 'feMergeNode'
402         femorphology: 'feMorphology'
403         feoffset: 'feOffset'
404         fepointlight: 'fePointLight'
405         fespecularlighting: 'feSpecularLighting'
406         fespotlight: 'feSpotLight'
407         fetile: 'feTile'
408         feturbulence: 'feTurbulence'
409         foreignobject: 'foreignObject'
410         glyphref: 'glyphRef'
411         lineargradient: 'linearGradient'
412         radialgradient: 'radialGradient'
413         textpath: 'textPath'
414 }
415 svg_attribute_fixes = {
416         attributename: 'attributeName'
417         attributetype: 'attributeType'
418         basefrequency: 'baseFrequency'
419         baseprofile: 'baseProfile'
420         calcmode: 'calcMode'
421         clippathunits: 'clipPathUnits'
422         contentscripttype: 'contentScriptType'
423         contentstyletype: 'contentStyleType'
424         diffuseconstant: 'diffuseConstant'
425         edgemode: 'edgeMode'
426         externalresourcesrequired: 'externalResourcesRequired'
427         filterres: 'filterRes'
428         filterunits: 'filterUnits'
429         glyphref: 'glyphRef'
430         gradienttransform: 'gradientTransform'
431         gradientunits: 'gradientUnits'
432         kernelmatrix: 'kernelMatrix'
433         kernelunitlength: 'kernelUnitLength'
434         keypoints: 'keyPoints'
435         keysplines: 'keySplines'
436         keytimes: 'keyTimes'
437         lengthadjust: 'lengthAdjust'
438         limitingconeangle: 'limitingConeAngle'
439         markerheight: 'markerHeight'
440         markerunits: 'markerUnits'
441         markerwidth: 'markerWidth'
442         maskcontentunits: 'maskContentUnits'
443         maskunits: 'maskUnits'
444         numoctaves: 'numOctaves'
445         pathlength: 'pathLength'
446         patterncontentunits: 'patternContentUnits'
447         patterntransform: 'patternTransform'
448         patternunits: 'patternUnits'
449         pointsatx: 'pointsAtX'
450         pointsaty: 'pointsAtY'
451         pointsatz: 'pointsAtZ'
452         preservealpha: 'preserveAlpha'
453         preserveaspectratio: 'preserveAspectRatio'
454         primitiveunits: 'primitiveUnits'
455         refx: 'refX'
456         refy: 'refY'
457         repeatcount: 'repeatCount'
458         repeatdur: 'repeatDur'
459         requiredextensions: 'requiredExtensions'
460         requiredfeatures: 'requiredFeatures'
461         specularconstant: 'specularConstant'
462         specularexponent: 'specularExponent'
463         spreadmethod: 'spreadMethod'
464         startoffset: 'startOffset'
465         stddeviation: 'stdDeviation'
466         stitchtiles: 'stitchTiles'
467         surfacescale: 'surfaceScale'
468         systemlanguage: 'systemLanguage'
469         tablevalues: 'tableValues'
470         targetx: 'targetX'
471         targety: 'targetY'
472         textlength: 'textLength'
473         viewbox: 'viewBox'
474         viewtarget: 'viewTarget'
475         xchannelselector: 'xChannelSelector'
476         ychannelselector: 'yChannelSelector'
477         zoomandpan: 'zoomAndPan'
478 }
479 adjust_mathml_attributes = (t) ->
480         for a in t.attrs_a
481                 if a[0] is 'definitionurl'
482                         a[0] = 'definitionURL'
483         return
484 adjust_svg_attributes = (t) ->
485         for a in t.attrs_a
486                 if svg_attribute_fixes[a[0]]?
487                         a[0] = svg_attribute_fixes[a[0]]
488         return
489 adjust_foreign_attributes = (t) ->
490         # fixfull
491         return
492
493 # decode_named_char_ref()
494 #
495 # The list of named character references is _huge_ so ask the browser to decode
496 # for us instead of wasting bandwidth/space on including the table here.
497 #
498 # Pass without the "&" but with the ";" examples:
499 #    for "&amp" pass "amp;"
500 #    for "&#x2032" pass "x2032;"
501 g_dncr = {
502         cache: {}
503         textarea: document.createElement('textarea')
504 }
505 # TODO test this in IE8
506 decode_named_char_ref = (txt) ->
507         txt = "&#{txt}"
508         decoded = g_dncr.cache[txt]
509         return decoded if decoded?
510         g_dncr.textarea.innerHTML = txt
511         decoded = g_dncr.textarea.value
512         return null if decoded is txt
513         return g_dncr.cache[txt] = decoded
514
515 parse_html = (txt, parse_error_cb = null) ->
516         cur = 0 # index of next char in txt to be parsed
517         # declare doc and tokenizer variables so they're in scope below
518         doc = null
519         open_els = null # stack of open elements
520         afe = null # active formatting elements
521         template_ins_modes = null
522         ins_mode = null
523         original_ins_mode = null
524         tok_state = null
525         tok_cur_tag = null # partially parsed tag
526         flag_scripting = null
527         flag_frameset_ok = null
528         flag_parsing = null
529         flag_foster_parenting = null
530         form_element_pointer = null
531         temporary_buffer = null
532         pending_table_character_tokens = null
533         head_element_pointer = null
534         flag_fragment_parsing = null
535         context_element = null
536
537         stop_parsing = ->
538                 flag_parsing = false
539
540         parse_error = ->
541                 if parse_error_cb?
542                         parse_error_cb cur
543                 else
544                         console.log "Parse error at character #{cur} of #{txt.length}"
545
546         afe_push = (new_el) ->
547                 matches = 0
548                 for el, i in afe
549                         if el.name is new_el.name and el.namespace is new_el.namespace
550                                 for k, v of el.attrs
551                                         continue unless new_el.attrs[k] is v
552                                 for k, v of new_el.attrs
553                                         continue unless el.attrs[k] is v
554                                 matches += 1
555                                 if matches is 3
556                                         afe.splice i, 1
557                                         break
558                 afe.unshift new_el
559         afe_push_marker = ->
560                 afe.unshift new_afe_marker()
561
562         # the functions below impliment the Tree Contstruction algorithm
563         # http://www.w3.org/TR/html5/syntax.html#tree-construction
564
565         # But first... the helpers
566         template_tag_is_open = ->
567                 for t in open_els
568                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
569                                 return true
570                 return false
571         is_in_scope_x = (tag_name, scope, namespace) ->
572                 for t in open_els
573                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
574                                 return true
575                         if scope[t.name] is t.namespace
576                                 return false
577                 return false
578         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
579                 for t in open_els
580                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
581                                 return true
582                         if scope[t.name] is t.namespace
583                                 return false
584                         if scope2[t.name] is t.namespace
585                                 return false
586                 return false
587         standard_scopers = {
588                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
589                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
590                 template: NS_HTML, mi: NS_MATHML,
591
592                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
593                 'annotation-xml': NS_MATHML,
594
595                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
596         }
597         button_scopers = button: NS_HTML
598         li_scopers = ol: NS_HTML, ul: NS_HTML
599         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
600         is_in_scope = (tag_name, namespace = null) ->
601                 return is_in_scope_x tag_name, standard_scopers, namespace
602         is_in_button_scope = (tag_name, namespace = null) ->
603                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
604         is_in_table_scope = (tag_name, namespace = null) ->
605                 return is_in_scope_x tag_name, table_scopers, namespace
606         # aka is_in_list_item_scope
607         is_in_li_scope = (tag_name, namespace = null) ->
608                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
609         is_in_select_scope = (tag_name, namespace = null) ->
610                 for t in open_els
611                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
612                                 return true
613                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
614                                 return false
615                 return false
616         # this checks for a particular element, not by name
617         el_is_in_scope = (el) ->
618                 for t in open_els
619                         if t is el
620                                 return true
621                         if standard_scopers[t.name] is t.namespace
622                                 return false
623                 return false
624
625         clear_to_table_stopers = {
626                 'table': true
627                 'template': true
628                 'html': true
629         }
630         clear_stack_to_table_context = ->
631                 loop
632                         if clear_to_table_stopers[open_els[0].name]?
633                                 break
634                         open_els.shift()
635                 return
636         clear_to_table_body_stopers = {
637                 'tbody': true
638                 'tfoot': true
639                 'thead': true
640                 'template': true
641                 'html': true
642         }
643         clear_stack_to_table_body_context = ->
644                 loop
645                         if clear_to_table_body_stopers[open_els[0].name]?
646                                 break
647                         open_els.shift()
648                 return
649         clear_to_table_row_stopers = {
650                 'tr': true
651                 'template': true
652                 'html': true
653         }
654         clear_stack_to_table_row_context = ->
655                 loop
656                         if clear_to_table_row_stopers[open_els[0].name]?
657                                 break
658                         open_els.shift()
659                 return
660         clear_afe_to_marker = ->
661                 loop
662                         return unless afe.length > 0 # this happens in fragment case, ?spec error
663                         el = afe.shift()
664                         if el.type is TYPE_AFE_MARKER
665                                 return
666                 return
667
668         # 8.2.3.1 ...
669         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
670         reset_ins_mode = ->
671                 # 1. Let last be false.
672                 last = false
673                 # 2. Let node be the last node in the stack of open elements.
674                 node_i = 0
675                 node = open_els[node_i]
676                 # 3. Loop: If node is the first node in the stack of open elements,
677                 # then set last to true, and, if the parser was originally created as
678                 # part of the HTML fragment parsing algorithm (fragment case) set node
679                 # to the context element.
680                 loop
681                         if node_i is open_els.length - 1
682                                 last = true
683                                 # fixfull (fragment case)
684
685                         # 4. If node is a select element, run these substeps:
686                         if node.name is 'select'
687                                 # 1. If last is true, jump to the step below labeled done.
688                                 unless last
689                                         # 2. Let ancestor be node.
690                                         ancestor_i = node_i
691                                         ancestor = node
692                                         # 3. Loop: If ancestor is the first node in the stack of
693                                         # open elements, jump to the step below labeled done.
694                                         loop
695                                                 if ancestor_i is open_els.length - 1
696                                                         break
697                                                 # 4. Let ancestor be the node before ancestor in the stack
698                                                 # of open elements.
699                                                 ancestor_i += 1
700                                                 ancestor = open_els[ancestor_i]
701                                                 # 5. If ancestor is a template node, jump to the step below
702                                                 # labeled done.
703                                                 if ancestor.name is 'template'
704                                                         break
705                                                 # 6. If ancestor is a table node, switch the insertion mode
706                                                 # to "in select in table" and abort these steps.
707                                                 if ancestor.name is 'table'
708                                                         ins_mode = ins_mode_in_select_in_table
709                                                         return
710                                                 # 7. Jump back to the step labeled loop.
711                                 # 8. Done: Switch the insertion mode to "in select" and abort
712                                 # these steps.
713                                 ins_mode = ins_mode_in_select
714                                 return
715                         # 5. If node is a td or th element and last is false, then switch
716                         # the insertion mode to "in cell" and abort these steps.
717                         if (node.name is 'td' or node.name is 'th') and last is false
718                                 ins_mode = ins_mode_in_cell
719                                 return
720                         # 6. If node is a tr element, then switch the insertion mode to "in
721                         # row" and abort these steps.
722                         if node.name is 'tr'
723                                 ins_mode = ins_mode_in_row
724                                 return
725                         # 7. If node is a tbody, thead, or tfoot element, then switch the
726                         # insertion mode to "in table body" and abort these steps.
727                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
728                                 ins_mode = ins_mode_in_table_body
729                                 return
730                         # 8. If node is a caption element, then switch the insertion mode
731                         # to "in caption" and abort these steps.
732                         if node.name is 'caption'
733                                 ins_mode = ins_mode_in_caption
734                                 return
735                         # 9. If node is a colgroup element, then switch the insertion mode
736                         # to "in column group" and abort these steps.
737                         if node.name is 'colgroup'
738                                 ins_mode = ins_mode_in_column_group
739                                 return
740                         # 10. If node is a table element, then switch the insertion mode to
741                         # "in table" and abort these steps.
742                         if node.name is 'table'
743                                 ins_mode = ins_mode_in_table
744                                 return
745                         # 11. If node is a template element, then switch the insertion mode
746                         # to the current template insertion mode and abort these steps.
747                         # fixfull (template insertion mode stack)
748
749                         # 12. If node is a head element and last is true, then switch the
750                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
751                         # these steps. (fragment case)
752                         if node.name is 'head' and last
753                                 ins_mode = ins_mode_in_body
754                                 return
755                         # 13. If node is a head element and last is false, then switch the
756                         # insertion mode to "in head" and abort these steps.
757                         if node.name is 'head' and last is false
758                                 ins_mode = ins_mode_in_head
759                                 return
760                         # 14. If node is a body element, then switch the insertion mode to
761                         # "in body" and abort these steps.
762                         if node.name is 'body'
763                                 ins_mode = ins_mode_in_body
764                                 return
765                         # 15. If node is a frameset element, then switch the insertion mode
766                         # to "in frameset" and abort these steps. (fragment case)
767                         if node.name is 'frameset'
768                                 ins_mode = ins_mode_in_frameset
769                                 return
770                         # 16. If node is an html element, run these substeps:
771                         if node.name is 'html'
772                                 # 1. If the head element pointer is null, switch the insertion
773                                 # mode to "before head" and abort these steps. (fragment case)
774                                 if head_element_pointer is null
775                                         ins_mode = ins_mode_before_head
776                                 else
777                                         # 2. Otherwise, the head element pointer is not null,
778                                         # switch the insertion mode to "after head" and abort these
779                                         # steps.
780                                         ins_mode = ins_mode_after_head
781                                 return
782                         # 17. If last is true, then switch the insertion mode to "in body"
783                         # and abort these steps. (fragment case)
784                         if last
785                                 ins_mode = ins_mode_in_body
786                                 return
787                         # 18. Let node now be the node before node in the stack of open
788                         # elements.
789                         node_i += 1
790                         node = open_els[node_i]
791                         # 19. Return to the step labeled loop.
792
793         # 8.2.3.2
794
795         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
796         adjusted_current_node = ->
797                 if open_els.length is 1 and flag_fragment_parsing
798                         return context_element
799                 return open_els[0]
800
801         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
802         # this implementation is structured (mostly) as described at the link above.
803         # capitalized comments are the "labels" described at the link above.
804         reconstruct_afe = ->
805                 return if afe.length is 0
806                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
807                         return
808                 # Rewind
809                 i = 0
810                 loop
811                         if i is afe.length - 1
812                                 break
813                         i += 1
814                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
815                                 i -= 1 # Advance
816                                 break
817                 # Create
818                 loop
819                         el = insert_html_element afe[i].token
820                         afe[i] = el
821                         break if i is 0
822                         i -= 1 # Advance
823
824         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
825         # adoption agency algorithm
826         # overview here:
827         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
828         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
829         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
830         adoption_agency = (subject) ->
831                 debug_log "adoption_agency()"
832                 debug_log "tree: #{serialize_els doc.children, false, true}"
833                 debug_log "open_els: #{serialize_els open_els, true, true}"
834                 debug_log "afe: #{serialize_els afe, true, true}"
835                 if open_els[0].name is subject
836                         el = open_els[0]
837                         open_els.shift()
838                         # remove it from the list of active formatting elements (if found)
839                         for t, i in afe
840                                 if t is el
841                                         afe.splice i, 1
842                                         break
843                         debug_log "aaa: starting off with subject on top of stack, exiting"
844                         return
845                 outer = 0
846                 loop
847                         if outer >= 8
848                                 return
849                         outer += 1
850                         # 5. Let formatting element be the last element in the list of
851                         # active formatting elements that: is between the end of the list
852                         # and the last scope marker in the list, if any, or the start of
853                         # the list otherwise, and  has the tag name subject.
854                         fe = null
855                         for t, fe_of_afe in afe
856                                 if t.type is TYPE_AFE_MARKER
857                                         break
858                                 if t.name is subject
859                                         fe = t
860                                         break
861                         # If there is no such element, then abort these steps and instead
862                         # act as described in the "any other end tag" entry above.
863                         if fe is null
864                                 debug_log "aaa: fe not found in afe"
865                                 in_body_any_other_end_tag subject
866                                 return
867                         # 6. If formatting element is not in the stack of open elements,
868                         # then this is a parse error; remove the element from the list, and
869                         # abort these steps.
870                         in_open_els = false
871                         for t, fe_of_open_els in open_els
872                                 if t is fe
873                                         in_open_els = true
874                                         break
875                         unless in_open_els
876                                 debug_log "aaa: fe not found in open_els"
877                                 parse_error()
878                                 # "remove it from the list" must mean afe, since it's not in open_els
879                                 afe.splice fe_of_afe, 1
880                                 return
881                         # 7. If formatting element is in the stack of open elements, but
882                         # the element is not in scope, then this is a parse error; abort
883                         # these steps.
884                         unless el_is_in_scope fe
885                                 debug_log "aaa: fe not in scope"
886                                 parse_error()
887                                 return
888                         # 8. If formatting element is not the current node, this is a parse
889                         # error. (But do not abort these steps.)
890                         unless open_els[0] is fe
891                                 parse_error()
892                                 # continue
893                         # 9. Let furthest block be the topmost node in the stack of open
894                         # elements that is lower in the stack than formatting element, and
895                         # is an element in the special category. There might not be one.
896                         fb = null
897                         fb_of_open_els = null
898                         for t, i in open_els
899                                 if t is fe
900                                         break
901                                 if el_is_special t
902                                         fb = t
903                                         fb_of_open_els = i
904                                         # and continue, to see if there's one that's more "topmost"
905                         # 10. If there is no furthest block, then the UA must first pop all
906                         # the nodes from the bottom of the stack of open elements, from the
907                         # current node up to and including formatting element, then remove
908                         # formatting element from the list of active formatting elements,
909                         # and finally abort these steps.
910                         if fb is null
911                                 debug_log "aaa: no fb"
912                                 loop
913                                         t = open_els.shift()
914                                         if t is fe
915                                                 afe.splice fe_of_afe, 1
916                                                 return
917                         # 11. Let common ancestor be the element immediately above
918                         # formatting element in the stack of open elements.
919                         ca = open_els[fe_of_open_els + 1] # common ancestor
920
921                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
922                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
923                         bookmark = new_aaa_bookmark()
924                         for t, i in afe
925                                 if t is fe
926                                         afe.splice i, 0, bookmark
927                                         break
928                         node = last_node = fb
929                         inner = 0
930                         loop
931                                 inner += 1
932                                 # 3. Let node be the element immediately above node in the
933                                 # stack of open elements, or if node is no longer in the stack
934                                 # of open elements (e.g. because it got removed by this
935                                 # algorithm), the element that was immediately above node in
936                                 # the stack of open elements before node was removed.
937                                 node_next = null
938                                 for t, i in open_els
939                                         if t is node
940                                                 node_next = open_els[i + 1]
941                                                 break
942                                 node = node_next ? node_above
943                                 debug_log "inner loop #{inner}"
944                                 debug_log "tree: #{serialize_els doc.children, false, true}"
945                                 debug_log "open_els: #{serialize_els open_els, true, true}"
946                                 debug_log "afe: #{serialize_els afe, true, true}"
947                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
948                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
949                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
950                                 debug_log "node: #{node.serialize true, true}"
951                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
952
953                                 # 4. If node is formatting element, then go to the next step in
954                                 # the overall algorithm.
955                                 if node is fe
956                                         break
957                                 debug_log "the meat"
958                                 # 5. If inner loop counter is greater than three and node is in
959                                 # the list of active formatting elements, then remove node from
960                                 # the list of active formatting elements.
961                                 node_in_afe = false
962                                 for t, i in afe
963                                         if t is node
964                                                 if inner > 3
965                                                         afe.splice i, 1
966                                                         debug_log "max out inner"
967                                                 else
968                                                         node_in_afe = true
969                                                         debug_log "in afe"
970                                                 break
971                                 # 6. If node is not in the list of active formatting elements,
972                                 # then remove node from the stack of open elements and then go
973                                 # back to the step labeled inner loop.
974                                 unless node_in_afe
975                                         debug_log "not in afe"
976                                         for t, i in open_els
977                                                 if t is node
978                                                         node_above = open_els[i + 1]
979                                                         open_els.splice i, 1
980                                                         break
981                                         continue
982                                 debug_log "the bones"
983                                 # 7. create an element for the token for which the element node
984                                 # was created, in the HTML namespace, with common ancestor as
985                                 # the intended parent; replace the entry for node in the list
986                                 # of active formatting elements with an entry for the new
987                                 # element, replace the entry for node in the stack of open
988                                 # elements with an entry for the new element, and let node be
989                                 # the new element.
990                                 new_node = token_to_element node.token, NS_HTML, ca
991                                 for t, i in afe
992                                         if t is node
993                                                 afe[i] = new_node
994                                                 debug_log "replaced in afe"
995                                                 break
996                                 for t, i in open_els
997                                         if t is node
998                                                 node_above = open_els[i + 1]
999                                                 open_els[i] = new_node
1000                                                 debug_log "replaced in open_els"
1001                                                 break
1002                                 node = new_node
1003                                 # 8. If last node is furthest block, then move the
1004                                 # aforementioned bookmark to be immediately after the new node
1005                                 # in the list of active formatting elements.
1006                                 if last_node is fb
1007                                         for t, i in afe
1008                                                 if t is bookmark
1009                                                         afe.splice i, 1
1010                                                         debug_log "removed bookmark"
1011                                                         break
1012                                         for t, i in afe
1013                                                 if t is node
1014                                                         # "after" means lower
1015                                                         afe.splice i, 0, bookmark # "after as <-
1016                                                         debug_log "placed bookmark after node"
1017                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1018                                                         break
1019                                 # 9. Insert last node into node, first removing it from its
1020                                 # previous parent node if any.
1021                                 if last_node.parent?
1022                                         debug_log "last_node has parent"
1023                                         for c, i in last_node.parent.children
1024                                                 if c is last_node
1025                                                         debug_log "removing last_node from parent"
1026                                                         last_node.parent.children.splice i, 1
1027                                                         break
1028                                 node.children.push last_node
1029                                 last_node.parent = node
1030                                 # 10. Let last node be node.
1031                                 last_node = node
1032                                 debug_log "at last"
1033                                 # 11. Return to the step labeled inner loop.
1034                         # 14. Insert whatever last node ended up being in the previous step
1035                         # at the appropriate place for inserting a node, but using common
1036                         # ancestor as the override target.
1037
1038                         # In the case where fe is immediately followed by fb:
1039                         #   * inner loop exits out early (node==fe)
1040                         #   * last_node is fb
1041                         #   * last_node is still in the tree (not a duplicate)
1042                         if last_node.parent?
1043                                 debug_log "FEFIRST? last_node has parent"
1044                                 for c, i in last_node.parent.children
1045                                         if c is last_node
1046                                                 debug_log "removing last_node from parent"
1047                                                 last_node.parent.children.splice i, 1
1048                                                 break
1049
1050                         debug_log "after aaa inner loop"
1051                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1052                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1053                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1054                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1055                         debug_log "tree: #{serialize_els doc.children, false, true}"
1056
1057                         debug_log "insert"
1058
1059
1060                         # can't use standard insert token thing, because it's already in
1061                         # open_els and must stay at it's current position in open_els
1062                         dest = adjusted_insertion_location ca
1063                         dest[0].children.splice dest[1], 0, last_node
1064                         last_node.parent = dest[0]
1065
1066
1067                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1068                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1069                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1070                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1071                         debug_log "tree: #{serialize_els doc.children, false, true}"
1072
1073                         # 15. Create an element for the token for which formatting element
1074                         # was created, in the HTML namespace, with furthest block as the
1075                         # intended parent.
1076                         new_element = token_to_element fe.token, NS_HTML, fb
1077                         # 16. Take all of the child nodes of furthest block and append them
1078                         # to the element created in the last step.
1079                         while fb.children.length
1080                                 t = fb.children.shift()
1081                                 t.parent = new_element
1082                                 new_element.children.push t
1083                         # 17. Append that new element to furthest block.
1084                         new_element.parent = fb
1085                         fb.children.push new_element
1086                         # 18. Remove formatting element from the list of active formatting
1087                         # elements, and insert the new element into the list of active
1088                         # formatting elements at the position of the aforementioned
1089                         # bookmark.
1090                         for t, i in afe
1091                                 if t is fe
1092                                         afe.splice i, 1
1093                                         break
1094                         for t, i in afe
1095                                 if t is bookmark
1096                                         afe[i] = new_element
1097                                         break
1098                         # 19. Remove formatting element from the stack of open elements,
1099                         # and insert the new element into the stack of open elements
1100                         # immediately below the position of furthest block in that stack.
1101                         for t, i in open_els
1102                                 if t is fe
1103                                         open_els.splice i, 1
1104                                         break
1105                         for t, i in open_els
1106                                 if t is fb
1107                                         open_els.splice i, 0, new_element
1108                                         break
1109                         # 20. Jump back to the step labeled outer loop.
1110                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1111                         debug_log "tree: #{serialize_els doc.children, false, true}"
1112                         debug_log "open_els: #{serialize_els open_els, true, true}"
1113                         debug_log "afe: #{serialize_els afe, true, true}"
1114                 debug_log "AAA DONE"
1115
1116         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1117         close_p_element = ->
1118                 generate_implied_end_tags 'p' # arg is exception
1119                 if open_els[0].name isnt 'p'
1120                         parse_error()
1121                 while open_els.length > 1 # just in case
1122                         el = open_els.shift()
1123                         if el.name is 'p'
1124                                 return
1125         close_p_if_in_button_scope = ->
1126                 if is_in_button_scope 'p'
1127                         close_p_element()
1128
1129         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1130         # aka insert_a_character = (t) ->
1131         insert_character = (t) ->
1132                 dest = adjusted_insertion_location()
1133                 # fixfull check for Document node
1134                 if dest[1] > 0
1135                         prev = dest[0].children[dest[1] - 1]
1136                         if prev.type is TYPE_TEXT
1137                                 prev.text += t.text
1138                                 return
1139                 dest[0].children.splice dest[1], 0, t
1140
1141
1142         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1143         process_token = (t) ->
1144                 acn = adjusted_current_node()
1145                 unless acn?
1146                         ins_mode t
1147                         return
1148                 if acn.namespace is NS_HTML
1149                         ins_mode t
1150                         return
1151                 if is_mathml_text_integration_point(acn)
1152                         if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1153                                 ins_mode t
1154                                 return
1155                         if t.type is TYPE_TEXT
1156                                 ins_mode t
1157                                 return
1158                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1159                         ins_mode t
1160                         return
1161                 if is_html_integration acn
1162                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1163                                 ins_mode t
1164                                 return
1165                 if t.type is TYPE_EOF
1166                         ins_mode t
1167                         return
1168                 in_foreign_content t
1169                 return
1170
1171         # 8.2.5.1
1172         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1173         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1174         adjusted_insertion_location = (override_target = null) ->
1175                 # 1. If there was an override target specified, then let target be the
1176                 # override target.
1177                 if override_target?
1178                         target = override_target
1179                 else # Otherwise, let target be the current node.
1180                         target = open_els[0]
1181                 # 2. Determine the adjusted insertion location using the first matching
1182                 # steps from the following list:
1183                 #
1184                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1185                 # thead, or tr element Foster parenting happens when content is
1186                 # misnested in tables.
1187                 if flag_foster_parenting and foster_parenting_targets[target.name]
1188                         loop # once. this is here so we can ``break`` to "abort these substeps"
1189                                 # 1. Let last template be the last template element in the
1190                                 # stack of open elements, if any.
1191                                 last_template = null
1192                                 last_template_i = null
1193                                 for el, i in open_els
1194                                         if el.name is 'template'
1195                                                 last_template = el
1196                                                 last_template_i = i
1197                                                 break
1198                                 # 2. Let last table be the last table element in the stack of
1199                                 # open elements, if any.
1200                                 last_table = null
1201                                 last_table_i
1202                                 for el, i in open_els
1203                                         if el.name is 'table'
1204                                                 last_table = el
1205                                                 last_table_i = i
1206                                                 break
1207                                 # 3. If there is a last template and either there is no last
1208                                 # table, or there is one, but last template is lower (more
1209                                 # recently added) than last table in the stack of open
1210                                 # elements, then: let adjusted insertion location be inside
1211                                 # last template's template contents, after its last child (if
1212                                 # any), and abort these substeps.
1213                                 if last_template and (last_table is null or last_template_i < last_table_i)
1214                                         target = last_template # fixfull should be it's contents
1215                                         target_i = target.children.length
1216                                         break
1217                                 # 4. If there is no last table, then let adjusted insertion
1218                                 # location be inside the first element in the stack of open
1219                                 # elements (the html element), after its last child (if any),
1220                                 # and abort these substeps. (fragment case)
1221                                 if last_table is null
1222                                         # this is odd
1223                                         target = open_els[open_els.length - 1]
1224                                         target_i = target.children.length
1225                                 # 5. If last table has a parent element, then let adjusted
1226                                 # insertion location be inside last table's parent element,
1227                                 # immediately before last table, and abort these substeps.
1228                                 if last_table.parent?
1229                                         for c, i in last_table.parent.children
1230                                                 if c is last_table
1231                                                         target = last_table.parent
1232                                                         target_i = i
1233                                                         break
1234                                         break
1235                                 # 6. Let previous element be the element immediately above last
1236                                 # table in the stack of open elements.
1237                                 #
1238                                 # huh? how could it not have a parent?
1239                                 previous_element = open_els[last_table_i + 1]
1240                                 # 7. Let adjusted insertion location be inside previous
1241                                 # element, after its last child (if any).
1242                                 target = previous_element
1243                                 target_i = target.children.length
1244                                 # Note: These steps are involved in part because it's possible
1245                                 # for elements, the table element in this case in particular,
1246                                 # to have been moved by a script around in the DOM, or indeed
1247                                 # removed from the DOM entirely, after the element was inserted
1248                                 # by the parser.
1249                                 break # don't really loop
1250                 else
1251                         # Otherwise Let adjusted insertion location be inside target, after
1252                         # its last child (if any).
1253                         target_i = target.children.length
1254
1255                 # 3. If the adjusted insertion location is inside a template element,
1256                 # let it instead be inside the template element's template contents,
1257                 # after its last child (if any).
1258                 # fixfull (template)
1259
1260                 # 4. Return the adjusted insertion location.
1261                 return [target, target_i]
1262
1263         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1264         # aka create_an_element_for_token
1265         token_to_element = (t, namespace, intended_parent) ->
1266                 # convert attributes into a hash
1267                 attrs = {}
1268                 for a in t.attrs_a
1269                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1270                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1271
1272                 # TODO 2. If the newly created element has an xmlns attribute in the
1273                 # XMLNS namespace whose value is not exactly the same as the element's
1274                 # namespace, that is a parse error. Similarly, if the newly created
1275                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1276                 # value is not the XLink Namespace, that is a parse error.
1277
1278                 # fixfull: the spec says stuff about form pointers and ownerDocument
1279
1280                 return el
1281
1282         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1283         insert_foreign_element = (token, namespace) ->
1284                 ail = adjusted_insertion_location()
1285                 ail_el = ail[0]
1286                 ail_i = ail[1]
1287                 el = token_to_element token, namespace, ail_el
1288                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1289                 el.parent = ail_el
1290                 ail_el.children.splice ail_i, 0, el
1291                 open_els.unshift el
1292                 return el
1293         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1294         insert_html_element = (token) ->
1295                 insert_foreign_element token, NS_HTML
1296
1297         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1298         # position should be [node, index_within_children]
1299         insert_comment = (t, position = null) ->
1300                 position ?= adjusted_insertion_location()
1301                 position[0].children.splice position[1], 0, t
1302
1303         # 8.2.5.2
1304         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1305         parse_generic_raw_text = (t) ->
1306                 insert_html_element t
1307                 tok_state = tok_state_rawtext
1308                 original_ins_mode = ins_mode
1309                 ins_mode = ins_mode_text
1310         parse_generic_rcdata_text = (t) ->
1311                 insert_html_element t
1312                 tok_state = tok_state_rcdata
1313                 original_ins_mode = ins_mode
1314                 ins_mode = ins_mode_text
1315
1316         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1317         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1318         generate_implied_end_tags = (except = null) ->
1319                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1320                         open_els.shift()
1321
1322         # 8.2.5.4 The rules for parsing tokens in HTML content
1323         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1324
1325         # 8.2.5.4.1 The "initial" insertion mode
1326         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1327         ins_mode_initial = (t) ->
1328                 if is_space_tok t
1329                         return
1330                 if t.type is TYPE_COMMENT
1331                         # ?fixfull
1332                         doc.children.push t
1333                         return
1334                 if t.type is TYPE_DOCTYPE
1335                         # FIXME check identifiers, set quirks, etc
1336                         # fixfull
1337                         doc.children.push t
1338                         ins_mode = ins_mode_before_html
1339                         return
1340                 # Anything else
1341                 #fixfull (iframe, quirks)
1342                 ins_mode = ins_mode_before_html
1343                 process_token t
1344                 return
1345
1346         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1347         ins_mode_before_html = (t) ->
1348                 if t.type is TYPE_DOCTYPE
1349                         parse_error()
1350                         return
1351                 if t.type is TYPE_COMMENT
1352                         doc.children.push t
1353                         return
1354                 if is_space_tok t
1355                         return
1356                 if t.type is TYPE_START_TAG and t.name is 'html'
1357                         el = token_to_element t, NS_HTML, doc
1358                         doc.children.push el
1359                         open_els.unshift(el)
1360                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1361                         ins_mode = ins_mode_before_head
1362                         return
1363                 if t.type is TYPE_END_TAG
1364                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1365                                 # fall through to "anything else"
1366                         else
1367                                 parse_error()
1368                                 return
1369                 # Anything else
1370                 html_tok = new_open_tag 'html'
1371                 el = token_to_element html_tok, NS_HTML, doc
1372                 doc.children.push el
1373                 open_els.unshift el
1374                 # ?fixfull browsing context
1375                 ins_mode = ins_mode_before_head
1376                 process_token t
1377                 return
1378
1379         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1380         ins_mode_before_head = (t) ->
1381                 if is_space_tok t
1382                         return
1383                 if t.type is TYPE_COMMENT
1384                         insert_comment t
1385                         return
1386                 if t.type is TYPE_DOCTYPE
1387                         parse_error()
1388                         return
1389                 if t.type is TYPE_START_TAG and t.name is 'html'
1390                         ins_mode_in_body t
1391                         return
1392                 if t.type is TYPE_START_TAG and t.name is 'head'
1393                         el = insert_html_element t
1394                         head_element_pointer = el
1395                         ins_mode = ins_mode_in_head
1396                 if t.type is TYPE_END_TAG
1397                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1398                                 # fall through to Anything else below
1399                         else
1400                                 parse_error()
1401                                 return
1402                 # Anything else
1403                 head_tok = new_open_tag 'head'
1404                 el = insert_html_element head_tok
1405                 head_element_pointer = el
1406                 ins_mode = ins_mode_in_head
1407                 process_token t
1408
1409         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1410         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1411                 open_els.shift() # spec says this will be a 'head' node
1412                 ins_mode = ins_mode_after_head
1413                 process_token t
1414         ins_mode_in_head = (t) ->
1415                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1416                         insert_character t
1417                         return
1418                 if t.type is TYPE_COMMENT
1419                         insert_comment t
1420                         return
1421                 if t.type is TYPE_DOCTYPE
1422                         parse_error()
1423                         return
1424                 if t.type is TYPE_START_TAG and t.name is 'html'
1425                         ins_mode_in_body t
1426                         return
1427                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1428                         el = insert_html_element t
1429                         open_els.shift()
1430                         t.acknowledge_self_closing()
1431                         return
1432                 if t.type is TYPE_START_TAG and t.name is 'meta'
1433                         el = insert_html_element t
1434                         open_els.shift()
1435                         t.acknowledge_self_closing()
1436                         # fixfull encoding stuff
1437                         return
1438                 if t.type is TYPE_START_TAG and t.name is 'title'
1439                         parse_generic_rcdata_text t
1440                         return
1441                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1442                         parse_generic_raw_text t
1443                         return
1444                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1445                         insert_html_element t
1446                         ins_mode = ins_mode_in_head_noscript
1447                         return
1448                 if t.type is TYPE_START_TAG and t.name is 'script'
1449                         ail = adjusted_insertion_location()
1450                         el = token_to_element t, NS_HTML, ail
1451                         el.flag 'parser-inserted', true
1452                         # fixfull frament case
1453                         ail[0].children.splice ail[1], 0, el
1454                         open_els.unshift el
1455                         tok_state = tok_state_script_data
1456                         original_ins_mode = ins_mode # make sure orig... is defined
1457                         ins_mode = ins_mode_text
1458                         return
1459                 if t.type is TYPE_END_TAG and t.name is 'head'
1460                         open_els.shift() # will be a head element... spec says so
1461                         ins_mode = ins_mode_after_head
1462                         return
1463                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1464                         ins_mode_in_head_else t
1465                         return
1466                 if t.type is TYPE_START_TAG and t.name is 'template'
1467                         insert_html_element t
1468                         afe_push_marker()
1469                         flag_frameset_ok = false
1470                         ins_mode = ins_mode_in_template
1471                         template_ins_modes.unshift ins_mode_in_template
1472                         return
1473                 if t.type is TYPE_END_TAG and t.name is 'template'
1474                         if template_tag_is_open()
1475                                 generate_implied_end_tags
1476                                 if open_els[0].name isnt 'template'
1477                                         parse_error()
1478                                 loop
1479                                         el = open_els.shift()
1480                                         if el.name is 'template'
1481                                                 break
1482                                 clear_afe_to_marker()
1483                                 template_ins_modes.shift()
1484                                 reset_ins_mode()
1485                         else
1486                                 parse_error()
1487                         return
1488                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1489                         parse_error()
1490                         return
1491                 ins_mode_in_head_else t
1492
1493         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1494         ins_mode_in_head_noscript_else = (t) ->
1495                 parse_error()
1496                 open_els.shift()
1497                 ins_mode = ins_mode_in_head
1498                 process_token t
1499         ins_mode_in_head_noscript = (t) ->
1500                 if t.type is TYPE_DOCTYPE
1501                         parse_error()
1502                         return
1503                 if t.type is TYPE_START_TAG
1504                         ins_mode_in_body t
1505                         return
1506                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1507                         open_els.shift()
1508                         ins_mode = ins_mode_in_head
1509                         return
1510                 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1511                         ins_mode_in_head t
1512                         return
1513                 if t.type is TYPE_END_TAG and t.name is 'br'
1514                         ins_mode_in_head_noscript_else t
1515                         return
1516                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1517                         parse_error()
1518                         return
1519                 # Anything else
1520                 ins_mode_in_head_noscript_else t
1521                 return
1522
1523
1524
1525         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1526         ins_mode_after_head_else = (t) ->
1527                 body_tok = new_open_tag 'body'
1528                 insert_html_element body_tok
1529                 ins_mode = ins_mode_in_body
1530                 process_token t
1531                 return
1532         ins_mode_after_head = (t) ->
1533                 if is_space_tok t
1534                         insert_character t
1535                         return
1536                 if t.type is TYPE_COMMENT
1537                         insert_comment t
1538                         return
1539                 if t.type is TYPE_DOCTYPE
1540                         parse_error()
1541                         return
1542                 if t.type is TYPE_START_TAG and t.name is 'html'
1543                         ins_mode_in_body t
1544                         return
1545                 if t.type is TYPE_START_TAG and t.name is 'body'
1546                         insert_html_element t
1547                         flag_frameset_ok = false
1548                         ins_mode = ins_mode_in_body
1549                         return
1550                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1551                         insert_html_element t
1552                         ins_mode = ins_mode_in_frameset
1553                         return
1554                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1555                         parse_error()
1556                         open_els.unshift head_element_pointer
1557                         ins_mode_in_head t
1558                         for el, i of open_els
1559                                 if el is head_element_pointer
1560                                         open_els.splice i, 1
1561                                         return
1562                         console.log "warning: 23904 couldn't find head element in open_els"
1563                         return
1564                 if t.type is TYPE_END_TAG and t.name is 'template'
1565                         ins_mode_in_head t
1566                         return
1567                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1568                         ins_mode_after_head_else t
1569                         return
1570                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1571                         parse_error()
1572                         return
1573                 # Anything else
1574                 ins_mode_after_head_else t
1575
1576         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1577         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1578                 for el, i in open_els
1579                         if el.namespace is NS_HTML and el.name is name
1580                                 generate_implied_end_tags name # arg is exception
1581                                 parse_error() unless i is 0
1582                                 while i >= 0
1583                                         open_els.shift()
1584                                         i -= 1
1585                                 return
1586                         if special_elements[el.name] is el.namespace
1587                                 parse_error()
1588                                 return
1589                 return
1590         ins_mode_in_body = (t) ->
1591                 if t.type is TYPE_TEXT and t.text is "\u0000"
1592                         parse_error()
1593                         return
1594                 if is_space_tok t
1595                         reconstruct_afe()
1596                         insert_character t
1597                         return
1598                 if t.type is TYPE_TEXT
1599                         reconstruct_afe()
1600                         insert_character t
1601                         flag_frameset_ok = false
1602                         return
1603                 if t.type is TYPE_COMMENT
1604                         insert_comment t
1605                         return
1606                 if t.type is TYPE_DOCTYPE
1607                         parse_error()
1608                         return
1609                 if t.type is TYPE_START_TAG and t.name is 'html'
1610                         parse_error()
1611                         return if template_tag_is_open()
1612                         root_attrs = open_els[open_els.length - 1].attrs
1613                         for a of t.attrs_a
1614                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1615                         return
1616
1617                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1618                         ins_mode_in_head t
1619                         return
1620                 if t.type is TYPE_START_TAG and t.name is 'body'
1621                         parse_error()
1622                         return if open_els.length < 2
1623                         second = open_els[open_els.length - 2]
1624                         return unless second.ns is NS_HTML
1625                         return unless second.name is 'body'
1626                         return if template_tag_is_open()
1627                         frameset_ok_flag = false
1628                         for a of t.attrs_a
1629                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1630                         return
1631                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1632                         parse_error()
1633                         return if open_els.length < 2
1634                         second_i = open_els.length - 2
1635                         second = open_els[second_i]
1636                         return unless second.ns is NS_HTML
1637                         return unless second.name is 'body'
1638                         flag_frameset_ok = false
1639                         if second.parent?
1640                                 for el, i in second.parent.children
1641                                         if el is second
1642                                                 second.parent.children.splice i, 1
1643                                                 break
1644                         open_els.splice second_i, 1
1645                         # pop everything except the "root html element"
1646                         while open_els.length > 1
1647                                 open_els.shift()
1648                         insert_html_element t
1649                         ins_mode = ins_mode_in_frameset
1650                         return
1651                 if t.type is TYPE_EOF
1652                         ok_tags = {
1653                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1654                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1655                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1656                         }
1657                         for el in open_els
1658                                 unless ok_tags[t.name] is el.namespace
1659                                         parse_error()
1660                                         break
1661                         if template_ins_modes.length > 0
1662                                 ins_mode_in_template t
1663                         else
1664                                 stop_parsing()
1665                         return
1666                 if t.type is TYPE_END_TAG and t.name is 'body'
1667                         unless is_in_scope 'body'
1668                                 parse_error()
1669                                 return
1670                         ok_tags = {
1671                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1672                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1673                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1674                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1675                                 html:NS_HTML
1676                         }
1677                         for el in open_els
1678                                 unless ok_tags[t.name] is el.namespace
1679                                         parse_error()
1680                                         break
1681                         ins_mode = ins_mode_after_body
1682                         return
1683                 if t.type is TYPE_END_TAG and t.name is 'html'
1684                         unless is_in_scope 'body'
1685                                 parse_error()
1686                                 return
1687                         ok_tags = {
1688                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1689                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1690                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1691                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1692                                 html:NS_HTML
1693                         }
1694                         for el in open_els
1695                                 unless ok_tags[t.name] is el.namespace
1696                                         parse_error()
1697                                         break
1698                         ins_mode = ins_mode_after_body
1699                         process_token t
1700                         return
1701                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1702                         close_p_if_in_button_scope()
1703                         insert_html_element t
1704                         return
1705                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1706                         close_p_if_in_button_scope()
1707                         if h_tags[open_els[0]] is NS_HTML
1708                                 parse_error()
1709                                 open_els.shift()
1710                         insert_html_element t
1711                         return
1712                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1713                         close_p_if_in_button_scope()
1714                         insert_html_element t
1715                         # spec: If the next token is a "LF" (U+000A) character token, then
1716                         # ignore that token and move on to the next one. (Newlines at the
1717                         # start of pre blocks are ignored as an authoring convenience.)
1718                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1719                                 cur += 1
1720                         flag_frameset_ok = false
1721                         return
1722                 if t.type is TYPE_START_TAG and t.name is 'form'
1723                         unless form_element_pointer is null or template_tag_is_open()
1724                                 parse_error()
1725                                 return
1726                         close_p_if_in_button_scope()
1727                         el = insert_html_element t
1728                         unless template_tag_is_open()
1729                                 form_element_pointer = el
1730                         return
1731                 if t.type is TYPE_START_TAG and t.name is 'li'
1732                         flag_frameset_ok = false
1733                         for node in open_els
1734                                 if node.name is 'li' and node.namespace is NS_HTML
1735                                         generate_implied_end_tags 'li' # arg is exception
1736                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1737                                                 parse_error()
1738                                         loop
1739                                                 el = open_els.shift()
1740                                                 if el.name is 'li' and el.namespace is NS_HTML
1741                                                         break
1742                                         break
1743                                 if el_is_special_not_adp node
1744                                                 break
1745                         close_p_if_in_button_scope()
1746                         insert_html_element t
1747                         return
1748                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1749                         flag_frameset_ok = false
1750                         for node in open_els
1751                                 if node.name is 'dd' and node.namespace is NS_HTML
1752                                         generate_implied_end_tags 'dd' # arg is exception
1753                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1754                                                 parse_error()
1755                                         loop
1756                                                 el = open_els.shift()
1757                                                 if el.name is 'dd' and el.namespace is NS_HTML
1758                                                         break
1759                                         break
1760                                 if node.name is 'dt' and node.namespace is NS_HTML
1761                                         generate_implied_end_tags 'dt' # arg is exception
1762                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1763                                                 parse_error()
1764                                         loop
1765                                                 el = open_els.shift()
1766                                                 if el.name is 'dt' and el.namespace is NS_HTML
1767                                                         break
1768                                         break
1769                                 if el_is_special_not_adp node
1770                                         break
1771                         close_p_if_in_button_scope()
1772                         insert_html_element t
1773                         return
1774                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1775                         close_p_if_in_button_scope()
1776                         insert_html_element t
1777                         tok_state = tok_state_plaintext
1778                         return
1779                 if t.type is TYPE_START_TAG and t.name is 'button'
1780                         if is_in_scope 'button', NS_HTML
1781                                 parse_error()
1782                                 generate_implied_end_tags()
1783                                 loop
1784                                         el = open_els.shift()
1785                                         if el.name is 'button' and el.namespace is NS_HTML
1786                                                 break
1787                         reconstruct_afe()
1788                         insert_html_element t
1789                         flag_frameset_ok = false
1790                         return
1791                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1792                         unless is_in_scope t.name, NS_HTML
1793                                 parse_error()
1794                                 return
1795                         generate_implied_end_tags()
1796                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1797                                 parse_error()
1798                         loop
1799                                 el = open_els.shift()
1800                                 if el.name is t.name and el.namespace is NS_HTML
1801                                         return
1802                         return
1803                 if t.type is TYPE_END_TAG and t.name is 'form'
1804                         unless template_tag_is_open()
1805                                 node = form_element_pointer
1806                                 form_element_pointer = null
1807                                 if node is null or not el_is_in_scope node
1808                                         parse_error()
1809                                         return
1810                                 generate_implied_end_tags()
1811                                 if open_els[0] isnt node
1812                                         parse_error()
1813                                 for el, i in open_els
1814                                         if el is node
1815                                                 open_els.splice i, 1
1816                                                 break
1817                         else
1818                                 unless is_in_scope 'form', NS_HTML
1819                                         parse_error()
1820                                         return
1821                                 generate_implied_end_tags()
1822                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1823                                         parse_error()
1824                                 loop
1825                                         el = open_els.shift()
1826                                         if el.name is 'form' and el.namespace is NS_HTML
1827                                                 break
1828                         return
1829                 if t.type is TYPE_END_TAG and t.name is 'p'
1830                         unless is_in_button_scope 'p', NS_HTML
1831                                 parse_error()
1832                                 insert_html_element new_open_tag 'p'
1833                         close_p_element()
1834                         return
1835                 if t.type is TYPE_END_TAG and t.name is 'li'
1836                         unless is_in_li_scope 'li', NS_HTML
1837                                 parse_error()
1838                                 return
1839                         generate_implied_end_tags 'li' # arg is exception
1840                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1841                                 parse_error()
1842                         loop
1843                                 el = open_els.shift()
1844                                 if el.name is 'li' and el.namespace is NS_HTML
1845                                         break
1846                         return
1847                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1848                         unless is_in_scope t.name, NS_HTML
1849                                 parse_error()
1850                                 return
1851                         generate_implied_end_tags t.name # arg is exception
1852                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1853                                 parse_error()
1854                         loop
1855                                 el = open_els.shift()
1856                                 if el.name is t.name and el.namespace is NS_HTML
1857                                         break
1858                         return
1859                 if t.type is TYPE_END_TAG and h_tags[t.name]?
1860                         h_in_scope = false
1861                         for el in open_els
1862                                 if h_tags[el.name] is el.namespace
1863                                         h_in_scope = true
1864                                         break
1865                                 if standard_scopers[el.name] is el.namespace
1866                                         break
1867                         unless h_in_scope
1868                                 parse_error()
1869                                 return
1870                         generate_implied_end_tags()
1871                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1872                                 parse_error()
1873                         loop
1874                                 el = open_els.shift()
1875                                 if h_tags[el.name] is el.namespace
1876                                         break
1877                         return
1878                 # deep breath!
1879                 if t.type is TYPE_START_TAG and t.name is 'a'
1880                         # If the list of active formatting elements contains an a element
1881                         # between the end of the list and the last marker on the list (or
1882                         # the start of the list if there is no marker on the list), then
1883                         # this is a parse error; run the adoption agency algorithm for the
1884                         # tag name "a", then remove that element from the list of active
1885                         # formatting elements and the stack of open elements if the
1886                         # adoption agency algorithm didn't already remove it (it might not
1887                         # have if the element is not in table scope).
1888                         found = false
1889                         for el in afe
1890                                 if el.type is TYPE_AFE_MARKER
1891                                         break
1892                                 if el.name is 'a' and el.namespace is NS_HTML
1893                                         found = el
1894                         if found?
1895                                 parse_error()
1896                                 adoption_agency 'a'
1897                                 for el, i in afe
1898                                         if el is found
1899                                                 afe.splice i, 1
1900                                 for el, i in open_els
1901                                         if el is found
1902                                                 open_els.splice i, 1
1903                         reconstruct_afe()
1904                         el = insert_html_element t
1905                         afe_push el
1906                         return
1907                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1908                         reconstruct_afe()
1909                         el = insert_html_element t
1910                         afe_push el
1911                         return
1912                 if t.type is TYPE_START_TAG and t.name is 'nobr'
1913                         reconstruct_afe()
1914                         el = insert_html_element t
1915                         afe_push el
1916                         return
1917                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1918                         adoption_agency t.name
1919                         return
1920                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1921                         reconstruct_afe()
1922                         insert_html_element t
1923                         afe_push_marker()
1924                         flag_frameset_ok = false
1925                         return
1926                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1927                         unless is_in_scope t.name, NS_HTML
1928                                 parse_error()
1929                                 return
1930                         generate_implied_end_tags()
1931                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1932                                 parse_error()
1933                         loop
1934                                 el = open_els.shift()
1935                                 if el.name is t.name and el.namespace is NS_HTML
1936                                         break
1937                         clear_afe_to_marker()
1938                         return
1939                 if t.type is TYPE_START_TAG and t.name is 'table'
1940                         close_p_if_in_button_scope() # fixfull quirksmode thing
1941                         insert_html_element t
1942                         flag_frameset_ok = false
1943                         ins_mode = ins_mode_in_table
1944                         return
1945                 if t.type is TYPE_END_TAG and t.name is 'br'
1946                         parse_error()
1947                         t.type is TYPE_START_TAG
1948                         # fall through
1949                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1950                         reconstruct_afe()
1951                         insert_html_element t
1952                         open_els.shift()
1953                         t.acknowledge_self_closing()
1954                         flag_frameset_ok = false
1955                         return
1956                 if t.type is TYPE_START_TAG and t.name is 'input'
1957                         reconstruct_afe()
1958                         insert_html_element t
1959                         open_els.shift()
1960                         t.acknowledge_self_closing()
1961                         unless is_input_hidden_tok t
1962                                 flag_frameset_ok = false
1963                         return
1964                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1965                         insert_html_element t
1966                         open_els.shift()
1967                         t.acknowledge_self_closing()
1968                         return
1969                 if t.type is TYPE_START_TAG and t.name is 'hr'
1970                         close_p_if_in_button_scope()
1971                         insert_html_element t
1972                         open_els.shift()
1973                         t.acknowledge_self_closing()
1974                         flag_frameset_ok = false
1975                         return
1976                 if t.type is TYPE_START_TAG and t.name is 'image'
1977                         parse_error()
1978                         t.name = 'img'
1979                         process_token t
1980                         return
1981                 if t.type is TYPE_START_TAG and t.name is 'isindex'
1982                         parse_error()
1983                         if template_tag_is_open() is false and form_element_pointer isnt null
1984                                 return
1985                         t.acknowledge_self_closing()
1986                         flag_frameset_ok = false
1987                         close_p_if_in_button_scope()
1988                         el = insert_html_element new_open_tag 'form'
1989                         unless template_tag_is_open()
1990                                 form_element_pointer = el
1991                         for a in t.attrs_a
1992                                 if a[0] is 'action'
1993                                         el.attrs['action'] = a[1]
1994                                         break
1995                         insert_html_element new_open_tag 'hr'
1996                         open_els.shift()
1997                         reconstruct_afe()
1998                         insert_html_element new_open_tag 'label'
1999                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2000                         input_el = new_open_tag 'input'
2001                         prompt = null
2002                         for a in t.attrs_a
2003                                 if a[0] is 'prompt'
2004                                         prompt = a[1]
2005                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2006                                         input_el.attrs_a.push [a[0], a[1]]
2007                         input_el.attrs_a.push ['name', 'isindex']
2008                         # fixfull this next bit is in english... internationalize?
2009                         prompt ?= "This is a searchable index. Enter search keywords: "
2010                         insert_character new_character_token prompt # fixfull split
2011                         # TODO submit typo "balue" in spec
2012                         insert_html_element input_el
2013                         open_els.shift()
2014                         # insert_character '' # you can put chars here if promt attr missing
2015                         open_els.shift()
2016                         insert_html_element new_open_tag 'hr'
2017                         open_els.shift()
2018                         open_els.shift()
2019                         unless template_tag_is_open()
2020                                 form_element_pointer = null
2021                         return
2022                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2023                         insert_html_element t
2024                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2025                                 cur += 1
2026                         tok_state = tok_state_rcdata
2027                         original_ins_mode = ins_mode
2028                         flag_frameset_ok = false
2029                         ins_mode = ins_mode_text
2030                         return
2031                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2032                         close_p_if_in_button_scope()
2033                         reconstruct_afe()
2034                         flag_frameset_ok = false
2035                         parse_generic_raw_text t
2036                         return
2037                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2038                         flag_frameset_ok = false
2039                         parse_generic_raw_text t
2040                         return
2041                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2042                         parse_generic_raw_text t
2043                         return
2044                 if t.type is TYPE_START_TAG and t.name is 'select'
2045                         reconstruct_afe()
2046                         insert_html_element t
2047                         flag_frameset_ok = false
2048                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2049                                 ins_mode = ins_mode_in_select_in_table
2050                         else
2051                                 ins_mode = ins_mode_in_select
2052                         return
2053                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2054                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2055                                 open_els.shift()
2056                         reconstruct_afe()
2057                         insert_html_element t
2058                         return
2059                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2060                         if is_in_scope 'ruby', NS_HTML
2061                                 generate_implied_end_tags()
2062                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2063                                         parse_error()
2064                         insert_html_element t
2065                         return
2066                 if t.type is TYPE_START_TAG and t.name is 'rt'
2067                         if is_in_scope 'ruby', NS_HTML
2068                                 generate_implied_end_tags 'rtc' # arg is exception
2069                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2070                                         parse_error()
2071                         insert_html_element t
2072                         return
2073                 if t.type is TYPE_START_TAG and t.name is 'math'
2074                         reconstruct_afe()
2075                         adjust_mathml_attributes t
2076                         adjust_foreign_attributes t
2077                         insert_foreign_element t, NS_MATHML
2078                         if t.flag 'self-closing'
2079                                 open_els.shift()
2080                                 t.acknowledge_self_closing()
2081                         return
2082                 if t.type is TYPE_START_TAG and t.name is 'svg'
2083                         reconstruct_afe()
2084                         adjust_svg_attributes t
2085                         adjust_foreign_attributes t
2086                         insert_foreign_element t, NS_SVG
2087                         if t.flag 'self-closing'
2088                                 open_els.shift()
2089                                 t.acknowledge_self_closing()
2090                         return
2091                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2092                         parse_error()
2093                         return
2094                 if t.type is TYPE_START_TAG # any other start tag
2095                         reconstruct_afe()
2096                         insert_html_element t
2097                         return
2098                 if t.type is TYPE_END_TAG # any other end tag
2099                         in_body_any_other_end_tag t.name
2100                         return
2101                 return
2102
2103         ins_mode_in_table_else = (t) ->
2104                 parse_error()
2105                 flag_foster_parenting = true # FIXME
2106                 ins_mode_in_body t
2107                 flag_foster_parenting = false
2108         can_in_table = { # FIXME do this inline like everywhere else
2109                 'table': true
2110                 'tbody': true
2111                 'tfoot': true
2112                 'thead': true
2113                 'tr': true
2114         }
2115
2116         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2117         ins_mode_text = (t) ->
2118                 if t.type is TYPE_TEXT
2119                         insert_character t
2120                         return
2121                 if t.type is TYPE_EOF
2122                         parse_error()
2123                         if open_els[0].name is 'script'
2124                                 open_els[0].flag 'already started', true
2125                         open_els.shift()
2126                         ins_mode = original_ins_mode
2127                         process_token t
2128                         return
2129                 if t.type is TYPE_END_TAG and t.name is 'script'
2130                         open_els.shift()
2131                         ins_mode = original_ins_mode
2132                         # fixfull the spec seems to assume that I'm going to run the script
2133                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2134                         return
2135                 if t.type is TYPE_END_TAG
2136                         open_els.shift()
2137                         ins_mode = original_ins_mode
2138                         return
2139                 console.log 'warning: end of ins_mode_text reached'
2140
2141         # the functions below implement the tokenizer stats described here:
2142         # http://www.w3.org/TR/html5/syntax.html#tokenization
2143
2144         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2145         ins_mode_in_table = (t) ->
2146                 switch t.type
2147                         when TYPE_TEXT
2148                                 if can_in_table[t.name]
2149                                         original_ins_mode = ins_mode
2150                                         ins_mode = ins_mode_in_table_text
2151                                         process_token t
2152                                 else
2153                                         ins_mode_in_table_else t
2154                         when TYPE_COMMENT
2155                                 insert_comment t
2156                         when TYPE_DOCTYPE
2157                                 parse_error()
2158                         when TYPE_START_TAG
2159                                 switch t.name
2160                                         when 'caption'
2161                                                 clear_stack_to_table_context()
2162                                                 afe_push_marker()
2163                                                 insert_html_element t
2164                                                 ins_mode = ins_mode_in_caption
2165                                         when 'colgroup'
2166                                                 clear_stack_to_table_context()
2167                                                 insert_html_element t
2168                                                 ins_mode = ins_mode_in_column_group
2169                                         when 'col'
2170                                                 clear_stack_to_table_context()
2171                                                 insert_html_element new_open_tag 'colgroup'
2172                                                 ins_mode = ins_mode_in_column_group
2173                                                 process_token t
2174                                         when 'tbody', 'tfoot', 'thead'
2175                                                 clear_stack_to_table_context()
2176                                                 insert_html_element t
2177                                                 ins_mode = ins_mode_in_table_body
2178                                         when 'td', 'th', 'tr'
2179                                                 clear_stack_to_table_context()
2180                                                 insert_html_element new_open_tag 'tbody'
2181                                                 ins_mode = ins_mode_in_table_body
2182                                                 process_token t
2183                                         when 'table'
2184                                                 parse_error()
2185                                                 if is_in_table_scope 'table'
2186                                                         loop
2187                                                                 el = open_els.shift()
2188                                                                 if el.name is 'table'
2189                                                                         break
2190                                                         reset_ins_mode()
2191                                                         process_token t
2192                                         when 'style', 'script', 'template'
2193                                                 ins_mode_in_head t
2194                                         when 'input'
2195                                                 if is_input_hidden_tok t
2196                                                         ins_mode_in_table_else t
2197                                                 else
2198                                                         parse_error()
2199                                                         el = insert_html_element t
2200                                                         open_els.shift()
2201                                                         t.acknowledge_self_closing()
2202                                         when 'form'
2203                                                 parse_error()
2204                                                 if form_element_pointer?
2205                                                         return
2206                                                 if template_tag_is_open()
2207                                                         return
2208                                                 form_element_pointer = insert_html_element t
2209                                                 open_els.shift()
2210                                         else
2211                                                 ins_mode_in_table_else t
2212                         when TYPE_END_TAG
2213                                 switch t.name
2214                                         when 'table'
2215                                                 if is_in_table_scope 'table'
2216                                                         loop
2217                                                                 el = open_els.shift()
2218                                                                 if el.name is 'table'
2219                                                                         break
2220                                                         reset_ins_mode()
2221                                                 else
2222                                                         parse_error
2223                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2224                                                 parse_error()
2225                                         when 'template'
2226                                                 ins_mode_in_head t
2227                                         else
2228                                                 ins_mode_in_table_else t
2229                         when TYPE_EOF
2230                                 ins_mode_in_body t
2231                         else
2232                                 ins_mode_in_table_else t
2233
2234
2235         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2236         ins_mode_in_table_text = (t) ->
2237                 if t.type is TYPE_TEXT and t.text is "\u0000"
2238                         # huh? I thought the tokenizer didn't emit these
2239                         parse_error()
2240                         return
2241                 if t.type is TYPE_TEXT
2242                         pending_table_character_tokens.push t
2243                         return
2244                 # Anything else
2245                 all_space = true
2246                 for old in pending_table_character_tokens
2247                         unless is_space_tok old
2248                                 all_space = false
2249                                 break
2250                 if all_space
2251                         for old in pending_table_character_tokens
2252                                 insert_character old
2253                 else
2254                         for old in pending_table_character_tokens
2255                                 ins_mode_table_else old
2256                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2257                 ins_mode = original_ins_mode
2258                 process_token t
2259
2260         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2261         ins_mode_in_caption = (t) ->
2262                 if t.type is TYPE_END_TAG and t.name is 'caption'
2263                         if is_in_table_scope 'caption'
2264                                 generate_implied_end_tags()
2265                                 if open_els[0].name isnt 'caption'
2266                                         parse_error()
2267                                 loop
2268                                         el = open_els.shift()
2269                                         if el.name is 'caption'
2270                                                 break
2271                                 clear_afe_to_marker()
2272                                 ins_mode = ins_mode_in_table
2273                         else
2274                                 parse_error()
2275                                 # fragment case
2276                         return
2277                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2278                         parse_error()
2279                         if is_in_table_scope 'caption'
2280                                 loop
2281                                         el = open_els.shift()
2282                                         if el.name is 'caption'
2283                                                 break
2284                                 clear_afe_to_marker()
2285                                 ins_mode = ins_mode_in_table
2286                                 process_token t
2287                         # else fragment case
2288                         return
2289                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2290                         parse_error()
2291                         return
2292                 # Anything else
2293                 ins_mode_in_body t
2294
2295         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2296         ins_mode_in_column_group = (t) ->
2297                 if is_space_tok t
2298                         insert_character t
2299                         return
2300                 if t.type is TYPE_COMMENT
2301                         insert_comment t
2302                         return
2303                 if t.type is TYPE_DOCTYPE
2304                         parse_error()
2305                         return
2306                 if t.type is TYPE_START_TAG and t.name is 'html'
2307                         ins_mode_in_body t
2308                         return
2309                 if t.type is TYPE_START_TAG and t.name is 'col'
2310                         el = insert_html_element t
2311                         open_els.shift()
2312                         t.acknowledge_self_closing()
2313                         return
2314                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2315                         if open_els[0].name is 'colgroup'
2316                                 open_els.shift()
2317                                 ins_mode = ins_mode_in_table
2318                         else
2319                                 parse_error()
2320                         return
2321                 if t.type is TYPE_END_TAG and t.name is 'col'
2322                         parse_error()
2323                         return
2324                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2325                         ins_mode_in_head t
2326                         return
2327                 if t.type is TYPE_EOF
2328                         ins_mode_in_body t
2329                         return
2330                 # Anything else
2331                 if open_els[0].name isnt 'colgroup'
2332                         parse_error()
2333                         return
2334                 open_els.shift()
2335                 ins_mode = ins_mode_in_table
2336                 process_token t
2337                 return
2338
2339         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2340         ins_mode_in_table_body = (t) ->
2341                 if t.type is TYPE_START_TAG and t.name is 'tr'
2342                         clear_stack_to_table_body_context()
2343                         insert_html_element t
2344                         ins_mode = ins_mode_in_row
2345                         return
2346                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2347                         parse_error()
2348                         clear_stack_to_table_body_context()
2349                         insert_html_element new_open_tag 'tr'
2350                         ins_mode = ins_mode_in_row
2351                         process_token t
2352                         return
2353                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2354                         unless is_in_table_scope t.name # fixfull check namespace
2355                                 parse_error()
2356                                 return
2357                         clear_stack_to_table_body_context()
2358                         open_els.shift()
2359                         ins_mode = ins_mode_in_table
2360                         return
2361                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2362                         has = false
2363                         for el in open_els
2364                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2365                                         has = true
2366                                         break
2367                                 if table_scopers[el.name]
2368                                         break
2369                         if !has
2370                                 parse_error()
2371                                 return
2372                         clear_stack_to_table_body_context()
2373                         open_els.shift()
2374                         ins_mode = ins_mode_in_table
2375                         process_token t
2376                         return
2377                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2378                         parse_error()
2379                         return
2380                 # Anything else
2381                 ins_mode_in_table t
2382
2383         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2384         ins_mode_in_row = (t) ->
2385                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2386                         clear_stack_to_table_row_context()
2387                         insert_html_element t
2388                         ins_mode = ins_mode_in_cell
2389                         afe_push_marker()
2390                         return
2391                 if t.type is TYPE_END_TAG and t.name is 'tr'
2392                         if is_in_table_scope 'tr'
2393                                 clear_stack_to_table_row_context()
2394                                 open_els.shift()
2395                                 ins_mode = ins_mode_in_table_body
2396                         else
2397                                 parse_error()
2398                         return
2399                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2400                         if is_in_table_scope 'tr'
2401                                 clear_stack_to_table_row_context()
2402                                 open_els.shift()
2403                                 ins_mode = ins_mode_in_table_body
2404                                 process_token t
2405                         else
2406                                 parse_error()
2407                         return
2408                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2409                         if is_in_table_scope t.name # fixfull namespace
2410                                 if is_in_table_scope 'tr'
2411                                         clear_stack_to_table_row_context()
2412                                         open_els.shift()
2413                                         ins_mode = ins_mode_in_table_body
2414                                         process_token t
2415                         else
2416                                 parse_error()
2417                         return
2418                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2419                         parse_error()
2420                         return
2421                 # Anything else
2422                 ins_mode_in_table t
2423
2424         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2425         close_the_cell = ->
2426                 generate_implied_end_tags()
2427                 unless open_els[0].name is 'td' or open_els[0] is 'th'
2428                         parse_error()
2429                 loop
2430                         el = open_els.shift()
2431                         if el.name is 'td' or el.name is 'th'
2432                                 break
2433                 clear_afe_to_marker()
2434                 ins_mode = ins_mode_in_row
2435
2436         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2437         ins_mode_in_cell = (t) ->
2438                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2439                         if is_in_table_scope t.name
2440                                 generate_implied_end_tags()
2441                                 if open_els[0].name isnt t.name
2442                                         parse_error
2443                                 loop
2444                                         el = open_els.shift()
2445                                         if el.name is t.name
2446                                                 break
2447                                 clear_afe_to_marker()
2448                                 ins_mode = ins_mode_in_row
2449                         else
2450                                 parse_error()
2451                         return
2452                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2453                         has = false
2454                         for el in open_els
2455                                 if el.name is 'td' or el.name is 'th'
2456                                         has = true
2457                                         break
2458                                 if table_scopers[el.name]
2459                                         break
2460                         if !has
2461                                 parse_error()
2462                                 return
2463                         close_the_cell()
2464                         process_token t
2465                         return
2466                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2467                         parse_error()
2468                         return
2469                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2470                         if is_in_table_scope t.name # fixfull namespace
2471                                 close_the_cell()
2472                                 process_token t
2473                         else
2474                                 parse_error()
2475                         return
2476                 # Anything Else
2477                 ins_mode_in_body t
2478
2479         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2480         ins_mode_in_select = (t) ->
2481                 if t.type is TYPE_TEXT and t.text is "\u0000"
2482                         parse_error()
2483                         return
2484                 if t.type is TYPE_TEXT
2485                         insert_character t
2486                         return
2487                 if t.type is TYPE_COMMENT
2488                         insert_comment t
2489                         return
2490                 if t.type is TYPE_DOCTYPE
2491                         parse_error()
2492                         return
2493                 if t.type is TYPE_START_TAG and t.name is 'html'
2494                         ins_mode_in_body t
2495                         return
2496                 if t.type is TYPE_START_TAG and t.name is 'option'
2497                         if open_els[0].name is 'option'
2498                                 open_els.shift()
2499                         insert_html_element t
2500                         return
2501                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2502                         if open_els[0].name is 'option'
2503                                 open_els.shift()
2504                         if open_els[0].name is 'optgroup'
2505                                 open_els.shift()
2506                         insert_html_element t
2507                         return
2508                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2509                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2510                                 open_els.shift()
2511                         if open_els[0].name is 'optgroup'
2512                                 open_els.shift()
2513                         else
2514                                 parse_error()
2515                         return
2516                 if t.type is TYPE_END_TAG and t.name is 'option'
2517                         if open_els[0].name is 'option'
2518                                 open_els.shift()
2519                         else
2520                                 parse_error()
2521                         return
2522                 if t.type is TYPE_END_TAG and t.name is 'select'
2523                         if is_in_select_scope 'select'
2524                                 loop
2525                                         el = open_els.shift()
2526                                         if el.name is 'select'
2527                                                 break
2528                                 reset_ins_mode()
2529                         else
2530                                 parse_error()
2531                         return
2532                 if t.type is TYPE_START_TAG and t.name is 'select'
2533                         parse_error()
2534                         loop
2535                                 el = open_els.shift()
2536                                 if el.name is 'select'
2537                                         break
2538                         reset_ins_mode()
2539                         # spec says that this is the same as </select> but it doesn't say
2540                         # to check scope first
2541                         return
2542                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2543                         parse_error()
2544                         if is_in_select_scope 'select'
2545                                 return
2546                         loop
2547                                 el = open_els.shift()
2548                                 if el.name is 'select'
2549                                         break
2550                         reset_ins_mode()
2551                         process_token t
2552                         return
2553                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2554                         ins_mode_in_head t
2555                         return
2556                 if t.type is TYPE_EOF
2557                         ins_mode_in_body t
2558                         return
2559                 # Anything else
2560                 parse_error()
2561                 return
2562
2563         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2564         ins_mode_in_select_in_table = (t) ->
2565                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2566                         parse_error()
2567                         loop
2568                                 el = open_els.shift()
2569                                 if el.name is 'select'
2570                                         break
2571                         reset_ins_mode()
2572                         process_token t
2573                         return
2574                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2575                         parse_error()
2576                         unless is_in_table_scope t.name, NS_HTML
2577                                 return
2578                         loop
2579                                 el = open_els.shift()
2580                                 if el.name is 'select'
2581                                         break
2582                         reset_ins_mode()
2583                         process_token t
2584                         return
2585                 # Anything else
2586                 ins_mode_in_select t
2587                 return
2588
2589         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2590         ins_mode_in_template = (t) ->
2591                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2592                         ins_mode_in_body t
2593                         return
2594                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2595                         ins_mode_in_head t
2596                         return
2597                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2598                         template_ins_modes.shift()
2599                         template_ins_modes.unshift ins_mode_in_table
2600                         ins_mode = ins_mode_in_table
2601                         process_token t
2602                         return
2603                 if t.type is TYPE_START_TAG and t.name is 'col'
2604                         template_ins_modes.shift()
2605                         template_ins_modes.unshift ins_mode_in_column_group
2606                         ins_mode = ins_mode_in_column_group
2607                         process_token t
2608                         return
2609                 if t.type is TYPE_START_TAG and t.name is 'tr'
2610                         template_ins_modes.shift()
2611                         template_ins_modes.unshift ins_mode_in_table_body
2612                         ins_mode = ins_mode_in_table_body
2613                         process_token t
2614                         return
2615                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2616                         template_ins_modes.shift()
2617                         template_ins_modes.unshift ins_mode_in_row
2618                         ins_mode = ins_mode_in_row
2619                         process_token t
2620                         return
2621                 if t.type is TYPE_START_TAG
2622                         template_ins_modes.shift()
2623                         template_ins_modes.unshift ins_mode_in_body
2624                         ins_mode = ins_mode_in_body
2625                         process_token t
2626                         return
2627                 if t.type is TYPE_END_TAG
2628                         parse_error()
2629                         return
2630                 if t.type is TYPE_EOF
2631                         unless template_tag_is_open()
2632                                 stop_parsing()
2633                                 return
2634                         parse_error()
2635                         loop
2636                                 el = open_els.shift()
2637                                 if el.name is 'template' # fixfull check namespace
2638                                         break
2639                         clear_afe_to_marker()
2640                         template_ins_modes.shift()
2641                         reset_ins_mode()
2642                         process_token t
2643
2644         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2645         ins_mode_after_body = (t) ->
2646                 if is_space_tok t
2647                         ins_mode_in_body t
2648                         return
2649                 if t.type is TYPE_COMMENT
2650                         insert_comment t, [open_els[0], open_els[0].children.length]
2651                         return
2652                 if t.type is TYPE_DOCTYPE
2653                         parse_error()
2654                         return
2655                 if t.type is TYPE_START_TAG and t.name is 'html'
2656                         ins_mode_in_body t
2657                         return
2658                 if t.type is TYPE_END_TAG and t.name is 'html'
2659                         # fixfull fragment case
2660                         ins_mode = ins_mode_after_after_body
2661                         return
2662                 if t.type is TYPE_EOF
2663                         stop_parsing()
2664                         return
2665                 # Anything ELse
2666                 parse_error()
2667                 ins_mode = ins_mode_in_body
2668                 process_token t
2669
2670         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2671         ins_mode_in_frameset = (t) ->
2672                 if is_space_tok t
2673                         insert_character t
2674                         return
2675                 if t.type is TYPE_COMMENT
2676                         insert_comment t
2677                         return
2678                 if t.type is TYPE_DOCTYPE
2679                         parse_error()
2680                         return
2681                 if t.type is TYPE_START_TAG and t.name is 'html'
2682                         ins_mode_in_body t
2683                         return
2684                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2685                         insert_html_element t
2686                         return
2687                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2688                         # TODO ?correct for: "if the current node is the root html element"
2689                         if open_els.length is 1
2690                                 parse_error()
2691                                 return # fragment case
2692                         open_els.shift()
2693                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2694                                 ins_mode = ins_mode_after_frameset
2695                         return
2696                 if t.type is TYPE_START_TAG and t.name is 'frame'
2697                         insert_html_element t
2698                         open_els.shift()
2699                         t.acknowledge_self_closing()
2700                         return
2701                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2702                         ins_mode_in_head t
2703                         return
2704                 if t.type is TYPE_EOF
2705                         # TODO ?correct for: "if the current node is not the root html element"
2706                         if open_els.length isnt 1
2707                                 parse_error()
2708                         stop_parsing()
2709                         return
2710                 # Anything else
2711                 parse_error()
2712                 return
2713
2714         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2715         ins_mode_after_frameset = (t) ->
2716                 if is_space_tok t
2717                         insert_character t
2718                         return
2719                 if t.type is TYPE_COMMENT
2720                         insert_comment t
2721                         return
2722                 if t.type is TYPE_DOCTYPE
2723                         parse_error()
2724                         return
2725                 if t.type is TYPE_START_TAG and t.name is 'html'
2726                         ins_mode_in_body t
2727                         return
2728                 if t.type is TYPE_END_TAG and t.name is 'html'
2729                         insert_mode = ins_mode_after_after_frameset
2730                         return
2731                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2732                         ins_mode_in_head t
2733                         return
2734                 if t.type is TYPE_EOF
2735                         stop_parsing()
2736                         return
2737                 # Anything else
2738                 parse_error()
2739                 return
2740
2741         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2742         ins_mode_after_after_body = (t) ->
2743                 if t.type is TYPE_COMMENT
2744                         insert_comment t, [doc, doc.children.length]
2745                         return
2746                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2747                         ins_mode_in_body t
2748                         return
2749                 if t.type is TYPE_EOF
2750                         stop_parsing()
2751                         return
2752                 # Anything else
2753                 parse_error()
2754                 ins_mode = ins_mode_in_body
2755                 return
2756
2757         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2758         ins_mode_after_after_frameset = (t) ->
2759                 if t.type is TYPE_COMMENT
2760                         insert_comment t, [doc, doc.children.length]
2761                         return
2762                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2763                         ins_mode_in_body t
2764                         return
2765                 if t.type is TYPE_EOF
2766                         stop_parsing()
2767                         return
2768                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2769                         ins_mode_in_head t
2770                         return
2771                 # Anything else
2772                 parse_error()
2773                 return
2774
2775         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2776         has_color_face_or_size = (t) ->
2777                 for a in t.attrs_a
2778                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2779                                 return true
2780                 return false
2781         in_foreign_content_end_script = ->
2782                 open_els.shift()
2783                 # fixfull
2784                 return
2785         in_foreign_content_other_start = (t) ->
2786                 acn = adjusted_current_node()
2787                 if acn.namespace is NS_MATHML
2788                         adjust_mathml_attributes t
2789                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2790                         t.name = svg_name_fixes[t.name]
2791                 if acn.namespace is NS_SVG
2792                         adjust_svg_attributes t
2793                 adjust_foreign_attributes t
2794                 insert_foreign_element t, acn.namespace
2795                 if t.flag 'self-closing'
2796                         if t.name is 'script'
2797                                 t.acknowledge_self_closing()
2798                                 in_foreign_content_end_script()
2799                         else
2800                                 open_els.shift()
2801                                 t.acknowledge_self_closing()
2802                 return
2803         in_foreign_content = (t) ->
2804                 if t.type is TYPE_TEXT and t.text is "\u0000"
2805                         parse_error()
2806                         insert_character new_character_token "\ufffd"
2807                         return
2808                 if is_space_tok t
2809                         insert_character t
2810                         return
2811                 if t.type is TYPE_TEXT
2812                         flag_frameset_ok = false
2813                         insert_character t
2814                         return
2815                 if t.type is TYPE_COMMENT
2816                         insert_comment t
2817                         return
2818                 if t.type is TYPE_DOCTYPE
2819                         parse_error()
2820                         return
2821                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2822                         parse_error()
2823                         if flag_fragment_parsing
2824                                 in_foreign_content_other_start t
2825                                 return
2826                         loop # is this safe?
2827                                 open_els.shift()
2828                                 cn = open_els[0]
2829                                 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2830                                         break
2831                         process_token t
2832                         return
2833                 if t.type is TYPE_START_TAG
2834                         in_foreign_content_other_start t
2835                         return
2836                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2837                         in_foreign_content_end_script()
2838                         return
2839                 if t.type is TYPE_END_TAG
2840                         if open_els[0].name.toLowerCase() isnt t.name
2841                                 parse_error()
2842                         for node in open_els
2843                                 if node is open_els[open_els.length - 1]
2844                                         return
2845                                 if node.name.toLowerCase() is t.name
2846                                         loop
2847                                                 el = open_els.shift()
2848                                                 if el is node
2849                                                         return
2850                                 if node.namespace is NS_HTML
2851                                         break
2852                         ins_mode t # explicitly call HTML insertion mode
2853
2854
2855         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2856         tok_state_data = ->
2857                 switch c = txt.charAt(cur++)
2858                         when '&'
2859                                 return new_text_node parse_character_reference()
2860                         when '<'
2861                                 tok_state = tok_state_tag_open
2862                         when "\u0000"
2863                                 parse_error()
2864                                 return new_text_node c
2865                         when '' # EOF
2866                                 return new_eof_token()
2867                         else
2868                                 return new_text_node c
2869                 return null
2870
2871         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2872         # not needed: tok_state_character_reference_in_data = ->
2873         # just call parse_character_reference()
2874
2875         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2876         tok_state_rcdata = ->
2877                 switch c = txt.charAt(cur++)
2878                         when '&'
2879                                 return new_text_node parse_character_reference()
2880                         when '<'
2881                                 tok_state = tok_state_rcdata_less_than_sign
2882                         when "\u0000"
2883                                 parse_error()
2884                                 return new_character_token "\ufffd"
2885                         when '' # EOF
2886                                 return new_eof_token()
2887                         else
2888                                 return new_character_token c
2889                 return null
2890
2891         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2892         # not needed: tok_state_character_reference_in_rcdata = ->
2893         # just call parse_character_reference()
2894
2895         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2896         tok_state_rawtext = ->
2897                 switch c = txt.charAt(cur++)
2898                         when '<'
2899                                 tok_state = tok_state_rawtext_less_than_sign
2900                         when "\u0000"
2901                                 parse_error()
2902                                 return new_character_token "\ufffd"
2903                         when '' # EOF
2904                                 return new_eof_token()
2905                         else
2906                                 return new_character_token c
2907                 return null
2908
2909         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2910         tok_state_script_data = ->
2911                 switch c = txt.charAt(cur++)
2912                         when '<'
2913                                 tok_state = tok_state_script_data_less_than_sign
2914                         when "\u0000"
2915                                 parse_error()
2916                                 return new_character_token "\ufffd"
2917                         when '' # EOF
2918                                 return new_eof_token()
2919                         else
2920                                 return new_character_token c
2921                 return null
2922
2923         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2924         tok_state_plaintext = ->
2925                 switch c = txt.charAt(cur++)
2926                         when "\u0000"
2927                                 parse_error()
2928                                 return new_character_token "\ufffd"
2929                         when '' # EOF
2930                                 return new_eof_token()
2931                         else
2932                                 return new_character_token c
2933                 return null
2934
2935
2936         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2937         tok_state_tag_open = ->
2938                 switch c = txt.charAt(cur++)
2939                         when '!'
2940                                 tok_state = tok_state_markup_declaration_open
2941                         when '/'
2942                                 tok_state = tok_state_end_tag_open
2943                         when '?'
2944                                 parse_error()
2945                                 tok_cur_tag = new_comment_token '?'
2946                                 tok_state = tok_state_bogus_comment
2947                         else
2948                                 if is_lc_alpha(c)
2949                                         tok_cur_tag = new_open_tag c
2950                                         tok_state = tok_state_tag_name
2951                                 else if is_uc_alpha(c)
2952                                         tok_cur_tag = new_open_tag c.toLowerCase()
2953                                         tok_state = tok_state_tag_name
2954                                 else
2955                                         parse_error()
2956                                         tok_state = tok_state_data
2957                                         cur -= 1 # we didn't parse/handle the char after <
2958                                         return new_text_node '<'
2959                 return null
2960
2961         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2962         tok_state_end_tag_open = ->
2963                 switch c = txt.charAt(cur++)
2964                         when '>'
2965                                 parse_error()
2966                                 tok_state = tok_state_data
2967                         when '' # EOF
2968                                 parse_error()
2969                                 tok_state = tok_state_data
2970                                 return new_text_node '</'
2971                         else
2972                                 if is_uc_alpha(c)
2973                                         tok_cur_tag = new_end_tag c.toLowerCase()
2974                                         tok_state = tok_state_tag_name
2975                                 else if is_lc_alpha(c)
2976                                         tok_cur_tag = new_end_tag c
2977                                         tok_state = tok_state_tag_name
2978                                 else
2979                                         parse_error()
2980                                         tok_cur_tag = new_comment_token '/'
2981                                         tok_state = tok_state_bogus_comment
2982                 return null
2983
2984         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2985         tok_state_tag_name = ->
2986                 switch c = txt.charAt(cur++)
2987                         when "\t", "\n", "\u000c", ' '
2988                                 tok_state = tok_state_before_attribute_name
2989                         when '/'
2990                                 tok_state = tok_state_self_closing_start_tag
2991                         when '>'
2992                                 tok_state = tok_state_data
2993                                 tmp = tok_cur_tag
2994                                 tok_cur_tag = null
2995                                 return tmp
2996                         when "\u0000"
2997                                 parse_error()
2998                                 tok_cur_tag.name += "\ufffd"
2999                         when '' # EOF
3000                                 parse_error()
3001                                 tok_state = tok_state_data
3002                         else
3003                                 if is_uc_alpha(c)
3004                                         tok_cur_tag.name += c.toLowerCase()
3005                                 else
3006                                         tok_cur_tag.name += c
3007                 return null
3008
3009         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3010         tok_state_rcdata_less_than_sign = ->
3011                 c = txt.charAt(cur++)
3012                 if c is '/'
3013                         temporary_buffer = ''
3014                         tok_state = tok_state_rcdata_end_tag_open
3015                         return null
3016                 # Anything else
3017                 tok_state = tok_state_rcdata
3018                 cur -= 1 # reconsume the input character
3019                 return new_character_token '<'
3020
3021         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3022         tok_state_rcdata_end_tag_open = ->
3023                 c = txt.charAt(cur++)
3024                 if is_uc_alpha(c)
3025                         tok_cur_tag = new_end_tag c.toLowerCase()
3026                         temporary_buffer += c
3027                         tok_state = tok_state_rcdata_end_tag_name
3028                         return null
3029                 if is_lc_alpha(c)
3030                         tok_cur_tag = new_end_tag c
3031                         temporary_buffer += c
3032                         tok_state = tok_state_rcdata_end_tag_name
3033                         return null
3034                 # Anything else
3035                 tok_state = tok_state_rcdata
3036                 cur -= 1 # reconsume the input character
3037                 return new_character_token "</" # fixfull separate these
3038
3039         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3040         is_appropriate_end_tag = (t) ->
3041                 # spec says to check against "the tag name of the last start tag to
3042                 # have been emitted from this tokenizer", but this is only called from
3043                 # the various "raw" states, which I'm pretty sure all push the start
3044                 # token onto open_els. TODO: verify this after the script data states
3045                 # are implemented
3046                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3047                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3048
3049         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3050         tok_state_rcdata_end_tag_name = ->
3051                 c = txt.charAt(cur++)
3052                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3053                         if is_appropriate_end_tag tok_cur_tag
3054                                 tok_state = tok_state_before_attribute_name
3055                                 return
3056                         # else fall through to "Anything else"
3057                 if c is '/'
3058                         if is_appropriate_end_tag tok_cur_tag
3059                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3060                                 return
3061                         # else fall through to "Anything else"
3062                 if c is '>'
3063                         if is_appropriate_end_tag tok_cur_tag
3064                                 tok_state = tok_state_data
3065                                 return tok_cur_tag
3066                         # else fall through to "Anything else"
3067                 if is_uc_alpha(c)
3068                         tok_cur_tag.name += c.toLowerCase()
3069                         temporary_buffer += c
3070                         return null
3071                 if is_lc_alpha(c)
3072                         tok_cur_tag.name += c
3073                         temporary_buffer += c
3074                         return null
3075                 # Anything else
3076                 tok_state = tok_state_rcdata
3077                 cur -= 1 # reconsume the input character
3078                 return new_character_token '</' + temporary_buffer # fixfull separate these
3079
3080         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3081         tok_state_rawtext_less_than_sign = ->
3082                 c = txt.charAt(cur++)
3083                 if c is '/'
3084                         temporary_buffer = ''
3085                         tok_state = tok_state_rawtext_end_tag_open
3086                         return null
3087                 # Anything else
3088                 tok_state = tok_state_rawtext
3089                 cur -= 1 # reconsume the input character
3090                 return new_character_token '<'
3091
3092         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3093         tok_state_rawtext_end_tag_open = ->
3094                 c = txt.charAt(cur++)
3095                 if is_uc_alpha(c)
3096                         tok_cur_tag = new_end_tag c.toLowerCase()
3097                         temporary_buffer += c
3098                         tok_state = tok_state_rawtext_end_tag_name
3099                         return null
3100                 if is_lc_alpha(c)
3101                         tok_cur_tag = new_end_tag c
3102                         temporary_buffer += c
3103                         tok_state = tok_state_rawtext_end_tag_name
3104                         return null
3105                 # Anything else
3106                 tok_state = tok_state_rawtext
3107                 cur -= 1 # reconsume the input character
3108                 return new_character_token "</" # fixfull separate these
3109
3110         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3111         tok_state_rawtext_end_tag_name = ->
3112                 c = txt.charAt(cur++)
3113                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3114                         if is_appropriate_end_tag tok_cur_tag
3115                                 tok_state = tok_state_before_attribute_name
3116                                 return
3117                         # else fall through to "Anything else"
3118                 if c is '/'
3119                         if is_appropriate_end_tag tok_cur_tag
3120                                 tok_state = tok_state_self_closing_start_tag
3121                                 return
3122                         # else fall through to "Anything else"
3123                 if c is '>'
3124                         if is_appropriate_end_tag tok_cur_tag
3125                                 tok_state = tok_state_data
3126                                 return tok_cur_tag
3127                         # else fall through to "Anything else"
3128                 if is_uc_alpha(c)
3129                         tok_cur_tag.name += c.toLowerCase()
3130                         temporary_buffer += c
3131                         return null
3132                 if is_lc_alpha(c)
3133                         tok_cur_tag.name += c
3134                         temporary_buffer += c
3135                         return null
3136                 # Anything else
3137                 tok_state = tok_state_rawtext
3138                 cur -= 1 # reconsume the input character
3139                 return new_character_token '</' + temporary_buffer # fixfull separate these
3140
3141         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3142         tok_state_script_data_less_than_sign = ->
3143                 c = txt.charAt(cur++)
3144                 if c is '/'
3145                         temporary_buffer = ''
3146                         tok_state = tok_state_script_data_end_tag_open
3147                         return
3148                 if c is '!'
3149                         tok_state = tok_state_script_data_escape_start
3150                         return new_character_token '<!' # fixfull split
3151                 # Anything else
3152                 tok_state = tok_state_script_data
3153                 cur -= 1 # Reconsume
3154                 return new_character_token '<'
3155
3156         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3157         tok_state_script_data_end_tag_open = ->
3158                 c = txt.charAt(cur++)
3159                 if is_uc_alpha(c)
3160                         tok_cur_tag = new_end_tag c.toLowerCase()
3161                         temporary_buffer += c
3162                         tok_state = tok_state_script_data_end_tag_name
3163                         return
3164                 if is_lc_alpha(c)
3165                         tok_cur_tag = new_end_tag c
3166                         temporary_buffer += c
3167                         tok_state = tok_state_script_data_end_tag_name
3168                         return
3169                 # Anything else
3170                 tok_state = tok_state_script_data
3171                 cur -= 1 # Reconsume
3172                 return new_character_token '</'
3173
3174         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3175         tok_state_script_data_end_tag_name = ->
3176                 c = txt.charAt(cur++)
3177                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3178                         if is_appropriate_end_tag tok_cur_tag
3179                                 tok_state = tok_state_before_attribute_name
3180                                 return
3181                         # fall through
3182                 if c is '/'
3183                         if is_appropriate_end_tag tok_cur_tag
3184                                 tok_state = tok_state_self_closing_start_tag
3185                                 return
3186                         # fall through
3187                 if is_uc_alpha(c)
3188                         tok_cur_tag.name += c.toLowerCase()
3189                         temporary_buffer += c
3190                         return
3191                 if is_lc_alpha(c)
3192                         tok_cur_tag.name += c
3193                         temporary_buffer += c
3194                         return
3195                 # Anything else
3196                 tok_state = tok_state_script_data
3197                 cur -= 1 # Reconsume
3198                 return new_character_token "</#{temporary_buffer}" # fixfull split
3199
3200         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3201         tok_state_script_data_escape_start = ->
3202                 c = txt.charAt(cur++)
3203                 if c is '-'
3204                         tok_state = tok_state_script_data_escape_start_dash
3205                         return new_character_token '-'
3206                 # Anything else
3207                 tok_state = tok_state_script_data
3208                 cur -= 1 # Reconsume
3209                 return
3210
3211         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3212         tok_state_script_data_escape_start_dash = ->
3213                 c = txt.charAt(cur++)
3214                 if c is '-'
3215                         tok_state = tok_state_script_data_escaped_dash_dash
3216                         return new_character_token '-'
3217                 # Anything else
3218                 tok_state = tok_state_script_data
3219                 cur -= 1 # Reconsume
3220                 return
3221
3222         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3223         tok_state_script_data_escaped = ->
3224                 c = txt.charAt(cur++)
3225                 if c is '-'
3226                         tok_state = tok_state_script_data_escaped_dash
3227                         return new_character_token '-'
3228                 if c is '<'
3229                         tok_state = tok_state_script_data_escaped_less_than_sign
3230                         return
3231                 if c is "\u0000"
3232                         parse_error()
3233                         return new_character_token "\ufffd"
3234                 if c is '' # EOF
3235                         tok_state = tok_state_data
3236                         parse_error()
3237                         cur -= 1 # Reconsume
3238                         return
3239                 # Anything else
3240                 return new_character_token c
3241
3242         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3243         tok_state_script_data_escaped_dash = ->
3244                 c = txt.charAt(cur++)
3245                 if c is '-'
3246                         tok_state = tok_state_script_data_escaped_dash_dash
3247                         return new_character_token '-'
3248                 if c is '<'
3249                         tok_state = tok_state_script_data_escaped_less_than_sign
3250                         return
3251                 if c is "\u0000"
3252                         parse_error()
3253                         tok_state = tok_state_script_data_escaped
3254                         return new_character_token "\ufffd"
3255                 if c is '' # EOF
3256                         tok_state = tok_state_data
3257                         parse_error()
3258                         cur -= 1 # Reconsume
3259                         return
3260                 # Anything else
3261                 tok_state = tok_state_script_data_escaped
3262                 return new_character_token c
3263
3264         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3265         tok_state_script_data_escaped_dash_dash = ->
3266                 c = txt.charAt(cur++)
3267                 if c is '-'
3268                         return new_character_token '-'
3269                 if c is '<'
3270                         tok_state = tok_state_script_data_escaped_less_than_sign
3271                         return
3272                 if c is '>'
3273                         tok_state = tok_state_script_data
3274                         return new_character_token '>'
3275                 if c is "\u0000"
3276                         parse_error()
3277                         tok_state = tok_state_script_data_escaped
3278                         return new_character_token "\ufffd"
3279                 if c is '' # EOF
3280                         parse_error()
3281                         tok_state = tok_state_data
3282                         cur -= 1 # Reconsume
3283                         return
3284                 # Anything else
3285                 tok_state = tok_state_script_data_escaped
3286                 return new_character_token c
3287
3288         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3289         tok_state_script_data_escaped_less_than_sign = ->
3290                 c = txt.charAt(cur++)
3291                 if c is '/'
3292                         temporary_buffer = ''
3293                         tok_state = tok_state_script_data_escaped_end_tag_open
3294                         return
3295                 if is_uc_alpha(c)
3296                         temporary_buffer = c.toLowerCase() # yes, really
3297                         tok_state = tok_state_script_data_double_escape_start
3298                         return new_character_token "<#{c}" # fixfull split
3299                 if is_lc_alpha(c)
3300                         temporary_buffer = c
3301                         tok_state = tok_state_script_data_double_escape_start
3302                         return new_character_token "<#{c}" # fixfull split
3303                 # Anything else
3304                 tok_state = tok_state_script_data_escaped
3305                 cur -= 1 # Reconsume
3306                 return new_character_token c
3307
3308         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3309         tok_state_script_data_escaped_end_tag_open = ->
3310                 c = txt.charAt(cur++)
3311                 if is_uc_alpha(c)
3312                         tok_cur_tag = new_end_tag c.toLowerCase()
3313                         temporary_buffer += c
3314                         tok_state = tok_state_script_data_escaped_end_tag_name
3315                         return
3316                 if is_lc_alpha(c)
3317                         tok_cur_tag = new_end_tag c
3318                         temporary_buffer += c
3319                         tok_state = tok_state_script_data_escaped_end_tag_name
3320                         return
3321                 # Anything else
3322                 tok_state = tok_state_script_data_escaped
3323                 cur -= 1 # Reconsume
3324                 return new_character_token '</' # fixfull split
3325
3326         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3327         tok_state_script_data_escaped_end_tag_name = ->
3328                 c = txt.charAt(cur++)
3329                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3330                         if is_appropriate_end_tag tok_cur_tag
3331                                 tok_state = tok_state_before_attribute_name
3332                                 return
3333                         # fall through
3334                 if c is '/'
3335                         if is_appropriate_end_tag tok_cur_tag
3336                                 tok_state = tok_state_self_closing_start_tag
3337                                 return
3338                         # fall through
3339                 if is_uc_alpha(c)
3340                         tok_cur_tag.name += c.toLowerCase()
3341                         temporary_buffer += c.toLowerCase()
3342                         return
3343                 if is_lc_alpha(c)
3344                         tok_cur_tag.name += c
3345                         temporary_buffer += c.toLowerCase()
3346                         return
3347                 # Anything else
3348                 tok_state = tok_state_script_data_escaped
3349                 cur -= 1 # Reconsume
3350                 return new_character_token "</#{temporary_buffer}" # fixfull split
3351
3352         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3353         tok_state_script_data_double_escape_start = ->
3354                 c = txt.charAt(cur++)
3355                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3356                         if temporary_buffer is 'script'
3357                                 tok_state = tok_state_script_data_double_escaped
3358                         else
3359                                 tok_state = tok_state_script_data_escaped
3360                         return new_character_token c
3361                 if is_uc_alpha(c)
3362                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3363                         return new_character_token c
3364                 if is_lc_alpha(c)
3365                         temporary_buffer += c
3366                         return new_character_token c
3367                 # Anything else
3368                 tok_state = tok_state_script_data_escaped
3369                 cur -= 1 # Reconsume
3370                 return
3371
3372         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3373         tok_state_script_data_double_escaped = ->
3374                 c = txt.charAt(cur++)
3375                 if c is '-'
3376                         tok_state = tok_state_script_data_double_escaped_dash
3377                         return new_character_token '-'
3378                 if c is '<'
3379                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3380                         return new_character_token '<'
3381                 if c is "\u0000"
3382                         parse_error()
3383                         return new_character_token "\ufffd"
3384                 if c is '' # EOF
3385                         parse_error()
3386                         tok_state = tok_state_data
3387                         cur -= 1 # Reconsume
3388                         return
3389                 # Anything else
3390                 return new_character_token c
3391
3392         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3393         tok_state_script_data_double_escaped_dash = ->
3394                 c = txt.charAt(cur++)
3395                 if c is '-'
3396                         tok_state = tok_state_script_data_double_escaped_dash_dash
3397                         return new_character_token '-'
3398                 if c is '<'
3399                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3400                         return new_character_token '<'
3401                 if c is "\u0000"
3402                         parse_error()
3403                         tok_state = tok_state_script_data_double_escaped
3404                         return new_character_token "\ufffd"
3405                 if c is '' # EOF
3406                         parse_error()
3407                         tok_state = tok_state_data
3408                         cur -= 1 # Reconsume
3409                         return
3410                 # Anything else
3411                 tok_state = tok_state_script_data_double_escaped
3412                 return new_character_token c
3413
3414         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3415         tok_state_script_data_double_escaped_dash_dash = ->
3416                 c = txt.charAt(cur++)
3417                 if c is '-'
3418                         return new_character_token '-'
3419                 if c is '<'
3420                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3421                         return new_character_token '<'
3422                 if c is '>'
3423                         tok_state = tok_state_script_data
3424                         return new_character_token '>'
3425                 if c is "\u0000"
3426                         parse_error()
3427                         tok_state = tok_state_script_data_double_escaped
3428                         return new_character_token "\ufffd"
3429                 if c is '' # EOF
3430                         parse_error()
3431                         tok_state = tok_state_data
3432                         cur -= 1 # Reconsume
3433                         return
3434                 # Anything else
3435                 tok_state = tok_state_script_data_double_escaped
3436                 return new_character_token c
3437
3438         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3439         tok_state_script_data_double_escaped_less_than_sign = ->
3440                 c = txt.charAt(cur++)
3441                 if c is '/'
3442                         temporary_buffer = ''
3443                         tok_state = tok_state_script_data_double_escape_end
3444                         return new_character_token '/'
3445                 # Anything else
3446                 tok_state = tok_state_script_data_double_escaped
3447                 cur -= 1 # Reconsume
3448                 return
3449
3450         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3451         tok_state_script_data_double_escape_end = ->
3452                 c = txt.charAt(cur++)
3453                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3454                         if temporary_buffer is 'script'
3455                                 tok_state = tok_state_script_data_escaped
3456                         else
3457                                 tok_state = tok_state_script_data_double_escaped
3458                         return new_character_token c
3459                 if is_uc_alpha(c)
3460                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3461                         return new_character_token c
3462                 if is_lc_alpha(c)
3463                         temporary_buffer += c
3464                         return new_character_token c
3465                 # Anything else
3466                 tok_state = tok_state_script_data_double_escaped
3467                 cur -= 1 # Reconsume
3468                 return
3469
3470         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3471         tok_state_before_attribute_name = ->
3472                 attr_name = null
3473                 switch c = txt.charAt(cur++)
3474                         when "\t", "\n", "\u000c", ' '
3475                                 return null
3476                         when '/'
3477                                 tok_state = tok_state_self_closing_start_tag
3478                                 return null
3479                         when '>'
3480                                 tok_state = tok_state_data
3481                                 tmp = tok_cur_tag
3482                                 tok_cur_tag = null
3483                                 return tmp
3484                         when "\u0000"
3485                                 parse_error()
3486                                 attr_name = "\ufffd"
3487                         when '"', "'", '<', '='
3488                                 parse_error()
3489                                 attr_name = c
3490                         when '' # EOF
3491                                 parse_error()
3492                                 tok_state = tok_state_data
3493                         else
3494                                 if is_uc_alpha(c)
3495                                         attr_name = c.toLowerCase()
3496                                 else
3497                                         attr_name = c
3498                 if attr_name?
3499                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3500                         tok_state = tok_state_attribute_name
3501                 return null
3502
3503         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3504         tok_state_attribute_name = ->
3505                 switch c = txt.charAt(cur++)
3506                         when "\t", "\n", "\u000c", ' '
3507                                 tok_state = tok_state_after_attribute_name
3508                         when '/'
3509                                 tok_state = tok_state_self_closing_start_tag
3510                         when '='
3511                                 tok_state = tok_state_before_attribute_value
3512                         when '>'
3513                                 tok_state = tok_state_data
3514                                 tmp = tok_cur_tag
3515                                 tok_cur_tag = null
3516                                 return tmp
3517                         when "\u0000"
3518                                 parse_error()
3519                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
3520                         when '"', "'", '<'
3521                                 parse_error()
3522                                 tok_cur_tag.attrs_a[0][0] = c
3523                         when '' # EOF
3524                                 parse_error()
3525                                 tok_state = tok_state_data
3526                         else
3527                                 if is_uc_alpha(c)
3528                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
3529                                 else
3530                                         tok_cur_tag.attrs_a[0][0] += c
3531                 return null
3532
3533         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3534         tok_state_after_attribute_name = ->
3535                 c = txt.charAt(cur++)
3536                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3537                         return
3538                 if c is '/'
3539                         tok_state = tok_state_self_closing_start_tag
3540                         return
3541                 if c is '='
3542                         tok_state = tok_state_before_attribute_value
3543                         return
3544                 if c is '>'
3545                         tok_state = tok_state_data
3546                         return
3547                 if is_uc_alpha(c)
3548                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3549                         tok_state = tok_state_attribute_name
3550                         return
3551                 if c is "\u0000"
3552                         parse_error()
3553                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3554                         tok_state = tok_state_attribute_name
3555                         return
3556                 if c is '' # EOF
3557                         parse_error()
3558                         tok_state = tok_state_data
3559                         cur -= 1 # reconsume
3560                         return
3561                 if c is '"' or c is "'" or c is '<'
3562                         parse_error()
3563                         # fall through to Anything else
3564                 # Anything else
3565                 tok_cur_tag.attrs_a.unshift [c, '']
3566                 tok_state = tok_state_attribute_name
3567
3568         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3569         tok_state_before_attribute_value = ->
3570                 switch c = txt.charAt(cur++)
3571                         when "\t", "\n", "\u000c", ' '
3572                                 return null
3573                         when '"'
3574                                 tok_state = tok_state_attribute_value_double_quoted
3575                         when '&'
3576                                 tok_state = tok_state_attribute_value_unquoted
3577                                 cur -= 1
3578                         when "'"
3579                                 tok_state = tok_state_attribute_value_single_quoted
3580                         when "\u0000"
3581                                 # Parse error
3582                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3583                                 tok_state = tok_state_attribute_value_unquoted
3584                         when '>'
3585                                 # Parse error
3586                                 tok_state = tok_state_data
3587                                 tmp = tok_cur_tag
3588                                 tok_cur_tag = null
3589                                 return tmp
3590                         when '' # EOF
3591                                 parse_error()
3592                                 tok_state = tok_state_data
3593                         else
3594                                 tok_cur_tag.attrs_a[0][1] += c
3595                                 tok_state = tok_state_attribute_value_unquoted
3596                 return null
3597
3598         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3599         tok_state_attribute_value_double_quoted = ->
3600                 switch c = txt.charAt(cur++)
3601                         when '"'
3602                                 tok_state = tok_state_after_attribute_value_quoted
3603                         when '&'
3604                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3605                         when "\u0000"
3606                                 # Parse error
3607                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3608                         when '' # EOF
3609                                 parse_error()
3610                                 tok_state = tok_state_data
3611                         else
3612                                 tok_cur_tag.attrs_a[0][1] += c
3613                 return null
3614
3615         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3616         tok_state_attribute_value_single_quoted = ->
3617                 switch c = txt.charAt(cur++)
3618                         when "'"
3619                                 tok_state = tok_state_after_attribute_value_quoted
3620                         when '&'
3621                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3622                         when "\u0000"
3623                                 # Parse error
3624                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3625                         when '' # EOF
3626                                 parse_error()
3627                                 tok_state = tok_state_data
3628                         else
3629                                 tok_cur_tag.attrs_a[0][1] += c
3630                 return null
3631
3632         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3633         tok_state_attribute_value_unquoted = ->
3634                 switch c = txt.charAt(cur++)
3635                         when "\t", "\n", "\u000c", ' '
3636                                 tok_state = tok_state_before_attribute_name
3637                         when '&'
3638                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3639                         when '>'
3640                                 tok_state = tok_state_data
3641                                 tmp = tok_cur_tag
3642                                 tok_cur_tag = null
3643                                 return tmp
3644                         when "\u0000"
3645                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3646                         when '' # EOF
3647                                 parse_error()
3648                                 tok_state = tok_state_data
3649                         else
3650                                 # Parse Error if ', <, = or ` (backtick)
3651                                 tok_cur_tag.attrs_a[0][1] += c
3652                 return null
3653
3654         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3655         tok_state_after_attribute_value_quoted = ->
3656                 switch c = txt.charAt(cur++)
3657                         when "\t", "\n", "\u000c", ' '
3658                                 tok_state = tok_state_before_attribute_name
3659                         when '/'
3660                                 tok_state = tok_state_self_closing_start_tag
3661                         when '>'
3662                                 tok_state = tok_state_data
3663                                 tmp = tok_cur_tag
3664                                 tok_cur_tag = null
3665                                 return tmp
3666                         when '' # EOF
3667                                 parse_error()
3668                                 tok_state = tok_state_data
3669                         else
3670                                 # Parse Error
3671                                 tok_state = tok_state_before_attribute_name
3672                                 cur -= 1 # we didn't handle that char
3673                 return null
3674
3675         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3676         tok_state_self_closing_start_tag = ->
3677                 c = txt.charAt(cur++)
3678                 if c is '>'
3679                         tok_cur_tag.flag 'self-closing'
3680                         tok_state = tok_state_data
3681                         return tok_cur_tag
3682                 if c is ''
3683                         parse_error()
3684                         tok_state = tok_state_data
3685                         cur -= 1 # Reconsume
3686                         return
3687                 # Anything else
3688                 parse_error()
3689                 tok_state = tok_state_before_attribute_name
3690                 cur -= 1 # Reconsume
3691                 return
3692
3693         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3694         # WARNING: put a comment token in tok_cur_tag before setting this state
3695         tok_state_bogus_comment = ->
3696                 next_gt = txt.indexOf '>', cur
3697                 if next_gt is -1
3698                         val = txt.substr cur
3699                         cur = txt.length
3700                 else
3701                         val = txt.substr cur, (next_gt - cur)
3702                         cur = next_gt + 1
3703                 val = val.replace "\u0000", "\ufffd"
3704                 tok_cur_tag.text += val
3705                 tok_state = tok_state_data
3706                 return tok_cur_tag
3707
3708         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3709         tok_state_markup_declaration_open = ->
3710                 if txt.substr(cur, 2) is '--'
3711                         cur += 2
3712                         tok_cur_tag = new_comment_token ''
3713                         tok_state = tok_state_comment_start
3714                         return
3715                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3716                         cur += 7
3717                         tok_state = tok_state_doctype
3718                         return
3719                 acn = adjusted_current_node()
3720                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3721                         cur += 7
3722                         tok_state = tok_state_cdata_section
3723                         return
3724                 # Otherwise
3725                 parse_error()
3726                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3727                 tok_state = tok_state_bogus_comment
3728                 return
3729
3730         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3731         tok_state_comment_start = ->
3732                 switch c = txt.charAt(cur++)
3733                         when '-'
3734                                 tok_state = tok_state_comment_start_dash
3735                         when "\u0000"
3736                                 parse_error()
3737                                 return new_character_token "\ufffd"
3738                         when '>'
3739                                 parse_error()
3740                                 tok_state = tok_state_data
3741                                 return tok_cur_tag
3742                         when '' # EOF
3743                                 parse_error()
3744                                 tok_state = tok_state_data
3745                                 cur -= 1 # Reconsume
3746                                 return tok_cur_tag
3747                         else
3748                                 tok_cur_tag.text += c
3749                 return null
3750
3751         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3752         tok_state_comment_start_dash = ->
3753                 switch c = txt.charAt(cur++)
3754                         when '-'
3755                                 tok_state = tok_state_comment_end
3756                         when "\u0000"
3757                                 parse_error()
3758                                 tok_cur_tag.text += "-\ufffd"
3759                                 tok_state = tok_state_comment
3760                         when '>'
3761                                 parse_error()
3762                                 tok_state = tok_state_data
3763                                 return tok_cur_tag
3764                         when '' # EOF
3765                                 parse_error()
3766                                 tok_state = tok_state_data
3767                                 cur -= 1 # Reconsume
3768                                 return tok_cur_tag
3769                         else
3770                                 tok_cur_tag.text += "-#{c}"
3771                                 tok_state = tok_state_comment
3772                 return null
3773
3774         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3775         tok_state_comment = ->
3776                 switch c = txt.charAt(cur++)
3777                         when '-'
3778                                 tok_state = tok_state_comment_end_dash
3779                         when "\u0000"
3780                                 parse_error()
3781                                 tok_cur_tag.text += "\ufffd"
3782                         when '' # EOF
3783                                 parse_error()
3784                                 tok_state = tok_state_data
3785                                 cur -= 1 # Reconsume
3786                                 return tok_cur_tag
3787                         else
3788                                 tok_cur_tag.text += c
3789                 return null
3790
3791         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3792         tok_state_comment_end_dash = ->
3793                 switch c = txt.charAt(cur++)
3794                         when '-'
3795                                 tok_state = tok_state_comment_end
3796                         when "\u0000"
3797                                 parse_error()
3798                                 tok_cur_tag.text += "-\ufffd"
3799                                 tok_state = tok_state_comment
3800                         when '' # EOF
3801                                 parse_error()
3802                                 tok_state = tok_state_data
3803                                 cur -= 1 # Reconsume
3804                                 return tok_cur_tag
3805                         else
3806                                 tok_cur_tag.text += "-#{c}"
3807                                 tok_state = tok_state_comment
3808                 return null
3809
3810         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3811         tok_state_comment_end = ->
3812                 switch c = txt.charAt(cur++)
3813                         when '>'
3814                                 tok_state = tok_state_data
3815                                 return tok_cur_tag
3816                         when "\u0000"
3817                                 parse_error()
3818                                 tok_cur_tag.text += "--\ufffd"
3819                                 tok_state = tok_state_comment
3820                         when '!'
3821                                 parse_error()
3822                                 tok_state = tok_state_comment_end_bang
3823                         when '-'
3824                                 parse_error()
3825                                 tok_cur_tag.text += '-'
3826                         when '' # EOF
3827                                 parse_error()
3828                                 tok_state = tok_state_data
3829                                 cur -= 1 # Reconsume
3830                                 return tok_cur_tag
3831                         else
3832                                 parse_error()
3833                                 tok_cur_tag.text += "--#{c}"
3834                                 tok_state = tok_state_comment
3835                 return null
3836
3837         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3838         tok_state_comment_end_bang = ->
3839                 switch c = txt.charAt(cur++)
3840                         when '-'
3841                                 tok_cur_tag.text += "--!#{c}"
3842                                 tok_state = tok_state_comment_end_dash
3843                         when '>'
3844                                 tok_state = tok_state_data
3845                                 return tok_cur_tag
3846                         when "\u0000"
3847                                 parse_error()
3848                                 tok_cur_tag.text += "--!\ufffd"
3849                                 tok_state = tok_state_comment
3850                         when '' # EOF
3851                                 parse_error()
3852                                 tok_state = tok_state_data
3853                                 cur -= 1 # Reconsume
3854                                 return tok_cur_tag
3855                         else
3856                                 tok_cur_tag.text += "--!#{c}"
3857                                 tok_state = tok_state_comment
3858                 return null
3859
3860         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3861         tok_state_doctype = ->
3862                 switch c = txt.charAt(cur++)
3863                         when "\t", "\u000a", "\u000c", ' '
3864                                 tok_state = tok_state_before_doctype_name
3865                         when '' # EOF
3866                                 parse_error()
3867                                 tok_state = tok_state_data
3868                                 el = new_doctype_token ''
3869                                 el.flag 'force-quirks', true
3870                                 cur -= 1 # Reconsume
3871                                 return el
3872                         else
3873                                 parse_error()
3874                                 tok_state = tok_state_before_doctype_name
3875                                 cur -= 1 # Reconsume
3876                 return null
3877
3878         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3879         tok_state_before_doctype_name = ->
3880                 c = txt.charAt(cur++)
3881                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3882                         return
3883                 if is_uc_alpha(c)
3884                         tok_cur_tag = new_doctype_token c.toLowerCase()
3885                         tok_state = tok_state_doctype_name
3886                         return
3887                 if c is "\u0000"
3888                         parse_error()
3889                         tok_cur_tag = new_doctype_token "\ufffd"
3890                         tok_state = tok_state_doctype_name
3891                         return
3892                 if c is '>'
3893                         parse_error()
3894                         el = new_doctype_token ''
3895                         el.flag 'force-quirks', true
3896                         tok_state = tok_state_data
3897                         return el
3898                 if c is '' # EOF
3899                         parse_error()
3900                         tok_state = tok_state_data
3901                         el = new_doctype_token ''
3902                         el.flag 'force-quirks', true
3903                         cur -= 1 # Reconsume
3904                         return el
3905                 # Anything else
3906                 tok_cur_tag = new_doctype_token c
3907                 tok_state = tok_state_doctype_name
3908                 return null
3909
3910         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3911         tok_state_doctype_name = ->
3912                 c = txt.charAt(cur++)
3913                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3914                         tok_state = tok_state_after_doctype_name
3915                         return
3916                 if c is '>'
3917                         tok_state = tok_state_data
3918                         return tok_cur_tag
3919                 if is_uc_alpha(c)
3920                         tok_cur_tag.name += c.toLowerCase()
3921                         return
3922                 if c is "\u0000"
3923                         parse_error()
3924                         tok_cur_tag.name += "\ufffd"
3925                         return
3926                 if c is '' # EOF
3927                         parse_error()
3928                         tok_state = tok_state_data
3929                         tok_cur_tag.flag 'force-quirks', true
3930                         cur -= 1 # Reconsume
3931                         return tok_cur_tag
3932                 # Anything else
3933                 tok_cur_tag.name += c
3934                 return null
3935
3936         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3937         tok_state_after_doctype_name = ->
3938                 c = txt.charAt(cur++)
3939                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3940                         return
3941                 if c is '>'
3942                         tok_state = tok_state_data
3943                         return tok_cur_tag
3944                 if c is '' # EOF
3945                         parse_error()
3946                         tok_state = tok_state_data
3947                         tok_cur_tag.flag 'force-quirks', true
3948                         cur -= 1 # Reconsume
3949                         return tok_cur_tag
3950                 # Anything else
3951                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3952                         cur += 5
3953                         tok_state = tok_state_after_doctype_public_keyword
3954                         return
3955                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3956                         cur += 5
3957                         tok_state = tok_state_after_doctype_system_keyword
3958                         return
3959                 parse_error()
3960                 tok_cur_tag.flag 'force-quirks', true
3961                 tok_state = tok_state_bogus_doctype
3962                 return null
3963
3964         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3965         tok_state_after_doctype_public_keyword = ->
3966                 c = txt.charAt(cur++)
3967                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3968                         tok_state = tok_state_before_doctype_public_identifier
3969                         return
3970                 if c is '"'
3971                         parse_error()
3972                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3973                         tok_state = tok_state_doctype_public_identifier_double_quoted
3974                         return
3975                 if c is "'"
3976                         parse_error()
3977                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3978                         tok_state = tok_state_doctype_public_identifier_single_quoted
3979                         return
3980                 if c is '>'
3981                         parse_error()
3982                         tok_cur_tag.flag 'force-quirks', true
3983                         tok_state = tok_state_data
3984                         return tok_cur_tag
3985                 if c is '' # EOF
3986                         parse_error()
3987                         tok_state = tok_state_data
3988                         tok_cur_tag.flag 'force-quirks', true
3989                         cur -= 1 # Reconsume
3990                         return tok_cur_tag
3991                 # Anything else
3992                 parse_error()
3993                 tok_cur_tag.flag 'force-quirks', true
3994                 tok_state = tok_state_bogus_doctype
3995                 return null
3996
3997         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3998         tok_state_before_doctype_public_identifier = ->
3999                 c = txt.charAt(cur++)
4000                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4001                         return
4002                 if c is '"'
4003                         parse_error()
4004                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4005                         tok_state = tok_state_doctype_public_identifier_double_quoted
4006                         return
4007                 if c is "'"
4008                         parse_error()
4009                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4010                         tok_state = tok_state_doctype_public_identifier_single_quoted
4011                         return
4012                 if c is '>'
4013                         parse_error()
4014                         tok_cur_tag.flag 'force-quirks', true
4015                         tok_state = tok_state_data
4016                         return tok_cur_tag
4017                 if c is '' # EOF
4018                         parse_error()
4019                         tok_state = tok_state_data
4020                         tok_cur_tag.flag 'force-quirks', true
4021                         cur -= 1 # Reconsume
4022                         return tok_cur_tag
4023                 # Anything else
4024                 parse_error()
4025                 tok_cur_tag.flag 'force-quirks', true
4026                 tok_state = tok_state_bogus_doctype
4027                 return null
4028
4029
4030         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4031         tok_state_doctype_public_identifier_double_quoted = ->
4032                 c = txt.charAt(cur++)
4033                 if c is '"'
4034                         tok_state = tok_state_after_doctype_public_identifier
4035                         return
4036                 if c is "\u0000"
4037                         parse_error()
4038                         tok_cur_tag.public_identifier += "\ufffd"
4039                         return
4040                 if c is '>'
4041                         parse_error()
4042                         tok_cur_tag.flag 'force-quirks', true
4043                         tok_state = tok_state_data
4044                         return tok_cur_tag
4045                 if c is '' # EOF
4046                         parse_error()
4047                         tok_state = tok_state_data
4048                         tok_cur_tag.flag 'force-quirks', true
4049                         cur -= 1 # Reconsume
4050                         return tok_cur_tag
4051                 # Anything else
4052                 tok_cur_tag.public_identifier += c
4053                 return null
4054
4055         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4056         tok_state_doctype_public_identifier_single_quoted = ->
4057                 c = txt.charAt(cur++)
4058                 if c is "'"
4059                         tok_state = tok_state_after_doctype_public_identifier
4060                         return
4061                 if c is "\u0000"
4062                         parse_error()
4063                         tok_cur_tag.public_identifier += "\ufffd"
4064                         return
4065                 if c is '>'
4066                         parse_error()
4067                         tok_cur_tag.flag 'force-quirks', true
4068                         tok_state = tok_state_data
4069                         return tok_cur_tag
4070                 if c is '' # EOF
4071                         parse_error()
4072                         tok_state = tok_state_data
4073                         tok_cur_tag.flag 'force-quirks', true
4074                         cur -= 1 # Reconsume
4075                         return tok_cur_tag
4076                 # Anything else
4077                 tok_cur_tag.public_identifier += c
4078                 return null
4079
4080         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4081         tok_state_after_doctype_public_identifier = ->
4082                 c = txt.charAt(cur++)
4083                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4084                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4085                         return
4086                 if c is '>'
4087                         tok_state = tok_state_data
4088                         return tok_cur_tag
4089                 if c is '"'
4090                         parse_error()
4091                         tok_cur_tag.system_identifier = ''
4092                         tok_state = tok_state_doctype_system_identifier_double_quoted
4093                         return
4094                 if c is "'"
4095                         parse_error()
4096                         tok_cur_tag.system_identifier = ''
4097                         tok_state = tok_state_doctype_system_identifier_single_quoted
4098                         return
4099                 if c is '' # EOF
4100                         parse_error()
4101                         tok_state = tok_state_data
4102                         tok_cur_tag.flag 'force-quirks', true
4103                         cur -= 1 # Reconsume
4104                         return tok_cur_tag
4105                 # Anything else
4106                 parse_error()
4107                 tok_cur_tag.flag 'force-quirks', true
4108                 tok_state = tok_state_bogus_doctype
4109                 return null
4110
4111         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4112         tok_state_between_doctype_public_and_system_identifiers = ->
4113                 c = txt.charAt(cur++)
4114                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4115                         return
4116                 if c is '>'
4117                         tok_state = tok_state_data
4118                         return tok_cur_tag
4119                 if c is '"'
4120                         parse_error()
4121                         tok_cur_tag.system_identifier = ''
4122                         tok_state = tok_state_doctype_system_identifier_double_quoted
4123                         return
4124                 if c is "'"
4125                         parse_error()
4126                         tok_cur_tag.system_identifier = ''
4127                         tok_state = tok_state_doctype_system_identifier_single_quoted
4128                         return
4129                 if c is '' # EOF
4130                         parse_error()
4131                         tok_state = tok_state_data
4132                         tok_cur_tag.flag 'force-quirks', true
4133                         cur -= 1 # Reconsume
4134                         return tok_cur_tag
4135                 # Anything else
4136                 parse_error()
4137                 tok_cur_tag.flag 'force-quirks', true
4138                 tok_state = tok_state_bogus_doctype
4139                 return null
4140
4141         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4142         tok_state_after_doctype_system_keyword = ->
4143                 c = txt.charAt(cur++)
4144                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4145                         tok_state = tok_state_before_doctype_system_identifier
4146                         return
4147                 if c is '"'
4148                         parse_error()
4149                         tok_cur_tag.system_identifier = ''
4150                         tok_state = tok_state_doctype_system_identifier_double_quoted
4151                         return
4152                 if c is "'"
4153                         parse_error()
4154                         tok_cur_tag.system_identifier = ''
4155                         tok_state = tok_state_doctype_system_identifier_single_quoted
4156                         return
4157                 if c is '>'
4158                         parse_error()
4159                         tok_cur_tag.flag 'force-quirks', true
4160                         tok_state = tok_state_data
4161                         return tok_cur_tag
4162                 if c is '' # EOF
4163                         parse_error()
4164                         tok_state = tok_state_data
4165                         tok_cur_tag.flag 'force-quirks', true
4166                         cur -= 1 # Reconsume
4167                         return tok_cur_tag
4168                 # Anything else
4169                 parse_error()
4170                 tok_cur_tag.flag 'force-quirks', true
4171                 tok_state = tok_state_bogus_doctype
4172                 return null
4173
4174         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4175         tok_state_before_doctype_system_identifier = ->
4176                 c = txt.charAt(cur++)
4177                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4178                         return
4179                 if c is '"'
4180                         tok_cur_tag.system_identifier = ''
4181                         tok_state = tok_state_doctype_system_identifier_double_quoted
4182                         return
4183                 if c is "'"
4184                         tok_cur_tag.system_identifier = ''
4185                         tok_state = tok_state_doctype_system_identifier_single_quoted
4186                         return
4187                 if c is '>'
4188                         parse_error()
4189                         tok_cur_tag.flag 'force-quirks', true
4190                         tok_state = tok_state_data
4191                         return tok_cur_tag
4192                 if c is '' # EOF
4193                         parse_error()
4194                         tok_state = tok_state_data
4195                         tok_cur_tag.flag 'force-quirks', true
4196                         cur -= 1 # Reconsume
4197                         return tok_cur_tag
4198                 # Anything else
4199                 parse_error()
4200                 tok_cur_tag.flag 'force-quirks', true
4201                 tok_state = tok_state_bogus_doctype
4202                 return null
4203
4204         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4205         tok_state_doctype_system_identifier_double_quoted = ->
4206                 c = txt.charAt(cur++)
4207                 if c is '"'
4208                         tok_state = tok_state_after_doctype_system_identifier
4209                         return
4210                 if c is "\u0000"
4211                         parse_error()
4212                         tok_cur_tag.system_identifier += "\ufffd"
4213                         return
4214                 if c is '>'
4215                         parse_error()
4216                         tok_cur_tag.flag 'force-quirks', true
4217                         tok_state = tok_state_data
4218                         return tok_cur_tag
4219                 if c is '' # EOF
4220                         parse_error()
4221                         tok_state = tok_state_data
4222                         tok_cur_tag.flag 'force-quirks', true
4223                         cur -= 1 # Reconsume
4224                         return tok_cur_tag
4225                 # Anything else
4226                 tok_cur_tag.system_identifier += c
4227                 return null
4228
4229         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4230         tok_state_doctype_system_identifier_single_quoted = ->
4231                 c = txt.charAt(cur++)
4232                 if c is "'"
4233                         tok_state = tok_state_after_doctype_system_identifier
4234                         return
4235                 if c is "\u0000"
4236                         parse_error()
4237                         tok_cur_tag.system_identifier += "\ufffd"
4238                         return
4239                 if c is '>'
4240                         parse_error()
4241                         tok_cur_tag.flag 'force-quirks', true
4242                         tok_state = tok_state_data
4243                         return tok_cur_tag
4244                 if c is '' # EOF
4245                         parse_error()
4246                         tok_state = tok_state_data
4247                         tok_cur_tag.flag 'force-quirks', true
4248                         cur -= 1 # Reconsume
4249                         return tok_cur_tag
4250                 # Anything else
4251                 tok_cur_tag.system_identifier += c
4252                 return null
4253
4254         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4255         tok_state_after_doctype_system_identifier = ->
4256                 c = txt.charAt(cur++)
4257                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4258                         return
4259                 if c is '>'
4260                         tok_state = tok_state_data
4261                         return tok_cur_tag
4262                 if c is '' # EOF
4263                         parse_error()
4264                         tok_state = tok_state_data
4265                         tok_cur_tag.flag 'force-quirks', true
4266                         cur -= 1 # Reconsume
4267                         return tok_cur_tag
4268                 # Anything else
4269                 parse_error()
4270                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4271                 tok_state = tok_state_bogus_doctype
4272                 return null
4273
4274         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4275         tok_state_bogus_doctype = ->
4276                 c = txt.charAt(cur++)
4277                 if c is '>'
4278                         tok_state = tok_state_data
4279                         return tok_cur_tag
4280                 if c is '' # EOF
4281                         tok_state = tok_state_data
4282                         cur -= 1 # Reconsume
4283                         return tok_cur_tag
4284                 # Anything else
4285                 return null
4286
4287         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4288         tok_state_cdata_section = ->
4289                 tok_state = tok_state_data
4290                 next_gt = txt.indexOf ']]>', cur
4291                 if next_gt is -1
4292                         val = txt.substr cur
4293                         cur = txt.length
4294                 else
4295                         val = txt.substr cur, (next_gt - cur)
4296                         cur = next_gt + 3
4297                 val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
4298                 return new_character_token val # fixfull split
4299
4300         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4301         # Don't set this as a state, just call it
4302         # returns a string (NOT a text node)
4303         parse_character_reference = (allowed_char = null, in_attr = false) ->
4304                 if cur >= txt.length
4305                         return '&'
4306                 switch c = txt.charAt(cur)
4307                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4308                                 # explicitly not a parse error
4309                                 return '&'
4310                         when ';'
4311                                 # there has to be "one or more" alnums between & and ; to be a parse error
4312                                 return '&'
4313                         when '#'
4314                                 if cur + 1 >= txt.length
4315                                         return '&'
4316                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4317                                         prefix = '#x'
4318                                         charset = hex_chars
4319                                         start = cur + 2
4320                                 else
4321                                         charset = digits
4322                                         start = cur + 1
4323                                         prefix = '#'
4324                                 i = 0
4325                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4326                                         i += 1
4327                                 if i is 0
4328                                         return '&'
4329                                 if txt.charAt(start + i) is ';'
4330                                         i += 1
4331                                 # FIXME This is supposed to generate parse errors for some chars
4332                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4333                                 if decoded?
4334                                         cur = start + i
4335                                         return decoded
4336                                 return '&'
4337                         else
4338                                 for i in [0...31]
4339                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4340                                                 break
4341                                 if i is 0
4342                                         # exit early, because parse_error() below needs at least one alnum
4343                                         return '&'
4344                                 if txt.charAt(cur + i) is ';'
4345                                         i += 1 # include ';' terminator in value
4346                                         decoded = decode_named_char_ref txt.substr(cur, i)
4347                                         if decoded?
4348                                                 cur += i
4349                                                 return decoded
4350                                         parse_error()
4351                                         return '&'
4352                                 else
4353                                         # no ';' terminator (only legacy char refs)
4354                                         max = i
4355                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4356                                                 c = legacy_char_refs[txt.substr(cur, i)]
4357                                                 if c?
4358                                                         if in_attr
4359                                                                 if txt.charAt(cur + i) is '='
4360                                                                         # "because some legacy user agents will
4361                                                                         # misinterpret the markup in those cases"
4362                                                                         parse_error()
4363                                                                         return '&'
4364                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4365                                                                         # this makes attributes forgiving about url args
4366                                                                         return '&'
4367                                                         # ok, and besides the weird exceptions for attributes...
4368                                                         # return the matching char
4369                                                         cur += i # consume entity chars
4370                                                         parse_error() # because no terminating ";"
4371                                                         return c
4372                                         parse_error()
4373                                         return '&'
4374                 return # never reached
4375
4376         # tree constructor initialization
4377         # see comments on TYPE_TAG/etc for the structure of this data
4378         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4379         open_els = []
4380         afe = [] # active formatting elements
4381         template_ins_modes = []
4382         ins_mode = ins_mode_initial
4383         original_ins_mode = ins_mode # TODO check spec
4384         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
4385         flag_frameset_ok = true
4386         flag_parsing = true
4387         flag_foster_parenting = false
4388         form_element_pointer = null
4389         temporary_buffer = null
4390         pending_table_character_tokens = []
4391         head_element_pointer = null
4392         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4393         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4394
4395         # tokenizer initialization
4396         tok_state = tok_state_data
4397
4398         # proccess input
4399         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4400         while flag_parsing
4401                 t = tok_state()
4402                 if t?
4403                         process_token t
4404                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4405         return doc.children
4406
4407 serialize_els = (els, shallow, show_ids) ->
4408         serialized = ''
4409         sep = ''
4410         for t in els
4411                 serialized += sep
4412                 sep = ','
4413                 serialized += t.serialize shallow, show_ids
4414         return serialized
4415
4416 # TODO export TYPE_*
4417 module.exports.parse_html = parse_html
4418 module.exports.debug_log_reset = debug_log_reset
4419 module.exports.debug_log_each = debug_log_each
4420 module.exports.TYPE_TAG = TYPE_TAG
4421 module.exports.TYPE_TEXT = TYPE_TEXT
4422 module.exports.TYPE_COMMENT = TYPE_COMMENT
4423 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4424 module.exports.NS_HTML = NS_HTML
4425 module.exports.NS_MATHML = NS_MATHML
4426 module.exports.NS_SVG = NS_SVG