JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
finish ins_mode_in_body and missing tok_state
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
# Browser shim: when no `module.exports` is available, create the global
# `window.wheic` object and point a stand-in `module.exports` at it.
# NOTE(review): presumably the public API is attached to `module.exports`
# further down the file -- confirm against the export section.
53 unless module?.exports?
54         window.wheic = {}
55         module = exports: window.wheic
56
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
60 TYPE_COMMENT = 2 # comment body is kept in the node's text field
61 TYPE_DOCTYPE = 3
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
65 TYPE_EOF = 6
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
68
69 # namespace constants
# (stored in Node's namespace field; NS_HTML is the default there)
70 NS_HTML = 1
71 NS_MATHML = 2
72 NS_SVG = 3
73
# Module-level buffer of debug messages, oldest first.
74 g_debug_log = []
# Drop every buffered message by installing a fresh, empty list.
75 debug_log_reset = ->
76         g_debug_log = []
# Append one message to the buffer.
77 debug_log = (str) ->
78         g_debug_log.push(str)
# Call cb once for each buffered message, in the order they were logged.
# The callback results are collected into the returned array.
79 debug_log_each = (cb) ->
80         cb entry for entry in g_debug_log
82
# Counter behind auto-generated node ids; bumped for every Node that is
# created without an explicit `id` argument.
83 prev_node_id = 0
# Node is the one record type used both for tokens coming out of the tokenizer
# and for elements/text/comments in the resulting tree. Which fields are
# meaningful depends on @type (one of the TYPE_* constants).
84 class Node
           # type: one of the TYPE_* constants
           # args: optional fields (name, text, attrs, attrs_a, children,
           #       namespace, parent, token, flags, id); anything omitted gets
           #       the default shown below.
85         constructor: (type, args = {}) ->
86                 @type = type # one of the TYPE_* constants above
87                 @name = args.name ? '' # tag name
88                 @text = args.text ? '' # contents for text/comment nodes
89                 @attrs = args.attrs ? {}
                   # FIX: this used to read only `args.attr_k`, so a caller passing
                   # `attrs_a` (the name of the field) was silently ignored. The old
                   # key is still honoured for backward compatibility.
90                 @attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91                 @children = args.children ? []
92                 @namespace = args.namespace ? NS_HTML
93                 @parent = args.parent ? null
94                 @token = args.token ? null
95                 @flags = args.flags ? {}
                   # An explicit id gets a "+" suffix; otherwise take the next value
                   # from the module-level counter.
96                 if args.id?
97                         @id = "#{args.id}+"
98                 else
99                         @id = "#{++prev_node_id}"
           # Record that the self-closing flag was acknowledged, on the originating
           # token when there is one, otherwise on this node itself.
100         acknowledge_self_closing: ->
101                 if @token?
                            # FIX: without a value argument flag() is only a getter, so
                            # the acknowledgement was never stored on the token.
102                         @token.flag 'did_self_close', true
103                 else
104                         @flag 'did_self_close', true
            # Combined getter/setter for @flags: with a non-null value it stores the
            # value under key, otherwise it returns the stored value (undefined
            # when the key was never set).
105         flag: (key, value = null) ->
106                 if value?
107                         @flags[key] = value
108                 else
109                         return @flags[key]
            # Build a compact string form of this node.
            #   shallow:  for tags, stop after the name (no attributes/children)
            #   show_ids: for tags, include "#<id>," after the name
110         serialize: (shallow = false, show_ids = false) -> # for unit tests
111                 ret = ''
112                 switch @type
113                         when TYPE_TAG
114                                 ret += 'tag:'
115                                 ret += JSON.stringify @name
116                                 ret += ','
117                                 if show_ids
118                                         ret += "##{@id},"
119                                 if shallow
120                                         break
                                    # attributes are emitted sorted by key so the output
                                    # does not depend on insertion order
121                                 attr_keys = []
122                                 for k of @attrs
123                                         attr_keys.push k
124                                 attr_keys.sort()
125                                 ret += '{'
126                                 sep = ''
127                                 for k in attr_keys
128                                         ret += sep
129                                         sep = ','
130                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
131                                 ret += '},['
132                                 sep = ''
133                                 for c in @children
134                                         ret += sep
135                                         sep = ','
136                                         ret += c.serialize shallow, show_ids
137                                 ret += ']'
138                         when TYPE_TEXT
139                                 ret += 'text:'
140                                 ret += JSON.stringify @text
141                         when TYPE_COMMENT
142                                 ret += 'comment:'
143                                 ret += JSON.stringify @text
144                         when TYPE_DOCTYPE
                                    # public/system identifiers are not set by the
                                    # constructor; missing ones serialize as ""
145                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
146                         when TYPE_AFE_MARKER
147                                 ret += 'marker'
148                         when TYPE_AAA_BOOKMARK
149                                 ret += 'aaa_bookmark'
150                         else
151                                 ret += 'unknown:'
152                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
153                 return ret
154
155 # helpers: (only take args that are normally known when parser creates nodes)
# Every factory below wraps `new Node` for one TYPE_* constant and fills in
# the single field (name or text) that kind of node needs, if any.
156 new_open_tag = (name) ->
157         new Node(TYPE_START_TAG, {name: name})
158 new_end_tag = (name) ->
159         new Node(TYPE_END_TAG, {name: name})
160 new_element = (name) ->
161         new Node(TYPE_TAG, {name: name})
162 new_text_node = (txt) ->
163         new Node(TYPE_TEXT, {text: txt})
# character tokens and text nodes share one representation
164 new_character_token = new_text_node
165 new_comment_token = (txt) ->
166         new Node(TYPE_COMMENT, {text: txt})
167 new_doctype_token = (name) ->
168         new Node(TYPE_DOCTYPE, {name: name})
169 new_eof_token = ->
170         new Node(TYPE_EOF)
171 new_afe_marker = ->
172         new Node(TYPE_AFE_MARKER)
173 new_aaa_bookmark = ->
174         new Node(TYPE_AAA_BOOKMARK)
175
# ASCII character classes, kept as plain strings and searched with indexOf.
176 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
177 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
178 digits = "0123456789"
179 alnum = lc_alpha + uc_alpha + digits
180 hex_chars = digits + "abcdefABCDEF"
181
# True only for a one-character string holding an uppercase ASCII letter.
182 is_uc_alpha = (str) ->
183         str.length is 1 and uc_alpha.indexOf(str) isnt -1
# True only for a one-character string holding a lowercase ASCII letter.
184 is_lc_alpha = (str) ->
185         str.length is 1 and lc_alpha.indexOf(str) isnt -1
186
187 # some SVG elements have dashes in them
188 tag_name_chars = "#{alnum}-"
189
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
# True only for a one-character string holding one of the space_chars.
192 is_space = (txt) ->
193         txt.length is 1 and space_chars.indexOf(txt) isnt -1
# True when t is a text token whose text is exactly one space character.
194 is_space_tok = (t) ->
195         t.type is TYPE_TEXT and is_space(t.text)
196
# True when start-tag token t carries a `type` attribute whose value is
# "hidden" (compared case-insensitively). Only the first `type` entry found in
# t.attrs_a is considered. Always returns a boolean.
#
# FIX: the loop used `for a of t.attrs_a`, which walks the array's index keys
# ("0", "1", ...) instead of its [key, value] pairs, so the `type` attribute
# could never be found. The non-start-tag case also returned undefined.
197 is_input_hidden_tok = (t) ->
198         return false unless t.type is TYPE_START_TAG
199         for a in t.attrs_a
200                 if a[0] is 'type'
201                         if a[1].toLowerCase() is 'hidden'
202                                 return true
203                         return false
204         return false
205
# Wider whitespace set than space_chars: it adds Unicode spaces and separators
# from the list linked below.
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
208
# Maps a reference name (without "&" and without ";") to its replacement text.
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
211 legacy_char_refs = {
212         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
229         yen: '¥', yuml: 'ÿ'
230 }
231
# Element-name lists grouped by content model (names are lowercase).
232 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
233 raw_text_elements = ['script', 'style']
234 escapable_raw_text_elements = ['textarea', 'title']
# SVG element names, in their canonical mixed-case spelling.
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
236 svg_elements = [
237         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
251         'view', 'vkern'
252 ]
253
# MathML element names.
# NOTE(review): 'mi' appears twice in the list below (just before 'min');
# harmless for membership tests, but one entry may have been meant to be a
# different name -- confirm against the MathML element index.
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
255 mathml_elements = [
256         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262         'determinant', 'diff', 'divergence', 'divide', 'domain',
263         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283         'vectorproduct', 'xor'
284 ]
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
287
# Maps an element name to the namespace in which that name is "special".
# el_is_special() looks an element up by name and compares the stored
# namespace with the element's own.
#
# NOTE(review): `title` is listed twice in this literal, once as NS_HTML (HTML
# section) and once as NS_SVG (SVG section). A name can only map to one value,
# so the later SVG entry wins in the compiled object and an HTML <title> is not
# reported as special; some CoffeeScript versions may also reject duplicate
# keys at compile time. Fixing this needs a name+namespace keyed structure and
# a matching change in every consumer.
288 special_elements = {
289         # HTML:
290         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307         wbr:NS_HTML, xmp:NS_HTML,
308
309         # MathML:
310         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311         'annotation-xml':NS_MATHML,
312
313         # SVG:
314         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
315 }
316
# Set (name -> true) of the tag names treated as formatting elements.
317 formatting_elements = {
318          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
320          u: true
321 }
322
# Heading tag names mapped to their namespace (same name -> namespace layout
# as special_elements).
323 h_tags = {
324         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
325 }
326
# Set (name -> true) of table-structure tag names.
# NOTE(review): presumably consulted when deciding whether foster parenting
# applies to the current node -- the consumer is outside this chunk.
327 # FIXME namespacify
328 foster_parenting_targets = {
329         table: true
330         tbody: true
331         tfoot: true
332         thead: true
333         tr: true
334 }
335
# Set (name -> true) of tag names whose end tag may be implied.
# NOTE(review): presumably used by a "generate implied end tags" step; that
# consumer is outside this chunk.
336 # FIXME namespacify
337 # all html I presume
338 end_tag_implied = {
339         dd: true
340         dt: true
341         li: true
342         option: true
343         optgroup: true
344         p: true
345         rb: true
346         rp: true
347         rt: true
348         rtc: true
349 }
350
# An element is "special" when special_elements lists its name under the very
# namespace the element lives in.
351 el_is_special = (e) ->
352         e.namespace is special_elements[e.name]
353
# HTML address, div and p: the three special elements excluded below.
354 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
# Same test as el_is_special, except that address/div/p never qualify.
355 el_is_special_not_adp = (el) ->
356         el_is_special(el) and adp_els[el.name] isnt el.namespace
357
# Lowercased SVG attribute name -> its proper mixed-case spelling.
# adjust_svg_attributes() uses this table to restore the case of attribute
# names on a start-tag token.
358 svg_attribute_fixes = {
359         attributename: 'attributeName'
360         attributetype: 'attributeType'
361         basefrequency: 'baseFrequency'
362         baseprofile: 'baseProfile'
363         calcmode: 'calcMode'
364         clippathunits: 'clipPathUnits'
365         contentscripttype: 'contentScriptType'
366         contentstyletype: 'contentStyleType'
367         diffuseconstant: 'diffuseConstant'
368         edgemode: 'edgeMode'
369         externalresourcesrequired: 'externalResourcesRequired'
370         filterres: 'filterRes'
371         filterunits: 'filterUnits'
372         glyphref: 'glyphRef'
373         gradienttransform: 'gradientTransform'
374         gradientunits: 'gradientUnits'
375         kernelmatrix: 'kernelMatrix'
376         kernelunitlength: 'kernelUnitLength'
377         keypoints: 'keyPoints'
378         keysplines: 'keySplines'
379         keytimes: 'keyTimes'
380         lengthadjust: 'lengthAdjust'
381         limitingconeangle: 'limitingConeAngle'
382         markerheight: 'markerHeight'
383         markerunits: 'markerUnits'
384         markerwidth: 'markerWidth'
385         maskcontentunits: 'maskContentUnits'
386         maskunits: 'maskUnits'
387         numoctaves: 'numOctaves'
388         pathlength: 'pathLength'
389         patterncontentunits: 'patternContentUnits'
390         patterntransform: 'patternTransform'
391         patternunits: 'patternUnits'
392         pointsatx: 'pointsAtX'
393         pointsaty: 'pointsAtY'
394         pointsatz: 'pointsAtZ'
395         preservealpha: 'preserveAlpha'
396         preserveaspectratio: 'preserveAspectRatio'
397         primitiveunits: 'primitiveUnits'
398         refx: 'refX'
399         refy: 'refY'
400         repeatcount: 'repeatCount'
401         repeatdur: 'repeatDur'
402         requiredextensions: 'requiredExtensions'
403         requiredfeatures: 'requiredFeatures'
404         specularconstant: 'specularConstant'
405         specularexponent: 'specularExponent'
406         spreadmethod: 'spreadMethod'
407         startoffset: 'startOffset'
408         stddeviation: 'stdDeviation'
409         stitchtiles: 'stitchTiles'
410         surfacescale: 'surfaceScale'
411         systemlanguage: 'systemLanguage'
412         tablevalues: 'tableValues'
413         targetx: 'targetX'
414         targety: 'targetY'
415         textlength: 'textLength'
416         viewbox: 'viewBox'
417         viewtarget: 'viewTarget'
418         xchannelselector: 'xChannelSelector'
419         ychannelselector: 'yChannelSelector'
420         zoomandpan: 'zoomAndPan'
421 }
# Rewrite, in place, the name of every `definitionurl` attribute on start-tag
# token t to `definitionURL`. Returns nothing.
422 adjust_mathml_attributes = (t) ->
423         for attr in t.attrs_a when attr[0] is 'definitionurl'
424                 attr[0] = 'definitionURL'
425         return
# Rewrite, in place, each attribute name on start-tag token t that has an
# entry in svg_attribute_fixes to its mixed-case form. Returns nothing.
427 adjust_svg_attributes = (t) ->
428         for attr in t.attrs_a
429                 fixed_name = svg_attribute_fixes[attr[0]]
430                 attr[0] = fixed_name if fixed_name?
431         return
# Placeholder: currently leaves token t untouched and returns nothing.
# ("fixfull" marks code still missing for full-document parsing.)
432 adjust_foreign_attributes = (t) ->
433         # fixfull
434         return
435
436 # decode_named_char_ref()
437 #
438 # The list of named character references is _huge_ so ask the browser to decode
439 # for us instead of wasting bandwidth/space on including the table here.
440 #
441 # Pass without the "&" but with the ";" examples:
442 #    for "&amp" pass "amp;"
443 #    for "&#x2032" pass "x2032;"
# Shared state for decode_named_char_ref(): a memo of earlier answers plus a
# detached <textarea> that the browser decodes references through.
444 g_dncr = {
445         cache: {}
446         textarea: document.createElement('textarea')
447 }
448 # TODO test this in IE8
# Returns the decoded text for reference txt (given without the leading "&"),
# or null when the browser hands the input back unchanged. Successful lookups
# are remembered in g_dncr.cache.
449 decode_named_char_ref = (txt) ->
450         entity = "&#{txt}"
451         cached = g_dncr.cache[entity]
452         return cached if cached?
            # let the browser's own HTML parser do the decoding
453         g_dncr.textarea.innerHTML = entity
454         result = g_dncr.textarea.value
455         return null if result is entity
456         g_dncr.cache[entity] = result
457
458 parse_html = (txt, parse_error_cb = null) ->
459         cur = 0 # index of next char in txt to be parsed
460         # declare doc and tokenizer variables so they're in scope below
            # (everything starts out null; the helper closures defined below share
            # these variables)
461         doc = null
462         open_els = null # stack of open elements (index 0 is the current node)
463         afe = null # active formatting elements (afe_push adds at index 0)
464         template_ins_modes = null
465         ins_mode = null # current insertion mode; reset_ins_mode assigns an ins_mode_* value
466         original_ins_mode = null
467         tok_state = null
468         tok_cur_tag = null # partially parsed tag
469         flag_scripting = null
470         flag_frameset_ok = null
471         flag_parsing = null # set to false by stop_parsing
472         flag_foster_parenting = null
473         form_element_pointer = null
474         temporary_buffer = null
475         pending_table_character_tokens = null
476         head_element_pointer = null
477         flag_fragment_parsing = null
478         context_element = null
479
480         stop_parsing = ->
                    # Clears the shared flag_parsing variable.
                    # NOTE(review): presumably the main parse loop polls this flag --
                    # that loop is outside this chunk.
481                 flag_parsing = false
482
483         parse_error = ->
                    # Report a parse error at the current input offset: hand `cur` to
                    # the caller-supplied callback when there is one, otherwise log a
                    # message to the console.
484                 return parse_error_cb cur if parse_error_cb?
485                 console.log "Parse error at character #{cur} of #{txt.length}"
488
489         afe_push = (new_el) ->
                    # Push new_el onto the list of active formatting elements (index 0
                    # is the newest entry). Before pushing, apply the spec's "Noah's
                    # Ark" clause: if three entries after the last marker already have
                    # the same name, namespace and attributes, drop the earliest one.
                    #
                    # FIX: the attribute loops used `continue`, which only continued
                    # the inner loop, so attributes were never actually compared; and
                    # the scan ran past markers instead of stopping at the last one.
490                 matches = 0
491                 for el, i in afe
492                         break if el.type is TYPE_AFE_MARKER
                            # start from name/namespace equality; any attribute that
                            # differs in either direction clears it
493                         same = el.name is new_el.name and el.namespace is new_el.namespace
494                         for k, v of el.attrs when new_el.attrs[k] isnt v
495                                 same = false
496                         for k, v of new_el.attrs when el.attrs[k] isnt v
497                                 same = false
                            # scanning newest to oldest, the third match is the earliest
498                         if same and ++matches is 3
499                                 afe.splice i, 1
500                                 break
501                 afe.unshift new_el
502         afe_push_marker = ->
                    # Put a fresh marker entry at the newest end (index 0) of afe.
503                 afe.unshift(new_afe_marker())
504
505         # the functions below implement the Tree Construction algorithm
506         # http://www.w3.org/TR/html5/syntax.html#tree-construction
507
508         # But first... the helpers
509         template_tag_is_open = ->
                    # True when any element on the open-elements stack is named
                    # 'template'. Only the name is compared (maybe the namespace
                    # should be checked as well).
510                 for el in open_els when el.name is 'template'
511                         return true
512                 false
514         is_in_scope_x = (tag_name, scope, namespace) ->
                    # Walk open_els from the current node (index 0) upward. Answer true
                    # at the first element named tag_name (any namespace when namespace
                    # is null); answer false as soon as an element listed in `scope`
                    # (name -> namespace) is reached, or when the stack runs out.
515                 for el in open_els
516                         ns_ok = namespace is null or namespace is el.namespace
517                         return true if ns_ok and el.name is tag_name
518                         return false if scope[el.name] is el.namespace
519                 false
521         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
                    # Same walk as is_in_scope_x, but an element listed in either
                    # `scope` or `scope2` ends the search with false.
522                 for el in open_els
523                         ns_ok = namespace is null or namespace is el.namespace
524                         return true if ns_ok and el.name is tag_name
525                         return false if scope[el.name] is el.namespace or scope2[el.name] is el.namespace
526                 false
530         standard_scopers = {
                    # name -> namespace of the elements that end a scope search
                    # (used by is_in_scope and el_is_in_scope; the button and list-item
                    # variants add their own extra entries on top of these)
531                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
532                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
533                 template: NS_HTML, mi: NS_MATHML,
534
535                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
536                 'annotation-xml': NS_MATHML,
537
538                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
539         }
540         button_scopers = button: NS_HTML # extra boundary for is_in_button_scope
541         li_scopers = ol: NS_HTML, ul: NS_HTML # extra boundaries for is_in_li_scope
542         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML # the only boundaries for is_in_table_scope
543         is_in_scope = (tag_name, namespace = null) ->
                    # scope bounded by the standard scoping elements
544                 is_in_scope_x(tag_name, standard_scopers, namespace)
545         is_in_button_scope = (tag_name, namespace = null) ->
                    # standard boundaries plus <button>
546                 is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
547         is_in_table_scope = (tag_name, namespace = null) ->
                    # bounded only by html, table and template
548                 is_in_scope_x(tag_name, table_scopers, namespace)
549         # aka is_in_list_item_scope
550         is_in_li_scope = (tag_name, namespace = null) ->
                    # standard boundaries plus <ol> and <ul>
551                 is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
552         is_in_select_scope = (tag_name, namespace = null) ->
                    # "Select scope": walk open_els from the current node upward and
                    # answer true at the first element named tag_name (any namespace
                    # when namespace is null). Every element other than an HTML
                    # <optgroup> or <option> is a scope boundary and ends the search
                    # with false.
                    #
                    # FIX: the boundary test read `t.ns`, a field Node does not have
                    # (the field is `namespace`), and its boolean form did not express
                    # "anything except HTML optgroup/option", so a non-HTML element
                    # named option or optgroup was not treated as a boundary.
553                 for t in open_els
554                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
555                                 return true
556                         unless t.namespace is NS_HTML and (t.name is 'optgroup' or t.name is 'option')
557                                 return false
558                 return false
559         # this checks for a particular element, not by name
560         el_is_in_scope = (el) ->
                    # Identity-based variant of is_in_scope: true when `el` itself is
                    # reached before any standard scoping element.
561                 for candidate in open_els
562                         return true if candidate is el
563                         return false if standard_scopers[candidate.name] is candidate.namespace
564                 false
567
568         clear_to_table_stopers = {
569                 'table': true
570                 'template': true
571                 'html': true
572         }
573         clear_stack_to_table_context = ->
                    # Pop open elements until the current node (index 0) is a table,
                    # template or html element.
                    # FIX: also stop when the stack runs empty. Previously
                    # open_els[0].name threw once nothing was left, which can happen
                    # when none of the stop elements is on the stack (the same
                    # situation clear_afe_to_marker already guards against).
574                 until open_els.length is 0 or clear_to_table_stopers[open_els[0].name]?
575                         open_els.shift()
576                 return
579         clear_to_table_body_stopers = {
580                 'tbody': true
581                 'tfoot': true
582                 'thead': true
583                 'template': true
584                 'html': true
585         }
586         clear_stack_to_table_body_context = ->
                    # Pop open elements until the current node (index 0) is a tbody,
                    # tfoot, thead, template or html element.
                    # FIX: also stop when the stack runs empty. Previously
                    # open_els[0].name threw once nothing was left, which can happen
                    # when none of the stop elements is on the stack.
587                 until open_els.length is 0 or clear_to_table_body_stopers[open_els[0].name]?
588                         open_els.shift()
589                 return
592         clear_to_table_row_stopers = {
593                 'tr': true
594                 'template': true
595                 'html': true
596         }
597         clear_stack_to_table_row_context = ->
                    # Pop open elements until the current node (index 0) is a tr,
                    # template or html element.
                    # FIX: also stop when the stack runs empty. Previously
                    # open_els[0].name threw once nothing was left, which can happen
                    # when none of the stop elements is on the stack.
598                 until open_els.length is 0 or clear_to_table_row_stopers[open_els[0].name]?
599                         open_els.shift()
600                 return
603         clear_afe_to_marker = ->
                    # Remove entries from the newest end of afe up to and including the
                    # first marker. Running out of entries before a marker is found
                    # simply ends the loop (this happens in fragment case, ?spec error).
604                 while afe.length > 0
605                         return if afe.shift().type is TYPE_AFE_MARKER
606                 return
610
611         # 8.2.3.1 ...
612         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
613         reset_ins_mode = ->
614                 # 1. Let last be false.
615                 last = false
616                 # 2. Let node be the last node in the stack of open elements.
617                 node_i = 0
618                 node = open_els[node_i]
619                 # 3. Loop: If node is the first node in the stack of open elements,
620                 # then set last to true, and, if the parser was originally created as
621                 # part of the HTML fragment parsing algorithm (fragment case) set node
622                 # to the context element.
623                 loop
624                         if node_i is open_els.length - 1
625                                 last = true
626                                 # fixfull (fragment case)
627
628                         # 4. If node is a select element, run these substeps:
629                         if node.name is 'select'
630                                 # 1. If last is true, jump to the step below labeled done.
631                                 unless last
632                                         # 2. Let ancestor be node.
633                                         ancestor_i = node_i
634                                         ancestor = node
635                                         # 3. Loop: If ancestor is the first node in the stack of
636                                         # open elements, jump to the step below labeled done.
637                                         loop
638                                                 if ancestor_i is open_els.length - 1
639                                                         break
640                                                 # 4. Let ancestor be the node before ancestor in the stack
641                                                 # of open elements.
642                                                 ancestor_i += 1
643                                                 ancestor = open_els[ancestor_i]
644                                                 # 5. If ancestor is a template node, jump to the step below
645                                                 # labeled done.
646                                                 if ancestor.name is 'template'
647                                                         break
648                                                 # 6. If ancestor is a table node, switch the insertion mode
649                                                 # to "in select in table" and abort these steps.
650                                                 if ancestor.name is 'table'
651                                                         ins_mode = ins_mode_in_select_in_table
652                                                         return
653                                                 # 7. Jump back to the step labeled loop.
654                                 # 8. Done: Switch the insertion mode to "in select" and abort
655                                 # these steps.
656                                 ins_mode = ins_mode_in_select
657                                 return
658                         # 5. If node is a td or th element and last is false, then switch
659                         # the insertion mode to "in cell" and abort these steps.
660                         if (node.name is 'td' or node.name is 'th') and last is false
661                                 ins_mode = ins_mode_in_cell
662                                 return
663                         # 6. If node is a tr element, then switch the insertion mode to "in
664                         # row" and abort these steps.
665                         if node.name is 'tr'
666                                 ins_mode = ins_mode_in_row
667                                 return
668                         # 7. If node is a tbody, thead, or tfoot element, then switch the
669                         # insertion mode to "in table body" and abort these steps.
670                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
671                                 ins_mode = ins_mode_in_table_body
672                                 return
673                         # 8. If node is a caption element, then switch the insertion mode
674                         # to "in caption" and abort these steps.
675                         if node.name is 'caption'
676                                 ins_mode = ins_mode_in_caption
677                                 return
678                         # 9. If node is a colgroup element, then switch the insertion mode
679                         # to "in column group" and abort these steps.
680                         if node.name is 'colgroup'
681                                 ins_mode = ins_mode_in_column_group
682                                 return
683                         # 10. If node is a table element, then switch the insertion mode to
684                         # "in table" and abort these steps.
685                         if node.name is 'table'
686                                 ins_mode = ins_mode_in_table
687                                 return
688                         # 11. If node is a template element, then switch the insertion mode
689                         # to the current template insertion mode and abort these steps.
690                         # fixfull (template insertion mode stack)
691
692                         # 12. If node is a head element and last is true, then switch the
693                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
694                         # these steps. (fragment case)
695                         if node.name is 'head' and last
696                                 ins_mode = ins_mode_in_body
697                                 return
698                         # 13. If node is a head element and last is false, then switch the
699                         # insertion mode to "in head" and abort these steps.
700                         if node.name is 'head' and last is false
701                                 ins_mode = ins_mode_in_head
702                                 return
703                         # 14. If node is a body element, then switch the insertion mode to
704                         # "in body" and abort these steps.
705                         if node.name is 'body'
706                                 ins_mode = ins_mode_in_body
707                                 return
708                         # 15. If node is a frameset element, then switch the insertion mode
709                         # to "in frameset" and abort these steps. (fragment case)
710                         if node.name is 'frameset'
711                                 ins_mode = ins_mode_in_frameset
712                                 return
713                         # 16. If node is an html element, run these substeps:
714                         if node.name is 'html'
715                                 # 1. If the head element pointer is null, switch the insertion
716                                 # mode to "before head" and abort these steps. (fragment case)
717                                 if head_element_pointer is null
718                                         ins_mode = ins_mode_before_head
719                                 else
720                                         # 2. Otherwise, the head element pointer is not null,
721                                         # switch the insertion mode to "after head" and abort these
722                                         # steps.
723                                         ins_mode = ins_mode_after_head
724                                 return
725                         # 17. If last is true, then switch the insertion mode to "in body"
726                         # and abort these steps. (fragment case)
727                         if last
728                                 ins_mode = ins_mode_in_body
729                                 return
730                         # 18. Let node now be the node before node in the stack of open
731                         # elements.
732                         node_i += 1
733                         node = open_els[node_i]
734                         # 19. Return to the step labeled loop.
735
736         # 8.2.3.2
737
738         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
739         adjusted_current_node = ->
740                 # fragment parsing with exactly one open element: the context element stands in
741                 use_context_el = flag_fragment_parsing and open_els.length is 1
742                 if use_context_el then context_element else open_els[0]
743
744         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
745         # this implementation is structured (mostly) as described at the link above.
746         # capitalized comments are the "labels" described at the link above.
747         reconstruct_afe = ->
748                 return if afe.length is 0
749                 # the newest entry (index 0) is a marker or still open: nothing to do
750                 return if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
751                 # Rewind: step toward older entries (higher indexes) for as long as
752                 # the next older entry is neither a marker nor an open element
753                 afe_i = 0
754                 afe_last_i = afe.length - 1
755                 while afe_i isnt afe_last_i
756                         older = afe[afe_i + 1]
757                         break if older.type is TYPE_AFE_MARKER or older in open_els
758                         afe_i += 1
759                 # Create: insert a fresh element for each entry from afe_i back down
760                 # to index 0, storing it in the list in place of the old entry
761                 loop
762                         fresh_el = insert_html_element afe[afe_i].token
763                         afe[afe_i] = fresh_el
764                         break if afe_i is 0
765                         afe_i -= 1 # Advance
766
767         # Adoption agency algorithm (the spec steps are quoted inline below):
768         #   http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
769         # subject is a tag name (compared against element .name). Overviews:
770         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
771         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
772         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
773         adoption_agency = (subject) ->
774                 debug_log "adoption_agency()"
775                 debug_log "tree: #{serialize_els doc.children, false, true}"
776                 debug_log "open_els: #{serialize_els open_els, true, true}"
777                 debug_log "afe: #{serialize_els afe, true, true}"
778                 if open_els[0].name is subject
779                         el = open_els[0]
780                         open_els.shift()
781                         # remove it from the list of active formatting elements (if found)
782                         for t, i in afe
783                                 if t is el
784                                         afe.splice i, 1
785                                         break
786                         debug_log "aaa: starting off with subject on top of stack, exiting"
787                         return
788                 outer = 0 # outer loop counter; the loop below returns once it reaches 8
789                 loop
790                         if outer >= 8
791                                 return
792                         outer += 1
793                         # 5. Let formatting element be the last element in the list of
794                         # active formatting elements that: is between the end of the list
795                         # and the last scope marker in the list, if any, or the start of
796                         # the list otherwise, and has the tag name subject.
797                         fe = null
798                         for t, fe_of_afe in afe # scan starts at index 0 and stops at the first marker
799                                 if t.type is TYPE_AFE_MARKER
800                                         break
801                                 if t.name is subject
802                                         fe = t
803                                         break
804                         # If there is no such element, then abort these steps and instead
805                         # act as described in the "any other end tag" entry above.
806                         if fe is null
807                                 debug_log "aaa: fe not found in afe"
808                                 in_body_any_other_end_tag subject
809                                 return
810                         # 6. If formatting element is not in the stack of open elements,
811                         # then this is a parse error; remove the element from the list, and
812                         # abort these steps.
813                         in_open_els = false
814                         for t, fe_of_open_els in open_els
815                                 if t is fe
816                                         in_open_els = true
817                                         break
818                         unless in_open_els
819                                 debug_log "aaa: fe not found in open_els"
820                                 parse_error()
821                                 # "remove it from the list" must mean afe, since it's not in open_els
822                                 afe.splice fe_of_afe, 1
823                                 return
824                         # 7. If formatting element is in the stack of open elements, but
825                         # the element is not in scope, then this is a parse error; abort
826                         # these steps.
827                         unless el_is_in_scope fe
828                                 debug_log "aaa: fe not in scope"
829                                 parse_error()
830                                 return
831                         # 8. If formatting element is not the current node, this is a parse
832                         # error. (But do not abort these steps.)
833                         unless open_els[0] is fe
834                                 parse_error()
835                                 # no return here: the algorithm continues with step 9
836                         # 9. Let furthest block be the topmost node in the stack of open
837                         # elements that is lower in the stack than formatting element, and
838                         # is an element in the special category. There might not be one.
839                         fb = null
840                         fb_of_open_els = null
841                         for t, i in open_els
842                                 if t is fe
843                                         break
844                                 if el_is_special t
845                                         fb = t
846                                         fb_of_open_els = i
847                                         # and continue, to see if there's one that's more "topmost"
848                         # 10. If there is no furthest block, then the UA must first pop all
849                         # the nodes from the bottom of the stack of open elements, from the
850                         # current node up to and including formatting element, then remove
851                         # formatting element from the list of active formatting elements,
852                         # and finally abort these steps.
853                         if fb is null
854                                 debug_log "aaa: no fb"
855                                 loop
856                                         t = open_els.shift()
857                                         if t is fe
858                                                 afe.splice fe_of_afe, 1
859                                                 return
860                         # 11. Let common ancestor be the element immediately above
861                         # formatting element in the stack of open elements.
862                         ca = open_els[fe_of_open_els + 1] # common ancestor
863
864                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
865                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
866                         bookmark = new_aaa_bookmark()
867                         for t, i in afe
868                                 if t is fe
869                                         afe.splice i, 0, bookmark # bookmark takes index i; fe moves to i + 1
870                                         break
871                         node = last_node = fb # node and last_node both start at the furthest block
872                         inner = 0 # inner loop counter
873                         loop
874                                 inner += 1
875                                 # 3. Let node be the element immediately above node in the
876                                 # stack of open elements, or if node is no longer in the stack
877                                 # of open elements (e.g. because it got removed by this
878                                 # algorithm), the element that was immediately above node in
879                                 # the stack of open elements before node was removed.
880                                 node_next = null
881                                 for t, i in open_els
882                                         if t is node
883                                                 node_next = open_els[i + 1]
884                                                 break
885                                 node = node_next ? node_above # node_above is used when node_next is null/undefined
886                                 debug_log "inner loop #{inner}"
887                                 debug_log "tree: #{serialize_els doc.children, false, true}"
888                                 debug_log "open_els: #{serialize_els open_els, true, true}"
889                                 debug_log "afe: #{serialize_els afe, true, true}"
890                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
891                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
892                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
893                                 debug_log "node: #{node.serialize true, true}"
894                                 # TODO confirm node_above is re-set whenever node leaves open_els (steps 6 and 7 below both set it)
895
896                                 # 4. If node is formatting element, then go to the next step in
897                                 # the overall algorithm.
898                                 if node is fe
899                                         break
900                                 debug_log "the meat"
901                                 # 5. If inner loop counter is greater than three and node is in
902                                 # the list of active formatting elements, then remove node from
903                                 # the list of active formatting elements.
904                                 node_in_afe = false
905                                 for t, i in afe
906                                         if t is node
907                                                 if inner > 3
908                                                         afe.splice i, 1
909                                                         debug_log "max out inner"
910                                                 else
911                                                         node_in_afe = true
912                                                         debug_log "in afe"
913                                                 break
914                                 # 6. If node is not in the list of active formatting elements,
915                                 # then remove node from the stack of open elements and then go
916                                 # back to the step labeled inner loop.
917                                 unless node_in_afe
918                                         debug_log "not in afe"
919                                         for t, i in open_els
920                                                 if t is node
921                                                         node_above = open_els[i + 1] # remember what was above node before removing it
922                                                         open_els.splice i, 1
923                                                         break
924                                         continue
925                                 debug_log "the bones"
926                                 # 7. create an element for the token for which the element node
927                                 # was created, in the HTML namespace, with common ancestor as
928                                 # the intended parent; replace the entry for node in the list
929                                 # of active formatting elements with an entry for the new
930                                 # element, replace the entry for node in the stack of open
931                                 # elements with an entry for the new element, and let node be
932                                 # the new element.
933                                 new_node = token_to_element node.token, NS_HTML, ca
934                                 for t, i in afe
935                                         if t is node
936                                                 afe[i] = new_node
937                                                 debug_log "replaced in afe"
938                                                 break
939                                 for t, i in open_els
940                                         if t is node
941                                                 node_above = open_els[i + 1]
942                                                 open_els[i] = new_node
943                                                 debug_log "replaced in open_els"
944                                                 break
945                                 node = new_node
946                                 # 8. If last node is furthest block, then move the
947                                 # aforementioned bookmark to be immediately after the new node
948                                 # in the list of active formatting elements.
949                                 if last_node is fb
950                                         for t, i in afe
951                                                 if t is bookmark
952                                                         afe.splice i, 1
953                                                         debug_log "removed bookmark"
954                                                         break
955                                         for t, i in afe
956                                                 if t is node
957                                                         # "after" means lower
958                                                         afe.splice i, 0, bookmark # bookmark takes node's index; node moves to i + 1
959                                                         debug_log "placed bookmark after node"
960                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
961                                                         break
962                                 # 9. Insert last node into node, first removing it from its
963                                 # previous parent node if any.
964                                 if last_node.parent?
965                                         debug_log "last_node has parent"
966                                         for c, i in last_node.parent.children
967                                                 if c is last_node
968                                                         debug_log "removing last_node from parent"
969                                                         last_node.parent.children.splice i, 1
970                                                         break
971                                 node.children.push last_node
972                                 last_node.parent = node
973                                 # 10. Let last node be node.
974                                 last_node = node
975                                 debug_log "at last"
976                                 # 11. Return to the step labeled inner loop.
977                         # 14. Insert whatever last node ended up being in the previous step
978                         # at the appropriate place for inserting a node, but using common
979                         # ancestor as the override target.
980
981                         # In the case where fe is immediately followed by fb:
982                         #   * inner loop exits out early (node==fe)
983                         #   * last_node is fb
984                         #   * last_node is still in the tree (not a duplicate)
985                         if last_node.parent?
986                                 debug_log "FEFIRST? last_node has parent"
987                                 for c, i in last_node.parent.children
988                                         if c is last_node
989                                                 debug_log "removing last_node from parent"
990                                                 last_node.parent.children.splice i, 1
991                                                 break
992
993                         debug_log "after aaa inner loop"
994                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
995                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
996                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
997                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
998                         debug_log "tree: #{serialize_els doc.children, false, true}"
999
1000                         debug_log "insert"
1001
1002
1003                         # can't use standard insert token thing, because it's already in
1004                         # open_els and must stay at its current position in open_els
1005                         dest = adjusted_insertion_location ca
1006                         dest[0].children.splice dest[1], 0, last_node
1007                         last_node.parent = dest[0]
1008
1009
1010                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1011                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1012                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1013                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1014                         debug_log "tree: #{serialize_els doc.children, false, true}"
1015
1016                         # 15. Create an element for the token for which formatting element
1017                         # was created, in the HTML namespace, with furthest block as the
1018                         # intended parent.
1019                         new_element = token_to_element fe.token, NS_HTML, fb
1020                         # 16. Take all of the child nodes of furthest block and append them
1021                         # to the element created in the last step.
1022                         while fb.children.length
1023                                 t = fb.children.shift()
1024                                 t.parent = new_element
1025                                 new_element.children.push t
1026                         # 17. Append that new element to furthest block.
1027                         new_element.parent = fb
1028                         fb.children.push new_element
1029                         # 18. Remove formatting element from the list of active formatting
1030                         # elements, and insert the new element into the list of active
1031                         # formatting elements at the position of the aforementioned
1032                         # bookmark.
1033                         for t, i in afe
1034                                 if t is fe
1035                                         afe.splice i, 1
1036                                         break
1037                         for t, i in afe
1038                                 if t is bookmark
1039                                         afe[i] = new_element # the bookmark entry itself is overwritten
1040                                         break
1041                         # 19. Remove formatting element from the stack of open elements,
1042                         # and insert the new element into the stack of open elements
1043                         # immediately below the position of furthest block in that stack.
1044                         for t, i in open_els
1045                                 if t is fe
1046                                         open_els.splice i, 1
1047                                         break
1048                         for t, i in open_els
1049                                 if t is fb
1050                                         open_els.splice i, 0, new_element # new_element takes fb's index; fb moves to i + 1
1051                                         break
1052                         # 20. Jump back to the step labeled outer loop.
1053                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1054                         debug_log "tree: #{serialize_els doc.children, false, true}"
1055                         debug_log "open_els: #{serialize_els open_els, true, true}"
1056                         debug_log "afe: #{serialize_els afe, true, true}"
1057                 debug_log "AAA DONE" # NOTE(review): looks unreachable, the outer loop above only exits via return
1058
1059         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1060         close_p_element = ->
1061                 # the argument names the one tag that is exempt from implied end tags
1062                 generate_implied_end_tags 'p'
1063                 parse_error() unless open_els[0].name is 'p'
1064                 # pop entries until a p has been popped; as a safeguard the last
1065                 # remaining entry of open_els is never popped
1066                 until open_els.length <= 1
1067                         return if open_els.shift().name is 'p'
1068         close_p_if_in_button_scope = ->
1069                 # close the p element only when is_in_button_scope reports one
1070                 close_p_element() if is_in_button_scope 'p'
1071
1072         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1073         # aka insert_a_character = (t) ->
1074         insert_character = (t) ->
1075                 # fixfull check for Document node
1076                 [into_el, at_i] = adjusted_insertion_location()
1077                 siblings = into_el.children
1078                 # extend a text node sitting just before the insertion point
1079                 if at_i > 0 and siblings[at_i - 1].type is TYPE_TEXT
1080                         siblings[at_i - 1].text += t.text
1081                         return
1082                 siblings.splice at_i, 0, t
1083
1084         # 8.2.5.1
1085         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1086         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1087         adjusted_insertion_location = (override_target = null) ->
1088                 # 1. If there was an override target specified, then let target be the
1089                 # override target.
1090                 if override_target?
1091                         target = override_target
1092                 else # Otherwise, let target be the current node.
1093                         target = open_els[0]
1094                 # 2. Determine the adjusted insertion location using the first matching
1095                 # steps from the following list:
1096                 #
1097                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1098                 # thead, or tr element Foster parenting happens when content is
1099                 # misnested in tables.
1100                 if flag_foster_parenting and foster_parenting_targets[target.name]
1101                         loop # once. this is here so we can ``break`` to "abort these substeps"
1102                                 # 1. Let last template be the last template element in the
1103                                 # stack of open elements, if any.
1104                                 last_template = null
1105                                 last_template_i = null
1106                                 for el, i in open_els
1107                                         if el.name is 'template'
1108                                                 last_template = el
1109                                                 last_template_i = i
1110                                                 break
1111                                 # 2. Let last table be the last table element in the stack of
1112                                 # open elements, if any.
1113                                 last_table = null
1114                                 last_table_i
1115                                 for el, i in open_els
1116                                         if el.name is 'table'
1117                                                 last_table = el
1118                                                 last_table_i = i
1119                                                 break
1120                                 # 3. If there is a last template and either there is no last
1121                                 # table, or there is one, but last template is lower (more
1122                                 # recently added) than last table in the stack of open
1123                                 # elements, then: let adjusted insertion location be inside
1124                                 # last template's template contents, after its last child (if
1125                                 # any), and abort these substeps.
1126                                 if last_template and (last_table is null or last_template_i < last_table_i)
1127                                         target = last_template # fixfull should be it's contents
1128                                         target_i = target.children.length
1129                                         break
1130                                 # 4. If there is no last table, then let adjusted insertion
1131                                 # location be inside the first element in the stack of open
1132                                 # elements (the html element), after its last child (if any),
1133                                 # and abort these substeps. (fragment case)
1134                                 if last_table is null
1135                                         # this is odd
1136                                         target = open_els[open_els.length - 1]
1137                                         target_i = target.children.length
1138                                 # 5. If last table has a parent element, then let adjusted
1139                                 # insertion location be inside last table's parent element,
1140                                 # immediately before last table, and abort these substeps.
1141                                 if last_table.parent?
1142                                         for c, i in last_table.parent.children
1143                                                 if c is last_table
1144                                                         target = last_table.parent
1145                                                         target_i = i
1146                                                         break
1147                                         break
1148                                 # 6. Let previous element be the element immediately above last
1149                                 # table in the stack of open elements.
1150                                 #
1151                                 # huh? how could it not have a parent?
1152                                 previous_element = open_els[last_table_i + 1]
1153                                 # 7. Let adjusted insertion location be inside previous
1154                                 # element, after its last child (if any).
1155                                 target = previous_element
1156                                 target_i = target.children.length
1157                                 # Note: These steps are involved in part because it's possible
1158                                 # for elements, the table element in this case in particular,
1159                                 # to have been moved by a script around in the DOM, or indeed
1160                                 # removed from the DOM entirely, after the element was inserted
1161                                 # by the parser.
1162                                 break # don't really loop
1163                 else
1164                         # Otherwise Let adjusted insertion location be inside target, after
1165                         # its last child (if any).
1166                         target_i = target.children.length
1167
1168                 # 3. If the adjusted insertion location is inside a template element,
1169                 # let it instead be inside the template element's template contents,
1170                 # after its last child (if any).
1171                 # fixfull (template)
1172
1173                 # 4. Return the adjusted insertion location.
1174                 return [target, target_i]
1175
1176         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1177         # aka create_an_element_for_token
1178         token_to_element = (t, namespace, intended_parent) ->
1179                 # convert attributes into a hash
1180                 attrs = {}
1181                 for a in t.attrs_a
1182                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1183                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1184
1185                 # TODO 2. If the newly created element has an xmlns attribute in the
1186                 # XMLNS namespace whose value is not exactly the same as the element's
1187                 # namespace, that is a parse error. Similarly, if the newly created
1188                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1189                 # value is not the XLink Namespace, that is a parse error.
1190
1191                 # fixfull: the spec says stuff about form pointers and ownerDocument
1192
1193                 return el
1194
1195         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1196         insert_foreign_element = (token, namespace) ->
1197                 ail = adjusted_insertion_location()
1198                 ail_el = ail[0]
1199                 ail_i = ail[1]
1200                 el = token_to_element token, namespace, ail_el
1201                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1202                 el.parent = ail_el
1203                 ail_el.children.splice ail_i, 0, el
1204                 open_els.unshift el
1205                 return el
1206         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1207         insert_html_element = (token) ->
1208                 insert_foreign_element token, NS_HTML
1209
1210         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1211         # position should be [node, index_within_children]
1212         insert_comment = (t, position = null) ->
1213                 position ?= adjusted_insertion_location()
1214                 position[0].children.splice position[1], 0, t
1215
1216         # 8.2.5.2
1217         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1218         parse_generic_raw_text = (t) ->
1219                 insert_html_element t
1220                 tok_state = tok_state_rawtext
1221                 original_ins_mode = ins_mode
1222                 ins_mode = ins_mode_text
1223         parse_generic_rcdata_text = (t) ->
1224                 insert_html_element t
1225                 tok_state = tok_state_rcdata
1226                 original_ins_mode = ins_mode
1227                 ins_mode = ins_mode_text
1228
1229         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1230         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1231         generate_implied_end_tags = (except = null) ->
1232                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1233                         open_els.shift()
1234
1235         # 8.2.5.4 The rules for parsing tokens in HTML content
1236         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1237
1238         # 8.2.5.4.1 The "initial" insertion mode
1239         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1240         ins_mode_initial = (t) ->
1241                 if is_space_tok t
1242                         return
1243                 if t.type is TYPE_COMMENT
1244                         # ?fixfull
1245                         doc.children.push t
1246                         return
1247                 if t.type is TYPE_DOCTYPE
1248                         # FIXME check identifiers, set quirks, etc
1249                         # fixfull
1250                         doc.children.push t
1251                         ins_mode = ins_mode_before_html
1252                         return
1253                 # Anything else
1254                 #fixfull (iframe, quirks)
1255                 ins_mode = ins_mode_before_html
1256                 ins_mode t # reprocess the token
1257                 return
1258
1259         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1260         ins_mode_before_html = (t) ->
1261                 if t.type is TYPE_DOCTYPE
1262                         parse_error()
1263                         return
1264                 if t.type is TYPE_COMMENT
1265                         doc.children.push t
1266                         return
1267                 if is_space_tok t
1268                         return
1269                 if t.type is TYPE_START_TAG and t.name is 'html'
1270                         el = token_to_element t, NS_HTML, doc
1271                         doc.children.push el
1272                         open_els.unshift(el)
1273                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1274                         ins_mode = ins_mode_before_head
1275                         return
1276                 if t.type is TYPE_END_TAG
1277                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1278                                 # fall through to "anything else"
1279                         else
1280                                 parse_error()
1281                                 return
1282                 # Anything else
1283                 html_tok = new_open_tag 'html'
1284                 el = token_to_element html_tok, NS_HTML, doc
1285                 doc.children.push el
1286                 open_els.unshift el
1287                 # ?fixfull browsing context
1288                 ins_mode = ins_mode_before_head
1289                 ins_mode t
1290                 return
1291
1292         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1293         ins_mode_before_head = (t) ->
1294                 if is_space_tok t
1295                         return
1296                 if t.type is TYPE_COMMENT
1297                         insert_comment t
1298                         return
1299                 if t.type is TYPE_DOCTYPE
1300                         parse_error()
1301                         return
1302                 if t.type is TYPE_START_TAG and t.name is 'html'
1303                         ins_mode_in_body t
1304                         return
1305                 if t.type is TYPE_START_TAG and t.name is 'head'
1306                         el = insert_html_element t
1307                         head_element_pointer = el
1308                         ins_mode = ins_mode_in_head
1309                 if t.type is TYPE_END_TAG
1310                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1311                                 # fall through to Anything else below
1312                         else
1313                                 parse_error()
1314                                 return
1315                 # Anything else
1316                 head_tok = new_open_tag 'head'
1317                 el = insert_html_element head_tok
1318                 head_element_pointer = el
1319                 ins_mode = ins_mode_in_head
1320                 ins_mode t # reprocess current token
1321
1322         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1323         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1324                 open_els.shift() # spec says this will be a 'head' node
1325                 ins_mode = ins_mode_after_head
1326                 ins_mode t
1327         ins_mode_in_head = (t) ->
1328                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1329                         insert_character t
1330                         return
1331                 if t.type is TYPE_COMMENT
1332                         insert_comment t
1333                         return
1334                 if t.type is TYPE_DOCTYPE
1335                         parse_error()
1336                         return
1337                 if t.type is TYPE_START_TAG and t.name is 'html'
1338                         ins_mode_in_body t
1339                         return
1340                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1341                         el = insert_html_element t
1342                         open_els.shift()
1343                         t.acknowledge_self_closing()
1344                         return
1345                 if t.type is TYPE_START_TAG and t.name is 'meta'
1346                         el = insert_html_element t
1347                         open_els.shift()
1348                         t.acknowledge_self_closing()
1349                         # fixfull encoding stuff
1350                         return
1351                 if t.type is TYPE_START_TAG and t.name is 'title'
1352                         parse_generic_rcdata_text t
1353                         return
1354                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1355                         parse_generic_raw_text t
1356                         return
1357                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1358                         insert_html_element t
1359                         ins_mode = ins_mode_in_head_noscript
1360                         return
1361                 if t.type is TYPE_START_TAG and t.name is 'script'
1362                         ail = adjusted_insertion_location()
1363                         el = token_to_element t, NS_HTML, ail
1364                         el.flag 'parser-inserted', true
1365                         # fixfull frament case
1366                         ail[0].children.splice ail[1], 0, el
1367                         open_els.unshift el
1368                         tok_state = tok_state_script_data
1369                         original_ins_mode = ins_mode # make sure orig... is defined
1370                         ins_mode = ins_mode_text
1371                         return
1372                 if t.type is TYPE_END_TAG and t.name is 'head'
1373                         open_els.shift() # will be a head element... spec says so
1374                         ins_mode = ins_mode_after_head
1375                         return
1376                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1377                         ins_mode_in_head_else t
1378                         return
1379                 if t.type is TYPE_START_TAG and t.name is 'template'
1380                         insert_html_element t
1381                         afe_push_marker()
1382                         flag_frameset_ok = false
1383                         ins_mode = ins_mode_in_template
1384                         template_ins_modes.unshift ins_mode_in_template
1385                         return
1386                 if t.type is TYPE_END_TAG and t.name is 'template'
1387                         if template_tag_is_open()
1388                                 generate_implied_end_tags
1389                                 if open_els[0].name isnt 'template'
1390                                         parse_error()
1391                                 loop
1392                                         el = open_els.shift()
1393                                         if el.name is 'template'
1394                                                 break
1395                                 clear_afe_to_marker()
1396                                 template_ins_modes.shift()
1397                                 reset_ins_mode()
1398                         else
1399                                 parse_error()
1400                         return
1401                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1402                         parse_error()
1403                         return
1404                 ins_mode_in_head_else t
1405
1406         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1407         ins_mode_in_head_noscript_else = (t) ->
1408                 parse_error()
1409                 open_els.shift()
1410                 ins_mode = ins_mode_in_head
1411                 ins_mode t
1412         ins_mode_in_head_noscript = (t) ->
1413                 if t.type is TYPE_DOCTYPE
1414                         parse_error()
1415                         return
1416                 if t.type is TYPE_START_TAG
1417                         ins_mode_in_body t
1418                         return
1419                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1420                         open_els.shift()
1421                         ins_mode = ins_mode_in_head
1422                         return
1423                 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1424                         ins_mode_in_head t
1425                         return
1426                 if t.type is TYPE_END_TAG and t.name is 'br'
1427                         ins_mode_in_head_noscript_else t
1428                         return
1429                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1430                         parse_error()
1431                         return
1432                 # Anything else
1433                 ins_mode_in_head_noscript_else t
1434                 return
1435
1436
1437
1438         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1439         ins_mode_after_head_else = (t) ->
1440                 body_tok = new_open_tag 'body'
1441                 insert_html_element body_tok
1442                 ins_mode = ins_mode_in_body
1443                 ins_mode t # reprocess token
1444                 return
1445         ins_mode_after_head = (t) ->
1446                 if is_space_tok t
1447                         insert_character t
1448                         return
1449                 if t.type is TYPE_COMMENT
1450                         insert_comment t
1451                         return
1452                 if t.type is TYPE_DOCTYPE
1453                         parse_error()
1454                         return
1455                 if t.type is TYPE_START_TAG and t.name is 'html'
1456                         ins_mode_in_body t
1457                         return
1458                 if t.type is TYPE_START_TAG and t.name is 'body'
1459                         insert_html_element t
1460                         flag_frameset_ok = false
1461                         ins_mode = ins_mode_in_body
1462                         return
1463                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1464                         insert_html_element t
1465                         ins_mode = ins_mode_in_frameset
1466                         return
1467                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1468                         parse_error()
1469                         open_els.unshift head_element_pointer
1470                         ins_mode_in_head t
1471                         for el, i of open_els
1472                                 if el is head_element_pointer
1473                                         open_els.splice i, 1
1474                                         return
1475                         console.log "warning: 23904 couldn't find head element in open_els"
1476                         return
1477                 if t.type is TYPE_END_TAG and t.name is 'template'
1478                         ins_mode_in_head t
1479                         return
1480                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1481                         ins_mode_after_head_else t
1482                         return
1483                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1484                         parse_error()
1485                         return
1486                 # Anything else
1487                 ins_mode_after_head_else t
1488
1489         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1490         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1491                 for el, i in open_els
1492                         if el.namespace is NS_HTML and el.name is name
1493                                 generate_implied_end_tags name # arg is exception
1494                                 parse_error() unless i is 0
1495                                 while i >= 0
1496                                         open_els.shift()
1497                                         i -= 1
1498                                 return
1499                         if special_elements[el.name] is el.namespace
1500                                 parse_error()
1501                                 return
1502                 return
1503         ins_mode_in_body = (t) ->
1504                 if t.type is TYPE_TEXT and t.text is "\u0000"
1505                         parse_error()
1506                         return
1507                 if is_space_tok t
1508                         reconstruct_afe()
1509                         insert_character t
1510                         return
1511                 if t.type is TYPE_TEXT
1512                         reconstruct_afe()
1513                         insert_character t
1514                         flag_frameset_ok = false
1515                         return
1516                 if t.type is TYPE_COMMENT
1517                         insert_comment t
1518                         return
1519                 if t.type is TYPE_DOCTYPE
1520                         parse_error()
1521                         return
1522                 if t.type is TYPE_START_TAG and t.name is 'html'
1523                         parse_error()
1524                         return if template_tag_is_open()
1525                         root_attrs = open_els[open_els.length - 1].attrs
1526                         for a of t.attrs_a
1527                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1528                         return
1529
1530                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1531                         ins_mode_in_head t
1532                         return
1533                 if t.type is TYPE_START_TAG and t.name is 'body'
1534                         parse_error()
1535                         return if open_els.length < 2
1536                         second = open_els[open_els.length - 2]
1537                         return unless second.ns is NS_HTML
1538                         return unless second.name is 'body'
1539                         return if template_tag_is_open()
1540                         frameset_ok_flag = false
1541                         for a of t.attrs_a
1542                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1543                         return
1544                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1545                         parse_error()
1546                         return if open_els.length < 2
1547                         second_i = open_els.length - 2
1548                         second = open_els[second_i]
1549                         return unless second.ns is NS_HTML
1550                         return unless second.name is 'body'
1551                         flag_frameset_ok = false
1552                         if second.parent?
1553                                 for el, i in second.parent.children
1554                                         if el is second
1555                                                 second.parent.children.splice i, 1
1556                                                 break
1557                         open_els.splice second_i, 1
1558                         # pop everything except the "root html element"
1559                         while open_els.length > 1
1560                                 open_els.shift()
1561                         insert_html_element t
1562                         ins_mode = ins_mode_in_frameset
1563                         return
1564                 if t.type is TYPE_EOF
1565                         ok_tags = {
1566                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1567                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1568                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1569                         }
1570                         for el in open_els
1571                                 unless ok_tags[t.name] is el.namespace
1572                                         parse_error()
1573                                         break
1574                         if template_ins_modes.length > 0
1575                                 ins_mode_in_template t
1576                         else
1577                                 stop_parsing()
1578                         return
1579                 if t.type is TYPE_END_TAG and t.name is 'body'
1580                         unless is_in_scope 'body'
1581                                 parse_error()
1582                                 return
1583                         ok_tags = {
1584                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1585                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1586                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1587                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1588                                 html:NS_HTML
1589                         }
1590                         for el in open_els
1591                                 unless ok_tags[t.name] is el.namespace
1592                                         parse_error()
1593                                         break
1594                         ins_mode = ins_mode_after_body
1595                         return
1596                 if t.type is TYPE_END_TAG and t.name is 'html'
1597                         unless is_in_scope 'body'
1598                                 parse_error()
1599                                 return
1600                         ok_tags = {
1601                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1602                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1603                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1604                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1605                                 html:NS_HTML
1606                         }
1607                         for el in open_els
1608                                 unless ok_tags[t.name] is el.namespace
1609                                         parse_error()
1610                                         break
1611                         ins_mode = ins_mode_after_body
1612                         ins_mode t
1613                         return
1614                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1615                         close_p_if_in_button_scope()
1616                         insert_html_element t
1617                         return
1618                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1619                         close_p_if_in_button_scope()
1620                         if h_tags[open_els[0]] is NS_HTML
1621                                 parse_error()
1622                                 open_els.shift()
1623                         insert_html_element t
1624                         return
1625                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1626                         close_p_if_in_button_scope()
1627                         insert_html_element t
1628                         # spec: If the next token is a "LF" (U+000A) character token, then
1629                         # ignore that token and move on to the next one. (Newlines at the
1630                         # start of pre blocks are ignored as an authoring convenience.)
1631                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1632                                 cur += 1
1633                         flag_frameset_ok = false
1634                         return
1635                 if t.type is TYPE_START_TAG and t.name is 'form'
1636                         unless form_element_pointer is null or template_tag_is_open()
1637                                 parse_error()
1638                                 return
1639                         close_p_if_in_button_scope()
1640                         el = insert_html_element t
1641                         unless template_tag_is_open()
1642                                 form_element_pointer = el
1643                         return
1644                 if t.type is TYPE_START_TAG and t.name is 'li'
1645                         flag_frameset_ok = false
1646                         for node in open_els
1647                                 if node.name is 'li' and node.namespace is NS_HTML
1648                                         generate_implied_end_tags 'li' # arg is exception
1649                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1650                                                 parse_error()
1651                                         loop
1652                                                 el = open_els.shift()
1653                                                 if el.name is 'li' and el.namespace is NS_HTML
1654                                                         break
1655                                         break
1656                                 if el_is_special_not_adp node
1657                                                 break
1658                         close_p_if_in_button_scope()
1659                         insert_html_element t
1660                         return
1661                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1662                         flag_frameset_ok = false
1663                         for node in open_els
1664                                 if node.name is 'dd' and node.namespace is NS_HTML
1665                                         generate_implied_end_tags 'dd' # arg is exception
1666                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1667                                                 parse_error()
1668                                         loop
1669                                                 el = open_els.shift()
1670                                                 if el.name is 'dd' and el.namespace is NS_HTML
1671                                                         break
1672                                         break
1673                                 if node.name is 'dt' and node.namespace is NS_HTML
1674                                         generate_implied_end_tags 'dt' # arg is exception
1675                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1676                                                 parse_error()
1677                                         loop
1678                                                 el = open_els.shift()
1679                                                 if el.name is 'dt' and el.namespace is NS_HTML
1680                                                         break
1681                                         break
1682                                 if el_is_special_not_adp node
1683                                         break
1684                         close_p_if_in_button_scope()
1685                         insert_html_element t
1686                         return
1687                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1688                         close_p_if_in_button_scope()
1689                         insert_html_element t
1690                         tok_state = tok_state_plaintext
1691                         return
1692                 if t.type is TYPE_START_TAG and t.name is 'button'
1693                         if is_in_scope 'button', NS_HTML
1694                                 parse_error()
1695                                 generate_implied_end_tags()
1696                                 loop
1697                                         el = open_els.shift()
1698                                         if el.name is 'button' and el.namespace is NS_HTML
1699                                                 break
1700                         reconstruct_afe()
1701                         insert_html_element t
1702                         flag_frameset_ok = false
1703                         return
1704                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1705                         unless is_in_scope t.name, NS_HTML
1706                                 parse_error()
1707                                 return
1708                         generate_implied_end_tags()
1709                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1710                                 parse_error()
1711                         loop
1712                                 el = open_els.shift()
1713                                 if el.name is t.name and el.namespace is NS_HTML
1714                                         return
1715                         return
1716                 if t.type is TYPE_END_TAG and t.name is 'form'
1717                         unless template_tag_is_open()
1718                                 node = form_element_pointer
1719                                 form_element_pointer = null
1720                                 if node is null or not el_is_in_scope node
1721                                         parse_error()
1722                                         return
1723                                 generate_implied_end_tags()
1724                                 if open_els[0] isnt node
1725                                         parse_error()
1726                                 for el, i in open_els
1727                                         if el is node
1728                                                 open_els.splice i, 1
1729                                                 break
1730                         else
1731                                 unless is_in_scope 'form', NS_HTML
1732                                         parse_error()
1733                                         return
1734                                 generate_implied_end_tags()
1735                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1736                                         parse_error()
1737                                 loop
1738                                         el = open_els.shift()
1739                                         if el.name is 'form' and el.namespace is NS_HTML
1740                                                 break
1741                         return
1742                 if t.type is TYPE_END_TAG and t.name is 'p'
1743                         unless is_in_button_scope 'p', NS_HTML
1744                                 parse_error()
1745                                 insert_html_element new_open_tag 'p'
1746                         close_p_element()
1747                         return
1748                 if t.type is TYPE_END_TAG and t.name is 'li'
1749                         unless is_in_li_scope 'li', NS_HTML
1750                                 parse_error()
1751                                 return
1752                         generate_implied_end_tags 'li' # arg is exception
1753                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1754                                 parse_error()
1755                         loop
1756                                 el = open_els.shift()
1757                                 if el.name is 'li' and el.namespace is NS_HTML
1758                                         break
1759                         return
1760                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1761                         unless is_in_scope t.name, NS_HTML
1762                                 parse_error()
1763                                 return
1764                         generate_implied_end_tags t.name # arg is exception
1765                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1766                                 parse_error()
1767                         loop
1768                                 el = open_els.shift()
1769                                 if el.name is t.name and el.namespace is NS_HTML
1770                                         break
1771                         return
1772                 if t.type is TYPE_END_TAG and h_tags[t.name]?
1773                         h_in_scope = false
1774                         for el in open_els
1775                                 if h_tags[el.name] is el.namespace
1776                                         h_in_scope = true
1777                                         break
1778                                 if standard_scopers[el.name] is el.namespace
1779                                         break
1780                         unless h_in_scope
1781                                 parse_error()
1782                                 return
1783                         generate_implied_end_tags()
1784                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1785                                 parse_error()
1786                         loop
1787                                 el = open_els.shift()
1788                                 if h_tags[el.name] is el.namespace
1789                                         break
1790                         return
1791                 # deep breath!
1792                 if t.type is TYPE_START_TAG and t.name is 'a'
1793                         # If the list of active formatting elements contains an a element
1794                         # between the end of the list and the last marker on the list (or
1795                         # the start of the list if there is no marker on the list), then
1796                         # this is a parse error; run the adoption agency algorithm for the
1797                         # tag name "a", then remove that element from the list of active
1798                         # formatting elements and the stack of open elements if the
1799                         # adoption agency algorithm didn't already remove it (it might not
1800                         # have if the element is not in table scope).
1801                         found = false
1802                         for el in afe
1803                                 if el.type is TYPE_AFE_MARKER
1804                                         break
1805                                 if el.name is 'a' and el.namespace is NS_HTML
1806                                         found = el
1807                         if found?
1808                                 parse_error()
1809                                 adoption_agency 'a'
1810                                 for el, i in afe
1811                                         if el is found
1812                                                 afe.splice i, 1
1813                                 for el, i in open_els
1814                                         if el is found
1815                                                 open_els.splice i, 1
1816                         reconstruct_afe()
1817                         el = insert_html_element t
1818                         afe_push el
1819                         return
1820                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1821                         reconstruct_afe()
1822                         el = insert_html_element t
1823                         afe_push el
1824                         return
1825                 if t.type is TYPE_START_TAG and t.name is 'nobr'
1826                         reconstruct_afe()
1827                         el = insert_html_element t
1828                         afe_push el
1829                         return
1830                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1831                         adoption_agency t.name
1832                         return
1833                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1834                         reconstruct_afe()
1835                         insert_html_element t
1836                         afe_push_marker()
1837                         flag_frameset_ok = false
1838                         return
1839                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1840                         unless is_in_scope t.name, NS_HTML
1841                                 parse_error()
1842                                 return
1843                         generate_implied_end_tags()
1844                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1845                                 parse_error()
1846                         loop
1847                                 el = open_els.shift()
1848                                 if el.name is t.name and el.namespace is NS_HTML
1849                                         break
1850                         clear_afe_to_marker()
1851                         return
1852                 if t.type is TYPE_START_TAG and t.name is 'table'
1853                         close_p_if_in_button_scope() # fixfull quirksmode thing
1854                         insert_html_element t
1855                         flag_frameset_ok = false
1856                         ins_mode = ins_mode_in_table
1857                         return
1858                 if t.type is TYPE_END_TAG and t.name is 'br'
1859                         parse_error()
1860                         t.type is TYPE_START_TAG
1861                         # fall through
1862                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1863                         reconstruct_afe()
1864                         insert_html_element t
1865                         open_els.shift()
1866                         t.acknowledge_self_closing()
1867                         flag_frameset_ok = false
1868                         return
1869                 if t.type is TYPE_START_TAG and t.name is 'input'
1870                         reconstruct_afe()
1871                         insert_html_element t
1872                         open_els.shift()
1873                         t.acknowledge_self_closing()
1874                         unless is_input_hidden_tok t
1875                                 flag_frameset_ok = false
1876                         return
1877                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1878                         insert_html_element t
1879                         open_els.shift()
1880                         t.acknowledge_self_closing()
1881                         return
1882                 if t.type is TYPE_START_TAG and t.name is 'hr'
1883                         close_p_if_in_button_scope()
1884                         insert_html_element t
1885                         open_els.shift()
1886                         t.acknowledge_self_closing()
1887                         flag_frameset_ok = false
1888                         return
1889                 if t.type is TYPE_START_TAG and t.name is 'image'
1890                         parse_error()
1891                         t.name = 'img'
1892                         ins_mode t
1893                         return
1894                 if t.type is TYPE_START_TAG and t.name is 'isindex'
1895                         parse_error()
1896                         if template_tag_is_open() is false and form_element_pointer isnt null
1897                                 return
1898                         t.acknowledge_self_closing()
1899                         flag_frameset_ok = false
1900                         close_p_if_in_button_scope()
1901                         el = insert_html_element new_open_tag 'form'
1902                         unless template_tag_is_open()
1903                                 form_element_pointer = el
1904                         for a in t.attrs_a
1905                                 if a[0] is 'action'
1906                                         el.attrs['action'] = a[1]
1907                                         break
1908                         insert_html_element new_open_tag 'hr'
1909                         open_els.shift()
1910                         reconstruct_afe()
1911                         insert_html_element new_open_tag 'label'
1912                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
1913                         input_el = new_open_tag 'input'
1914                         prompt = null
1915                         for a in t.attrs_a
1916                                 if a[0] is 'prompt'
1917                                         prompt = a[1]
1918                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
1919                                         input_el.attrs_a.push [a[0], a[1]]
1920                         input_el.attrs_a.push ['name', 'isindex']
1921                         # fixfull this next bit is in english... internationalize?
1922                         prompt ?= "This is a searchable index. Enter search keywords: "
1923                         insert_character prompt # fixfull split
1924                         # TODO submit typo "balue" in spec
1925                         insert_html_element input_el
1926                         open_els.shift()
1927                         # insert_character '' # you can put chars here if promt attr missing
1928                         open_els.shift()
1929                         insert_html_element new_open_tag 'hr'
1930                         open_els.shift()
1931                         open_els.shift()
1932                         unless template_tag_is_open()
1933                                 form_element_pointer = null
1934                         return
1935                 if t.type is TYPE_START_TAG and t.name is 'textarea'
1936                         insert_html_element t
1937                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1938                                 cur += 1
1939                         tok_state = tok_state_rcdata
1940                         original_ins_mode = ins_mode
1941                         flag_frameset_ok = false
1942                         ins_mode = ins_mode_text
1943                         return
1944                 if t.type is TYPE_START_TAG and t.name is 'xmp'
1945                         close_p_if_in_button_scope()
1946                         reconstruct_afe()
1947                         flag_frameset_ok = false
1948                         parse_generic_raw_text t
1949                         return
1950                 if t.type is TYPE_START_TAG and t.name is 'iframe'
1951                         flag_frameset_ok = false
1952                         parse_generic_raw_text t
1953                         return
1954                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
1955                         parse_generic_raw_text t
1956                         return
1957                 if t.type is TYPE_START_TAG and t.name is 'select'
1958                         reconstruct_afe()
1959                         insert_html_element t
1960                         flag_frameset_ok = false
1961                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
1962                                 ins_mode = ins_mode_in_select_in_table
1963                         else
1964                                 ins_mode = ins_mode_in_select
1965                         return
1966                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
1967                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
1968                                 open_els.shift()
1969                         reconstruct_afe()
1970                         insert_html_element t
1971                         return
1972                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
1973                         if is_in_scope 'ruby', NS_HTML
1974                                 generate_implied_end_tags()
1975                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
1976                                         parse_error()
1977                         insert_html_element t
1978                         return
1979                 if t.type is TYPE_START_TAG and t.name is 'rt'
1980                         if is_in_scope 'ruby', NS_HTML
1981                                 generate_implied_end_tags 'rtc' # arg is exception
1982                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
1983                                         parse_error()
1984                         insert_html_element t
1985                         return
1986                 if t.type is TYPE_START_TAG and t.name is 'math'
1987                         reconstruct_afe()
1988                         adjust_mathml_attributes t
1989                         adjust_foreign_attributes t
1990                         insert_foreign_element t, NS_MATHML
1991                         if t.flag 'self-closing'
1992                                 open_els.shift()
1993                                 t.acknowledge_self_closing()
1994                         return
1995                 if t.type is TYPE_START_TAG and t.name is 'svg'
1996                         reconstruct_afe()
1997                         adjust_svg_attributes t
1998                         adjust_foreign_attributes t
1999                         insert_foreign_element t, NS_SVG
2000                         if t.flag 'self-closing'
2001                                 open_els.shift()
2002                                 t.acknowledge_self_closing()
2003                         return
2004                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2005                         parse_error()
2006                         return
2007                 if t.type is TYPE_START_TAG # any other start tag
2008                         reconstruct_afe()
2009                         insert_html_element t
2010                         return
2011                 if t.type is TYPE_END_TAG # any other end tag
2012                         in_body_any_other_end_tag t.name
2013                         return
2014                 return
2015
2016         ins_mode_in_table_else = (t) ->
2017                 parse_error()
2018                 flag_foster_parenting = true # FIXME
2019                 ins_mode_in_body t
2020                 flag_foster_parenting = false
2021         can_in_table = { # FIXME do this inline like everywhere else
2022                 'table': true
2023                 'tbody': true
2024                 'tfoot': true
2025                 'thead': true
2026                 'tr': true
2027         }
2028
2029         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2030         ins_mode_text = (t) ->
2031                 if t.type is TYPE_TEXT
2032                         insert_character t
2033                         return
2034                 if t.type is TYPE_EOF
2035                         parse_error()
2036                         if open_els[0].name is 'script'
2037                                 open_els[0].flag 'already started', true
2038                         open_els.shift()
2039                         ins_mode = original_ins_mode
2040                         ins_mode t
2041                         return
2042                 if t.type is TYPE_END_TAG and t.name is 'script'
2043                         open_els.shift()
2044                         ins_mode = original_ins_mode
2045                         # fixfull the spec seems to assume that I'm going to run the script
2046                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2047                         return
2048                 if t.type is TYPE_END_TAG
2049                         open_els.shift()
2050                         ins_mode = original_ins_mode
2051                         return
2052                 console.log 'warning: end of ins_mode_text reached'
2053
2054         # the functions below implement the tokenizer states described here:
2055         # http://www.w3.org/TR/html5/syntax.html#tokenization
2056
2057         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2058         ins_mode_in_table = (t) ->
2059                 switch t.type
2060                         when TYPE_TEXT
2061                                 if can_in_table[t.name]
2062                                         original_ins_mode = ins_mode
2063                                         ins_mode = ins_mode_in_table_text
2064                                         ins_mode t
2065                                 else
2066                                         ins_mode_in_table_else t
2067                         when TYPE_COMMENT
2068                                 insert_comment t
2069                         when TYPE_DOCTYPE
2070                                 parse_error()
2071                         when TYPE_START_TAG
2072                                 switch t.name
2073                                         when 'caption'
2074                                                 clear_stack_to_table_context()
2075                                                 afe_push_marker()
2076                                                 insert_html_element t
2077                                                 ins_mode = ins_mode_in_caption
2078                                         when 'colgroup'
2079                                                 clear_stack_to_table_context()
2080                                                 insert_html_element t
2081                                                 ins_mode = ins_mode_in_column_group
2082                                         when 'col'
2083                                                 clear_stack_to_table_context()
2084                                                 insert_html_element new_open_tag 'colgroup'
2085                                                 ins_mode = ins_mode_in_column_group
2086                                                 ins_mode t
2087                                         when 'tbody', 'tfoot', 'thead'
2088                                                 clear_stack_to_table_context()
2089                                                 insert_html_element t
2090                                                 ins_mode = ins_mode_in_table_body
2091                                         when 'td', 'th', 'tr'
2092                                                 clear_stack_to_table_context()
2093                                                 insert_html_element new_open_tag 'tbody'
2094                                                 ins_mode = ins_mode_in_table_body
2095                                                 ins_mode t
2096                                         when 'table'
2097                                                 parse_error()
2098                                                 if is_in_table_scope 'table'
2099                                                         loop
2100                                                                 el = open_els.shift()
2101                                                                 if el.name is 'table'
2102                                                                         break
2103                                                         reset_ins_mode()
2104                                                         ins_mode t
2105                                         when 'style', 'script', 'template'
2106                                                 ins_mode_in_head t
2107                                         when 'input'
2108                                                 if is_input_hidden_tok t
2109                                                         ins_mode_in_table_else t
2110                                                 else
2111                                                         parse_error()
2112                                                         el = insert_html_element t
2113                                                         open_els.shift()
2114                                                         t.acknowledge_self_closing()
2115                                         when 'form'
2116                                                 parse_error()
2117                                                 if form_element_pointer?
2118                                                         return
2119                                                 if template_tag_is_open()
2120                                                         return
2121                                                 form_element_pointer = insert_html_element t
2122                                                 open_els.shift()
2123                                         else
2124                                                 ins_mode_in_table_else t
2125                         when TYPE_END_TAG
2126                                 switch t.name
2127                                         when 'table'
2128                                                 if is_in_table_scope 'table'
2129                                                         loop
2130                                                                 el = open_els.shift()
2131                                                                 if el.name is 'table'
2132                                                                         break
2133                                                         reset_ins_mode()
2134                                                 else
2135                                                         parse_error
2136                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2137                                                 parse_error()
2138                                         when 'template'
2139                                                 ins_mode_in_head t
2140                                         else
2141                                                 ins_mode_in_table_else t
2142                         when TYPE_EOF
2143                                 ins_mode_in_body t
2144                         else
2145                                 ins_mode_in_table_else t
2146
2147
2148         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2149         ins_mode_in_table_text = (t) ->
2150                 if t.type is TYPE_TEXT and t.text is "\u0000"
2151                         # huh? I thought the tokenizer didn't emit these
2152                         parse_error()
2153                         return
2154                 if t.type is TYPE_TEXT
2155                         pending_table_character_tokens.push t
2156                         return
2157                 # Anything else
2158                 all_space = true
2159                 for old in pending_table_character_tokens
2160                         unless is_space_tok old
2161                                 all_space = false
2162                                 break
2163                 if all_space
2164                         for old in pending_table_character_tokens
2165                                 insert_character old
2166                 else
2167                         for old in pending_table_character_tokens
2168                                 ins_mode_table_else old
2169                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2170                 ins_mode = original_ins_mode
2171                 ins_mode t
2172
2173         # 8.2.5.4.11 "in caption" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2174         ins_mode_in_caption = (t) ->
2175                 # t is the token to process; whatever is not handled here goes to ins_mode_in_body
2176                 if t.type is TYPE_END_TAG and t.name is 'caption'
2177                         unless is_in_table_scope 'caption'
2178                                 parse_error() # fragment case
2179                                 return
2180                         generate_implied_end_tags()
2181                         parse_error() unless open_els[0].name is 'caption'
2182                         # pop the stack up to and including the caption
2183                         loop
2184                                 el = open_els.shift()
2185                                 break if el.name is 'caption'
2186                         clear_afe_to_marker()
2187                         ins_mode = ins_mode_in_table
2188                         return
2189                 # table-structure start tags and </table> end the caption, then get reprocessed
2190                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']) or (t.type is TYPE_END_TAG and t.name is 'table')
2191                         parse_error()
2192                         if is_in_table_scope 'caption'
2193                                 loop
2194                                         el = open_els.shift()
2195                                         break if el.name is 'caption'
2196                                 clear_afe_to_marker()
2197                                 ins_mode = ins_mode_in_table
2198                                 ins_mode t
2199                         # else fragment case
2200                         return
2201                 # these end tags are errors here and are dropped
2202                 if t.type is TYPE_END_TAG and t.name in ['body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']
2203                         parse_error()
2204                         return
2205                 # Anything else
2206                 ins_mode_in_body t
2207
2208         # 8.2.5.4.12 "in column group" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2209         ins_mode_in_column_group = (t) ->
2210                 # t is the token to process; every path ends in an explicit return
2211                 if is_space_tok t
2212                         insert_character t
2213                         return
2214                 if t.type is TYPE_COMMENT
2215                         insert_comment t
2216                         return
2217                 # doctypes and stray </col> tags are errors and are dropped
2218                 if t.type is TYPE_DOCTYPE or (t.type is TYPE_END_TAG and t.name is 'col')
2219                         parse_error()
2220                         return
2221                 # <html> and end-of-file are delegated to the "in body" rules
2222                 if (t.type is TYPE_START_TAG and t.name is 'html') or t.type is TYPE_EOF
2223                         ins_mode_in_body t
2224                         return
2225                 # <col>: inserted and then popped straight off the stack again
2226                 if t.type is TYPE_START_TAG and t.name is 'col'
2227                         el = insert_html_element t
2228                         open_els.shift()
2229                         t.acknowledge_self_closing()
2230                         return
2231                 # </colgroup>: only honoured when colgroup is the current node
2232                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2233                         unless open_els[0].name is 'colgroup'
2234                                 parse_error()
2235                                 return
2236                         open_els.shift()
2237                         ins_mode = ins_mode_in_table
2238                         return
2239                 if t.name is 'template' and (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG)
2240                         ins_mode_in_head t
2241                         return
2242                 # Anything else: leave the colgroup (when it is the current node) and
2243                 # let the "in table" rules reprocess the token
2244                 if open_els[0].name is 'colgroup'
2245                         open_els.shift()
2246                         ins_mode = ins_mode_in_table
2247                         ins_mode t
2248                         return
2249                 parse_error()
2250                 return
2251
2252         # 8.2.5.4.13 "in table body" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2253         ins_mode_in_table_body = (t) -> # t: the token to process
2254                 if t.type is TYPE_START_TAG and t.name is 'tr'
2255                         clear_stack_to_table_body_context()
2256                         insert_html_element t
2257                         ins_mode = ins_mode_in_row
2258                         return
2259                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
2260                         parse_error() # cell without a row: a tr is inserted first
2261                         clear_stack_to_table_body_context()
2262                         insert_html_element new_open_tag 'tr'
2263                         ins_mode = ins_mode_in_row
2264                         ins_mode t
2265                         return
2266                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
2267                         if is_in_table_scope t.name # fixfull check namespace
2268                                 clear_stack_to_table_body_context()
2269                                 open_els.shift()
2270                                 ins_mode = ins_mode_in_table
2271                         else
2272                                 parse_error()
2273                         return
2274                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead']) or (t.type is TYPE_END_TAG and t.name is 'table')
2275                         # is a tbody/tfoot/thead open? stop looking at the first table_scopers entry
2276                         has = false
2277                         for el in open_els
2278                                 if el.name in ['tbody', 'tfoot', 'thead']
2279                                         has = true
2280                                         break
2281                                 break if table_scopers[el.name]
2282                         unless has
2283                                 parse_error()
2284                                 return
2285                         clear_stack_to_table_body_context()
2286                         open_els.shift()
2287                         ins_mode = ins_mode_in_table
2288                         ins_mode t
2289                         return
2290                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr']
2291                         parse_error()
2292                         return
2293                 # Anything else
2294                 ins_mode_in_table t
2295
2296         # 8.2.5.4.14 "in row" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2297         ins_mode_in_row = (t) -> # t: the token to process
2298                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
2299                         clear_stack_to_table_row_context()
2300                         insert_html_element t
2301                         ins_mode = ins_mode_in_cell
2302                         afe_push_marker()
2303                         return
2304                 if t.type is TYPE_END_TAG and t.name is 'tr'
2305                         unless is_in_table_scope 'tr'
2306                                 parse_error()
2307                                 return
2308                         clear_stack_to_table_row_context()
2309                         open_els.shift()
2310                         ins_mode = ins_mode_in_table_body
2311                         return
2312                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr']) or (t.type is TYPE_END_TAG and t.name is 'table')
2313                         unless is_in_table_scope 'tr'
2314                                 parse_error()
2315                                 return
2316                         clear_stack_to_table_row_context()
2317                         open_els.shift()
2318                         ins_mode = ins_mode_in_table_body
2319                         ins_mode t # the row is closed first, then the token is reprocessed
2320                         return
2321                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
2322                         unless is_in_table_scope t.name # fixfull namespace
2323                                 parse_error()
2324                                 return
2325                         if is_in_table_scope 'tr' # with no tr in scope the token is dropped without an error
2326                                 clear_stack_to_table_row_context()
2327                                 open_els.shift()
2328                                 ins_mode = ins_mode_in_table_body
2329                                 ins_mode t
2330                         return
2331                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th']
2332                         parse_error()
2333                         return
2334                 # Anything else
2335                 ins_mode_in_table t
2336
2337         # "close the cell" steps: http://www.w3.org/TR/html5/syntax.html#close-the-cell
2338         close_the_cell = ->
2339                 generate_implied_end_tags()
2340                 # the current node should now be the cell itself (td or th); compare its name
2341                 parse_error() unless open_els[0].name in ['td', 'th']
2342                 # pop the stack up to and including the cell
2343                 loop
2344                         el = open_els.shift()
2345                         break if el.name in ['td', 'th']
2346                 clear_afe_to_marker()
2347                 ins_mode = ins_mode_in_row
2348
2349         # 8.2.5.4.15 "in cell" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2350         ins_mode_in_cell = (t) ->
2351                 # t is the token to process; whatever is not handled here goes to ins_mode_in_body
2352                 if t.type is TYPE_END_TAG and t.name in ['td', 'th']
2353                         unless is_in_table_scope t.name
2354                                 parse_error()
2355                                 return
2356                         generate_implied_end_tags()
2357                         parse_error() if open_els[0].name isnt t.name # the () are required: without them nothing is called
2358                         loop
2359                                 el = open_els.shift()
2360                                 break if el.name is t.name
2361                         clear_afe_to_marker()
2362                         ins_mode = ins_mode_in_row
2363                         return
2364                 if t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']
2365                         # look for an open td/th, stopping at the first table_scopers entry
2366                         has = false
2367                         for el in open_els
2368                                 if el.name in ['td', 'th']
2369                                         has = true
2370                                         break
2371                                 break if table_scopers[el.name]
2372                         unless has
2373                                 parse_error()
2374                                 return
2375                         close_the_cell()
2376                         ins_mode t
2377                         return
2378                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html']
2379                         parse_error()
2380                         return
2381                 # closing an enclosing row/section/table closes the cell first, then reprocesses
2382                 if t.type is TYPE_END_TAG and t.name in ['table', 'tbody', 'tfoot', 'thead', 'tr']
2383                         unless is_in_table_scope t.name # fixfull namespace
2384                                 parse_error()
2385                                 return
2386                         close_the_cell()
2387                         ins_mode t
2388                         return
2389                 # Anything Else
2390                 ins_mode_in_body t
2391
2392         # 8.2.5.4.16 "in select" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2393         ins_mode_in_select = (t) ->
2394                 if t.type is TYPE_TEXT
2395                         if t.text is "\u0000"
2396                                 parse_error() # NULL characters are dropped
2397                         else
2398                                 insert_character t
2399                         return
2400                 if t.type is TYPE_COMMENT
2401                         insert_comment t
2402                         return
2403                 if t.type is TYPE_DOCTYPE
2404                         parse_error()
2405                         return
2406                 if t.type is TYPE_START_TAG and t.name is 'html'
2407                         ins_mode_in_body t
2408                         return
2409                 if t.type is TYPE_START_TAG and t.name is 'option'
2410                         open_els.shift() if open_els[0].name is 'option' # an open option is closed implicitly
2411                         insert_html_element t
2412                         return
2413                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2414                         # implicitly close an open option, then an open optgroup
2415                         open_els.shift() if open_els[0].name is 'option'
2416                         open_els.shift() if open_els[0].name is 'optgroup'
2417                         insert_html_element t
2418                         return
2419                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2420                         # an option sitting directly inside the optgroup is closed with it;
2421                         # ?. keeps this safe when the stack holds a single entry
2422                         if open_els[0].name is 'option' and open_els[1]?.name is 'optgroup'
2423                                 open_els.shift()
2424                         if open_els[0].name is 'optgroup'
2425                                 open_els.shift()
2426                         else
2427                                 parse_error()
2428                         return
2429                 if t.type is TYPE_END_TAG and t.name is 'option'
2430                         if open_els[0].name is 'option'
2431                                 open_els.shift()
2432                         else
2433                                 parse_error()
2434                         return
2435                 if t.type is TYPE_END_TAG and t.name is 'select'
2436                         unless is_in_select_scope 'select'
2437                                 parse_error() # fragment case
2438                                 return
2439                         loop
2440                                 el = open_els.shift()
2441                                 break if el.name is 'select'
2442                         reset_ins_mode()
2443                         return
2444                 if t.type is TYPE_START_TAG and t.name is 'select'
2445                         parse_error()
2446                         loop
2447                                 el = open_els.shift()
2448                                 break if el.name is 'select'
2449                         reset_ins_mode()
2450                         # spec says that this is the same as </select> but it doesn't say
2451                         # to check scope first
2452                         return
2453                 if t.type is TYPE_START_TAG and t.name in ['input', 'keygen', 'textarea']
2454                         parse_error()
2455                         # the token is ignored only when there is NO select in select scope
2456                         # (fragment case); otherwise the select is closed and the token reprocessed
2457                         unless is_in_select_scope 'select'
2458                                 return
2459                         loop
2460                                 el = open_els.shift()
2461                                 break if el.name is 'select'
2462                         reset_ins_mode()
2463                         ins_mode t
2464                         return
2465                 # <script>, <template> and </template> are handled by the "in head" rules
2466                 if (t.type is TYPE_START_TAG and t.name in ['script', 'template']) or (t.type is TYPE_END_TAG and t.name is 'template')
2467                         ins_mode_in_head t
2468                         return
2469                 if t.type is TYPE_EOF
2470                         ins_mode_in_body t
2471                         return
2472                 # Anything else
2473                 parse_error()
2474                 return
2475
2476         # 8.2.5.4.17 "in select in table" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2477         ins_mode_in_select_in_table = (t) ->
2478                 # table-related tags close the select; everything else is handled by ins_mode_in_select
2479                 if t.type is TYPE_START_TAG and t.name in ['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th']
2480                         parse_error()
2481                         loop
2482                                 el = open_els.shift()
2483                                 break if el.name is 'select'
2484                         reset_ins_mode()
2485                         ins_mode t
2486                         return
2487                 if t.type is TYPE_END_TAG and t.name in ['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th']
2488                         parse_error()
2489                         # an end tag whose element is not in table scope is simply dropped
2490                         unless is_in_table_scope t.name, NS_HTML
2491                                 return
2492                         loop
2493                                 el = open_els.shift()
2494                                 break if el.name is 'select'
2495                         reset_ins_mode()
2496                         ins_mode t
2497                         return
2498                 # Anything else
2499                 ins_mode_in_select t
2500                 return
2501
2502         # 8.2.5.4.18 "in template" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2503         ins_mode_in_template = (t) ->
2504                 # t is the token to process. template_ins_modes is used as a stack whose
2505                 # top is index 0: shift() removes the current entry and unshift() puts a
2506                 # new one in its place.
2507                 if t.type in [TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE]
2508                         ins_mode_in_body t
2509                         return
2510
2511                 if (t.type is TYPE_START_TAG and t.name in ['base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title']) or (t.type is TYPE_END_TAG and t.name is 'template')
2512                         ins_mode_in_head t
2513                         return
2514
2515                 if t.type is TYPE_START_TAG
2516                         # every other start tag swaps the top template insertion mode for the
2517                         # mode matching the tag, switches to that mode and reprocesses the token
2518                         template_ins_modes.shift()
2519                         if t.name in ['caption', 'colgroup', 'tbody', 'tfoot', 'thead']
2520                                 template_ins_modes.unshift ins_mode_in_table
2521                                 ins_mode = ins_mode_in_table
2522                         else if t.name is 'col'
2523                                 template_ins_modes.unshift ins_mode_in_column_group
2524                                 ins_mode = ins_mode_in_column_group
2525                         else if t.name is 'tr'
2526                                 template_ins_modes.unshift ins_mode_in_table_body
2527                                 ins_mode = ins_mode_in_table_body
2528                         else if t.name in ['td', 'th']
2529                                 template_ins_modes.unshift ins_mode_in_row
2530                                 ins_mode = ins_mode_in_row
2531                         else
2532                                 template_ins_modes.unshift ins_mode_in_body
2533                                 ins_mode = ins_mode_in_body
2534                         ins_mode t
2535                         return
2536
2537                 if t.type is TYPE_END_TAG
2538                         parse_error() # every other end tag is dropped
2539                         return
2540
2541                 if t.type is TYPE_EOF
2542                         unless template_tag_is_open()
2543                                 # no template left open: parsing simply ends
2544                                 stop_parsing()
2545                                 return
2546                         # EOF inside an unterminated template: pop through the template,
2547                         # restore the outer insertion mode and hand EOF to it
2548                         parse_error()
2549                         loop
2550                                 el = open_els.shift()
2551                                 break if el.name is 'template' # fixfull check namespace
2552                         clear_afe_to_marker()
2553                         template_ins_modes.shift()
2554                         reset_ins_mode()
2555                         ins_mode t
2556
2557         # 8.2.5.4.19 "after body" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2558         ins_mode_after_body = (t) ->
2559                 # whitespace and <html> are handled by the "in body" rules
2560                 if is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2561                         ins_mode_in_body t
2562                         return
2563                 if t.type is TYPE_COMMENT
2564                         # spec: append to the FIRST element of the stack; the current node is index 0, so the first element is the last array entry
2565                         el = open_els[open_els.length - 1]
2566                         insert_comment t, [el, el.children.length]
2567                         return
2568                 if t.type is TYPE_DOCTYPE
2569                         parse_error()
2570                         return
2571                 if t.type is TYPE_END_TAG and t.name is 'html'
2572                         # fixfull fragment case
2573                         ins_mode = ins_mode_after_after_body
2574                         return
2575                 if t.type is TYPE_EOF
2576                         stop_parsing()
2577                         return
2578                 # Anything else: back to "in body", which then reprocesses the token
2579                 parse_error()
2580                 ins_mode = ins_mode_in_body
2581                 ins_mode t
2582
2583         # 8.2.5.4.20 "in frameset" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2584         ins_mode_in_frameset = (t) ->
2585                 if is_space_tok t
2586                         insert_character t
2587                         return
2588                 if t.type is TYPE_COMMENT
2589                         insert_comment t
2590                         return
2591                 if t.type is TYPE_DOCTYPE
2592                         parse_error()
2593                         return
2594                 if t.type is TYPE_START_TAG
2595                         switch t.name
2596                                 when 'html'
2597                                         ins_mode_in_body t
2598                                         return
2599                                 when 'frameset'
2600                                         insert_html_element t
2601                                         return
2602                                 when 'frame'
2603                                         insert_html_element t
2604                                         open_els.shift() # popped again right after insertion
2605                                         t.acknowledge_self_closing()
2606                                         return
2607                                 when 'noframes'
2608                                         ins_mode_in_head t
2609                                         return
2610                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2611                         # TODO ?correct for: "if the current node is the root html element"
2612                         if open_els.length is 1
2613                                 parse_error()
2614                                 return # fragment case
2615                         open_els.shift()
2616                         ins_mode = ins_mode_after_frameset if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2617                         return
2618                 if t.type is TYPE_EOF
2619                         # TODO ?correct for: "if the current node is not the root html element"
2620                         parse_error() unless open_els.length is 1
2621                         stop_parsing()
2622                         return
2623                 # Anything else (including start tags not listed above)
2624                 parse_error()
2625                 return
2626
2627         # 8.2.5.4.21 "after frameset" insertion mode: http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2628         ins_mode_after_frameset = (t) -> # t: the token to process
2629                 if is_space_tok t
2630                         insert_character t
2631                         return
2632                 if t.type is TYPE_COMMENT
2633                         insert_comment t
2634                         return
2635                 if t.type is TYPE_DOCTYPE
2636                         parse_error()
2637                         return
2638                 if t.type is TYPE_START_TAG and t.name is 'html'
2639                         ins_mode_in_body t
2640                         return
2641                 if t.type is TYPE_END_TAG and t.name is 'html'
2642                         ins_mode = ins_mode_after_after_frameset # ins_mode is the variable the parser dispatches on
2643                         return
2644                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2645                         ins_mode_in_head t
2646                         return
2647                 if t.type is TYPE_EOF
2648                         stop_parsing()
2649                         return
2650                 # Anything else
2651                 parse_error()
2652                 return
2653
2654         # 8.2.5.4.22 "after after body" insertion mode: http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2655         ins_mode_after_after_body = (t) ->
2656                 if t.type is TYPE_COMMENT
2657                         insert_comment t, [doc, doc.children.length] # appended as the last child of doc
2658                         return
2659                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2660                         ins_mode_in_body t
2661                         return
2662                 if t.type is TYPE_EOF
2663                         stop_parsing()
2664                         return
2665                 parse_error() # Anything else
2666                 ins_mode = ins_mode_in_body
2667                 ins_mode t # reprocess the token in the new mode so it is not lost
2668                 return
2669
2670         # 8.2.5.4.23 "after after frameset" insertion mode: http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2671         ins_mode_after_after_frameset = (t) ->
2672                 # t is the token to process; exactly one branch runs and the
2673                 # function itself returns nothing
2674                 if t.type is TYPE_COMMENT
2675                         # comments are appended to doc itself
2676                         insert_comment t, [doc, doc.children.length]
2677                 else if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2678                         ins_mode_in_body t
2679                 else if t.type is TYPE_EOF
2680                         stop_parsing()
2681                 else if t.type is TYPE_START_TAG and t.name is 'noframes'
2682                         ins_mode_in_head t
2683                 else
2684                         # Anything else
2685                         parse_error()
2686                 return
2687
2688
2689
2690
2691
2692         # 8.2.4.1 data state: http://www.w3.org/TR/html5/syntax.html#data-state
2693         tok_state_data = ->
2694                 # Consumes one character of txt; returns a token, or null when only
2695                 # the tokenizer state changed.
2696                 c = txt.charAt(cur++)
2697                 if c is '&'
2698                         return new_text_node parse_character_reference()
2699                 if c is '<'
2700                         tok_state = tok_state_tag_open
2701                         return null
2702                 if c is '' # EOF
2703                         return new_eof_token()
2704                 # NULL is reported as an error but still passed through unchanged
2705                 parse_error() if c is "\u0000"
2706                 return new_text_node c
2707
2708         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2709         # not needed: tok_state_character_reference_in_data = ->
2710         # just call parse_character_reference()
2711
2712         # 8.2.4.3 RCDATA state: http://www.w3.org/TR/html5/syntax.html#rcdata-state
2713         tok_state_rcdata = ->
2714                 # Consumes one character of txt; returns a token, or null when only the state changed.
2715                 c = txt.charAt(cur++)
2716                 if c is '&'
2717                         return new_text_node parse_character_reference()
2718                 if c is '<'
2719                         tok_state = tok_state_rcdata_less_than_sign
2720                         return null
2721                 if c is '' # EOF
2722                         return new_eof_token()
2723                 if c is "\u0000"
2724                         parse_error()
2725                         return new_character_token "\ufffd"
2726                 return new_character_token c
2727
2728         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2729         # not needed: tok_state_character_reference_in_rcdata = ->
2730         # just call parse_character_reference()
2731
2732         # 8.2.4.5 RAWTEXT state: http://www.w3.org/TR/html5/syntax.html#rawtext-state
2733         tok_state_rawtext = ->
2734                 c = txt.charAt(cur++)
2735                 if c is '<'
2736                         tok_state = tok_state_rawtext_less_than_sign
2737                         return null
2738                 if c is '' # EOF
2739                         return new_eof_token()
2740                 # NULL is an error and is emitted as U+FFFD instead
2741                 if c is "\u0000"
2742                         parse_error()
2743                         return new_character_token "\ufffd"
2744                 return new_character_token c
2745
2746         # 8.2.4.6 script data state: http://www.w3.org/TR/html5/syntax.html#script-data-state
2747         tok_state_script_data = ->
2748                 c = txt.charAt(cur++)
2749                 if c is '<'
2750                         tok_state = tok_state_script_data_less_than_sign
2751                         return null
2752                 if c is '' # EOF
2753                         return new_eof_token()
2754                 # NULL is an error and is emitted as U+FFFD instead
2755                 if c is "\u0000"
2756                         parse_error()
2757                         return new_character_token "\ufffd"
2758                 return new_character_token c
2759
2760         # 8.2.4.7 PLAINTEXT state: http://www.w3.org/TR/html5/syntax.html#plaintext-state
2761         tok_state_plaintext = ->
2762                 # Consumes one character of txt and always returns a token;
2763                 # this state never assigns tok_state.
2764                 c = txt.charAt(cur++)
2765                 if c is '' # EOF
2766                         return new_eof_token()
2767                 if c is "\u0000"
2768                         parse_error()
2769                         return new_character_token "\ufffd"
2770                 return new_character_token c
2771
2772
2773         # 8.2.4.8 tag open state (the character after "<"): http://www.w3.org/TR/html5/syntax.html#tag-open-state
2774         tok_state_tag_open = ->
2775                 c = txt.charAt(cur++)
2776                 if c is '!'
2777                         tok_state = tok_state_markup_declaration_open
2778                 else if c is '/'
2779                         tok_state = tok_state_end_tag_open
2780                 else if c is '?'
2781                         parse_error()
2782                         tok_cur_tag = new_comment_token '?'
2783                         tok_state = tok_state_bogus_comment
2784                 else if is_lc_alpha(c)
2785                         tok_cur_tag = new_open_tag c
2786                         tok_state = tok_state_tag_name
2787                 else if is_uc_alpha(c)
2788                         tok_cur_tag = new_open_tag c.toLowerCase() # the tag name is started in lowercase
2789                         tok_state = tok_state_tag_name
2790                 else
2791                         # not a tag after all: emit the "<" as text and let the data state re-read c
2792                         parse_error()
2793                         tok_state = tok_state_data
2794                         cur -= 1 # we didn't parse/handle the char after <
2795                         return new_text_node '<'
2796                 return null
2797
2798         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2799         tok_state_end_tag_open = ->
2800                 switch c = txt.charAt(cur++)
2801                         when '>'
2802                                 parse_error()
2803                                 tok_state = tok_state_data
2804                         when '' # EOF
2805                                 parse_error()
2806                                 tok_state = tok_state_data
2807                                 return new_text_node '</'
2808                         else
2809                                 if is_uc_alpha(c)
2810                                         tok_cur_tag = new_end_tag c.toLowerCase()
2811                                         tok_state = tok_state_tag_name
2812                                 else if is_lc_alpha(c)
2813                                         tok_cur_tag = new_end_tag c
2814                                         tok_state = tok_state_tag_name
2815                                 else
2816                                         parse_error()
2817                                         tok_cur_tag = new_comment_token '/'
2818                                         tok_state = tok_state_bogus_comment
2819                 return null
2820
2821         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2822         tok_state_tag_name = ->
2823                 switch c = txt.charAt(cur++)
2824                         when "\t", "\n", "\u000c", ' '
2825                                 tok_state = tok_state_before_attribute_name
2826                         when '/'
2827                                 tok_state = tok_state_self_closing_start_tag
2828                         when '>'
2829                                 tok_state = tok_state_data
2830                                 tmp = tok_cur_tag
2831                                 tok_cur_tag = null
2832                                 return tmp
2833                         when "\u0000"
2834                                 parse_error()
2835                                 tok_cur_tag.name += "\ufffd"
2836                         when '' # EOF
2837                                 parse_error()
2838                                 tok_state = tok_state_data
2839                         else
2840                                 if is_uc_alpha(c)
2841                                         tok_cur_tag.name += c.toLowerCase()
2842                                 else
2843                                         tok_cur_tag.name += c
2844                 return null
2845
2846         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2847         tok_state_rcdata_less_than_sign = ->
2848                 c = txt.charAt(cur++)
2849                 if c is '/'
2850                         temporary_buffer = ''
2851                         tok_state = tok_state_rcdata_end_tag_open
2852                         return null
2853                 # Anything else
2854                 tok_state = tok_state_rcdata
2855                 cur -= 1 # reconsume the input character
2856                 return new_character_token '<'
2857
2858         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2859         tok_state_rcdata_end_tag_open = ->
2860                 c = txt.charAt(cur++)
2861                 if is_uc_alpha(c)
2862                         tok_cur_tag = new_end_tag c.toLowerCase()
2863                         temporary_buffer += c
2864                         tok_state = tok_state_rcdata_end_tag_name
2865                         return null
2866                 if is_lc_alpha(c)
2867                         tok_cur_tag = new_end_tag c
2868                         temporary_buffer += c
2869                         tok_state = tok_state_rcdata_end_tag_name
2870                         return null
2871                 # Anything else
2872                 tok_state = tok_state_rcdata
2873                 cur -= 1 # reconsume the input character
2874                 return new_character_token "</" # fixfull separate these
2875
2876         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2877         is_appropriate_end_tag = (t) ->
2878                 # spec says to check against "the tag name of the last start tag to
2879                 # have been emitted from this tokenizer", but this is only called from
2880                 # the various "raw" states, which I'm pretty sure all push the start
2881                 # token onto open_els. TODO: verify this after the script data states
2882                 # are implemented
2883                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2884                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2885
2886         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2887         tok_state_rcdata_end_tag_name = ->
2888                 c = txt.charAt(cur++)
2889                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2890                         if is_appropriate_end_tag tok_cur_tag
2891                                 tok_state = tok_state_before_attribute_name
2892                                 return
2893                         # else fall through to "Anything else"
2894                 if c is '/'
2895                         if is_appropriate_end_tag tok_cur_tag
2896                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2897                                 return
2898                         # else fall through to "Anything else"
2899                 if c is '>'
2900                         if is_appropriate_end_tag tok_cur_tag
2901                                 tok_state = tok_state_data
2902                                 return tok_cur_tag
2903                         # else fall through to "Anything else"
2904                 if is_uc_alpha(c)
2905                         tok_cur_tag.name += c.toLowerCase()
2906                         temporary_buffer += c
2907                         return null
2908                 if is_lc_alpha(c)
2909                         tok_cur_tag.name += c
2910                         temporary_buffer += c
2911                         return null
2912                 # Anything else
2913                 tok_state = tok_state_rcdata
2914                 cur -= 1 # reconsume the input character
2915                 return new_character_token '</' + temporary_buffer # fixfull separate these
2916
2917         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2918         tok_state_rawtext_less_than_sign = ->
2919                 c = txt.charAt(cur++)
2920                 if c is '/'
2921                         temporary_buffer = ''
2922                         tok_state = tok_state_rawtext_end_tag_open
2923                         return null
2924                 # Anything else
2925                 tok_state = tok_state_rawtext
2926                 cur -= 1 # reconsume the input character
2927                 return new_character_token '<'
2928
2929         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2930         tok_state_rawtext_end_tag_open = ->
2931                 c = txt.charAt(cur++)
2932                 if is_uc_alpha(c)
2933                         tok_cur_tag = new_end_tag c.toLowerCase()
2934                         temporary_buffer += c
2935                         tok_state = tok_state_rawtext_end_tag_name
2936                         return null
2937                 if is_lc_alpha(c)
2938                         tok_cur_tag = new_end_tag c
2939                         temporary_buffer += c
2940                         tok_state = tok_state_rawtext_end_tag_name
2941                         return null
2942                 # Anything else
2943                 tok_state = tok_state_rawtext
2944                 cur -= 1 # reconsume the input character
2945                 return new_character_token "</" # fixfull separate these
2946
2947         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2948         tok_state_rawtext_end_tag_name = ->
2949                 c = txt.charAt(cur++)
2950                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2951                         if is_appropriate_end_tag tok_cur_tag
2952                                 tok_state = tok_state_before_attribute_name
2953                                 return
2954                         # else fall through to "Anything else"
2955                 if c is '/'
2956                         if is_appropriate_end_tag tok_cur_tag
2957                                 tok_state = tok_state_self_closing_start_tag
2958                                 return
2959                         # else fall through to "Anything else"
2960                 if c is '>'
2961                         if is_appropriate_end_tag tok_cur_tag
2962                                 tok_state = tok_state_data
2963                                 return tok_cur_tag
2964                         # else fall through to "Anything else"
2965                 if is_uc_alpha(c)
2966                         tok_cur_tag.name += c.toLowerCase()
2967                         temporary_buffer += c
2968                         return null
2969                 if is_lc_alpha(c)
2970                         tok_cur_tag.name += c
2971                         temporary_buffer += c
2972                         return null
2973                 # Anything else
2974                 tok_state = tok_state_rawtext
2975                 cur -= 1 # reconsume the input character
2976                 return new_character_token '</' + temporary_buffer # fixfull separate these
2977
2978         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2979         tok_state_script_data_less_than_sign = ->
2980                 c = txt.charAt(cur++)
2981                 if c is '/'
2982                         temporary_buffer = ''
2983                         tok_state = tok_state_script_data_end_tag_open
2984                         return
2985                 if c is '!'
2986                         tok_state = tok_state_script_data_escape_start
2987                         return new_character_token '<!' # fixfull split
2988                 # Anything else
2989                 tok_state = tok_state_script_data
2990                 cur -= 1 # Reconsume
2991                 return new_character_token '<'
2992
2993         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2994         tok_state_script_data_end_tag_open = ->
2995                 c = txt.charAt(cur++)
2996                 if is_uc_alpha(c)
2997                         tok_cur_tag = new_end_tag c.toLowerCase()
2998                         temporary_buffer += c
2999                         tok_state = tok_state_script_data_end_tag_name
3000                         return
3001                 if is_lc_alpha(c)
3002                         tok_cur_tag = new_end_tag c
3003                         temporary_buffer += c
3004                         tok_state = tok_state_script_data_end_tag_name
3005                         return
3006                 # Anything else
3007                 tok_state = tok_state_script_data
3008                 cur -= 1 # Reconsume
3009                 return new_character_token '</'
3010
3011         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3012         tok_state_script_data_end_tag_name = ->
3013                 c = txt.charAt(cur++)
3014                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3015                         if is_appropriate_end_tag tok_cur_tag
3016                                 tok_state = tok_state_before_attribute_name
3017                                 return
3018                         # fall through
3019                 if c is '/'
3020                         if is_appropriate_end_tag tok_cur_tag
3021                                 tok_state = tok_state_self_closing_start_tag
3022                                 return
3023                         # fall through
3024                 if is_uc_alpha(c)
3025                         tok_cur_tag.name += c.toLowerCase()
3026                         temporary_buffer += c
3027                         return
3028                 if is_lc_alpha(c)
3029                         tok_cur_tag.name += c
3030                         temporary_buffer += c
3031                         return
3032                 # Anything else
3033                 tok_state = tok_state_script_data
3034                 cur -= 1 # Reconsume
3035                 return new_character_token "</#{temporary_buffer}" # fixfull split
3036
3037         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3038         tok_state_script_data_escape_start = ->
3039                 c = txt.charAt(cur++)
3040                 if c is '-'
3041                         tok_state = tok_state_script_data_escape_start_dash
3042                         return new_character_token '-'
3043                 # Anything else
3044                 tok_state = tok_state_script_data
3045                 cur -= 1 # Reconsume
3046                 return
3047
3048         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3049         tok_state_script_data_escape_start_dash = ->
3050                 c = txt.charAt(cur++)
3051                 if c is '-'
3052                         tok_state = tok_state_script_data_escaped_dash_dash
3053                         return new_character_token '-'
3054                 # Anything else
3055                 tok_state = tok_state_script_data
3056                 cur -= 1 # Reconsume
3057                 return
3058
3059         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3060         tok_state_script_data_escaped = ->
3061                 c = txt.charAt(cur++)
3062                 if c is '-'
3063                         tok_state = tok_state_script_data_escaped_dash
3064                         return new_character_token '-'
3065                 if c is '<'
3066                         tok_state = tok_state_script_data_escaped_less_than_sign
3067                         return
3068                 if c is "\u0000"
3069                         parse_error()
3070                         return new_character_token "\ufffd"
3071                 if c is '' # EOF
3072                         tok_state = tok_state_data
3073                         parse_error()
3074                         cur -= 1 # Reconsume
3075                         return
3076                 # Anything else
3077                 return new_character_token c
3078
3079         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3080         tok_state_script_data_escaped_dash = ->
3081                 c = txt.charAt(cur++)
3082                 if c is '-'
3083                         tok_state = tok_state_script_data_escaped_dash_dash
3084                         return new_character_token '-'
3085                 if c is '<'
3086                         tok_state = tok_state_script_data_escaped_less_than_sign
3087                         return
3088                 if c is "\u0000"
3089                         parse_error()
3090                         tok_state = tok_state_script_data_escaped
3091                         return new_character_token "\ufffd"
3092                 if c is '' # EOF
3093                         tok_state = tok_state_data
3094                         parse_error()
3095                         cur -= 1 # Reconsume
3096                         return
3097                 # Anything else
3098                 tok_state = tok_state_script_data_escaped
3099                 return new_character_token c
3100
3101         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3102         tok_state_script_data_escaped_dash_dash = ->
3103                 c = txt.charAt(cur++)
3104                 if c is '-'
3105                         return new_character_token '-'
3106                 if c is '<'
3107                         tok_state = tok_state_script_data_escaped_less_than_sign
3108                         return
3109                 if c is '>'
3110                         tok_state = tok_state_script_data
3111                         return new_character_token '>'
3112                 if c is "\u0000"
3113                         parse_error()
3114                         tok_state = tok_state_script_data_escaped
3115                         return new_character_token "\ufffd"
3116                 if c is '' # EOF
3117                         parse_error()
3118                         tok_state = tok_state_data
3119                         cur -= 1 # Reconsume
3120                         return
3121                 # Anything else
3122                 tok_state = tok_state_script_data_escaped
3123                 return new_character_token c
3124
3125         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3126         tok_state_script_data_escaped_less_than_sign = ->
3127                 c = txt.charAt(cur++)
3128                 if c is '/'
3129                         temporary_buffer = ''
3130                         tok_state = tok_state_script_data_escaped_end_tag_open
3131                         return
3132                 if is_uc_alpha(c)
3133                         temporary_buffer = c.toLowerCase() # yes, really
3134                         tok_state = tok_state_script_data_double_escape_start
3135                         return new_character_token "<#{c}" # fixfull split
3136                 if is_lc_alpha(c)
3137                         temporary_buffer = c
3138                         tok_state = tok_state_script_data_double_escape_start
3139                         return new_character_token "<#{c}" # fixfull split
3140                 # Anything else
3141                 tok_state = tok_state_script_data_escaped
3142                 cur -= 1 # Reconsume
3143                 return new_character_token c
3144
3145         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3146         tok_state_script_data_escaped_end_tag_open = ->
3147                 c = txt.charAt(cur++)
3148                 if is_uc_alpha(c)
3149                         tok_cur_tag = new_end_tag c.toLowerCase()
3150                         temporary_buffer += c
3151                         tok_state = tok_state_script_data_escaped_end_tag_name
3152                         return
3153                 if is_lc_alpha(c)
3154                         tok_cur_tag = new_end_tag c
3155                         temporary_buffer += c
3156                         tok_state = tok_state_script_data_escaped_end_tag_name
3157                         return
3158                 # Anything else
3159                 tok_state = tok_state_script_data_escaped
3160                 cur -= 1 # Reconsume
3161                 return new_character_token '</' # fixfull split
3162
3163         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3164         tok_state_script_data_escaped_end_tag_name = ->
3165                 c = txt.charAt(cur++)
3166                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3167                         if is_appropriate_end_tag tok_cur_tag
3168                                 tok_state = tok_state_before_attribute_name
3169                                 return
3170                         # fall through
3171                 if c is '/'
3172                         if is_appropriate_end_tag tok_cur_tag
3173                                 tok_state = tok_state_self_closing_start_tag
3174                                 return
3175                         # fall through
3176                 if is_uc_alpha(c)
3177                         tok_cur_tag.name += c.toLowerCase()
3178                         temporary_buffer += c.toLowerCase()
3179                         return
3180                 if is_lc_alpha(c)
3181                         tok_cur_tag.name += c
3182                         temporary_buffer += c.toLowerCase()
3183                         return
3184                 # Anything else
3185                 tok_state = tok_state_script_data_escaped
3186                 cur -= 1 # Reconsume
3187                 return new_character_token "</#{temporary_buffer}" # fixfull split
3188
3189         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3190         tok_state_script_data_double_escape_start = ->
3191                 c = txt.charAt(cur++)
3192                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3193                         if temporary_buffer is 'script'
3194                                 tok_state = tok_state_script_data_double_escaped
3195                         else
3196                                 tok_state = tok_state_script_data_escaped
3197                         return new_character_token c
3198                 if is_uc_alpha(c)
3199                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3200                         return new_character_token c
3201                 if is_lc_alpha(c)
3202                         temporary_buffer += c
3203                         return new_character_token c
3204                 # Anything else
3205                 tok_state = tok_state_script_data_escaped
3206                 cur -= 1 # Reconsume
3207                 return
3208
3209         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3210         tok_state_script_data_double_escaped = ->
3211                 c = txt.charAt(cur++)
3212                 if c is '-'
3213                         tok_state = tok_state_script_data_double_escaped_dash
3214                         return new_character_token '-'
3215                 if c is '<'
3216                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3217                         return new_character_token '<'
3218                 if c is "\u0000"
3219                         parse_error()
3220                         return new_character_token "\ufffd"
3221                 if c is '' # EOF
3222                         parse_error()
3223                         tok_state = tok_state_data
3224                         cur -= 1 # Reconsume
3225                         return
3226                 # Anything else
3227                 return new_character_token c
3228
3229         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3230         tok_state_script_data_double_escaped_dash = ->
3231                 c = txt.charAt(cur++)
3232                 if c is '-'
3233                         tok_state = tok_state_script_data_double_escaped_dash_dash
3234                         return new_character_token '-'
3235                 if c is '<'
3236                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3237                         return new_character_token '<'
3238                 if c is "\u0000"
3239                         parse_error()
3240                         tok_state = tok_state_script_data_double_escaped
3241                         return new_character_token "\ufffd"
3242                 if c is '' # EOF
3243                         parse_error()
3244                         tok_state = tok_state_data
3245                         cur -= 1 # Reconsume
3246                         return
3247                 # Anything else
3248                 tok_state = tok_state_script_data_double_escaped
3249                 return new_character_token c
3250
3251         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3252         tok_state_script_data_double_escaped_dash_dash = ->
3253                 c = txt.charAt(cur++)
3254                 if c is '-'
3255                         return new_character_token '-'
3256                 if c is '<'
3257                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3258                         return new_character_token '<'
3259                 if c is '>'
3260                         tok_state = tok_state_script_data
3261                         return new_character_token '>'
3262                 if c is "\u0000"
3263                         parse_error()
3264                         tok_state = tok_state_script_data_double_escaped
3265                         return new_character_token "\ufffd"
3266                 if c is '' # EOF
3267                         parse_error()
3268                         tok_state = tok_state_data
3269                         cur -= 1 # Reconsume
3270                         return
3271                 # Anything else
3272                 tok_state = tok_state_script_data_double_escaped
3273                 return new_character_token c
3274
3275         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3276         tok_state_script_data_double_escaped_less_than_sign = ->
3277                 c = txt.charAt(cur++)
3278                 if c is '/'
3279                         temporary_buffer = ''
3280                         tok_state = tok_state_script_data_double_escape_end
3281                         return new_character_token '/'
3282                 # Anything else
3283                 tok_state = tok_state_script_data_double_escaped
3284                 cur -= 1 # Reconsume
3285                 return
3286
3287         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3288         tok_state_script_data_double_escape_end = ->
3289                 c = txt.charAt(cur++)
3290                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3291                         if temporary_buffer is 'script'
3292                                 tok_state = tok_state_script_data_escaped
3293                         else
3294                                 tok_state = tok_state_script_data_double_escaped
3295                         return new_character_token c
3296                 if is_uc_alpha(c)
3297                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3298                         return new_character_token c
3299                 if is_lc_alpha(c)
3300                         temporary_buffer += c
3301                         return new_character_token c
3302                 # Anything else
3303                 tok_state = tok_state_script_data_double_escaped
3304                 cur -= 1 # Reconsume
3305                 return
3306
3307         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3308         tok_state_before_attribute_name = ->
3309                 attr_name = null
3310                 switch c = txt.charAt(cur++)
3311                         when "\t", "\n", "\u000c", ' '
3312                                 return null
3313                         when '/'
3314                                 tok_state = tok_state_self_closing_start_tag
3315                                 return null
3316                         when '>'
3317                                 tok_state = tok_state_data
3318                                 tmp = tok_cur_tag
3319                                 tok_cur_tag = null
3320                                 return tmp
3321                         when "\u0000"
3322                                 parse_error()
3323                                 attr_name = "\ufffd"
3324                         when '"', "'", '<', '='
3325                                 parse_error()
3326                                 attr_name = c
3327                         when '' # EOF
3328                                 parse_error()
3329                                 tok_state = tok_state_data
3330                         else
3331                                 if is_uc_alpha(c)
3332                                         attr_name = c.toLowerCase()
3333                                 else
3334                                         attr_name = c
3335                 if attr_name?
3336                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3337                         tok_state = tok_state_attribute_name
3338                 return null
3339
3340         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3341         tok_state_attribute_name = ->
3342                 switch c = txt.charAt(cur++)
3343                         when "\t", "\n", "\u000c", ' '
3344                                 tok_state = tok_state_after_attribute_name
3345                         when '/'
3346                                 tok_state = tok_state_self_closing_start_tag
3347                         when '='
3348                                 tok_state = tok_state_before_attribute_value
3349                         when '>'
3350                                 tok_state = tok_state_data
3351                                 tmp = tok_cur_tag
3352                                 tok_cur_tag = null
3353                                 return tmp
3354                         when "\u0000"
3355                                 parse_error()
3356                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
3357                         when '"', "'", '<'
3358                                 parse_error()
3359                                 tok_cur_tag.attrs_a[0][0] = c
3360                         when '' # EOF
3361                                 parse_error()
3362                                 tok_state = tok_state_data
3363                         else
3364                                 if is_uc_alpha(c)
3365                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
3366                                 else
3367                                         tok_cur_tag.attrs_a[0][0] += c
3368                 return null
3369
3370         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3371         tok_state_after_attribute_name = ->
3372                 c = txt.charAt(cur++)
3373                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3374                         return
3375                 if c is '/'
3376                         tok_state = tok_state_self_closing_start_tag
3377                         return
3378                 if c is '='
3379                         tok_state = tok_state_before_attribute_value
3380                         return
3381                 if c is '>'
3382                         tok_state = tok_state_data
3383                         return
3384                 if is_uc_alpha(c)
3385                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3386                         tok_state = tok_state_attribute_name
3387                         return
3388                 if c is "\u0000"
3389                         parse_error()
3390                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3391                         tok_state = tok_state_attribute_name
3392                         return
3393                 if c is '' # EOF
3394                         parse_error()
3395                         tok_state = tok_state_data
3396                         cur -= 1 # reconsume
3397                         return
3398                 if c is '"' or c is "'" or c is '<'
3399                         parse_error()
3400                         # fall through to Anything else
3401                 # Anything else
3402                 tok_cur_tag.attrs_a.unshift [c, '']
3403                 tok_state = tok_state_attribute_name
3404
3405         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3406         tok_state_before_attribute_value = ->
3407                 switch c = txt.charAt(cur++)
3408                         when "\t", "\n", "\u000c", ' '
3409                                 return null
3410                         when '"'
3411                                 tok_state = tok_state_attribute_value_double_quoted
3412                         when '&'
3413                                 tok_state = tok_state_attribute_value_unquoted
3414                                 cur -= 1
3415                         when "'"
3416                                 tok_state = tok_state_attribute_value_single_quoted
3417                         when "\u0000"
3418                                 # Parse error
3419                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3420                                 tok_state = tok_state_attribute_value_unquoted
3421                         when '>'
3422                                 # Parse error
3423                                 tok_state = tok_state_data
3424                                 tmp = tok_cur_tag
3425                                 tok_cur_tag = null
3426                                 return tmp
3427                         when '' # EOF
3428                                 parse_error()
3429                                 tok_state = tok_state_data
3430                         else
3431                                 tok_cur_tag.attrs_a[0][1] += c
3432                                 tok_state = tok_state_attribute_value_unquoted
3433                 return null
3434
3435         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3436         tok_state_attribute_value_double_quoted = ->
3437                 switch c = txt.charAt(cur++)
3438                         when '"'
3439                                 tok_state = tok_state_after_attribute_value_quoted
3440                         when '&'
3441                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3442                         when "\u0000"
3443                                 # Parse error
3444                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3445                         when '' # EOF
3446                                 parse_error()
3447                                 tok_state = tok_state_data
3448                         else
3449                                 tok_cur_tag.attrs_a[0][1] += c
3450                 return null
3451
3452         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3453         tok_state_attribute_value_single_quoted = ->
3454                 switch c = txt.charAt(cur++)
3455                         when "'"
3456                                 tok_state = tok_state_after_attribute_value_quoted
3457                         when '&'
3458                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3459                         when "\u0000"
3460                                 # Parse error
3461                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3462                         when '' # EOF
3463                                 parse_error()
3464                                 tok_state = tok_state_data
3465                         else
3466                                 tok_cur_tag.attrs_a[0][1] += c
3467                 return null
3468
3469         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3470         tok_state_attribute_value_unquoted = ->
3471                 switch c = txt.charAt(cur++)
3472                         when "\t", "\n", "\u000c", ' '
3473                                 tok_state = tok_state_before_attribute_name
3474                         when '&'
3475                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3476                         when '>'
3477                                 tok_state = tok_state_data
3478                                 tmp = tok_cur_tag
3479                                 tok_cur_tag = null
3480                                 return tmp
3481                         when "\u0000"
3482                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3483                         when '' # EOF
3484                                 parse_error()
3485                                 tok_state = tok_state_data
3486                         else
3487                                 # Parse Error if ', <, = or ` (backtick)
3488                                 tok_cur_tag.attrs_a[0][1] += c
3489                 return null
3490
3491         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3492         tok_state_after_attribute_value_quoted = ->
3493                 switch c = txt.charAt(cur++)
3494                         when "\t", "\n", "\u000c", ' '
3495                                 tok_state = tok_state_before_attribute_name
3496                         when '/'
3497                                 tok_state = tok_state_self_closing_start_tag
3498                         when '>'
3499                                 tok_state = tok_state_data
3500                                 tmp = tok_cur_tag
3501                                 tok_cur_tag = null
3502                                 return tmp
3503                         when '' # EOF
3504                                 parse_error()
3505                                 tok_state = tok_state_data
3506                         else
3507                                 # Parse Error
3508                                 tok_state = tok_state_before_attribute_name
3509                                 cur -= 1 # we didn't handle that char
3510                 return null
3511
3512         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3513         tok_state_self_closing_start_tag = ->
3514                 c = txt.charAt(cur++)
3515                 if c is '>'
3516                         tok_cur_tag.flag 'self-closing'
3517                         tok_state = tok_state_data
3518                         return tok_cur_tag
3519                 if c is ''
3520                         parse_error()
3521                         tok_state = tok_state_data
3522                         cur -= 1 # Reconsume
3523                         return
3524                 # Anything else
3525                 parse_error()
3526                 tok_state = tok_state_before_attribute_name
3527                 cur -= 1 # Reconsume
3528                 return
3529
3530         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3531         # WARNING: put a comment token in tok_cur_tag before setting this state
3532         tok_state_bogus_comment = ->
3533                 next_gt = txt.indexOf '>', cur
3534                 if next_gt is -1
3535                         val = txt.substr cur
3536                         cur = txt.length
3537                 else
3538                         val = txt.substr cur, (next_gt - cur)
3539                         cur = next_gt + 1
3540                 val = val.replace "\u0000", "\ufffd"
3541                 tok_cur_tag.text += val
3542                 tok_state = tok_state_data
3543                 return tok_cur_tag
3544
3545         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3546         tok_state_markup_declaration_open = ->
3547                 if txt.substr(cur, 2) is '--'
3548                         cur += 2
3549                         tok_cur_tag = new_comment_token ''
3550                         tok_state = tok_state_comment_start
3551                         return
3552                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3553                         cur += 7
3554                         tok_state = tok_state_doctype
3555                         return
3556                 acn = adjusted_current_node()
3557                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3558                         cur += 7
3559                         tok_state = tok_state_cdata_section
3560                         return
3561                 # Otherwise
3562                 parse_error()
3563                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3564                 tok_state = tok_state_bogus_comment
3565                 return
3566
3567         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3568         tok_state_comment_start = ->
3569                 switch c = txt.charAt(cur++)
3570                         when '-'
3571                                 tok_state = tok_state_comment_start_dash
3572                         when "\u0000"
3573                                 parse_error()
3574                                 return new_character_token "\ufffd"
3575                         when '>'
3576                                 parse_error()
3577                                 tok_state = tok_state_data
3578                                 return tok_cur_tag
3579                         when '' # EOF
3580                                 parse_error()
3581                                 tok_state = tok_state_data
3582                                 cur -= 1 # Reconsume
3583                                 return tok_cur_tag
3584                         else
3585                                 tok_cur_tag.text += c
3586                 return null
3587
3588         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3589         tok_state_comment_start_dash = ->
3590                 switch c = txt.charAt(cur++)
3591                         when '-'
3592                                 tok_state = tok_state_comment_end
3593                         when "\u0000"
3594                                 parse_error()
3595                                 tok_cur_tag.text += "-\ufffd"
3596                                 tok_state = tok_state_comment
3597                         when '>'
3598                                 parse_error()
3599                                 tok_state = tok_state_data
3600                                 return tok_cur_tag
3601                         when '' # EOF
3602                                 parse_error()
3603                                 tok_state = tok_state_data
3604                                 cur -= 1 # Reconsume
3605                                 return tok_cur_tag
3606                         else
3607                                 tok_cur_tag.text += "-#{c}"
3608                                 tok_state = tok_state_comment
3609                 return null
3610
3611         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3612         tok_state_comment = ->
3613                 switch c = txt.charAt(cur++)
3614                         when '-'
3615                                 tok_state = tok_state_comment_end_dash
3616                         when "\u0000"
3617                                 parse_error()
3618                                 tok_cur_tag.text += "\ufffd"
3619                         when '' # EOF
3620                                 parse_error()
3621                                 tok_state = tok_state_data
3622                                 cur -= 1 # Reconsume
3623                                 return tok_cur_tag
3624                         else
3625                                 tok_cur_tag.text += c
3626                 return null
3627
3628         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3629         tok_state_comment_end_dash = ->
3630                 switch c = txt.charAt(cur++)
3631                         when '-'
3632                                 tok_state = tok_state_comment_end
3633                         when "\u0000"
3634                                 parse_error()
3635                                 tok_cur_tag.text += "-\ufffd"
3636                                 tok_state = tok_state_comment
3637                         when '' # EOF
3638                                 parse_error()
3639                                 tok_state = tok_state_data
3640                                 cur -= 1 # Reconsume
3641                                 return tok_cur_tag
3642                         else
3643                                 tok_cur_tag.text += "-#{c}"
3644                                 tok_state = tok_state_comment
3645                 return null
3646
3647         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3648         tok_state_comment_end = ->
3649                 switch c = txt.charAt(cur++)
3650                         when '>'
3651                                 tok_state = tok_state_data
3652                                 return tok_cur_tag
3653                         when "\u0000"
3654                                 parse_error()
3655                                 tok_cur_tag.text += "--\ufffd"
3656                                 tok_state = tok_state_comment
3657                         when '!'
3658                                 parse_error()
3659                                 tok_state = tok_state_comment_end_bang
3660                         when '-'
3661                                 parse_error()
3662                                 tok_cur_tag.text += '-'
3663                         when '' # EOF
3664                                 parse_error()
3665                                 tok_state = tok_state_data
3666                                 cur -= 1 # Reconsume
3667                                 return tok_cur_tag
3668                         else
3669                                 parse_error()
3670                                 tok_cur_tag.text += "--#{c}"
3671                                 tok_state = tok_state_comment
3672                 return null
3673
3674         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3675         tok_state_comment_end_bang = ->
3676                 switch c = txt.charAt(cur++)
3677                         when '-'
3678                                 tok_cur_tag.text += "--!#{c}"
3679                                 tok_state = tok_state_comment_end_dash
3680                         when '>'
3681                                 tok_state = tok_state_data
3682                                 return tok_cur_tag
3683                         when "\u0000"
3684                                 parse_error()
3685                                 tok_cur_tag.text += "--!\ufffd"
3686                                 tok_state = tok_state_comment
3687                         when '' # EOF
3688                                 parse_error()
3689                                 tok_state = tok_state_data
3690                                 cur -= 1 # Reconsume
3691                                 return tok_cur_tag
3692                         else
3693                                 tok_cur_tag.text += "--!#{c}"
3694                                 tok_state = tok_state_comment
3695                 return null
3696
3697         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3698         tok_state_doctype = ->
3699                 switch c = txt.charAt(cur++)
3700                         when "\t", "\u000a", "\u000c", ' '
3701                                 tok_state = tok_state_before_doctype_name
3702                         when '' # EOF
3703                                 parse_error()
3704                                 tok_state = tok_state_data
3705                                 el = new_doctype_token ''
3706                                 el.flag 'force-quirks', true
3707                                 cur -= 1 # Reconsume
3708                                 return el
3709                         else
3710                                 parse_error()
3711                                 tok_state = tok_state_before_doctype_name
3712                                 cur -= 1 # Reconsume
3713                 return null
3714
        # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3716         tok_state_before_doctype_name = ->
3717                 c = txt.charAt(cur++)
3718                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3719                         return
3720                 if is_uc_alpha(c)
3721                         tok_cur_tag = new_doctype_token c.toLowerCase()
3722                         tok_state = tok_state_doctype_name
3723                         return
3724                 if c is "\u0000"
3725                         parse_error()
3726                         tok_cur_tag = new_doctype_token "\ufffd"
3727                         tok_state = tok_state_doctype_name
3728                         return
3729                 if c is '>'
3730                         parse_error()
3731                         el = new_doctype_token ''
3732                         el.flag 'force-quirks', true
3733                         tok_state = tok_state_data
3734                         return el
3735                 if c is '' # EOF
3736                         parse_error()
3737                         tok_state = tok_state_data
3738                         el = new_doctype_token ''
3739                         el.flag 'force-quirks', true
3740                         cur -= 1 # Reconsume
3741                         return el
3742                 # Anything else
3743                 tok_cur_tag = new_doctype_token c
3744                 tok_state = tok_state_doctype_name
3745                 return null
3746
3747         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3748         tok_state_doctype_name = ->
3749                 c = txt.charAt(cur++)
3750                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3751                         tok_state = tok_state_after_doctype_name
3752                         return
3753                 if c is '>'
3754                         tok_state = tok_state_data
3755                         return tok_cur_tag
3756                 if is_uc_alpha(c)
3757                         tok_cur_tag.name += c.toLowerCase()
3758                         return
3759                 if c is "\u0000"
3760                         parse_error()
3761                         tok_cur_tag.name += "\ufffd"
3762                         return
3763                 if c is '' # EOF
3764                         parse_error()
3765                         tok_state = tok_state_data
3766                         tok_cur_tag.flag 'force-quirks', true
3767                         cur -= 1 # Reconsume
3768                         return tok_cur_tag
3769                 # Anything else
3770                 tok_cur_tag.name += c
3771                 return null
3772
3773         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3774         tok_state_after_doctype_name = ->
3775                 c = txt.charAt(cur++)
3776                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3777                         return
3778                 if c is '>'
3779                         tok_state = tok_state_data
3780                         return tok_cur_tag
3781                 if c is '' # EOF
3782                         parse_error()
3783                         tok_state = tok_state_data
3784                         tok_cur_tag.flag 'force-quirks', true
3785                         cur -= 1 # Reconsume
3786                         return tok_cur_tag
3787                 # Anything else
3788                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3789                         cur += 5
3790                         tok_state = tok_state_after_doctype_public_keyword
3791                         return
3792                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3793                         cur += 5
3794                         tok_state = tok_state_after_doctype_system_keyword
3795                         return
3796                 parse_error()
3797                 tok_cur_tag.flag 'force-quirks', true
3798                 tok_state = tok_state_bogus_doctype
3799                 return null
3800
3801         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3802         tok_state_after_doctype_public_keyword = ->
3803                 c = txt.charAt(cur++)
3804                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3805                         tok_state = tok_state_before_doctype_public_identifier
3806                         return
3807                 if c is '"'
3808                         parse_error()
3809                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3810                         tok_state = tok_state_doctype_public_identifier_double_quoted
3811                         return
3812                 if c is "'"
3813                         parse_error()
3814                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3815                         tok_state = tok_state_doctype_public_identifier_single_quoted
3816                         return
3817                 if c is '>'
3818                         parse_error()
3819                         tok_cur_tag.flag 'force-quirks', true
3820                         tok_state = tok_state_data
3821                         return tok_cur_tag
3822                 if c is '' # EOF
3823                         parse_error()
3824                         tok_state = tok_state_data
3825                         tok_cur_tag.flag 'force-quirks', true
3826                         cur -= 1 # Reconsume
3827                         return tok_cur_tag
3828                 # Anything else
3829                 parse_error()
3830                 tok_cur_tag.flag 'force-quirks', true
3831                 tok_state = tok_state_bogus_doctype
3832                 return null
3833
3834         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3835         tok_state_before_doctype_public_identifier = ->
3836                 c = txt.charAt(cur++)
3837                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3838                         return
3839                 if c is '"'
3840                         parse_error()
3841                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3842                         tok_state = tok_state_doctype_public_identifier_double_quoted
3843                         return
3844                 if c is "'"
3845                         parse_error()
3846                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3847                         tok_state = tok_state_doctype_public_identifier_single_quoted
3848                         return
3849                 if c is '>'
3850                         parse_error()
3851                         tok_cur_tag.flag 'force-quirks', true
3852                         tok_state = tok_state_data
3853                         return tok_cur_tag
3854                 if c is '' # EOF
3855                         parse_error()
3856                         tok_state = tok_state_data
3857                         tok_cur_tag.flag 'force-quirks', true
3858                         cur -= 1 # Reconsume
3859                         return tok_cur_tag
3860                 # Anything else
3861                 parse_error()
3862                 tok_cur_tag.flag 'force-quirks', true
3863                 tok_state = tok_state_bogus_doctype
3864                 return null
3865
3866
3867         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3868         tok_state_doctype_public_identifier_double_quoted = ->
3869                 c = txt.charAt(cur++)
3870                 if c is '"'
3871                         tok_state = tok_state_after_doctype_public_identifier
3872                         return
3873                 if c is "\u0000"
3874                         parse_error()
3875                         tok_cur_tag.public_identifier += "\ufffd"
3876                         return
3877                 if c is '>'
3878                         parse_error()
3879                         tok_cur_tag.flag 'force-quirks', true
3880                         tok_state = tok_state_data
3881                         return tok_cur_tag
3882                 if c is '' # EOF
3883                         parse_error()
3884                         tok_state = tok_state_data
3885                         tok_cur_tag.flag 'force-quirks', true
3886                         cur -= 1 # Reconsume
3887                         return tok_cur_tag
3888                 # Anything else
3889                 tok_cur_tag.public_identifier += c
3890                 return null
3891
3892         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3893         tok_state_doctype_public_identifier_single_quoted = ->
3894                 c = txt.charAt(cur++)
3895                 if c is "'"
3896                         tok_state = tok_state_after_doctype_public_identifier
3897                         return
3898                 if c is "\u0000"
3899                         parse_error()
3900                         tok_cur_tag.public_identifier += "\ufffd"
3901                         return
3902                 if c is '>'
3903                         parse_error()
3904                         tok_cur_tag.flag 'force-quirks', true
3905                         tok_state = tok_state_data
3906                         return tok_cur_tag
3907                 if c is '' # EOF
3908                         parse_error()
3909                         tok_state = tok_state_data
3910                         tok_cur_tag.flag 'force-quirks', true
3911                         cur -= 1 # Reconsume
3912                         return tok_cur_tag
3913                 # Anything else
3914                 tok_cur_tag.public_identifier += c
3915                 return null
3916
3917         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3918         tok_state_after_doctype_public_identifier = ->
3919                 c = txt.charAt(cur++)
3920                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3921                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3922                         return
3923                 if c is '>'
3924                         tok_state = tok_state_data
3925                         return tok_cur_tag
3926                 if c is '"'
3927                         parse_error()
3928                         tok_cur_tag.system_identifier = ''
3929                         tok_state = tok_state_doctype_system_identifier_double_quoted
3930                         return
3931                 if c is "'"
3932                         parse_error()
3933                         tok_cur_tag.system_identifier = ''
3934                         tok_state = tok_state_doctype_system_identifier_single_quoted
3935                         return
3936                 if c is '' # EOF
3937                         parse_error()
3938                         tok_state = tok_state_data
3939                         tok_cur_tag.flag 'force-quirks', true
3940                         cur -= 1 # Reconsume
3941                         return tok_cur_tag
3942                 # Anything else
3943                 parse_error()
3944                 tok_cur_tag.flag 'force-quirks', true
3945                 tok_state = tok_state_bogus_doctype
3946                 return null
3947
3948         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3949         tok_state_between_doctype_public_and_system_identifiers = ->
3950                 c = txt.charAt(cur++)
3951                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3952                         return
3953                 if c is '>'
3954                         tok_state = tok_state_data
3955                         return tok_cur_tag
3956                 if c is '"'
3957                         parse_error()
3958                         tok_cur_tag.system_identifier = ''
3959                         tok_state = tok_state_doctype_system_identifier_double_quoted
3960                         return
3961                 if c is "'"
3962                         parse_error()
3963                         tok_cur_tag.system_identifier = ''
3964                         tok_state = tok_state_doctype_system_identifier_single_quoted
3965                         return
3966                 if c is '' # EOF
3967                         parse_error()
3968                         tok_state = tok_state_data
3969                         tok_cur_tag.flag 'force-quirks', true
3970                         cur -= 1 # Reconsume
3971                         return tok_cur_tag
3972                 # Anything else
3973                 parse_error()
3974                 tok_cur_tag.flag 'force-quirks', true
3975                 tok_state = tok_state_bogus_doctype
3976                 return null
3977
3978         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3979         # Consumes one character. Returns tok_cur_tag when the doctype token is
3980         # finished ('>' or EOF); otherwise returns nothing/null and only updates state.
3981         tok_state_after_doctype_system_keyword = ->
3982                 switch c = txt.charAt(cur++)
3983                         when "\t", "\u000a", "\u000c", ' '
3984                                 tok_state = tok_state_before_doctype_system_identifier
3985                                 return
3986                         when '"', "'"
3987                                 parse_error()
3988                                 tok_cur_tag.system_identifier = ''
3989                                 tok_state = if c is '"'
3990                                         tok_state_doctype_system_identifier_double_quoted
3991                                 else
3992                                         tok_state_doctype_system_identifier_single_quoted
3993                                 return
3994                         when '>'
3995                                 parse_error()
3996                                 tok_cur_tag.flag 'force-quirks', true
3997                                 tok_state = tok_state_data
3998                                 return tok_cur_tag
3999                         when '' # EOF
4000                                 parse_error()
4001                                 tok_state = tok_state_data
4002                                 tok_cur_tag.flag 'force-quirks', true
4003                                 cur -= 1 # Reconsume
4004                                 return tok_cur_tag
4005                         else # Anything else
4006                                 parse_error()
4007                                 tok_cur_tag.flag 'force-quirks', true
4008                                 tok_state = tok_state_bogus_doctype
4009                                 return null
4010
4011         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4012         # Skips whitespace until the opening quote of the system identifier. Returns
4013         # tok_cur_tag when the doctype token is finished ('>' or EOF), else nothing/null.
4014         tok_state_before_doctype_system_identifier = ->
4015                 switch c = txt.charAt(cur++)
4016                         when "\t", "\u000a", "\u000c", ' ' then return
4017                         when '"', "'"
4018                                 tok_cur_tag.system_identifier = ''
4019                                 tok_state = if c is '"'
4020                                         tok_state_doctype_system_identifier_double_quoted
4021                                 else
4022                                         tok_state_doctype_system_identifier_single_quoted
4023                                 return
4024                         when '>'
4025                                 parse_error()
4026                                 tok_cur_tag.flag 'force-quirks', true
4027                                 tok_state = tok_state_data
4028                                 return tok_cur_tag
4029                         when '' # EOF
4030                                 parse_error()
4031                                 tok_state = tok_state_data
4032                                 tok_cur_tag.flag 'force-quirks', true
4033                                 cur -= 1 # Reconsume
4034                                 return tok_cur_tag
4035                         else # Anything else
4036                                 parse_error()
4037                                 tok_cur_tag.flag 'force-quirks', true
4038                                 tok_state = tok_state_bogus_doctype
4039                                 return null
4040
4041         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4042         tok_state_doctype_system_identifier_double_quoted = -> # appends input to tok_cur_tag.system_identifier until the closing '"'
4043                 switch c = txt.charAt(cur++)
4044                         when '"' # closing quote: identifier is complete
4045                                 tok_state = tok_state_after_doctype_system_identifier
4046                                 return
4047                         when "\u0000" # NUL is stored as U+FFFD
4048                                 parse_error()
4049                                 tok_cur_tag.system_identifier += "\ufffd"
4050                                 return
4051                         when '>' # doctype ended inside the quotes: emit it with force-quirks
4052                                 parse_error()
4053                                 tok_cur_tag.flag 'force-quirks', true
4054                                 tok_state = tok_state_data
4055                                 return tok_cur_tag
4056                         when '' # EOF
4057                                 parse_error()
4058                                 tok_state = tok_state_data
4059                                 tok_cur_tag.flag 'force-quirks', true
4060                                 cur -= 1 # Reconsume
4061                                 return tok_cur_tag
4062                         else # Anything else
4063                                 tok_cur_tag.system_identifier += c
4064                                 return null
4065
4066         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4067         tok_state_doctype_system_identifier_single_quoted = -> # appends input to tok_cur_tag.system_identifier until the closing "'"
4068                 switch c = txt.charAt(cur++)
4069                         when "'" # closing quote: identifier is complete
4070                                 tok_state = tok_state_after_doctype_system_identifier
4071                                 return
4072                         when "\u0000" # NUL is stored as U+FFFD
4073                                 parse_error()
4074                                 tok_cur_tag.system_identifier += "\ufffd"
4075                                 return
4076                         when '>' # doctype ended inside the quotes: emit it with force-quirks
4077                                 parse_error()
4078                                 tok_cur_tag.flag 'force-quirks', true
4079                                 tok_state = tok_state_data
4080                                 return tok_cur_tag
4081                         when '' # EOF
4082                                 parse_error()
4083                                 tok_state = tok_state_data
4084                                 tok_cur_tag.flag 'force-quirks', true
4085                                 cur -= 1 # Reconsume
4086                                 return tok_cur_tag
4087                         else # Anything else
4088                                 tok_cur_tag.system_identifier += c
4089                                 return null
4090
4091         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4092         # Skips whitespace after the identifier; '>' or EOF finishes the doctype and returns tok_cur_tag.
4093         tok_state_after_doctype_system_identifier = ->
4094                 switch c = txt.charAt(cur++)
4095                         when "\t", "\u000a", "\u000c", ' ' then return
4096                         when '>'
4097                                 tok_state = tok_state_data
4098                                 return tok_cur_tag
4099                         when '' # EOF
4100                                 parse_error()
4101                                 tok_state = tok_state_data
4102                                 tok_cur_tag.flag 'force-quirks', true
4103                                 cur -= 1 # Reconsume
4104                                 return tok_cur_tag
4105                         else # Anything else
4106                                 parse_error()
4107                                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4108                                 tok_state = tok_state_bogus_doctype
4109                                 return null
4110
4111         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4112         # Discards input until '>' or EOF, then returns tok_cur_tag as it stands.
4113         tok_state_bogus_doctype = ->
4114                 switch c = txt.charAt(cur++)
4115                         when '>'
4116                                 tok_state = tok_state_data
4117                                 return tok_cur_tag
4118                         when '' # EOF
4119                                 tok_state = tok_state_data
4120                                 cur -= 1 # Reconsume
4121                                 return tok_cur_tag
4122                         else return null # Anything else
4123
4124         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4125         # Returns one character token holding the text up to the next "]]>" (or to
4126         # the end of input when there is none) and moves cur past it.
4127         tok_state_cdata_section = ->
4128                 tok_state = tok_state_data
4129                 next_gt = txt.indexOf ']]>', cur
4130                 val = if next_gt is -1 then txt.substr(cur) else txt.substr(cur, next_gt - cur)
4131                 cur = if next_gt is -1 then txt.length else next_gt + 3
4132                 # fixfull spec doesn't say this. A global regex is required here: replace()
4133                 # with a string pattern only substitutes the first U+0000.
4134                 val = val.replace(/\u0000/g, "\ufffd")
4135                 return new_character_token val # fixfull split
4136
4137         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4138         # Don't set this as a state, just call it
4139         # returns a string (NOT a text node): the decoded text, with cur advanced past
4140         # the reference, or '&' with cur left alone when nothing could be decoded
4141         # allowed_char: if the next input char equals this, return '&' without a parse error
4142         # in_attr: enables the attribute-only exceptions for legacy refs lacking ';'
4143         parse_character_reference = (allowed_char = null, in_attr = false) ->
4144                 return '&' if cur >= txt.length
4145                 switch c = txt.charAt(cur)
4146                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4147                                 # explicitly not a parse error
4148                                 return '&'
4149                         when ';'
4150                                 # there has to be "one or more" alnums between & and ; to be a parse error
4151                                 return '&'
4152                         when '#'
4153                                 return '&' if cur + 1 >= txt.length
4154                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4155                                         prefix = '#x'
4156                                         charset = hex_chars
4157                                         start = cur + 2
4158                                 else
4159                                         charset = digits
4160                                         start = cur + 1
4161                                         prefix = '#'
4162                                 i = 0
4163                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4164                                         i += 1
4165                                 return '&' if i is 0
4166                                 i += 1 if txt.charAt(start + i) is ';'
4167                                 # FIXME This is supposed to generate parse errors for some chars
4168                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4169                                 if decoded?
4170                                         cur = start + i
4171                                         return decoded
4172                                 return '&'
4173                         else
4174                                 # count leading alnums (at most 31). The length check matters: past the
4175                                 # end charAt() returns '', and a string's indexOf('') is 0 rather than -1
4176                                 for i in [0...31]
4177                                         break if cur + i >= txt.length or alnum.indexOf(txt.charAt(cur + i)) is -1
4178                                 if i is 0
4179                                         # exit early, because parse_error() below needs at least one alnum
4180                                         return '&'
4181                                 if txt.charAt(cur + i) is ';'
4182                                         i += 1 # include ';' terminator in value
4183                                         decoded = decode_named_char_ref txt.substr(cur, i)
4184                                         if decoded?
4185                                                 cur += i
4186                                                 return decoded
4187                                         parse_error()
4188                                         return '&'
4189                                 else
4190                                         # no ';' terminator (only legacy char refs)
4191                                         max = i # may be 1; "by 1" below keeps the range ascending (so empty) then
4192                                         for i in [2..max] by 1 # no prefix matches, so ok to check shortest first
4193                                                 c = legacy_char_refs[txt.substr(cur, i)]
4194                                                 # own keys only: a name like "constructor" must not match Object.prototype
4195                                                 if c? and Object::hasOwnProperty.call(legacy_char_refs, txt.substr(cur, i))
4196                                                         if in_attr
4197                                                                 if txt.charAt(cur + i) is '='
4198                                                                         # "because some legacy user agents will
4199                                                                         # misinterpret the markup in those cases"
4200                                                                         parse_error()
4201                                                                         return '&'
4202                                                                 if cur + i < txt.length and alnum.indexOf(txt.charAt(cur + i)) > -1
4203                                                                         # this makes attributes forgiving about url args
4204                                                                         return '&'
4205                                                         # besides the weird exceptions for attributes: return the matching char
4206                                                         cur += i # consume entity chars
4207                                                         parse_error() # because no terminating ";"
4208                                                         return c
4209                                         parse_error()
4210                                         return '&'
4211                 return # never reached
4212
4213         # tree constructor initialization
4214         # see comments on TYPE_TAG/etc for the structure of this data
4215         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML # root node; only its children are returned below
4216         open_els = []
4217         afe = [] # active formatting elements
4218         template_ins_modes = []
4219         ins_mode = ins_mode_initial
4220         original_ins_mode = ins_mode # TODO check spec
4221         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
4222         flag_frameset_ok = true
4223         flag_parsing = true # the main loop below runs for as long as this stays true
4224         flag_foster_parenting = false
4225         form_element_pointer = null
4226         temporary_buffer = null
4227         pending_table_character_tokens = []
4228         head_element_pointer = null
4229         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4230         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4231
4232         # tokenizer initialization
4233         tok_state = tok_state_data
4234
4235         # process input: run the current tokenizer state and hand every token it returns to the current insertion mode
4236         while flag_parsing
4237                 t = tok_state() # null/undefined when the state produced no token on this step
4238                 if t?
4239                         ins_mode t
4240                         # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4241         return doc.children
4242
4243 # Joins the serialize(shallow, show_ids) output of every item in els with commas.
4244 serialize_els = (els, shallow, show_ids) ->
4245         serialized = sep = ''
4246         for t in els
4247                 # sep is still empty for the first item, so there is no leading comma
4248                 serialized += sep + t.serialize(shallow, show_ids)
4249                 sep = ','
4250         serialized
4251
4252 # TODO export TYPE_* (NOTE(review): four TYPE_* constants are already exported below; confirm which ones are still missing)
4253 module.exports.parse_html = parse_html
4254 module.exports.debug_log_reset = debug_log_reset
4255 module.exports.debug_log_each = debug_log_each
4256 module.exports.TYPE_TAG = TYPE_TAG
4257 module.exports.TYPE_TEXT = TYPE_TEXT
4258 module.exports.TYPE_COMMENT = TYPE_COMMENT
4259 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE