JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
code cleanup and some minor fixes
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
54         window.wheic = {}
55         module = exports: window.wheic
56
57 # Each node is an obect of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
60 TYPE_COMMENT = 2
61 TYPE_DOCTYPE = 3
62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
65 TYPE_EOF = 6
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
68
69 # namespace constants
70 NS_HTML = 1
71 NS_MATHML = 2
72 NS_SVG = 3
73
74 g_debug_log = []
75 debug_log_reset = ->
76         g_debug_log = []
77 debug_log = (str) ->
78         g_debug_log.push str
79 debug_log_each = (cb) ->
80         for str in g_debug_log
81                 cb str
82
83 prev_node_id = 0
84 class Node
85         constructor: (type, args = {}) ->
86                 @type = type # one of the TYPE_* constants above
87                 @name = args.name ? '' # tag name
88                 @text = args.text ? '' # contents for text/comment nodes
89                 @attrs = args.attrs ? {}
90                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91                 @children = args.children ? []
92                 @namespace = args.namespace ? NS_HTML
93                 @parent = args.parent ? null
94                 @token = args.token ? null
95                 if args.id?
96                         @id = "#{args.id}+"
97                 else
98                         @id = "#{++prev_node_id}"
99         acknowledge_self_closing: ->
100                 if @token?
101                         @token.flag 'did_self_close'
102                 else
103                         @flag 'did_self_close', true
104         flag: ->
105                 # fixfull
106         serialize: (shallow = false, show_ids = false) -> # for unit tests
107                 ret = ''
108                 switch @type
109                         when TYPE_TAG
110                                 ret += 'tag:'
111                                 ret += JSON.stringify @name
112                                 ret += ','
113                                 if show_ids
114                                         ret += "##{@id},"
115                                 if shallow
116                                         break
117                                 attr_keys = []
118                                 for k of @attrs
119                                         attr_keys.push k
120                                 attr_keys.sort()
121                                 ret += '{'
122                                 sep = ''
123                                 for k in attr_keys
124                                         ret += sep
125                                         sep = ','
126                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
127                                 ret += '},['
128                                 sep = ''
129                                 for c in @children
130                                         ret += sep
131                                         sep = ','
132                                         ret += c.serialize shallow, show_ids
133                                 ret += ']'
134                         when TYPE_TEXT
135                                 ret += 'text:'
136                                 ret += JSON.stringify @text
137                         when TYPE_COMMENT
138                                 ret += 'comment:'
139                                 ret += JSON.stringify @text
140                         when TYPE_DOCTYPE
141                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
142                         when TYPE_AFE_MARKER
143                                 ret += 'marker'
144                         when TYPE_AAA_BOOKMARK
145                                 ret += 'aaa_bookmark'
146                         else
147                                 ret += 'unknown:'
148                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
149                 return ret
150
151 # helpers: (only take args that are normally known when parser creates nodes)
152 new_open_tag = (name) ->
153         return new Node TYPE_START_TAG, name: name
154 new_end_tag = (name) ->
155         return new Node TYPE_END_TAG, name: name
156 new_element = (name) ->
157         return new Node TYPE_TAG, name: name
158 new_text_node = (txt) ->
159         return new Node TYPE_TEXT, text: txt
160 new_character_token = new_text_node
161 new_comment_token = (txt) ->
162         return new Node TYPE_COMMENT, text: txt
163 new_doctype_token = (name) ->
164         return new Node TYPE_DOCTYPE, name: name
165 new_eof_token = ->
166         return new Node TYPE_EOF
167 new_afe_marker = ->
168         return new Node TYPE_AFE_MARKER
169 new_aaa_bookmark = ->
170         return new Node TYPE_AAA_BOOKMARK
171
172 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
173 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
174 digits = "0123456789"
175 alnum = lc_alpha + uc_alpha + digits
176 hex_chars = digits + "abcdefABCDEF"
177
178 is_uc_alpha = (str) ->
179         return str.length is 1 and uc_alpha.indexOf(str) > -1
180 is_lc_alpha = (str) ->
181         return str.length is 1 and lc_alpha.indexOf(str) > -1
182
183 # some SVG elements have dashes in them
184 tag_name_chars = alnum + "-"
185
186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
187 space_chars = "\u0009\u000a\u000c\u000d\u0020"
188 is_space = (txt) ->
189         return txt.length is 1 and space_chars.indexOf(txt) > -1
190 is_space_tok = (t) ->
191         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
192
193 is_input_hidden_tok = (t) ->
194         return unless t.type is TYPE_START_TAG
195         for a of t.attrs_a
196                 if a[0] is 'type'
197                         if a[1].toLowerCase() is 'hidden'
198                                 return true
199                         return false
200         return false
201
202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
203 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
204
205 # These are the character references that don't need a terminating semicolon
206 # min length: 2, max: 6, none are a prefix of any other.
207 legacy_char_refs = {
208         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
209         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
210         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
211         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
212         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
213         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
214         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
215         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
216         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
217         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
218         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
219         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
220         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
221         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
222         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
223         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
224         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
225         yen: '¥', yuml: 'ÿ'
226 }
227
228 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
229 raw_text_elements = ['script', 'style']
230 escapable_raw_text_elements = ['textarea', 'title']
231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
232 svg_elements = [
233         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
234         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
235         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
236         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
237         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
238         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
239         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
240         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
241         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
242         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
243         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
244         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
245         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
246         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
247         'view', 'vkern'
248 ]
249
250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
251 mathml_elements = [
252         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
253         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
254         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
255         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
256         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
257         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
258         'determinant', 'diff', 'divergence', 'divide', 'domain',
259         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
260         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
261         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
262         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
263         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
264         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
265         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
266         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
267         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
268         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
269         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
270         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
271         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
272         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
273         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
274         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
275         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
276         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
277         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
278         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
279         'vectorproduct', 'xor'
280 ]
281 # foreign_elements = [svg_elements..., mathml_elements...]
282 #normal_elements = All other allowed HTML elements are normal elements.
283
284 special_elements = {
285         # HTML:
286         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
287         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
288         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
289         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
290         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
291         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
292         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
293         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
294         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
295         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
296         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
297         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
298         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
299         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
300         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
301         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
302         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
303         wbr:NS_HTML, xmp:NS_HTML,
304
305         # MathML:
306         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
307         'annotation-xml':NS_MATHML,
308
309         # SVG:
310         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
311 }
312
313 formatting_elements = {
314          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
315          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
316          u: true
317 }
318
319 foster_parenting_targets = {
320         table: true
321         tbody: true
322         tfoot: true
323         thead: true
324         tr: true
325 }
326
327 # all html I presume
328 end_tag_implied = {
329         dd: true
330         dt: true
331         li: true
332         option: true
333         optgroup: true
334         p: true
335         rb: true
336         rp: true
337         rt: true
338         rtc: true
339 }
340
341 el_is_special = (e) ->
342         return special_elements[e.name] is e.namespace
343
344 # decode_named_char_ref()
345 #
346 # The list of named character references is _huge_ so ask the browser to decode
347 # for us instead of wasting bandwidth/space on including the table here.
348 #
349 # Pass without the "&" but with the ";" examples:
350 #    for "&amp" pass "amp;"
351 #    for "&#x2032" pass "x2032;"
352 g_dncr = {
353         cache: {}
354         textarea: document.createElement('textarea')
355 }
356 # TODO test this in IE8
357 decode_named_char_ref = (txt) ->
358         txt = "&#{txt}"
359         decoded = g_dncr.cache[txt]
360         return decoded if decoded?
361         g_dncr.textarea.innerHTML = txt
362         decoded = g_dncr.textarea.value
363         return null if decoded is txt
364         return g_dncr.cache[txt] = decoded
365
366 parse_html = (txt, parse_error_cb = null) ->
367         cur = 0 # index of next char in txt to be parsed
368         # declare doc and tokenizer variables so they're in scope below
369         doc = null
370         open_els = null # stack of open elements
371         afe = null # active formatting elements
372         template_insertion_modes = null
373         insertion_mode = null
374         original_insertion_mode = null
375         tok_state = null
376         tok_cur_tag = null # partially parsed tag
377         flag_scripting = null
378         flag_frameset_ok = null
379         flag_parsing = null
380         flag_foster_parenting = null
381         form_element_pointer = null
382         temporary_buffer = null
383         pending_table_character_tokens = null
384         head_element_pointer = null
385         flag_fragment_parsing = null
386         context_element = null
387
388         stop_parsing = ->
389                 flag_parsing = false
390
391         parse_error = ->
392                 if parse_error_cb?
393                         parse_error_cb cur
394                 else
395                         console.log "Parse error at character #{cur} of #{txt.length}"
396
397         afe_push = (new_el) ->
398                 matches = 0
399                 for el, i in afe
400                         if el.name is new_el.name and el.namespace is new_el.namespace
401                                 for k, v of el.attrs
402                                         continue unless new_el.attrs[k] is v
403                                 for k, v of new_el.attrs
404                                         continue unless el.attrs[k] is v
405                                 matches += 1
406                                 if matches is 3
407                                         afe.splice i, 1
408                                         break
409                 afe.unshift new_el
410         afe_push_marker = ->
411                 afe.unshift new_afe_marker()
412
413         # the functions below impliment the Tree Contstruction algorithm
414         # http://www.w3.org/TR/html5/syntax.html#tree-construction
415
416         # But first... the helpers
417         template_tag_is_open = ->
418                 for t in open_els
419                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
420                                 return true
421                 return false
422         is_in_scope_x = (tag_name, scope, namespace) ->
423                 for t in open_els
424                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
425                                 return true
426                         if scope[t.name] is t.namespace
427                                 return false
428                 return false
429         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
430                 for t in open_els
431                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
432                                 return true
433                         if scope[t.name] is t.namespace
434                                 return false
435                         if scope2[t.name] is t.namespace
436                                 return false
437                 return false
438         standard_scopers = {
439                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
440                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
441                 template: NS_HTML, mi: NS_MATHML,
442
443                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
444                 'annotation-xml': NS_MATHML,
445
446                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
447         }
448         button_scopers = button: NS_HTML
449         li_scopers = ol: NS_HTML, ul: NS_HTML
450         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
451         is_in_scope = (tag_name, namespace = null) ->
452                 return is_in_scope_x tag_name, standard_scopers, namespace
453         is_in_button_scope = (tag_name, namespace = null) ->
454                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
455         is_in_table_scope = (tag_name, namespace = null) ->
456                 return is_in_scope_x tag_name, table_scopers, namespace
457         is_in_select_scope = (tag_name, namespace = null) ->
458                 for t in open_els
459                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
460                                 return true
461                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
462                                 return false
463                 return false
464         # this checks for a particular element, not by name
465         el_is_in_scope = (el) ->
466                 for t in open_els
467                         if t is el
468                                 return true
469                         if standard_scopers[t.name] is t.namespace
470                                 return false
471                 return false
472
473         clear_to_table_stopers = {
474                 'table': true
475                 'template': true
476                 'html': true
477         }
478         clear_stack_to_table_context = ->
479                 loop
480                         if clear_to_table_stopers[open_els[0].name]?
481                                 break
482                         open_els.shift()
483                 return
484         clear_to_table_body_stopers = {
485                 'tbody': true
486                 'tfoot': true
487                 'thead': true
488                 'template': true
489                 'html': true
490         }
491         clear_stack_to_table_body_context = ->
492                 loop
493                         if clear_to_table_body_stopers[open_els[0].name]?
494                                 break
495                         open_els.shift()
496                 return
497         clear_to_table_row_stopers = {
498                 'tr': true
499                 'template': true
500                 'html': true
501         }
502         clear_stack_to_table_row_context = ->
503                 loop
504                         if clear_to_table_row_stopers[open_els[0].name]?
505                                 break
506                         open_els.shift()
507                 return
508         clear_afe_to_marker = ->
509                 loop
510                         return unless afe.length > 0 # this happens in fragment case, ?spec error
511                         el = afe.shift()
512                         if el.type is TYPE_AFE_MARKER
513                                 return
514                 return
515
516         # 8.2.3.1 ...
517         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
518         reset_insertion_mode = ->
519                 # 1. Let last be false.
520                 last = false
521                 # 2. Let node be the last node in the stack of open elements.
522                 node_i = 0
523                 node = open_els[node_i]
524                 # 3. Loop: If node is the first node in the stack of open elements,
525                 # then set last to true, and, if the parser was originally created as
526                 # part of the HTML fragment parsing algorithm (fragment case) set node
527                 # to the context element.
528                 loop
529                         if node_i is open_els.length - 1
530                                 last = true
531                                 # fixfull (fragment case)
532
533                         # 4. If node is a select element, run these substeps:
534                         if node.name is 'select'
535                                 # 1. If last is true, jump to the step below labeled done.
536                                 unless last
537                                         # 2. Let ancestor be node.
538                                         ancestor_i = node_i
539                                         ancestor = node
540                                         # 3. Loop: If ancestor is the first node in the stack of
541                                         # open elements, jump to the step below labeled done.
542                                         loop
543                                                 if ancestor_i is open_els.length - 1
544                                                         break
545                                                 # 4. Let ancestor be the node before ancestor in the stack
546                                                 # of open elements.
547                                                 ancestor_i += 1
548                                                 ancestor = open_els[ancestor_i]
549                                                 # 5. If ancestor is a template node, jump to the step below
550                                                 # labeled done.
551                                                 if ancestor.name is 'template'
552                                                         break
553                                                 # 6. If ancestor is a table node, switch the insertion mode
554                                                 # to "in select in table" and abort these steps.
555                                                 if ancestor.name is 'table'
556                                                         insertion_mode = ins_mode_in_select_in_table
557                                                         return
558                                                 # 7. Jump back to the step labeled loop.
559                                 # 8. Done: Switch the insertion mode to "in select" and abort
560                                 # these steps.
561                                 insertion_mode = ins_mode_in_select
562                                 return
563                         # 5. If node is a td or th element and last is false, then switch
564                         # the insertion mode to "in cell" and abort these steps.
565                         if (node.name is 'td' or node.name is 'th') and last is false
566                                 insertion_mode = ins_mode_in_cell
567                                 return
568                         # 6. If node is a tr element, then switch the insertion mode to "in
569                         # row" and abort these steps.
570                         if node.name is 'tr'
571                                 insertion_mode = ins_mode_in_row
572                                 return
573                         # 7. If node is a tbody, thead, or tfoot element, then switch the
574                         # insertion mode to "in table body" and abort these steps.
575                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
576                                 insertion_mode = ins_mode_in_table_body
577                                 return
578                         # 8. If node is a caption element, then switch the insertion mode
579                         # to "in caption" and abort these steps.
580                         if node.name is 'caption'
581                                 insertion_mode = ins_mode_in_caption
582                                 return
583                         # 9. If node is a colgroup element, then switch the insertion mode
584                         # to "in column group" and abort these steps.
585                         if node.name is 'colgroup'
586                                 insertion_mode = ins_mode_in_column_group
587                                 return
588                         # 10. If node is a table element, then switch the insertion mode to
589                         # "in table" and abort these steps.
590                         if node.name is 'table'
591                                 insertion_mode = ins_mode_in_table
592                                 return
593                         # 11. If node is a template element, then switch the insertion mode
594                         # to the current template insertion mode and abort these steps.
595                         # fixfull (template insertion mode stack)
596
597                         # 12. If node is a head element and last is true, then switch the
598                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
599                         # these steps. (fragment case)
600                         if node.name is 'head' and last
601                                 insertion_mode = ins_mode_in_body
602                                 return
603                         # 13. If node is a head element and last is false, then switch the
604                         # insertion mode to "in head" and abort these steps.
605                         if node.name is 'head' and last is false
606                                 insertion_mode = ins_mode_in_head
607                                 return
608                         # 14. If node is a body element, then switch the insertion mode to
609                         # "in body" and abort these steps.
610                         if node.name is 'body'
611                                 insertion_mode = ins_mode_in_body
612                                 return
613                         # 15. If node is a frameset element, then switch the insertion mode
614                         # to "in frameset" and abort these steps. (fragment case)
615                         if node.name is 'frameset'
616                                 insertion_mode = ins_mode_in_frameset
617                                 return
618                         # 16. If node is an html element, run these substeps:
619                         if node.name is 'html'
620                                 # 1. If the head element pointer is null, switch the insertion
621                                 # mode to "before head" and abort these steps. (fragment case)
622                                 if head_element_pointer is null
623                                         ins_mode = ins_mode_before_head
624                                 else
625                                         # 2. Otherwise, the head element pointer is not null,
626                                         # switch the insertion mode to "after head" and abort these
627                                         # steps.
628                                         insertion_mode = ins_mode_after_head
629                                 return
630                         # 17. If last is true, then switch the insertion mode to "in body"
631                         # and abort these steps. (fragment case)
632                         if last
633                                 insertion_mode = ins_mode_in_body
634                                 return
635                         # 18. Let node now be the node before node in the stack of open
636                         # elements.
637                         node_i += 1
638                         node = open_els[node_i]
639                         # 19. Return to the step labeled loop.
640
641         # 8.2.3.2
642
643         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
644         adjusted_current_node = ->
645                 if open_els.length is 1 and flag_fragment_parsing
646                         return context_element
647                 return open_els[0]
648
649         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
650         # this implementation is structured (mostly) as described at the link above.
651         # capitalized comments are the "labels" described at the link above.
652         reconstruct_active_formatting_elements = ->
653                 return if afe.length is 0
654                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
655                         return
656                 # Rewind
657                 i = 0
658                 loop
659                         if i is afe.length - 1
660                                 break
661                         i += 1
662                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
663                                 i -= 1 # Advance
664                                 break
665                 # Create
666                 loop
667                         el = insert_html_element afe[i].token
668                         afe[i] = el
669                         break if i is 0
670                         i -= 1 # Advance
671
672         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
673         # adoption agency algorithm
674         # overview here:
675         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
676         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
677         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
678         adoption_agency = (subject) ->
679                 debug_log "adoption_agency()"
680                 debug_log "tree: #{serialize_els doc.children, false, true}"
681                 debug_log "open_els: #{serialize_els open_els, true, true}"
682                 debug_log "afe: #{serialize_els afe, true, true}"
683                 if open_els[0].name is subject
684                         el = open_els[0]
685                         open_els.shift()
686                         # remove it from the list of active formatting elements (if found)
687                         for t, i in afe
688                                 if t is el
689                                         afe.splice i, 1
690                                         break
691                         debug_log "aaa: starting off with subject on top of stack, exiting"
692                         return
693                 outer = 0
694                 loop
695                         if outer >= 8
696                                 return
697                         outer += 1
698                         # 5. Let formatting element be the last element in the list of
699                         # active formatting elements that: is between the end of the list
700                         # and the last scope marker in the list, if any, or the start of
701                         # the list otherwise, and  has the tag name subject.
702                         fe = null
703                         for t, fe_of_afe in afe
704                                 if t.type is TYPE_AFE_MARKER
705                                         break
706                                 if t.name is subject
707                                         fe = t
708                                         break
709                         # If there is no such element, then abort these steps and instead
710                         # act as described in the "any other end tag" entry above.
711                         if fe is null
712                                 debug_log "aaa: fe not found in afe"
713                                 in_body_any_other_end_tag subject
714                                 return
715                         # 6. If formatting element is not in the stack of open elements,
716                         # then this is a parse error; remove the element from the list, and
717                         # abort these steps.
718                         in_open_els = false
719                         for t, fe_of_open_els in open_els
720                                 if t is fe
721                                         in_open_els = true
722                                         break
723                         unless in_open_els
724                                 debug_log "aaa: fe not found in open_els"
725                                 parse_error()
726                                 # "remove it from the list" must mean afe, since it's not in open_els
727                                 afe.splice fe_of_afe, 1
728                                 return
729                         # 7. If formatting element is in the stack of open elements, but
730                         # the element is not in scope, then this is a parse error; abort
731                         # these steps.
732                         unless el_is_in_scope fe
733                                 debug_log "aaa: fe not in scope"
734                                 parse_error()
735                                 return
736                         # 8. If formatting element is not the current node, this is a parse
737                         # error. (But do not abort these steps.)
738                         unless open_els[0] is fe
739                                 parse_error()
740                                 # continue
741                         # 9. Let furthest block be the topmost node in the stack of open
742                         # elements that is lower in the stack than formatting element, and
743                         # is an element in the special category. There might not be one.
744                         fb = null
745                         fb_of_open_els = null
746                         for t, i in open_els
747                                 if t is fe
748                                         break
749                                 if el_is_special t
750                                         fb = t
751                                         fb_of_open_els = i
752                                         # and continue, to see if there's one that's more "topmost"
753                         # 10. If there is no furthest block, then the UA must first pop all
754                         # the nodes from the bottom of the stack of open elements, from the
755                         # current node up to and including formatting element, then remove
756                         # formatting element from the list of active formatting elements,
757                         # and finally abort these steps.
758                         if fb is null
759                                 debug_log "aaa: no fb"
760                                 loop
761                                         t = open_els.shift()
762                                         if t is fe
763                                                 afe.splice fe_of_afe, 1
764                                                 return
765                         # 11. Let common ancestor be the element immediately above
766                         # formatting element in the stack of open elements.
767                         ca = open_els[fe_of_open_els + 1] # common ancestor
768
769                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
770                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
771                         bookmark = new_aaa_bookmark()
772                         for t, i in afe
773                                 if t is fe
774                                         afe.splice i, 0, bookmark
775                                         break
776                         node = last_node = fb
777                         inner = 0
778                         loop
779                                 inner += 1
780                                 # 3. Let node be the element immediately above node in the
781                                 # stack of open elements, or if node is no longer in the stack
782                                 # of open elements (e.g. because it got removed by this
783                                 # algorithm), the element that was immediately above node in
784                                 # the stack of open elements before node was removed.
785                                 node_next = null
786                                 for t, i in open_els
787                                         if t is node
788                                                 node_next = open_els[i + 1]
789                                                 break
790                                 node = node_next ? node_above
791                                 debug_log "inner loop #{inner}"
792                                 debug_log "tree: #{serialize_els doc.children, false, true}"
793                                 debug_log "open_els: #{serialize_els open_els, true, true}"
794                                 debug_log "afe: #{serialize_els afe, true, true}"
795                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
796                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
797                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
798                                 debug_log "node: #{node.serialize true, true}"
799                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
800
801                                 # 4. If node is formatting element, then go to the next step in
802                                 # the overall algorithm.
803                                 if node is fe
804                                         break
805                                 debug_log "the meat"
806                                 # 5. If inner loop counter is greater than three and node is in
807                                 # the list of active formatting elements, then remove node from
808                                 # the list of active formatting elements.
809                                 node_in_afe = false
810                                 for t, i in afe
811                                         if t is node
812                                                 if inner > 3
813                                                         afe.splice i, 1
814                                                         debug_log "max out inner"
815                                                 else
816                                                         node_in_afe = true
817                                                         debug_log "in afe"
818                                                 break
819                                 # 6. If node is not in the list of active formatting elements,
820                                 # then remove node from the stack of open elements and then go
821                                 # back to the step labeled inner loop.
822                                 unless node_in_afe
823                                         debug_log "not in afe"
824                                         for t, i in open_els
825                                                 if t is node
826                                                         node_above = open_els[i + 1]
827                                                         open_els.splice i, 1
828                                                         break
829                                         continue
830                                 debug_log "the bones"
831                                 # 7. create an element for the token for which the element node
832                                 # was created, in the HTML namespace, with common ancestor as
833                                 # the intended parent; replace the entry for node in the list
834                                 # of active formatting elements with an entry for the new
835                                 # element, replace the entry for node in the stack of open
836                                 # elements with an entry for the new element, and let node be
837                                 # the new element.
838                                 new_node = token_to_element node.token, NS_HTML, ca
839                                 for t, i in afe
840                                         if t is node
841                                                 afe[i] = new_node
842                                                 debug_log "replaced in afe"
843                                                 break
844                                 for t, i in open_els
845                                         if t is node
846                                                 node_above = open_els[i + 1]
847                                                 open_els[i] = new_node
848                                                 debug_log "replaced in open_els"
849                                                 break
850                                 node = new_node
851                                 # 8. If last node is furthest block, then move the
852                                 # aforementioned bookmark to be immediately after the new node
853                                 # in the list of active formatting elements.
854                                 if last_node is fb
855                                         for t, i in afe
856                                                 if t is bookmark
857                                                         afe.splice i, 1
858                                                         debug_log "removed bookmark"
859                                                         break
860                                         for t, i in afe
861                                                 if t is node
862                                                         # "after" means lower
863                                                         afe.splice i, 0, bookmark # "after as <-
864                                                         debug_log "placed bookmark after node"
865                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
866                                                         break
867                                 # 9. Insert last node into node, first removing it from its
868                                 # previous parent node if any.
869                                 if last_node.parent?
870                                         debug_log "last_node has parent"
871                                         for c, i in last_node.parent.children
872                                                 if c is last_node
873                                                         debug_log "removing last_node from parent"
874                                                         last_node.parent.children.splice i, 1
875                                                         break
876                                 node.children.push last_node
877                                 last_node.parent = node
878                                 # 10. Let last node be node.
879                                 last_node = node
880                                 debug_log "at last"
881                                 # 11. Return to the step labeled inner loop.
882                         # 14. Insert whatever last node ended up being in the previous step
883                         # at the appropriate place for inserting a node, but using common
884                         # ancestor as the override target.
885
886                         # In the case where fe is immediately followed by fb:
887                         #   * inner loop exits out early (node==fe)
888                         #   * last_node is fb
889                         #   * last_node is still in the tree (not a duplicate)
890                         if last_node.parent?
891                                 debug_log "FEFIRST? last_node has parent"
892                                 for c, i in last_node.parent.children
893                                         if c is last_node
894                                                 debug_log "removing last_node from parent"
895                                                 last_node.parent.children.splice i, 1
896                                                 break
897
898                         debug_log "after aaa inner loop"
899                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
900                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
901                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
902                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
903                         debug_log "tree: #{serialize_els doc.children, false, true}"
904
905                         debug_log "insert"
906
907
908                         # can't use standard insert token thing, because it's already in
909                         # open_els and must stay at it's current position in open_els
910                         dest = adjusted_insertion_location ca
911                         dest[0].children.splice dest[1], 0, last_node
912                         last_node.parent = dest[0]
913
914
915                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
916                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
917                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
918                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
919                         debug_log "tree: #{serialize_els doc.children, false, true}"
920
921                         # 15. Create an element for the token for which formatting element
922                         # was created, in the HTML namespace, with furthest block as the
923                         # intended parent.
924                         new_element = token_to_element fe.token, NS_HTML, fb
925                         # 16. Take all of the child nodes of furthest block and append them
926                         # to the element created in the last step.
927                         while fb.children.length
928                                 t = fb.children.shift()
929                                 t.parent = new_element
930                                 new_element.children.push t
931                         # 17. Append that new element to furthest block.
932                         new_element.parent = fb
933                         fb.children.push new_element
934                         # 18. Remove formatting element from the list of active formatting
935                         # elements, and insert the new element into the list of active
936                         # formatting elements at the position of the aforementioned
937                         # bookmark.
938                         for t, i in afe
939                                 if t is fe
940                                         afe.splice i, 1
941                                         break
942                         for t, i in afe
943                                 if t is bookmark
944                                         afe[i] = new_element
945                                         break
946                         # 19. Remove formatting element from the stack of open elements,
947                         # and insert the new element into the stack of open elements
948                         # immediately below the position of furthest block in that stack.
949                         for t, i in open_els
950                                 if t is fe
951                                         open_els.splice i, 1
952                                         break
953                         for t, i in open_els
954                                 if t is fb
955                                         open_els.splice i, 0, new_element
956                                         break
957                         # 20. Jump back to the step labeled outer loop.
958                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
959                         debug_log "tree: #{serialize_els doc.children, false, true}"
960                         debug_log "open_els: #{serialize_els open_els, true, true}"
961                         debug_log "afe: #{serialize_els afe, true, true}"
962                 debug_log "AAA DONE"
963
964         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
965         close_p_element = ->
966                 generate_implied_end_tags 'p' # arg is exception
967                 if open_els[0].name isnt 'p'
968                         parse_error()
969                 while open_els.length > 1 # just in case
970                         el = open_els.shift()
971                         if el.name is 'p'
972                                 return
973         close_p_if_in_button_scope = ->
974                 if is_in_button_scope 'p'
975                         close_p_element()
976
977         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
978         # aka insert_a_character = (t) ->
979         insert_character = (t) ->
980                 dest = adjusted_insertion_location()
981                 # fixfull check for Document node
982                 if dest[1] > 0
983                         prev = dest[0].children[dest[1] - 1]
984                         if prev.type is TYPE_TEXT
985                                 prev.text += t.text
986                                 return
987                 dest[0].children.splice dest[1], 0, t
988
989         # 8.2.5.1
990         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
991         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
992         adjusted_insertion_location = (override_target = null) ->
993                 # 1. If there was an override target specified, then let target be the
994                 # override target.
995                 if override_target?
996                         target = override_target
997                 else # Otherwise, let target be the current node.
998                         target = open_els[0]
999                 # 2. Determine the adjusted insertion location using the first matching
1000                 # steps from the following list:
1001                 #
1002                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1003                 # thead, or tr element Foster parenting happens when content is
1004                 # misnested in tables.
1005                 if flag_foster_parenting and foster_parenting_targets[target.name]
1006                         loop # once. this is here so we can ``break`` to "abort these substeps"
1007                                 # 1. Let last template be the last template element in the
1008                                 # stack of open elements, if any.
1009                                 last_template = null
1010                                 last_template_i = null
1011                                 for el, i in open_els
1012                                         if el.name is 'template'
1013                                                 last_template = el
1014                                                 last_template_i = i
1015                                                 break
1016                                 # 2. Let last table be the last table element in the stack of
1017                                 # open elements, if any.
1018                                 last_table = null
1019                                 last_table_i
1020                                 for el, i in open_els
1021                                         if el.name is 'table'
1022                                                 last_table = el
1023                                                 last_table_i = i
1024                                                 break
1025                                 # 3. If there is a last template and either there is no last
1026                                 # table, or there is one, but last template is lower (more
1027                                 # recently added) than last table in the stack of open
1028                                 # elements, then: let adjusted insertion location be inside
1029                                 # last template's template contents, after its last child (if
1030                                 # any), and abort these substeps.
1031                                 if last_template and (last_table is null or last_template_i < last_table_i)
1032                                         target = last_template # fixfull should be it's contents
1033                                         target_i = target.children.length
1034                                         break
1035                                 # 4. If there is no last table, then let adjusted insertion
1036                                 # location be inside the first element in the stack of open
1037                                 # elements (the html element), after its last child (if any),
1038                                 # and abort these substeps. (fragment case)
1039                                 if last_table is null
1040                                         # this is odd
1041                                         target = open_els[open_els.length - 1]
1042                                         target_i = target.children.length
1043                                 # 5. If last table has a parent element, then let adjusted
1044                                 # insertion location be inside last table's parent element,
1045                                 # immediately before last table, and abort these substeps.
1046                                 if last_table.parent?
1047                                         for c, i in last_table.parent.children
1048                                                 if c is last_table
1049                                                         target = last_table.parent
1050                                                         target_i = i
1051                                                         break
1052                                         break
1053                                 # 6. Let previous element be the element immediately above last
1054                                 # table in the stack of open elements.
1055                                 #
1056                                 # huh? how could it not have a parent?
1057                                 previous_element = open_els[last_table_i + 1]
1058                                 # 7. Let adjusted insertion location be inside previous
1059                                 # element, after its last child (if any).
1060                                 target = previous_element
1061                                 target_i = target.children.length
1062                                 # Note: These steps are involved in part because it's possible
1063                                 # for elements, the table element in this case in particular,
1064                                 # to have been moved by a script around in the DOM, or indeed
1065                                 # removed from the DOM entirely, after the element was inserted
1066                                 # by the parser.
1067                                 break # don't really loop
1068                 else
1069                         # Otherwise Let adjusted insertion location be inside target, after
1070                         # its last child (if any).
1071                         target_i = target.children.length
1072
1073                 # 3. If the adjusted insertion location is inside a template element,
1074                 # let it instead be inside the template element's template contents,
1075                 # after its last child (if any).
1076                 # fixfull (template)
1077
1078                 # 4. Return the adjusted insertion location.
1079                 return [target, target_i]
1080
1081         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1082         # aka create_an_element_for_token
1083         token_to_element = (t, namespace, intended_parent) ->
1084                 # convert attributes into a hash
1085                 attrs = {}
1086                 for a in t.attrs_a
1087                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1088                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1089
1090                 # TODO 2. If the newly created element has an xmlns attribute in the
1091                 # XMLNS namespace whose value is not exactly the same as the element's
1092                 # namespace, that is a parse error. Similarly, if the newly created
1093                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1094                 # value is not the XLink Namespace, that is a parse error.
1095
1096                 # fixfull: the spec says stuff about form pointers and ownerDocument
1097
1098                 return el
1099
1100         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1101         insert_foreign_element = (token, namespace) ->
1102                 ail = adjusted_insertion_location()
1103                 ail_el = ail[0]
1104                 ail_i = ail[1]
1105                 el = token_to_element token, namespace, ail_el
1106                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1107                 el.parent = ail_el
1108                 ail_el.children.splice ail_i, 0, el
1109                 open_els.unshift el
1110                 return el
1111         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1112         insert_html_element = insert_foreign_element # (token, namespace) ->
1113
1114         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1115         # position should be [node, index_within_children]
1116         insert_comment = (t, position = null) ->
1117                 position ?= adjusted_insertion_location()
1118                 position[0].children.splice position[1], 0, t
1119
1120         # 8.2.5.2
1121         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1122         parse_generic_raw_text = (t) ->
1123                 insert_html_element t
1124                 tok_state = tok_state_rawtext
1125                 original_insertion_mode = insertion_mode
1126                 insertion_mode = ins_mode_text
1127         parse_generic_rcdata_text = (t) ->
1128                 insert_html_element t
1129                 tok_state = tok_state_rcdata
1130                 original_insertion_mode = insertion_mode
1131                 insertion_mode = ins_mode_text
1132
1133         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1134         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1135         generate_implied_end_tags = (except = null) ->
1136                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1137                         open_els.shift()
1138
1139         # 8.2.5.4 The rules for parsing tokens in HTML content
1140         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1141
1142         # 8.2.5.4.1 The "initial" insertion mode
1143         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1144         ins_mode_initial = (t) ->
1145                 if is_space_tok t
1146                         return
1147                 if t.type is TYPE_COMMENT
1148                         # ?fixfull
1149                         doc.children.push t
1150                         return
1151                 if t.type is TYPE_DOCTYPE
1152                         # FIXME check identifiers, set quirks, etc
1153                         # fixfull
1154                         doc.children.push t
1155                         insertion_mode = ins_mode_before_html
1156                         return
1157                 # Anything else
1158                 #fixfull (iframe, quirks)
1159                 insertion_mode = ins_mode_before_html
1160                 insertion_mode t # reprocess the token
1161                 return
1162
1163         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1164         ins_mode_before_html = (t) ->
1165                 if t.type is TYPE_DOCTYPE
1166                         parse_error()
1167                         return
1168                 if t.type is TYPE_COMMENT
1169                         doc.children.push t
1170                         return
1171                 if is_space_tok t
1172                         return
1173                 if t.type is TYPE_START_TAG and t.name is 'html'
1174                         el = token_to_element t, NS_HTML, doc
1175                         doc.children.push el
1176                         open_els.unshift(el)
1177                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1178                         insertion_mode = ins_mode_before_head
1179                         return
1180                 if t.type is TYPE_END_TAG
1181                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1182                                 # fall through to "anything else"
1183                         else
1184                                 parse_error()
1185                                 return
1186                 # Anything else
1187                 html_tok = new_open_tag 'html'
1188                 el = token_to_element html_tok, NS_HTML, doc
1189                 doc.children.push el
1190                 open_els.unshift el
1191                 # ?fixfull browsing context
1192                 insertion_mode = ins_mode_before_head
1193                 insertion_mode t
1194                 return
1195
1196         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1197         ins_mode_before_head = (t) ->
1198                 if is_space_tok t
1199                         return
1200                 if t.type is TYPE_COMMENT
1201                         insert_comment t
1202                         return
1203                 if t.type is TYPE_DOCTYPE
1204                         parse_error()
1205                         return
1206                 if t.type is TYPE_START_TAG and t.name is 'html'
1207                         ins_mode_in_body t
1208                         return
1209                 if t.type is TYPE_START_TAG and t.name is 'head'
1210                         el = insert_html_element t
1211                         head_element_pointer = el
1212                         insertion_mode = ins_mode_in_head
1213                 if t.type is TYPE_END_TAG
1214                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1215                                 # fall through to Anything else below
1216                         else
1217                                 parse_error()
1218                                 return
1219                 # Anything else
1220                 head_tok = new_open_tag 'head'
1221                 el = insert_html_element head_tok
1222                 head_element_pointer = el
1223                 insertion_mode = ins_mode_in_head
1224                 insertion_mode t # reprocess current token
1225
1226         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1227         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1228                 open_els.shift() # spec says this will be a 'head' node
1229                 insertion_mode = ins_mode_after_head
1230                 insertion_mode t
1231         ins_mode_in_head = (t) ->
1232                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1233                         insert_character t
1234                         return
1235                 if t.type is TYPE_COMMENT
1236                         insert_comment t
1237                         return
1238                 if t.type is TYPE_DOCTYPE
1239                         parse_error()
1240                         return
1241                 if t.type is TYPE_START_TAG and t.name is 'html'
1242                         ins_mode_in_body t
1243                         return
1244                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1245                         el = insert_html_element t
1246                         open_els.shift()
1247                         t.acknowledge_self_closing()
1248                         return
1249                 if t.type is TYPE_START_TAG and t.name is 'meta'
1250                         el = insert_html_element t
1251                         open_els.shift()
1252                         t.acknowledge_self_closing()
1253                         # fixfull encoding stuff
1254                         return
1255                 if t.type is TYPE_START_TAG and t.name is 'title'
1256                         parse_generic_rcdata_text t
1257                         return
1258                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1259                         parse_generic_raw_text t
1260                         return
1261                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1262                         insert_html_element t
1263                         insertion_mode = ins_mode_in_head_noscript
1264                         return
1265                 if t.type is TYPE_START_TAG and t.name is 'script'
1266                         ail = adjusted_insertion_location()
1267                         el = token_to_element t, NS_HTML, ail
1268                         el.flag 'parser-inserted', true
1269                         # fixfull frament case
1270                         ail[0].children.splice ail[1], 0, el
1271                         open_els.unshift el
1272                         tok_state = tok_state_script_data
1273                         original_insertion_mode = insertion_mode # make sure orig... is defined
1274                         insertion_mode = ins_mode_text
1275                         return
1276                 if t.type is TYPE_END_TAG and t.name is 'head'
1277                         open_els.shift() # will be a head element... spec says so
1278                         insertion_mode = ins_mode_after_head
1279                         return
1280                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1281                         ins_mode_in_head_else t
1282                         return
1283                 if t.type is TYPE_START_TAG and t.name is 'template'
1284                         insert_html_element t
1285                         afe_push_marker()
1286                         flag_frameset_ok = false
1287                         insertion_mode = ins_mode_in_template
1288                         template_insertion_modes.unshift ins_mode_in_template
1289                         return
1290                 if t.type is TYPE_END_TAG and t.name is 'template'
1291                         if template_tag_is_open()
1292                                 generate_implied_end_tags
1293                                 if open_els[0].name isnt 'template'
1294                                         parse_error()
1295                                 loop
1296                                         el = open_els.shift()
1297                                         if el.name is 'template'
1298                                                 break
1299                                 clear_afe_to_marker()
1300                                 template_insertion_modes.shift()
1301                                 reset_insertion_mode()
1302                         else
1303                                 parse_error()
1304                         return
1305                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1306                         parse_error()
1307                         return
1308                 ins_mode_in_head_else t
1309
1310         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1311         ins_mode_in_head_noscript_else = (t) ->
1312                 parse_error()
1313                 open_els.shift()
1314                 insertion_mode = ins_mode_in_head
1315                 insertion_mode t
1316         ins_mode_in_head_noscript = (t) ->
1317                 if t.type is TYPE_DOCTYPE
1318                         parse_error()
1319                         return
1320                 if t.type is TYPE_START_TAG
1321                         ins_mode_in_body t
1322                         return
1323                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1324                         open_els.shift()
1325                         insertion_mode = ins_mode_in_head
1326                         return
1327                 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1328                         ins_mode_in_head t
1329                         return
1330                 if t.type is TYPE_END_TAG and t.name is 'br'
1331                         ins_mode_in_head_noscript_else t
1332                         return
1333                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1334                         parse_error()
1335                         return
1336                 # Anything else
1337                 ins_mode_in_head_noscript_else t
1338                 return
1339
1340
1341
1342         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1343         ins_mode_after_head_else = (t) ->
1344                 body_tok = new_open_tag 'body'
1345                 insert_html_element body_tok
1346                 insertion_mode = ins_mode_in_body
1347                 insertion_mode t # reprocess token
1348                 return
1349         ins_mode_after_head = (t) ->
1350                 if is_space_tok t
1351                         insert_character t
1352                         return
1353                 if t.type is TYPE_COMMENT
1354                         insert_comment t
1355                         return
1356                 if t.type is TYPE_DOCTYPE
1357                         parse_error()
1358                         return
1359                 if t.type is TYPE_START_TAG and t.name is 'html'
1360                         ins_mode_in_body t
1361                         return
1362                 if t.type is TYPE_START_TAG and t.name is 'body'
1363                         insert_html_element t
1364                         flag_frameset_ok = false
1365                         insertion_mode = ins_mode_in_body
1366                         return
1367                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1368                         insert_html_element t
1369                         insertion_mode = ins_mode_in_frameset
1370                         return
1371                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1372                         parse_error()
1373                         open_els.unshift head_element_pointer
1374                         ins_mode_in_head t
1375                         for el, i of open_els
1376                                 if el is head_element_pointer
1377                                         open_els.splice i, 1
1378                                         return
1379                         console.log "warning: 23904 couldn't find head element in open_els"
1380                         return
1381                 if t.type is TYPE_END_TAG and t.name is 'template'
1382                         ins_mode_in_head t
1383                         return
1384                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1385                         ins_mode_after_head_else t
1386                         return
1387                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1388                         parse_error()
1389                         return
1390                 # Anything else
1391                 ins_mode_after_head_else t
1392
1393         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1394         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1395                 for el, i in open_els
1396                         if el.namespace is NS_HTML and el.name is name
1397                                 generate_implied_end_tags name # arg is exception
1398                                 parse_error() unless i is 0
1399                                 while i >= 0
1400                                         open_els.shift()
1401                                         i -= 1
1402                                 return
1403                         if special_elements[el.name] is el.namespace
1404                                 parse_error()
1405                                 return
1406                 return
1407         ins_mode_in_body = (t) ->
1408                 if t.type is TYPE_TEXT and t.text is "\u0000"
1409                         parse_error()
1410                         return
1411                 if is_space_tok t
1412                         reconstruct_active_formatting_elements()
1413                         insert_character t
1414                         return
1415                 if t.type is TYPE_TEXT
1416                         reconstruct_active_formatting_elements()
1417                         insert_character t
1418                         flag_frameset_ok = false
1419                         return
1420                 if t.type is TYPE_COMMENT
1421                         insert_comment t
1422                         return
1423                 if t.type is TYPE_DOCTYPE
1424                         parse_error()
1425                         return
1426                 if t.type is TYPE_START_TAG and t.name is 'html'
1427                         parse_error()
1428                         return if template_tag_is_open()
1429                         root_attrs = open_els[open_els.length - 1].attrs
1430                         for a of t.attrs_a
1431                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1432                         return
1433
1434                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1435                         ins_mode_in_head t
1436                         return
1437                 if t.type is TYPE_START_TAG and t.name is 'body'
1438                         parse_error()
1439                         return if open_els.length < 2
1440                         second = open_els[open_els.length - 2]
1441                         return unless second.ns is NS_HTML
1442                         return unless second.name is 'body'
1443                         return if template_tag_is_open()
1444                         frameset_ok_flag = false
1445                         for a of t.attrs_a
1446                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1447                         return
1448                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1449                         parse_error()
1450                         # FIXME CONTINUE
1451                         return
1452                 # FIXME CONTINUE
1453                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1454                         close_p_if_in_button_scope()
1455                         insert_html_element t
1456                         return
1457                 if t.type is TYPE_START_TAG and (t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6')
1458                         close_p_if_in_button_scope()
1459                         if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1460                                 parse_error()
1461                                 open_els.shift()
1462                         insert_html_element t
1463                         return
1464                 # FIXME CONTINUE
1465                 if t.type is TYPE_START_TAG and t.name is 'a'
1466                         # If the list of active formatting elements contains an a element
1467                         # between the end of the list and the last marker on the list (or
1468                         # the start of the list if there is no marker on the list), then
1469                         # this is a parse error; run the adoption agency algorithm for the
1470                         # tag name "a", then remove that element from the list of active
1471                         # formatting elements and the stack of open elements if the
1472                         # adoption agency algorithm didn't already remove it (it might not
1473                         # have if the element is not in table scope).
1474                         found = false
1475                         for el in afe
1476                                 if el.type is TYPE_AFE_MARKER
1477                                         break
1478                                 if el.name is 'a'
1479                                         found = el
1480                         if found?
1481                                 parse_error()
1482                                 adoption_agency 'a'
1483                                 for el, i in afe
1484                                         if el is found
1485                                                 afe.splice i, 1
1486                                 for el, i in open_els
1487                                         if el is found
1488                                                 open_els.splice i, 1
1489                         reconstruct_active_formatting_elements()
1490                         el = insert_html_element t
1491                         afe_push el
1492                         return
1493                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1494                         reconstruct_active_formatting_elements()
1495                         el = insert_html_element t
1496                         afe_push el
1497                         return
1498                 if t.type is TYPE_START_TAG and t.name is 'table'
1499                         # fixfull quirksmode thing
1500                         close_p_if_in_button_scope()
1501                         insert_html_element t
1502                         insertion_mode = ins_mode_in_table
1503                         return
1504                 # FIXME CONTINUE
1505                 if t.type is TYPE_EOF
1506                         ok_tags = {
1507                                 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1508                                 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1509                         }
1510                         for t in open_els
1511                                 unless ok_tags[t.name]?
1512                                         parse_error()
1513                                         break
1514                         # FIXME stack of template insertion modes thing
1515                         stop_parsing()
1516                         return
1517                 # FIXME CONTINUE some of these next ones are out of order I think
1518                 if t.type is TYPE_END_TAG and t.name is 'body'
1519                         unless is_in_scope 'body'
1520                                 parse_error()
1521                                 return
1522                         # fixme implement parse error and move to tree_after_body
1523                         return
1524                 if t.type is TYPE_END_TAG and t.name is 'html'
1525                         unless is_in_scope 'body' # weird, but it's what the spec says
1526                                 parse_error()
1527                                 return
1528                         # TODO implement parse error and move to tree_after_body, reprocess
1529                         return
1530                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1531                         unless is_in_scope t.name, NS_HTML
1532                                 parse_error()
1533                                 return
1534                         generate_implied_end_tags()
1535                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1536                                 parse_error()
1537                         loop
1538                                 el = open_els.shift()
1539                                 if el.name is t.name and el.namespace is NS_HTML
1540                                         return
1541                         return
1542                 if t.type is TYPE_END_TAG and t.name is 'p'
1543                         unless is_in_button_scope 'p'
1544                                 parse_error()
1545                                 insert_html_element new_open_tag 'p'
1546                         close_p_element()
1547                         return
1548                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1549                         adoption_agency t.name
1550                         return
1551                 if t.type is TYPE_START_TAG # any other start tag
1552                         reconstruct_active_formatting_elements()
1553                         insert_html_element t
1554                         return
1555                 if t.type is TYPE_END_TAG # any other end tag
1556                         in_body_any_other_end_tag t.name
1557                 return
1558
1559         ins_mode_in_table_else = (t) ->
1560                 parse_error()
1561                 flag_foster_parenting = true # FIXME
1562                 ins_mode_in_body t
1563                 flag_foster_parenting = false
1564         can_in_table = { # FIXME do this inline like everywhere else
1565                 'table': true
1566                 'tbody': true
1567                 'tfoot': true
1568                 'thead': true
1569                 'tr': true
1570         }
1571
1572         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1573         ins_mode_text = (t) ->
1574                 if t.type is TYPE_TEXT
1575                         insert_character t
1576                         return
1577                 if t.type is TYPE_EOF
1578                         parse_error()
1579                         if open_els[0].name is 'script'
1580                                 open_els[0].flag 'already started', true
1581                         open_els.shift()
1582                         insertion_mode = original_insertion_mode
1583                         insertion_mode t
1584                         return
1585                 if t.type is TYPE_END_TAG and t.name is 'script'
1586                         open_els.shift()
1587                         insertion_mode = original_insertion_mode
1588                         # fixfull the spec seems to assume that I'm going to run the script
1589                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1590                         return
1591                 if t.type is TYPE_END_TAG
1592                         open_els.shift()
1593                         insertion_mode = original_insertion_mode
1594                         return
1595                 console.log 'warning: end of ins_mode_text reached'
1596
1597         # the functions below implement the tokenizer stats described here:
1598         # http://www.w3.org/TR/html5/syntax.html#tokenization
1599
1600         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1601         ins_mode_in_table = (t) ->
1602                 switch t.type
1603                         when TYPE_TEXT
1604                                 if can_in_table[t.name]
1605                                         original_insertion_mode = insertion_mode
1606                                         insertion_mode = ins_mode_in_table_text
1607                                         insertion_mode t
1608                                 else
1609                                         ins_mode_in_table_else t
1610                         when TYPE_COMMENT
1611                                 insert_comment t
1612                         when TYPE_DOCTYPE
1613                                 parse_error()
1614                         when TYPE_START_TAG
1615                                 switch t.name
1616                                         when 'caption'
1617                                                 clear_stack_to_table_context()
1618                                                 afe_push_marker()
1619                                                 insert_html_element t
1620                                                 insertion_mode = ins_mode_in_caption
1621                                         when 'colgroup'
1622                                                 clear_stack_to_table_context()
1623                                                 insert_html_element t
1624                                                 insertion_mode = ins_mode_in_column_group
1625                                         when 'col'
1626                                                 clear_stack_to_table_context()
1627                                                 insert_html_element new_open_tag 'colgroup'
1628                                                 insertion_mode = ins_mode_in_column_group
1629                                                 insertion_mode t
1630                                         when 'tbody', 'tfoot', 'thead'
1631                                                 clear_stack_to_table_context()
1632                                                 insert_html_element t
1633                                                 insertion_mode = ins_mode_in_table_body
1634                                         when 'td', 'th', 'tr'
1635                                                 clear_stack_to_table_context()
1636                                                 insert_html_element new_open_tag 'tbody'
1637                                                 insertion_mode = ins_mode_in_table_body
1638                                                 insertion_mode t
1639                                         when 'table'
1640                                                 parse_error()
1641                                                 if is_in_table_scope 'table'
1642                                                         loop
1643                                                                 el = open_els.shift()
1644                                                                 if el.name is 'table'
1645                                                                         break
1646                                                         reset_insertion_mode()
1647                                                         insertion_mode t
1648                                         when 'style', 'script', 'template'
1649                                                 ins_mode_in_head t
1650                                         when 'input'
1651                                                 if is_input_hidden_tok t
1652                                                         ins_mode_in_table_else t
1653                                                 else
1654                                                         parse_error()
1655                                                         el = insert_html_element t
1656                                                         open_els.shift()
1657                                                         t.acknowledge_self_closing()
1658                                         when 'form'
1659                                                 parse_error()
1660                                                 if form_element_pointer?
1661                                                         return
1662                                                 if template_tag_is_open()
1663                                                         return
1664                                                 form_element_pointer = insert_html_element t
1665                                                 open_els.shift()
1666                                         else
1667                                                 ins_mode_in_table_else t
1668                         when TYPE_END_TAG
1669                                 switch t.name
1670                                         when 'table'
1671                                                 if is_in_table_scope 'table'
1672                                                         loop
1673                                                                 el = open_els.shift()
1674                                                                 if el.name is 'table'
1675                                                                         break
1676                                                         reset_insertion_mode()
1677                                                 else
1678                                                         parse_error
1679                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1680                                                 parse_error()
1681                                         when 'template'
1682                                                 ins_mode_in_head t
1683                                         else
1684                                                 ins_mode_in_table_else t
1685                         when TYPE_EOF
1686                                 ins_mode_in_body t
1687                         else
1688                                 ins_mode_in_table_else t
1689
1690
1691         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1692         ins_mode_in_table_text = (t) ->
1693                 if t.type is TYPE_TEXT and t.text is "\u0000"
1694                         # huh? I thought the tokenizer didn't emit these
1695                         parse_error()
1696                         return
1697                 if t.type is TYPE_TEXT
1698                         pending_table_character_tokens.push t
1699                         return
1700                 # Anything else
1701                 all_space = true
1702                 for old in pending_table_character_tokens
1703                         unless is_space_tok old
1704                                 all_space = false
1705                                 break
1706                 if all_space
1707                         for old in pending_table_character_tokens
1708                                 insert_character old
1709                 else
1710                         for old in pending_table_character_tokens
1711                                 ins_mode_table_else old
1712                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1713                 insertion_mode = original_insertion_mode
1714                 insertion_mode t
1715
1716         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1717         ins_mode_in_caption = (t) ->
1718                 if t.type is TYPE_END_TAG and t.name is 'caption'
1719                         if is_in_table_scope 'caption'
1720                                 generate_implied_end_tags()
1721                                 if open_els[0].name isnt 'caption'
1722                                         parse_error()
1723                                 loop
1724                                         el = open_els.shift()
1725                                         if el.name is 'caption'
1726                                                 break
1727                                 clear_afe_to_marker()
1728                                 insertion_mode = ins_mode_in_table
1729                         else
1730                                 parse_error()
1731                                 # fragment case
1732                         return
1733                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1734                         parse_error()
1735                         if is_in_table_scope 'caption'
1736                                 loop
1737                                         el = open_els.shift()
1738                                         if el.name is 'caption'
1739                                                 break
1740                                 clear_afe_to_marker()
1741                                 insertion_mode = ins_mode_in_table
1742                                 insertion_mode t
1743                         # else fragment case
1744                         return
1745                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1746                         parse_error()
1747                         return
1748                 # Anything else
1749                 ins_mode_in_body t
1750
1751         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1752         ins_mode_in_column_group = (t) ->
1753                 if is_space_tok t
1754                         insert_character t
1755                         return
1756                 if t.type is TYPE_COMMENT
1757                         insert_comment t
1758                         return
1759                 if t.type is TYPE_DOCTYPE
1760                         parse_error()
1761                         return
1762                 if t.type is TYPE_START_TAG and t.name is 'html'
1763                         ins_mode_in_body t
1764                         return
1765                 if t.type is TYPE_START_TAG and t.name is 'col'
1766                         el = insert_html_element t
1767                         open_els.shift()
1768                         t.acknowledge_self_closing()
1769                         return
1770                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1771                         if open_els[0].name is 'colgroup'
1772                                 open_els.shift()
1773                                 insertion_mode = ins_mode_in_table
1774                         else
1775                                 parse_error()
1776                         return
1777                 if t.type is TYPE_END_TAG and t.name is 'col'
1778                         parse_error()
1779                         return
1780                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1781                         ins_mode_in_head t
1782                         return
1783                 if t.type is TYPE_EOF
1784                         ins_mode_in_body t
1785                         return
1786                 # Anything else
1787                 if open_els[0].name isnt 'colgroup'
1788                         parse_error()
1789                         return
1790                 open_els.shift()
1791                 insertion_mode = ins_mode_in_table
1792                 insertion_mode t
1793                 return
1794
1795         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1796         ins_mode_in_table_body = (t) ->
1797                 if t.type is TYPE_START_TAG and t.name is 'tr'
1798                         clear_stack_to_table_body_context()
1799                         insert_html_element t
1800                         insertion_mode = ins_mode_in_row
1801                         return
1802                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1803                         parse_error()
1804                         clear_stack_to_table_body_context()
1805                         insert_html_element new_open_tag 'tr'
1806                         insertion_mode = ins_mode_in_row
1807                         insertion_mode t
1808                         return
1809                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1810                         unless is_in_table_scope t.name # fixfull check namespace
1811                                 parse_error()
1812                                 return
1813                         clear_stack_to_table_body_context()
1814                         open_els.shift()
1815                         insertion_mode = ins_mode_in_table
1816                         return
1817                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1818                         has = false
1819                         for el in open_els
1820                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1821                                         has = true
1822                                         break
1823                                 if table_scopers[el.name]
1824                                         break
1825                         if !has
1826                                 parse_error()
1827                                 return
1828                         clear_stack_to_table_body_context()
1829                         open_els.shift()
1830                         insertion_mode = ins_mode_in_table
1831                         insertion_mode t
1832                         return
1833                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1834                         parse_error()
1835                         return
1836                 # Anything else
1837                 ins_mode_in_table t
1838
1839         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1840         ins_mode_in_row = (t) ->
1841                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1842                         clear_stack_to_table_row_context()
1843                         insert_html_element t
1844                         insertion_mode = ins_mode_in_cell
1845                         afe_push_marker()
1846                         return
1847                 if t.type is TYPE_END_TAG and t.name is 'tr'
1848                         if is_in_table_scope 'tr'
1849                                 clear_stack_to_table_row_context()
1850                                 open_els.shift()
1851                                 insertion_mode = ins_mode_in_table_body
1852                         else
1853                                 parse_error()
1854                         return
1855                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1856                         if is_in_table_scope 'tr'
1857                                 clear_stack_to_table_row_context()
1858                                 open_els.shift()
1859                                 insertion_mode = ins_mode_in_table_body
1860                                 insertion_mode t
1861                         else
1862                                 parse_error()
1863                         return
1864                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1865                         if is_in_table_scope t.name # fixfull namespace
1866                                 if is_in_table_scope 'tr'
1867                                         clear_stack_to_table_row_context()
1868                                         open_els.shift()
1869                                         insertion_mode = ins_mode_in_table_body
1870                                         insertion_mode t
1871                         else
1872                                 parse_error()
1873                         return
1874                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1875                         parse_error()
1876                         return
1877                 # Anything else
1878                 ins_mode_in_table t
1879
1880         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1881         close_the_cell = ->
1882                 generate_implied_end_tags()
1883                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1884                         parse_error()
1885                 loop
1886                         el = open_els.shift()
1887                         if el.name is 'td' or el.name is 'th'
1888                                 break
1889                 clear_afe_to_marker()
1890                 insertion_mode = ins_mode_in_row
1891
1892         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1893         ins_mode_in_cell = (t) ->
1894                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1895                         if is_in_table_scope t.name
1896                                 generate_implied_end_tags()
1897                                 if open_els[0].name isnt t.name
1898                                         parse_error
1899                                 loop
1900                                         el = open_els.shift()
1901                                         if el.name is t.name
1902                                                 break
1903                                 clear_afe_to_marker()
1904                                 insertion_mode = ins_mode_in_row
1905                         else
1906                                 parse_error()
1907                         return
1908                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1909                         has = false
1910                         for el in open_els
1911                                 if el.name is 'td' or el.name is 'th'
1912                                         has = true
1913                                         break
1914                                 if table_scopers[el.name]
1915                                         break
1916                         if !has
1917                                 parse_error()
1918                                 return
1919                         close_the_cell()
1920                         insertion_mode t
1921                         return
1922                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1923                         parse_error()
1924                         return
1925                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1926                         if is_in_table_scope t.name # fixfull namespace
1927                                 close_the_cell()
1928                                 insertion_mode t
1929                         else
1930                                 parse_error()
1931                         return
1932                 # Anything Else
1933                 ins_mode_in_body t
1934
1935         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1936         ins_mode_in_select = (t) ->
1937                 if t.type is TYPE_TEXT and t.text is "\u0000"
1938                         parse_error()
1939                         return
1940                 if t.type is TYPE_TEXT
1941                         insert_character t
1942                         return
1943                 if t.type is TYPE_COMMENT
1944                         insert_comment t
1945                         return
1946                 if t.type is TYPE_DOCTYPE
1947                         parse_error()
1948                         return
1949                 if t.type is TYPE_START_TAG and t.name is 'html'
1950                         ins_mode_in_body t
1951                         return
1952                 if t.type is TYPE_START_TAG and t.name is 'option'
1953                         if open_els[0].name is 'option'
1954                                 open_els.shift()
1955                         insert_html_element t
1956                         return
1957                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1958                         if open_els[0].name is 'option'
1959                                 open_els.shift()
1960                         if open_els[0].name is 'optgroup'
1961                                 open_els.shift()
1962                         insert_html_element t
1963                         return
1964                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1965                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1966                                 open_els.shift()
1967                         if open_els[0].name is 'optgroup'
1968                                 open_els.shift()
1969                         else
1970                                 parse_error()
1971                         return
1972                 if t.type is TYPE_END_TAG and t.name is 'option'
1973                         if open_els[0].name is 'option'
1974                                 open_els.shift()
1975                         else
1976                                 parse_error()
1977                         return
1978                 if t.type is TYPE_END_TAG and t.name is 'select'
1979                         if is_in_select_scope 'select'
1980                                 loop
1981                                         el = open_els.shift()
1982                                         if el.name is 'select'
1983                                                 break
1984                                 reset_insertion_mode()
1985                         else
1986                                 parse_error()
1987                         return
1988                 if t.type is TYPE_START_TAG and t.name is 'select'
1989                         parse_error()
1990                         loop
1991                                 el = open_els.shift()
1992                                 if el.name is 'select'
1993                                         break
1994                         reset_insertion_mode()
1995                         # spec says that this is the same as </select> but it doesn't say
1996                         # to check scope first
1997                         return
1998                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1999                         parse_error()
2000                         if is_in_select_scope 'select'
2001                                 return
2002                         loop
2003                                 el = open_els.shift()
2004                                 if el.name is 'select'
2005                                         break
2006                         reset_insertion_mode()
2007                         insertion_mode t
2008                         return
2009                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2010                         ins_mode_in_head t
2011                         return
2012                 if t.type is TYPE_EOF
2013                         ins_mode_in_body t
2014                         return
2015                 # Anything else
2016                 parse_error()
2017                 return
2018
2019         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2020         ins_mode_in_select_in_table = (t) ->
2021                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2022                         parse_error()
2023                         loop
2024                                 el = open_els.shift()
2025                                 if el.name is 'select'
2026                                         break
2027                         reset_insertion_mode()
2028                         insertion_mode t
2029                         return
2030                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2031                         parse_error()
2032                         unless is_in_table_scope t.name, NS_HTML
2033                                 return
2034                         loop
2035                                 el = open_els.shift()
2036                                 if el.name is 'select'
2037                                         break
2038                         reset_insertion_mode()
2039                         insertion_mode t
2040                         return
2041                 # Anything else
2042                 ins_mode_in_select t
2043                 return
2044
2045         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2046         ins_mode_in_template = (t) ->
2047                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2048                         ins_mode_in_body t
2049                         return
2050                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2051                         ins_mode_in_head t
2052                         return
2053                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2054                         template_insertion_modes.shift()
2055                         template_insertion_modes.unshift ins_mode_in_table
2056                         insertion_mode = ins_mode_in_table
2057                         insertion_mode t
2058                         return
2059                 if t.type is TYPE_START_TAG and t.name is 'col'
2060                         template_insertion_modes.shift()
2061                         template_insertion_modes.unshift ins_mode_in_column_group
2062                         insertion_mode = ins_mode_in_column_group
2063                         insertion_mode t
2064                         return
2065                 if t.type is TYPE_START_TAG and t.name is 'tr'
2066                         template_insertion_modes.shift()
2067                         template_insertion_modes.unshift ins_mode_in_table_body
2068                         insertion_mode = ins_mode_in_table_body
2069                         insertion_mode t
2070                         return
2071                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2072                         template_insertion_modes.shift()
2073                         template_insertion_modes.unshift ins_mode_in_row
2074                         insertion_mode = ins_mode_in_row
2075                         insertion_mode t
2076                         return
2077                 if t.type is TYPE_START_TAG
2078                         template_insertion_modes.shift()
2079                         template_insertion_modes.unshift ins_mode_in_body
2080                         insertion_mode = ins_mode_in_body
2081                         insertion_mode t
2082                         return
2083                 if t.type is TYPE_END_TAG
2084                         parse_error()
2085                         return
2086                 if t.type is TYPE_EOF
2087                         unless template_tag_is_open()
2088                                 stop_parsing()
2089                                 return
2090                         parse_error()
2091                         loop
2092                                 el = open_els.shift()
2093                                 if el.name is 'template' # fixfull check namespace
2094                                         break
2095                         clear_afe_to_marker()
2096                         template_insertion_modes.shift()
2097                         reset_insertion_mode()
2098                         insertion_mode t
2099
2100         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2101         ins_mode_after_body = (t) ->
2102                 if is_space_tok t
2103                         ins_mode_in_body t
2104                         return
2105                 if t.type is TYPE_COMMENT
2106                         insert_comment t, [open_els[0], open_els[0].children.length]
2107                         return
2108                 if t.type is TYPE_DOCTYPE
2109                         parse_error()
2110                         return
2111                 if t.type is TYPE_START_TAG and t.name is 'html'
2112                         ins_mode_in_body t
2113                         return
2114                 if t.type is TYPE_END_TAG and t.name is 'html'
2115                         # fixfull fragment case
2116                         insertion_mode = ins_mode_after_after_body
2117                         return
2118                 if t.type is TYPE_EOF
2119                         stop_parsing()
2120                         return
2121                 # Anything ELse
2122                 parse_error()
2123                 insertion_mode = ins_mode_in_body
2124                 insertion_mode t
2125
2126         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2127         ins_mode_in_frameset = (t) ->
2128                 if is_space_tok t
2129                         insert_character t
2130                         return
2131                 if t.type is TYPE_COMMENT
2132                         insert_comment t
2133                         return
2134                 if t.type is TYPE_DOCTYPE
2135                         parse_error()
2136                         return
2137                 if t.type is TYPE_START_TAG and t.name is 'html'
2138                         ins_mode_in_body t
2139                         return
2140                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2141                         insert_html_element t
2142                         return
2143                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2144                         # TODO ?correct for: "if the current node is the root html element"
2145                         if open_els.length is 1
2146                                 parse_error()
2147                                 return # fragment case
2148                         open_els.shift()
2149                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2150                                 insertion_mode = ins_mode_after_frameset
2151                         return
2152                 if t.type is TYPE_START_TAG and t.name is 'frame'
2153                         insert_html_element t
2154                         open_els.shift()
2155                         t.acknowledge_self_closing()
2156                         return
2157                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2158                         ins_mode_in_head t
2159                         return
2160                 if t.type is TYPE_EOF
2161                         # TODO ?correct for: "if the current node is not the root html element"
2162                         if open_els.length isnt 1
2163                                 parse_error()
2164                         stop_parsing()
2165                         return
2166                 # Anything else
2167                 parse_error()
2168                 return
2169
2170         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2171         ins_mode_after_frameset = (t) ->
2172                 if is_space_tok t
2173                         insert_character t
2174                         return
2175                 if t.type is TYPE_COMMENT
2176                         insert_comment t
2177                         return
2178                 if t.type is TYPE_DOCTYPE
2179                         parse_error()
2180                         return
2181                 if t.type is TYPE_START_TAG and t.name is 'html'
2182                         ins_mode_in_body t
2183                         return
2184                 if t.type is TYPE_END_TAG and t.name is 'html'
2185                         insert_mode = ins_mode_after_after_frameset
2186                         return
2187                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2188                         ins_mode_in_head t
2189                         return
2190                 if t.type is TYPE_EOF
2191                         stop_parsing()
2192                         return
2193                 # Anything else
2194                 parse_error()
2195                 return
2196
2197         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2198         ins_mode_after_after_body = (t) ->
2199                 if t.type is TYPE_COMMENT
2200                         insert_comment t, [doc, doc.children.length]
2201                         return
2202                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2203                         ins_mode_in_body t
2204                         return
2205                 if t.type is TYPE_EOF
2206                         stop_parsing()
2207                         return
2208                 # Anything else
2209                 parse_error()
2210                 insertion_mode = ins_mode_in_body
2211                 return
2212
2213         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2214         ins_mode_after_after_frameset = (t) ->
2215                 if t.type is TYPE_COMMENT
2216                         insert_comment t, [doc, doc.children.length]
2217                         return
2218                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2219                         ins_mode_in_body t
2220                         return
2221                 if t.type is TYPE_EOF
2222                         stop_parsing()
2223                         return
2224                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2225                         ins_mode_in_head t
2226                         return
2227                 # Anything else
2228                 parse_error()
2229                 return
2230
2231
2232
2233
2234
2235         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2236         tok_state_data = ->
2237                 switch c = txt.charAt(cur++)
2238                         when '&'
2239                                 return new_text_node parse_character_reference()
2240                         when '<'
2241                                 tok_state = tok_state_tag_open
2242                         when "\u0000"
2243                                 parse_error()
2244                                 return new_text_node c
2245                         when '' # EOF
2246                                 return new_eof_token()
2247                         else
2248                                 return new_text_node c
2249                 return null
2250
2251         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2252         # not needed: tok_state_character_reference_in_data = ->
2253         # just call parse_character_reference()
2254
2255         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2256         tok_state_rcdata = ->
2257                 switch c = txt.charAt(cur++)
2258                         when '&'
2259                                 return new_text_node parse_character_reference()
2260                         when '<'
2261                                 tok_state = tok_state_rcdata_less_than_sign
2262                         when "\u0000"
2263                                 parse_error()
2264                                 return new_character_token "\ufffd"
2265                         when '' # EOF
2266                                 return new_eof_token()
2267                         else
2268                                 return new_character_token c
2269                 return null
2270
2271         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2272         # not needed: tok_state_character_reference_in_rcdata = ->
2273         # just call parse_character_reference()
2274
2275         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2276         tok_state_rawtext = ->
2277                 switch c = txt.charAt(cur++)
2278                         when '<'
2279                                 tok_state = tok_state_rawtext_less_than_sign
2280                         when "\u0000"
2281                                 parse_error()
2282                                 return new_character_token "\ufffd"
2283                         when '' # EOF
2284                                 return new_eof_token()
2285                         else
2286                                 return new_character_token c
2287                 return null
2288
2289         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2290         tok_state_script_data = ->
2291                 switch c = txt.charAt(cur++)
2292                         when '<'
2293                                 tok_state = tok_state_script_data_less_than_sign
2294                         when "\u0000"
2295                                 parse_error()
2296                                 return new_character_token "\ufffd"
2297                         when '' # EOF
2298                                 return new_eof_token()
2299                         else
2300                                 return new_character_token c
2301                 return null
2302
2303         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2304         tok_state_plaintext = ->
2305                 switch c = txt.charAt(cur++)
2306                         when "\u0000"
2307                                 parse_error()
2308                                 return new_character_token "\ufffd"
2309                         when '' # EOF
2310                                 return new_eof_token()
2311                         else
2312                                 return new_character_token c
2313                 return null
2314
2315
2316         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2317         tok_state_tag_open = ->
2318                 switch c = txt.charAt(cur++)
2319                         when '!'
2320                                 tok_state = tok_state_markup_declaration_open
2321                         when '/'
2322                                 tok_state = tok_state_end_tag_open
2323                         when '?'
2324                                 parse_error()
2325                                 tok_cur_tag = new_comment_token '?'
2326                                 tok_state = tok_state_bogus_comment
2327                         else
2328                                 if is_lc_alpha(c)
2329                                         tok_cur_tag = new_open_tag c
2330                                         tok_state = tok_state_tag_name
2331                                 else if is_uc_alpha(c)
2332                                         tok_cur_tag = new_open_tag c.toLowerCase()
2333                                         tok_state = tok_state_tag_name
2334                                 else
2335                                         parse_error()
2336                                         tok_state = tok_state_data
2337                                         cur -= 1 # we didn't parse/handle the char after <
2338                                         return new_text_node '<'
2339                 return null
2340
2341         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2342         tok_state_end_tag_open = ->
2343                 switch c = txt.charAt(cur++)
2344                         when '>'
2345                                 parse_error()
2346                                 tok_state = tok_state_data
2347                         when '' # EOF
2348                                 parse_error()
2349                                 tok_state = tok_state_data
2350                                 return new_text_node '</'
2351                         else
2352                                 if is_uc_alpha(c)
2353                                         tok_cur_tag = new_end_tag c.toLowerCase()
2354                                         tok_state = tok_state_tag_name
2355                                 else if is_lc_alpha(c)
2356                                         tok_cur_tag = new_end_tag c
2357                                         tok_state = tok_state_tag_name
2358                                 else
2359                                         parse_error()
2360                                         tok_cur_tag = new_comment_token '/'
2361                                         tok_state = tok_state_bogus_comment
2362                 return null
2363
2364         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2365         tok_state_tag_name = ->
2366                 switch c = txt.charAt(cur++)
2367                         when "\t", "\n", "\u000c", ' '
2368                                 tok_state = tok_state_before_attribute_name
2369                         when '/'
2370                                 tok_state = tok_state_self_closing_start_tag
2371                         when '>'
2372                                 tok_state = tok_state_data
2373                                 tmp = tok_cur_tag
2374                                 tok_cur_tag = null
2375                                 return tmp
2376                         when "\u0000"
2377                                 parse_error()
2378                                 tok_cur_tag.name += "\ufffd"
2379                         when '' # EOF
2380                                 parse_error()
2381                                 tok_state = tok_state_data
2382                         else
2383                                 if is_uc_alpha(c)
2384                                         tok_cur_tag.name += c.toLowerCase()
2385                                 else
2386                                         tok_cur_tag.name += c
2387                 return null
2388
2389         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2390         tok_state_rcdata_less_than_sign = ->
2391                 c = txt.charAt(cur++)
2392                 if c is '/'
2393                         temporary_buffer = ''
2394                         tok_state = tok_state_rcdata_end_tag_open
2395                         return null
2396                 # Anything else
2397                 tok_state = tok_state_rcdata
2398                 cur -= 1 # reconsume the input character
2399                 return new_character_token '<'
2400
2401         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2402         tok_state_rcdata_end_tag_open = ->
2403                 c = txt.charAt(cur++)
2404                 if is_uc_alpha(c)
2405                         tok_cur_tag = new_end_tag c.toLowerCase()
2406                         temporary_buffer += c
2407                         tok_state = tok_state_rcdata_end_tag_name
2408                         return null
2409                 if is_lc_alpha(c)
2410                         tok_cur_tag = new_end_tag c
2411                         temporary_buffer += c
2412                         tok_state = tok_state_rcdata_end_tag_name
2413                         return null
2414                 # Anything else
2415                 tok_state = tok_state_rcdata
2416                 cur -= 1 # reconsume the input character
2417                 return new_character_token "</" # fixfull separate these
2418
2419         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2420         is_appropriate_end_tag = (t) ->
2421                 # spec says to check against "the tag name of the last start tag to
2422                 # have been emitted from this tokenizer", but this is only called from
2423                 # the various "raw" states, which I'm pretty sure all push the start
2424                 # token onto open_els. TODO: verify this after the script data states
2425                 # are implemented
2426                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2427                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2428
2429         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2430         tok_state_rcdata_end_tag_name = ->
2431                 c = txt.charAt(cur++)
2432                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2433                         if is_appropriate_end_tag tok_cur_tag
2434                                 tok_state = tok_state_before_attribute_name
2435                                 return
2436                         # else fall through to "Anything else"
2437                 if c is '/'
2438                         if is_appropriate_end_tag tok_cur_tag
2439                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2440                                 return
2441                         # else fall through to "Anything else"
2442                 if c is '>'
2443                         if is_appropriate_end_tag tok_cur_tag
2444                                 tok_state = tok_state_data
2445                                 return tok_cur_tag
2446                         # else fall through to "Anything else"
2447                 if is_uc_alpha(c)
2448                         tok_cur_tag.name += c.toLowerCase()
2449                         temporary_buffer += c
2450                         return null
2451                 if is_lc_alpha(c)
2452                         tok_cur_tag.name += c
2453                         temporary_buffer += c
2454                         return null
2455                 # Anything else
2456                 tok_state = tok_state_rcdata
2457                 cur -= 1 # reconsume the input character
2458                 return new_character_token '</' + temporary_buffer # fixfull separate these
2459
2460         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2461         tok_state_rawtext_less_than_sign = ->
2462                 c = txt.charAt(cur++)
2463                 if c is '/'
2464                         temporary_buffer = ''
2465                         tok_state = tok_state_rawtext_end_tag_open
2466                         return null
2467                 # Anything else
2468                 tok_state = tok_state_rawtext
2469                 cur -= 1 # reconsume the input character
2470                 return new_character_token '<'
2471
2472         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2473         tok_state_rawtext_end_tag_open = ->
2474                 c = txt.charAt(cur++)
2475                 if is_uc_alpha(c)
2476                         tok_cur_tag = new_end_tag c.toLowerCase()
2477                         temporary_buffer += c
2478                         tok_state = tok_state_rawtext_end_tag_name
2479                         return null
2480                 if is_lc_alpha(c)
2481                         tok_cur_tag = new_end_tag c
2482                         temporary_buffer += c
2483                         tok_state = tok_state_rawtext_end_tag_name
2484                         return null
2485                 # Anything else
2486                 tok_state = tok_state_rawtext
2487                 cur -= 1 # reconsume the input character
2488                 return new_character_token "</" # fixfull separate these
2489
2490         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2491         tok_state_rawtext_end_tag_name = ->
2492                 c = txt.charAt(cur++)
2493                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2494                         if is_appropriate_end_tag tok_cur_tag
2495                                 tok_state = tok_state_before_attribute_name
2496                                 return
2497                         # else fall through to "Anything else"
2498                 if c is '/'
2499                         if is_appropriate_end_tag tok_cur_tag
2500                                 tok_state = tok_state_self_closing_start_tag
2501                                 return
2502                         # else fall through to "Anything else"
2503                 if c is '>'
2504                         if is_appropriate_end_tag tok_cur_tag
2505                                 tok_state = tok_state_data
2506                                 return tok_cur_tag
2507                         # else fall through to "Anything else"
2508                 if is_uc_alpha(c)
2509                         tok_cur_tag.name += c.toLowerCase()
2510                         temporary_buffer += c
2511                         return null
2512                 if is_lc_alpha(c)
2513                         tok_cur_tag.name += c
2514                         temporary_buffer += c
2515                         return null
2516                 # Anything else
2517                 tok_state = tok_state_rawtext
2518                 cur -= 1 # reconsume the input character
2519                 return new_character_token '</' + temporary_buffer # fixfull separate these
2520
2521         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2522         tok_state_script_data_less_than_sign = ->
2523                 c = txt.charAt(cur++)
2524                 if c is '/'
2525                         temporary_buffer = ''
2526                         tok_state = tok_state_script_data_end_tag_open
2527                         return
2528                 if c is '!'
2529                         tok_state = tok_state_script_data_escape_start
2530                         return new_character_token '<!' # fixfull split
2531                 # Anything else
2532                 tok_state = tok_state_script_data
2533                 cur -= 1 # Reconsume
2534                 return new_character_token '<'
2535
2536         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2537         tok_state_script_data_end_tag_open = ->
2538                 c = txt.charAt(cur++)
2539                 if is_uc_alpha(c)
2540                         tok_cur_tag = new_end_tag c.toLowerCase()
2541                         temporary_buffer += c
2542                         tok_state = tok_state_script_data_end_tag_name
2543                         return
2544                 if is_lc_alpha(c)
2545                         tok_cur_tag = new_end_tag c
2546                         temporary_buffer += c
2547                         tok_state = tok_state_script_data_end_tag_name
2548                         return
2549                 # Anything else
2550                 tok_state = tok_state_script_data
2551                 cur -= 1 # Reconsume
2552                 return new_character_token '</'
2553
2554         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2555         tok_state_script_data_end_tag_name = ->
2556                 c = txt.charAt(cur++)
2557                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2558                         if is_appropriate_end_tag tok_cur_tag
2559                                 tok_state = tok_state_before_attribute_name
2560                                 return
2561                         # fall through
2562                 if c is '/'
2563                         if is_appropriate_end_tag tok_cur_tag
2564                                 tok_state = tok_state_self_closing_start_tag
2565                                 return
2566                         # fall through
2567                 if is_uc_alpha(c)
2568                         tok_cur_tag.name += c.toLowerCase()
2569                         temporary_buffer += c
2570                         return
2571                 if is_lc_alpha(c)
2572                         tok_cur_tag.name += c
2573                         temporary_buffer += c
2574                         return
2575                 # Anything else
2576                 tok_state = tok_state_script_data
2577                 cur -= 1 # Reconsume
2578                 return new_character_token "</#{temporary_buffer}" # fixfull split
2579
2580         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
2581         tok_state_script_data_escape_start = ->
2582                 c = txt.charAt(cur++)
2583                 if c is '-'
2584                         tok_state = tok_state_script_data_escape_start_dash
2585                         return new_character_token '-'
2586                 # Anything else
2587                 tok_state = tok_state_script_data
2588                 cur -= 1 # Reconsume
2589                 return
2590
2591         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
2592         tok_state_script_data_escape_start_dash = ->
2593                 c = txt.charAt(cur++)
2594                 if c is '-'
2595                         tok_state = tok_state_script_data_escaped_dash_dash
2596                         return new_character_token '-'
2597                 # Anything else
2598                 tok_state = tok_state_script_data
2599                 cur -= 1 # Reconsume
2600                 return
2601
2602         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
2603         tok_state_script_data_escaped = ->
2604                 c = txt.charAt(cur++)
2605                 if c is '-'
2606                         tok_state = tok_state_script_data_escaped_dash
2607                         return new_character_token '-'
2608                 if c is '<'
2609                         tok_state = tok_state_script_data_escaped_less_than_sign
2610                         return
2611                 if c is "\u0000"
2612                         parse_error()
2613                         return new_character_token "\ufffd"
2614                 if c is '' # EOF
2615                         tok_state = tok_state_data
2616                         parse_error()
2617                         cur -= 1 # Reconsume
2618                         return
2619                 # Anything else
2620                 return new_character_token c
2621
2622         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
2623         tok_state_script_data_escaped_dash = ->
2624                 c = txt.charAt(cur++)
2625                 if c is '-'
2626                         tok_state = tok_state_script_data_escaped_dash_dash
2627                         return new_character_token '-'
2628                 if c is '<'
2629                         tok_state = tok_state_script_data_escaped_less_than_sign
2630                         return
2631                 if c is "\u0000"
2632                         parse_error()
2633                         tok_state = tok_state_script_data_escaped
2634                         return new_character_token "\ufffd"
2635                 if c is '' # EOF
2636                         tok_state = tok_state_data
2637                         parse_error()
2638                         cur -= 1 # Reconsume
2639                         return
2640                 # Anything else
2641                 tok_state = tok_state_script_data_escaped
2642                 return new_character_token c
2643
2644         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
2645         tok_state_script_data_escaped_dash_dash = ->
2646                 c = txt.charAt(cur++)
2647                 if c is '-'
2648                         return new_character_token '-'
2649                 if c is '<'
2650                         tok_state = tok_state_script_data_escaped_less_than_sign
2651                         return
2652                 if c is '>'
2653                         tok_state = tok_state_script_data
2654                         return new_character_token '>'
2655                 if c is "\u0000"
2656                         parse_error()
2657                         tok_state = tok_state_script_data_escaped
2658                         return new_character_token "\ufffd"
2659                 if c is '' # EOF
2660                         parse_error()
2661                         tok_state = tok_state_data
2662                         cur -= 1 # Reconsume
2663                         return
2664                 # Anything else
2665                 tok_state = tok_state_script_data_escaped
2666                 return new_character_token c
2667
2668         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
2669         tok_state_script_data_escaped_less_than_sign = ->
2670                 c = txt.charAt(cur++)
2671                 if c is '/'
2672                         temporary_buffer = ''
2673                         tok_state = tok_state_script_data_escaped_end_tag_open
2674                         return
2675                 if is_uc_alpha(c)
2676                         temporary_buffer = c.toLowerCase() # yes, really
2677                         tok_state = tok_state_script_data_double_escape_start
2678                         return new_character_token "<#{c}" # fixfull split
2679                 if is_lc_alpha(c)
2680                         temporary_buffer = c
2681                         tok_state = tok_state_script_data_double_escape_start
2682                         return new_character_token "<#{c}" # fixfull split
2683                 # Anything else
2684                 tok_state = tok_state_script_data_escaped
2685                 cur -= 1 # Reconsume
2686                 return new_character_token c
2687
2688         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
2689         tok_state_script_data_escaped_end_tag_open = ->
2690                 c = txt.charAt(cur++)
2691                 if is_uc_alpha(c)
2692                         tok_cur_tag = new_end_tag c.toLowerCase()
2693                         temporary_buffer += c
2694                         tok_state = tok_state_script_data_escaped_end_tag_name
2695                         return
2696                 if is_lc_alpha(c)
2697                         tok_cur_tag = new_end_tag c
2698                         temporary_buffer += c
2699                         tok_state = tok_state_script_data_escaped_end_tag_name
2700                         return
2701                 # Anything else
2702                 tok_state = tok_state_script_data_escaped
2703                 cur -= 1 # Reconsume
2704                 return new_character_token '</' # fixfull split
2705
2706         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
2707         tok_state_script_data_escaped_end_tag_name = ->
2708                 c = txt.charAt(cur++)
2709                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2710                         if is_appropriate_end_tag tok_cur_tag
2711                                 tok_state = tok_state_before_attribute_name
2712                                 return
2713                         # fall through
2714                 if c is '/'
2715                         if is_appropriate_end_tag tok_cur_tag
2716                                 tok_state = tok_state_self_closing_start_tag
2717                                 return
2718                         # fall through
2719                 if is_uc_alpha(c)
2720                         tok_cur_tag.name += c.toLowerCase()
2721                         temporary_buffer += c.toLowerCase()
2722                         return
2723                 if is_lc_alpha(c)
2724                         tok_cur_tag.name += c
2725                         temporary_buffer += c.toLowerCase()
2726                         return
2727                 # Anything else
2728                 tok_state = tok_state_script_data_escaped
2729                 cur -= 1 # Reconsume
2730                 return new_character_token "</#{temporary_buffer}" # fixfull split
2731
2732         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
2733         tok_state_script_data_double_escape_start = ->
2734                 c = txt.charAt(cur++)
2735                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2736                         if temporary_buffer is 'script'
2737                                 tok_state = tok_state_script_data_double_escaped
2738                         else
2739                                 tok_state = tok_state_script_data_escaped
2740                         return new_character_token c
2741                 if is_uc_alpha(c)
2742                         temporary_buffer += c.toLowerCase() # yes, really lowercase
2743                         return new_character_token c
2744                 if is_lc_alpha(c)
2745                         temporary_buffer += c
2746                         return new_character_token c
2747                 # Anything else
2748                 tok_state = tok_state_script_data_escaped
2749                 cur -= 1 # Reconsume
2750                 return
2751
2752         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
2753         tok_state_script_data_double_escaped = ->
2754                 c = txt.charAt(cur++)
2755                 if c is '-'
2756                         tok_state = tok_state_script_data_double_escaped_dash
2757                         return new_character_token '-'
2758                 if c is '<'
2759                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2760                         return new_character_token '<'
2761                 if c is "\u0000"
2762                         parse_error()
2763                         return new_character_token "\ufffd"
2764                 if c is '' # EOF
2765                         parse_error()
2766                         tok_state = tok_state_data
2767                         cur -= 1 # Reconsume
2768                         return
2769                 # Anything else
2770                 return new_character_token c
2771
2772         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
2773         tok_state_script_data_double_escaped_dash = ->
2774                 c = txt.charAt(cur++)
2775                 if c is '-'
2776                         tok_state = tok_state_script_data_double_escaped_dash_dash
2777                         return new_character_token '-'
2778                 if c is '<'
2779                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2780                         return new_character_token '<'
2781                 if c is "\u0000"
2782                         parse_error()
2783                         tok_state = tok_state_script_data_double_escaped
2784                         return new_character_token "\ufffd"
2785                 if c is '' # EOF
2786                         parse_error()
2787                         tok_state = tok_state_data
2788                         cur -= 1 # Reconsume
2789                         return
2790                 # Anything else
2791                 tok_state = tok_state_script_data_double_escaped
2792                 return new_character_token c
2793
2794         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
2795         tok_state_script_data_double_escaped_dash_dash = ->
2796                 c = txt.charAt(cur++)
2797                 if c is '-'
2798                         return new_character_token '-'
2799                 if c is '<'
2800                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2801                         return new_character_token '<'
2802                 if c is '>'
2803                         tok_state = tok_state_script_data
2804                         return new_character_token '>'
2805                 if c is "\u0000"
2806                         parse_error()
2807                         tok_state = tok_state_script_data_double_escaped
2808                         return new_character_token "\ufffd"
2809                 if c is '' # EOF
2810                         parse_error()
2811                         tok_state = tok_state_data
2812                         cur -= 1 # Reconsume
2813                         return
2814                 # Anything else
2815                 tok_state = tok_state_script_data_double_escaped
2816                 return new_character_token c
2817
2818         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
2819         tok_state_script_data_double_escaped_less_than_sign = ->
2820                 c = txt.charAt(cur++)
2821                 if c is '/'
2822                         temporary_buffer = ''
2823                         tok_state = tok_state_script_data_double_escape_end
2824                         return new_character_token '/'
2825                 # Anything else
2826                 tok_state = tok_state_script_data_double_escaped
2827                 cur -= 1 # Reconsume
2828                 return
2829
2830         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
2831         tok_state_script_data_double_escape_end = ->
2832                 c = txt.charAt(cur++)
2833                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2834                         if temporary_buffer is 'script'
2835                                 tok_state = tok_state_script_data_escaped
2836                         else
2837                                 tok_state = tok_state_script_data_double_escaped
2838                         return new_character_token c
2839                 if is_uc_alpha(c)
2840                         temporary_buffer += c.toLowerCase() # yes, really lowercase
2841                         return new_character_token c
2842                 if is_lc_alpha(c)
2843                         temporary_buffer += c
2844                         return new_character_token c
2845                 # Anything else
2846                 tok_state = tok_state_script_data_double_escaped
2847                 cur -= 1 # Reconsume
2848                 return
2849
2850         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2851         tok_state_before_attribute_name = ->
2852                 attr_name = null
2853                 switch c = txt.charAt(cur++)
2854                         when "\t", "\n", "\u000c", ' '
2855                                 return null
2856                         when '/'
2857                                 tok_state = tok_state_self_closing_start_tag
2858                                 return null
2859                         when '>'
2860                                 tok_state = tok_state_data
2861                                 tmp = tok_cur_tag
2862                                 tok_cur_tag = null
2863                                 return tmp
2864                         when "\u0000"
2865                                 parse_error()
2866                                 attr_name = "\ufffd"
2867                         when '"', "'", '<', '='
2868                                 parse_error()
2869                                 attr_name = c
2870                         when '' # EOF
2871                                 parse_error()
2872                                 tok_state = tok_state_data
2873                         else
2874                                 if is_uc_alpha(c)
2875                                         attr_name = c.toLowerCase()
2876                                 else
2877                                         attr_name = c
2878                 if attr_name?
2879                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2880                         tok_state = tok_state_attribute_name
2881                 return null
2882
2883         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2884         tok_state_attribute_name = ->
2885                 switch c = txt.charAt(cur++)
2886                         when "\t", "\n", "\u000c", ' '
2887                                 tok_state = tok_state_after_attribute_name
2888                         when '/'
2889                                 tok_state = tok_state_self_closing_start_tag
2890                         when '='
2891                                 tok_state = tok_state_before_attribute_value
2892                         when '>'
2893                                 tok_state = tok_state_data
2894                                 tmp = tok_cur_tag
2895                                 tok_cur_tag = null
2896                                 return tmp
2897                         when "\u0000"
2898                                 parse_error()
2899                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2900                         when '"', "'", '<'
2901                                 parse_error()
2902                                 tok_cur_tag.attrs_a[0][0] = c
2903                         when '' # EOF
2904                                 parse_error()
2905                                 tok_state = tok_state_data
2906                         else
2907                                 if is_uc_alpha(c)
2908                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2909                                 else
2910                                         tok_cur_tag.attrs_a[0][0] += c
2911                 return null
2912
2913         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2914         tok_state_after_attribute_name = ->
2915                 c = txt.charAt(cur++)
2916                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2917                         return
2918                 if c is '/'
2919                         tok_state = tok_state_self_closing_start_tag
2920                         return
2921                 if c is '='
2922                         tok_state = tok_state_before_attribute_value
2923                         return
2924                 if c is '>'
2925                         tok_state = tok_state_data
2926                         return
2927                 if is_uc_alpha(c)
2928                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2929                         tok_state = tok_state_attribute_name
2930                         return
2931                 if c is "\u0000"
2932                         parse_error()
2933                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2934                         tok_state = tok_state_attribute_name
2935                         return
2936                 if c is '' # EOF
2937                         parse_error()
2938                         tok_state = tok_state_data
2939                         cur -= 1 # reconsume
2940                         return
2941                 if c is '"' or c is "'" or c is '<'
2942                         parse_error()
2943                         # fall through to Anything else
2944                 # Anything else
2945                 tok_cur_tag.attrs_a.unshift [c, '']
2946                 tok_state = tok_state_attribute_name
2947
2948         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2949         tok_state_before_attribute_value = ->
2950                 switch c = txt.charAt(cur++)
2951                         when "\t", "\n", "\u000c", ' '
2952                                 return null
2953                         when '"'
2954                                 tok_state = tok_state_attribute_value_double_quoted
2955                         when '&'
2956                                 tok_state = tok_state_attribute_value_unquoted
2957                                 cur -= 1
2958                         when "'"
2959                                 tok_state = tok_state_attribute_value_single_quoted
2960                         when "\u0000"
2961                                 # Parse error
2962                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2963                                 tok_state = tok_state_attribute_value_unquoted
2964                         when '>'
2965                                 # Parse error
2966                                 tok_state = tok_state_data
2967                                 tmp = tok_cur_tag
2968                                 tok_cur_tag = null
2969                                 return tmp
2970                         when '' # EOF
2971                                 parse_error()
2972                                 tok_state = tok_state_data
2973                         else
2974                                 tok_cur_tag.attrs_a[0][1] += c
2975                                 tok_state = tok_state_attribute_value_unquoted
2976                 return null
2977
2978         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2979         tok_state_attribute_value_double_quoted = ->
2980                 switch c = txt.charAt(cur++)
2981                         when '"'
2982                                 tok_state = tok_state_after_attribute_value_quoted
2983                         when '&'
2984                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2985                         when "\u0000"
2986                                 # Parse error
2987                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2988                         when '' # EOF
2989                                 parse_error()
2990                                 tok_state = tok_state_data
2991                         else
2992                                 tok_cur_tag.attrs_a[0][1] += c
2993                 return null
2994
2995         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2996         tok_state_attribute_value_single_quoted = ->
2997                 switch c = txt.charAt(cur++)
2998                         when "'"
2999                                 tok_state = tok_state_after_attribute_value_quoted
3000                         when '&'
3001                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3002                         when "\u0000"
3003                                 # Parse error
3004                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3005                         when '' # EOF
3006                                 parse_error()
3007                                 tok_state = tok_state_data
3008                         else
3009                                 tok_cur_tag.attrs_a[0][1] += c
3010                 return null
3011
3012         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3013         tok_state_attribute_value_unquoted = ->
3014                 switch c = txt.charAt(cur++)
3015                         when "\t", "\n", "\u000c", ' '
3016                                 tok_state = tok_state_before_attribute_name
3017                         when '&'
3018                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3019                         when '>'
3020                                 tok_state = tok_state_data
3021                                 tmp = tok_cur_tag
3022                                 tok_cur_tag = null
3023                                 return tmp
3024                         when "\u0000"
3025                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3026                         when '' # EOF
3027                                 parse_error()
3028                                 tok_state = tok_state_data
3029                         else
3030                                 # Parse Error if ', <, = or ` (backtick)
3031                                 tok_cur_tag.attrs_a[0][1] += c
3032                 return null
3033
3034         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3035         tok_state_after_attribute_value_quoted = ->
3036                 switch c = txt.charAt(cur++)
3037                         when "\t", "\n", "\u000c", ' '
3038                                 tok_state = tok_state_before_attribute_name
3039                         when '/'
3040                                 tok_state = tok_state_self_closing_start_tag
3041                         when '>'
3042                                 tok_state = tok_state_data
3043                                 tmp = tok_cur_tag
3044                                 tok_cur_tag = null
3045                                 return tmp
3046                         when '' # EOF
3047                                 parse_error()
3048                                 tok_state = tok_state_data
3049                         else
3050                                 # Parse Error
3051                                 tok_state = tok_state_before_attribute_name
3052                                 cur -= 1 # we didn't handle that char
3053                 return null
3054
3055         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3056         tok_state_self_closing_start_tag = ->
3057                 c = txt.charAt(cur++)
3058                 if c is '>'
3059                         tok_cur_tag.flag 'self-closing'
3060                         tok_state = tok_state_data
3061                         return tok_cur_tag
3062                 if c is ''
3063                         parse_error()
3064                         tok_state = tok_state_data
3065                         cur -= 1 # Reconsume
3066                         return
3067                 # Anything else
3068                 parse_error()
3069                 tok_state = tok_state_before_attribute_name
3070                 cur -= 1 # Reconsume
3071                 return
3072
3073         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3074         # WARNING: put a comment token in tok_cur_tag before setting this state
3075         tok_state_bogus_comment = ->
3076                 next_gt = txt.indexOf '>', cur
3077                 if next_gt is -1
3078                         val = txt.substr cur
3079                         cur = txt.length
3080                 else
3081                         val = txt.substr cur, (next_gt - cur)
3082                         cur = next_gt + 1
3083                 val = val.replace "\u0000", "\ufffd"
3084                 tok_cur_tag.text += val
3085                 tok_state = tok_state_data
3086                 return tok_cur_tag
3087
3088         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3089         tok_state_markup_declaration_open = ->
3090                 if txt.substr(cur, 2) is '--'
3091                         cur += 2
3092                         tok_cur_tag = new_comment_token ''
3093                         tok_state = tok_state_comment_start
3094                         return
3095                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3096                         cur += 7
3097                         tok_state = tok_state_doctype
3098                         return
3099                 acn = adjusted_current_node()
3100                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3101                         cur += 7
3102                         tok_state = tok_state_cdata_section
3103                         return
3104                 # Otherwise
3105                 parse_error()
3106                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3107                 tok_state = tok_state_bogus_comment
3108                 return
3109
3110         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3111         tok_state_comment_start = ->
3112                 switch c = txt.charAt(cur++)
3113                         when '-'
3114                                 tok_state = tok_state_comment_start_dash
3115                         when "\u0000"
3116                                 parse_error()
3117                                 return new_character_token "\ufffd"
3118                         when '>'
3119                                 parse_error()
3120                                 tok_state = tok_state_data
3121                                 return tok_cur_tag
3122                         when '' # EOF
3123                                 parse_error()
3124                                 tok_state = tok_state_data
3125                                 cur -= 1 # Reconsume
3126                                 return tok_cur_tag
3127                         else
3128                                 tok_cur_tag.text += c
3129                 return null
3130
3131         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3132         tok_state_comment_start_dash = ->
3133                 switch c = txt.charAt(cur++)
3134                         when '-'
3135                                 tok_state = tok_state_comment_end
3136                         when "\u0000"
3137                                 parse_error()
3138                                 tok_cur_tag.text += "-\ufffd"
3139                                 tok_state = tok_state_comment
3140                         when '>'
3141                                 parse_error()
3142                                 tok_state = tok_state_data
3143                                 return tok_cur_tag
3144                         when '' # EOF
3145                                 parse_error()
3146                                 tok_state = tok_state_data
3147                                 cur -= 1 # Reconsume
3148                                 return tok_cur_tag
3149                         else
3150                                 tok_cur_tag.text += "-#{c}"
3151                                 tok_state = tok_state_comment
3152                 return null
3153
3154         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3155         tok_state_comment = ->
3156                 switch c = txt.charAt(cur++)
3157                         when '-'
3158                                 tok_state = tok_state_comment_end_dash
3159                         when "\u0000"
3160                                 parse_error()
3161                                 tok_cur_tag.text += "\ufffd"
3162                         when '' # EOF
3163                                 parse_error()
3164                                 tok_state = tok_state_data
3165                                 cur -= 1 # Reconsume
3166                                 return tok_cur_tag
3167                         else
3168                                 tok_cur_tag.text += c
3169                 return null
3170
3171         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3172         tok_state_comment_end_dash = ->
3173                 switch c = txt.charAt(cur++)
3174                         when '-'
3175                                 tok_state = tok_state_comment_end
3176                         when "\u0000"
3177                                 parse_error()
3178                                 tok_cur_tag.text += "-\ufffd"
3179                                 tok_state = tok_state_comment
3180                         when '' # EOF
3181                                 parse_error()
3182                                 tok_state = tok_state_data
3183                                 cur -= 1 # Reconsume
3184                                 return tok_cur_tag
3185                         else
3186                                 tok_cur_tag.text += "-#{c}"
3187                                 tok_state = tok_state_comment
3188                 return null
3189
3190         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3191         tok_state_comment_end = ->
3192                 switch c = txt.charAt(cur++)
3193                         when '>'
3194                                 tok_state = tok_state_data
3195                                 return tok_cur_tag
3196                         when "\u0000"
3197                                 parse_error()
3198                                 tok_cur_tag.text += "--\ufffd"
3199                                 tok_state = tok_state_comment
3200                         when '!'
3201                                 parse_error()
3202                                 tok_state = tok_state_comment_end_bang
3203                         when '-'
3204                                 parse_error()
3205                                 tok_cur_tag.text += '-'
3206                         when '' # EOF
3207                                 parse_error()
3208                                 tok_state = tok_state_data
3209                                 cur -= 1 # Reconsume
3210                                 return tok_cur_tag
3211                         else
3212                                 parse_error()
3213                                 tok_cur_tag.text += "--#{c}"
3214                                 tok_state = tok_state_comment
3215                 return null
3216
3217         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3218         tok_state_comment_end_bang = ->
3219                 switch c = txt.charAt(cur++)
3220                         when '-'
3221                                 tok_cur_tag.text += "--!#{c}"
3222                                 tok_state = tok_state_comment_end_dash
3223                         when '>'
3224                                 tok_state = tok_state_data
3225                                 return tok_cur_tag
3226                         when "\u0000"
3227                                 parse_error()
3228                                 tok_cur_tag.text += "--!\ufffd"
3229                                 tok_state = tok_state_comment
3230                         when '' # EOF
3231                                 parse_error()
3232                                 tok_state = tok_state_data
3233                                 cur -= 1 # Reconsume
3234                                 return tok_cur_tag
3235                         else
3236                                 tok_cur_tag.text += "--!#{c}"
3237                                 tok_state = tok_state_comment
3238                 return null
3239
3240         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3241         tok_state_doctype = ->
3242                 switch c = txt.charAt(cur++)
3243                         when "\t", "\u000a", "\u000c", ' '
3244                                 tok_state = tok_state_before_doctype_name
3245                         when '' # EOF
3246                                 parse_error()
3247                                 tok_state = tok_state_data
3248                                 el = new_doctype_token ''
3249                                 el.flag 'force-quirks', true
3250                                 cur -= 1 # Reconsume
3251                                 return el
3252                         else
3253                                 parse_error()
3254                                 tok_state = tok_state_before_doctype_name
3255                                 cur -= 1 # Reconsume
3256                 return null
3257
3258         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3259         tok_state_before_doctype_name = ->
3260                 c = txt.charAt(cur++)
3261                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3262                         return
3263                 if is_uc_alpha(c)
3264                         tok_cur_tag = new_doctype_token c.toLowerCase()
3265                         tok_state = tok_state_doctype_name
3266                         return
3267                 if c is "\u0000"
3268                         parse_error()
3269                         tok_cur_tag = new_doctype_token "\ufffd"
3270                         tok_state = tok_state_doctype_name
3271                         return
3272                 if c is '>'
3273                         parse_error()
3274                         el = new_doctype_token ''
3275                         el.flag 'force-quirks', true
3276                         tok_state = tok_state_data
3277                         return el
3278                 if c is '' # EOF
3279                         parse_error()
3280                         tok_state = tok_state_data
3281                         el = new_doctype_token ''
3282                         el.flag 'force-quirks', true
3283                         cur -= 1 # Reconsume
3284                         return el
3285                 # Anything else
3286                 tok_cur_tag = new_doctype_token c
3287                 tok_state = tok_state_doctype_name
3288                 return null
3289
3290         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3291         tok_state_doctype_name = ->
3292                 c = txt.charAt(cur++)
3293                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3294                         tok_state = tok_state_after_doctype_name
3295                         return
3296                 if c is '>'
3297                         tok_state = tok_state_data
3298                         return tok_cur_tag
3299                 if is_uc_alpha(c)
3300                         tok_cur_tag.name += c.toLowerCase()
3301                         return
3302                 if c is "\u0000"
3303                         parse_error()
3304                         tok_cur_tag.name += "\ufffd"
3305                         return
3306                 if c is '' # EOF
3307                         parse_error()
3308                         tok_state = tok_state_data
3309                         tok_cur_tag.flag 'force-quirks', true
3310                         cur -= 1 # Reconsume
3311                         return tok_cur_tag
3312                 # Anything else
3313                 tok_cur_tag.name += c
3314                 return null
3315
3316         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3317         tok_state_after_doctype_name = ->
3318                 c = txt.charAt(cur++)
3319                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3320                         return
3321                 if c is '>'
3322                         tok_state = tok_state_data
3323                         return tok_cur_tag
3324                 if c is '' # EOF
3325                         parse_error()
3326                         tok_state = tok_state_data
3327                         tok_cur_tag.flag 'force-quirks', true
3328                         cur -= 1 # Reconsume
3329                         return tok_cur_tag
3330                 # Anything else
3331                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3332                         cur += 5
3333                         tok_state = tok_state_after_doctype_public_keyword
3334                         return
3335                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3336                         cur += 5
3337                         tok_state = tok_state_after_doctype_system_keyword
3338                         return
3339                 parse_error()
3340                 tok_cur_tag.flag 'force-quirks', true
3341                 tok_state = tok_state_bogus_doctype
3342                 return null
3343
3344         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3345         tok_state_after_doctype_public_keyword = ->
3346                 c = txt.charAt(cur++)
3347                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3348                         tok_state = tok_state_before_doctype_public_identifier
3349                         return
3350                 if c is '"'
3351                         parse_error()
3352                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3353                         tok_state = tok_state_doctype_public_identifier_double_quoted
3354                         return
3355                 if c is "'"
3356                         parse_error()
3357                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3358                         tok_state = tok_state_doctype_public_identifier_single_quoted
3359                         return
3360                 if c is '>'
3361                         parse_error()
3362                         tok_cur_tag.flag 'force-quirks', true
3363                         tok_state = tok_state_data
3364                         return tok_cur_tag
3365                 if c is '' # EOF
3366                         parse_error()
3367                         tok_state = tok_state_data
3368                         tok_cur_tag.flag 'force-quirks', true
3369                         cur -= 1 # Reconsume
3370                         return tok_cur_tag
3371                 # Anything else
3372                 parse_error()
3373                 tok_cur_tag.flag 'force-quirks', true
3374                 tok_state = tok_state_bogus_doctype
3375                 return null
3376
3377         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3378         tok_state_before_doctype_public_identifier = ->
3379                 c = txt.charAt(cur++)
3380                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3381                         return
3382                 if c is '"'
3383                         parse_error()
3384                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3385                         tok_state = tok_state_doctype_public_identifier_double_quoted
3386                         return
3387                 if c is "'"
3388                         parse_error()
3389                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3390                         tok_state = tok_state_doctype_public_identifier_single_quoted
3391                         return
3392                 if c is '>'
3393                         parse_error()
3394                         tok_cur_tag.flag 'force-quirks', true
3395                         tok_state = tok_state_data
3396                         return tok_cur_tag
3397                 if c is '' # EOF
3398                         parse_error()
3399                         tok_state = tok_state_data
3400                         tok_cur_tag.flag 'force-quirks', true
3401                         cur -= 1 # Reconsume
3402                         return tok_cur_tag
3403                 # Anything else
3404                 parse_error()
3405                 tok_cur_tag.flag 'force-quirks', true
3406                 tok_state = tok_state_bogus_doctype
3407                 return null
3408
3409
3410         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3411         tok_state_doctype_public_identifier_double_quoted = ->
3412                 c = txt.charAt(cur++)
3413                 if c is '"'
3414                         tok_state = tok_state_after_doctype_public_identifier
3415                         return
3416                 if c is "\u0000"
3417                         parse_error()
3418                         tok_cur_tag.public_identifier += "\ufffd"
3419                         return
3420                 if c is '>'
3421                         parse_error()
3422                         tok_cur_tag.flag 'force-quirks', true
3423                         tok_state = tok_state_data
3424                         return tok_cur_tag
3425                 if c is '' # EOF
3426                         parse_error()
3427                         tok_state = tok_state_data
3428                         tok_cur_tag.flag 'force-quirks', true
3429                         cur -= 1 # Reconsume
3430                         return tok_cur_tag
3431                 # Anything else
3432                 tok_cur_tag.public_identifier += c
3433                 return null
3434
3435         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3436         tok_state_doctype_public_identifier_single_quoted = ->
3437                 c = txt.charAt(cur++)
3438                 if c is "'"
3439                         tok_state = tok_state_after_doctype_public_identifier
3440                         return
3441                 if c is "\u0000"
3442                         parse_error()
3443                         tok_cur_tag.public_identifier += "\ufffd"
3444                         return
3445                 if c is '>'
3446                         parse_error()
3447                         tok_cur_tag.flag 'force-quirks', true
3448                         tok_state = tok_state_data
3449                         return tok_cur_tag
3450                 if c is '' # EOF
3451                         parse_error()
3452                         tok_state = tok_state_data
3453                         tok_cur_tag.flag 'force-quirks', true
3454                         cur -= 1 # Reconsume
3455                         return tok_cur_tag
3456                 # Anything else
3457                 tok_cur_tag.public_identifier += c
3458                 return null
3459
3460         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3461         tok_state_after_doctype_public_identifier = ->
3462                 c = txt.charAt(cur++)
3463                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3464                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3465                         return
3466                 if c is '>'
3467                         tok_state = tok_state_data
3468                         return tok_cur_tag
3469                 if c is '"'
3470                         parse_error()
3471                         tok_cur_tag.system_identifier = ''
3472                         tok_state = tok_state_doctype_system_identifier_double_quoted
3473                         return
3474                 if c is "'"
3475                         parse_error()
3476                         tok_cur_tag.system_identifier = ''
3477                         tok_state = tok_state_doctype_system_identifier_single_quoted
3478                         return
3479                 if c is '' # EOF
3480                         parse_error()
3481                         tok_state = tok_state_data
3482                         tok_cur_tag.flag 'force-quirks', true
3483                         cur -= 1 # Reconsume
3484                         return tok_cur_tag
3485                 # Anything else
3486                 parse_error()
3487                 tok_cur_tag.flag 'force-quirks', true
3488                 tok_state = tok_state_bogus_doctype
3489                 return null
3490
3491         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3492         tok_state_between_doctype_public_and_system_identifiers = ->
3493                 c = txt.charAt(cur++)
3494                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3495                         return
3496                 if c is '>'
3497                         tok_state = tok_state_data
3498                         return tok_cur_tag
3499                 if c is '"'
3500                         parse_error()
3501                         tok_cur_tag.system_identifier = ''
3502                         tok_state = tok_state_doctype_system_identifier_double_quoted
3503                         return
3504                 if c is "'"
3505                         parse_error()
3506                         tok_cur_tag.system_identifier = ''
3507                         tok_state = tok_state_doctype_system_identifier_single_quoted
3508                         return
3509                 if c is '' # EOF
3510                         parse_error()
3511                         tok_state = tok_state_data
3512                         tok_cur_tag.flag 'force-quirks', true
3513                         cur -= 1 # Reconsume
3514                         return tok_cur_tag
3515                 # Anything else
3516                 parse_error()
3517                 tok_cur_tag.flag 'force-quirks', true
3518                 tok_state = tok_state_bogus_doctype
3519                 return null
3520
3521         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3522         tok_state_after_doctype_system_keyword = ->
3523                 c = txt.charAt(cur++)
3524                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3525                         tok_state = tok_state_before_doctype_system_identifier
3526                         return
3527                 if c is '"'
3528                         parse_error()
3529                         tok_cur_tag.system_identifier = ''
3530                         tok_state = tok_state_doctype_system_identifier_double_quoted
3531                         return
3532                 if c is "'"
3533                         parse_error()
3534                         tok_cur_tag.system_identifier = ''
3535                         tok_state = tok_state_doctype_system_identifier_single_quoted
3536                         return
3537                 if c is '>'
3538                         parse_error()
3539                         tok_cur_tag.flag 'force-quirks', true
3540                         tok_state = tok_state_data
3541                         return tok_cur_tag
3542                 if c is '' # EOF
3543                         parse_error()
3544                         tok_state = tok_state_data
3545                         tok_cur_tag.flag 'force-quirks', true
3546                         cur -= 1 # Reconsume
3547                         return tok_cur_tag
3548                 # Anything else
3549                 parse_error()
3550                 tok_cur_tag.flag 'force-quirks', true
3551                 tok_state = tok_state_bogus_doctype
3552                 return null
3553
3554         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3555         tok_state_before_doctype_system_identifier = ->
3556                 c = txt.charAt(cur++)
3557                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3558                         return
3559                 if c is '"'
3560                         tok_cur_tag.system_identifier = ''
3561                         tok_state = tok_state_doctype_system_identifier_double_quoted
3562                         return
3563                 if c is "'"
3564                         tok_cur_tag.system_identifier = ''
3565                         tok_state = tok_state_doctype_system_identifier_single_quoted
3566                         return
3567                 if c is '>'
3568                         parse_error()
3569                         tok_cur_tag.flag 'force-quirks', true
3570                         tok_state = tok_state_data
3571                         return tok_cur_tag
3572                 if c is '' # EOF
3573                         parse_error()
3574                         tok_state = tok_state_data
3575                         tok_cur_tag.flag 'force-quirks', true
3576                         cur -= 1 # Reconsume
3577                         return tok_cur_tag
3578                 # Anything else
3579                 parse_error()
3580                 tok_cur_tag.flag 'force-quirks', true
3581                 tok_state = tok_state_bogus_doctype
3582                 return null
3583
3584         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3585         tok_state_doctype_system_identifier_double_quoted = ->
3586                 c = txt.charAt(cur++)
3587                 if c is '"'
3588                         tok_state = tok_state_after_doctype_system_identifier
3589                         return
3590                 if c is "\u0000"
3591                         parse_error()
3592                         tok_cur_tag.system_identifier += "\ufffd"
3593                         return
3594                 if c is '>'
3595                         parse_error()
3596                         tok_cur_tag.flag 'force-quirks', true
3597                         tok_state = tok_state_data
3598                         return tok_cur_tag
3599                 if c is '' # EOF
3600                         parse_error()
3601                         tok_state = tok_state_data
3602                         tok_cur_tag.flag 'force-quirks', true
3603                         cur -= 1 # Reconsume
3604                         return tok_cur_tag
3605                 # Anything else
3606                 tok_cur_tag.system_identifier += c
3607                 return null
3608
3609         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3610         tok_state_doctype_system_identifier_single_quoted = ->
3611                 c = txt.charAt(cur++)
3612                 if c is "'"
3613                         tok_state = tok_state_after_doctype_system_identifier
3614                         return
3615                 if c is "\u0000"
3616                         parse_error()
3617                         tok_cur_tag.system_identifier += "\ufffd"
3618                         return
3619                 if c is '>'
3620                         parse_error()
3621                         tok_cur_tag.flag 'force-quirks', true
3622                         tok_state = tok_state_data
3623                         return tok_cur_tag
3624                 if c is '' # EOF
3625                         parse_error()
3626                         tok_state = tok_state_data
3627                         tok_cur_tag.flag 'force-quirks', true
3628                         cur -= 1 # Reconsume
3629                         return tok_cur_tag
3630                 # Anything else
3631                 tok_cur_tag.system_identifier += c
3632                 return null
3633
3634         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3635         tok_state_after_doctype_system_identifier = ->
3636                 c = txt.charAt(cur++)
3637                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3638                         return
3639                 if c is '>'
3640                         tok_state = tok_state_data
3641                         return tok_cur_tag
3642                 if c is '' # EOF
3643                         parse_error()
3644                         tok_state = tok_state_data
3645                         tok_cur_tag.flag 'force-quirks', true
3646                         cur -= 1 # Reconsume
3647                         return tok_cur_tag
3648                 # Anything else
3649                 parse_error()
3650                 # do _not_ tok_cur_tag.flag 'force-quirks', true
3651                 tok_state = tok_state_bogus_doctype
3652                 return null
3653
3654         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3655         tok_state_bogus_doctype = ->
3656                 c = txt.charAt(cur++)
3657                 if c is '>'
3658                         tok_state = tok_state_data
3659                         return tok_cur_tag
3660                 if c is '' # EOF
3661                         tok_state = tok_state_data
3662                         cur -= 1 # Reconsume
3663                         return tok_cur_tag
3664                 # Anything else
3665                 return null
3666
3667
3668         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3669         # Don't set this as a state, just call it
3670         # returns a string (NOT a text node)
3671         parse_character_reference = (allowed_char = null, in_attr = false) ->
3672                 if cur >= txt.length
3673                         return '&'
3674                 switch c = txt.charAt(cur)
3675                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3676                                 # explicitly not a parse error
3677                                 return '&'
3678                         when ';'
3679                                 # there has to be "one or more" alnums between & and ; to be a parse error
3680                                 return '&'
3681                         when '#'
3682                                 if cur + 1 >= txt.length
3683                                         return '&'
3684                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
3685                                         prefix = '#x'
3686                                         charset = hex_chars
3687                                         start = cur + 2
3688                                 else
3689                                         charset = digits
3690                                         start = cur + 1
3691                                         prefix = '#'
3692                                 i = 0
3693                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3694                                         i += 1
3695                                 if i is 0
3696                                         return '&'
3697                                 if txt.charAt(start + i) is ';'
3698                                         i += 1
3699                                 # FIXME This is supposed to generate parse errors for some chars
3700                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3701                                 if decoded?
3702                                         cur = start + i
3703                                         return decoded
3704                                 return '&'
3705                         else
3706                                 for i in [0...31]
3707                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
3708                                                 break
3709                                 if i is 0
3710                                         # exit early, because parse_error() below needs at least one alnum
3711                                         return '&'
3712                                 if txt.charAt(cur + i) is ';'
3713                                         i += 1 # include ';' terminator in value
3714                                         decoded = decode_named_char_ref txt.substr(cur, i)
3715                                         if decoded?
3716                                                 cur += i
3717                                                 return decoded
3718                                         parse_error()
3719                                         return '&'
3720                                 else
3721                                         # no ';' terminator (only legacy char refs)
3722                                         max = i
3723                                         for i in [2..max] # no prefix matches, so ok to check shortest first
3724                                                 c = legacy_char_refs[txt.substr(cur, i)]
3725                                                 if c?
3726                                                         if in_attr
3727                                                                 if txt.charAt(cur + i) is '='
3728                                                                         # "because some legacy user agents will
3729                                                                         # misinterpret the markup in those cases"
3730                                                                         parse_error()
3731                                                                         return '&'
3732                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
3733                                                                         # this makes attributes forgiving about url args
3734                                                                         return '&'
3735                                                         # ok, and besides the weird exceptions for attributes...
3736                                                         # return the matching char
3737                                                         cur += i # consume entity chars
3738                                                         parse_error() # because no terminating ";"
3739                                                         return c
3740                                         parse_error()
3741                                         return '&'
3742                 return # never reached
3743
3744         # tree constructor initialization
3745         # see comments on TYPE_TAG/etc for the structure of this data
3746         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3747         open_els = []
3748         afe = [] # active formatting elements
3749         template_insertion_modes = []
3750         insertion_mode = ins_mode_initial
3751         original_insertion_mode = insertion_mode # TODO check spec
3752         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3753         flag_frameset_ok = true
3754         flag_parsing = true
3755         flag_foster_parenting = false
3756         form_element_pointer = null
3757         temporary_buffer = null
3758         pending_table_character_tokens = []
3759         head_element_pointer = null
3760         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3761         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3762
3763         # tokenizer initialization
3764         tok_state = tok_state_data
3765
3766         # proccess input
3767         while flag_parsing
3768                 t = tok_state()
3769                 if t?
3770                         insertion_mode t
3771                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
3772         return doc.children
3773
3774 serialize_els = (els, shallow, show_ids) ->
3775         serialized = ''
3776         sep = ''
3777         for t in els
3778                 serialized += sep
3779                 sep = ','
3780                 serialized += t.serialize shallow, show_ids
3781         return serialized
3782
3783 # TODO export TYPE_*
3784 module.exports.parse_html = parse_html
3785 module.exports.debug_log_reset = debug_log_reset
3786 module.exports.debug_log_each = debug_log_each
3787 module.exports.TYPE_TAG = TYPE_TAG
3788 module.exports.TYPE_TEXT = TYPE_TEXT
3789 module.exports.TYPE_COMMENT = TYPE_COMMENT
3790 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE