JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement rest of tokenizer states
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
54         window.wheic = {}
55         module = exports: window.wheic
56
57 # Each node is an obect of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
60 TYPE_COMMENT = 2
61 TYPE_DOCTYPE = 3
62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
65 TYPE_EOF = 6
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
68
69 # namespace constants
70 NS_HTML = 1
71 NS_MATHML = 2
72 NS_SVG = 3
73
74 g_debug_log = []
75 debug_log_reset = ->
76         g_debug_log = []
77 debug_log = (str) ->
78         g_debug_log.push str
79 debug_log_each = (cb) ->
80         for str in g_debug_log
81                 cb str
82
83 prev_node_id = 0
84 class Node
85         constructor: (type, args = {}) ->
86                 @type = type # one of the TYPE_* constants above
87                 @name = args.name ? '' # tag name
88                 @text = args.text ? '' # contents for text/comment nodes
89                 @attrs = args.attrs ? {}
90                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91                 @children = args.children ? []
92                 @namespace = args.namespace ? NS_HTML
93                 @parent = args.parent ? null
94                 @token = args.token ? null
95                 if args.id?
96                         @id = "#{args.id}+"
97                 else
98                         @id = "#{++prev_node_id}"
99         shallow_clone: -> # return a new node that's the same except without the children or parent
100                 # WARNING this doesn't work right on open tags that are still being parsed
101                 attrs = {}
102                 attrs[k] = v for k, v of @attrs
103                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104         acknowledge_self_closing: ->
105                 if @token?
106                         @token.flag 'did_self_close'
107                 else
108                         @flag 'did_self_close', true
109         flag: ->
110                 # fixfull
111         serialize: (shallow = false, show_ids = false) -> # for unit tests
112                 ret = ''
113                 switch @type
114                         when TYPE_TAG
115                                 ret += 'tag:'
116                                 ret += JSON.stringify @name
117                                 ret += ','
118                                 if show_ids
119                                         ret += "##{@id},"
120                                 if shallow
121                                         break
122                                 attr_keys = []
123                                 for k of @attrs
124                                         attr_keys.push k
125                                 attr_keys.sort()
126                                 ret += '{'
127                                 sep = ''
128                                 for k in attr_keys
129                                         ret += sep
130                                         sep = ','
131                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
132                                 ret += '},['
133                                 sep = ''
134                                 for c in @children
135                                         ret += sep
136                                         sep = ','
137                                         ret += c.serialize shallow, show_ids
138                                 ret += ']'
139                         when TYPE_TEXT
140                                 ret += 'text:'
141                                 ret += JSON.stringify @text
142                         when TYPE_COMMENT
143                                 ret += 'comment:'
144                                 ret += JSON.stringify @text
145                         when TYPE_DOCTYPE
146                                 ret += 'doctype'
147                                 # FIXME
148                         when TYPE_AFE_MARKER
149                                 ret += 'marker'
150                         when TYPE_AAA_BOOKMARK
151                                 ret += 'aaa_bookmark'
152                         else
153                                 ret += 'unknown:'
154                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155                 return ret
156
157 # helpers: (only take args that are normally known when parser creates nodes)
158 new_open_tag = (name) ->
159         return new Node TYPE_START_TAG, name: name
160 new_end_tag = (name) ->
161         return new Node TYPE_END_TAG, name: name
162 new_element = (name) ->
163         return new Node TYPE_TAG, name: name
164 new_text_node = (txt) ->
165         return new Node TYPE_TEXT, text: txt
166 new_character_token = new_text_node
167 new_comment_token = (txt) ->
168         return new Node TYPE_COMMENT, text: txt
169 new_doctype_token = (name) ->
170         return new Node TYPE_DOCTYPE, name: name
171 new_eof_token = ->
172         return new Node TYPE_EOF
173 new_afe_marker = ->
174         return new Node TYPE_AFE_MARKER
175 new_aaa_bookmark = ->
176         return new Node TYPE_AAA_BOOKMARK
177
178 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
179 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
180 digits = "0123456789"
181 alnum = lc_alpha + uc_alpha + digits
182 hex_chars = digits + "abcdefABCDEF"
183
184 is_uc_alpha = (str) ->
185         return str.length is 1 and uc_alpha.indexOf(str) > -1
186 is_lc_alpha = (str) ->
187         return str.length is 1 and lc_alpha.indexOf(str) > -1
188
189 # some SVG elements have dashes in them
190 tag_name_chars = alnum + "-"
191
192 # http://www.w3.org/TR/html5/infrastructure.html#space-character
193 space_chars = "\u0009\u000a\u000c\u000d\u0020"
194 is_space = (txt) ->
195         return txt.length is 1 and space_chars.indexOf(txt) > -1
196 is_space_tok = (t) ->
197         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
198
199 is_input_hidden_tok = (t) ->
200         return unless t.type is TYPE_START_TAG
201         for a of t.attrs_a
202                 if a[0] is 'type'
203                         if a[1].toLowerCase() is 'hidden'
204                                 return true
205                         return false
206         return false
207
208 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
209 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
210
211 # These are the character references that don't need a terminating semicolon
212 # min length: 2, max: 6, none are a prefix of any other.
213 legacy_char_refs = {
214         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
215         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
216         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
217         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
218         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
219         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
220         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
221         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
222         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
223         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
224         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
225         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
226         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
227         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
228         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
229         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
230         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
231         yen: '¥', yuml: 'ÿ'
232 }
233
234 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
235 raw_text_elements = ['script', 'style']
236 escapable_raw_text_elements = ['textarea', 'title']
237 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
238 svg_elements = [
239         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
240         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
241         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
242         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
243         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
244         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
245         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
246         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
247         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
248         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
249         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
250         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
251         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
252         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
253         'view', 'vkern'
254 ]
255
256 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
257 mathml_elements = [
258         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
259         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
260         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
261         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
262         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
263         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
264         'determinant', 'diff', 'divergence', 'divide', 'domain',
265         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
266         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
267         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
268         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
269         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
270         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
271         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
272         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
273         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
274         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
275         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
276         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
277         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
278         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
279         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
280         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
281         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
282         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
283         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
284         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
285         'vectorproduct', 'xor'
286 ]
287 # foreign_elements = [svg_elements..., mathml_elements...]
288 #normal_elements = All other allowed HTML elements are normal elements.
289
290 special_elements = {
291         # HTML:
292         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
293         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
294         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
295         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
296         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
297         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
298         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
299         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
300         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
301         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
302         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
303         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
304         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
305         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
306         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
307         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
308         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
309         wbr:NS_HTML, xmp:NS_HTML,
310
311         # MathML:
312         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
313         'annotation-xml':NS_MATHML,
314
315         # SVG:
316         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 }
318
319 formatting_elements = {
320          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
321          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
322          u: true
323 }
324
325 foster_parenting_targets = {
326         table: true
327         tbody: true
328         tfoot: true
329         thead: true
330         tr: true
331 }
332
333 # all html I presume
334 end_tag_implied = {
335         dd: true
336         dt: true
337         li: true
338         option: true
339         optgroup: true
340         p: true
341         rb: true
342         rp: true
343         rt: true
344         rtc: true
345 }
346
347 el_is_special = (e) ->
348         return special_elements[e.name] is e.namespace
349
350 # decode_named_char_ref()
351 #
352 # The list of named character references is _huge_ so ask the browser to decode
353 # for us instead of wasting bandwidth/space on including the table here.
354 #
355 # Pass without the "&" but with the ";" examples:
356 #    for "&amp" pass "amp;"
357 #    for "&#x2032" pass "x2032;"
358 g_dncr = {
359         cache: {}
360         textarea: document.createElement('textarea')
361 }
362 # TODO test this in IE8
363 decode_named_char_ref = (txt) ->
364         txt = "&#{txt}"
365         decoded = g_dncr.cache[txt]
366         return decoded if decoded?
367         g_dncr.textarea.innerHTML = txt
368         decoded = g_dncr.textarea.value
369         return null if decoded is txt
370         return g_dncr.cache[txt] = decoded
371
372 parse_html = (txt, parse_error_cb = null) ->
373         cur = 0 # index of next char in txt to be parsed
374         # declare doc and tokenizer variables so they're in scope below
375         doc = null
376         open_els = null # stack of open elements
377         afe = null # active formatting elements
378         template_insertion_modes = null
379         insertion_mode = null
380         original_insertion_mode = null
381         tok_state = null
382         tok_cur_tag = null # partially parsed tag
383         flag_scripting = null
384         flag_frameset_ok = null
385         flag_parsing = null
386         flag_foster_parenting = null
387         form_element_pointer = null
388         temporary_buffer = null
389         pending_table_character_tokens = null
390         head_element_pointer = null
391         flag_fragment_parsing = null
392         context_element = null
393
394         stop_parsing = ->
395                 flag_parsing = false
396
397         parse_error = ->
398                 if parse_error_cb?
399                         parse_error_cb cur
400                 else
401                         console.log "Parse error at character #{cur} of #{txt.length}"
402
403         afe_push = (new_el) ->
404                 matches = 0
405                 for el, i in afe
406                         if el.name is new_el.name and el.namespace is new_el.namespace
407                                 for k, v of el.attrs
408                                         continue unless new_el.attrs[k] is v
409                                 for k, v of new_el.attrs
410                                         continue unless el.attrs[k] is v
411                                 matches += 1
412                                 if matches is 3
413                                         afe.splice i, 1
414                                         break
415                 afe.unshift new_el
416         afe_push_marker = ->
417                 afe.unshift new_afe_marker()
418
419         # the functions below impliment the Tree Contstruction algorithm
420         # http://www.w3.org/TR/html5/syntax.html#tree-construction
421
422         # But first... the helpers
423         template_tag_is_open = ->
424                 for t in open_els
425                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
426                                 return true
427                 return false
428         is_in_scope_x = (tag_name, scope, namespace) ->
429                 for t in open_els
430                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
431                                 return true
432                         if scope[t.name] is t.namespace
433                                 return false
434                 return false
435         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
436                 for t in open_els
437                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
438                                 return true
439                         if scope[t.name] is t.namespace
440                                 return false
441                         if scope2[t.name] is t.namespace
442                                 return false
443                 return false
444         standard_scopers = { # FIXME these are supposed to be namespace specific
445                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
446                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
447                 template: NS_HTML, mi: NS_MATHML,
448
449                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
450                 'annotation-xml': NS_MATHML,
451
452                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
453         }
454         button_scopers = button: NS_HTML
455         li_scopers = ol: NS_HTML, ul: NS_HTML
456         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
457         is_in_scope = (tag_name, namespace = null) ->
458                 return is_in_scope_x tag_name, standard_scopers, namespace
459         is_in_button_scope = (tag_name, namespace = null) ->
460                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
461         is_in_table_scope = (tag_name, namespace = null) ->
462                 return is_in_scope_x tag_name, table_scopers, namespace
463         is_in_select_scope = (tag_name, namespace = null) ->
464                 for t in open_els
465                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
466                                 return true
467                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
468                                 return false
469                 return false
470         # this checks for a particular element, not by name
471         el_is_in_scope = (el) ->
472                 for t in open_els
473                         if t is el
474                                 return true
475                         if standard_scopers[t.name] is t.namespace
476                                 return false
477                 return false
478
479         clear_to_table_stopers = {
480                 'table': true
481                 'template': true
482                 'html': true
483         }
484         clear_stack_to_table_context = ->
485                 loop
486                         if clear_to_table_stopers[open_els[0].name]?
487                                 break
488                         open_els.shift()
489                 return
490         clear_to_table_body_stopers = {
491                 'tbody': true
492                 'tfoot': true
493                 'thead': true
494                 'template': true
495                 'html': true
496         }
497         clear_stack_to_table_body_context = ->
498                 loop
499                         if clear_to_table_body_stopers[open_els[0].name]?
500                                 break
501                         open_els.shift()
502                 return
503         clear_to_table_row_stopers = {
504                 'tr': true
505                 'template': true
506                 'html': true
507         }
508         clear_stack_to_table_row_context = ->
509                 loop
510                         if clear_to_table_row_stopers[open_els[0].name]?
511                                 break
512                         open_els.shift()
513                 return
514         clear_afe_to_marker = ->
515                 loop
516                         return unless afe.length > 0 # this happens in fragment case, ?spec error
517                         el = afe.shift()
518                         if el.type is TYPE_AFE_MARKER
519                                 return
520                 return
521
522         # 8.2.3.1 ...
523         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
524         reset_insertion_mode = ->
525                 # 1. Let last be false.
526                 last = false
527                 # 2. Let node be the last node in the stack of open elements.
528                 node_i = 0
529                 node = open_els[node_i]
530                 # 3. Loop: If node is the first node in the stack of open elements,
531                 # then set last to true, and, if the parser was originally created as
532                 # part of the HTML fragment parsing algorithm (fragment case) set node
533                 # to the context element.
534                 loop
535                         if node_i is open_els.length - 1
536                                 last = true
537                                 # fixfull (fragment case)
538
539                         # 4. If node is a select element, run these substeps:
540                         if node.name is 'select'
541                                 # 1. If last is true, jump to the step below labeled done.
542                                 unless last
543                                         # 2. Let ancestor be node.
544                                         ancestor_i = node_i
545                                         ancestor = node
546                                         # 3. Loop: If ancestor is the first node in the stack of
547                                         # open elements, jump to the step below labeled done.
548                                         loop
549                                                 if ancestor_i is open_els.length - 1
550                                                         break
551                                                 # 4. Let ancestor be the node before ancestor in the stack
552                                                 # of open elements.
553                                                 ancestor_i += 1
554                                                 ancestor = open_els[ancestor_i]
555                                                 # 5. If ancestor is a template node, jump to the step below
556                                                 # labeled done.
557                                                 if ancestor.name is 'template'
558                                                         break
559                                                 # 6. If ancestor is a table node, switch the insertion mode
560                                                 # to "in select in table" and abort these steps.
561                                                 if ancestor.name is 'table'
562                                                         insertion_mode = ins_mode_in_select_in_table
563                                                         return
564                                                 # 7. Jump back to the step labeled loop.
565                                 # 8. Done: Switch the insertion mode to "in select" and abort
566                                 # these steps.
567                                 insertion_mode = ins_mode_in_select
568                                 return
569                         # 5. If node is a td or th element and last is false, then switch
570                         # the insertion mode to "in cell" and abort these steps.
571                         if (node.name is 'td' or node.name is 'th') and last is false
572                                 insertion_mode = ins_mode_in_cell
573                                 return
574                         # 6. If node is a tr element, then switch the insertion mode to "in
575                         # row" and abort these steps.
576                         if node.name is 'tr'
577                                 insertion_mode = ins_mode_in_row
578                                 return
579                         # 7. If node is a tbody, thead, or tfoot element, then switch the
580                         # insertion mode to "in table body" and abort these steps.
581                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
582                                 insertion_mode = ins_mode_in_table_body
583                                 return
584                         # 8. If node is a caption element, then switch the insertion mode
585                         # to "in caption" and abort these steps.
586                         if node.name is 'caption'
587                                 insertion_mode = ins_mode_in_caption
588                                 return
589                         # 9. If node is a colgroup element, then switch the insertion mode
590                         # to "in column group" and abort these steps.
591                         if node.name is 'colgroup'
592                                 insertion_mode = ins_mode_in_column_group
593                                 return
594                         # 10. If node is a table element, then switch the insertion mode to
595                         # "in table" and abort these steps.
596                         if node.name is 'table'
597                                 insertion_mode = ins_mode_in_table
598                                 return
599                         # 11. If node is a template element, then switch the insertion mode
600                         # to the current template insertion mode and abort these steps.
601                         # fixfull (template insertion mode stack)
602
603                         # 12. If node is a head element and last is true, then switch the
604                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
605                         # these steps. (fragment case)
606                         if node.name is 'head' and last
607                                 insertion_mode = ins_mode_in_body
608                                 return
609                         # 13. If node is a head element and last is false, then switch the
610                         # insertion mode to "in head" and abort these steps.
611                         if node.name is 'head' and last is false
612                                 insertion_mode = ins_mode_in_head
613                                 return
614                         # 14. If node is a body element, then switch the insertion mode to
615                         # "in body" and abort these steps.
616                         if node.name is 'body'
617                                 insertion_mode = ins_mode_in_body
618                                 return
619                         # 15. If node is a frameset element, then switch the insertion mode
620                         # to "in frameset" and abort these steps. (fragment case)
621                         if node.name is 'frameset'
622                                 insertion_mode = ins_mode_in_frameset
623                                 return
624                         # 16. If node is an html element, run these substeps:
625                         if node.name is 'html'
626                                 # 1. If the head element pointer is null, switch the insertion
627                                 # mode to "before head" and abort these steps. (fragment case)
628                                 # fixfull (fragment case)
629
630                                 # 2. Otherwise, the head element pointer is not null, switch
631                                 # the insertion mode to "after head" and abort these steps.
632                                 insertion_mode = ins_mode_in_body # FIXME fixfull
633                                 return
634                         # 17. If last is true, then switch the insertion mode to "in body"
635                         # and abort these steps. (fragment case)
636                         if last
637                                 insertion_mode = ins_mode_in_body
638                                 return
639                         # 18. Let node now be the node before node in the stack of open
640                         # elements.
641                         node_i += 1
642                         node = open_els[node_i]
643                         # 19. Return to the step labeled loop.
644
645         # 8.2.3.2
646
647         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
648         adjusted_current_node = ->
649                 if open_els.length is 1 and flag_fragment_parsing
650                         return context_element
651                 return open_els[0]
652
653         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
654         # this implementation is structured (mostly) as described at the link above.
655         # capitalized comments are the "labels" described at the link above.
656         reconstruct_active_formatting_elements = ->
657                 return if afe.length is 0
658                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
659                         return
660                 # Rewind
661                 i = 0
662                 loop
663                         if i is afe.length - 1
664                                 break
665                         i += 1
666                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
667                                 i -= 1 # Advance
668                                 break
669                 # Create
670                 loop
671                         el = afe[i].shallow_clone()
672                         tree_insert_element el
673                         afe[i] = el
674                         break if i is 0
675                         i -= 1 # Advance
676
677         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
678         # adoption agency algorithm
679         # overview here:
680         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
681         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
682         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
683         adoption_agency = (subject) ->
684                 debug_log "adoption_agency()"
685                 debug_log "tree: #{serialize_els doc.children, false, true}"
686                 debug_log "open_els: #{serialize_els open_els, true, true}"
687                 debug_log "afe: #{serialize_els afe, true, true}"
688                 if open_els[0].name is subject
689                         el = open_els[0]
690                         open_els.shift()
691                         # remove it from the list of active formatting elements (if found)
692                         for t, i in afe
693                                 if t is el
694                                         afe.splice i, 1
695                                         break
696                         debug_log "aaa: starting off with subject on top of stack, exiting"
697                         return
698                 outer = 0
699                 loop
700                         if outer >= 8
701                                 return
702                         outer += 1
703                         # 5. Let formatting element be the last element in the list of
704                         # active formatting elements that: is between the end of the list
705                         # and the last scope marker in the list, if any, or the start of
706                         # the list otherwise, and  has the tag name subject.
707                         fe = null
708                         for t, fe_of_afe in afe
709                                 if t.type is TYPE_AFE_MARKER
710                                         break
711                                 if t.name is subject
712                                         fe = t
713                                         break
714                         # If there is no such element, then abort these steps and instead
715                         # act as described in the "any other end tag" entry above.
716                         if fe is null
717                                 debug_log "aaa: fe not found in afe"
718                                 in_body_any_other_end_tag subject
719                                 return
720                         # 6. If formatting element is not in the stack of open elements,
721                         # then this is a parse error; remove the element from the list, and
722                         # abort these steps.
723                         in_open_els = false
724                         for t, fe_of_open_els in open_els
725                                 if t is fe
726                                         in_open_els = true
727                                         break
728                         unless in_open_els
729                                 debug_log "aaa: fe not found in open_els"
730                                 parse_error()
731                                 # "remove it from the list" must mean afe, since it's not in open_els
732                                 afe.splice fe_of_afe, 1
733                                 return
734                         # 7. If formatting element is in the stack of open elements, but
735                         # the element is not in scope, then this is a parse error; abort
736                         # these steps.
737                         unless el_is_in_scope fe
738                                 debug_log "aaa: fe not in scope"
739                                 parse_error()
740                                 return
741                         # 8. If formatting element is not the current node, this is a parse
742                         # error. (But do not abort these steps.)
743                         unless open_els[0] is fe
744                                 parse_error()
745                                 # continue
746                         # 9. Let furthest block be the topmost node in the stack of open
747                         # elements that is lower in the stack than formatting element, and
748                         # is an element in the special category. There might not be one.
749                         fb = null
750                         fb_of_open_els = null
751                         for t, i in open_els
752                                 if t is fe
753                                         break
754                                 if el_is_special t
755                                         fb = t
756                                         fb_of_open_els = i
757                                         # and continue, to see if there's one that's more "topmost"
758                         # 10. If there is no furthest block, then the UA must first pop all
759                         # the nodes from the bottom of the stack of open elements, from the
760                         # current node up to and including formatting element, then remove
761                         # formatting element from the list of active formatting elements,
762                         # and finally abort these steps.
763                         if fb is null
764                                 debug_log "aaa: no fb"
765                                 loop
766                                         t = open_els.shift()
767                                         if t is fe
768                                                 afe.splice fe_of_afe, 1
769                                                 return
770                         # 11. Let common ancestor be the element immediately above
771                         # formatting element in the stack of open elements.
772                         ca = open_els[fe_of_open_els + 1] # common ancestor
773
774                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
775                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
776                         bookmark = new_aaa_bookmark()
777                         for t, i in afe
778                                 if t is fe
779                                         afe.splice i, 0, bookmark
780                                         break
781                         node = last_node = fb
782                         inner = 0
783                         loop
784                                 inner += 1
785                                 # 3. Let node be the element immediately above node in the
786                                 # stack of open elements, or if node is no longer in the stack
787                                 # of open elements (e.g. because it got removed by this
788                                 # algorithm), the element that was immediately above node in
789                                 # the stack of open elements before node was removed.
790                                 node_next = null
791                                 for t, i in open_els
792                                         if t is node
793                                                 node_next = open_els[i + 1]
794                                                 break
795                                 node = node_next ? node_above
796                                 debug_log "inner loop #{inner}"
797                                 debug_log "tree: #{serialize_els doc.children, false, true}"
798                                 debug_log "open_els: #{serialize_els open_els, true, true}"
799                                 debug_log "afe: #{serialize_els afe, true, true}"
800                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
801                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
802                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
803                                 debug_log "node: #{node.serialize true, true}"
804                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
805
806                                 # 4. If node is formatting element, then go to the next step in
807                                 # the overall algorithm.
808                                 if node is fe
809                                         break
810                                 debug_log "the meat"
811                                 # 5. If inner loop counter is greater than three and node is in
812                                 # the list of active formatting elements, then remove node from
813                                 # the list of active formatting elements.
814                                 node_in_afe = false
815                                 for t, i in afe
816                                         if t is node
817                                                 if inner > 3
818                                                         afe.splice i, 1
819                                                         debug_log "max out inner"
820                                                 else
821                                                         node_in_afe = true
822                                                         debug_log "in afe"
823                                                 break
824                                 # 6. If node is not in the list of active formatting elements,
825                                 # then remove node from the stack of open elements and then go
826                                 # back to the step labeled inner loop.
827                                 unless node_in_afe
828                                         debug_log "not in afe"
829                                         for t, i in open_els
830                                                 if t is node
831                                                         node_above = open_els[i + 1]
832                                                         open_els.splice i, 1
833                                                         break
834                                         continue
835                                 debug_log "the bones"
836                                 # 7. create an element for the token for which the element node
837                                 # was created, in the HTML namespace, with common ancestor as
838                                 # the intended parent; replace the entry for node in the list
839                                 # of active formatting elements with an entry for the new
840                                 # element, replace the entry for node in the stack of open
841                                 # elements with an entry for the new element, and let node be
842                                 # the new element.
843                                 new_node = node.shallow_clone()
844                                 for t, i in afe
845                                         if t is node
846                                                 afe[i] = new_node
847                                                 debug_log "replaced in afe"
848                                                 break
849                                 for t, i in open_els
850                                         if t is node
851                                                 node_above = open_els[i + 1]
852                                                 open_els[i] = new_node
853                                                 debug_log "replaced in open_els"
854                                                 break
855                                 node = new_node
856                                 # 8. If last node is furthest block, then move the
857                                 # aforementioned bookmark to be immediately after the new node
858                                 # in the list of active formatting elements.
859                                 if last_node is fb
860                                         for t, i in afe
861                                                 if t is bookmark
862                                                         afe.splice i, 1
863                                                         debug_log "removed bookmark"
864                                                         break
865                                         for t, i in afe
866                                                 if t is node
867                                                         # "after" means lower
868                                                         afe.splice i, 0, bookmark # "after as <-
869                                                         debug_log "placed bookmark after node"
870                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
871                                                         break
872                                 # 9. Insert last node into node, first removing it from its
873                                 # previous parent node if any.
874                                 if last_node.parent?
875                                         debug_log "last_node has parent"
876                                         for c, i in last_node.parent.children
877                                                 if c is last_node
878                                                         debug_log "removing last_node from parent"
879                                                         last_node.parent.children.splice i, 1
880                                                         break
881                                 node.children.push last_node
882                                 last_node.parent = node
883                                 # 10. Let last node be node.
884                                 last_node = node
885                                 debug_log "at last"
886                                 # 11. Return to the step labeled inner loop.
887                         # 14. Insert whatever last node ended up being in the previous step
888                         # at the appropriate place for inserting a node, but using common
889                         # ancestor as the override target.
890
891                         # In the case where fe is immediately followed by fb:
892                         #   * inner loop exits out early (node==fe)
893                         #   * last_node is fb
894                         #   * last_node is still in the tree (not a duplicate)
895                         if last_node.parent?
896                                 debug_log "FEFIRST? last_node has parent"
897                                 for c, i in last_node.parent.children
898                                         if c is last_node
899                                                 debug_log "removing last_node from parent"
900                                                 last_node.parent.children.splice i, 1
901                                                 break
902
903                         debug_log "after aaa inner loop"
904                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
905                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
906                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
907                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
908                         debug_log "tree: #{serialize_els doc.children, false, true}"
909
910                         debug_log "insert"
911
912
913                         # can't use standard insert token thing, because it's already in
914                         # open_els and must stay at it's current position in open_els
915                         dest = adjusted_insertion_location ca
916                         dest[0].children.splice dest[1], 0, last_node
917                         last_node.parent = dest[0]
918
919
920                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
921                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
922                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
923                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
924                         debug_log "tree: #{serialize_els doc.children, false, true}"
925
926                         # 15. Create an element for the token for which formatting element
927                         # was created, in the HTML namespace, with furthest block as the
928                         # intended parent.
929                         new_element = fe.shallow_clone() # FIXME intended parent thing
930                         # 16. Take all of the child nodes of furthest block and append them
931                         # to the element created in the last step.
932                         while fb.children.length
933                                 t = fb.children.shift()
934                                 t.parent = new_element
935                                 new_element.children.push t
936                         # 17. Append that new element to furthest block.
937                         new_element.parent = fb
938                         fb.children.push new_element
939                         # 18. Remove formatting element from the list of active formatting
940                         # elements, and insert the new element into the list of active
941                         # formatting elements at the position of the aforementioned
942                         # bookmark.
943                         for t, i in afe
944                                 if t is fe
945                                         afe.splice i, 1
946                                         break
947                         for t, i in afe
948                                 if t is bookmark
949                                         afe[i] = new_element
950                                         break
951                         # 19. Remove formatting element from the stack of open elements,
952                         # and insert the new element into the stack of open elements
953                         # immediately below the position of furthest block in that stack.
954                         for t, i in open_els
955                                 if t is fe
956                                         open_els.splice i, 1
957                                         break
958                         for t, i in open_els
959                                 if t is fb
960                                         open_els.splice i, 0, new_element
961                                         break
962                         # 20. Jump back to the step labeled outer loop.
963                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
964                         debug_log "tree: #{serialize_els doc.children, false, true}"
965                         debug_log "open_els: #{serialize_els open_els, true, true}"
966                         debug_log "afe: #{serialize_els afe, true, true}"
967                 debug_log "AAA DONE"
968
969         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
970         close_p_element = ->
971                 generate_implied_end_tags 'p' # arg is exception
972                 if open_els[0].name isnt 'p'
973                         parse_error()
974                 while open_els.length > 1 # just in case
975                         el = open_els.shift()
976                         if el.name is 'p'
977                                 return
978         close_p_if_in_button_scope = ->
979                 if is_in_button_scope 'p'
980                         close_p_element()
981
982         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
983         # aka insert_a_character = (t) ->
984         insert_character = (t) ->
985                 dest = adjusted_insertion_location()
986                 # fixfull check for Document node
987                 if dest[1] > 0
988                         prev = dest[0].children[dest[1] - 1]
989                         if prev.type is TYPE_TEXT
990                                 prev.text += t.text
991                                 return
992                 dest[0].children.splice dest[1], 0, t
993
994         # 8.2.5.1
995         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
996         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
997         adjusted_insertion_location = (override_target = null) ->
998                 # 1. If there was an override target specified, then let target be the
999                 # override target.
1000                 if override_target?
1001                         target = override_target
1002                 else # Otherwise, let target be the current node.
1003                         target = open_els[0]
1004                 # 2. Determine the adjusted insertion location using the first matching
1005                 # steps from the following list:
1006                 #
1007                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1008                 # thead, or tr element Foster parenting happens when content is
1009                 # misnested in tables.
1010                 if flag_foster_parenting and foster_parenting_targets[target.name]
1011                         loop # once. this is here so we can ``break`` to "abort these substeps"
1012                                 # 1. Let last template be the last template element in the
1013                                 # stack of open elements, if any.
1014                                 last_template = null
1015                                 last_template_i = null
1016                                 for el, i in open_els
1017                                         if el.name is 'template'
1018                                                 last_template = el
1019                                                 last_template_i = i
1020                                                 break
1021                                 # 2. Let last table be the last table element in the stack of
1022                                 # open elements, if any.
1023                                 last_table = null
1024                                 last_table_i
1025                                 for el, i in open_els
1026                                         if el.name is 'table'
1027                                                 last_table = el
1028                                                 last_table_i = i
1029                                                 break
1030                                 # 3. If there is a last template and either there is no last
1031                                 # table, or there is one, but last template is lower (more
1032                                 # recently added) than last table in the stack of open
1033                                 # elements, then: let adjusted insertion location be inside
1034                                 # last template's template contents, after its last child (if
1035                                 # any), and abort these substeps.
1036                                 if last_template and (last_table is null or last_template_i < last_table_i)
1037                                         target = last_template # fixfull should be it's contents
1038                                         target_i = target.children.length
1039                                         break
1040                                 # 4. If there is no last table, then let adjusted insertion
1041                                 # location be inside the first element in the stack of open
1042                                 # elements (the html element), after its last child (if any),
1043                                 # and abort these substeps. (fragment case)
1044                                 if last_table is null
1045                                         # this is odd
1046                                         target = open_els[open_els.length - 1]
1047                                         target_i = target.children.length
1048                                 # 5. If last table has a parent element, then let adjusted
1049                                 # insertion location be inside last table's parent element,
1050                                 # immediately before last table, and abort these substeps.
1051                                 if last_table.parent?
1052                                         for c, i in last_table.parent.children
1053                                                 if c is last_table
1054                                                         target = last_table.parent
1055                                                         target_i = i
1056                                                         break
1057                                         break
1058                                 # 6. Let previous element be the element immediately above last
1059                                 # table in the stack of open elements.
1060                                 #
1061                                 # huh? how could it not have a parent?
1062                                 previous_element = open_els[last_table_i + 1]
1063                                 # 7. Let adjusted insertion location be inside previous
1064                                 # element, after its last child (if any).
1065                                 target = previous_element
1066                                 target_i = target.children.length
1067                                 # Note: These steps are involved in part because it's possible
1068                                 # for elements, the table element in this case in particular,
1069                                 # to have been moved by a script around in the DOM, or indeed
1070                                 # removed from the DOM entirely, after the element was inserted
1071                                 # by the parser.
1072                                 break # don't really loop
1073                 else
1074                         # Otherwise Let adjusted insertion location be inside target, after
1075                         # its last child (if any).
1076                         target_i = target.children.length
1077
1078                 # 3. If the adjusted insertion location is inside a template element,
1079                 # let it instead be inside the template element's template contents,
1080                 # after its last child (if any).
1081                 # fixfull (template)
1082
1083                 # 4. Return the adjusted insertion location.
1084                 return [target, target_i]
1085
1086         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1087         # aka create_an_element_for_token
1088         token_to_element = (t, namespace, intended_parent) ->
1089                 t.type = TYPE_TAG # not TYPE_START_TAG
1090                 # convert attributes into a hash
1091                 attrs = {}
1092                 while t.attrs_a.length
1093                         a = t.attrs_a.pop()
1094                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1095                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1096
1097                 # TODO 2. If the newly created element has an xmlns attribute in the
1098                 # XMLNS namespace whose value is not exactly the same as the element's
1099                 # namespace, that is a parse error. Similarly, if the newly created
1100                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1101                 # value is not the XLink Namespace, that is a parse error.
1102
1103                 # fixfull: the spec says stuff about form pointers and ownerDocument
1104
1105                 return el
1106
1107         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1108         insert_foreign_element = (token, namespace) ->
1109                 ail = adjusted_insertion_location()
1110                 ail_el = ail[0]
1111                 ail_i = ail[1]
1112                 el = token_to_element token, namespace, ail_el
1113                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1114                 el.parent = ail_el
1115                 ail_el.children.splice ail_i, 0, el
1116                 open_els.unshift el
1117                 return el
1118         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1119         insert_html_element = insert_foreign_element # (token, namespace) ->
1120
1121         # FIXME read implement "foster parenting" part
1122         # FIXME read spec, do this right
1123         # FIXME implement the override target thing
1124         # note: this assumes it's an open tag
1125         # FIXME what part of the spec is this?
1126         # TODO look through all callers of this, and see what they should really be doing.
1127         #   eg probably insert_html_element for tokens
1128         tree_insert_element = (el, override_target = null, namespace = null) ->
1129                 if namespace?
1130                         el.namespace = namespace
1131                 dest = adjusted_insertion_location override_target
1132                 if el.type is TYPE_START_TAG # means it's a "token"
1133                         el = token_to_element el, namespace, dest[0]
1134                 unless el.namespace?
1135                         namespace = dest.namespace
1136                 # fixfull: Document nodes sometimes can't accept more chidren
1137                 dest[0].children.splice dest[1], 0, el
1138                 el.parent = dest[0]
1139                 open_els.unshift el
1140                 return el
1141
1142         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1143         # position should be [node, index_within_children]
1144         insert_comment = (t, position = null) ->
1145                 position ?= adjusted_insertion_location()
1146                 position[0].children.splice position[1], 0, t
1147
1148         # 8.2.5.2
1149         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1150         parse_generic_raw_text = (t) ->
1151                 insert_html_element t
1152                 tok_state = tok_state_rawtext
1153                 original_insertion_mode = insertion_mode
1154                 insertion_mode = ins_mode_text
1155         parse_generic_rcdata_text = (t) ->
1156                 insert_html_element t
1157                 tok_state = tok_state_rcdata
1158                 original_insertion_mode = insertion_mode
1159                 insertion_mode = ins_mode_text
1160
1161         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1162         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1163         generate_implied_end_tags = (except = null) ->
1164                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1165                         open_els.shift()
1166
1167         # 8.2.5.4 The rules for parsing tokens in HTML content
1168         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1169
1170         # 8.2.5.4.1 The "initial" insertion mode
1171         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1172         ins_mode_initial = (t) ->
1173                 if is_space_tok t
1174                         return
1175                 if t.type is TYPE_COMMENT
1176                         # ?fixfull
1177                         doc.children.push t
1178                         return
1179                 if t.type is TYPE_DOCTYPE
1180                         # FIXME check identifiers, set quirks, etc
1181                         # fixfull
1182                         doc.children.push t
1183                         insertion_mode = ins_mode_before_html
1184                         return
1185                 # Anything else
1186                 #fixfull (iframe, quirks)
1187                 insertion_mode = ins_mode_before_html
1188                 insertion_mode t # reprocess the token
1189                 return
1190
1191         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1192         ins_mode_before_html = (t) ->
1193                 if t.type is TYPE_DOCTYPE
1194                         parse_error()
1195                         return
1196                 if t.type is TYPE_COMMENT
1197                         doc.children.push t
1198                         return
1199                 if is_space_tok t
1200                         return
1201                 if t.type is TYPE_START_TAG and t.name is 'html'
1202                         el = token_to_element t, NS_HTML, doc
1203                         doc.children.push el
1204                         open_els.unshift(el)
1205                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1206                         insertion_mode = ins_mode_before_head
1207                         return
1208                 if t.type is TYPE_END_TAG
1209                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1210                                 # fall through to "anything else"
1211                         else
1212                                 parse_error()
1213                                 return
1214                 # Anything else
1215                 html_tok = new_open_tag 'html'
1216                 el = token_to_element html_tok, NS_HTML, doc
1217                 doc.children.push el
1218                 open_els.unshift el
1219                 # ?fixfull browsing context
1220                 insertion_mode = ins_mode_before_head
1221                 insertion_mode t
1222                 return
1223
1224         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1225         ins_mode_before_head = (t) ->
1226                 if is_space_tok t
1227                         return
1228                 if t.type is TYPE_COMMENT
1229                         insert_comment t
1230                         return
1231                 if t.type is TYPE_DOCTYPE
1232                         parse_error()
1233                         return
1234                 if t.type is TYPE_START_TAG and t.name is 'html'
1235                         ins_mode_in_body t
1236                         return
1237                 if t.type is TYPE_START_TAG and t.name is 'head'
1238                         el = insert_html_element t
1239                         head_element_pointer = el
1240                         insertion_mode = ins_mode_in_head
1241                 if t.type is TYPE_END_TAG
1242                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1243                                 # fall through to Anything else below
1244                         else
1245                                 parse_error()
1246                                 return
1247                 # Anything else
1248                 head_tok = new_open_tag 'head'
1249                 el = insert_html_element head_tok
1250                 head_element_pointer = el
1251                 insertion_mode = ins_mode_in_head
1252                 insertion_mode t # reprocess current token
1253
1254         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1255         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1256                 open_els.shift() # spec says this will be a 'head' node
1257                 insertion_mode = ins_mode_after_head
1258                 insertion_mode t
1259         ins_mode_in_head = (t) ->
1260                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1261                         insert_character t
1262                         return
1263                 if t.type is TYPE_COMMENT
1264                         insert_comment t
1265                         return
1266                 if t.type is TYPE_DOCTYPE
1267                         parse_error()
1268                         return
1269                 if t.type is TYPE_START_TAG and t.name is 'html'
1270                         ins_mode_in_body t
1271                         return
1272                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1273                         el = insert_html_element t
1274                         open_els.shift()
1275                         t.acknowledge_self_closing()
1276                         return
1277                 if t.type is TYPE_START_TAG and t.name is 'meta'
1278                         el = insert_html_element t
1279                         open_els.shift()
1280                         t.acknowledge_self_closing()
1281                         # fixfull encoding stuff
1282                         return
1283                 if t.type is TYPE_START_TAG and t.name is 'title'
1284                         parse_generic_rcdata_text t
1285                         return
1286                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1287                         parse_generic_raw_text t
1288                         return
1289                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1290                         insert_html_element t
1291                         insertion_mode = ins_mode_in_head_noscript # FIXME implement
1292                         return
1293                 if t.type is TYPE_START_TAG and t.name is 'script'
1294                         ail = adjusted_insertion_location()
1295                         el = token_to_element t, NS_HTML, ail
1296                         el.flag 'parser-inserted', true # FIXME implement
1297                         # fixfull frament case
1298                         ail[0].children.splice ail[1], 0, el
1299                         open_els.unshift el
1300                         tok_state = tok_state_script_data
1301                         original_insertion_mode = insertion_mode # make sure orig... is defined
1302                         insertion_mode = ins_mode_text # FIXME implement
1303                         return
1304                 if t.type is TYPE_END_TAG and t.name is 'head'
1305                         open_els.shift() # will be a head element... spec says so
1306                         insertion_mode = ins_mode_after_head
1307                         return
1308                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1309                         ins_mode_in_head_else t
1310                         return
1311                 if t.type is TYPE_START_TAG and t.name is 'template'
1312                         insert_html_element t
1313                         afe_push_marker()
1314                         flag_frameset_ok = false
1315                         insertion_mode = ins_mode_in_template
1316                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1317                         return
1318                 if t.type is TYPE_END_TAG and t.name is 'template'
1319                         if template_tag_is_open()
1320                                 generate_implied_end_tags
1321                                 if open_els[0].name isnt 'template'
1322                                         parse_error()
1323                                 loop
1324                                         el = open_els.shift()
1325                                         if el.name is 'template'
1326                                                 break
1327                                 clear_afe_to_marker()
1328                                 template_insertion_modes.shift()
1329                                 reset_insertion_mode()
1330                         else
1331                                 parse_error()
1332                         return
1333                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1334                         parse_error()
1335                         return
1336                 ins_mode_in_head_else t
1337
1338         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1339         ins_mode_in_head_noscript = (t) ->
1340                 # FIXME ?fixfull
1341                 console.log "ins_mode_in_head_noscript unimplemented"
1342
1343         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1344         ins_mode_after_head_else = (t) ->
1345                 body_tok = new_open_tag 'body'
1346                 insert_html_element body_tok
1347                 insertion_mode = ins_mode_in_body
1348                 insertion_mode t # reprocess token
1349                 return
1350         ins_mode_after_head = (t) ->
1351                 if is_space_tok t
1352                         insert_character t
1353                         return
1354                 if t.type is TYPE_COMMENT
1355                         insert_comment t
1356                         return
1357                 if t.type is TYPE_DOCTYPE
1358                         parse_error()
1359                         return
1360                 if t.type is TYPE_START_TAG and t.name is 'html'
1361                         ins_mode_in_body t
1362                         return
1363                 if t.type is TYPE_START_TAG and t.name is 'body'
1364                         insert_html_element t
1365                         flag_frameset_ok = false
1366                         insertion_mode = ins_mode_in_body
1367                         return
1368                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1369                         insert_html_element t
1370                         insertion_mode = ins_mode_in_frameset
1371                         return
1372                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1373                         parse_error()
1374                         open_els.unshift head_element_pointer
1375                         ins_mode_in_head t
1376                         for el, i of open_els
1377                                 if el is head_element_pointer
1378                                         open_els.splice i, 1
1379                                         return
1380                         console.log "warning: 23904 couldn't find head element in open_els"
1381                         return
1382                 if t.type is TYPE_END_TAG and t.name is 'template'
1383                         ins_mode_in_head t
1384                         return
1385                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1386                         ins_mode_after_head_else t
1387                         return
1388                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1389                         parse_error()
1390                         return
1391                 # Anything else
1392                 ins_mode_after_head_else t
1393
1394         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1395         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1396                 for node, i in open_els
1397                         if node.name is name # FIXME check namespace too
1398                                 generate_implied_end_tags name # arg is exception
1399                                 parse_error() unless i is 0
1400                                 while i >= 0
1401                                         open_els.shift()
1402                                         i -= 1
1403                                 return
1404                         if special_elements[node.name]? # FIXME check namespac too
1405                                 parse_error()
1406                                 return
1407         ins_mode_in_body = (t) ->
1408                 switch t.type
1409                         when TYPE_TEXT
1410                                 switch t.text
1411                                         when "\u0000"
1412                                                 parse_error()
1413                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1414                                                 reconstruct_active_formatting_elements()
1415                                                 insert_character t
1416                                         else
1417                                                 reconstruct_active_formatting_elements()
1418                                                 insert_character t
1419                                                 flag_frameset_ok = false
1420                         when TYPE_COMMENT
1421                                 insert_comment t
1422                         when TYPE_DOCTYPE
1423                                 parse_error()
1424                         when TYPE_START_TAG
1425                                 switch t.name
1426                                         when 'html'
1427                                                 parse_error()
1428                                                 return if template_tag_is_open()
1429                                                 root_attrs = open_els[open_els.length - 1].attrs
1430                                                 for k, v of t.attrs
1431                                                         root_attrs[k] = v unless root_attrs[k]?
1432                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1433                                                 # FIXME also do this for </template> (end tag)
1434                                                 return ins_mode_in_head t
1435                                         when 'body'
1436                                                 parse_error()
1437                                                 # TODO
1438                                         when 'frameset'
1439                                                 parse_error()
1440                                                 # TODO
1441                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1442                                                 close_p_if_in_button_scope()
1443                                                 insert_html_element t
1444                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1445                                                 close_p_if_in_button_scope()
1446                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1447                                                         parse_error()
1448                                                         open_els.shift()
1449                                                 insert_html_element t
1450                                         # TODO lots more to implement here
1451                                         when 'a'
1452                                                 # If the list of active formatting elements
1453                                                 # contains an a element between the end of the list and
1454                                                 # the last marker on the list (or the start of the list
1455                                                 # if there is no marker on the list), then this is a
1456                                                 # parse error; run the adoption agency algorithm for
1457                                                 # the tag name "a", then remove that element from the
1458                                                 # list of active formatting elements and the stack of
1459                                                 # open elements if the adoption agency algorithm didn't
1460                                                 # already remove it (it might not have if the element
1461                                                 # is not in table scope).
1462                                                 found = false
1463                                                 for el in afe
1464                                                         if el.type is TYPE_AFE_MARKER
1465                                                                 break
1466                                                         if el.name is 'a'
1467                                                                 found = el
1468                                                 if found?
1469                                                         parse_error()
1470                                                         adoption_agency 'a'
1471                                                         for el, i in afe
1472                                                                 if el is found
1473                                                                         afe.splice i, 1
1474                                                         for el, i in open_els
1475                                                                 if el is found
1476                                                                         open_els.splice i, 1
1477                                                 reconstruct_active_formatting_elements()
1478                                                 el = insert_html_element t
1479                                                 afe_push el
1480                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1481                                                 reconstruct_active_formatting_elements()
1482                                                 el = insert_html_element t
1483                                                 afe_push el
1484                                         when 'table'
1485                                                 # fixfull quirksmode thing
1486                                                 close_p_if_in_button_scope()
1487                                                 insert_html_element t
1488                                                 insertion_mode = ins_mode_in_table
1489                                         # TODO lots more to implement here
1490                                         else # any other start tag
1491                                                 reconstruct_active_formatting_elements()
1492                                                 insert_html_element t
1493                         when TYPE_EOF
1494                                 ok_tags = {
1495                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1496                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1497                                 }
1498                                 for t in open_els
1499                                         unless ok_tags[t.name]?
1500                                                 parse_error()
1501                                                 break
1502                                 # TODO stack of template insertion modes thing
1503                                 stop_parsing()
1504                         when TYPE_END_TAG
1505                                 switch t.name
1506                                         when 'body'
1507                                                 unless is_in_scope 'body'
1508                                                         parse_error()
1509                                                         return
1510                                                 # TODO implement parse error and move to tree_after_body
1511                                         when 'html'
1512                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1513                                                         parse_error()
1514                                                         return
1515                                                 # TODO implement parse error and move to tree_after_body, reprocess
1516                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1517                                                 unless is_in_scope t.name, NS_HTML
1518                                                         parse_error()
1519                                                         return
1520                                                 generate_implied_end_tags()
1521                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1522                                                         parse_error()
1523                                                 loop
1524                                                         el = open_els.shift()
1525                                                         if el.name is t.name and el.namespace is NS_HTML
1526                                                                 return
1527                                         # TODO lots more close tags to implement here
1528                                         when 'p'
1529                                                 unless is_in_button_scope 'p'
1530                                                         parse_error()
1531                                                         insert_html_element new_open_tag 'p'
1532                                                 close_p_element()
1533                                         # TODO lots more close tags to implement here
1534                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1535                                                 adoption_agency t.name
1536                                         # TODO lots more close tags to implement here
1537                                         else
1538                                                 in_body_any_other_end_tag t.name
1539                 return
1540
1541         ins_mode_in_table_else = (t) ->
1542                 parse_error()
1543                 flag_foster_parenting = true # FIXME
1544                 ins_mode_in_body t
1545                 flag_foster_parenting = false
1546         can_in_table = { # FIXME do this inline like everywhere else
1547                 'table': true
1548                 'tbody': true
1549                 'tfoot': true
1550                 'thead': true
1551                 'tr': true
1552         }
1553
1554         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1555         ins_mode_text = (t) ->
1556                 if t.type is TYPE_TEXT
1557                         insert_character t
1558                         return
1559                 if t.type is TYPE_EOF
1560                         parse_error()
1561                         if open_els[0].name is 'script'
1562                                 open_els[0].flag 'already started', true
1563                         open_els.shift()
1564                         insertion_mode = original_insertion_mode
1565                         insertion_mode t
1566                         return
1567                 if t.type is TYPE_END_TAG and t.name is 'script'
1568                         open_els.shift()
1569                         insertion_mode = original_insertion_mode
1570                         # fixfull the spec seems to assume that I'm going to run the script
1571                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1572                         return
1573                 if t.type is TYPE_END_TAG
1574                         open_els.shift()
1575                         insertion_mode = original_insertion_mode
1576                         return
1577                 console.log 'warning: end of ins_mode_text reached'
1578
1579         # the functions below implement the tokenizer stats described here:
1580         # http://www.w3.org/TR/html5/syntax.html#tokenization
1581
1582         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1583         ins_mode_in_table = (t) ->
1584                 switch t.type
1585                         when TYPE_TEXT
1586                                 if can_in_table[t.name]
1587                                         original_insertion_mode = insertion_mode
1588                                         insertion_mode = ins_mode_in_table_text
1589                                         insertion_mode t
1590                                 else
1591                                         ins_mode_in_table_else t
1592                         when TYPE_COMMENT
1593                                 insert_comment t
1594                         when TYPE_DOCTYPE
1595                                 parse_error()
1596                         when TYPE_START_TAG
1597                                 switch t.name
1598                                         when 'caption'
1599                                                 clear_stack_to_table_context()
1600                                                 afe_push_marker()
1601                                                 insert_html_element t
1602                                                 insertion_mode = ins_mode_in_caption
1603                                         when 'colgroup'
1604                                                 clear_stack_to_table_context()
1605                                                 insert_html_element t
1606                                                 insertion_mode = ins_mode_in_column_group
1607                                         when 'col'
1608                                                 clear_stack_to_table_context()
1609                                                 insert_html_element new_open_tag 'colgroup'
1610                                                 insertion_mode = ins_mode_in_column_group
1611                                                 insertion_mode t
1612                                         when 'tbody', 'tfoot', 'thead'
1613                                                 clear_stack_to_table_context()
1614                                                 insert_html_element t
1615                                                 insertion_mode = ins_mode_in_table_body
1616                                         when 'td', 'th', 'tr'
1617                                                 clear_stack_to_table_context()
1618                                                 insert_html_element new_open_tag 'tbody'
1619                                                 insertion_mode = ins_mode_in_table_body
1620                                                 insertion_mode t
1621                                         when 'table'
1622                                                 parse_error()
1623                                                 if is_in_table_scope 'table'
1624                                                         loop
1625                                                                 el = open_els.shift()
1626                                                                 if el.name is 'table'
1627                                                                         break
1628                                                         reset_insertion_mode()
1629                                                         insertion_mode t
1630                                         when 'style', 'script', 'template'
1631                                                 ins_mode_in_head t
1632                                         when 'input'
1633                                                 if is_input_hidden_tok t
1634                                                         ins_mode_in_table_else t
1635                                                 else
1636                                                         parse_error()
1637                                                         el = insert_html_element t
1638                                                         open_els.shift()
1639                                                         t.acknowledge_self_closing()
1640                                         when 'form'
1641                                                 parse_error()
1642                                                 if form_element_pointer?
1643                                                         return
1644                                                 if template_tag_is_open()
1645                                                         return
1646                                                 form_element_pointer = insert_html_element t
1647                                                 open_els.shift()
1648                                         else
1649                                                 ins_mode_in_table_else t
1650                         when TYPE_END_TAG
1651                                 switch t.name
1652                                         when 'table'
1653                                                 if is_in_table_scope 'table'
1654                                                         loop
1655                                                                 el = open_els.shift()
1656                                                                 if el.name is 'table'
1657                                                                         break
1658                                                         reset_insertion_mode()
1659                                                 else
1660                                                         parse_error
1661                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1662                                                 parse_error()
1663                                         when 'template'
1664                                                 ins_mode_in_head t
1665                                         else
1666                                                 ins_mode_in_table_else t
1667                         when TYPE_EOF
1668                                 ins_mode_in_body t
1669                         else
1670                                 ins_mode_in_table_else t
1671
1672
1673         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1674         ins_mode_in_table_text = (t) ->
1675                 if t.type is TYPE_TEXT and t.text is "\u0000"
1676                         # huh? I thought the tokenizer didn't emit these
1677                         parse_error()
1678                         return
1679                 if t.type is TYPE_TEXT
1680                         pending_table_character_tokens.push t
1681                         return
1682                 # Anything else
1683                 all_space = true
1684                 for old in pending_table_character_tokens
1685                         unless is_space_tok old
1686                                 all_space = false
1687                                 break
1688                 if all_space
1689                         for old in pending_table_character_tokens
1690                                 insert_character old
1691                 else
1692                         for old in pending_table_character_tokens
1693                                 ins_mode_table_else old
1694                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1695                 insertion_mode = original_insertion_mode
1696                 insertion_mode t
1697
1698         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1699         ins_mode_in_caption = (t) ->
1700                 if t.type is TYPE_END_TAG and t.name is 'caption'
1701                         if is_in_table_scope 'caption'
1702                                 generate_implied_end_tags()
1703                                 if open_els[0].name isnt 'caption'
1704                                         parse_error()
1705                                 loop
1706                                         el = open_els.shift()
1707                                         if el.name is 'caption'
1708                                                 break
1709                                 clear_afe_to_marker()
1710                                 insertion_mode = ins_mode_in_table
1711                         else
1712                                 parse_error()
1713                                 # fragment case
1714                         return
1715                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1716                         parse_error()
1717                         if is_in_table_scope 'caption'
1718                                 loop
1719                                         el = open_els.shift()
1720                                         if el.name is 'caption'
1721                                                 break
1722                                 clear_afe_to_marker()
1723                                 insertion_mode = ins_mode_in_table
1724                                 insertion_mode t
1725                         # else fragment case
1726                         return
1727                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1728                         parse_error()
1729                         return
1730                 # Anything else
1731                 ins_mode_in_body t
1732
1733         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1734         ins_mode_in_column_group = (t) ->
1735                 if is_space_tok t
1736                         insert_character t
1737                         return
1738                 if t.type is TYPE_COMMENT
1739                         insert_comment t
1740                         return
1741                 if t.type is TYPE_DOCTYPE
1742                         parse_error()
1743                         return
1744                 if t.type is TYPE_START_TAG and t.name is 'html'
1745                         ins_mode_in_body t
1746                         return
1747                 if t.type is TYPE_START_TAG and t.name is 'col'
1748                         el = insert_html_element t
1749                         open_els.shift()
1750                         t.acknowledge_self_closing()
1751                         return
1752                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1753                         if open_els[0].name is 'colgroup'
1754                                 open_els.shift()
1755                                 insertion_mode = ins_mode_in_table
1756                         else
1757                                 parse_error()
1758                         return
1759                 if t.type is TYPE_END_TAG and t.name is 'col'
1760                         parse_error()
1761                         return
1762                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1763                         ins_mode_in_head t
1764                         return
1765                 if t.type is TYPE_EOF
1766                         ins_mode_in_body t
1767                         return
1768                 # Anything else
1769                 if open_els[0].name isnt 'colgroup'
1770                         parse_error()
1771                         return
1772                 open_els.shift()
1773                 insertion_mode = ins_mode_in_table
1774                 insertion_mode t
1775                 return
1776
1777         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1778         ins_mode_in_table_body = (t) ->
1779                 if t.type is TYPE_START_TAG and t.name is 'tr'
1780                         clear_stack_to_table_body_context()
1781                         insert_html_element t
1782                         insertion_mode = ins_mode_in_row
1783                         return
1784                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1785                         parse_error()
1786                         clear_stack_to_table_body_context()
1787                         insert_html_element new_open_tag 'tr'
1788                         insertion_mode = ins_mode_in_row
1789                         insertion_mode t
1790                         return
1791                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1792                         unless is_in_table_scope t.name # fixfull check namespace
1793                                 parse_error()
1794                                 return
1795                         clear_stack_to_table_body_context()
1796                         open_els.shift()
1797                         insertion_mode = ins_mode_in_table
1798                         return
1799                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1800                         has = false
1801                         for el in open_els
1802                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1803                                         has = true
1804                                         break
1805                                 if table_scopers[el.name]
1806                                         break
1807                         if !has
1808                                 parse_error()
1809                                 return
1810                         clear_stack_to_table_body_context()
1811                         open_els.shift()
1812                         insertion_mode = ins_mode_in_table
1813                         insertion_mode t
1814                         return
1815                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1816                         parse_error()
1817                         return
1818                 # Anything else
1819                 ins_mode_in_table t
1820
1821         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1822         ins_mode_in_row = (t) ->
1823                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1824                         clear_stack_to_table_row_context()
1825                         insert_html_element t
1826                         insertion_mode = ins_mode_in_cell
1827                         afe_push_marker()
1828                         return
1829                 if t.type is TYPE_END_TAG and t.name is 'tr'
1830                         if is_in_table_scope 'tr'
1831                                 clear_stack_to_table_row_context()
1832                                 open_els.shift()
1833                                 insertion_mode = ins_mode_in_table_body
1834                         else
1835                                 parse_error()
1836                         return
1837                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1838                         if is_in_table_scope 'tr'
1839                                 clear_stack_to_table_row_context()
1840                                 open_els.shift()
1841                                 insertion_mode = ins_mode_in_table_body
1842                                 insertion_mode t
1843                         else
1844                                 parse_error()
1845                         return
1846                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1847                         if is_in_table_scope t.name # fixfull namespace
1848                                 if is_in_table_scope 'tr'
1849                                         clear_stack_to_table_row_context()
1850                                         open_els.shift()
1851                                         insertion_mode = ins_mode_in_table_body
1852                                         insertion_mode t
1853                         else
1854                                 parse_error()
1855                         return
1856                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1857                         parse_error()
1858                         return
1859                 # Anything else
1860                 ins_mode_in_table t
1861
1862         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1863         close_the_cell = ->
1864                 generate_implied_end_tags()
1865                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1866                         parse_error()
1867                 loop
1868                         el = open_els.shift()
1869                         if el.name is 'td' or el.name is 'th'
1870                                 break
1871                 clear_afe_to_marker()
1872                 insertion_mode = ins_mode_in_row
1873
1874         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1875         ins_mode_in_cell = (t) ->
1876                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1877                         if is_in_table_scope t.name
1878                                 generate_implied_end_tags()
1879                                 if open_els[0].name isnt t.name
1880                                         parse_error
1881                                 loop
1882                                         el = open_els.shift()
1883                                         if el.name is t.name
1884                                                 break
1885                                 clear_afe_to_marker()
1886                                 insertion_mode = ins_mode_in_row
1887                         else
1888                                 parse_error()
1889                         return
1890                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1891                         has = false
1892                         for el in open_els
1893                                 if el.name is 'td' or el.name is 'th'
1894                                         has = true
1895                                         break
1896                                 if table_scopers[el.name]
1897                                         break
1898                         if !has
1899                                 parse_error()
1900                                 return
1901                         close_the_cell()
1902                         insertion_mode t
1903                         return
1904                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1905                         parse_error()
1906                         return
1907                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1908                         if is_in_table_scope t.name # fixfull namespace
1909                                 close_the_cell()
1910                                 insertion_mode t
1911                         else
1912                                 parse_error()
1913                         return
1914                 # Anything Else
1915                 ins_mode_in_body t
1916
1917         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1918         ins_mode_in_select = (t) ->
1919                 if t.type is TYPE_TEXT and t.text is "\u0000"
1920                         parse_error()
1921                         return
1922                 if t.type is TYPE_TEXT
1923                         insert_character t
1924                         return
1925                 if t.type is TYPE_COMMENT
1926                         insert_comment t
1927                         return
1928                 if t.type is TYPE_DOCTYPE
1929                         parse_error()
1930                         return
1931                 if t.type is TYPE_START_TAG and t.name is 'html'
1932                         ins_mode_in_body t
1933                         return
1934                 if t.type is TYPE_START_TAG and t.name is 'option'
1935                         if open_els[0].name is 'option'
1936                                 open_els.shift()
1937                         insert_html_element t
1938                         return
1939                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1940                         if open_els[0].name is 'option'
1941                                 open_els.shift()
1942                         if open_els[0].name is 'optgroup'
1943                                 open_els.shift()
1944                         insert_html_element t
1945                         return
1946                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1947                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1948                                 open_els.shift()
1949                         if open_els[0].name is 'optgroup'
1950                                 open_els.shift()
1951                         else
1952                                 parse_error()
1953                         return
1954                 if t.type is TYPE_END_TAG and t.name is 'option'
1955                         if open_els[0].name is 'option'
1956                                 open_els.shift()
1957                         else
1958                                 parse_error()
1959                         return
1960                 if t.type is TYPE_END_TAG and t.name is 'select'
1961                         if is_in_select_scope 'select'
1962                                 loop
1963                                         el = open_els.shift()
1964                                         if el.name is 'select'
1965                                                 break
1966                                 reset_insertion_mode()
1967                         else
1968                                 parse_error()
1969                         return
1970                 if t.type is TYPE_START_TAG and t.name is 'select'
1971                         parse_error()
1972                         loop
1973                                 el = open_els.shift()
1974                                 if el.name is 'select'
1975                                         break
1976                         reset_insertion_mode()
1977                         # spec says that this is the same as </select> but it doesn't say
1978                         # to check scope first
1979                         return
1980                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1981                         parse_error()
1982                         if is_in_select_scope 'select'
1983                                 return
1984                         loop
1985                                 el = open_els.shift()
1986                                 if el.name is 'select'
1987                                         break
1988                         reset_insertion_mode()
1989                         insertion_mode t
1990                         return
1991                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1992                         ins_mode_in_head t
1993                         return
1994                 if t.type is TYPE_EOF
1995                         ins_mode_in_body t
1996                         return
1997                 # Anything else
1998                 parse_error()
1999                 return
2000
2001         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2002         ins_mode_in_select_in_table = (t) ->
2003                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2004                         parse_error()
2005                         loop
2006                                 el = open_els.shift()
2007                                 if el.name is 'select'
2008                                         break
2009                         reset_insertion_mode()
2010                         insertion_mode t
2011                         return
2012                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2013                         parse_error()
2014                         unless is_in_table_scope t.name, NS_HTML
2015                                 return
2016                         loop
2017                                 el = open_els.shift()
2018                                 if el.name is 'select'
2019                                         break
2020                         reset_insertion_mode()
2021                         insertion_mode t
2022                         return
2023                 # Anything else
2024                 ins_mode_in_select t
2025                 return
2026
2027         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2028         ins_mode_in_template = (t) ->
2029                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2030                         ins_mode_in_body t
2031                         return
2032                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2033                         ins_mode_in_head t
2034                         return
2035                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2036                         template_insertion_modes.shift()
2037                         template_insertion_modes.unshift ins_mode_in_table
2038                         insertion_mode = ins_mode_in_table
2039                         insertion_mode t
2040                         return
2041                 if t.type is TYPE_START_TAG and t.name is 'col'
2042                         template_insertion_modes.shift()
2043                         template_insertion_modes.unshift ins_mode_in_column_group
2044                         insertion_mode = ins_mode_in_column_group
2045                         insertion_mode t
2046                         return
2047                 if t.type is TYPE_START_TAG and t.name is 'tr'
2048                         template_insertion_modes.shift()
2049                         template_insertion_modes.unshift ins_mode_in_table_body
2050                         insertion_mode = ins_mode_in_table_body
2051                         insertion_mode t
2052                         return
2053                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2054                         template_insertion_modes.shift()
2055                         template_insertion_modes.unshift ins_mode_in_row
2056                         insertion_mode = ins_mode_in_row
2057                         insertion_mode t
2058                         return
2059                 if t.type is TYPE_START_TAG
2060                         template_insertion_modes.shift()
2061                         template_insertion_modes.unshift ins_mode_in_body
2062                         insertion_mode = ins_mode_in_body
2063                         insertion_mode t
2064                         return
2065                 if t.type is TYPE_END_TAG
2066                         parse_error()
2067                         return
2068                 if t.type is TYPE_EOF
2069                         unless template_tag_is_open()
2070                                 stop_parsing()
2071                                 return
2072                         parse_error()
2073                         loop
2074                                 el = open_els.shift()
2075                                 if el.name is 'template' # fixfull check namespace
2076                                         break
2077                         clear_afe_to_marker()
2078                         template_insertion_modes.shift()
2079                         reset_insertion_mode()
2080                         insertion_mode t
2081
2082         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2083         ins_mode_after_body = (t) ->
2084                 if is_space_tok t
2085                         ins_mode_in_body t
2086                         return
2087                 if t.type is TYPE_COMMENT
2088                         insert_comment t, [open_els[0], open_els[0].children.length]
2089                         return
2090                 if t.type is TYPE_DOCTYPE
2091                         parse_error()
2092                         return
2093                 if t.type is TYPE_START_TAG and t.name is 'html'
2094                         ins_mode_in_body t
2095                         return
2096                 if t.type is TYPE_END_TAG and t.name is 'html'
2097                         # fixfull fragment case
2098                         insertion_mode = ins_mode_after_after_body
2099                         return
2100                 if t.type is TYPE_EOF
2101                         stop_parsing()
2102                         return
2103                 # Anything ELse
2104                 parse_error()
2105                 insertion_mode = ins_mode_in_body
2106                 insertion_mode t
2107
2108         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2109         ins_mode_in_frameset = (t) ->
2110                 if is_space_tok t
2111                         insert_character t
2112                         return
2113                 if t.type is TYPE_COMMENT
2114                         insert_comment t
2115                         return
2116                 if t.type is TYPE_DOCTYPE
2117                         parse_error()
2118                         return
2119                 if t.type is TYPE_START_TAG and t.name is 'html'
2120                         ins_mode_in_body t
2121                         return
2122                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2123                         insert_html_element t
2124                         return
2125                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2126                         # TODO ?correct for: "if the current node is the root html element"
2127                         if open_els.length is 1
2128                                 parse_error()
2129                                 return # fragment case
2130                         open_els.shift()
2131                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2132                                 insertion_mode = ins_mode_after_frameset
2133                         return
2134                 if t.type is TYPE_START_TAG and t.name is 'frame'
2135                         insert_html_element t
2136                         open_els.shift()
2137                         t.acknowledge_self_closing()
2138                         return
2139                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2140                         ins_mode_in_head t
2141                         return
2142                 if t.type is TYPE_EOF
2143                         # TODO ?correct for: "if the current node is not the root html element"
2144                         if open_els.length isnt 1
2145                                 parse_error()
2146                         stop_parsing()
2147                         return
2148                 # Anything else
2149                 parse_error()
2150                 return
2151
2152         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2153         ins_mode_after_frameset = (t) ->
2154                 if is_space_tok t
2155                         insert_character t
2156                         return
2157                 if t.type is TYPE_COMMENT
2158                         insert_comment t
2159                         return
2160                 if t.type is TYPE_DOCTYPE
2161                         parse_error()
2162                         return
2163                 if t.type is TYPE_START_TAG and t.name is 'html'
2164                         ins_mode_in_body t
2165                         return
2166                 if t.type is TYPE_END_TAG and t.name is 'html'
2167                         insert_mode = ins_mode_after_after_frameset
2168                         return
2169                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2170                         ins_mode_in_head t
2171                         return
2172                 if t.type is TYPE_EOF
2173                         stop_parsing()
2174                         return
2175                 # Anything else
2176                 parse_error()
2177                 return
2178
2179         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2180         ins_mode_after_after_body = (t) ->
2181                 if t.type is TYPE_COMMENT
2182                         insert_comment t, [doc, doc.children.length]
2183                         return
2184                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2185                         ins_mode_in_body t
2186                         return
2187                 if t.type is TYPE_EOF
2188                         stop_parsing()
2189                         return
2190                 # Anything else
2191                 parse_error()
2192                 insertion_mode = ins_mode_in_body
2193                 return
2194
2195         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2196         ins_mode_after_after_frameset = (t) ->
2197                 if t.type is TYPE_COMMENT
2198                         insert_comment t, [doc, doc.children.length]
2199                         return
2200                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2201                         ins_mode_in_body t
2202                         return
2203                 if t.type is TYPE_EOF
2204                         stop_parsing()
2205                         return
2206                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2207                         ins_mode_in_head t
2208                         return
2209                 # Anything else
2210                 parse_error()
2211                 return
2212
2213
2214
2215
2216
2217         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2218         tok_state_data = ->
2219                 switch c = txt.charAt(cur++)
2220                         when '&'
2221                                 return new_text_node parse_character_reference()
2222                         when '<'
2223                                 tok_state = tok_state_tag_open
2224                         when "\u0000"
2225                                 parse_error()
2226                                 return new_text_node c
2227                         when '' # EOF
2228                                 return new_eof_token()
2229                         else
2230                                 return new_text_node c
2231                 return null
2232
2233         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2234         # not needed: tok_state_character_reference_in_data = ->
2235         # just call parse_character_reference()
2236
2237         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2238         tok_state_rcdata = ->
2239                 switch c = txt.charAt(cur++)
2240                         when '&'
2241                                 return new_text_node parse_character_reference()
2242                         when '<'
2243                                 tok_state = tok_state_rcdata_less_than_sign
2244                         when "\u0000"
2245                                 parse_error()
2246                                 return new_character_token "\ufffd"
2247                         when '' # EOF
2248                                 return new_eof_token()
2249                         else
2250                                 return new_character_token c
2251                 return null
2252
2253         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2254         # not needed: tok_state_character_reference_in_rcdata = ->
2255         # just call parse_character_reference()
2256
2257         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2258         tok_state_rawtext = ->
2259                 switch c = txt.charAt(cur++)
2260                         when '<'
2261                                 tok_state = tok_state_rawtext_less_than_sign
2262                         when "\u0000"
2263                                 parse_error()
2264                                 return new_character_token "\ufffd"
2265                         when '' # EOF
2266                                 return new_eof_token()
2267                         else
2268                                 return new_character_token c
2269                 return null
2270
2271         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2272         tok_state_script_data = ->
2273                 switch c = txt.charAt(cur++)
2274                         when '<'
2275                                 tok_state = tok_state_script_data_less_than_sign
2276                         when "\u0000"
2277                                 parse_error()
2278                                 return new_character_token "\ufffd"
2279                         when '' # EOF
2280                                 return new_eof_token()
2281                         else
2282                                 return new_character_token c
2283                 return null
2284
2285         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2286         tok_state_plaintext = ->
2287                 switch c = txt.charAt(cur++)
2288                         when "\u0000"
2289                                 parse_error()
2290                                 return new_character_token "\ufffd"
2291                         when '' # EOF
2292                                 return new_eof_token()
2293                         else
2294                                 return new_character_token c
2295                 return null
2296
2297
2298         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2299         tok_state_tag_open = ->
2300                 switch c = txt.charAt(cur++)
2301                         when '!'
2302                                 tok_state = tok_state_markup_declaration_open
2303                         when '/'
2304                                 tok_state = tok_state_end_tag_open
2305                         when '?'
2306                                 parse_error()
2307                                 tok_cur_tag = new_comment_token '?'
2308                                 tok_state = tok_state_bogus_comment
2309                         else
2310                                 if is_lc_alpha(c)
2311                                         tok_cur_tag = new_open_tag c
2312                                         tok_state = tok_state_tag_name
2313                                 else if is_uc_alpha(c)
2314                                         tok_cur_tag = new_open_tag c.toLowerCase()
2315                                         tok_state = tok_state_tag_name
2316                                 else
2317                                         parse_error()
2318                                         tok_state = tok_state_data
2319                                         cur -= 1 # we didn't parse/handle the char after <
2320                                         return new_text_node '<'
2321                 return null
2322
2323         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2324         tok_state_end_tag_open = ->
2325                 switch c = txt.charAt(cur++)
2326                         when '>'
2327                                 parse_error()
2328                                 tok_state = tok_state_data
2329                         when '' # EOF
2330                                 parse_error()
2331                                 tok_state = tok_state_data
2332                                 return new_text_node '</'
2333                         else
2334                                 if is_uc_alpha(c)
2335                                         tok_cur_tag = new_end_tag c.toLowerCase()
2336                                         tok_state = tok_state_tag_name
2337                                 else if is_lc_alpha(c)
2338                                         tok_cur_tag = new_end_tag c
2339                                         tok_state = tok_state_tag_name
2340                                 else
2341                                         parse_error()
2342                                         tok_cur_tag = new_comment_token '/'
2343                                         tok_state = tok_state_bogus_comment
2344                 return null
2345
2346         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2347         tok_state_tag_name = ->
2348                 switch c = txt.charAt(cur++)
2349                         when "\t", "\n", "\u000c", ' '
2350                                 tok_state = tok_state_before_attribute_name
2351                         when '/'
2352                                 tok_state = tok_state_self_closing_start_tag
2353                         when '>'
2354                                 tok_state = tok_state_data
2355                                 tmp = tok_cur_tag
2356                                 tok_cur_tag = null
2357                                 return tmp
2358                         when "\u0000"
2359                                 parse_error()
2360                                 tok_cur_tag.name += "\ufffd"
2361                         when '' # EOF
2362                                 parse_error()
2363                                 tok_state = tok_state_data
2364                         else
2365                                 if is_uc_alpha(c)
2366                                         tok_cur_tag.name += c.toLowerCase()
2367                                 else
2368                                         tok_cur_tag.name += c
2369                 return null
2370
2371         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2372         tok_state_rcdata_less_than_sign = ->
2373                 c = txt.charAt(cur++)
2374                 if c is '/'
2375                         temporary_buffer = ''
2376                         tok_state = tok_state_rcdata_end_tag_open
2377                         return null
2378                 # Anything else
2379                 tok_state = tok_state_rcdata
2380                 cur -= 1 # reconsume the input character
2381                 return new_character_token '<'
2382
2383         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2384         tok_state_rcdata_end_tag_open = ->
2385                 c = txt.charAt(cur++)
2386                 if is_uc_alpha(c)
2387                         tok_cur_tag = new_end_tag c.toLowerCase()
2388                         temporary_buffer += c
2389                         tok_state = tok_state_rcdata_end_tag_name
2390                         return null
2391                 if is_lc_alpha(c)
2392                         tok_cur_tag = new_end_tag c
2393                         temporary_buffer += c
2394                         tok_state = tok_state_rcdata_end_tag_name
2395                         return null
2396                 # Anything else
2397                 tok_state = tok_state_rcdata
2398                 cur -= 1 # reconsume the input character
2399                 return new_character_token "</" # fixfull separate these
2400
2401         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2402         is_appropriate_end_tag = (t) ->
2403                 # spec says to check against "the tag name of the last start tag to
2404                 # have been emitted from this tokenizer", but this is only called from
2405                 # the various "raw" states, which I'm pretty sure all push the start
2406                 # token onto open_els. TODO: verify this after the script data states
2407                 # are implemented
2408                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2409                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2410
2411         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2412         tok_state_rcdata_end_tag_name = ->
2413                 c = txt.charAt(cur++)
2414                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2415                         if is_appropriate_end_tag tok_cur_tag
2416                                 tok_state = tok_state_before_attribute_name
2417                                 return
2418                         # else fall through to "Anything else"
2419                 if c is '/'
2420                         if is_appropriate_end_tag tok_cur_tag
2421                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2422                                 return
2423                         # else fall through to "Anything else"
2424                 if c is '>'
2425                         if is_appropriate_end_tag tok_cur_tag
2426                                 tok_state = tok_state_data
2427                                 return tok_cur_tag
2428                         # else fall through to "Anything else"
2429                 if is_uc_alpha(c)
2430                         tok_cur_tag.name += c.toLowerCase()
2431                         temporary_buffer += c
2432                         return null
2433                 if is_lc_alpha(c)
2434                         tok_cur_tag.name += c
2435                         temporary_buffer += c
2436                         return null
2437                 # Anything else
2438                 tok_state = tok_state_rcdata
2439                 cur -= 1 # reconsume the input character
2440                 return new_character_token '</' + temporary_buffer # fixfull separate these
2441
2442         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2443         tok_state_rawtext_less_than_sign = ->
2444                 c = txt.charAt(cur++)
2445                 if c is '/'
2446                         temporary_buffer = ''
2447                         tok_state = tok_state_rawtext_end_tag_open
2448                         return null
2449                 # Anything else
2450                 tok_state = tok_state_rawtext
2451                 cur -= 1 # reconsume the input character
2452                 return new_character_token '<'
2453
2454         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2455         tok_state_rawtext_end_tag_open = ->
2456                 c = txt.charAt(cur++)
2457                 if is_uc_alpha(c)
2458                         tok_cur_tag = new_end_tag c.toLowerCase()
2459                         temporary_buffer += c
2460                         tok_state = tok_state_rawtext_end_tag_name
2461                         return null
2462                 if is_lc_alpha(c)
2463                         tok_cur_tag = new_end_tag c
2464                         temporary_buffer += c
2465                         tok_state = tok_state_rawtext_end_tag_name
2466                         return null
2467                 # Anything else
2468                 tok_state = tok_state_rawtext
2469                 cur -= 1 # reconsume the input character
2470                 return new_character_token "</" # fixfull separate these
2471
2472         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2473         tok_state_rawtext_end_tag_name = ->
2474                 c = txt.charAt(cur++)
2475                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2476                         if is_appropriate_end_tag tok_cur_tag
2477                                 tok_state = tok_state_before_attribute_name
2478                                 return
2479                         # else fall through to "Anything else"
2480                 if c is '/'
2481                         if is_appropriate_end_tag tok_cur_tag
2482                                 tok_state = tok_state_self_closing_start_tag
2483                                 return
2484                         # else fall through to "Anything else"
2485                 if c is '>'
2486                         if is_appropriate_end_tag tok_cur_tag
2487                                 tok_state = tok_state_data
2488                                 return tok_cur_tag
2489                         # else fall through to "Anything else"
2490                 if is_uc_alpha(c)
2491                         tok_cur_tag.name += c.toLowerCase()
2492                         temporary_buffer += c
2493                         return null
2494                 if is_lc_alpha(c)
2495                         tok_cur_tag.name += c
2496                         temporary_buffer += c
2497                         return null
2498                 # Anything else
2499                 tok_state = tok_state_rawtext
2500                 cur -= 1 # reconsume the input character
2501                 return new_character_token '</' + temporary_buffer # fixfull separate these
2502
2503         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2504         tok_state_script_data_less_than_sign = ->
2505                 c = txt.charAt(cur++)
2506                 if c is '/'
2507                         temporary_buffer = ''
2508                         tok_state = tok_state_script_data_end_tag_open
2509                         return
2510                 if c is '!'
2511                         tok_state = tok_state_script_data_escape_start
2512                         return new_character_token '<!' # fixfull split
2513                 # Anything else
2514                 tok_state = tok_state_script_data
2515                 cur -= 1 # Reconsume
2516                 return new_character_token '<'
2517
2518         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2519         tok_state_script_data_end_tag_open = ->
2520                 c = txt.charAt(cur++)
2521                 if is_uc_alpha(c)
2522                         tok_cur_tag = new_end_tag c.toLowerCase()
2523                         temporary_buffer += c
2524                         tok_state = tok_state_script_data_end_tag_name
2525                         return
2526                 if is_lc_alpha(c)
2527                         tok_cur_tag = new_end_tag c
2528                         temporary_buffer += c
2529                         tok_state = tok_state_script_data_end_tag_name
2530                         return
2531                 # Anything else
2532                 tok_state = tok_state_script_data
2533                 cur -= 1 # Reconsume
2534                 return new_character_token '</'
2535
2536         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2537         tok_state_script_data_end_tag_name = ->
2538                 c = txt.charAt(cur++)
2539                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2540                         if is_appropriate_end_tag tok_cur_tag
2541                                 tok_state = tok_state_before_attribute_name
2542                                 return
2543                         # fall through
2544                 if c is '/'
2545                         if is_appropriate_end_tag tok_cur_tag
2546                                 tok_state = tok_state_self_closing_start_tag
2547                                 return
2548                         # fall through
2549                 if is_uc_alpha(c)
2550                         tok_cur_tag.name += c.toLowerCase()
2551                         temporary_buffer += c
2552                         return
2553                 if is_lc_alpha(c)
2554                         tok_cur_tag.name += c
2555                         temporary_buffer += c
2556                         return
2557                 # Anything else
2558                 tok_state = tok_state_script_data
2559                 cur -= 1 # Reconsume
2560                 return new_character_token "</#{temporary_buffer}" # fixfull split
2561
2562         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
2563         tok_state_script_data_escape_start = ->
2564                 c = txt.charAt(cur++)
2565                 if c is '-'
2566                         tok_state = tok_state_script_data_escape_start_dash
2567                         return new_character_token '-'
2568                 # Anything else
2569                 tok_state = tok_state_script_data
2570                 cur -= 1 # Reconsume
2571                 return
2572
2573         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
2574         tok_state_script_data_escape_start_dash = ->
2575                 c = txt.charAt(cur++)
2576                 if c is '-'
2577                         tok_state = tok_state_script_data_escaped_dash_dash
2578                         return new_character_token '-'
2579                 # Anything else
2580                 tok_state = tok_state_script_data
2581                 cur -= 1 # Reconsume
2582                 return
2583
2584         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
2585         tok_state_script_data_escaped = ->
2586                 c = txt.charAt(cur++)
2587                 if c is '-'
2588                         tok_state = tok_state_script_data_escaped_dash
2589                         return new_character_token '-'
2590                 if c is '<'
2591                         tok_state = tok_state_script_data_escaped_less_than_sign
2592                         return
2593                 if c is "\u0000"
2594                         parse_error()
2595                         return new_character_token "\ufffd"
2596                 if c is '' # EOF
2597                         tok_state = tok_state_data
2598                         parse_error()
2599                         cur -= 1 # Reconsume
2600                         return
2601                 # Anything else
2602                 return new_character_token c
2603
2604         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
2605         tok_state_script_data_escaped_dash = ->
2606                 c = txt.charAt(cur++)
2607                 if c is '-'
2608                         tok_state = tok_state_script_data_escaped_dash_dash
2609                         return new_character_token '-'
2610                 if c is '<'
2611                         tok_state = tok_state_script_data_escaped_less_than_sign
2612                         return
2613                 if c is "\u0000"
2614                         parse_error()
2615                         tok_state = tok_state_script_data_escaped
2616                         return new_character_token "\ufffd"
2617                 if c is '' # EOF
2618                         tok_state = tok_state_data
2619                         parse_error()
2620                         cur -= 1 # Reconsume
2621                         return
2622                 # Anything else
2623                 tok_state = tok_state_script_data_escaped
2624                 return new_character_token c
2625
2626         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
2627         tok_state_script_data_escaped_dash_dash = ->
2628                 c = txt.charAt(cur++)
2629                 if c is '-'
2630                         return new_character_token '-'
2631                 if c is '<'
2632                         tok_state = tok_state_script_data_escaped_less_than_sign
2633                         return
2634                 if c is '>'
2635                         tok_state = tok_state_script_data
2636                         return new_character_token '>'
2637                 if c is "\u0000"
2638                         parse_error()
2639                         tok_state = tok_state_script_data_escaped
2640                         return new_character_token "\ufffd"
2641                 if c is '' # EOF
2642                         parse_error()
2643                         tok_state = tok_state_data
2644                         cur -= 1 # Reconsume
2645                         return
2646                 # Anything else
2647                 tok_state = tok_state_script_data_escaped
2648                 return new_character_token c
2649
2650         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
2651         tok_state_script_data_escaped_less_than_sign = ->
2652                 c = txt.charAt(cur++)
2653                 if c is '/'
2654                         temporary_buffer = ''
2655                         tok_state = tok_state_script_data_escaped_end_tag_open
2656                         return
2657                 if is_uc_alpha(c)
2658                         temporary_buffer = c.toLowerCase() # yes, really
2659                         tok_state = tok_state_script_data_double_escape_start
2660                         return new_character_token "<#{c}" # fixfull split
2661                 if is_lc_alpha(c)
2662                         temporary_buffer = c
2663                         tok_state = tok_state_script_data_double_escape_start
2664                         return new_character_token "<#{c}" # fixfull split
2665                 # Anything else
2666                 tok_state = tok_state_script_data_escaped
2667                 cur -= 1 # Reconsume
2668                 return new_character_token c
2669
2670         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
2671         tok_state_script_data_escaped_end_tag_open = ->
2672                 c = txt.charAt(cur++)
2673                 if is_uc_alpha(c)
2674                         tok_cur_tag = new_end_tag c.toLowerCase()
2675                         temporary_buffer += c
2676                         tok_state = tok_state_script_data_escaped_end_tag_name
2677                         return
2678                 if is_lc_alpha(c)
2679                         tok_cur_tag = new_end_tag c
2680                         temporary_buffer += c
2681                         tok_state = tok_state_script_data_escaped_end_tag_name
2682                         return
2683                 # Anything else
2684                 tok_state = tok_state_script_data_escaped
2685                 cur -= 1 # Reconsume
2686                 return new_character_token '</' # fixfull split
2687
2688         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
2689         tok_state_script_data_escaped_end_tag_name = ->
2690                 c = txt.charAt(cur++)
2691                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2692                         if is_appropriate_end_tag tok_cur_tag
2693                                 tok_state = tok_state_before_attribute_name
2694                                 return
2695                         # fall through
2696                 if c is '/'
2697                         if is_appropriate_end_tag tok_cur_tag
2698                                 tok_state = tok_state_self_closing_start_tag
2699                                 return
2700                         # fall through
2701                 if is_uc_alpha(c)
2702                         tok_cur_tag.name += c.toLowerCase()
2703                         temporary_buffer += c.toLowerCase()
2704                         return
2705                 if is_lc_alpha(c)
2706                         tok_cur_tag.name += c
2707                         temporary_buffer += c.toLowerCase()
2708                         return
2709                 # Anything else
2710                 tok_state = tok_state_script_data_escaped
2711                 cur -= 1 # Reconsume
2712                 return new_character_token "</#{temporary_buffer}" # fixfull split
2713
2714         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
2715         tok_state_script_data_double_escape_start = ->
2716                 c = txt.charAt(cur++)
2717                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2718                         if temporary_buffer is 'script'
2719                                 tok_state = tok_state_script_data_double_escaped
2720                         else
2721                                 tok_state = tok_state_script_data_escaped
2722                         return new_character_token c
2723                 if is_uc_alpha(c)
2724                         temporary_buffer += c.toLowerCase() # yes, really lowercase
2725                         return new_character_token c
2726                 if is_lc_alpha(c)
2727                         temporary_buffer += c
2728                         return new_character_token c
2729                 # Anything else
2730                 tok_state = tok_state_script_data_escaped
2731                 cur -= 1 # Reconsume
2732                 return
2733
2734         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
2735         tok_state_script_data_double_escaped = ->
2736                 c = txt.charAt(cur++)
2737                 if c is '-'
2738                         tok_state = tok_state_script_data_double_escaped_dash
2739                         return new_character_token '-'
2740                 if c is '<'
2741                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2742                         return new_character_token '<'
2743                 if c is "\u0000"
2744                         parse_error()
2745                         return new_character_token "\ufffd"
2746                 if c is '' # EOF
2747                         parse_error()
2748                         tok_state = tok_state_data
2749                         cur -= 1 # Reconsume
2750                         return
2751                 # Anything else
2752                 return new_character_token c
2753
2754         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
2755         tok_state_script_data_double_escaped_dash = ->
2756                 c = txt.charAt(cur++)
2757                 if c is '-'
2758                         tok_state = tok_state_script_data_double_escaped_dash_dash
2759                         return new_character_token '-'
2760                 if c is '<'
2761                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2762                         return new_character_token '<'
2763                 if c is "\u0000"
2764                         parse_error()
2765                         tok_state = tok_state_script_data_double_escaped
2766                         return new_character_token "\ufffd"
2767                 if c is '' # EOF
2768                         parse_error()
2769                         tok_state = tok_state_data
2770                         cur -= 1 # Reconsume
2771                         return
2772                 # Anything else
2773                 tok_state = tok_state_script_data_double_escaped
2774                 return new_character_token c
2775
2776         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
2777         tok_state_script_data_double_escaped_dash_dash = ->
2778                 c = txt.charAt(cur++)
2779                 if c is '-'
2780                         return new_character_token '-'
2781                 if c is '<'
2782                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2783                         return new_character_token '<'
2784                 if c is '>'
2785                         tok_state = tok_state_script_data
2786                         return new_character_token '>'
2787                 if c is "\u0000"
2788                         parse_error()
2789                         tok_state = tok_state_script_data_double_escaped
2790                         return new_character_token "\ufffd"
2791                 if c is '' # EOF
2792                         parse_error()
2793                         tok_state = tok_state_data
2794                         cur -= 1 # Reconsume
2795                         return
2796                 # Anything else
2797                 tok_state = tok_state_script_data_double_escaped
2798                 return new_character_token c
2799
2800         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
2801         tok_state_script_data_double_escaped_less_than_sign = ->
2802                 c = txt.charAt(cur++)
2803                 if c is '/'
2804                         temporary_buffer = ''
2805                         tok_state = tok_state_script_data_double_escape_end
2806                         return new_character_token '/'
2807                 # Anything else
2808                 tok_state = tok_state_script_data_double_escaped
2809                 cur -= 1 # Reconsume
2810                 return
2811
2812         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
2813         tok_state_script_data_double_escape_end = ->
2814                 c = txt.charAt(cur++)
2815                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2816                         if temporary_buffer is 'script'
2817                                 tok_state = tok_state_script_data_escaped
2818                         else
2819                                 tok_state = tok_state_script_data_double_escaped
2820                         return new_character_token c
2821                 if is_uc_alpha(c)
2822                         temporary_buffer += c.toLowerCase() # yes, really lowercase
2823                         return new_character_token c
2824                 if is_lc_alpha(c)
2825                         temporary_buffer += c
2826                         return new_character_token c
2827                 # Anything else
2828                 tok_state = tok_state_script_data_double_escaped
2829                 cur -= 1 # Reconsume
2830                 return
2831
2832         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2833         tok_state_before_attribute_name = ->
2834                 attr_name = null
2835                 switch c = txt.charAt(cur++)
2836                         when "\t", "\n", "\u000c", ' '
2837                                 return null
2838                         when '/'
2839                                 tok_state = tok_state_self_closing_start_tag
2840                                 return null
2841                         when '>'
2842                                 tok_state = tok_state_data
2843                                 tmp = tok_cur_tag
2844                                 tok_cur_tag = null
2845                                 return tmp
2846                         when "\u0000"
2847                                 parse_error()
2848                                 attr_name = "\ufffd"
2849                         when '"', "'", '<', '='
2850                                 parse_error()
2851                                 attr_name = c
2852                         when '' # EOF
2853                                 parse_error()
2854                                 tok_state = tok_state_data
2855                         else
2856                                 if is_uc_alpha(c)
2857                                         attr_name = c.toLowerCase()
2858                                 else
2859                                         attr_name = c
2860                 if attr_name?
2861                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2862                         tok_state = tok_state_attribute_name
2863                 return null
2864
2865         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2866         tok_state_attribute_name = ->
2867                 switch c = txt.charAt(cur++)
2868                         when "\t", "\n", "\u000c", ' '
2869                                 tok_state = tok_state_after_attribute_name
2870                         when '/'
2871                                 tok_state = tok_state_self_closing_start_tag
2872                         when '='
2873                                 tok_state = tok_state_before_attribute_value
2874                         when '>'
2875                                 tok_state = tok_state_data
2876                                 tmp = tok_cur_tag
2877                                 tok_cur_tag = null
2878                                 return tmp
2879                         when "\u0000"
2880                                 parse_error()
2881                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2882                         when '"', "'", '<'
2883                                 parse_error()
2884                                 tok_cur_tag.attrs_a[0][0] = c
2885                         when '' # EOF
2886                                 parse_error()
2887                                 tok_state = tok_state_data
2888                         else
2889                                 if is_uc_alpha(c)
2890                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2891                                 else
2892                                         tok_cur_tag.attrs_a[0][0] += c
2893                 return null
2894
2895         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2896         tok_state_after_attribute_name = ->
2897                 c = txt.charAt(cur++)
2898                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2899                         return
2900                 if c is '/'
2901                         tok_state = tok_state_self_closing_start_tag
2902                         return
2903                 if c is '='
2904                         tok_state = tok_state_before_attribute_value
2905                         return
2906                 if c is '>'
2907                         tok_state = tok_state_data
2908                         return
2909                 if is_uc_alpha(c)
2910                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2911                         tok_state = tok_state_attribute_name
2912                         return
2913                 if c is "\u0000"
2914                         parse_error()
2915                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2916                         tok_state = tok_state_attribute_name
2917                         return
2918                 if c is '' # EOF
2919                         parse_error()
2920                         tok_state = tok_state_data
2921                         cur -= 1 # reconsume
2922                         return
2923                 if c is '"' or c is "'" or c is '<'
2924                         parse_error()
2925                         # fall through to Anything else
2926                 # Anything else
2927                 tok_cur_tag.attrs_a.unshift [c, '']
2928                 tok_state = tok_state_attribute_name
2929
2930         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2931         tok_state_before_attribute_value = ->
2932                 switch c = txt.charAt(cur++)
2933                         when "\t", "\n", "\u000c", ' '
2934                                 return null
2935                         when '"'
2936                                 tok_state = tok_state_attribute_value_double_quoted
2937                         when '&'
2938                                 tok_state = tok_state_attribute_value_unquoted
2939                                 cur -= 1
2940                         when "'"
2941                                 tok_state = tok_state_attribute_value_single_quoted
2942                         when "\u0000"
2943                                 # Parse error
2944                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2945                                 tok_state = tok_state_attribute_value_unquoted
2946                         when '>'
2947                                 # Parse error
2948                                 tok_state = tok_state_data
2949                                 tmp = tok_cur_tag
2950                                 tok_cur_tag = null
2951                                 return tmp
2952                         when '' # EOF
2953                                 parse_error()
2954                                 tok_state = tok_state_data
2955                         else
2956                                 tok_cur_tag.attrs_a[0][1] += c
2957                                 tok_state = tok_state_attribute_value_unquoted
2958                 return null
2959
2960         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2961         tok_state_attribute_value_double_quoted = ->
2962                 switch c = txt.charAt(cur++)
2963                         when '"'
2964                                 tok_state = tok_state_after_attribute_value_quoted
2965                         when '&'
2966                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2967                         when "\u0000"
2968                                 # Parse error
2969                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2970                         when '' # EOF
2971                                 parse_error()
2972                                 tok_state = tok_state_data
2973                         else
2974                                 tok_cur_tag.attrs_a[0][1] += c
2975                 return null
2976
2977         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2978         tok_state_attribute_value_single_quoted = ->
2979                 switch c = txt.charAt(cur++)
2980                         when "'"
2981                                 tok_state = tok_state_after_attribute_value_quoted
2982                         when '&'
2983                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2984                         when "\u0000"
2985                                 # Parse error
2986                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2987                         when '' # EOF
2988                                 parse_error()
2989                                 tok_state = tok_state_data
2990                         else
2991                                 tok_cur_tag.attrs_a[0][1] += c
2992                 return null
2993
2994         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2995         tok_state_attribute_value_unquoted = ->
2996                 switch c = txt.charAt(cur++)
2997                         when "\t", "\n", "\u000c", ' '
2998                                 tok_state = tok_state_before_attribute_name
2999                         when '&'
3000                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3001                         when '>'
3002                                 tok_state = tok_state_data
3003                                 tmp = tok_cur_tag
3004                                 tok_cur_tag = null
3005                                 return tmp
3006                         when "\u0000"
3007                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3008                         when '' # EOF
3009                                 parse_error()
3010                                 tok_state = tok_state_data
3011                         else
3012                                 # Parse Error if ', <, = or ` (backtick)
3013                                 tok_cur_tag.attrs_a[0][1] += c
3014                 return null
3015
3016         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3017         tok_state_after_attribute_value_quoted = ->
3018                 switch c = txt.charAt(cur++)
3019                         when "\t", "\n", "\u000c", ' '
3020                                 tok_state = tok_state_before_attribute_name
3021                         when '/'
3022                                 tok_state = tok_state_self_closing_start_tag
3023                         when '>'
3024                                 tok_state = tok_state_data
3025                                 tmp = tok_cur_tag
3026                                 tok_cur_tag = null
3027                                 return tmp
3028                         when '' # EOF
3029                                 parse_error()
3030                                 tok_state = tok_state_data
3031                         else
3032                                 # Parse Error
3033                                 tok_state = tok_state_before_attribute_name
3034                                 cur -= 1 # we didn't handle that char
3035                 return null
3036
3037         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3038         tok_state_self_closing_start_tag = ->
3039                 c = txt.charAt(cur++)
3040                 if c is '>'
3041                         tok_cur_tag.flag 'self-closing'
3042                         tok_state = tok_state_data
3043                         return tok_cur_tag
3044                 if c is ''
3045                         parse_error()
3046                         tok_state = tok_state_data
3047                         cur -= 1 # Reconsume
3048                         return
3049                 # Anything else
3050                 parse_error()
3051                 tok_state = tok_state_before_attribute_name
3052                 cur -= 1 # Reconsume
3053                 return
3054
3055         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3056         # WARNING: put a comment token in tok_cur_tag before setting this state
3057         tok_state_bogus_comment = ->
3058                 next_gt = txt.indexOf '>', cur
3059                 if next_gt is -1
3060                         val = txt.substr cur
3061                         cur = txt.length
3062                 else
3063                         val = txt.substr cur, (next_gt - cur)
3064                         cur = next_gt + 1
3065                 val = val.replace "\u0000", "\ufffd"
3066                 tok_cur_tag.text += val
3067                 tok_state = tok_state_data
3068                 return tok_cur_tag
3069
3070         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3071         tok_state_markup_declaration_open = ->
3072                 if txt.substr(cur, 2) is '--'
3073                         cur += 2
3074                         tok_cur_tag = new_comment_token ''
3075                         tok_state = tok_state_comment_start
3076                         return
3077                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3078                         cur += 7
3079                         tok_state = tok_state_doctype
3080                         return
3081                 acn = adjusted_current_node()
3082                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3083                         cur += 7
3084                         tok_state = tok_state_cdata_section
3085                         return
3086                 # Otherwise
3087                 parse_error()
3088                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3089                 tok_state = tok_state_bogus_comment
3090                 return
3091
3092         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3093         tok_state_comment_start = ->
3094                 switch c = txt.charAt(cur++)
3095                         when '-'
3096                                 tok_state = tok_state_comment_start_dash
3097                         when "\u0000"
3098                                 parse_error()
3099                                 return new_character_token "\ufffd"
3100                         when '>'
3101                                 parse_error()
3102                                 tok_state = tok_state_data
3103                                 return tok_cur_tag
3104                         when '' # EOF
3105                                 parse_error()
3106                                 tok_state = tok_state_data
3107                                 cur -= 1 # Reconsume
3108                                 return tok_cur_tag
3109                         else
3110                                 tok_cur_tag.text += c
3111                 return null
3112
3113         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3114         tok_state_comment_start_dash = ->
3115                 switch c = txt.charAt(cur++)
3116                         when '-'
3117                                 tok_state = tok_state_comment_end
3118                         when "\u0000"
3119                                 parse_error()
3120                                 tok_cur_tag.text += "-\ufffd"
3121                                 tok_state = tok_state_comment
3122                         when '>'
3123                                 parse_error()
3124                                 tok_state = tok_state_data
3125                                 return tok_cur_tag
3126                         when '' # EOF
3127                                 parse_error()
3128                                 tok_state = tok_state_data
3129                                 cur -= 1 # Reconsume
3130                                 return tok_cur_tag
3131                         else
3132                                 tok_cur_tag.text += "-#{c}"
3133                                 tok_state = tok_state_comment
3134                 return null
3135
3136         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3137         tok_state_comment = ->
3138                 switch c = txt.charAt(cur++)
3139                         when '-'
3140                                 tok_state = tok_state_comment_end_dash
3141                         when "\u0000"
3142                                 parse_error()
3143                                 tok_cur_tag.text += "\ufffd"
3144                         when '' # EOF
3145                                 parse_error()
3146                                 tok_state = tok_state_data
3147                                 cur -= 1 # Reconsume
3148                                 return tok_cur_tag
3149                         else
3150                                 tok_cur_tag.text += c
3151                 return null
3152
3153         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3154         tok_state_comment_end_dash = ->
3155                 switch c = txt.charAt(cur++)
3156                         when '-'
3157                                 tok_state = tok_state_comment_end
3158                         when "\u0000"
3159                                 parse_error()
3160                                 tok_cur_tag.text += "-\ufffd"
3161                                 tok_state = tok_state_comment
3162                         when '' # EOF
3163                                 parse_error()
3164                                 tok_state = tok_state_data
3165                                 cur -= 1 # Reconsume
3166                                 return tok_cur_tag
3167                         else
3168                                 tok_cur_tag.text += "-#{c}"
3169                                 tok_state = tok_state_comment
3170                 return null
3171
3172         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3173         tok_state_comment_end = ->
3174                 switch c = txt.charAt(cur++)
3175                         when '>'
3176                                 tok_state = tok_state_data
3177                                 return tok_cur_tag
3178                         when "\u0000"
3179                                 parse_error()
3180                                 tok_cur_tag.text += "--\ufffd"
3181                                 tok_state = tok_state_comment
3182                         when '!'
3183                                 parse_error()
3184                                 tok_state = tok_state_comment_end_bang
3185                         when '-'
3186                                 parse_error()
3187                                 tok_cur_tag.text += '-'
3188                         when '' # EOF
3189                                 parse_error()
3190                                 tok_state = tok_state_data
3191                                 cur -= 1 # Reconsume
3192                                 return tok_cur_tag
3193                         else
3194                                 parse_error()
3195                                 tok_cur_tag.text += "--#{c}"
3196                                 tok_state = tok_state_comment
3197                 return null
3198
3199         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3200         tok_state_comment_end_bang = ->
3201                 switch c = txt.charAt(cur++)
3202                         when '-'
3203                                 tok_cur_tag.text += "--!#{c}"
3204                                 tok_state = tok_state_comment_end_dash
3205                         when '>'
3206                                 tok_state = tok_state_data
3207                                 return tok_cur_tag
3208                         when "\u0000"
3209                                 parse_error()
3210                                 tok_cur_tag.text += "--!\ufffd"
3211                                 tok_state = tok_state_comment
3212                         when '' # EOF
3213                                 parse_error()
3214                                 tok_state = tok_state_data
3215                                 cur -= 1 # Reconsume
3216                                 return tok_cur_tag
3217                         else
3218                                 tok_cur_tag.text += "--!#{c}"
3219                                 tok_state = tok_state_comment
3220                 return null
3221
3222         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3223         tok_state_doctype = ->
3224                 switch c = txt.charAt(cur++)
3225                         when "\t", "\u000a", "\u000c", ' '
3226                                 tok_state = tok_state_before_doctype_name
3227                         when '' # EOF
3228                                 parse_error()
3229                                 tok_state = tok_state_data
3230                                 el = new_doctype_token ''
3231                                 el.flag 'force-quirks', true
3232                                 cur -= 1 # Reconsume
3233                                 return el
3234                         else
3235                                 parse_error()
3236                                 tok_state = tok_state_before_doctype_name
3237                                 cur -= 1 # Reconsume
3238                 return null
3239
3240         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3241         tok_state_before_doctype_name = ->
3242                 c = txt.charAt(cur++)
3243                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3244                         return
3245                 if is_uc_alpha(c)
3246                         tok_cur_tag = new_doctype_token c.toLowerCase()
3247                         tok_state = tok_state_doctype_name
3248                         return
3249                 if c is "\u0000"
3250                         parse_error()
3251                         tok_cur_tag = new_doctype_token "\ufffd"
3252                         tok_state = tok_state_doctype_name
3253                         return
3254                 if c is '>'
3255                         parse_error()
3256                         el = new_doctype_token ''
3257                         el.flag 'force-quirks', true
3258                         tok_state = tok_state_data
3259                         return el
3260                 if c is '' # EOF
3261                         parse_error()
3262                         tok_state = tok_state_data
3263                         el = new_doctype_token ''
3264                         el.flag 'force-quirks', true
3265                         cur -= 1 # Reconsume
3266                         return el
3267                 # Anything else
3268                 tok_cur_tag = new_doctype_token c
3269                 tok_state = tok_state_doctype_name
3270                 return null
3271
3272         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3273         tok_state_doctype_name = ->
3274                 c = txt.charAt(cur++)
3275                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3276                         tok_state = tok_state_after_doctype_name
3277                         return
3278                 if c is '>'
3279                         tok_state = tok_state_data
3280                         return tok_cur_tag
3281                 if is_uc_alpha(c)
3282                         tok_cur_tag.name += c.toLowerCase()
3283                         return
3284                 if c is "\u0000"
3285                         parse_error()
3286                         tok_cur_tag.name += "\ufffd"
3287                         return
3288                 if c is '' # EOF
3289                         parse_error()
3290                         tok_state = tok_state_data
3291                         tok_cur_tag.flag 'force-quirks', true
3292                         cur -= 1 # Reconsume
3293                         return tok_cur_tag
3294                 # Anything else
3295                 tok_cur_tag.name += c
3296                 return null
3297
3298         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3299         tok_state_after_doctype_name = ->
3300                 c = txt.charAt(cur++)
3301                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3302                         return
3303                 if c is '>'
3304                         tok_state = tok_state_data
3305                         return tok_cur_tag
3306                 if c is '' # EOF
3307                         parse_error()
3308                         tok_state = tok_state_data
3309                         tok_cur_tag.flag 'force-quirks', true
3310                         cur -= 1 # Reconsume
3311                         return tok_cur_tag
3312                 # Anything else
3313                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3314                         cur += 5
3315                         tok_state = tok_state_after_doctype_public_keyword
3316                         return
3317                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3318                         cur += 5
3319                         tok_state = tok_state_after_doctype_system_keyword
3320                         return
3321                 parse_error()
3322                 tok_cur_tag.flag 'force-quirks', true
3323                 tok_state = tok_state_bogus_doctype
3324                 return null
3325
3326         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3327         tok_state_after_doctype_public_keyword = ->
3328                 c = txt.charAt(cur++)
3329                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3330                         tok_state = tok_state_before_doctype_public_identifier
3331                         return
3332                 if c is '"'
3333                         parse_error()
3334                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3335                         tok_state = tok_state_doctype_public_identifier_double_quoted
3336                         return
3337                 if c is "'"
3338                         parse_error()
3339                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3340                         tok_state = tok_state_doctype_public_identifier_single_quoted
3341                         return
3342                 if c is '>'
3343                         parse_error()
3344                         tok_cur_tag.flag 'force-quirks', true
3345                         tok_state = tok_state_data
3346                         return tok_cur_tag
3347                 if c is '' # EOF
3348                         parse_error()
3349                         tok_state = tok_state_data
3350                         tok_cur_tag.flag 'force-quirks', true
3351                         cur -= 1 # Reconsume
3352                         return tok_cur_tag
3353                 # Anything else
3354                 parse_error()
3355                 tok_cur_tag.flag 'force-quirks', true
3356                 tok_state = tok_state_bogus_doctype
3357                 return null
3358
3359         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3360         tok_state_before_doctype_public_identifier = ->
3361                 c = txt.charAt(cur++)
3362                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3363                         return
3364                 if c is '"'
3365                         parse_error()
3366                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3367                         tok_state = tok_state_doctype_public_identifier_double_quoted
3368                         return
3369                 if c is "'"
3370                         parse_error()
3371                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3372                         tok_state = tok_state_doctype_public_identifier_single_quoted
3373                         return
3374                 if c is '>'
3375                         parse_error()
3376                         tok_cur_tag.flag 'force-quirks', true
3377                         tok_state = tok_state_data
3378                         return tok_cur_tag
3379                 if c is '' # EOF
3380                         parse_error()
3381                         tok_state = tok_state_data
3382                         tok_cur_tag.flag 'force-quirks', true
3383                         cur -= 1 # Reconsume
3384                         return tok_cur_tag
3385                 # Anything else
3386                 parse_error()
3387                 tok_cur_tag.flag 'force-quirks', true
3388                 tok_state = tok_state_bogus_doctype
3389                 return null
3390
3391
3392         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3393         tok_state_doctype_public_identifier_double_quoted = ->
3394                 c = txt.charAt(cur++)
3395                 if c is '"'
3396                         tok_state = tok_state_after_doctype_public_identifier
3397                         return
3398                 if c is "\u0000"
3399                         parse_error()
3400                         tok_cur_tag.public_identifier += "\ufffd"
3401                         return
3402                 if c is '>'
3403                         parse_error()
3404                         tok_cur_tag.flag 'force-quirks', true
3405                         tok_state = tok_state_data
3406                         return tok_cur_tag
3407                 if c is '' # EOF
3408                         parse_error()
3409                         tok_state = tok_state_data
3410                         tok_cur_tag.flag 'force-quirks', true
3411                         cur -= 1 # Reconsume
3412                         return tok_cur_tag
3413                 # Anything else
3414                 tok_cur_tag.public_identifier += c
3415                 return null
3416
3417         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3418         tok_state_doctype_public_identifier_single_quoted = ->
3419                 c = txt.charAt(cur++)
3420                 if c is "'"
3421                         tok_state = tok_state_after_doctype_public_identifier
3422                         return
3423                 if c is "\u0000"
3424                         parse_error()
3425                         tok_cur_tag.public_identifier += "\ufffd"
3426                         return
3427                 if c is '>'
3428                         parse_error()
3429                         tok_cur_tag.flag 'force-quirks', true
3430                         tok_state = tok_state_data
3431                         return tok_cur_tag
3432                 if c is '' # EOF
3433                         parse_error()
3434                         tok_state = tok_state_data
3435                         tok_cur_tag.flag 'force-quirks', true
3436                         cur -= 1 # Reconsume
3437                         return tok_cur_tag
3438                 # Anything else
3439                 tok_cur_tag.public_identifier += c
3440                 return null
3441
3442         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3443         tok_state_after_doctype_public_identifier = ->
3444                 c = txt.charAt(cur++)
3445                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3446                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3447                         return
3448                 if c is '>'
3449                         tok_state = tok_state_data
3450                         return tok_cur_tag
3451                 if c is '"'
3452                         parse_error()
3453                         tok_cur_tag.system_identifier = ''
3454                         tok_state = tok_state_doctype_system_identifier_double_quoted
3455                         return
3456                 if c is "'"
3457                         parse_error()
3458                         tok_cur_tag.system_identifier = ''
3459                         tok_state = tok_state_doctype_system_identifier_single_quoted
3460                         return
3461                 if c is '' # EOF
3462                         parse_error()
3463                         tok_state = tok_state_data
3464                         tok_cur_tag.flag 'force-quirks', true
3465                         cur -= 1 # Reconsume
3466                         return tok_cur_tag
3467                 # Anything else
3468                 parse_error()
3469                 tok_cur_tag.flag 'force-quirks', true
3470                 tok_state = tok_state_bogus_doctype
3471                 return null
3472
3473         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3474         tok_state_between_doctype_public_and_system_identifiers = ->
3475                 c = txt.charAt(cur++)
3476                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3477                         return
3478                 if c is '>'
3479                         tok_state = tok_state_data
3480                         return tok_cur_tag
3481                 if c is '"'
3482                         parse_error()
3483                         tok_cur_tag.system_identifier = ''
3484                         tok_state = tok_state_doctype_system_identifier_double_quoted
3485                         return
3486                 if c is "'"
3487                         parse_error()
3488                         tok_cur_tag.system_identifier = ''
3489                         tok_state = tok_state_doctype_system_identifier_single_quoted
3490                         return
3491                 if c is '' # EOF
3492                         parse_error()
3493                         tok_state = tok_state_data
3494                         tok_cur_tag.flag 'force-quirks', true
3495                         cur -= 1 # Reconsume
3496                         return tok_cur_tag
3497                 # Anything else
3498                 parse_error()
3499                 tok_cur_tag.flag 'force-quirks', true
3500                 tok_state = tok_state_bogus_doctype
3501                 return null
3502
3503         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3504         tok_state_after_doctype_system_keyword = ->
3505                 c = txt.charAt(cur++)
3506                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3507                         tok_state = tok_state_before_doctype_system_identifier
3508                         return
3509                 if c is '"'
3510                         parse_error()
3511                         tok_cur_tag.system_identifier = ''
3512                         tok_state = tok_state_doctype_system_identifier_double_quoted
3513                         return
3514                 if c is "'"
3515                         parse_error()
3516                         tok_cur_tag.system_identifier = ''
3517                         tok_state = tok_state_doctype_system_identifier_single_quoted
3518                         return
3519                 if c is '>'
3520                         parse_error()
3521                         tok_cur_tag.flag 'force-quirks', true
3522                         tok_state = tok_state_data
3523                         return tok_cur_tag
3524                 if c is '' # EOF
3525                         parse_error()
3526                         tok_state = tok_state_data
3527                         tok_cur_tag.flag 'force-quirks', true
3528                         cur -= 1 # Reconsume
3529                         return tok_cur_tag
3530                 # Anything else
3531                 parse_error()
3532                 tok_cur_tag.flag 'force-quirks', true
3533                 tok_state = tok_state_bogus_doctype
3534                 return null
3535
3536         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3537         tok_state_before_doctype_system_identifier = ->
3538                 c = txt.charAt(cur++)
3539                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3540                         return
3541                 if c is '"'
3542                         tok_cur_tag.system_identifier = ''
3543                         tok_state = tok_state_doctype_system_identifier_double_quoted
3544                         return
3545                 if c is "'"
3546                         tok_cur_tag.system_identifier = ''
3547                         tok_state = tok_state_doctype_system_identifier_single_quoted
3548                         return
3549                 if c is '>'
3550                         parse_error()
3551                         tok_cur_tag.flag 'force-quirks', true
3552                         tok_state = tok_state_data
3553                         return tok_cur_tag
3554                 if c is '' # EOF
3555                         parse_error()
3556                         tok_state = tok_state_data
3557                         tok_cur_tag.flag 'force-quirks', true
3558                         cur -= 1 # Reconsume
3559                         return tok_cur_tag
3560                 # Anything else
3561                 parse_error()
3562                 tok_cur_tag.flag 'force-quirks', true
3563                 tok_state = tok_state_bogus_doctype
3564                 return null
3565
3566         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3567         tok_state_doctype_system_identifier_double_quoted = ->
3568                 c = txt.charAt(cur++)
3569                 if c is '"'
3570                         tok_state = tok_state_after_doctype_system_identifier
3571                         return
3572                 if c is "\u0000"
3573                         parse_error()
3574                         tok_cur_tag.system_identifier += "\ufffd"
3575                         return
3576                 if c is '>'
3577                         parse_error()
3578                         tok_cur_tag.flag 'force-quirks', true
3579                         tok_state = tok_state_data
3580                         return tok_cur_tag
3581                 if c is '' # EOF
3582                         parse_error()
3583                         tok_state = tok_state_data
3584                         tok_cur_tag.flag 'force-quirks', true
3585                         cur -= 1 # Reconsume
3586                         return tok_cur_tag
3587                 # Anything else
3588                 tok_cur_tag.system_identifier += c
3589                 return null
3590
3591         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3592         tok_state_doctype_system_identifier_single_quoted = ->
3593                 c = txt.charAt(cur++)
3594                 if c is "'"
3595                         tok_state = tok_state_after_doctype_system_identifier
3596                         return
3597                 if c is "\u0000"
3598                         parse_error()
3599                         tok_cur_tag.system_identifier += "\ufffd"
3600                         return
3601                 if c is '>'
3602                         parse_error()
3603                         tok_cur_tag.flag 'force-quirks', true
3604                         tok_state = tok_state_data
3605                         return tok_cur_tag
3606                 if c is '' # EOF
3607                         parse_error()
3608                         tok_state = tok_state_data
3609                         tok_cur_tag.flag 'force-quirks', true
3610                         cur -= 1 # Reconsume
3611                         return tok_cur_tag
3612                 # Anything else
3613                 tok_cur_tag.system_identifier += c
3614                 return null
3615
3616         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3617         tok_state_after_doctype_system_identifier = ->
3618                 c = txt.charAt(cur++)
3619                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3620                         return
3621                 if c is '>'
3622                         tok_state = tok_state_data
3623                         return tok_cur_tag
3624                 if c is '' # EOF
3625                         parse_error()
3626                         tok_state = tok_state_data
3627                         tok_cur_tag.flag 'force-quirks', true
3628                         cur -= 1 # Reconsume
3629                         return tok_cur_tag
3630                 # Anything else
3631                 parse_error()
3632                 # do _not_ tok_cur_tag.flag 'force-quirks', true
3633                 tok_state = tok_state_bogus_doctype
3634                 return null
3635
3636         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3637         tok_state_bogus_doctype = ->
3638                 c = txt.charAt(cur++)
3639                 if c is '>'
3640                         tok_state = tok_state_data
3641                         return tok_cur_tag
3642                 if c is '' # EOF
3643                         tok_state = tok_state_data
3644                         cur -= 1 # Reconsume
3645                         return tok_cur_tag
3646                 # Anything else
3647                 return null
3648
3649
3650         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3651         # Don't set this as a state, just call it
3652         # returns a string (NOT a text node)
3653         parse_character_reference = (allowed_char = null, in_attr = false) ->
3654                 if cur >= txt.length
3655                         return '&'
3656                 switch c = txt.charAt(cur)
3657                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3658                                 # explicitly not a parse error
3659                                 return '&'
3660                         when ';'
3661                                 # there has to be "one or more" alnums between & and ; to be a parse error
3662                                 return '&'
3663                         when '#'
3664                                 if cur + 1 >= txt.length
3665                                         return '&'
3666                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
3667                                         prefix = '#x'
3668                                         charset = hex_chars
3669                                         start = cur + 2
3670                                 else
3671                                         charset = digits
3672                                         start = cur + 1
3673                                         prefix = '#'
3674                                 i = 0
3675                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3676                                         i += 1
3677                                 if i is 0
3678                                         return '&'
3679                                 if txt.charAt(start + i) is ';'
3680                                         i += 1
3681                                 # FIXME This is supposed to generate parse errors for some chars
3682                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3683                                 if decoded?
3684                                         cur = start + i
3685                                         return decoded
3686                                 return '&'
3687                         else
3688                                 for i in [0...31]
3689                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
3690                                                 break
3691                                 if i is 0
3692                                         # exit early, because parse_error() below needs at least one alnum
3693                                         return '&'
3694                                 if txt.charAt(cur + i) is ';'
3695                                         i += 1 # include ';' terminator in value
3696                                         decoded = decode_named_char_ref txt.substr(cur, i)
3697                                         if decoded?
3698                                                 cur += i
3699                                                 return decoded
3700                                         parse_error()
3701                                         return '&'
3702                                 else
3703                                         # no ';' terminator (only legacy char refs)
3704                                         max = i
3705                                         for i in [2..max] # no prefix matches, so ok to check shortest first
3706                                                 c = legacy_char_refs[txt.substr(cur, i)]
3707                                                 if c?
3708                                                         if in_attr
3709                                                                 if txt.charAt(cur + i) is '='
3710                                                                         # "because some legacy user agents will
3711                                                                         # misinterpret the markup in those cases"
3712                                                                         parse_error()
3713                                                                         return '&'
3714                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
3715                                                                         # this makes attributes forgiving about url args
3716                                                                         return '&'
3717                                                         # ok, and besides the weird exceptions for attributes...
3718                                                         # return the matching char
3719                                                         cur += i # consume entity chars
3720                                                         parse_error() # because no terminating ";"
3721                                                         return c
3722                                         parse_error()
3723                                         return '&'
3724                 return # never reached
3725
3726         # tree constructor initialization
3727         # see comments on TYPE_TAG/etc for the structure of this data
3728         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3729         open_els = []
3730         afe = [] # active formatting elements
3731         template_insertion_modes = []
3732         insertion_mode = ins_mode_initial
3733         original_insertion_mode = insertion_mode # TODO check spec
3734         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3735         flag_frameset_ok = true
3736         flag_parsing = true
3737         flag_foster_parenting = false
3738         form_element_pointer = null
3739         temporary_buffer = null
3740         pending_table_character_tokens = []
3741         head_element_pointer = null
3742         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3743         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3744
3745         # tokenizer initialization
3746         tok_state = tok_state_data
3747
3748         # proccess input
3749         while flag_parsing
3750                 t = tok_state()
3751                 if t?
3752                         insertion_mode t
3753                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
3754         return doc.children
3755
3756 serialize_els = (els, shallow, show_ids) ->
3757         serialized = ''
3758         sep = ''
3759         for t in els
3760                 serialized += sep
3761                 sep = ','
3762                 serialized += t.serialize shallow, show_ids
3763         return serialized
3764
3765 # TODO export TYPE_*
3766 module.exports.parse_html = parse_html
3767 module.exports.debug_log_reset = debug_log_reset
3768 module.exports.debug_log_each = debug_log_each
3769 module.exports.TYPE_TAG = TYPE_TAG
3770 module.exports.TYPE_TEXT = TYPE_TEXT
3771 module.exports.TYPE_COMMENT = TYPE_COMMENT
3772 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE