JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
doctypes: parsing, tests pass
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
# Browser bootstrap: when there is no CommonJS `module.exports` to attach to,
# create `window.wheic` and point a stand-in `module` object at it.
if not (module?.exports?)
        window.wheic = {}
        module = {exports: window.wheic}
56
57 # Each node is an object of the Node class. Here are the Node types:
# Node types that can end up in the finished tree:
#   TYPE_TAG      name, {attributes}, [children]
#   TYPE_TEXT     "text"
#   TYPE_COMMENT
#   TYPE_DOCTYPE
[TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE] = [0, 1, 2, 3]
# Types emitted by the tokenizer that should not end up in the tree:
#   TYPE_START_TAG  name, [attributes ([key,value]...) in reverse order], [children]
#   TYPE_END_TAG    name
#   TYPE_EOF
[TYPE_START_TAG, TYPE_END_TAG, TYPE_EOF] = [4, 5, 6]
# Bookkeeping entries:
#   TYPE_AFE_MARKER    http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
#   TYPE_AAA_BOOKMARK  http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
[TYPE_AFE_MARKER, TYPE_AAA_BOOKMARK] = [7, 8]

# namespace constants
[NS_HTML, NS_MATHML, NS_SVG] = [1, 2, 3]
73
# In-memory debug log: messages accumulate in g_debug_log until reset.
g_debug_log = []
# Discard all collected messages (starts a fresh array).
debug_log_reset = -> g_debug_log = []
# Append one message to the log.
debug_log = (str) -> g_debug_log.push str
# Call cb once per collected message, oldest first.
debug_log_each = (cb) -> (cb message for message in g_debug_log)
82
# Counter used to hand out ids to nodes that are not clones.
prev_node_id = 0
# Node: one class for both tokenizer tokens and tree nodes; @type (a TYPE_*
# constant) says which kind a given instance is.
class Node
        # type: one of the TYPE_* constants
        # args: optional fields -- name, text, attrs, attrs_a, children,
        #       namespace, parent, token, id. Anything omitted gets a default.
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_START_TAG only.
                # Fix: this used to read only args.attr_k, so a caller passing
                # the key that matches the field name (attrs_a) was silently
                # ignored. attr_k is still honoured for existing callers.
                @attrs_a = args.attrs_a ? (args.attr_k ? [])
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                if args.id?
                        # an id was passed in (shallow_clone does this): derive
                        # the new id from it by appending "+"
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"
        # Return a new node that's the same except without the children or parent.
        # WARNING this doesn't work right on open tags that are still being parsed
        # (attrs_a is not copied).
        shallow_clone: ->
                attrs = {}
                attrs[k] = v for k, v of @attrs
                return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
        # Record that a self-closing flag was acknowledged, on the originating
        # token when there is one, otherwise on this node.
        acknowledge_self_closing: ->
                if @token?
                        @token.flag 'did_self_close'
                else
                        @flag 'did_self_close', true
        # Placeholder: flags are not stored yet.
        flag: ->
                # fixfull
        # Compact text form of this node, for unit tests.
        # shallow: for tags, stop after the name (no attributes or children)
        # show_ids: for tags, include "#<id>," after the name
        serialize: (shallow = false, show_ids = false) ->
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                if show_ids
                                        ret += "##{@id},"
                                if shallow
                                        break
                                # sort attribute names so output is deterministic
                                attr_keys = []
                                for k of @attrs
                                        attr_keys.push k
                                attr_keys.sort()
                                ret += '{'
                                sep = ''
                                for k in attr_keys
                                        ret += sep
                                        sep = ','
                                        ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
                                ret += '},['
                                sep = ''
                                for c in @children
                                        ret += sep
                                        sep = ','
                                        ret += c.serialize shallow, show_ids
                                ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                ret += 'doctype'
                                # FIXME
                        when TYPE_AFE_MARKER
                                ret += 'marker'
                        when TYPE_AAA_BOOKMARK
                                ret += 'aaa_bookmark'
                        else
                                ret += 'unknown:'
                                console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
                return ret
156
157 # helpers: (only take args that are normally known when parser creates nodes)
# Factories for tokens and nodes. Each takes only what the parser normally
# knows at creation time; every other Node field keeps its default.
new_open_tag = (name) -> new Node TYPE_START_TAG, {name: name}
new_end_tag = (name) -> new Node TYPE_END_TAG, {name: name}
new_element = (name) -> new Node TYPE_TAG, {name: name}
new_text_node = (txt) -> new Node TYPE_TEXT, {text: txt}
# character tokens and text nodes are built by the same factory
new_character_token = new_text_node
new_comment_token = (txt) -> new Node TYPE_COMMENT, {text: txt}
new_doctype_token = (name) -> new Node TYPE_DOCTYPE, {name: name}
new_eof_token = -> new Node TYPE_EOF
new_afe_marker = -> new Node TYPE_AFE_MARKER
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
177
# Character classes, each a plain string of the member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = "#{alnum}-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"
# true when txt is exactly one character and that character is in space_chars
is_space = (txt) ->
        return false unless txt.length is 1
        return space_chars.indexOf(txt) isnt -1
# true when t is a text token holding exactly one space character
is_space_tok = (t) ->
        return t.type is TYPE_TEXT and is_space t.text

# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
196
197 # These are the character references that don't need a terminating semicolon
198 # min length: 2, max: 6, none are a prefix of any other.
# Named character references that are also recognised without the trailing
# semicolon, mapped to the character each one stands for. Entries are grouped
# by first letter; key order is unchanged.
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´',
        AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&',
        Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä',

        brvbar: '¦',

        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
        curren: '¤',

        deg: '°', divide: '÷',

        Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È',
        egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë',

        frac12: '½', frac14: '¼', frac34: '¾',

        GT: '>', gt: '>',

        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡',
        Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï',

        laquo: '«', LT: '<', lt: '<',

        macr: '¯', micro: 'µ', middot: '·',

        nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ',

        Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò',
        ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö',

        para: '¶', plusmn: '±', pound: '£',

        QUOT: '"', quot: '"',

        raquo: '»', REG: '®', reg: '®',

        # shy is the (invisible) soft hyphen, spelled as an escape so it can be seen
        sect: '§', shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß',

        THORN: 'Þ', thorn: 'þ', times: '×',

        Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
        ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü',

        Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ'
}
219
# Element-name lists by content-model kind.
void_elements = [
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
        'script'
        'style'
]
escapable_raw_text_elements = [
        'textarea'
        'title'
]
223 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# SVG element names, from http://www.w3.org/TR/SVG/ 1.1 (Second Edition).
# Grouped by leading letter(s); order is unchanged.
svg_elements = [
        'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem',
        'animate', 'animateColor', 'animateMotion', 'animateTransform',
        'circle', 'clipPath', 'color-profile', 'cursor',
        'defs', 'desc',
        'ellipse',
        'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite',
        'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap',
        'feDistantLight', 'feFlood',
        'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR',
        'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology',
        'feOffset', 'fePointLight', 'feSpecularLighting', 'feSpotLight',
        'feTile', 'feTurbulence',
        'filter',
        'font', 'font-face', 'font-face-format', 'font-face-name',
        'font-face-src', 'font-face-uri', 'foreignObject',
        'g', 'glyph', 'glyphRef',
        'hkern',
        'image',
        'line', 'linearGradient',
        'marker', 'mask', 'metadata', 'missing-glyph', 'mpath',
        'path', 'pattern', 'polygon', 'polyline',
        'radialGradient', 'rect',
        'script', 'set', 'stop', 'style', 'svg', 'switch', 'symbol',
        'text', 'textPath', 'title', 'tref', 'tspan',
        'use',
        'view', 'vkern'
]
241
242 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# MathML element names, from http://www.w3.org/TR/MathML/ Version 3.0 2nd
# Edition. Fix: 'mi' was listed twice; each name now appears exactly once.
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
273 # foreign_elements = [svg_elements..., mathml_elements...]
274 #normal_elements = All other allowed HTML elements are normal elements.
275
# Elements in the "special" category, as a map from element name to the
# namespace in which that name is special.
#
# Fix: `title` used to appear twice in this literal (NS_HTML, then NS_SVG).
# A map holds one value per key, so only the SVG entry ever took effect, and
# duplicate keys in an object literal are not accepted by every CoffeeScript
# compiler. The dead HTML entry is removed; the effective value (NS_SVG) is
# unchanged.
# NOTE: `title` is special in both the HTML and SVG namespaces, which a
# name -> single-namespace map cannot express.
special_elements = {
        # HTML:
        address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
        aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
        blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
        caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
        details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
        embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
        footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
        h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
        header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
        noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
        ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
        script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
        style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
        template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
        thead:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
        wbr:NS_HTML, xmp:NS_HTML,

        # MathML:
        mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
        'annotation-xml':NS_MATHML,

        # SVG:
        foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}
304
# Name sets: membership is tested with a plain lookup, every value is `true`.

# names of the formatting elements
formatting_elements = {
        a: true
        b: true
        big: true
        code: true
        em: true
        font: true
        i: true
        nobr: true
        s: true
        small: true
        strike: true
        strong: true
        tt: true
        u: true
}

# table-related element names listed as foster parenting targets
foster_parenting_targets = {table: true, tbody: true, tfoot: true, thead: true, tr: true}

# all html I presume
end_tag_implied = {
        dd: true, dt: true, li: true, option: true, optgroup: true,
        p: true, rb: true, rp: true, rt: true, rtc: true
}
332
# True when element e (by name and namespace) is in the "special" category.
#
# Fix: `title` is special in both the HTML and SVG namespaces, but
# special_elements maps each name to a single namespace, so HTML <title> was
# never reported as special. It is handled explicitly here.
el_is_special = (e) ->
        if e.name is 'title' and (e.namespace is NS_HTML or e.namespace is NS_SVG)
                return true
        return special_elements[e.name] is e.namespace
335
336 # decode_named_char_ref()
337 #
338 # The list of named character references is _huge_ so ask the browser to decode
339 # for us instead of wasting bandwidth/space on including the table here.
340 #
341 # Pass without the "&" but with the ";" examples:
342 #    for "&amp" pass "amp;"
343 #    for "&#x2032" pass "x2032;"
# State for decode_named_char_ref(): a cache of earlier results and a detached
# <textarea> that the browser decodes into.
g_dncr =
        cache: {}
        textarea: document.createElement 'textarea'
# TODO test this in IE8
# Decode one character reference given without its leading "&" (e.g. "amp;").
# Returns the decoded text, or null when the textarea hands the reference
# back unchanged. Successful decodes are cached.
decode_named_char_ref = (txt) ->
        entity = "&#{txt}"
        cached = g_dncr.cache[entity]
        return cached if cached?
        scratch = g_dncr.textarea
        scratch.innerHTML = entity
        plain = scratch.value
        if plain is entity
                return null
        g_dncr.cache[entity] = plain
        return plain
357
358 parse_html = (txt, parse_error_cb = null) ->
359         cur = 0 # index of next char in txt to be parsed
360         # declare doc and tokenizer variables so they're in scope below
361         doc = null
362         open_els = null # stack of open elements
363         afe = null # active formatting elements
364         template_insertion_modes = null
365         insertion_mode = null
366         original_insertion_mode = null
367         tok_state = null
368         tok_cur_tag = null # partially parsed tag
369         flag_scripting = null
370         flag_frameset_ok = null
371         flag_parsing = null
372         flag_foster_parenting = null
373         form_element_pointer = null
374         temporary_buffer = null
375         pending_table_character_tokens = null
376         head_element_pointer = null
377         flag_fragment_parsing = null
378         context_element = null
379
380         stop_parsing = ->
381                 flag_parsing = false
382
383         parse_error = ->
384                 if parse_error_cb?
385                         parse_error_cb cur
386                 else
387                         console.log "Parse error at character #{cur} of #{txt.length}"
388
389         afe_push = (new_el) ->
390                 matches = 0
391                 for el, i in afe
392                         if el.name is new_el.name and el.namespace is new_el.namespace
393                                 for k, v of el.attrs
394                                         continue unless new_el.attrs[k] is v
395                                 for k, v of new_el.attrs
396                                         continue unless el.attrs[k] is v
397                                 matches += 1
398                                 if matches is 3
399                                         afe.splice i, 1
400                                         break
401                 afe.unshift new_el
402         afe_push_marker = ->
403                 afe.unshift new_afe_marker()
404
405         # the functions below implement the Tree Construction algorithm
406         # http://www.w3.org/TR/html5/syntax.html#tree-construction
407
408         # But first... the helpers
409         template_tag_is_open = ->
410                 for t in open_els
411                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
412                                 return true
413                 return false
414         is_in_scope_x = (tag_name, scope, namespace) ->
415                 for t in open_els
416                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
417                                 return true
418                         if scope[t.name] is t.namespace
419                                 return false
420                 return false
421         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
422                 for t in open_els
423                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
424                                 return true
425                         if scope[t.name] is t.namespace
426                                 return false
427                         if scope2[t.name] is t.namespace
428                                 return false
429                 return false
430         standard_scopers = { # FIXME these are supposed to be namespace specific
431                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
432                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
433                 template: NS_HTML, mi: NS_MATHML,
434
435                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
436                 'annotation-xml': NS_MATHML,
437
438                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
439         }
440         button_scopers = button: NS_HTML
441         li_scopers = ol: NS_HTML, ul: NS_HTML
442         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
443         is_in_scope = (tag_name, namespace = null) ->
444                 return is_in_scope_x tag_name, standard_scopers, namespace
445         is_in_button_scope = (tag_name, namespace = null) ->
446                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
447         is_in_table_scope = (tag_name, namespace = null) ->
448                 return is_in_scope_x tag_name, table_scopers, namespace
449         is_in_select_scope = (tag_name, namespace = null) ->
450                 for t in open_els
451                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
452                                 return true
453                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
454                                 return false
455                 return false
456         # this checks for a particular element, not by name
457         el_is_in_scope = (el) ->
458                 for t in open_els
459                         if t is el
460                                 return true
461                         if standard_scopers[t.name] is t.namespace
462                                 return false
463                 return false
464
465         clear_to_table_stopers = {
466                 'table': true
467                 'template': true
468                 'html': true
469         }
470         clear_stack_to_table_context = ->
471                 loop
472                         if clear_to_table_stopers[open_els[0].name]?
473                                 break
474                         open_els.shift()
475                 return
476         clear_to_table_body_stopers = {
477                 'tbody': true
478                 'tfoot': true
479                 'thead': true
480                 'template': true
481                 'html': true
482         }
483         clear_stack_to_table_body_context = ->
484                 loop
485                         if clear_to_table_body_stopers[open_els[0].name]?
486                                 break
487                         open_els.shift()
488                 return
489         clear_to_table_row_stopers = {
490                 'tr': true
491                 'template': true
492                 'html': true
493         }
494         clear_stack_to_table_row_context = ->
495                 loop
496                         if clear_to_table_row_stopers[open_els[0].name]?
497                                 break
498                         open_els.shift()
499                 return
500         clear_afe_to_marker = ->
501                 loop
502                         el = afe.shift()
503                         if el.type is TYPE_AFE_MARKER
504                                 return
505
506         # 8.2.3.1 ...
507         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
508         reset_insertion_mode = ->
509                 # 1. Let last be false.
510                 last = false
511                 # 2. Let node be the last node in the stack of open elements.
512                 node_i = 0
513                 node = open_els[node_i]
514                 # 3. Loop: If node is the first node in the stack of open elements,
515                 # then set last to true, and, if the parser was originally created as
516                 # part of the HTML fragment parsing algorithm (fragment case) set node
517                 # to the context element.
518                 loop
519                         if node_i is open_els.length - 1
520                                 last = true
521                                 # fixfull (fragment case)
522
523                         # 4. If node is a select element, run these substeps:
524                         if node.name is 'select'
525                                 # 1. If last is true, jump to the step below labeled done.
526                                 unless last
527                                         # 2. Let ancestor be node.
528                                         ancestor_i = node_i
529                                         ancestor = node
530                                         # 3. Loop: If ancestor is the first node in the stack of
531                                         # open elements, jump to the step below labeled done.
532                                         loop
533                                                 if ancestor_i is open_els.length - 1
534                                                         break
535                                                 # 4. Let ancestor be the node before ancestor in the stack
536                                                 # of open elements.
537                                                 ancestor_i += 1
538                                                 ancestor = open_els[ancestor_i]
539                                                 # 5. If ancestor is a template node, jump to the step below
540                                                 # labeled done.
541                                                 if ancestor.name is 'template'
542                                                         break
543                                                 # 6. If ancestor is a table node, switch the insertion mode
544                                                 # to "in select in table" and abort these steps.
545                                                 if ancestor.name is 'table'
546                                                         insertion_mode = ins_mode_in_select_in_table
547                                                         return
548                                                 # 7. Jump back to the step labeled loop.
549                                 # 8. Done: Switch the insertion mode to "in select" and abort
550                                 # these steps.
551                                 insertion_mode = ins_mode_in_select
552                                 return
553                         # 5. If node is a td or th element and last is false, then switch
554                         # the insertion mode to "in cell" and abort these steps.
555                         if (node.name is 'td' or node.name is 'th') and last is false
556                                 insertion_mode = ins_mode_in_cell
557                                 return
558                         # 6. If node is a tr element, then switch the insertion mode to "in
559                         # row" and abort these steps.
560                         if node.name is 'tr'
561                                 insertion_mode = ins_mode_in_row
562                                 return
563                         # 7. If node is a tbody, thead, or tfoot element, then switch the
564                         # insertion mode to "in table body" and abort these steps.
565                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
566                                 insertion_mode = ins_mode_in_table_body
567                                 return
568                         # 8. If node is a caption element, then switch the insertion mode
569                         # to "in caption" and abort these steps.
570                         if node.name is 'caption'
571                                 insertion_mode = ins_mode_in_caption
572                                 return
573                         # 9. If node is a colgroup element, then switch the insertion mode
574                         # to "in column group" and abort these steps.
575                         if node.name is 'colgroup'
576                                 insertion_mode = ins_mode_in_column_group
577                                 return
578                         # 10. If node is a table element, then switch the insertion mode to
579                         # "in table" and abort these steps.
580                         if node.name is 'table'
581                                 insertion_mode = ins_mode_in_table
582                                 return
583                         # 11. If node is a template element, then switch the insertion mode
584                         # to the current template insertion mode and abort these steps.
585                         # fixfull (template insertion mode stack)
586
587                         # 12. If node is a head element and last is true, then switch the
588                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
589                         # these steps. (fragment case)
590                         if node.name is 'head' and last
591                                 insertion_mode = ins_mode_in_body
592                                 return
593                         # 13. If node is a head element and last is false, then switch the
594                         # insertion mode to "in head" and abort these steps.
595                         if node.name is 'head' and last is false
596                                 insertion_mode = ins_mode_in_head
597                                 return
598                         # 14. If node is a body element, then switch the insertion mode to
599                         # "in body" and abort these steps.
600                         if node.name is 'body'
601                                 insertion_mode = ins_mode_in_body
602                                 return
603                         # 15. If node is a frameset element, then switch the insertion mode
604                         # to "in frameset" and abort these steps. (fragment case)
605                         if node.name is 'frameset'
606                                 insertion_mode = ins_mode_in_frameset
607                                 return
608                         # 16. If node is an html element, run these substeps:
609                         if node.name is 'html'
610                                 # 1. If the head element pointer is null, switch the insertion
611                                 # mode to "before head" and abort these steps. (fragment case)
612                                 # fixfull (fragment case)
613
614                                 # 2. Otherwise, the head element pointer is not null, switch
615                                 # the insertion mode to "after head" and abort these steps.
616                                 insertion_mode = ins_mode_in_body # FIXME fixfull
617                                 return
618                         # 17. If last is true, then switch the insertion mode to "in body"
619                         # and abort these steps. (fragment case)
620                         if last
621                                 insertion_mode = ins_mode_in_body
622                                 return
623                         # 18. Let node now be the node before node in the stack of open
624                         # elements.
625                         node_i += 1
626                         node = open_els[node_i]
627                         # 19. Return to the step labeled loop.
628
629         # 8.2.3.2
630
631         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
632         adjusted_current_node = ->
633                 # with fragment parsing and a single open element, answer the context element
634                 return context_element if flag_fragment_parsing and open_els.length is 1
635                 open_els[0]
636
637         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
638         # this implementation is structured (mostly) as described at the link above.
639         # capitalized comments are the "labels" described at the link above.
640         reconstruct_active_formatting_elements = ->
641                 # afe[0] is the newest entry; bail out when the list is empty or when
642                 # that entry is a marker or is already on the stack of open elements
643                 return unless afe.length > 0
644                 return if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
645                 # Rewind: step to older entries (higher indexes) for as long as the
646                 # next older entry is neither a marker nor still open
647                 i = 0
648                 while i < afe.length - 1
649                         break if afe[i + 1].type is TYPE_AFE_MARKER or afe[i + 1] in open_els
650                         i += 1
651                 # Create: clone the entry, insert the clone, and put the clone in the
652                 # entry's slot in afe
653                 loop
654                         el = afe[i].shallow_clone()
655                         tree_insert_element el
656                         afe[i] = el
657                         # Advance: move one entry newer until the newest has been done
658                         break if i is 0
659                         i -= 1
660
661         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
662         # adoption agency algorithm, run for the tag name `subject`; it rewrites open_els, afe and the tree in place. Numbered comments below quote the spec steps.
663         # overview here:
664         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
665         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
666         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
667         adoption_agency = (subject) ->
668                 debug_log "adoption_agency()"
669                 debug_log "tree: #{serialize_els doc.children, false, true}"
670                 debug_log "open_els: #{serialize_els open_els, true, true}"
671                 debug_log "afe: #{serialize_els afe, true, true}"
672                 if open_els[0].name is subject
673                         el = open_els[0]
674                         open_els.shift()
675                         # remove it from the list of active formatting elements (if found)
676                         for t, i in afe
677                                 if t is el
678                                         afe.splice i, 1
679                                         break
680                         debug_log "aaa: starting off with subject on top of stack, exiting"
681                         return
682                 outer = 0 # outer loop counter: the loop below gives up after 8 passes
683                 loop
684                         if outer >= 8
685                                 return
686                         outer += 1
687                         # 5. Let formatting element be the last element in the list of
688                         # active formatting elements that: is between the end of the list
689                         # and the last scope marker in the list, if any, or the start of
690                         # the list otherwise, and  has the tag name subject.
691                         fe = null
692                         for t, fe_of_afe in afe # index 0 is the newest entry, so the first hit is the spec's "last"
693                                 if t.type is TYPE_AFE_MARKER
694                                         break
695                                 if t.name is subject
696                                         fe = t
697                                         break
698                         # If there is no such element, then abort these steps and instead
699                         # act as described in the "any other end tag" entry above.
700                         if fe is null
701                                 debug_log "aaa: fe not found in afe"
702                                 in_body_any_other_end_tag subject
703                                 return
704                         # 6. If formatting element is not in the stack of open elements,
705                         # then this is a parse error; remove the element from the list, and
706                         # abort these steps.
707                         in_open_els = false
708                         for t, fe_of_open_els in open_els
709                                 if t is fe
710                                         in_open_els = true
711                                         break
712                         unless in_open_els
713                                 debug_log "aaa: fe not found in open_els"
714                                 parse_error()
715                                 # "remove it from the list" must mean afe, since it's not in open_els
716                                 afe.splice fe_of_afe, 1
717                                 return
718                         # 7. If formatting element is in the stack of open elements, but
719                         # the element is not in scope, then this is a parse error; abort
720                         # these steps.
721                         unless el_is_in_scope fe
722                                 debug_log "aaa: fe not in scope"
723                                 parse_error()
724                                 return
725                         # 8. If formatting element is not the current node, this is a parse
726                         # error. (But do not abort these steps.)
727                         unless open_els[0] is fe
728                                 parse_error()
729                                 # (not fatal: carry on with the remaining steps)
730                         # 9. Let furthest block be the topmost node in the stack of open
731                         # elements that is lower in the stack than formatting element, and
732                         # is an element in the special category. There might not be one.
733                         fb = null
734                         fb_of_open_els = null
735                         for t, i in open_els
736                                 if t is fe
737                                         break
738                                 if el_is_special t
739                                         fb = t
740                                         fb_of_open_els = i
741                                         # and continue, to see if there's one that's more "topmost"
742                         # 10. If there is no furthest block, then the UA must first pop all
743                         # the nodes from the bottom of the stack of open elements, from the
744                         # current node up to and including formatting element, then remove
745                         # formatting element from the list of active formatting elements,
746                         # and finally abort these steps.
747                         if fb is null
748                                 debug_log "aaa: no fb"
749                                 loop
750                                         t = open_els.shift()
751                                         if t is fe
752                                                 afe.splice fe_of_afe, 1
753                                                 return
754                         # 11. Let common ancestor be the element immediately above
755                         # formatting element in the stack of open elements.
756                         ca = open_els[fe_of_open_els + 1] # common ancestor
757 
758                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
759                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
760                         bookmark = new_aaa_bookmark()
761                         for t, i in afe
762                                 if t is fe
763                                         afe.splice i, 0, bookmark
764                                         break
765                         node = last_node = fb
766                         inner = 0
767                         loop
768                                 inner += 1
769                                 # 3. Let node be the element immediately above node in the
770                                 # stack of open elements, or if node is no longer in the stack
771                                 # of open elements (e.g. because it got removed by this
772                                 # algorithm), the element that was immediately above node in
773                                 # the stack of open elements before node was removed.
774                                 node_next = null
775                                 for t, i in open_els
776                                         if t is node
777                                                 node_next = open_els[i + 1]
778                                                 break
779                                 node = node_next ? node_above # existential operator: use node_above when node_next is null/undefined
780                                 debug_log "inner loop #{inner}"
781                                 debug_log "tree: #{serialize_els doc.children, false, true}"
782                                 debug_log "open_els: #{serialize_els open_els, true, true}"
783                                 debug_log "afe: #{serialize_els afe, true, true}"
784                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
785                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
786                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
787                                 debug_log "node: #{node.serialize true, true}"
788                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
789 
790                                 # 4. If node is formatting element, then go to the next step in
791                                 # the overall algorithm.
792                                 if node is fe
793                                         break
794                                 debug_log "the meat"
795                                 # 5. If inner loop counter is greater than three and node is in
796                                 # the list of active formatting elements, then remove node from
797                                 # the list of active formatting elements.
798                                 node_in_afe = false
799                                 for t, i in afe
800                                         if t is node
801                                                 if inner > 3
802                                                         afe.splice i, 1
803                                                         debug_log "max out inner"
804                                                 else
805                                                         node_in_afe = true
806                                                         debug_log "in afe"
807                                                 break
808                                 # 6. If node is not in the list of active formatting elements,
809                                 # then remove node from the stack of open elements and then go
810                                 # back to the step labeled inner loop.
811                                 unless node_in_afe
812                                         debug_log "not in afe"
813                                         for t, i in open_els
814                                                 if t is node
815                                                         node_above = open_els[i + 1]
816                                                         open_els.splice i, 1
817                                                         break
818                                         continue
819                                 debug_log "the bones"
820                                 # 7. create an element for the token for which the element node
821                                 # was created, in the HTML namespace, with common ancestor as
822                                 # the intended parent; replace the entry for node in the list
823                                 # of active formatting elements with an entry for the new
824                                 # element, replace the entry for node in the stack of open
825                                 # elements with an entry for the new element, and let node be
826                                 # the new element.
827                                 new_node = node.shallow_clone()
828                                 for t, i in afe
829                                         if t is node
830                                                 afe[i] = new_node
831                                                 debug_log "replaced in afe"
832                                                 break
833                                 for t, i in open_els
834                                         if t is node
835                                                 node_above = open_els[i + 1]
836                                                 open_els[i] = new_node
837                                                 debug_log "replaced in open_els"
838                                                 break
839                                 node = new_node
840                                 # 8. If last node is furthest block, then move the
841                                 # aforementioned bookmark to be immediately after the new node
842                                 # in the list of active formatting elements.
843                                 if last_node is fb
844                                         for t, i in afe
845                                                 if t is bookmark
846                                                         afe.splice i, 1
847                                                         debug_log "removed bookmark"
848                                                         break
849                                         for t, i in afe
850                                                 if t is node
851                                                         # "after" means lower
852                                                         afe.splice i, 0, bookmark # bookmark takes slot i and node moves to i + 1, i.e. bookmark is "after" node
853                                                         debug_log "placed bookmark after node"
854                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
855                                                         break
856                                 # 9. Insert last node into node, first removing it from its
857                                 # previous parent node if any.
858                                 if last_node.parent?
859                                         debug_log "last_node has parent"
860                                         for c, i in last_node.parent.children
861                                                 if c is last_node
862                                                         debug_log "removing last_node from parent"
863                                                         last_node.parent.children.splice i, 1
864                                                         break
865                                 node.children.push last_node
866                                 last_node.parent = node
867                                 # 10. Let last node be node.
868                                 last_node = node
869                                 debug_log "at last"
870                                 # 11. Return to the step labeled inner loop.
871                         # 14. Insert whatever last node ended up being in the previous step
872                         # at the appropriate place for inserting a node, but using common
873                         # ancestor as the override target.
874 
875                         # In the case where fe is immediately followed by fb:
876                         #   * inner loop exits out early (node==fe)
877                         #   * last_node is fb
878                         #   * last_node is still in the tree (not a duplicate)
879                         if last_node.parent?
880                                 debug_log "FEFIRST? last_node has parent"
881                                 for c, i in last_node.parent.children
882                                         if c is last_node
883                                                 debug_log "removing last_node from parent"
884                                                 last_node.parent.children.splice i, 1
885                                                 break
886 
887                         debug_log "after aaa inner loop"
888                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
889                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
890                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
891                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
892                         debug_log "tree: #{serialize_els doc.children, false, true}"
893 
894                         debug_log "insert"
895 
896 
897                         # can't use standard insert token thing, because it's already in
898                         # open_els and must stay at its current position in open_els
899                         dest = adjusted_insertion_location ca
900                         dest[0].children.splice dest[1], 0, last_node
901                         last_node.parent = dest[0]
902 
903 
904                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
905                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
906                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
907                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
908                         debug_log "tree: #{serialize_els doc.children, false, true}"
909 
910                         # 15. Create an element for the token for which formatting element
911                         # was created, in the HTML namespace, with furthest block as the
912                         # intended parent.
913                         new_element = fe.shallow_clone() # FIXME intended parent thing
914                         # 16. Take all of the child nodes of furthest block and append them
915                         # to the element created in the last step.
916                         while fb.children.length
917                                 t = fb.children.shift()
918                                 t.parent = new_element
919                                 new_element.children.push t
920                         # 17. Append that new element to furthest block.
921                         new_element.parent = fb
922                         fb.children.push new_element
923                         # 18. Remove formatting element from the list of active formatting
924                         # elements, and insert the new element into the list of active
925                         # formatting elements at the position of the aforementioned
926                         # bookmark.
927                         for t, i in afe
928                                 if t is fe
929                                         afe.splice i, 1
930                                         break
931                         for t, i in afe
932                                 if t is bookmark
933                                         afe[i] = new_element
934                                         break
935                         # 19. Remove formatting element from the stack of open elements,
936                         # and insert the new element into the stack of open elements
937                         # immediately below the position of furthest block in that stack.
938                         for t, i in open_els
939                                 if t is fe
940                                         open_els.splice i, 1
941                                         break
942                         for t, i in open_els
943                                 if t is fb
944                                         open_els.splice i, 0, new_element
945                                         break
946                         # 20. Jump back to the step labeled outer loop.
947                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
948                         debug_log "tree: #{serialize_els doc.children, false, true}"
949                         debug_log "open_els: #{serialize_els open_els, true, true}"
950                         debug_log "afe: #{serialize_els afe, true, true}"
951                 debug_log "AAA DONE"
952
953         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
954         close_p_element = ->
955                 generate_implied_end_tags 'p' # 'p' is passed as the exception
956                 # a current node other than p is only a parse error; keep going
957                 parse_error() unless open_els[0].name is 'p'
958                 # pop up to and including the first p, but never the last remaining entry
959                 until open_els.length <= 1
960                         el = open_els.shift()
961                         return if el.name is 'p'
962         close_p_if_in_button_scope = ->
963                 # only act when is_in_button_scope reports an open p
964                 close_p_element() if is_in_button_scope 'p'
965
966         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
967         # aka insert_a_character = (t) ->
968         insert_character = (t) ->
969                 # fixfull check for Document node
970                 [ins_parent, ins_index] = adjusted_insertion_location()
971                 siblings = ins_parent.children
972                 # fold the characters into a text node that directly precedes the spot
973                 if ins_index > 0 and siblings[ins_index - 1].type is TYPE_TEXT
974                         siblings[ins_index - 1].text += t.text
975                         return
976                 siblings.splice ins_index, 0, t
977
978         # 8.2.5.1
979         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
980         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
981         adjusted_insertion_location = (override_target = null) ->
982                 # Returns [target, target_i]: the node is to be inserted at
983                 # target.children[target_i]. 1. An override target, if given, wins.
984                 if override_target?
985                         target = override_target
986                 else # Otherwise, let target be the current node.
987                         target = open_els[0]
988                 # 2. Determine the adjusted insertion location using the first matching
989                 # steps from the following list:
990                 #
991                 # If foster parenting is enabled and target is a table, tbody, tfoot,
992                 # thead, or tr element Foster parenting happens when content is
993                 # misnested in tables.
994                 if flag_foster_parenting and foster_parenting_targets[target.name]
995                         loop # once. this is here so we can ``break`` to "abort these substeps"
996                                 # 1. Let last template be the last template element in the
997                                 # stack of open elements, if any.
998                                 last_template = null
999                                 last_template_i = null
1000                                 for el, i in open_els
1001                                         if el.name is 'template'
1002                                                 last_template = el
1003                                                 last_template_i = i
1004                                                 break
1005                                 # 2. Let last table be the last table element in the stack of
1006                                 # open elements, if any.
1007                                 last_table = null
1008                                 last_table_i = null
1009                                 for el, i in open_els
1010                                         if el.name is 'table'
1011                                                 last_table = el
1012                                                 last_table_i = i
1013                                                 break
1014                                 # 3. If there is a last template and either there is no last
1015                                 # table, or there is one, but last template is lower (more
1016                                 # recently added) than last table in the stack of open
1017                                 # elements, then: let adjusted insertion location be inside
1018                                 # last template's template contents, after its last child (if
1019                                 # any), and abort these substeps.
1020                                 if last_template and (last_table is null or last_template_i < last_table_i)
1021                                         target = last_template # fixfull should be its contents
1022                                         target_i = target.children.length
1023                                         break
1024                                 # 4. If there is no last table, then let adjusted insertion
1025                                 # location be inside the first element in the stack of open
1026                                 # elements (the html element), after its last child (if any),
1027                                 # and abort these substeps. (fragment case)
1028                                 if last_table is null
1029                                         target = open_els[open_els.length - 1]
1030                                         target_i = target.children.length
1031                                         break # abort here: the next step dereferences last_table
1032                                 # 5. If last table has a parent element, then let adjusted
1033                                 # insertion location be inside last table's parent element,
1034                                 # immediately before last table, and abort these substeps.
1035                                 if last_table.parent?
1036                                         for c, i in last_table.parent.children
1037                                                 if c is last_table
1038                                                         target = last_table.parent
1039                                                         target_i = i
1040                                                         break
1041                                         break
1042                                 # 6. Let previous element be the element immediately above last
1043                                 # table in the stack of open elements.
1044                                 #
1045                                 # huh? how could it not have a parent?
1046                                 previous_element = open_els[last_table_i + 1]
1047                                 # 7. Let adjusted insertion location be inside previous
1048                                 # element, after its last child (if any).
1049                                 target = previous_element
1050                                 target_i = target.children.length
1051                                 # Note: These steps are involved in part because it's possible
1052                                 # for elements, the table element in this case in particular,
1053                                 # to have been moved by a script around in the DOM, or indeed
1054                                 # removed from the DOM entirely, after the element was inserted
1055                                 # by the parser.
1056                                 break # don't really loop
1057                 else
1058                         # Otherwise Let adjusted insertion location be inside target, after
1059                         # its last child (if any).
1060                         target_i = target.children.length
1061 
1062                 # 3. If the adjusted insertion location is inside a template element,
1063                 # let it instead be inside the template element's template contents,
1064                 # after its last child (if any).
1065                 # fixfull (template)
1066 
1067                 # 4. Return the adjusted insertion location.
1068                 return [target, target_i]
1069
1070         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1071         # aka create_an_element_for_token
1072         token_to_element = (t, namespace, intended_parent) ->
1073                 # the token is retyped in place to TYPE_TAG (not TYPE_START_TAG)
1074                 t.type = TYPE_TAG
1075                 # drain attrs_a from its end into a name -> value hash, so an entry
1076                 # earlier in attrs_a overwrites a later one with the same name
1077                 attrs = {}
1078                 until t.attrs_a.length is 0
1079                         [attr_name, attr_value] = t.attrs_a.pop()
1080                         attrs[attr_name] = attr_value # TODO check what to do with duplicate attrs
1081                 # TODO 2. If the newly created element has an xmlns attribute in the
1082                 # XMLNS namespace whose value is not exactly the same as the element's
1083                 # namespace, that is a parse error. Similarly, if the newly created
1084                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1085                 # value is not the XLink Namespace, that is a parse error.
1086                 #
1087                 # fixfull: the spec says stuff about form pointers and ownerDocument
1088                 # (intended_parent is accepted but not used here)
1089                 new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1090
1091         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1092         insert_foreign_element = (token, namespace) ->
1093                 ail = adjusted_insertion_location()
1094                 ail_el = ail[0]
1095                 ail_i = ail[1]
1096                 el = token_to_element token, namespace, ail_el
1097                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1098                 el.parent = ail_el
1099                 ail_el.children.splice ail_i, 0, el
1100                 open_els.unshift el
1101                 return el
        # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
        # Plain alias of insert_foreign_element: callers that pass only a token
        # leave `namespace` undefined.
        # NOTE(review): the spec inserts in the HTML namespace here; presumably
        # Node supplies that default when no namespace is given -- confirm.
        insert_html_element = insert_foreign_element # (token, namespace) ->
1104
1105         # FIXME read implement "foster parenting" part
1106         # FIXME read spec, do this right
1107         # FIXME implement the override target thing
1108         # note: this assumes it's an open tag
1109         # FIXME what part of the spec is this?
1110         # TODO look through all callers of this, and see what they should really be doing.
1111         #   eg probably insert_html_element for tokens
1112         tree_insert_element = (el, override_target = null, namespace = null) ->
1113                 if namespace?
1114                         el.namespace = namespace
1115                 dest = adjusted_insertion_location override_target
1116                 if el.type is TYPE_START_TAG # means it's a "token"
1117                         el = token_to_element el, namespace, dest[0]
1118                 unless el.namespace?
1119                         namespace = dest.namespace
1120                 # fixfull: Document nodes sometimes can't accept more chidren
1121                 dest[0].children.splice dest[1], 0, el
1122                 el.parent = dest[0]
1123                 open_els.unshift el
1124                 return el
1125
1126         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1127         # position should be [node, index_within_children]
1128         insert_comment = (t, position = null) ->
1129                 position ?= adjusted_insertion_location()
1130                 position[0].children.splice position[1], 0, t
1131
1132         # 8.2.5.2
1133         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1134         parse_generic_raw_text = (t) ->
1135                 insert_html_element t
1136                 tok_state = tok_state_rawtext
1137                 original_insertion_mode = insertion_mode
1138                 insertion_mode = ins_mode_text
1139         parse_generic_rcdata_text = (t) ->
1140                 insert_html_element t
1141                 tok_state = tok_state_rcdata
1142                 original_insertion_mode = insertion_mode
1143                 insertion_mode = ins_mode_text
1144
1145         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1146         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1147         generate_implied_end_tags = (except = null) ->
1148                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1149                         open_els.shift()
1150
1151         # 8.2.5.4 The rules for parsing tokens in HTML content
1152         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1153
1154         # 8.2.5.4.1 The "initial" insertion mode
1155         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1156         ins_mode_initial = (t) ->
1157                 if is_space_tok t
1158                         return
1159                 if t.type is TYPE_COMMENT
1160                         # fixfull this is supposed to be "the last child of the document object"
1161                         doc.children.push t
1162                         return
1163                 if t.type is TYPE_DOCTYPE
1164                         # FIXME check identifiers, set quirks, etc
1165                         # fixfull
1166                         doc.children.push t
1167                         insertion_mode = ins_mode_before_html
1168                         return
1169                 # Anything else
1170                 #fixfull (iframe, quirks)
1171                 insertion_mode = ins_mode_before_html
1172                 insertion_mode t # reprocess the token
1173                 return
1174
1175         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1176         ins_mode_before_html = (t) ->
1177                 if t.type is TYPE_DOCTYPE
1178                         parse_error()
1179                         return
1180                 if t.type is TYPE_COMMENT
1181                         doc.children.push t
1182                         return
1183                 if is_space_tok t
1184                         return
1185                 if t.type is TYPE_START_TAG and t.name is 'html'
1186                         el = token_to_element t, NS_HTML, doc
1187                         open_els.unshift(el)
1188                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1189                         insertion_mode = ins_mode_before_head
1190                         return
1191                 if t.type is TYPE_END_TAG
1192                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1193                                 # fall through to "anything else"
1194                         else
1195                                 parse_error()
1196                                 return
1197                 # Anything else
1198                 html_tok = new_open_tag 'html'
1199                 el = token_to_element html_tok, NS_HTML, doc
1200                 doc.children.push el
1201                 open_els.unshift el
1202                 # ?fixfull browsing context
1203                 insertion_mode = ins_mode_before_head
1204                 insertion_mode t
1205                 return
1206
1207         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1208         ins_mode_before_head = (t) ->
1209                 if is_space_tok t
1210                         return
1211                 if t.type is TYPE_COMMENT
1212                         insert_comment t
1213                         return
1214                 if t.type is TYPE_DOCTYPE
1215                         parse_error()
1216                         return
1217                 if t.type is TYPE_START_TAG and t.name is 'html'
1218                         ins_mode_in_body t
1219                         return
1220                 if t.type is TYPE_START_TAG and t.name is 'head'
1221                         el = insert_html_element t
1222                         head_element_pointer = el
1223                         insertion_mode = ins_mode_in_head
1224                 if t.type is TYPE_END_TAG
1225                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1226                                 # fall through to Anything else below
1227                         else
1228                                 parse_error()
1229                                 return
1230                 # Anything else
1231                 head_tok = new_open_tag 'head'
1232                 el = insert_html_element head_tok
1233                 head_element_pointer = el
1234                 insertion_mode = ins_mode_in_head
1235                 insertion_mode t # reprocess current token
1236
1237         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1238         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1239                 open_els.shift() # spec says this will be a 'head' node
1240                 insertion_mode = ins_mode_after_head
1241                 insertion_mode t
1242         ins_mode_in_head = (t) ->
1243                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1244                         insert_character t
1245                         return
1246                 if t.type is TYPE_COMMENT
1247                         insert_comment t
1248                         return
1249                 if t.type is TYPE_DOCTYPE
1250                         parse_error()
1251                         return
1252                 if t.type is TYPE_START_TAG and t.name is 'html'
1253                         ins_mode_in_body t
1254                         return
1255                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1256                         el = insert_html_element t
1257                         open_els.shift()
1258                         t.acknowledge_self_closing()
1259                         return
1260                 if t.type is TYPE_START_TAG and t.name is 'meta'
1261                         el = insert_html_element t
1262                         open_els.shift()
1263                         t.acknowledge_self_closing()
1264                         # fixfull encoding stuff
1265                         return
1266                 if t.type is TYPE_START_TAG and t.name is 'title'
1267                         parse_generic_rcdata_text t
1268                         return
1269                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1270                         parse_generic_raw_text t
1271                         return
1272                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1273                         insert_html_element t
1274                         insertion_mode = in_head_noscript # FIXME implement
1275                         return
1276                 if t.type is TYPE_START_TAG and t.name is 'script'
1277                         ail = adjusted_insertion_location()
1278                         el = token_to_element t, NS_HTML, ail
1279                         el.flag 'parser-inserted', true # FIXME implement
1280                         # fixfull frament case
1281                         ail[0].children.splice ail[1], 0, el
1282                         open_els.unshift el
1283                         tok_state = tok_state_script_data
1284                         original_insertion_mode = insertion_mode # make sure orig... is defined
1285                         insertion_mode = ins_mode_text # FIXME implement
1286                         return
1287                 if t.type is TYPE_END_TAG and t.name is 'head'
1288                         open_els.shift() # will be a head element... spec says so
1289                         insertion_mode = ins_mode_after_head
1290                         return
1291                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1292                         ins_mode_in_head_else t
1293                         return
1294                 if t.type is TYPE_START_TAG and t.name is 'template'
1295                         insert_html_element t
1296                         afe_push_marker()
1297                         flag_frameset_ok = false
1298                         insertion_mode = ins_mode_in_template
1299                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1300                         return
1301                 if t.type is TYPE_END_TAG and t.name is 'template'
1302                         if template_tag_is_open()
1303                                 generate_implied_end_tags
1304                                 if open_els[0].name isnt 'template'
1305                                         parse_error()
1306                                 loop
1307                                         el = open_els.shift()
1308                                         if el.name is 'template'
1309                                                 break
1310                                 clear_afe_to_marker()
1311                                 template_insertion_modes.shift()
1312                                 reset_insertion_mode()
1313                         else
1314                                 parse_error()
1315                         return
1316                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1317                         parse_error()
1318                         return
1319                 ins_mode_in_head_else t
1320
1321         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1322         ins_mode_in_head_noscript = (t) ->
1323                 # FIXME ?fixfull
1324                 console.log "ins_mode_in_head_noscript unimplemented"
1325
1326         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1327         ins_mode_after_head_else = (t) ->
1328                 body_tok = new_open_tag 'body'
1329                 insert_html_element body_tok
1330                 insertion_mode = ins_mode_in_body
1331                 insertion_mode t # reprocess token
1332                 return
1333         ins_mode_after_head = (t) ->
1334                 if is_space_tok t
1335                         insert_character t
1336                         return
1337                 if t.type is TYPE_COMMENT
1338                         insert_comment t
1339                         return
1340                 if t.type is TYPE_DOCTYPE
1341                         parse_error()
1342                         return
1343                 if t.type is TYPE_START_TAG and t.name is 'html'
1344                         ins_mode_in_body t
1345                         return
1346                 if t.type is TYPE_START_TAG and t.name is 'body'
1347                         insert_html_element t
1348                         flag_frameset_ok = false
1349                         insertion_mode = ins_mode_in_body
1350                         return
1351                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1352                         insert_html_element t
1353                         insertion_mode = ins_mode_in_frameset
1354                         return
1355                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1356                         parse_error()
1357                         open_els.unshift head_element_pointer
1358                         ins_mode_in_head t
1359                         for el, i of open_els
1360                                 if el is head_element_pointer
1361                                         open_els.splice i, 1
1362                                         return
1363                         console.log "warning: 23904 couldn't find head element in open_els"
1364                         return
1365                 if t.type is TYPE_END_TAG and t.name is 'template'
1366                         ins_mode_in_head t
1367                         return
1368                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1369                         ins_mode_after_head_else t
1370                         return
1371                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1372                         parse_error()
1373                         return
1374                 # Anything else
1375                 ins_mode_after_head_else t
1376
1377         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1378         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1379                 for node, i in open_els
1380                         if node.name is name # FIXME check namespace too
1381                                 generate_implied_end_tags name # arg is exception
1382                                 parse_error() unless i is 0
1383                                 while i >= 0
1384                                         open_els.shift()
1385                                         i -= 1
1386                                 return
1387                         if special_elements[node.name]? # FIXME check namespac too
1388                                 parse_error()
1389                                 return
1390         ins_mode_in_body = (t) ->
1391                 switch t.type
1392                         when TYPE_TEXT
1393                                 switch t.text
1394                                         when "\u0000"
1395                                                 parse_error()
1396                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1397                                                 reconstruct_active_formatting_elements()
1398                                                 insert_character t
1399                                         else
1400                                                 reconstruct_active_formatting_elements()
1401                                                 insert_character t
1402                                                 flag_frameset_ok = false
1403                         when TYPE_COMMENT
1404                                 insert_comment t
1405                         when TYPE_DOCTYPE
1406                                 parse_error()
1407                         when TYPE_START_TAG
1408                                 switch t.name
1409                                         when 'html'
1410                                                 parse_error()
1411                                                 return if template_tag_is_open()
1412                                                 root_attrs = open_els[open_els.length - 1].attrs
1413                                                 for k, v of t.attrs
1414                                                         root_attrs[k] = v unless root_attrs[k]?
1415                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1416                                                 # FIXME also do this for </template> (end tag)
1417                                                 return ins_mode_in_head t
1418                                         when 'body'
1419                                                 parse_error()
1420                                                 # TODO
1421                                         when 'frameset'
1422                                                 parse_error()
1423                                                 # TODO
1424                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1425                                                 close_p_if_in_button_scope()
1426                                                 insert_html_element t
1427                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1428                                                 close_p_if_in_button_scope()
1429                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1430                                                         parse_error()
1431                                                         open_els.shift()
1432                                                 insert_html_element t
1433                                         # TODO lots more to implement here
1434                                         when 'a'
1435                                                 # If the list of active formatting elements
1436                                                 # contains an a element between the end of the list and
1437                                                 # the last marker on the list (or the start of the list
1438                                                 # if there is no marker on the list), then this is a
1439                                                 # parse error; run the adoption agency algorithm for
1440                                                 # the tag name "a", then remove that element from the
1441                                                 # list of active formatting elements and the stack of
1442                                                 # open elements if the adoption agency algorithm didn't
1443                                                 # already remove it (it might not have if the element
1444                                                 # is not in table scope).
1445                                                 found = false
1446                                                 for el in afe
1447                                                         if el.type is TYPE_AFE_MARKER
1448                                                                 break
1449                                                         if el.name is 'a'
1450                                                                 found = el
1451                                                 if found?
1452                                                         parse_error()
1453                                                         adoption_agency 'a'
1454                                                         for el, i in afe
1455                                                                 if el is found
1456                                                                         afe.splice i, 1
1457                                                         for el, i in open_els
1458                                                                 if el is found
1459                                                                         open_els.splice i, 1
1460                                                 reconstruct_active_formatting_elements()
1461                                                 el = insert_html_element t
1462                                                 afe_push el
1463                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1464                                                 reconstruct_active_formatting_elements()
1465                                                 el = insert_html_element t
1466                                                 afe_push el
1467                                         when 'table'
1468                                                 # fixfull quirksmode thing
1469                                                 close_p_if_in_button_scope()
1470                                                 insert_html_element t
1471                                                 insertion_mode = ins_mode_in_table
1472                                         # TODO lots more to implement here
1473                                         else # any other start tag
1474                                                 reconstruct_active_formatting_elements()
1475                                                 insert_html_element t
1476                         when TYPE_EOF
1477                                 ok_tags = {
1478                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1479                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1480                                 }
1481                                 for t in open_els
1482                                         unless ok_tags[t.name]?
1483                                                 parse_error()
1484                                                 break
1485                                 # TODO stack of template insertion modes thing
1486                                 stop_parsing()
1487                         when TYPE_END_TAG
1488                                 switch t.name
1489                                         when 'body'
1490                                                 unless is_in_scope 'body'
1491                                                         parse_error()
1492                                                         return
1493                                                 # TODO implement parse error and move to tree_after_body
1494                                         when 'html'
1495                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1496                                                         parse_error()
1497                                                         return
1498                                                 # TODO implement parse error and move to tree_after_body, reprocess
1499                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1500                                                 unless is_in_scope t.name, NS_HTML
1501                                                         parse_error()
1502                                                         return
1503                                                 generate_implied_end_tags()
1504                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1505                                                         parse_error()
1506                                                 loop
1507                                                         el = open_els.shift()
1508                                                         if el.name is t.name and el.namespace is NS_HTML
1509                                                                 return
1510                                         # TODO lots more close tags to implement here
1511                                         when 'p'
1512                                                 unless is_in_button_scope 'p'
1513                                                         parse_error()
1514                                                         insert_html_element new_open_tag 'p'
1515                                                 close_p_element()
1516                                         # TODO lots more close tags to implement here
1517                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1518                                                 adoption_agency t.name
1519                                         # TODO lots more close tags to implement here
1520                                         else
1521                                                 in_body_any_other_end_tag t.name
1522                 return
1523
        # "Anything else" handling for the "in table" insertion mode: report a
        # parse error, then process the token with the "in body" rules while
        # flag_foster_parenting is set.
        ins_mode_in_table_else = (t) ->
                parse_error()
                flag_foster_parenting = true # FIXME
                ins_mode_in_body t
                # cleared again once the token has been processed
                flag_foster_parenting = false
1529         can_in_table = { # FIXME do this inline like everywhere else
1530                 'table': true
1531                 'tbody': true
1532                 'tfoot': true
1533                 'thead': true
1534                 'tr': true
1535         }
1536
1537         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1538         ins_mode_text = (t) ->
1539                 if t.type is TYPE_TEXT
1540                         insert_character t
1541                         return
1542                 if t.type is TYPE_EOF
1543                         parse_error()
1544                         if open_els[0].name is 'script'
1545                                 open_els[0].flag 'already started', true
1546                         open_els.shift()
1547                         insertion_mode = original_insertion_mode
1548                         insertion_mode t
1549                         return
1550                 if t.type is TYPE_END_TAG and t.name is 'script'
1551                         open_els.shift()
1552                         insertion_mode = original_insertion_mode
1553                         # fixfull the spec seems to assume that I'm going to run the script
1554                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1555                         return
1556                 if t.type is TYPE_END_TAG
1557                         open_els.shift()
1558                         insertion_mode = original_insertion_mode
1559                         return
1560                 console.log 'warning: end of ins_mode_text reached'
1561
        # the functions below implement the tokenizer states described here:
1563         # http://www.w3.org/TR/html5/syntax.html#tokenization
1564
1565         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1566         ins_mode_in_table = (t) ->
1567                 switch t.type
1568                         when TYPE_TEXT
1569                                 if can_in_table[t.name]
1570                                         original_insertion_mode = insertion_mode
1571                                         insertion_mode = ins_mode_in_table_text
1572                                         insertion_mode t
1573                                 else
1574                                         ins_mode_in_table_else t
1575                         when TYPE_COMMENT
1576                                 insert_comment t
1577                         when TYPE_DOCTYPE
1578                                 parse_error()
1579                         when TYPE_START_TAG
1580                                 switch t.name
1581                                         when 'caption'
1582                                                 clear_stack_to_table_context()
1583                                                 afe_push_marker()
1584                                                 insert_html_element t
1585                                                 insertion_mode = ins_mode_in_caption
1586                                         when 'colgroup'
1587                                                 clear_stack_to_table_context()
1588                                                 insert_html_element t
1589                                                 insertion_mode = ins_mode_in_column_group
1590                                         when 'col'
1591                                                 clear_stack_to_table_context()
1592                                                 insert_html_element new_open_tag 'colgroup'
1593                                                 insertion_mode = ins_mode_in_column_group
1594                                                 insertion_mode t
1595                                         when 'tbody', 'tfoot', 'thead'
1596                                                 clear_stack_to_table_context()
1597                                                 insert_html_element t
1598                                                 insertion_mode = ins_mode_in_table_body
1599                                         when 'td', 'th', 'tr'
1600                                                 clear_stack_to_table_context()
1601                                                 insert_html_element new_open_tag 'tbody'
1602                                                 insertion_mode = ins_mode_in_table_body
1603                                                 insertion_mode t
1604                                         when 'table'
1605                                                 parse_error()
1606                                                 if is_in_table_scope 'table'
1607                                                         loop
1608                                                                 el = open_els.shift()
1609                                                                 if el.name is 'table'
1610                                                                         break
1611                                                         reset_insertion_mode()
1612                                                         insertion_mode t
1613                                         when 'style', 'script', 'template'
1614                                                 ins_mode_in_head t
1615                                         when 'input'
1616                                                 if token_is_input_hidden t
1617                                                         ins_mode_in_table_else t
1618                                                 else
1619                                                         parse_error()
1620                                                         el = insert_html_element t
1621                                                         open_els.shift()
1622                                                         t.acknowledge_self_closing()
1623                                         when 'form'
1624                                                 parse_error()
1625                                                 if form_element_pointer?
1626                                                         return
1627                                                 if template_tag_is_open()
1628                                                         return
1629                                                 form_element_pointer = insert_html_element t
1630                                                 open_els.shift()
1631                                         else
1632                                                 ins_mode_in_table_else t
1633                         when TYPE_END_TAG
1634                                 switch t.name
1635                                         when 'table'
1636                                                 if is_in_table_scope 'table'
1637                                                         loop
1638                                                                 el = open_els.shift()
1639                                                                 if el.name is 'table'
1640                                                                         break
1641                                                         reset_insertion_mode()
1642                                                 else
1643                                                         parse_error
1644                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1645                                                 parse_error()
1646                                         when 'template'
1647                                                 ins_mode_in_head t
1648                                         else
1649                                                 ins_mode_in_table_else t
1650                         when TYPE_EOF
1651                                 ins_mode_in_body t
1652                         else
1653                                 ins_mode_in_table_else t
1654
1655
1656         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1657         ins_mode_in_table_text = (t) ->
                     # "In table text" insertion mode. Text tokens are collected in
                     # pending_table_character_tokens; the first non-text token flushes
                     # that buffer, restores original_insertion_mode and is then
                     # reprocessed by the restored mode.
                     #
                     # t: the token to process
1658                 if t.type is TYPE_TEXT and t.text is "\u0000"
1659                         # huh? I thought the tokenizer didn't emit these
1660                         parse_error()
1661                         return
1662                 if t.type is TYPE_TEXT
1663                         pending_table_character_tokens.push t
1664                         return
1665                 # Anything else
1666                 all_space = true
1667                 for old in pending_table_character_tokens
1668                         unless is_space_tok old
1669                                 all_space = false
1670                                 break
1671                 if all_space
                             # every buffered token passed is_space_tok: insert as text
1672                         for old in pending_table_character_tokens
1673                                 insert_character old
1674                 else
                             # at least one buffered token failed is_space_tok: hand each
                             # one to the fallback handler of the "in table" mode
1675                         for old in pending_table_character_tokens
1676                                 ins_mode_in_table_else old
1677                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1678                 insertion_mode = original_insertion_mode
1679                 insertion_mode t
1680
1681         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1682         ins_mode_in_caption = (t) ->
                     # "In caption" insertion mode: handles tokens while a <caption> is
                     # open. Closing the caption pops open_els through the caption
                     # element, calls clear_afe_to_marker() and switches back to
                     # ins_mode_in_table. Everything not handled here is delegated to
                     # ins_mode_in_body.
                     #
                     # t: the token to process
1683                 if t.type is TYPE_END_TAG and t.name is 'caption'
1684                         if is_in_table_scope 'caption'
1685                                 generate_implied_end_tags()
1686                                 if open_els[0].name isnt 'caption'
1687                                         parse_error()
1688                                 loop
1689                                         el = open_els.shift()
1690                                         if el.name is 'caption'
1691                                                 break
1692                                 clear_afe_to_marker()
1693                                 insertion_mode = ins_mode_in_table
1694                         else
1695                                 parse_error()
1696                                 # fragment case
1697                         return
1698                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
                             # table-structure token inside a caption: implicitly close the
                             # caption, then let the "in table" mode process the token
1699                         parse_error()
1700                         if is_in_table_scope 'caption'
1701                                 loop
1702                                         el = open_els.shift()
1703                                         if el.name is 'caption'
1704                                                 break
1705                                 clear_afe_to_marker()
1706                                 insertion_mode = ins_mode_in_table
1707                                 insertion_mode t
1708                         # else fragment case
1709                         return
1710                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
                             # these end tags are reported and otherwise ignored
1711                         parse_error()
1712                         return
1713                 # Anything else
1714                 ins_mode_in_body t
1715
1716         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1717         ins_mode_in_column_group = (t) ->
                     # "In column group" insertion mode: handles tokens while a
                     # <colgroup> is expected to be the current node. A <col> start tag
                     # is inserted and popped straight away; tokens with no rule of
                     # their own close the colgroup and are reprocessed by
                     # ins_mode_in_table.
                     #
                     # t: the token to process
1718                 if is_space_tok t
1719                         insert_character t
1720                         return
1721                 if t.type is TYPE_COMMENT
1722                         insert_comment t
1723                         return
1724                 if t.type is TYPE_DOCTYPE
1725                         parse_error()
1726                         return
1727                 if t.type is TYPE_START_TAG and t.name is 'html'
1728                         ins_mode_in_body t
1729                         return
1730                 if t.type is TYPE_START_TAG and t.name is 'col'
                             # <col> never stays open: insert it, then pop it again
1731                         el = insert_html_element t
1732                         open_els.shift()
1733                         t.acknowledge_self_closing()
1734                         return
1735                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1736                         if open_els[0].name is 'colgroup'
                                     # pop the colgroup off the stack of open elements
1737                                 open_els.shift()
1738                                 insertion_mode = ins_mode_in_table
1739                         else
1740                                 parse_error()
1741                         return
1742                 if t.type is TYPE_END_TAG and t.name is 'col'
1743                         parse_error()
1744                         return
1745                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1746                         ins_mode_in_head t
1747                         return
1748                 if t.type is TYPE_EOF
1749                         ins_mode_in_body t
1750                         return
1751                 # Anything else
1752                 if open_els[0].name isnt 'colgroup'
1753                         parse_error()
1754                         return
1755                 open_els.shift()
1756                 insertion_mode = ins_mode_in_table
1757                 insertion_mode t
1758                 return
1759
1760         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1761         ins_mode_in_table_body = (t) ->
                     # "In table body" insertion mode: handles tokens while a table
                     # section (tbody, tfoot or thead) is open. Tokens without a rule
                     # here fall through to ins_mode_in_table.
                     #
                     # t: the token to process
1762                 if t.type is TYPE_START_TAG and t.name is 'tr'
1763                         clear_stack_to_table_body_context()
1764                         insert_html_element t
1765                         insertion_mode = ins_mode_in_row
1766                         return
1767                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
                             # a cell directly inside a section: open an implied <tr>, then
                             # let the row mode handle the cell token
1768                         parse_error()
1769                         clear_stack_to_table_body_context()
1770                         insert_html_element new_open_tag 'tr'
1771                         insertion_mode = ins_mode_in_row
1772                         insertion_mode t
1773                         return
1774                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
1775                         if not is_in_table_scope(t.name) # fixfull check namespace
1776                                 parse_error()
1777                                 return
1778                         clear_stack_to_table_body_context()
1779                         open_els.shift()
1780                         insertion_mode = ins_mode_in_table
1781                         return
1782                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead']) or (t.type is TYPE_END_TAG and t.name is 'table')
                             # walk open_els from the current node until a table section is
                             # found or an entry of table_scopers stops the search
1783                         has = false
1784                         for el in open_els
1785                                 if el.name in ['tbody', 'tfoot', 'thead']
1786                                         has = true
1787                                         break
1788                                 break if table_scopers[el.name]
1790                         unless has
1791                                 parse_error()
1792                                 return
                             # close the open section and reprocess the token "in table"
1793                         clear_stack_to_table_body_context()
1794                         open_els.shift()
1795                         insertion_mode = ins_mode_in_table
1796                         insertion_mode t
1797                         return
1798                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr']
1799                         parse_error()
1800                         return
1801                 # Anything else
1802                 ins_mode_in_table t
1803
1804         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1805         ins_mode_in_row = (t) ->
                     # "In row" insertion mode: handles tokens while a <tr> is open.
                     # Cell start tags open a cell; tokens that end the row pop the
                     # <tr> and switch to ins_mode_in_table_body. Tokens without a rule
                     # here fall through to ins_mode_in_table.
                     #
                     # t: the token to process
1806                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
1807                         clear_stack_to_table_row_context()
1808                         insert_html_element t
1809                         insertion_mode = ins_mode_in_cell
1810                         afe_push_marker()
1811                         return
1812                 if t.type is TYPE_END_TAG and t.name is 'tr'
1813                         unless is_in_table_scope 'tr'
                                     parse_error()
                                     return
1814                         clear_stack_to_table_row_context()
1815                         open_els.shift()
1816                         insertion_mode = ins_mode_in_table_body
1819                         return
1820                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr']) or (t.type is TYPE_END_TAG and t.name is 'table')
1821                         unless is_in_table_scope 'tr'
                                     parse_error()
                                     return
                             # close the row, then let the table body mode see the token
1822                         clear_stack_to_table_row_context()
1823                         open_els.shift()
1824                         insertion_mode = ins_mode_in_table_body
1825                         insertion_mode t
1828                         return
1829                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
1830                         unless is_in_table_scope t.name # fixfull namespace
                                     parse_error()
                                     return
                             # no <tr> in table scope: the token is dropped without an error
1831                         return unless is_in_table_scope 'tr'
1832                         clear_stack_to_table_row_context()
1833                         open_els.shift()
1834                         insertion_mode = ins_mode_in_table_body
1835                         insertion_mode t
1838                         return
1839                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th']
1840                         parse_error()
1841                         return
1842                 # Anything else
1843                 ins_mode_in_table t
1844
1845         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1846         close_the_cell = ->
                     # Closes the currently open table cell: generates implied end tags,
                     # pops open_els through the nearest td/th, calls
                     # clear_afe_to_marker() and switches to ins_mode_in_row.
1847                 generate_implied_end_tags()
                     # the current node should now be the cell itself; compare element
                     # names (not the element object) for both td and th
1848                 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1849                         parse_error()
1850                 loop
1851                         el = open_els.shift()
1852                         if el.name is 'td' or el.name is 'th'
1853                                 break
1854                 clear_afe_to_marker()
1855                 insertion_mode = ins_mode_in_row
1856
1857         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1858         ins_mode_in_cell = (t) ->
                     # "In cell" insertion mode: handles tokens while a <td> or <th> is
                     # open. Tokens that end the cell close it and are reprocessed by
                     # whichever mode is current afterwards; everything else is
                     # delegated to ins_mode_in_body.
                     #
                     # t: the token to process
1859                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1860                         if is_in_table_scope t.name
1861                                 generate_implied_end_tags()
1862                                 if open_els[0].name isnt t.name
                                             # must be a call; a bare name only references it
1863                                         parse_error()
1864                                 loop
1865                                         el = open_els.shift()
1866                                         if el.name is t.name
1867                                                 break
1868                                 clear_afe_to_marker()
1869                                 insertion_mode = ins_mode_in_row
1870                         else
1871                                 parse_error()
1872                         return
1873                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
                             # walk open_els from the current node until a cell is found or
                             # an entry of table_scopers stops the search
1874                         has = false
1875                         for el in open_els
1876                                 if el.name is 'td' or el.name is 'th'
1877                                         has = true
1878                                         break
1879                                 if table_scopers[el.name]
1880                                         break
1881                         if !has
1882                                 parse_error()
1883                                 return
1884                         close_the_cell()
1885                         insertion_mode t
1886                         return
1887                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1888                         parse_error()
1889                         return
1890                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1891                         if is_in_table_scope t.name # fixfull namespace
                                     # close_the_cell() changes insertion_mode, so the token
                                     # is reprocessed by the row mode
1892                                 close_the_cell()
1893                                 insertion_mode t
1894                         else
1895                                 parse_error()
1896                         return
1897                 # Anything Else
1898                 ins_mode_in_body t
1899
1900         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1901         ins_mode_in_select = (t) ->
                     # "In select" insertion mode: handles tokens while a <select> is
                     # open. Only option/optgroup content, text and comments are
                     # inserted; tokens with no rule here are reported via
                     # parse_error() and ignored.
                     #
                     # t: the token to process
1902                 if t.type is TYPE_TEXT and t.text is "\u0000"
1903                         parse_error()
1904                         return
1905                 if t.type is TYPE_TEXT
1906                         insert_character t
1907                         return
1908                 if t.type is TYPE_COMMENT
1909                         insert_comment t
1910                         return
1911                 if t.type is TYPE_DOCTYPE
1912                         parse_error()
1913                         return
1914                 if t.type is TYPE_START_TAG and t.name is 'html'
1915                         ins_mode_in_body t
1916                         return
1917                 if t.type is TYPE_START_TAG and t.name is 'option'
                             # a new option implicitly closes the previous one
1918                         if open_els[0].name is 'option'
1919                                 open_els.shift()
1920                         insert_html_element t
1921                         return
1922                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
                             # a new optgroup implicitly closes an open option and optgroup
1923                         if open_els[0].name is 'option'
1924                                 open_els.shift()
1925                         if open_els[0].name is 'optgroup'
1926                                 open_els.shift()
1927                         insert_html_element t
1928                         return
1929                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1930                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1931                                 open_els.shift()
1932                         if open_els[0].name is 'optgroup'
1933                                 open_els.shift()
1934                         else
1935                                 parse_error()
1936                         return
1937                 if t.type is TYPE_END_TAG and t.name is 'option'
1938                         if open_els[0].name is 'option'
1939                                 open_els.shift()
1940                         else
1941                                 parse_error()
1942                         return
1943                 if t.type is TYPE_END_TAG and t.name is 'select'
1944                         if is_in_select_scope 'select'
1945                                 loop
1946                                         el = open_els.shift()
1947                                         if el.name is 'select'
1948                                                 break
1949                                 reset_insertion_mode()
1950                         else
1951                                 parse_error()
1952                         return
1953                 if t.type is TYPE_START_TAG and t.name is 'select'
1954                         parse_error()
1955                         loop
1956                                 el = open_els.shift()
1957                                 if el.name is 'select'
1958                                         break
1959                         reset_insertion_mode()
1960                         # spec says that this is the same as </select> but it doesn't say
1961                         # to check scope first
1962                         return
1963                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1964                         parse_error()
                             # ignore the token only when NO select is in select scope
                             # (fragment case); otherwise close the select and reprocess
1965                         unless is_in_select_scope 'select'
1966                                 return
1967                         loop
1968                                 el = open_els.shift()
1969                                 if el.name is 'select'
1970                                         break
1971                         reset_insertion_mode()
1972                         insertion_mode t
1973                         return
                     # <script>, <template> and </template> all use the "in head" rules
1974                 if (t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')) or (t.type is TYPE_END_TAG and t.name is 'template')
1975                         ins_mode_in_head t
1976                         return
1977                 if t.type is TYPE_EOF
1978                         ins_mode_in_body t
1979                         return
1980                 # Anything else
1981                 parse_error()
1982                 return
1983
1984         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1985         ins_mode_in_select_in_table = (t) ->
                     # "In select in table" insertion mode: like ins_mode_in_select, but
                     # table-structure tags close the open <select> and are then
                     # reprocessed by whatever mode reset_insertion_mode() selects.
                     #
                     # t: the token to process
1986                 if t.type is TYPE_START_TAG and t.name in ['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th']
1987                         parse_error()
1988                         loop
1989                                 el = open_els.shift()
1990                                 break if el.name is 'select'
1992                         reset_insertion_mode()
1993                         insertion_mode t
1995                 else if t.type is TYPE_END_TAG and t.name in ['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th']
1996                         parse_error()
                             # the end tag is only acted on when a matching element is in
                             # table scope; otherwise it is ignored after the error
1997                         if is_in_table_scope t.name, NS_HTML
1999                                 loop
2000                                         el = open_els.shift()
2001                                         break if el.name is 'select'
2003                                 reset_insertion_mode()
2004                                 insertion_mode t
                     else
2006                         # Anything else
2007                         ins_mode_in_select t
2008                 return
2009
2010         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2011         ins_mode_in_template = (t) ->
                     # "In template" insertion mode: handles tokens directly inside a
                     # <template>. Most start tags replace the current entry of
                     # template_insertion_modes with the mode that suits the tag,
                     # switch to that mode and reprocess the token there.
                     #
                     # t: the token to process
2012                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2013                         ins_mode_in_body t
2014                         return
2015                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2016                         ins_mode_in_head t
2017                         return
2018                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2019                         template_insertion_modes.shift()
2020                         template_insertion_modes.unshift ins_mode_in_table
2021                         insertion_mode = ins_mode_in_table
2022                         insertion_mode t
2023                         return
2024                 if t.type is TYPE_START_TAG and t.name is 'col'
2025                         template_insertion_modes.shift()
2026                         template_insertion_modes.unshift ins_mode_in_column_group
2027                         insertion_mode = ins_mode_in_column_group
2028                         insertion_mode t
2029                         return
2030                 if t.type is TYPE_START_TAG and t.name is 'tr'
2031                         template_insertion_modes.shift()
2032                         template_insertion_modes.unshift ins_mode_in_table_body
2033                         insertion_mode = ins_mode_in_table_body
2034                         insertion_mode t
2035                         return
2036                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2037                         template_insertion_modes.shift()
2038                         template_insertion_modes.unshift ins_mode_in_row
2039                         insertion_mode = ins_mode_in_row
2040                         insertion_mode t
2041                         return
2042                 if t.type is TYPE_START_TAG
                             # any other start tag
2043                         template_insertion_modes.shift()
2044                         template_insertion_modes.unshift ins_mode_in_body
2045                         insertion_mode = ins_mode_in_body
2046                         insertion_mode t
2047                         return
2048                 if t.type is TYPE_END_TAG
                             # any other end tag
2049                         parse_error()
2050                         return
                     # the token type constants are all named TYPE_*; a bare EOF is not
                     # one of them
2051                 if t.type is TYPE_EOF
2052                         unless template_tag_is_open()
2053                                 stop_parsing()
2054                                 return
                             # EOF with a template still open: close it, then let the
                             # mode chosen by reset_insertion_mode() handle the EOF
2055                         parse_error()
2056                         loop
2057                                 el = open_els.shift()
2058                                 if el.name is 'template' # fixfull check namespace
2059                                         break
2060                         clear_afe_to_marker()
2061                         template_insertion_modes.shift()
2062                         reset_insertion_mode()
2063                         insertion_mode t
2064
2065         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2066         ins_mode_after_body = (t) ->
                     # "After body" insertion mode: handles tokens seen after </body>.
                     # Anything that is not whitespace, a comment, a doctype, an html
                     # tag or EOF is an error that switches back to ins_mode_in_body.
                     #
                     # t: the token to process
2067                 if is_space_tok t
2068                         ins_mode_in_body t
2069                         return
2070                 if t.type is TYPE_COMMENT
                             # The comment becomes the last child of the FIRST element in
                             # the stack of open elements. open_els keeps the current node
                             # at index 0, so the first element is at the last index.
2071                         insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2072                         return
2073                 if t.type is TYPE_DOCTYPE
2074                         parse_error()
2075                         return
2076                 if t.type is TYPE_START_TAG and t.name is 'html'
2077                         ins_mode_in_body t
2078                         return
2079                 if t.type is TYPE_END_TAG and t.name is 'html'
2080                         # fixfull fragment case
2081                         insertion_mode = ins_mode_after_after_body
2082                         return
2083                 if t.type is TYPE_EOF
2084                         stop_parsing()
2085                         return
2086                 # Anything else
2087                 parse_error()
2088                 insertion_mode = ins_mode_in_body
2089                 insertion_mode t
2090
2091         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2092         ins_mode_in_frameset = (t) ->
                     # "In frameset" insertion mode: handles tokens while a <frameset>
                     # is open. Only whitespace, comments, nested framesets, frames
                     # and noframes are accepted; other tokens are reported via
                     # parse_error() and ignored.
                     #
                     # t: the token to process
2093                 if is_space_tok t
2094                         insert_character t
2095                         return
2096                 if t.type is TYPE_COMMENT
2097                         insert_comment t
2098                         return
2099                 if t.type is TYPE_DOCTYPE
2100                         parse_error()
2101                         return
2102                 if t.type is TYPE_START_TAG and t.name is 'html'
2103                         ins_mode_in_body t
2104                         return
2105                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2106                         insert_html_element t
2107                         return
2108                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2109                         # TODO ?correct for: "if the current node is the root html element"
2110                         if open_els.length is 1
2111                                 parse_error()
2112                                 return # fragment case
2113                         open_els.shift()
                             # leave frameset mode once the outermost frameset is closed
2114                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2115                                 insertion_mode = ins_mode_after_frameset
2116                         return
2117                 if t.type is TYPE_START_TAG and t.name is 'frame'
                             # <frame> never stays open: insert it, then pop it again
2118                         insert_html_element t
2119                         open_els.shift()
2120                         t.acknowledge_self_closing()
2121                         return
2122                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2123                         ins_mode_in_head t
2124                         return
2125                 if t.type is TYPE_EOF
2126                         # TODO ?correct for: "if the current node is not the root html element"
2127                         if open_els.length isnt 1
2128                                 parse_error()
2129                         stop_parsing()
2130                         return
2131                 # Anything else
2132                 parse_error()
2133                 return
2134
2135         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2136         ins_mode_after_frameset = (t) ->
                     # "After frameset" insertion mode: handles tokens seen after the
                     # outermost </frameset>. Tokens with no rule here are reported
                     # via parse_error() and ignored.
                     #
                     # t: the token to process
2137                 if is_space_tok t
2138                         insert_character t
2139                         return
2140                 if t.type is TYPE_COMMENT
2141                         insert_comment t
2142                         return
2143                 if t.type is TYPE_DOCTYPE
2144                         parse_error()
2145                         return
2146                 if t.type is TYPE_START_TAG and t.name is 'html'
2147                         ins_mode_in_body t
2148                         return
2149                 if t.type is TYPE_END_TAG and t.name is 'html'
                             # assign the shared insertion_mode variable so that the mode
                             # actually changes
2150                         insertion_mode = ins_mode_after_after_frameset
2151                         return
2152                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2153                         ins_mode_in_head t
2154                         return
2155                 if t.type is TYPE_EOF
2156                         stop_parsing()
2157                         return
2158                 # Anything else
2159                 parse_error()
2160                 return
2161
2162         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2163         ins_mode_after_after_body = (t) ->
                     # "After after body" insertion mode: handles tokens seen after
                     # </html>. Any other content is an error that switches back to
                     # ins_mode_in_body, where the token is reprocessed.
                     #
                     # t: the token to process
2164                 if t.type is TYPE_COMMENT
                             # the comment is appended to doc's children
2165                         insert_comment t, [doc, doc.children.length]
2166                         return
2167                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2168                         ins_mode_in_body t
2169                         return
2170                 if t.type is TYPE_EOF
2171                         stop_parsing()
2172                         return
2173                 # Anything else
2174                 parse_error()
2175                 insertion_mode = ins_mode_in_body
                     # reprocess the token in the new mode so that it is not dropped
                     insertion_mode t
2176                 return
2177
2178         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2179         ins_mode_after_after_frameset = (t) ->
2180                 if t.type is TYPE_COMMENT
2181                         insert_comment t, [doc, doc.children.length]
2182                         return
2183                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2184                         ins_mode_in_body t
2185                         return
2186                 if t.type is TYPE_EOF
2187                         stop_parsing()
2188                         return
2189                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2190                         ins_mode_in_head t
2191                         return
2192                 # Anything else
2193                 parse_error()
2194                 return
2195
2196
2197
2198
2199
2200         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2201         tok_state_data = ->
2202                 switch c = txt.charAt(cur++)
2203                         when '&'
2204                                 return new_text_node parse_character_reference()
2205                         when '<'
2206                                 tok_state = tok_state_tag_open
2207                         when "\u0000"
2208                                 parse_error()
2209                                 return new_text_node c
2210                         when '' # EOF
2211                                 return new_eof_token()
2212                         else
2213                                 return new_text_node c
2214                 return null
2215
2216         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2217         # not needed: tok_state_character_reference_in_data = ->
2218         # just call parse_character_reference()
2219
2220         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2221         tok_state_rcdata = ->
2222                 switch c = txt.charAt(cur++)
2223                         when '&'
2224                                 return new_text_node parse_character_reference()
2225                         when '<'
2226                                 tok_state = tok_state_rcdata_less_than_sign
2227                         when "\u0000"
2228                                 parse_error()
2229                                 return new_character_token "\ufffd"
2230                         when '' # EOF
2231                                 return new_eof_token()
2232                         else
2233                                 return new_character_token c
2234                 return null
2235
2236         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2237         # not needed: tok_state_character_reference_in_rcdata = ->
2238         # just call parse_character_reference()
2239
2240         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2241         tok_state_rawtext = ->
2242                 switch c = txt.charAt(cur++)
2243                         when '<'
2244                                 tok_state = tok_state_rawtext_less_than_sign
2245                         when "\u0000"
2246                                 parse_error()
2247                                 return new_character_token "\ufffd"
2248                         when '' # EOF
2249                                 return new_eof_token()
2250                         else
2251                                 return new_character_token c
2252                 return null
2253
2254         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2255         tok_state_script_data = ->
2256                 switch c = txt.charAt(cur++)
2257                         when '<'
2258                                 tok_state = tok_state_script_data_less_than_sign
2259                         when "\u0000"
2260                                 parse_error()
2261                                 return new_character_token "\ufffd"
2262                         when '' # EOF
2263                                 return new_eof_token()
2264                         else
2265                                 return new_character_token c
2266                 return null
2267
2268         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2269         tok_state_plaintext = ->
2270                 switch c = txt.charAt(cur++)
2271                         when "\u0000"
2272                                 parse_error()
2273                                 return new_character_token "\ufffd"
2274                         when '' # EOF
2275                                 return new_eof_token()
2276                         else
2277                                 return new_character_token c
2278                 return null
2279
2280
2281         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2282         tok_state_tag_open = ->
2283                 switch c = txt.charAt(cur++)
2284                         when '!'
2285                                 tok_state = tok_state_markup_declaration_open
2286                         when '/'
2287                                 tok_state = tok_state_end_tag_open
2288                         when '?'
2289                                 parse_error()
2290                                 tok_cur_tag = new_comment_token '?'
2291                                 tok_state = tok_state_bogus_comment
2292                         else
2293                                 if lc_alpha.indexOf(c) > -1
2294                                         tok_cur_tag = new_open_tag c
2295                                         tok_state = tok_state_tag_name
2296                                 else if uc_alpha.indexOf(c) > -1
2297                                         tok_cur_tag = new_open_tag c.toLowerCase()
2298                                         tok_state = tok_state_tag_name
2299                                 else
2300                                         parse_error()
2301                                         tok_state = tok_state_data
2302                                         cur -= 1 # we didn't parse/handle the char after <
2303                                         return new_text_node '<'
2304                 return null
2305
2306         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2307         tok_state_end_tag_open = ->
2308                 switch c = txt.charAt(cur++)
2309                         when '>'
2310                                 parse_error()
2311                                 tok_state = tok_state_data
2312                         when '' # EOF
2313                                 parse_error()
2314                                 tok_state = tok_state_data
2315                                 return new_text_node '</'
2316                         else
2317                                 if uc_alpha.indexOf(c) > -1
2318                                         tok_cur_tag = new_end_tag c.toLowerCase()
2319                                         tok_state = tok_state_tag_name
2320                                 else if lc_alpha.indexOf(c) > -1
2321                                         tok_cur_tag = new_end_tag c
2322                                         tok_state = tok_state_tag_name
2323                                 else
2324                                         parse_error()
2325                                         tok_cur_tag = new_comment_token '/'
2326                                         tok_state = tok_state_bogus_comment
2327                 return null
2328
2329         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2330         tok_state_tag_name = ->
2331                 switch c = txt.charAt(cur++)
2332                         when "\t", "\n", "\u000c", ' '
2333                                 tok_state = tok_state_before_attribute_name
2334                         when '/'
2335                                 tok_state = tok_state_self_closing_start_tag
2336                         when '>'
2337                                 tok_state = tok_state_data
2338                                 tmp = tok_cur_tag
2339                                 tok_cur_tag = null
2340                                 return tmp
2341                         when "\u0000"
2342                                 parse_error()
2343                                 tok_cur_tag.name += "\ufffd"
2344                         when '' # EOF
2345                                 parse_error()
2346                                 tok_state = tok_state_data
2347                         else
2348                                 if uc_alpha.indexOf(c) > -1
2349                                         tok_cur_tag.name += c.toLowerCase()
2350                                 else
2351                                         tok_cur_tag.name += c
2352                 return null
2353
2354         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2355         tok_state_rcdata_less_than_sign = ->
2356                 c = txt.charAt(cur++)
2357                 if c is '/'
2358                         temporary_buffer = ''
2359                         tok_state = tok_state_rcdata_end_tag_open
2360                         return null
2361                 # Anything else
2362                 tok_state = tok_state_rcdata
2363                 cur -= 1 # reconsume the input character
2364                 return new_character_token '<'
2365
2366         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2367         tok_state_rcdata_end_tag_open = ->
2368                 c = txt.charAt(cur++)
2369                 if uc_alpha.indexOf(c) > -1
2370                         tok_cur_tag = new_end_tag c.toLowerCase()
2371                         temporary_buffer += c
2372                         tok_state = tok_state_rcdata_end_tag_name
2373                         return null
2374                 if lc_alpha.indexOf(c) > -1
2375                         tok_cur_tag = new_end_tag c
2376                         temporary_buffer += c
2377                         tok_state = tok_state_rcdata_end_tag_name
2378                         return null
2379                 # Anything else
2380                 tok_state = tok_state_rcdata
2381                 cur -= 1 # reconsume the input character
2382                 return new_character_token "</" # fixfull separate these
2383
2384         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2385         is_appropriate_end_tag = (t) ->
2386                 # spec says to check against "the tag name of the last start tag to
2387                 # have been emitted from this tokenizer", but this is only called from
2388                 # the various "raw" states, which I'm pretty sure all push the start
2389                 # token onto open_els. TODO: verify this after the script data states
2390                 # are implemented
2391                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2392                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2393
2394         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2395         tok_state_rcdata_end_tag_name = ->
2396                 c = txt.charAt(cur++)
2397                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2398                         if is_appropriate_end_tag tok_cur_tag
2399                                 tok_state = tok_state_before_attribute_name
2400                                 return
2401                         # else fall through to "Anything else"
2402                 if c is '/'
2403                         if is_appropriate_end_tag tok_cur_tag
2404                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2405                                 return
2406                         # else fall through to "Anything else"
2407                 if c is '>'
2408                         if is_appropriate_end_tag tok_cur_tag
2409                                 tok_state = tok_state_data
2410                                 return tok_cur_tag
2411                         # else fall through to "Anything else"
2412                 if uc_alpha.indexOf(c) > -1
2413                         tok_cur_tag.name += c.toLowerCase()
2414                         temporary_buffer += c
2415                         return null
2416                 if lc_alpha.indexOf(c) > -1
2417                         tok_cur_tag.name += c
2418                         temporary_buffer += c
2419                         return null
2420                 # Anything else
2421                 tok_state = tok_state_rcdata
2422                 cur -= 1 # reconsume the input character
2423                 return new_character_token '</' + temporary_buffer # fixfull separate these
2424
2425         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2426         tok_state_rawtext_less_than_sign = ->
2427                 c = txt.charAt(cur++)
2428                 if c is '/'
2429                         temporary_buffer = ''
2430                         tok_state = tok_state_rawtext_end_tag_open
2431                         return null
2432                 # Anything else
2433                 tok_state = tok_state_rawtext
2434                 cur -= 1 # reconsume the input character
2435                 return new_character_token '<'
2436
2437         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2438         tok_state_rawtext_end_tag_open = ->
2439                 c = txt.charAt(cur++)
2440                 if uc_alpha.indexOf(c) > -1
2441                         tok_cur_tag = new_end_tag c.toLowerCase()
2442                         temporary_buffer += c
2443                         tok_state = tok_state_rawtext_end_tag_name
2444                         return null
2445                 if lc_alpha.indexOf(c) > -1
2446                         tok_cur_tag = new_end_tag c
2447                         temporary_buffer += c
2448                         tok_state = tok_state_rawtext_end_tag_name
2449                         return null
2450                 # Anything else
2451                 tok_state = tok_state_rawtext
2452                 cur -= 1 # reconsume the input character
2453                 return new_character_token "</" # fixfull separate these
2454
2455         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2456         tok_state_rawtext_end_tag_name = ->
2457                 c = txt.charAt(cur++)
2458                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2459                         if is_appropriate_end_tag tok_cur_tag
2460                                 tok_state = tok_state_before_attribute_name
2461                                 return
2462                         # else fall through to "Anything else"
2463                 if c is '/'
2464                         if is_appropriate_end_tag tok_cur_tag
2465                                 tok_state = tok_state_self_closing_start_tag
2466                                 return
2467                         # else fall through to "Anything else"
2468                 if c is '>'
2469                         if is_appropriate_end_tag tok_cur_tag
2470                                 tok_state = tok_state_data
2471                                 return tok_cur_tag
2472                         # else fall through to "Anything else"
2473                 if uc_alpha.indexOf(c) > -1
2474                         tok_cur_tag.name += c.toLowerCase()
2475                         temporary_buffer += c
2476                         return null
2477                 if lc_alpha.indexOf(c) > -1
2478                         tok_cur_tag.name += c
2479                         temporary_buffer += c
2480                         return null
2481                 # Anything else
2482                 tok_state = tok_state_rawtext
2483                 cur -= 1 # reconsume the input character
2484                 return new_character_token '</' + temporary_buffer # fixfull separate these
2485
2486         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2487
2488         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2489         tok_state_before_attribute_name = ->
2490                 attr_name = null
2491                 switch c = txt.charAt(cur++)
2492                         when "\t", "\n", "\u000c", ' '
2493                                 return null
2494                         when '/'
2495                                 tok_state = tok_state_self_closing_start_tag
2496                                 return null
2497                         when '>'
2498                                 tok_state = tok_state_data
2499                                 tmp = tok_cur_tag
2500                                 tok_cur_tag = null
2501                                 return tmp
2502                         when "\u0000"
2503                                 parse_error()
2504                                 attr_name = "\ufffd"
2505                         when '"', "'", '<', '='
2506                                 parse_error()
2507                                 attr_name = c
2508                         when '' # EOF
2509                                 parse_error()
2510                                 tok_state = tok_state_data
2511                         else
2512                                 if uc_alpha.indexOf(c) > -1
2513                                         attr_name = c.toLowerCase()
2514                                 else
2515                                         attr_name = c
2516                 if attr_name?
2517                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2518                         tok_state = tok_state_attribute_name
2519                 return null
2520
2521         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2522         tok_state_attribute_name = ->
2523                 switch c = txt.charAt(cur++)
2524                         when "\t", "\n", "\u000c", ' '
2525                                 tok_state = tok_state_after_attribute_name
2526                         when '/'
2527                                 tok_state = tok_state_self_closing_start_tag
2528                         when '='
2529                                 tok_state = tok_state_before_attribute_value
2530                         when '>'
2531                                 tok_state = tok_state_data
2532                                 tmp = tok_cur_tag
2533                                 tok_cur_tag = null
2534                                 return tmp
2535                         when "\u0000"
2536                                 parse_error()
2537                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2538                         when '"', "'", '<'
2539                                 parse_error()
2540                                 tok_cur_tag.attrs_a[0][0] = c
2541                         when '' # EOF
2542                                 parse_error()
2543                                 tok_state = tok_state_data
2544                         else
2545                                 if uc_alpha.indexOf(c) > -1
2546                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2547                                 else
2548                                         tok_cur_tag.attrs_a[0][0] += c
2549                 return null
2550
2551         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2552         tok_state_after_attribute_name = ->
2553                 c = txt.charAt(cur++)
2554                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2555                         return
2556                 if c is '/'
2557                         tok_state = tok_state_self_closing_start_tag
2558                         return
2559                 if c is '='
2560                         tok_state = tok_state_before_attribute_value
2561                         return
2562                 if c is '>'
2563                         tok_state = tok_state_data
2564                         return
2565                 if uc_alpha.indexOf(c) > -1
2566                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2567                         tok_state = tok_state_attribute_name
2568                         return
2569                 if c is "\u0000"
2570                         parse_error()
2571                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2572                         tok_state = tok_state_attribute_name
2573                         return
2574                 if c is '' # EOF
2575                         parse_error()
2576                         tok_state = tok_state_data
2577                         cur -= 1 # reconsume
2578                         return
2579                 if c is '"' or c is "'" or c is '<'
2580                         parse_error()
2581                         # fall through to Anything else
2582                 # Anything else
2583                 tok_cur_tag.attrs_a.unshift [c, '']
2584                 tok_state = tok_state_attribute_name
2585
2586         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2587         tok_state_before_attribute_value = ->
2588                 switch c = txt.charAt(cur++)
2589                         when "\t", "\n", "\u000c", ' '
2590                                 return null
2591                         when '"'
2592                                 tok_state = tok_state_attribute_value_double_quoted
2593                         when '&'
2594                                 tok_state = tok_state_attribute_value_unquoted
2595                                 cur -= 1
2596                         when "'"
2597                                 tok_state = tok_state_attribute_value_single_quoted
2598                         when "\u0000"
2599                                 # Parse error
2600                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2601                                 tok_state = tok_state_attribute_value_unquoted
2602                         when '>'
2603                                 # Parse error
2604                                 tok_state = tok_state_data
2605                                 tmp = tok_cur_tag
2606                                 tok_cur_tag = null
2607                                 return tmp
2608                         when '' # EOF
2609                                 parse_error()
2610                                 tok_state = tok_state_data
2611                         else
2612                                 tok_cur_tag.attrs_a[0][1] += c
2613                                 tok_state = tok_state_attribute_value_unquoted
2614                 return null
2615
2616         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2617         tok_state_attribute_value_double_quoted = ->
2618                 switch c = txt.charAt(cur++)
2619                         when '"'
2620                                 tok_state = tok_state_after_attribute_value_quoted
2621                         when '&'
2622                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2623                         when "\u0000"
2624                                 # Parse error
2625                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2626                         when '' # EOF
2627                                 parse_error()
2628                                 tok_state = tok_state_data
2629                         else
2630                                 tok_cur_tag.attrs_a[0][1] += c
2631                 return null
2632
2633         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2634         tok_state_attribute_value_single_quoted = ->
2635                 switch c = txt.charAt(cur++)
2636                         when "'"
2637                                 tok_state = tok_state_after_attribute_value_quoted
2638                         when '&'
2639                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2640                         when "\u0000"
2641                                 # Parse error
2642                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2643                         when '' # EOF
2644                                 parse_error()
2645                                 tok_state = tok_state_data
2646                         else
2647                                 tok_cur_tag.attrs_a[0][1] += c
2648                 return null
2649
2650         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2651         tok_state_attribute_value_unquoted = ->
2652                 switch c = txt.charAt(cur++)
2653                         when "\t", "\n", "\u000c", ' '
2654                                 tok_state = tok_state_before_attribute_name
2655                         when '&'
2656                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2657                         when '>'
2658                                 tok_state = tok_state_data
2659                                 tmp = tok_cur_tag
2660                                 tok_cur_tag = null
2661                                 return tmp
2662                         when "\u0000"
2663                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2664                         when '' # EOF
2665                                 parse_error()
2666                                 tok_state = tok_state_data
2667                         else
2668                                 # Parse Error if ', <, = or ` (backtick)
2669                                 tok_cur_tag.attrs_a[0][1] += c
2670                 return null
2671
2672         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2673         tok_state_after_attribute_value_quoted = ->
2674                 switch c = txt.charAt(cur++)
2675                         when "\t", "\n", "\u000c", ' '
2676                                 tok_state = tok_state_before_attribute_name
2677                         when '/'
2678                                 tok_state = tok_state_self_closing_start_tag
2679                         when '>'
2680                                 tok_state = tok_state_data
2681                                 tmp = tok_cur_tag
2682                                 tok_cur_tag = null
2683                                 return tmp
2684                         when '' # EOF
2685                                 parse_error()
2686                                 tok_state = tok_state_data
2687                         else
2688                                 # Parse Error
2689                                 tok_state = tok_state_before_attribute_name
2690                                 cur -= 1 # we didn't handle that char
2691                 return null
2692
2693         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
2694         # WARNING: put a comment token in tok_cur_tag before setting this state
2695         tok_state_bogus_comment = ->
2696                 next_gt = txt.indexOf '>', cur
2697                 if next_gt is -1
2698                         val = txt.substr cur
2699                         cur = txt.length
2700                 else
2701                         val = txt.substr cur, (next_gt - cur)
2702                         cur = next_gt + 1
2703                 val = val.replace "\u0000", "\ufffd"
2704                 tok_cur_tag.text += val
2705                 tok_state = tok_state_data
2706                 return tok_cur_tag
2707
2708         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
2709         tok_state_markup_declaration_open = ->
2710                 if txt.substr(cur, 2) is '--'
2711                         cur += 2
2712                         tok_cur_tag = new_comment_token ''
2713                         tok_state = tok_state_comment_start
2714                         return
2715                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
2716                         cur += 7
2717                         tok_state = tok_state_doctype
2718                         return
2719                 acn = adjusted_current_node()
2720                 if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
2721                         cur += 7
2722                         tok_state = tok_state_cdata_section
2723                         return
2724                 # Otherwise
2725                 parse_error()
2726                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
2727                 tok_state = tok_state_bogus_comment
2728                 return
2729
2730         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
2731         tok_state_comment_start = ->
2732                 switch c = txt.charAt(cur++)
2733                         when '-'
2734                                 tok_state = tok_state_comment_start_dash
2735                         when "\u0000"
2736                                 parse_error()
2737                                 return new_character_token "\ufffd"
2738                         when '>'
2739                                 parse_error()
2740                                 tok_state = tok_state_data
2741                                 return tok_cur_tag
2742                         when '' # EOF
2743                                 parse_error()
2744                                 tok_state = tok_state_data
2745                                 cur -= 1 # Reconsume
2746                                 return tok_cur_tag
2747                         else
2748                                 tok_cur_tag.text += c
2749                 return null
2750
2751         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
2752         tok_state_comment_start_dash = ->
2753                 switch c = txt.charAt(cur++)
2754                         when '-'
2755                                 tok_state = tok_state_comment_end
2756                         when "\u0000"
2757                                 parse_error()
2758                                 tok_cur_tag.text += "-\ufffd"
2759                                 tok_state = tok_state_comment
2760                         when '>'
2761                                 parse_error()
2762                                 tok_state = tok_state_data
2763                                 return tok_cur_tag
2764                         when '' # EOF
2765                                 parse_error()
2766                                 tok_state = tok_state_data
2767                                 cur -= 1 # Reconsume
2768                                 return tok_cur_tag
2769                         else
2770                                 tok_cur_tag.text += "-#{c}"
2771                                 tok_state = tok_state_comment
2772                 return null
2773
2774         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
2775         tok_state_comment = ->
2776                 switch c = txt.charAt(cur++)
2777                         when '-'
2778                                 tok_state = tok_state_comment_end_dash
2779                         when "\u0000"
2780                                 parse_error()
2781                                 tok_cur_tag.text += "\ufffd"
2782                         when '' # EOF
2783                                 parse_error()
2784                                 tok_state = tok_state_data
2785                                 cur -= 1 # Reconsume
2786                                 return tok_cur_tag
2787                         else
2788                                 tok_cur_tag.text += c
2789                 return null
2790
2791         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
2792         tok_state_comment_end_dash = ->
2793                 switch c = txt.charAt(cur++)
2794                         when '-'
2795                                 tok_state = tok_state_comment_end
2796                         when "\u0000"
2797                                 parse_error()
2798                                 tok_cur_tag.text += "-\ufffd"
2799                                 tok_state = tok_state_comment
2800                         when '' # EOF
2801                                 parse_error()
2802                                 tok_state = tok_state_data
2803                                 cur -= 1 # Reconsume
2804                                 return tok_cur_tag
2805                         else
2806                                 tok_cur_tag.text += "-#{c}"
2807                                 tok_state = tok_state_comment
2808                 return null
2809
2810         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
2811         tok_state_comment_end = ->
2812                 switch c = txt.charAt(cur++)
2813                         when '>'
2814                                 tok_state = tok_state_data
2815                                 return tok_cur_tag
2816                         when "\u0000"
2817                                 parse_error()
2818                                 tok_cur_tag.text += "--\ufffd"
2819                                 tok_state = tok_state_comment
2820                         when '!'
2821                                 parse_error()
2822                                 tok_state = tok_state_comment_end_bang
2823                         when '-'
2824                                 parse_error()
2825                                 tok_cur_tag.text += '-'
2826                         when '' # EOF
2827                                 parse_error()
2828                                 tok_state = tok_state_data
2829                                 cur -= 1 # Reconsume
2830                                 return tok_cur_tag
2831                         else
2832                                 parse_error()
2833                                 tok_cur_tag.text += "--#{c}"
2834                                 tok_state = tok_state_comment
2835                 return null
2836
2837         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
2838         tok_state_comment_end_bang = ->
2839                 switch c = txt.charAt(cur++)
2840                         when '-'
2841                                 tok_cur_tag.text += "--!#{c}"
2842                                 tok_state = tok_state_comment_end_dash
2843                         when '>'
2844                                 tok_state = tok_state_data
2845                                 return tok_cur_tag
2846                         when "\u0000"
2847                                 parse_error()
2848                                 tok_cur_tag.text += "--!\ufffd"
2849                                 tok_state = tok_state_comment
2850                         when '' # EOF
2851                                 parse_error()
2852                                 tok_state = tok_state_data
2853                                 cur -= 1 # Reconsume
2854                                 return tok_cur_tag
2855                         else
2856                                 tok_cur_tag.text += "--!#{c}"
2857                                 tok_state = tok_state_comment
2858                 return null
2859
2860         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
2861         tok_state_doctype = ->
2862                 switch c = txt.charAt(cur++)
2863                         when "\t", "\u000a", "\u000c", ' '
2864                                 tok_state = tok_state_before_doctype_name
2865                         when '' # EOF
2866                                 parse_error()
2867                                 tok_state = tok_state_data
2868                                 el = new_doctype_token ''
2869                                 el.flag 'force-quirks', true
2870                                 cur -= 1 # Reconsume
2871                                 return el
2872                         else
2873                                 parse_error()
2874                                 tok_state = tok_state_before_doctype_name
2875                                 cur -= 1 # Reconsume
2876                 return null
2877
2878         # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
2879         tok_state_before_doctype_name = ->
2880                 c = txt.charAt(cur++)
2881                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2882                         return
2883                 if uc_alpha.indexOf(c) > -1
2884                         tok_cur_tag = new_doctype_token c.toLowerCase()
2885                         tok_state = tok_state_doctype_name
2886                         return
2887                 if c is "\u0000"
2888                         parse_error()
2889                         tok_cur_tag = new_doctype_token "\ufffd"
2890                         tok_state = tok_state_doctype_name
2891                         return
2892                 if c is '>'
2893                         parse_error()
2894                         el = new_doctype_token ''
2895                         el.flag 'force-quirks', true
2896                         tok_state = tok_state_data
2897                         return el
2898                 if c is '' # EOF
2899                         parse_error()
2900                         tok_state = tok_state_data
2901                         el = new_doctype_token ''
2902                         el.flag 'force-quirks', true
2903                         cur -= 1 # Reconsume
2904                         return el
2905                 # Anything else
2906                 tok_cur_tag = new_doctype_token c
2907                 tok_state = tok_state_doctype_name
2908                 return null
2909
2910         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
2911         tok_state_doctype_name = ->
2912                 c = txt.charAt(cur++)
2913                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2914                         tok_state = tok_state_after_doctype_name
2915                         return
2916                 if c is '>'
2917                         tok_state = tok_state_data
2918                         return tok_cur_tag
2919                 if uc_alpha.indexOf(c) > -1
2920                         tok_cur_tag.name += c.toLowerCase()
2921                         return
2922                 if c is "\u0000"
2923                         parse_error()
2924                         tok_cur_tag.name += "\ufffd"
2925                         return
2926                 if c is '' # EOF
2927                         parse_error()
2928                         tok_state = tok_state_data
2929                         tok_cur_tag.flag 'force-quirks', true
2930                         cur -= 1 # Reconsume
2931                         return tok_cur_tag
2932                 # Anything else
2933                 tok_cur_tag.name += c
2934                 return null
2935
2936         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
2937         tok_state_after_doctype_name = ->
2938                 c = txt.charAt(cur++)
2939                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2940                         return
2941                 if c is '>'
2942                         tok_state = tok_state_data
2943                         return tok_cur_tag
2944                 if c is '' # EOF
2945                         parse_error()
2946                         tok_state = tok_state_data
2947                         tok_cur_tag.flag 'force-quirks', true
2948                         cur -= 1 # Reconsume
2949                         return tok_cur_tag
2950                 # Anything else
2951                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
2952                         cur += 5
2953                         tok_state = tok_state_after_doctype_public_keyword
2954                         return
2955                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
2956                         cur += 5
2957                         tok_state = tok_state_after_doctype_system_keyword
2958                         return
2959                 parse_error()
2960                 tok_cur_tag.flag 'force-quirks', true
2961                 tok_state = tok_state_bogus_doctype
2962                 return null
2963
2964         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
2965         tok_state_after_doctype_public_keyword = ->
2966                 c = txt.charAt(cur++)
2967                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2968                         tok_state = tok_state_before_doctype_public_identifier
2969                         return
2970                 if c is '"'
2971                         parse_error()
2972                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2973                         tok_state = tok_state_doctype_public_identifier_double_quoted
2974                         return
2975                 if c is "'"
2976                         parse_error()
2977                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2978                         tok_state = tok_state_doctype_public_identifier_single_quoted
2979                         return
2980                 if c is '>'
2981                         parse_error()
2982                         tok_cur_tag.flag 'force-quirks', true
2983                         tok_state = tok_state_data
2984                         return tok_cur_tag
2985                 if c is '' # EOF
2986                         parse_error()
2987                         tok_state = tok_state_data
2988                         tok_cur_tag.flag 'force-quirks', true
2989                         cur -= 1 # Reconsume
2990                         return tok_cur_tag
2991                 # Anything else
2992                 parse_error()
2993                 tok_cur_tag.flag 'force-quirks', true
2994                 tok_state = tok_state_bogus_doctype
2995                 return null
2996
2997         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
2998         tok_state_before_doctype_public_identifier = ->
2999                 c = txt.charAt(cur++)
3000                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3001                         return
3002                 if c is '"'
3003                         parse_error()
3004                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3005                         tok_state = tok_state_doctype_public_identifier_double_quoted
3006                         return
3007                 if c is "'"
3008                         parse_error()
3009                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3010                         tok_state = tok_state_doctype_public_identifier_single_quoted
3011                         return
3012                 if c is '>'
3013                         parse_error()
3014                         tok_cur_tag.flag 'force-quirks', true
3015                         tok_state = tok_state_data
3016                         return tok_cur_tag
3017                 if c is '' # EOF
3018                         parse_error()
3019                         tok_state = tok_state_data
3020                         tok_cur_tag.flag 'force-quirks', true
3021                         cur -= 1 # Reconsume
3022                         return tok_cur_tag
3023                 # Anything else
3024                 parse_error()
3025                 tok_cur_tag.flag 'force-quirks', true
3026                 tok_state = tok_state_bogus_doctype
3027                 return null
3028
3029
3030         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3031         tok_state_doctype_public_identifier_double_quoted = ->
3032                 c = txt.charAt(cur++)
3033                 if c is '"'
3034                         tok_state = tok_state_after_doctype_public_identifier
3035                         return
3036                 if c is "\u0000"
3037                         parse_error()
3038                         tok_cur_tag.public_identifier += "\ufffd"
3039                         return
3040                 if c is '>'
3041                         parse_error()
3042                         tok_cur_tag.flag 'force-quirks', true
3043                         tok_state = tok_state_data
3044                         return tok_cur_tag
3045                 if c is '' # EOF
3046                         parse_error()
3047                         tok_state = tok_state_data
3048                         tok_cur_tag.flag 'force-quirks', true
3049                         cur -= 1 # Reconsume
3050                         return tok_cur_tag
3051                 # Anything else
3052                 tok_cur_tag.public_identifier += c
3053                 return null
3054
3055         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3056         tok_state_doctype_public_identifier_single_quoted = ->
3057                 c = txt.charAt(cur++)
3058                 if c is "'"
3059                         tok_state = tok_state_after_doctype_public_identifier
3060                         return
3061                 if c is "\u0000"
3062                         parse_error()
3063                         tok_cur_tag.public_identifier += "\ufffd"
3064                         return
3065                 if c is '>'
3066                         parse_error()
3067                         tok_cur_tag.flag 'force-quirks', true
3068                         tok_state = tok_state_data
3069                         return tok_cur_tag
3070                 if c is '' # EOF
3071                         parse_error()
3072                         tok_state = tok_state_data
3073                         tok_cur_tag.flag 'force-quirks', true
3074                         cur -= 1 # Reconsume
3075                         return tok_cur_tag
3076                 # Anything else
3077                 tok_cur_tag.public_identifier += c
3078                 return null
3079
3080         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3081         tok_state_after_doctype_public_identifier = ->
3082                 c = txt.charAt(cur++)
3083                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3084                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3085                         return
3086                 if c is '>'
3087                         tok_state = tok_state_data
3088                         return tok_cur_tag
3089                 if c is '"'
3090                         parse_error()
3091                         tok_cur_tag.system_identifier = ''
3092                         tok_state = tok_state_doctype_system_identifier_double_quoted
3093                         return
3094                 if c is "'"
3095                         parse_error()
3096                         tok_cur_tag.system_identifier = ''
3097                         tok_state = tok_state_doctype_system_identifier_single_quoted
3098                         return
3099                 if c is '' # EOF
3100                         parse_error()
3101                         tok_state = tok_state_data
3102                         tok_cur_tag.flag 'force-quirks', true
3103                         cur -= 1 # Reconsume
3104                         return tok_cur_tag
3105                 # Anything else
3106                 parse_error()
3107                 tok_cur_tag.flag 'force-quirks', true
3108                 tok_state = tok_state_bogus_doctype
3109                 return null
3110
3111         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3112         tok_state_between_doctype_public_and_system_identifiers = ->
3113                 c = txt.charAt(cur++)
3114                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3115                         return
3116                 if c is '>'
3117                         tok_state = tok_state_data
3118                         return tok_cur_tag
3119                 if c is '"'
3120                         parse_error()
3121                         tok_cur_tag.system_identifier = ''
3122                         tok_state = tok_state_doctype_system_identifier_double_quoted
3123                         return
3124                 if c is "'"
3125                         parse_error()
3126                         tok_cur_tag.system_identifier = ''
3127                         tok_state = tok_state_doctype_system_identifier_single_quoted
3128                         return
3129                 if c is '' # EOF
3130                         parse_error()
3131                         tok_state = tok_state_data
3132                         tok_cur_tag.flag 'force-quirks', true
3133                         cur -= 1 # Reconsume
3134                         return tok_cur_tag
3135                 # Anything else
3136                 parse_error()
3137                 tok_cur_tag.flag 'force-quirks', true
3138                 tok_state = tok_state_bogus_doctype
3139                 return null
3140
3141         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3142         tok_state_after_doctype_system_keyword = ->
3143                 c = txt.charAt(cur++)
3144                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3145                         tok_state = tok_state_before_doctype_system_identifier
3146                         return
3147                 if c is '"'
3148                         parse_error()
3149                         tok_cur_tag.system_identifier = ''
3150                         tok_state = tok_state_doctype_system_identifier_double_quoted
3151                         return
3152                 if c is "'"
3153                         parse_error()
3154                         tok_cur_tag.system_identifier = ''
3155                         tok_state = tok_state_doctype_system_identifier_single_quoted
3156                         return
3157                 if c is '>'
3158                         parse_error()
3159                         tok_cur_tag.flag 'force-quirks', true
3160                         tok_state = tok_state_data
3161                         return tok_cur_tag
3162                 if c is '' # EOF
3163                         parse_error()
3164                         tok_state = tok_state_data
3165                         tok_cur_tag.flag 'force-quirks', true
3166                         cur -= 1 # Reconsume
3167                         return tok_cur_tag
3168                 # Anything else
3169                 parse_error()
3170                 tok_cur_tag.flag 'force-quirks', true
3171                 tok_state = tok_state_bogus_doctype
3172                 return null
3173
3174         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3175         tok_state_before_doctype_system_identifier = ->
3176                 c = txt.charAt(cur++)
3177                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3178                         return
3179                 if c is '"'
3180                         tok_cur_tag.system_identifier = ''
3181                         tok_state = tok_state_doctype_system_identifier_double_quoted
3182                         return
3183                 if c is "'"
3184                         tok_cur_tag.system_identifier = ''
3185                         tok_state = tok_state_doctype_system_identifier_single_quoted
3186                         return
3187                 if c is '>'
3188                         parse_error()
3189                         tok_cur_tag.flag 'force-quirks', true
3190                         tok_state = tok_state_data
3191                         return tok_cur_tag
3192                 if c is '' # EOF
3193                         parse_error()
3194                         tok_state = tok_state_data
3195                         tok_cur_tag.flag 'force-quirks', true
3196                         cur -= 1 # Reconsume
3197                         return tok_cur_tag
3198                 # Anything else
3199                 parse_error()
3200                 tok_cur_tag.flag 'force-quirks', true
3201                 tok_state = tok_state_bogus_doctype
3202                 return null
3203
3204         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3205         tok_state_doctype_system_identifier_double_quoted = ->
3206                 c = txt.charAt(cur++)
3207                 if c is '"'
3208                         tok_state = tok_state_after_doctype_system_identifier
3209                         return
3210                 if c is "\u0000"
3211                         parse_error()
3212                         tok_cur_tag.system_identifier += "\ufffd"
3213                         return
3214                 if c is '>'
3215                         parse_error()
3216                         tok_cur_tag.flag 'force-quirks', true
3217                         tok_state = tok_state_data
3218                         return tok_cur_tag
3219                 if c is '' # EOF
3220                         parse_error()
3221                         tok_state = tok_state_data
3222                         tok_cur_tag.flag 'force-quirks', true
3223                         cur -= 1 # Reconsume
3224                         return tok_cur_tag
3225                 # Anything else
3226                 tok_cur_tag.system_identifier += c
3227                 return null
3228
3229         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3230         tok_state_doctype_system_identifier_single_quoted = ->
3231                 c = txt.charAt(cur++)
3232                 if c is "'"
3233                         tok_state = tok_state_after_doctype_system_identifier
3234                         return
3235                 if c is "\u0000"
3236                         parse_error()
3237                         tok_cur_tag.system_identifier += "\ufffd"
3238                         return
3239                 if c is '>'
3240                         parse_error()
3241                         tok_cur_tag.flag 'force-quirks', true
3242                         tok_state = tok_state_data
3243                         return tok_cur_tag
3244                 if c is '' # EOF
3245                         parse_error()
3246                         tok_state = tok_state_data
3247                         tok_cur_tag.flag 'force-quirks', true
3248                         cur -= 1 # Reconsume
3249                         return tok_cur_tag
3250                 # Anything else
3251                 tok_cur_tag.system_identifier += c
3252                 return null
3253
3254         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3255         tok_state_after_doctype_system_identifier = ->
3256                 c = txt.charAt(cur++)
3257                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3258                         return
3259                 if c is '>'
3260                         tok_state = tok_state_data
3261                         return tok_cur_tag
3262                 if c is '' # EOF
3263                         parse_error()
3264                         tok_state = tok_state_data
3265                         tok_cur_tag.flag 'force-quirks', true
3266                         cur -= 1 # Reconsume
3267                         return tok_cur_tag
3268                 # Anything else
3269                 parse_error()
3270                 # do _not_ tok_cur_tag.flag 'force-quirks', true
3271                 tok_state = tok_state_bogus_doctype
3272                 return null
3273
3274         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3275         tok_state_bogus_doctype = ->
3276                 c = txt.charAt(cur++)
3277                 if c is '>'
3278                         tok_state = tok_state_data
3279                         return tok_cur_tag
3280                 if c is '' # EOF
3281                         tok_state = tok_state_data
3282                         cur -= 1 # Reconsume
3283                         return tok_cur_tag
3284                 # Anything else
3285                 return null
3286
3287
3288         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3289         # Don't set this as a state, just call it
3290         # returns a string (NOT a text node)
        # Tries to decode the character reference whose text starts at txt[cur].
        # NOTE(review): presumably the caller has just consumed the '&' — confirm.
        #
        # allowed_char: an extra character which, when it is the next character,
        #               means "not a character reference" (no parse error)
        # in_attr:      when true, a legacy reference (no ';') that is followed
        #               by '=' or an alphanumeric is left undecoded
        #
        # On success cur is advanced past the reference and the decoded value is
        # returned. Otherwise '&' is returned and cur is left unchanged.
3291         parse_character_reference = (allowed_char = null, in_attr = false) ->
3292                 if cur >= txt.length
3293                         return '&'
3294                 switch c = txt.charAt(cur)
3295                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3296                                 # explicitly not a parse error
3297                                 return '&'
3298                         when ';'
3299                                 # there has to be "one or more" alnums between & and ; to be a parse error
3300                                 return '&'
3301                         when '#'
                                # numeric reference: decimal, or hex when followed by x/X
3302                                 if cur + 1 >= txt.length
3303                                         return '&'
3304                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
3305                                         prefix = '#x'
3306                                         charset = hex_chars
3307                                         start = cur + 2
3308                                 else
3309                                         charset = digits
3310                                         start = cur + 1
3311                                         prefix = '#'
                                # i counts the digits found at txt[start...]
3312                                 i = 0
3313                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3314                                         i += 1
3315                                 if i is 0
3316                                         return '&'
3317                                 if txt.charAt(start + i) is ';'
3318                                         i += 1
3319                                 # FIXME This is supposed to generate parse errors for some chars
                                # NOTE(review): a missing ';' is not reported as a parse error
                                # here, and the string passed on then has no ';' — confirm
                                # decode_named_char_ref copes with that
3320                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3321                                 if decoded?
3322                                         cur = start + i
3323                                         return decoded
3324                                 return '&'
3325                         else
                                # named reference: i becomes the length of the leading run
                                # of alnum characters (at most 31 are examined)
                                # NOTE(review): if alnum is a string, indexOf('') is 0, so
                                # the scan does not stop at end of input — confirm harmless
3326                                 for i in [0...31]
3327                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
3328                                                 break
3329                                 if i is 0
3330                                         # exit early, because parse_error() below needs at least one alnum
3331                                         return '&'
3332                                 if txt.charAt(cur + i) is ';'
3333                                         i += 1 # include ';' terminator in value
3334                                         decoded = decode_named_char_ref txt.substr(cur, i)
3335                                         if decoded?
3336                                                 cur += i
3337                                                 return decoded
3338                                         parse_error()
3339                                         return '&'
3340                                 else
3341                                         # no ';' terminator (only legacy char refs)
3342                                         max = i
                                        # NOTE(review): when max is 1, CoffeeScript's [2..max]
                                        # counts down (2, then 1) instead of being empty —
                                        # confirm this is harmless
3343                                         for i in [2..max] # no prefix matches, so ok to check shortest first
3344                                                 c = legacy_char_refs[txt.substr(cur, i)]
3345                                                 if c?
3346                                                         if in_attr
3347                                                                 if txt.charAt(cur + i) is '='
3348                                                                         # "because some legacy user agents will
3349                                                                         # misinterpret the markup in those cases"
3350                                                                         parse_error()
3351                                                                         return '&'
3352                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
3353                                                                         # this makes attributes forgiving about url args
3354                                                                         return '&'
3355                                                         # ok, and besides the weird exceptions for attributes...
3356                                                         # return the matching char
3357                                                         cur += i # consume entity chars
3358                                                         parse_error() # because no terminating ";"
3359                                                         return c
3360                                         parse_error()
3361                                         return '&'
3362                 return # never reached
3363
3364         # tree constructor initialization
3365         # see comments on TYPE_TAG/etc for the structure of this data
        # doc is a synthetic <html> root node; only its children are returned below
3366         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3367         open_els = [doc]
3368         afe = [] # active formatting elements
3369         template_insertion_modes = []
3370         insertion_mode = ins_mode_initial
3371         original_insertion_mode = insertion_mode # TODO check spec
3372         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3373         flag_frameset_ok = true
3374         flag_parsing = true
3375         flag_foster_parenting = false
3376         form_element_pointer = null
3377         temporary_buffer = null
3378         pending_table_character_tokens = []
3379         head_element_pointer = null
3380         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3381         context_element = null # FIXME initialize from args.fragment
3382
3383         # tokenizer initialization
3384         tok_state = tok_state_data
3385
3386         # process input
        # Each call runs the current tokenizer state once. A state returns a token
        # when one is complete, otherwise null/undefined; completed tokens are
        # handed to the current insertion mode. The loop ends once flag_parsing
        # is cleared (that happens outside this section).
3387         while flag_parsing
3388                 t = tok_state()
3389                 if t?
3390                         insertion_mode t
3391                         # fixfull parse error if has self-closing flag, but it wasn't acknowledged
3392         return doc.children
3393
# Serializes every node in els by calling its serialize method (shallow and
# show_ids are passed through unchanged) and joins the results with commas.
# Returns '' for an empty list.
serialize_els = (els, shallow, show_ids) ->
        return ("#{t.serialize shallow, show_ids}" for t in els).join ','
3402
# CommonJS exports: the parser entry point, the two debug_log_* helpers and
# the node type constants listed below.
3403 # TODO export TYPE_*
3404 module.exports.parse_html = parse_html
3405 module.exports.debug_log_reset = debug_log_reset
3406 module.exports.debug_log_each = debug_log_each
3407 module.exports.TYPE_TAG = TYPE_TAG
3408 module.exports.TYPE_TEXT = TYPE_TEXT
3409 module.exports.TYPE_COMMENT = TYPE_COMMENT
3410 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE