JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix <html> at start
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
# Browser bootstrap.
# When no CommonJS-style `module.exports` is visible, publish the public API on
# `window.wheic` and point a stand-in `module` object at it.
# note: to get this to run outside a browser, you'll also have to write a
# native implementation of decode_named_char_ref()
# NOTE(review): assigning `module` below makes CoffeeScript declare a
# file-local `module`, which presumably hides any real CommonJS `module` from
# the check on the next line -- confirm before relying on this under node.
if not module?.exports?
        window.wheic = {}
        module = {exports: window.wheic}
56
# Each node is an object of the Node class. Node types that can appear in the
# finished tree:
TYPE_TAG     = 0  # name, {attributes}, [children]
TYPE_TEXT    = 1  # "text"
TYPE_COMMENT = 2
TYPE_DOCTYPE = 3

# Types emitted by the tokenizer (or used for bookkeeping) that shouldn't end
# up in the tree:
TYPE_START_TAG    = 4  # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG      = 5  # name
TYPE_EOF          = 6
TYPE_AFE_MARKER   = 7  # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
TYPE_AAA_BOOKMARK = 8  # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm

# Namespace constants (stored in Node::namespace)
NS_HTML   = 1
NS_MATHML = 2
NS_SVG    = 3
73
# In-memory debug log shared by the whole file (oldest message first).
g_debug_log = []

# Discard everything logged so far.
debug_log_reset = -> g_debug_log = []

# Append one message to the log.
debug_log = (str) -> g_debug_log.push str

# Call cb once for every logged message, in the order they were logged.
debug_log_each = (cb) ->
        cb(str) for str in g_debug_log
82
# Counter behind auto-assigned node ids; bumped once for every Node that is
# created without an explicit id.
prev_node_id = 0

# One node of the parsed tree, or one token emitted by the tokenizer.
#
# type: one of the TYPE_* constants
# args (all optional):
#   name       tag name (default '')
#   text       contents of text/comment nodes (default '')
#   attrs      {key: value} attributes (default {})
#   attrs_a    attributes still being collected, TYPE_START_TAG only
#              (default []). The older spelling `attr_k` is still accepted.
#   children   child nodes (default [])
#   namespace  one of the NS_* constants (default NS_HTML)
#   parent     parent node (default null)
#   token      token associated with this node (default null)
#   id         id to derive this node's id from: the new id is that value with
#              "+" appended. Without it the next counter value is used.
class Node
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # The property is attrs_a, so accept that key; `attr_k` is what
                # this constructor historically read, so keep honoring it too.
                @attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                if args.id?
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"

        # Return a new node that's the same except without the children or
        # parent. attrs is copied one level deep; the id gets a "+" suffix.
        # WARNING this doesn't work right on open tags that are still being
        # parsed (attrs_a is not carried over)
        shallow_clone: ->
                attrs = {}
                attrs[k] = v for k, v of @attrs
                return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token

        # Record that a self-closing flag was acknowledged, on the token when
        # there is one, otherwise on this node.
        acknowledge_self_closing: ->
                if @token?
                        @token.flag 'did_self_close'
                else
                        @flag 'did_self_close', true

        # Placeholder: flags are not stored yet.
        flag: ->
                # fixfull

        # Build a compact text form of this node, for unit tests.
        #   shallow   when true, tags are written as name only (no attributes,
        #             no children)
        #   show_ids  when true, each tag also gets "#<id>,"
        serialize: (shallow = false, show_ids = false) ->
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                if show_ids
                                        ret += "##{@id},"
                                if shallow
                                        break
                                # attributes are sorted by key so output is stable
                                attr_keys = (k for k of @attrs)
                                attr_keys.sort()
                                ret += '{'
                                ret += ("#{JSON.stringify k}:#{JSON.stringify @attrs[k]}" for k in attr_keys).join ','
                                ret += '},['
                                ret += (c.serialize(shallow, show_ids) for c in @children).join ','
                                ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                ret += 'doctype'
                                # FIXME
                        when TYPE_AFE_MARKER
                                ret += 'marker'
                        when TYPE_AAA_BOOKMARK
                                ret += 'aaa_bookmark'
                        else
                                ret += 'unknown:'
                                console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
                return ret
156
# Node/token factories. Each takes only what the parser normally knows at the
# moment it creates that kind of node.
new_open_tag = (name) -> new Node TYPE_START_TAG, {name}
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}
new_element = (name) -> new Node TYPE_TAG, {name}
new_text_node = (txt) -> new Node TYPE_TEXT, {text: txt}
# character tokens and text nodes are the same kind of Node
new_character_token = new_text_node
new_comment_token = (txt) -> new Node TYPE_COMMENT, {text: txt}
new_doctype_token = (name) -> new Node TYPE_DOCTYPE, {name}
new_eof_token = -> new Node TYPE_EOF
new_afe_marker = -> new Node TYPE_AFE_MARKER
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
177
# Character classes, each spelled out as a string of its member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# tag names: alphanumerics plus "-" (some SVG elements have dashes in them)
tag_name_chars = "#{alnum}-"
186
# The HTML "space characters": tab, LF, FF, CR and space.
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# true when txt is exactly one character long and that character is an HTML
# space character
is_space = (txt) ->
        txt.length is 1 and space_chars.indexOf(txt) isnt -1

# true when t is a text token/node holding exactly one HTML space character
is_space_tok = (t) ->
        t.type is TYPE_TEXT and is_space t.text

# The wider set of Unicode whitespace characters.
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
196
# Named character references that are recognized even without a terminating
# semicolon, mapped to the character they stand for.
# Names are 2 to 6 characters long and none is a prefix of any other.
# Invisible characters (nbsp, shy) are written as escapes so they can't be
# lost or mistaken for an empty string when the file is edited or copied.
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
        aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
        aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
        curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
        ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
        euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
        igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
        lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
        Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
        Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
        pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
        shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
        times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
        ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
        yen: '¥', yuml: 'ÿ'
}
219
# Element categories, by tag name.

# void elements
void_elements = [
        'area', 'base', 'br', 'col', 'embed',
        'hr', 'img', 'input', 'keygen', 'link',
        'meta', 'param', 'source', 'track', 'wbr'
]

# raw text elements
raw_text_elements = [
        'script'
        'style'
]

# escapable raw text elements
escapable_raw_text_elements = [
        'textarea'
        'title'
]
# SVG element names (case-sensitive), in the order listed by
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition).
# Written as one space-separated string; every piece but the last ends with
# the separating space.
svg_elements = (
        'a altGlyph altGlyphDef altGlyphItem animate animateColor ' +
        'animateMotion animateTransform circle clipPath color-profile ' +
        'cursor defs desc ellipse feBlend feColorMatrix ' +
        'feComponentTransfer feComposite feConvolveMatrix ' +
        'feDiffuseLighting feDisplacementMap feDistantLight feFlood ' +
        'feFuncA feFuncB feFuncG feFuncR feGaussianBlur feImage ' +
        'feMerge feMergeNode feMorphology feOffset fePointLight ' +
        'feSpecularLighting feSpotLight feTile feTurbulence filter ' +
        'font font-face font-face-format font-face-name font-face-src ' +
        'font-face-uri foreignObject g glyph glyphRef hkern ' +
        'image line linearGradient marker mask metadata ' +
        'missing-glyph mpath path pattern polygon polyline ' +
        'radialGradient rect script set stop style svg ' +
        'switch symbol text textPath title tref tspan use ' +
        'view vkern'
).split ' '
241
# MathML element names, from http://www.w3.org/TR/MathML/ Version 3.0 2nd
# Edition. Each name appears exactly once.
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
273 # foreign_elements = [svg_elements..., mathml_elements...]
274 #normal_elements = All other allowed HTML elements are normal elements.
275
# Elements in the "special" category, as tag name -> namespace constant.
#
# NOTE: a name can map to only one namespace here. "title" is special in both
# HTML and SVG; this table used to list it twice and the later (SVG) entry
# silently won. Only that effective entry is kept, so lookups behave exactly
# as before: a plain `special_elements[name] is namespace` test does not
# recognize the HTML <title> element.
special_elements = {
        # HTML:
        address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
        aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
        blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
        caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
        details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
        embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
        footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
        h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
        header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
        noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
        ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
        script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
        style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
        template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
        thead:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
        wbr:NS_HTML, xmp:NS_HTML,

        # MathML:
        mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
        'annotation-xml':NS_MATHML,

        # SVG:
        foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}
304
# Tag names of the formatting elements (lookup table, every value is true).
formatting_elements =
        a: true
        b: true
        big: true
        code: true
        em: true
        font: true
        i: true
        nobr: true
        s: true
        small: true
        strike: true
        strong: true
        tt: true
        u: true
310
# Tag names listed as foster parenting targets (lookup table, every value is
# true).
foster_parenting_targets = {table: true, tbody: true, tfoot: true, thead: true, tr: true}
318
# Tag names listed as having an implied end tag (lookup table, every value is
# true). All of these are presumably HTML-namespace elements.
end_tag_implied = {
        dd: true, dt: true, li: true,
        option: true, optgroup: true,
        p: true,
        rb: true, rp: true, rt: true, rtc: true
}
332
# Return true when element e (anything with .name and .namespace) belongs to
# the "special" category.
el_is_special = (e) ->
        # "title" is special in both the HTML and the SVG namespace, but the
        # name-keyed special_elements table can hold only one namespace per
        # name, so the HTML case is checked explicitly.
        return true if e.name is 'title' and e.namespace is NS_HTML
        return special_elements[e.name] is e.namespace
335
# decode_named_char_ref()
#
# The list of named character references is _huge_, so the browser is asked to
# decode them (through a scratch <textarea>) instead of shipping the table.
#
# Pass the reference without the leading "&" but with the trailing ";",
# e.g. for "&amp;" pass "amp;".
#
# Returns the decoded text, or null when the browser left the reference
# unchanged. Successful lookups are cached; failed ones are not.
g_dncr =
        cache: {}
        textarea: document.createElement 'textarea'

# TODO test this in IE8
decode_named_char_ref = (txt) ->
        txt = "&#{txt}"
        decoded = g_dncr.cache[txt]
        if decoded?
                return decoded
        # let the browser parse the reference, then read back the plain text
        g_dncr.textarea.innerHTML = txt
        decoded = g_dncr.textarea.value
        if decoded is txt
                return null
        g_dncr.cache[txt] = decoded
        return decoded
357
358 parse_html = (txt, parse_error_cb = null) ->
359         cur = 0 # index of next char in txt to be parsed
360         # declare doc and tokenizer variables so they're in scope below
361         doc = null
362         open_els = null # stack of open elements
363         afe = null # active formatting elements
364         template_insertion_modes = null
365         insertion_mode = null
366         original_insertion_mode = null
367         tok_state = null
368         tok_cur_tag = null # partially parsed tag
369         flag_scripting = null
370         flag_frameset_ok = null
371         flag_parsing = null
372         flag_foster_parenting = null
373         form_element_pointer = null
374         temporary_buffer = null
375         pending_table_character_tokens = null
376         head_element_pointer = null
377         flag_fragment_parsing = null
378         context_element = null
379
380         stop_parsing = ->
381                 flag_parsing = false
382
383         parse_error = ->
384                 if parse_error_cb?
385                         parse_error_cb cur
386                 else
387                         console.log "Parse error at character #{cur} of #{txt.length}"
388
389         afe_push = (new_el) ->
390                 matches = 0
391                 for el, i in afe
392                         if el.name is new_el.name and el.namespace is new_el.namespace
393                                 for k, v of el.attrs
394                                         continue unless new_el.attrs[k] is v
395                                 for k, v of new_el.attrs
396                                         continue unless el.attrs[k] is v
397                                 matches += 1
398                                 if matches is 3
399                                         afe.splice i, 1
400                                         break
401                 afe.unshift new_el
402         afe_push_marker = ->
403                 afe.unshift new_afe_marker()
404
405         # the functions below implement the Tree Construction algorithm
406         # http://www.w3.org/TR/html5/syntax.html#tree-construction
407
408         # But first... the helpers
409         template_tag_is_open = ->
410                 for t in open_els
411                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
412                                 return true
413                 return false
414         is_in_scope_x = (tag_name, scope, namespace) ->
415                 for t in open_els
416                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
417                                 return true
418                         if scope[t.name] is t.namespace
419                                 return false
420                 return false
421         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
422                 for t in open_els
423                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
424                                 return true
425                         if scope[t.name] is t.namespace
426                                 return false
427                         if scope2[t.name] is t.namespace
428                                 return false
429                 return false
430         standard_scopers = { # FIXME these are supposed to be namespace specific
431                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
432                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
433                 template: NS_HTML, mi: NS_MATHML,
434
435                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
436                 'annotation-xml': NS_MATHML,
437
438                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
439         }
440         button_scopers = button: NS_HTML
441         li_scopers = ol: NS_HTML, ul: NS_HTML
442         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
443         is_in_scope = (tag_name, namespace = null) ->
444                 return is_in_scope_x tag_name, standard_scopers, namespace
445         is_in_button_scope = (tag_name, namespace = null) ->
446                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
447         is_in_table_scope = (tag_name, namespace = null) ->
448                 return is_in_scope_x tag_name, table_scopers, namespace
449         is_in_select_scope = (tag_name, namespace = null) ->
450                 for t in open_els
451                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
452                                 return true
453                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
454                                 return false
455                 return false
456         # this checks for a particular element, not by name
457         el_is_in_scope = (el) ->
458                 for t in open_els
459                         if t is el
460                                 return true
461                         if standard_scopers[t.name] is t.namespace
462                                 return false
463                 return false
464
465         clear_to_table_stopers = {
466                 'table': true
467                 'template': true
468                 'html': true
469         }
470         clear_stack_to_table_context = ->
471                 loop
472                         if clear_to_table_stopers[open_els[0].name]?
473                                 break
474                         open_els.shift()
475                 return
476         clear_to_table_body_stopers = {
477                 'tbody': true
478                 'tfoot': true
479                 'thead': true
480                 'template': true
481                 'html': true
482         }
483         clear_stack_to_table_body_context = ->
484                 loop
485                         if clear_to_table_body_stopers[open_els[0].name]?
486                                 break
487                         open_els.shift()
488                 return
489         clear_to_table_row_stopers = {
490                 'tr': true
491                 'template': true
492                 'html': true
493         }
494         clear_stack_to_table_row_context = ->
495                 loop
496                         if clear_to_table_row_stopers[open_els[0].name]?
497                                 break
498                         open_els.shift()
499                 return
500         clear_afe_to_marker = ->
501                 loop
502                         el = afe.shift()
503                         if el.type is TYPE_AFE_MARKER
504                                 return
505
506         # 8.2.3.1 ...
507         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
508         reset_insertion_mode = ->
509                 # 1. Let last be false.
510                 last = false
511                 # 2. Let node be the last node in the stack of open elements.
512                 node_i = 0
513                 node = open_els[node_i]
514                 # 3. Loop: If node is the first node in the stack of open elements,
515                 # then set last to true, and, if the parser was originally created as
516                 # part of the HTML fragment parsing algorithm (fragment case) set node
517                 # to the context element.
518                 loop
519                         if node_i is open_els.length - 1
520                                 last = true
521                                 # fixfull (fragment case)
522
523                         # 4. If node is a select element, run these substeps:
524                         if node.name is 'select'
525                                 # 1. If last is true, jump to the step below labeled done.
526                                 unless last
527                                         # 2. Let ancestor be node.
528                                         ancestor_i = node_i
529                                         ancestor = node
530                                         # 3. Loop: If ancestor is the first node in the stack of
531                                         # open elements, jump to the step below labeled done.
532                                         loop
533                                                 if ancestor_i is open_els.length - 1
534                                                         break
535                                                 # 4. Let ancestor be the node before ancestor in the stack
536                                                 # of open elements.
537                                                 ancestor_i += 1
538                                                 ancestor = open_els[ancestor_i]
539                                                 # 5. If ancestor is a template node, jump to the step below
540                                                 # labeled done.
541                                                 if ancestor.name is 'template'
542                                                         break
543                                                 # 6. If ancestor is a table node, switch the insertion mode
544                                                 # to "in select in table" and abort these steps.
545                                                 if ancestor.name is 'table'
546                                                         insertion_mode = ins_mode_in_select_in_table
547                                                         return
548                                                 # 7. Jump back to the step labeled loop.
549                                 # 8. Done: Switch the insertion mode to "in select" and abort
550                                 # these steps.
551                                 insertion_mode = ins_mode_in_select
552                                 return
553                         # 5. If node is a td or th element and last is false, then switch
554                         # the insertion mode to "in cell" and abort these steps.
555                         if (node.name is 'td' or node.name is 'th') and last is false
556                                 insertion_mode = ins_mode_in_cell
557                                 return
558                         # 6. If node is a tr element, then switch the insertion mode to "in
559                         # row" and abort these steps.
560                         if node.name is 'tr'
561                                 insertion_mode = ins_mode_in_row
562                                 return
563                         # 7. If node is a tbody, thead, or tfoot element, then switch the
564                         # insertion mode to "in table body" and abort these steps.
565                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
566                                 insertion_mode = ins_mode_in_table_body
567                                 return
568                         # 8. If node is a caption element, then switch the insertion mode
569                         # to "in caption" and abort these steps.
570                         if node.name is 'caption'
571                                 insertion_mode = ins_mode_in_caption
572                                 return
573                         # 9. If node is a colgroup element, then switch the insertion mode
574                         # to "in column group" and abort these steps.
575                         if node.name is 'colgroup'
576                                 insertion_mode = ins_mode_in_column_group
577                                 return
578                         # 10. If node is a table element, then switch the insertion mode to
579                         # "in table" and abort these steps.
580                         if node.name is 'table'
581                                 insertion_mode = ins_mode_in_table
582                                 return
583                         # 11. If node is a template element, then switch the insertion mode
584                         # to the current template insertion mode and abort these steps.
585                         # fixfull (template insertion mode stack)
586
587                         # 12. If node is a head element and last is true, then switch the
588                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
589                         # these steps. (fragment case)
590                         if node.name is 'head' and last
591                                 insertion_mode = ins_mode_in_body
592                                 return
593                         # 13. If node is a head element and last is false, then switch the
594                         # insertion mode to "in head" and abort these steps.
595                         if node.name is 'head' and last is false
596                                 insertion_mode = ins_mode_in_head
597                                 return
598                         # 14. If node is a body element, then switch the insertion mode to
599                         # "in body" and abort these steps.
600                         if node.name is 'body'
601                                 insertion_mode = ins_mode_in_body
602                                 return
603                         # 15. If node is a frameset element, then switch the insertion mode
604                         # to "in frameset" and abort these steps. (fragment case)
605                         if node.name is 'frameset'
606                                 insertion_mode = ins_mode_in_frameset
607                                 return
608                         # 16. If node is an html element, run these substeps:
609                         if node.name is 'html'
610                                 # 1. If the head element pointer is null, switch the insertion
611                                 # mode to "before head" and abort these steps. (fragment case)
612                                 # fixfull (fragment case)
613
614                                 # 2. Otherwise, the head element pointer is not null, switch
615                                 # the insertion mode to "after head" and abort these steps.
616                                 insertion_mode = ins_mode_in_body # FIXME fixfull
617                                 return
618                         # 17. If last is true, then switch the insertion mode to "in body"
619                         # and abort these steps. (fragment case)
620                         if last
621                                 insertion_mode = ins_mode_in_body
622                                 return
623                         # 18. Let node now be the node before node in the stack of open
624                         # elements.
625                         node_i += 1
626                         node = open_els[node_i]
627                         # 19. Return to the step labeled loop.
628
629         # 8.2.3.2
630
631         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
632         adjusted_current_node = ->
633                 if open_els.length is 1 and flag_fragment_parsing
634                         return context_element
635                 return open_els[0]
636
637         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
638         # this implementation is structured (mostly) as described at the link above.
639         # capitalized comments are the "labels" described at the link above.
640         reconstruct_active_formatting_elements = ->
641                 return if afe.length is 0
642                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
643                         return
644                 # Rewind
645                 i = 0
646                 loop
647                         if i is afe.length - 1
648                                 break
649                         i += 1
650                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
651                                 i -= 1 # Advance
652                                 break
653                 # Create
654                 loop
655                         el = afe[i].shallow_clone()
656                         tree_insert_element el
657                         afe[i] = el
658                         break if i is 0
659                         i -= 1 # Advance
660
661         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
662         # adoption agency algorithm
663         # overview here:
664         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
665         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
666         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
667         adoption_agency = (subject) ->
                    # Runs the adoption agency algorithm for an end tag whose tag name is
                    # `subject` (it is compared against each element's .name).
                    #
                    # Works directly on the shared stacks open_els (stack of open elements)
                    # and afe (list of active formatting elements). In both, index 0 is the
                    # current / most recently added entry and higher indexes are "above".
                    # Nodes are re-parented by editing .parent and .children in place.
                    # The outer loop runs at most 8 times.
668                 debug_log "adoption_agency()"
669                 debug_log "tree: #{serialize_els doc.children, false, true}"
670                 debug_log "open_els: #{serialize_els open_els, true, true}"
671                 debug_log "afe: #{serialize_els afe, true, true}"
                    # Shortcut: the current node is named subject. Pop it, drop it from afe
                    # if it is listed there, and stop.
672                 if open_els[0].name is subject
673                         el = open_els[0]
674                         open_els.shift()
675                         # remove it from the list of active formatting elements (if found)
676                         for t, i in afe
677                                 if t is el
678                                         afe.splice i, 1
679                                         break
680                         debug_log "aaa: starting off with subject on top of stack, exiting"
681                         return
                    # outer loop counter: give up once 8 iterations have been started
682                 outer = 0
683                 loop
684                         if outer >= 8
685                                 return
686                         outer += 1
687                         # 5. Let formatting element be the last element in the list of
688                         # active formatting elements that: is between the end of the list
689                         # and the last scope marker in the list, if any, or the start of
690                         # the list otherwise, and  has the tag name subject.
                            #
                            # Scanning from index 0 visits the newest entries first, so the
                            # first match is the "last" element in spec terms. When the loop
                            # breaks on a match, fe_of_afe is left holding fe's index in afe.
691                         fe = null
692                         for t, fe_of_afe in afe
693                                 if t.type is TYPE_AFE_MARKER
694                                         break
695                                 if t.name is subject
696                                         fe = t
697                                         break
698                         # If there is no such element, then abort these steps and instead
699                         # act as described in the "any other end tag" entry above.
700                         if fe is null
701                                 debug_log "aaa: fe not found in afe"
702                                 in_body_any_other_end_tag subject
703                                 return
704                         # 6. If formatting element is not in the stack of open elements,
705                         # then this is a parse error; remove the element from the list, and
706                         # abort these steps.
                            #
                            # When found, fe_of_open_els is left holding fe's index in open_els.
707                         in_open_els = false
708                         for t, fe_of_open_els in open_els
709                                 if t is fe
710                                         in_open_els = true
711                                         break
712                         unless in_open_els
713                                 debug_log "aaa: fe not found in open_els"
714                                 parse_error()
715                                 # "remove it from the list" must mean afe, since it's not in open_els
716                                 afe.splice fe_of_afe, 1
717                                 return
718                         # 7. If formatting element is in the stack of open elements, but
719                         # the element is not in scope, then this is a parse error; abort
720                         # these steps.
721                         unless el_is_in_scope fe
722                                 debug_log "aaa: fe not in scope"
723                                 parse_error()
724                                 return
725                         # 8. If formatting element is not the current node, this is a parse
726                         # error. (But do not abort these steps.)
727                         unless open_els[0] is fe
728                                 parse_error()
729                                 # (not an abort: carry on with step 9)
730                         # 9. Let furthest block be the topmost node in the stack of open
731                         # elements that is lower in the stack than formatting element, and
732                         # is an element in the special category. There might not be one.
                            #
                            # The scan runs from the current node up to (not including) fe and
                            # keeps overwriting fb, so the special element nearest to fe wins.
733                         fb = null
734                         fb_of_open_els = null
735                         for t, i in open_els
736                                 if t is fe
737                                         break
738                                 if el_is_special t
739                                         fb = t
740                                         fb_of_open_els = i
741                                         # and continue, to see if there's one that's more "topmost"
742                         # 10. If there is no furthest block, then the UA must first pop all
743                         # the nodes from the bottom of the stack of open elements, from the
744                         # current node up to and including formatting element, then remove
745                         # formatting element from the list of active formatting elements,
746                         # and finally abort these steps.
747                         if fb is null
748                                 debug_log "aaa: no fb"
                                    # terminates because fe was confirmed to be in open_els (step 6)
749                                 loop
750                                         t = open_els.shift()
751                                         if t is fe
752                                                 afe.splice fe_of_afe, 1
753                                                 return
754                         # 11. Let common ancestor be the element immediately above
755                         # formatting element in the stack of open elements.
756                         ca = open_els[fe_of_open_els + 1] # common ancestor
757
758                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
759                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
                            #
                            # The bookmark is spliced in at fe's index, which shifts fe up by one.
760                         bookmark = new_aaa_bookmark()
761                         for t, i in afe
762                                 if t is fe
763                                         afe.splice i, 0, bookmark
764                                         break
                            # 13. Let node and last node be furthest block, then run the inner
                            # loop below (its numbered comments are the inner loop's substeps).
765                         node = last_node = fb
766                         inner = 0
767                         loop
768                                 inner += 1
769                                 # 3. Let node be the element immediately above node in the
770                                 # stack of open elements, or if node is no longer in the stack
771                                 # of open elements (e.g. because it got removed by this
772                                 # algorithm), the element that was immediately above node in
773                                 # the stack of open elements before node was removed.
774                                 node_next = null
775                                 for t, i in open_els
776                                         if t is node
777                                                 node_next = open_els[i + 1]
778                                                 break
                                    # node_above is the remembered "element above" used when node
                                    # could not be found in open_els (or had nothing above it)
779                                 node = node_next ? node_above
780                                 debug_log "inner loop #{inner}"
781                                 debug_log "tree: #{serialize_els doc.children, false, true}"
782                                 debug_log "open_els: #{serialize_els open_els, true, true}"
783                                 debug_log "afe: #{serialize_els afe, true, true}"
784                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
785                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
786                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
787                                 debug_log "node: #{node.serialize true, true}"
788                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
789
790                                 # 4. If node is formatting element, then go to the next step in
791                                 # the overall algorithm.
792                                 if node is fe
793                                         break
794                                 debug_log "the meat"
795                                 # 5. If inner loop counter is greater than three and node is in
796                                 # the list of active formatting elements, then remove node from
797                                 # the list of active formatting elements.
                                    #
                                    # node_in_afe ends up true only if node is in afe and was not
                                    # just removed by the counter rule.
798                                 node_in_afe = false
799                                 for t, i in afe
800                                         if t is node
801                                                 if inner > 3
802                                                         afe.splice i, 1
803                                                         debug_log "max out inner"
804                                                 else
805                                                         node_in_afe = true
806                                                         debug_log "in afe"
807                                                 break
808                                 # 6. If node is not in the list of active formatting elements,
809                                 # then remove node from the stack of open elements and then go
810                                 # back to the step labeled inner loop.
811                                 unless node_in_afe
812                                         debug_log "not in afe"
813                                         for t, i in open_els
814                                                 if t is node
                                                            # remember what was above node before removing it
815                                                         node_above = open_els[i + 1]
816                                                         open_els.splice i, 1
817                                                         break
818                                         continue
819                                 debug_log "the bones"
820                                 # 7. create an element for the token for which the element node
821                                 # was created, in the HTML namespace, with common ancestor as
822                                 # the intended parent; replace the entry for node in the list
823                                 # of active formatting elements with an entry for the new
824                                 # element, replace the entry for node in the stack of open
825                                 # elements with an entry for the new element, and let node be
826                                 # the new element.
827                                 new_node = node.shallow_clone()
828                                 for t, i in afe
829                                         if t is node
830                                                 afe[i] = new_node
831                                                 debug_log "replaced in afe"
832                                                 break
833                                 for t, i in open_els
834                                         if t is node
835                                                 node_above = open_els[i + 1]
836                                                 open_els[i] = new_node
837                                                 debug_log "replaced in open_els"
838                                                 break
839                                 node = new_node
840                                 # 8. If last node is furthest block, then move the
841                                 # aforementioned bookmark to be immediately after the new node
842                                 # in the list of active formatting elements.
843                                 if last_node is fb
844                                         for t, i in afe
845                                                 if t is bookmark
846                                                         afe.splice i, 1
847                                                         debug_log "removed bookmark"
848                                                         break
849                                         for t, i in afe
850                                                 if t is node
851                                                         # "after" means lower: inserting at i pushes node up to i + 1
852                                                         afe.splice i, 0, bookmark # bookmark now sits just below node
853                                                         debug_log "placed bookmark after node"
854                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
855                                                         break
856                                 # 9. Insert last node into node, first removing it from its
857                                 # previous parent node if any.
858                                 if last_node.parent?
859                                         debug_log "last_node has parent"
860                                         for c, i in last_node.parent.children
861                                                 if c is last_node
862                                                         debug_log "removing last_node from parent"
863                                                         last_node.parent.children.splice i, 1
864                                                         break
865                                 node.children.push last_node
866                                 last_node.parent = node
867                                 # 10. Let last node be node.
868                                 last_node = node
869                                 debug_log "at last"
870                                 # 11. Return to the step labeled inner loop.
871                         # 14. Insert whatever last node ended up being in the previous step
872                         # at the appropriate place for inserting a node, but using common
873                         # ancestor as the override target.
874
875                         # In the case where fe is immediately followed by fb:
876                         #   * inner loop exits out early (node==fe)
877                         #   * last_node is fb
878                         #   * last_node is still in the tree (not a duplicate)
                            #
                            # Detach last_node before the insertion location is computed below.
879                         if last_node.parent?
880                                 debug_log "FEFIRST? last_node has parent"
881                                 for c, i in last_node.parent.children
882                                         if c is last_node
883                                                 debug_log "removing last_node from parent"
884                                                 last_node.parent.children.splice i, 1
885                                                 break
886
887                         debug_log "after aaa inner loop"
888                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
889                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
890                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
891                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
892                         debug_log "tree: #{serialize_els doc.children, false, true}"
893
894                         debug_log "insert"
895
896
897                         # can't use standard insert token thing, because it's already in
898                         # open_els and must stay at its current position in open_els
                            # (dest is a [parent_node, index_within_children] pair)
899                         dest = adjusted_insertion_location ca
900                         dest[0].children.splice dest[1], 0, last_node
901                         last_node.parent = dest[0]
902
903
904                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
905                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
906                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
907                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
908                         debug_log "tree: #{serialize_els doc.children, false, true}"
909
910                         # 15. Create an element for the token for which formatting element
911                         # was created, in the HTML namespace, with furthest block as the
912                         # intended parent.
913                         new_element = fe.shallow_clone() # FIXME intended parent thing
914                         # 16. Take all of the child nodes of furthest block and append them
915                         # to the element created in the last step.
916                         while fb.children.length
917                                 t = fb.children.shift()
918                                 t.parent = new_element
919                                 new_element.children.push t
920                         # 17. Append that new element to furthest block.
921                         new_element.parent = fb
922                         fb.children.push new_element
923                         # 18. Remove formatting element from the list of active formatting
924                         # elements, and insert the new element into the list of active
925                         # formatting elements at the position of the aforementioned
926                         # bookmark.
                            #
                            # The bookmark entry itself is overwritten by new_element, so no
                            # bookmark is left behind in afe.
927                         for t, i in afe
928                                 if t is fe
929                                         afe.splice i, 1
930                                         break
931                         for t, i in afe
932                                 if t is bookmark
933                                         afe[i] = new_element
934                                         break
935                         # 19. Remove formatting element from the stack of open elements,
936                         # and insert the new element into the stack of open elements
937                         # immediately below the position of furthest block in that stack.
                            #
                            # Splicing in at fb's index pushes fb up by one, which leaves
                            # new_element immediately below it.
938                         for t, i in open_els
939                                 if t is fe
940                                         open_els.splice i, 1
941                                         break
942                         for t, i in open_els
943                                 if t is fb
944                                         open_els.splice i, 0, new_element
945                                         break
946                         # 20. Jump back to the step labeled outer loop.
947                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
948                         debug_log "tree: #{serialize_els doc.children, false, true}"
949                         debug_log "open_els: #{serialize_els open_els, true, true}"
950                         debug_log "afe: #{serialize_els afe, true, true}"
951                 debug_log "AAA DONE"
952
953         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
954         close_p_element = ->
955                 generate_implied_end_tags 'p' # arg is exception
956                 if open_els[0].name isnt 'p'
957                         parse_error()
958                 while open_els.length > 1 # just in case
959                         el = open_els.shift()
960                         if el.name is 'p'
961                                 return
962         close_p_if_in_button_scope = ->
963                 if is_in_button_scope 'p'
964                         close_p_element()
965
966         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
967         # aka insert_a_character = (t) ->
968         insert_character = (t) ->
969                 dest = adjusted_insertion_location()
970                 # fixfull check for Document node
971                 if dest[1] > 0
972                         prev = dest[0].children[dest[1] - 1]
973                         if prev.type is TYPE_TEXT
974                                 prev.text += t.text
975                                 return
976                 dest[0].children.splice dest[1], 0, t
977
978         # 8.2.5.1
979         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
980         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
981         adjusted_insertion_location = (override_target = null) ->
982                 # 1. If there was an override target specified, then let target be the
983                 # override target.
984                 if override_target?
985                         target = override_target
986                 else # Otherwise, let target be the current node.
987                         target = open_els[0]
988                 # 2. Determine the adjusted insertion location using the first matching
989                 # steps from the following list:
990                 #
991                 # If foster parenting is enabled and target is a table, tbody, tfoot,
992                 # thead, or tr element Foster parenting happens when content is
993                 # misnested in tables.
994                 if flag_foster_parenting and foster_parenting_targets[target.name]
995                         loop # once. this is here so we can ``break`` to "abort these substeps"
996                                 # 1. Let last template be the last template element in the
997                                 # stack of open elements, if any.
998                                 last_template = null
999                                 last_template_i = null
1000                                 for el, i in open_els
1001                                         if el.name is 'template'
1002                                                 last_template = el
1003                                                 last_template_i = i
1004                                                 break
1005                                 # 2. Let last table be the last table element in the stack of
1006                                 # open elements, if any.
1007                                 last_table = null
1008                                 last_table_i
1009                                 for el, i in open_els
1010                                         if el.name is 'table'
1011                                                 last_table = el
1012                                                 last_table_i = i
1013                                                 break
1014                                 # 3. If there is a last template and either there is no last
1015                                 # table, or there is one, but last template is lower (more
1016                                 # recently added) than last table in the stack of open
1017                                 # elements, then: let adjusted insertion location be inside
1018                                 # last template's template contents, after its last child (if
1019                                 # any), and abort these substeps.
1020                                 if last_template and (last_table is null or last_template_i < last_table_i)
1021                                         target = template # fixfull should be it's contents
1022                                         target_i = target.children.length
1023                                         break
1024                                 # 4. If there is no last table, then let adjusted insertion
1025                                 # location be inside the first element in the stack of open
1026                                 # elements (the html element), after its last child (if any),
1027                                 # and abort these substeps. (fragment case)
1028                                 if last_table is null
1029                                         # this is odd
1030                                         target = open_els[open_els.length - 1]
1031                                         target_i = target.children.length
1032                                 # 5. If last table has a parent element, then let adjusted
1033                                 # insertion location be inside last table's parent element,
1034                                 # immediately before last table, and abort these substeps.
1035                                 if last_table.parent?
1036                                         for c, i in last_table.parent.children
1037                                                 if c is last_table
1038                                                         target = last_table.parent
1039                                                         target_i = i
1040                                                         break
1041                                         break
1042                                 # 6. Let previous element be the element immediately above last
1043                                 # table in the stack of open elements.
1044                                 #
1045                                 # huh? how could it not have a parent?
1046                                 previous_element = open_els[last_table_i + 1]
1047                                 # 7. Let adjusted insertion location be inside previous
1048                                 # element, after its last child (if any).
1049                                 target = previous_element
1050                                 target_i = target.children.length
1051                                 # Note: These steps are involved in part because it's possible
1052                                 # for elements, the table element in this case in particular,
1053                                 # to have been moved by a script around in the DOM, or indeed
1054                                 # removed from the DOM entirely, after the element was inserted
1055                                 # by the parser.
1056                                 break # don't really loop
1057                 else
1058                         # Otherwise Let adjusted insertion location be inside target, after
1059                         # its last child (if any).
1060                         target_i = target.children.length
1061
1062                 # 3. If the adjusted insertion location is inside a template element,
1063                 # let it instead be inside the template element's template contents,
1064                 # after its last child (if any).
1065                 # fixfull (template)
1066
1067                 # 4. Return the adjusted insertion location.
1068                 return [target, target_i]
1069
1070         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1071         # aka create_an_element_for_token
1072         token_to_element = (t, namespace, intended_parent) ->
1073                 t.type = TYPE_TAG # not TYPE_START_TAG
1074                 # convert attributes into a hash
1075                 attrs = {}
1076                 while t.attrs_a.length
1077                         a = t.attrs_a.pop()
1078                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1079                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1080
1081                 # TODO 2. If the newly created element has an xmlns attribute in the
1082                 # XMLNS namespace whose value is not exactly the same as the element's
1083                 # namespace, that is a parse error. Similarly, if the newly created
1084                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1085                 # value is not the XLink Namespace, that is a parse error.
1086
1087                 # fixfull: the spec says stuff about form pointers and ownerDocument
1088
1089                 return el
1090
1091         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1092         insert_foreign_element = (token, namespace) ->
1093                 ail = adjusted_insertion_location()
1094                 ail_el = ail[0]
1095                 ail_i = ail[1]
1096                 el = token_to_element token, namespace, ail_el
1097                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1098                 el.parent = ail_el
1099                 ail_el.children.splice ail_i, 0, el
1100                 open_els.unshift el
1101                 return el
1102         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1103         insert_html_element = insert_foreign_element # (token, namespace) ->
1104
        # FIXME read and implement the "foster parenting" part
1106         # FIXME read spec, do this right
1107         # FIXME implement the override target thing
1108         # note: this assumes it's an open tag
1109         # FIXME what part of the spec is this?
1110         # TODO look through all callers of this, and see what they should really be doing.
1111         #   eg probably insert_html_element for tokens
1112         tree_insert_element = (el, override_target = null, namespace = null) ->
1113                 if namespace?
1114                         el.namespace = namespace
1115                 dest = adjusted_insertion_location override_target
1116                 if el.type is TYPE_START_TAG # means it's a "token"
1117                         el = token_to_element el, namespace, dest[0]
1118                 unless el.namespace?
1119                         namespace = dest.namespace
1120                 # fixfull: Document nodes sometimes can't accept more chidren
1121                 dest[0].children.splice dest[1], 0, el
1122                 el.parent = dest[0]
1123                 open_els.unshift el
1124                 return el
1125
1126         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1127         # position should be [node, index_within_children]
1128         insert_comment = (t, position = null) ->
1129                 position ?= adjusted_insertion_location()
1130                 position[0].children.splice position[1], 0, t
1131
1132         # 8.2.5.2
1133         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1134         parse_generic_raw_text = (t) ->
1135                 insert_html_element t
1136                 tok_state = tok_state_rawtext
1137                 original_insertion_mode = insertion_mode
1138                 insertion_mode = ins_mode_text
1139         parse_generic_rcdata_text = (t) ->
1140                 insert_html_element t
1141                 tok_state = tok_state_rcdata
1142                 original_insertion_mode = insertion_mode
1143                 insertion_mode = ins_mode_text
1144
1145         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1146         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1147         generate_implied_end_tags = (except = null) ->
1148                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1149                         open_els.shift()
1150
1151         # 8.2.5.4 The rules for parsing tokens in HTML content
1152         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1153
1154         # 8.2.5.4.1 The "initial" insertion mode
1155         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1156         ins_mode_initial = (t) ->
1157                 if is_space_tok t
1158                         return
1159                 if t.type is TYPE_COMMENT
1160                         # ?fixfull
1161                         doc.children.push t
1162                         return
1163                 if t.type is TYPE_DOCTYPE
1164                         # FIXME check identifiers, set quirks, etc
1165                         # fixfull
1166                         doc.children.push t
1167                         insertion_mode = ins_mode_before_html
1168                         return
1169                 # Anything else
1170                 #fixfull (iframe, quirks)
1171                 insertion_mode = ins_mode_before_html
1172                 insertion_mode t # reprocess the token
1173                 return
1174
1175         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1176         ins_mode_before_html = (t) ->
1177                 if t.type is TYPE_DOCTYPE
1178                         parse_error()
1179                         return
1180                 if t.type is TYPE_COMMENT
1181                         doc.children.push t
1182                         return
1183                 if is_space_tok t
1184                         return
1185                 if t.type is TYPE_START_TAG and t.name is 'html'
1186                         el = token_to_element t, NS_HTML, doc
1187                         doc.children.push el
1188                         open_els.unshift(el)
1189                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1190                         insertion_mode = ins_mode_before_head
1191                         return
1192                 if t.type is TYPE_END_TAG
1193                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1194                                 # fall through to "anything else"
1195                         else
1196                                 parse_error()
1197                                 return
1198                 # Anything else
1199                 html_tok = new_open_tag 'html'
1200                 el = token_to_element html_tok, NS_HTML, doc
1201                 doc.children.push el
1202                 open_els.unshift el
1203                 # ?fixfull browsing context
1204                 insertion_mode = ins_mode_before_head
1205                 insertion_mode t
1206                 return
1207
1208         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1209         ins_mode_before_head = (t) ->
1210                 if is_space_tok t
1211                         return
1212                 if t.type is TYPE_COMMENT
1213                         insert_comment t
1214                         return
1215                 if t.type is TYPE_DOCTYPE
1216                         parse_error()
1217                         return
1218                 if t.type is TYPE_START_TAG and t.name is 'html'
1219                         ins_mode_in_body t
1220                         return
1221                 if t.type is TYPE_START_TAG and t.name is 'head'
1222                         el = insert_html_element t
1223                         head_element_pointer = el
1224                         insertion_mode = ins_mode_in_head
1225                 if t.type is TYPE_END_TAG
1226                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1227                                 # fall through to Anything else below
1228                         else
1229                                 parse_error()
1230                                 return
1231                 # Anything else
1232                 head_tok = new_open_tag 'head'
1233                 el = insert_html_element head_tok
1234                 head_element_pointer = el
1235                 insertion_mode = ins_mode_in_head
1236                 insertion_mode t # reprocess current token
1237
1238         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1239         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1240                 open_els.shift() # spec says this will be a 'head' node
1241                 insertion_mode = ins_mode_after_head
1242                 insertion_mode t
1243         ins_mode_in_head = (t) ->
1244                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1245                         insert_character t
1246                         return
1247                 if t.type is TYPE_COMMENT
1248                         insert_comment t
1249                         return
1250                 if t.type is TYPE_DOCTYPE
1251                         parse_error()
1252                         return
1253                 if t.type is TYPE_START_TAG and t.name is 'html'
1254                         ins_mode_in_body t
1255                         return
1256                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1257                         el = insert_html_element t
1258                         open_els.shift()
1259                         t.acknowledge_self_closing()
1260                         return
1261                 if t.type is TYPE_START_TAG and t.name is 'meta'
1262                         el = insert_html_element t
1263                         open_els.shift()
1264                         t.acknowledge_self_closing()
1265                         # fixfull encoding stuff
1266                         return
1267                 if t.type is TYPE_START_TAG and t.name is 'title'
1268                         parse_generic_rcdata_text t
1269                         return
1270                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1271                         parse_generic_raw_text t
1272                         return
1273                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1274                         insert_html_element t
1275                         insertion_mode = in_head_noscript # FIXME implement
1276                         return
1277                 if t.type is TYPE_START_TAG and t.name is 'script'
1278                         ail = adjusted_insertion_location()
1279                         el = token_to_element t, NS_HTML, ail
1280                         el.flag 'parser-inserted', true # FIXME implement
1281                         # fixfull frament case
1282                         ail[0].children.splice ail[1], 0, el
1283                         open_els.unshift el
1284                         tok_state = tok_state_script_data
1285                         original_insertion_mode = insertion_mode # make sure orig... is defined
1286                         insertion_mode = ins_mode_text # FIXME implement
1287                         return
1288                 if t.type is TYPE_END_TAG and t.name is 'head'
1289                         open_els.shift() # will be a head element... spec says so
1290                         insertion_mode = ins_mode_after_head
1291                         return
1292                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1293                         ins_mode_in_head_else t
1294                         return
1295                 if t.type is TYPE_START_TAG and t.name is 'template'
1296                         insert_html_element t
1297                         afe_push_marker()
1298                         flag_frameset_ok = false
1299                         insertion_mode = ins_mode_in_template
1300                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1301                         return
1302                 if t.type is TYPE_END_TAG and t.name is 'template'
1303                         if template_tag_is_open()
1304                                 generate_implied_end_tags
1305                                 if open_els[0].name isnt 'template'
1306                                         parse_error()
1307                                 loop
1308                                         el = open_els.shift()
1309                                         if el.name is 'template'
1310                                                 break
1311                                 clear_afe_to_marker()
1312                                 template_insertion_modes.shift()
1313                                 reset_insertion_mode()
1314                         else
1315                                 parse_error()
1316                         return
1317                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1318                         parse_error()
1319                         return
1320                 ins_mode_in_head_else t
1321
1322         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1323         ins_mode_in_head_noscript = (t) ->
1324                 # FIXME ?fixfull
1325                 console.log "ins_mode_in_head_noscript unimplemented"
1326
1327         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1328         ins_mode_after_head_else = (t) ->
1329                 body_tok = new_open_tag 'body'
1330                 insert_html_element body_tok
1331                 insertion_mode = ins_mode_in_body
1332                 insertion_mode t # reprocess token
1333                 return
1334         ins_mode_after_head = (t) ->
1335                 if is_space_tok t
1336                         insert_character t
1337                         return
1338                 if t.type is TYPE_COMMENT
1339                         insert_comment t
1340                         return
1341                 if t.type is TYPE_DOCTYPE
1342                         parse_error()
1343                         return
1344                 if t.type is TYPE_START_TAG and t.name is 'html'
1345                         ins_mode_in_body t
1346                         return
1347                 if t.type is TYPE_START_TAG and t.name is 'body'
1348                         insert_html_element t
1349                         flag_frameset_ok = false
1350                         insertion_mode = ins_mode_in_body
1351                         return
1352                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1353                         insert_html_element t
1354                         insertion_mode = ins_mode_in_frameset
1355                         return
1356                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1357                         parse_error()
1358                         open_els.unshift head_element_pointer
1359                         ins_mode_in_head t
1360                         for el, i of open_els
1361                                 if el is head_element_pointer
1362                                         open_els.splice i, 1
1363                                         return
1364                         console.log "warning: 23904 couldn't find head element in open_els"
1365                         return
1366                 if t.type is TYPE_END_TAG and t.name is 'template'
1367                         ins_mode_in_head t
1368                         return
1369                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1370                         ins_mode_after_head_else t
1371                         return
1372                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1373                         parse_error()
1374                         return
1375                 # Anything else
1376                 ins_mode_after_head_else t
1377
1378         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1379         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1380                 for node, i in open_els
1381                         if node.name is name # FIXME check namespace too
1382                                 generate_implied_end_tags name # arg is exception
1383                                 parse_error() unless i is 0
1384                                 while i >= 0
1385                                         open_els.shift()
1386                                         i -= 1
1387                                 return
1388                         if special_elements[node.name]? # FIXME check namespac too
1389                                 parse_error()
1390                                 return
1391         ins_mode_in_body = (t) ->
1392                 switch t.type
1393                         when TYPE_TEXT
1394                                 switch t.text
1395                                         when "\u0000"
1396                                                 parse_error()
1397                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1398                                                 reconstruct_active_formatting_elements()
1399                                                 insert_character t
1400                                         else
1401                                                 reconstruct_active_formatting_elements()
1402                                                 insert_character t
1403                                                 flag_frameset_ok = false
1404                         when TYPE_COMMENT
1405                                 insert_comment t
1406                         when TYPE_DOCTYPE
1407                                 parse_error()
1408                         when TYPE_START_TAG
1409                                 switch t.name
1410                                         when 'html'
1411                                                 parse_error()
1412                                                 return if template_tag_is_open()
1413                                                 root_attrs = open_els[open_els.length - 1].attrs
1414                                                 for k, v of t.attrs
1415                                                         root_attrs[k] = v unless root_attrs[k]?
1416                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1417                                                 # FIXME also do this for </template> (end tag)
1418                                                 return ins_mode_in_head t
1419                                         when 'body'
1420                                                 parse_error()
1421                                                 # TODO
1422                                         when 'frameset'
1423                                                 parse_error()
1424                                                 # TODO
1425                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1426                                                 close_p_if_in_button_scope()
1427                                                 insert_html_element t
1428                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1429                                                 close_p_if_in_button_scope()
1430                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1431                                                         parse_error()
1432                                                         open_els.shift()
1433                                                 insert_html_element t
1434                                         # TODO lots more to implement here
1435                                         when 'a'
1436                                                 # If the list of active formatting elements
1437                                                 # contains an a element between the end of the list and
1438                                                 # the last marker on the list (or the start of the list
1439                                                 # if there is no marker on the list), then this is a
1440                                                 # parse error; run the adoption agency algorithm for
1441                                                 # the tag name "a", then remove that element from the
1442                                                 # list of active formatting elements and the stack of
1443                                                 # open elements if the adoption agency algorithm didn't
1444                                                 # already remove it (it might not have if the element
1445                                                 # is not in table scope).
1446                                                 found = false
1447                                                 for el in afe
1448                                                         if el.type is TYPE_AFE_MARKER
1449                                                                 break
1450                                                         if el.name is 'a'
1451                                                                 found = el
1452                                                 if found?
1453                                                         parse_error()
1454                                                         adoption_agency 'a'
1455                                                         for el, i in afe
1456                                                                 if el is found
1457                                                                         afe.splice i, 1
1458                                                         for el, i in open_els
1459                                                                 if el is found
1460                                                                         open_els.splice i, 1
1461                                                 reconstruct_active_formatting_elements()
1462                                                 el = insert_html_element t
1463                                                 afe_push el
1464                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1465                                                 reconstruct_active_formatting_elements()
1466                                                 el = insert_html_element t
1467                                                 afe_push el
1468                                         when 'table'
1469                                                 # fixfull quirksmode thing
1470                                                 close_p_if_in_button_scope()
1471                                                 insert_html_element t
1472                                                 insertion_mode = ins_mode_in_table
1473                                         # TODO lots more to implement here
1474                                         else # any other start tag
1475                                                 reconstruct_active_formatting_elements()
1476                                                 insert_html_element t
1477                         when TYPE_EOF
1478                                 ok_tags = {
1479                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1480                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1481                                 }
1482                                 for t in open_els
1483                                         unless ok_tags[t.name]?
1484                                                 parse_error()
1485                                                 break
1486                                 # TODO stack of template insertion modes thing
1487                                 stop_parsing()
1488                         when TYPE_END_TAG
1489                                 switch t.name
1490                                         when 'body'
1491                                                 unless is_in_scope 'body'
1492                                                         parse_error()
1493                                                         return
1494                                                 # TODO implement parse error and move to tree_after_body
1495                                         when 'html'
1496                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1497                                                         parse_error()
1498                                                         return
1499                                                 # TODO implement parse error and move to tree_after_body, reprocess
1500                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1501                                                 unless is_in_scope t.name, NS_HTML
1502                                                         parse_error()
1503                                                         return
1504                                                 generate_implied_end_tags()
1505                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1506                                                         parse_error()
1507                                                 loop
1508                                                         el = open_els.shift()
1509                                                         if el.name is t.name and el.namespace is NS_HTML
1510                                                                 return
1511                                         # TODO lots more close tags to implement here
1512                                         when 'p'
1513                                                 unless is_in_button_scope 'p'
1514                                                         parse_error()
1515                                                         insert_html_element new_open_tag 'p'
1516                                                 close_p_element()
1517                                         # TODO lots more close tags to implement here
1518                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1519                                                 adoption_agency t.name
1520                                         # TODO lots more close tags to implement here
1521                                         else
1522                                                 in_body_any_other_end_tag t.name
1523                 return
1524
1525         ins_mode_in_table_else = (t) ->
1526                 parse_error()
1527                 flag_foster_parenting = true # FIXME
1528                 ins_mode_in_body t
1529                 flag_foster_parenting = false
1530         can_in_table = { # FIXME do this inline like everywhere else
1531                 'table': true
1532                 'tbody': true
1533                 'tfoot': true
1534                 'thead': true
1535                 'tr': true
1536         }
1537
1538         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1539         ins_mode_text = (t) ->
1540                 if t.type is TYPE_TEXT
1541                         insert_character t
1542                         return
1543                 if t.type is TYPE_EOF
1544                         parse_error()
1545                         if open_els[0].name is 'script'
1546                                 open_els[0].flag 'already started', true
1547                         open_els.shift()
1548                         insertion_mode = original_insertion_mode
1549                         insertion_mode t
1550                         return
1551                 if t.type is TYPE_END_TAG and t.name is 'script'
1552                         open_els.shift()
1553                         insertion_mode = original_insertion_mode
1554                         # fixfull the spec seems to assume that I'm going to run the script
1555                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1556                         return
1557                 if t.type is TYPE_END_TAG
1558                         open_els.shift()
1559                         insertion_mode = original_insertion_mode
1560                         return
1561                 console.log 'warning: end of ins_mode_text reached'
1562
        # the functions below implement the tokenizer states described here:
1564         # http://www.w3.org/TR/html5/syntax.html#tokenization
1565
1566         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1567         ins_mode_in_table = (t) ->
1568                 switch t.type
1569                         when TYPE_TEXT
1570                                 if can_in_table[t.name]
1571                                         original_insertion_mode = insertion_mode
1572                                         insertion_mode = ins_mode_in_table_text
1573                                         insertion_mode t
1574                                 else
1575                                         ins_mode_in_table_else t
1576                         when TYPE_COMMENT
1577                                 insert_comment t
1578                         when TYPE_DOCTYPE
1579                                 parse_error()
1580                         when TYPE_START_TAG
1581                                 switch t.name
1582                                         when 'caption'
1583                                                 clear_stack_to_table_context()
1584                                                 afe_push_marker()
1585                                                 insert_html_element t
1586                                                 insertion_mode = ins_mode_in_caption
1587                                         when 'colgroup'
1588                                                 clear_stack_to_table_context()
1589                                                 insert_html_element t
1590                                                 insertion_mode = ins_mode_in_column_group
1591                                         when 'col'
1592                                                 clear_stack_to_table_context()
1593                                                 insert_html_element new_open_tag 'colgroup'
1594                                                 insertion_mode = ins_mode_in_column_group
1595                                                 insertion_mode t
1596                                         when 'tbody', 'tfoot', 'thead'
1597                                                 clear_stack_to_table_context()
1598                                                 insert_html_element t
1599                                                 insertion_mode = ins_mode_in_table_body
1600                                         when 'td', 'th', 'tr'
1601                                                 clear_stack_to_table_context()
1602                                                 insert_html_element new_open_tag 'tbody'
1603                                                 insertion_mode = ins_mode_in_table_body
1604                                                 insertion_mode t
1605                                         when 'table'
1606                                                 parse_error()
1607                                                 if is_in_table_scope 'table'
1608                                                         loop
1609                                                                 el = open_els.shift()
1610                                                                 if el.name is 'table'
1611                                                                         break
1612                                                         reset_insertion_mode()
1613                                                         insertion_mode t
1614                                         when 'style', 'script', 'template'
1615                                                 ins_mode_in_head t
1616                                         when 'input'
1617                                                 if token_is_input_hidden t
1618                                                         ins_mode_in_table_else t
1619                                                 else
1620                                                         parse_error()
1621                                                         el = insert_html_element t
1622                                                         open_els.shift()
1623                                                         t.acknowledge_self_closing()
1624                                         when 'form'
1625                                                 parse_error()
1626                                                 if form_element_pointer?
1627                                                         return
1628                                                 if template_tag_is_open()
1629                                                         return
1630                                                 form_element_pointer = insert_html_element t
1631                                                 open_els.shift()
1632                                         else
1633                                                 ins_mode_in_table_else t
1634                         when TYPE_END_TAG
1635                                 switch t.name
1636                                         when 'table'
1637                                                 if is_in_table_scope 'table'
1638                                                         loop
1639                                                                 el = open_els.shift()
1640                                                                 if el.name is 'table'
1641                                                                         break
1642                                                         reset_insertion_mode()
1643                                                 else
1644                                                         parse_error
1645                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1646                                                 parse_error()
1647                                         when 'template'
1648                                                 ins_mode_in_head t
1649                                         else
1650                                                 ins_mode_in_table_else t
1651                         when TYPE_EOF
1652                                 ins_mode_in_body t
1653                         else
1654                                 ins_mode_in_table_else t
1655
1656
1657         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1658         ins_mode_in_table_text = (t) ->
1659                 if t.type is TYPE_TEXT and t.text is "\u0000"
1660                         # huh? I thought the tokenizer didn't emit these
1661                         parse_error()
1662                         return
1663                 if t.type is TYPE_TEXT
1664                         pending_table_character_tokens.push t
1665                         return
1666                 # Anything else
1667                 all_space = true
1668                 for old in pending_table_character_tokens
1669                         unless is_space_tok old
1670                                 all_space = false
1671                                 break
1672                 if all_space
1673                         for old in pending_table_character_tokens
1674                                 insert_character old
1675                 else
1676                         for old in pending_table_character_tokens
1677                                 ins_mode_table_else old
1678                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1679                 insertion_mode = original_insertion_mode
1680                 insertion_mode t
1681
1682         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1683         ins_mode_in_caption = (t) ->
1684                 if t.type is TYPE_END_TAG and t.name is 'caption'
1685                         if is_in_table_scope 'caption'
1686                                 generate_implied_end_tags()
1687                                 if open_els[0].name isnt 'caption'
1688                                         parse_error()
1689                                 loop
1690                                         el = open_els.shift()
1691                                         if el.name is 'caption'
1692                                                 break
1693                                 clear_afe_to_marker()
1694                                 insertion_mode = in_table
1695                         else
1696                                 parse_error()
1697                                 # fragment case
1698                         return
1699                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1700                         parse_error()
1701                         if is_in_table_scope 'caption'
1702                                 loop
1703                                         el = open_els.shift()
1704                                         if el.name is 'caption'
1705                                                 break
1706                                 clear_afe_to_marker()
1707                                 insertion_mode = in_table
1708                                 insertion_mode t
1709                         # else fragment case
1710                         return
1711                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1712                         parse_error()
1713                         return
1714                 # Anything else
1715                 ins_mode_in_body t
1716
1717         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1718         ins_mode_in_column_group = (t) ->
1719                 if is_space_tok t
1720                         insert_character t
1721                         return
1722                 if t.type is TYPE_COMMENT
1723                         insert_comment t
1724                         return
1725                 if t.type is TYPE_DOCTYPE
1726                         parse_error()
1727                         return
1728                 if t.type is TYPE_START_TAG and t.name is 'html'
1729                         ins_mode_in_body t
1730                         return
1731                 if t.type is TYPE_START_TAG and t.name is 'col'
1732                         el = insert_html_element t
1733                         open_els.shift()
1734                         t.acknowledge_self_closing()
1735                         return
1736                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1737                         if open_els[0].name is 'colgroup'
1738                                 open_els[0].shift()
1739                                 insertion_mode = ins_mode_in_table
1740                         else
1741                                 parse_error()
1742                         return
1743                 if t.type is TYPE_END_TAG and t.name is 'col'
1744                         parse_error()
1745                         return
1746                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1747                         ins_mode_in_head t
1748                         return
1749                 if t.type is TYPE_EOF
1750                         ins_mode_in_body t
1751                         return
1752                 # Anything else
1753                 if open_els[0].name isnt 'colgroup'
1754                         parse_error()
1755                         return
1756                 open_els.shift()
1757                 insertion_mode = ins_mode_in_table
1758                 insertion_mode t
1759                 return
1760
1761         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1762         ins_mode_in_table_body = (t) ->
1763                 if t.type is TYPE_START_TAG and t.name is 'tr'
1764                         clear_stack_to_table_body_context()
1765                         insert_html_element t
1766                         insertion_mode = ins_mode_in_row
1767                         return
1768                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1769                         parse_error()
1770                         clear_stack_to_table_body_context()
1771                         insert_html_element new_open_tag 'tr'
1772                         insertion_mode = ins_mode_in_row
1773                         insertion_mode t
1774                         return
1775                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1776                         unless is_in_table_scope t.name # fixfull check namespace
1777                                 parse_error()
1778                                 return
1779                         clear_stack_to_table_body_context()
1780                         open_els.shift()
1781                         insertion_mode = ins_mode_in_table
1782                         return
1783                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1784                         has = false
1785                         for el in open_els
1786                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1787                                         has = true
1788                                         break
1789                                 if table_scopers[el.name]
1790                                         break
1791                         if !has
1792                                 parse_error()
1793                                 return
1794                         clear_stack_to_table_body_context()
1795                         open_els.shift()
1796                         insertion_mode = ins_mode_in_table
1797                         insertion_mode t
1798                         return
1799                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1800                         parse_error()
1801                         return
1802                 # Anything else
1803                 ins_mode_in_table t
1804
1805         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1806         ins_mode_in_row = (t) ->
1807                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1808                         clear_stack_to_table_row_context()
1809                         insert_html_element t
1810                         insertion_mode = ins_mode_in_cell
1811                         afe_push_marker()
1812                         return
1813                 if t.type is TYPE_END_TAG and t.name is 'tr'
1814                         if is_in_table_scope 'tr'
1815                                 clear_stack_to_table_row_context()
1816                                 open_els.shift()
1817                                 insertion_mode = ins_mode_in_table_body
1818                         else
1819                                 parse_error()
1820                         return
1821                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1822                         if is_in_table_scope 'tr'
1823                                 clear_stack_to_table_row_context()
1824                                 open_els.shift()
1825                                 insertion_mode = ins_mode_in_table_body
1826                                 insertion_mode t
1827                         else
1828                                 parse_error()
1829                         return
1830                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1831                         if is_in_table_scope t.name # fixfull namespace
1832                                 if is_in_table_scope 'tr'
1833                                         clear_stack_to_table_row_context()
1834                                         open_els.shift()
1835                                         insertion_mode = ins_mode_in_table_body
1836                                         insertion_mode t
1837                         else
1838                                 parse_error()
1839                         return
1840                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1841                         parse_error()
1842                         return
1843                 # Anything else
1844                 ins_mode_in_table t
1845
1846         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1847         close_the_cell = ->
1848                 generate_implied_end_tags()
1849                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1850                         parse_error()
1851                 loop
1852                         el = open_els.shift()
1853                         if el.name is 'td' or el.name is 'th'
1854                                 break
1855                 clear_afe_to_marker()
1856                 insertion_mode = ins_mode_in_row
1857
1858         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1859         ins_mode_in_cell = (t) ->
1860                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1861                         if is_in_table_scope t.name
1862                                 generate_implied_end_tags()
1863                                 if open_els[0].name isnt t.name
1864                                         parse_error
1865                                 loop
1866                                         el = open_els.shift()
1867                                         if el.name is t.name
1868                                                 break
1869                                 clear_afe_to_marker()
1870                                 insertion_mode = ins_mode_in_row
1871                         else
1872                                 parse_error()
1873                         return
1874                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1875                         has = false
1876                         for el in open_els
1877                                 if el.name is 'td' or el.name is 'th'
1878                                         has = true
1879                                         break
1880                                 if table_scopers[el.name]
1881                                         break
1882                         if !has
1883                                 parse_error()
1884                                 return
1885                         close_the_cell()
1886                         insertion_mode t
1887                         return
1888                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1889                         parse_error()
1890                         return
1891                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1892                         if is_in_table_scope t.name # fixfull namespace
1893                                 close_the_cell()
1894                                 insertion_mode t
1895                         else
1896                                 parse_error()
1897                         return
1898                 # Anything Else
1899                 ins_mode_in_body t
1900
1901         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1902         ins_mode_in_select = (t) ->
1903                 if t.type is TYPE_TEXT and t.text is "\u0000"
1904                         parse_error()
1905                         return
1906                 if t.type is TYPE_TEXT
1907                         insert_character t
1908                         return
1909                 if t.type is TYPE_COMMENT
1910                         insert_comment t
1911                         return
1912                 if t.type is TYPE_DOCTYPE
1913                         parse_error()
1914                         return
1915                 if t.type is TYPE_START_TAG and t.name is 'html'
1916                         ins_mode_in_body t
1917                         return
1918                 if t.type is TYPE_START_TAG and t.name is 'option'
1919                         if open_els[0].name is 'option'
1920                                 open_els.shift()
1921                         insert_html_element t
1922                         return
1923                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1924                         if open_els[0].name is 'option'
1925                                 open_els.shift()
1926                         if open_els[0].name is 'optgroup'
1927                                 open_els.shift()
1928                         insert_html_element t
1929                         return
1930                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1931                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1932                                 open_els.shift()
1933                         if open_els[0].name is 'optgroup'
1934                                 open_els.shift()
1935                         else
1936                                 parse_error()
1937                         return
1938                 if t.type is TYPE_END_TAG and t.name is 'option'
1939                         if open_els[0].name is 'option'
1940                                 open_els.shift()
1941                         else
1942                                 parse_error()
1943                         return
1944                 if t.type is TYPE_END_TAG and t.name is 'select'
1945                         if is_in_select_scope 'select'
1946                                 loop
1947                                         el = open_els.shift()
1948                                         if el.name is 'select'
1949                                                 break
1950                                 reset_insertion_mode()
1951                         else
1952                                 parse_error()
1953                         return
1954                 if t.type is TYPE_START_TAG and t.name is 'select'
1955                         parse_error()
1956                         loop
1957                                 el = open_els.shift()
1958                                 if el.name is 'select'
1959                                         break
1960                         reset_insertion_mode()
1961                         # spec says that this is the same as </select> but it doesn't say
1962                         # to check scope first
1963                         return
1964                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1965                         parse_error()
1966                         if is_in_select_scope 'select'
1967                                 return
1968                         loop
1969                                 el = open_els.shift()
1970                                 if el.name is 'select'
1971                                         break
1972                         reset_insertion_mode()
1973                         insertion_mode t
1974                         return
1975                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1976                         ins_mode_in_head t
1977                         return
1978                 if t.type is TYPE_EOF
1979                         ins_mode_in_body t
1980                         return
1981                 # Anything else
1982                 parse_error()
1983                 return
1984
1985         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1986         ins_mode_in_select_in_table = (t) ->
1987                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1988                         parse_error()
1989                         loop
1990                                 el = open_els.shift()
1991                                 if el.name is 'select'
1992                                         break
1993                         reset_insertion_mode()
1994                         insertion_mode t
1995                         return
1996                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1997                         parse_error()
1998                         unless is_in_table_scope t.name, NS_HTML
1999                                 return
2000                         loop
2001                                 el = open_els.shift()
2002                                 if el.name is 'select'
2003                                         break
2004                         reset_insertion_mode()
2005                         insertion_mode t
2006                         return
2007                 # Anything else
2008                 ins_mode_in_select t
2009                 return
2010
2011         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2012         ins_mode_in_template = (t) ->
2013                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2014                         ins_mode_in_body t
2015                         return
2016                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2017                         ins_mode_in_head t
2018                         return
2019                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2020                         template_insertion_modes.shift()
2021                         template_insertion_modes.unshift ins_mode_in_table
2022                         insertion_mode = ins_mode_in_table
2023                         insertion_mode t
2024                         return
2025                 if t.type is TYPE_START_TAG and t.name is 'col'
2026                         template_insertion_modes.shift()
2027                         template_insertion_modes.unshift ins_mode_in_column_group
2028                         insertion_mode = ins_mode_in_column_group
2029                         insertion_mode t
2030                         return
2031                 if t.type is TYPE_START_TAG and t.name is 'tr'
2032                         template_insertion_modes.shift()
2033                         template_insertion_modes.unshift ins_mode_in_table_body
2034                         insertion_mode = ins_mode_in_table_body
2035                         insertion_mode t
2036                         return
2037                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2038                         template_insertion_modes.shift()
2039                         template_insertion_modes.unshift ins_mode_in_row
2040                         insertion_mode = ins_mode_in_row
2041                         insertion_mode t
2042                         return
2043                 if t.type is TYPE_START_TAG
2044                         template_insertion_modes.shift()
2045                         template_insertion_modes.unshift ins_mode_in_body
2046                         insertion_mode = ins_mode_in_body
2047                         insertion_mode t
2048                         return
2049                 if t.type is TYPE_END_TAG
2050                         parse_error()
2051                         return
2052                 if t.type is EOF
2053                         unless template_tag_is_open()
2054                                 stop_parsing()
2055                                 return
2056                         parse_error()
2057                         loop
2058                                 el = open_els.shift()
2059                                 if el.name is 'template' # fixfull check namespace
2060                                         break
2061                         clear_afe_to_marker()
2062                         template_insertion_modes.shift()
2063                         reset_insertion_mode()
2064                         insertion_mode t
2065
2066         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2067         ins_mode_after_body = (t) ->
2068                 if is_space_tok t
2069                         ins_mode_in_body t
2070                         return
2071                 if t.type is TYPE_COMMENT
2072                         insert_comment t, [open_els[0], open_els[0].children.length]
2073                         return
2074                 if t.type is TYPE_DOCTYPE
2075                         parse_error()
2076                         return
2077                 if t.type is TYPE_START_TAG and t.name is 'html'
2078                         ins_mode_in_body t
2079                         return
2080                 if t.type is TYPE_END_TAG and t.name is 'html'
2081                         # fixfull fragment case
2082                         insertion_mode = ins_mode_after_after_body
2083                         return
2084                 if t.type is TYPE_EOF
2085                         stop_parsing()
2086                         return
2087                 # Anything ELse
2088                 parse_error()
2089                 insertion_mode = ins_mode_in_body
2090                 insertion_mode t
2091
2092         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2093         ins_mode_in_frameset = (t) ->
2094                 if is_space_tok t
2095                         insert_character t
2096                         return
2097                 if t.type is TYPE_COMMENT
2098                         insert_comment t
2099                         return
2100                 if t.type is TYPE_DOCTYPE
2101                         parse_error()
2102                         return
2103                 if t.type is TYPE_START_TAG and t.name is 'html'
2104                         ins_mode_in_body t
2105                         return
2106                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2107                         insert_html_element t
2108                         return
2109                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2110                         # TODO ?correct for: "if the current node is the root html element"
2111                         if open_els.length is 1
2112                                 parse_error()
2113                                 return # fragment case
2114                         open_els.shift()
2115                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2116                                 insertion_mode = ins_mode_after_frameset
2117                         return
2118                 if t.type is TYPE_START_TAG and t.name is 'frame'
2119                         insert_html_element t
2120                         open_els.shift()
2121                         t.acknowledge_self_closing()
2122                         return
2123                 if t.type is TYPE_START TAG and t.name is 'noframes'
2124                         ins_mode_in_head t
2125                         return
2126                 if t.type is TYPE_EOF
2127                         # TODO ?correct for: "if the current node is not the root html element"
2128                         if open_els.length isnt 1
2129                                 parse_error()
2130                         stop_parsing()
2131                         return
2132                 # Anything else
2133                 parse_error()
2134                 return
2135
2136         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2137         ins_mode_after_frameset = (t) ->
2138                 if is_space_tok t
2139                         insert_character t
2140                         return
2141                 if t.type is TYPE_COMMENT
2142                         insert_comment t
2143                         return
2144                 if t.type is TYPE_DOCTYPE
2145                         parse_error()
2146                         return
2147                 if t.type is TYPE_START_TAG and t.name is 'html'
2148                         ins_mode_in_body t
2149                         return
2150                 if t.type is TYPE_END_TAG and t.name is 'html'
2151                         insert_mode = ins_mode_after_after_frameset
2152                         return
2153                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2154                         ins_mode_in_head t
2155                         return
2156                 if t.type is TYPE_EOF
2157                         stop_parsing()
2158                         return
2159                 # Anything else
2160                 parse_error()
2161                 return
2162
2163         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2164         ins_mode_after_after_body = (t) ->
2165                 if t.type is TYPE_COMMENT
2166                         insert_comment t, [doc, doc.children.length]
2167                         return
2168                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2169                         ins_mode_in_body t
2170                         return
2171                 if t.type is TYPE_EOF
2172                         stop_parsing()
2173                         return
2174                 # Anything else
2175                 parse_error()
2176                 insertion_mode = ins_mode_in_body
2177                 return
2178
2179         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2180         ins_mode_after_after_frameset = (t) ->
2181                 if t.type is TYPE_COMMENT
2182                         insert_comment t, [doc, doc.children.length]
2183                         return
2184                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2185                         ins_mode_in_body t
2186                         return
2187                 if t.type is TYPE_EOF
2188                         stop_parsing()
2189                         return
2190                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2191                         ins_mode_in_head t
2192                         return
2193                 # Anything else
2194                 parse_error()
2195                 return
2196
2197
2198
2199
2200
2201         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2202         tok_state_data = ->
2203                 switch c = txt.charAt(cur++)
2204                         when '&'
2205                                 return new_text_node parse_character_reference()
2206                         when '<'
2207                                 tok_state = tok_state_tag_open
2208                         when "\u0000"
2209                                 parse_error()
2210                                 return new_text_node c
2211                         when '' # EOF
2212                                 return new_eof_token()
2213                         else
2214                                 return new_text_node c
2215                 return null
2216
2217         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2218         # not needed: tok_state_character_reference_in_data = ->
2219         # just call parse_character_reference()
2220
2221         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2222         tok_state_rcdata = ->
2223                 switch c = txt.charAt(cur++)
2224                         when '&'
2225                                 return new_text_node parse_character_reference()
2226                         when '<'
2227                                 tok_state = tok_state_rcdata_less_than_sign
2228                         when "\u0000"
2229                                 parse_error()
2230                                 return new_character_token "\ufffd"
2231                         when '' # EOF
2232                                 return new_eof_token()
2233                         else
2234                                 return new_character_token c
2235                 return null
2236
2237         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2238         # not needed: tok_state_character_reference_in_rcdata = ->
2239         # just call parse_character_reference()
2240
2241         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2242         tok_state_rawtext = ->
2243                 switch c = txt.charAt(cur++)
2244                         when '<'
2245                                 tok_state = tok_state_rawtext_less_than_sign
2246                         when "\u0000"
2247                                 parse_error()
2248                                 return new_character_token "\ufffd"
2249                         when '' # EOF
2250                                 return new_eof_token()
2251                         else
2252                                 return new_character_token c
2253                 return null
2254
2255         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2256         tok_state_script_data = ->
2257                 switch c = txt.charAt(cur++)
2258                         when '<'
2259                                 tok_state = tok_state_script_data_less_than_sign
2260                         when "\u0000"
2261                                 parse_error()
2262                                 return new_character_token "\ufffd"
2263                         when '' # EOF
2264                                 return new_eof_token()
2265                         else
2266                                 return new_character_token c
2267                 return null
2268
2269         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2270         tok_state_plaintext = ->
2271                 switch c = txt.charAt(cur++)
2272                         when "\u0000"
2273                                 parse_error()
2274                                 return new_character_token "\ufffd"
2275                         when '' # EOF
2276                                 return new_eof_token()
2277                         else
2278                                 return new_character_token c
2279                 return null
2280
2281
2282         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2283         tok_state_tag_open = ->
2284                 switch c = txt.charAt(cur++)
2285                         when '!'
2286                                 tok_state = tok_state_markup_declaration_open
2287                         when '/'
2288                                 tok_state = tok_state_end_tag_open
2289                         when '?'
2290                                 parse_error()
2291                                 tok_cur_tag = new_comment_token '?'
2292                                 tok_state = tok_state_bogus_comment
2293                         else
2294                                 if lc_alpha.indexOf(c) > -1
2295                                         tok_cur_tag = new_open_tag c
2296                                         tok_state = tok_state_tag_name
2297                                 else if uc_alpha.indexOf(c) > -1
2298                                         tok_cur_tag = new_open_tag c.toLowerCase()
2299                                         tok_state = tok_state_tag_name
2300                                 else
2301                                         parse_error()
2302                                         tok_state = tok_state_data
2303                                         cur -= 1 # we didn't parse/handle the char after <
2304                                         return new_text_node '<'
2305                 return null
2306
2307         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2308         tok_state_end_tag_open = ->
2309                 switch c = txt.charAt(cur++)
2310                         when '>'
2311                                 parse_error()
2312                                 tok_state = tok_state_data
2313                         when '' # EOF
2314                                 parse_error()
2315                                 tok_state = tok_state_data
2316                                 return new_text_node '</'
2317                         else
2318                                 if uc_alpha.indexOf(c) > -1
2319                                         tok_cur_tag = new_end_tag c.toLowerCase()
2320                                         tok_state = tok_state_tag_name
2321                                 else if lc_alpha.indexOf(c) > -1
2322                                         tok_cur_tag = new_end_tag c
2323                                         tok_state = tok_state_tag_name
2324                                 else
2325                                         parse_error()
2326                                         tok_cur_tag = new_comment_token '/'
2327                                         tok_state = tok_state_bogus_comment
2328                 return null
2329
2330         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2331         tok_state_tag_name = ->
2332                 switch c = txt.charAt(cur++)
2333                         when "\t", "\n", "\u000c", ' '
2334                                 tok_state = tok_state_before_attribute_name
2335                         when '/'
2336                                 tok_state = tok_state_self_closing_start_tag
2337                         when '>'
2338                                 tok_state = tok_state_data
2339                                 tmp = tok_cur_tag
2340                                 tok_cur_tag = null
2341                                 return tmp
2342                         when "\u0000"
2343                                 parse_error()
2344                                 tok_cur_tag.name += "\ufffd"
2345                         when '' # EOF
2346                                 parse_error()
2347                                 tok_state = tok_state_data
2348                         else
2349                                 if uc_alpha.indexOf(c) > -1
2350                                         tok_cur_tag.name += c.toLowerCase()
2351                                 else
2352                                         tok_cur_tag.name += c
2353                 return null
2354
2355         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2356         tok_state_rcdata_less_than_sign = ->
2357                 c = txt.charAt(cur++)
2358                 if c is '/'
2359                         temporary_buffer = ''
2360                         tok_state = tok_state_rcdata_end_tag_open
2361                         return null
2362                 # Anything else
2363                 tok_state = tok_state_rcdata
2364                 cur -= 1 # reconsume the input character
2365                 return new_character_token '<'
2366
2367         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2368         tok_state_rcdata_end_tag_open = ->
2369                 c = txt.charAt(cur++)
2370                 if uc_alpha.indexOf(c) > -1
2371                         tok_cur_tag = new_end_tag c.toLowerCase()
2372                         temporary_buffer += c
2373                         tok_state = tok_state_rcdata_end_tag_name
2374                         return null
2375                 if lc_alpha.indexOf(c) > -1
2376                         tok_cur_tag = new_end_tag c
2377                         temporary_buffer += c
2378                         tok_state = tok_state_rcdata_end_tag_name
2379                         return null
2380                 # Anything else
2381                 tok_state = tok_state_rcdata
2382                 cur -= 1 # reconsume the input character
2383                 return new_character_token "</" # fixfull separate these
2384
2385         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2386         is_appropriate_end_tag = (t) ->
2387                 # spec says to check against "the tag name of the last start tag to
2388                 # have been emitted from this tokenizer", but this is only called from
2389                 # the various "raw" states, which I'm pretty sure all push the start
2390                 # token onto open_els. TODO: verify this after the script data states
2391                 # are implemented
2392                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2393                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2394
2395         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2396         tok_state_rcdata_end_tag_name = ->
2397                 c = txt.charAt(cur++)
2398                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2399                         if is_appropriate_end_tag tok_cur_tag
2400                                 tok_state = tok_state_before_attribute_name
2401                                 return
2402                         # else fall through to "Anything else"
2403                 if c is '/'
2404                         if is_appropriate_end_tag tok_cur_tag
2405                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2406                                 return
2407                         # else fall through to "Anything else"
2408                 if c is '>'
2409                         if is_appropriate_end_tag tok_cur_tag
2410                                 tok_state = tok_state_data
2411                                 return tok_cur_tag
2412                         # else fall through to "Anything else"
2413                 if uc_alpha.indexOf(c) > -1
2414                         tok_cur_tag.name += c.toLowerCase()
2415                         temporary_buffer += c
2416                         return null
2417                 if lc_alpha.indexOf(c) > -1
2418                         tok_cur_tag.name += c
2419                         temporary_buffer += c
2420                         return null
2421                 # Anything else
2422                 tok_state = tok_state_rcdata
2423                 cur -= 1 # reconsume the input character
2424                 return new_character_token '</' + temporary_buffer # fixfull separate these
2425
2426         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2427         tok_state_rawtext_less_than_sign = ->
2428                 c = txt.charAt(cur++)
2429                 if c is '/'
2430                         temporary_buffer = ''
2431                         tok_state = tok_state_rawtext_end_tag_open
2432                         return null
2433                 # Anything else
2434                 tok_state = tok_state_rawtext
2435                 cur -= 1 # reconsume the input character
2436                 return new_character_token '<'
2437
2438         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2439         tok_state_rawtext_end_tag_open = ->
2440                 c = txt.charAt(cur++)
2441                 if uc_alpha.indexOf(c) > -1
2442                         tok_cur_tag = new_end_tag c.toLowerCase()
2443                         temporary_buffer += c
2444                         tok_state = tok_state_rawtext_end_tag_name
2445                         return null
2446                 if lc_alpha.indexOf(c) > -1
2447                         tok_cur_tag = new_end_tag c
2448                         temporary_buffer += c
2449                         tok_state = tok_state_rawtext_end_tag_name
2450                         return null
2451                 # Anything else
2452                 tok_state = tok_state_rawtext
2453                 cur -= 1 # reconsume the input character
2454                 return new_character_token "</" # fixfull separate these
2455
2456         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2457         tok_state_rawtext_end_tag_name = ->
2458                 c = txt.charAt(cur++)
2459                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2460                         if is_appropriate_end_tag tok_cur_tag
2461                                 tok_state = tok_state_before_attribute_name
2462                                 return
2463                         # else fall through to "Anything else"
2464                 if c is '/'
2465                         if is_appropriate_end_tag tok_cur_tag
2466                                 tok_state = tok_state_self_closing_start_tag
2467                                 return
2468                         # else fall through to "Anything else"
2469                 if c is '>'
2470                         if is_appropriate_end_tag tok_cur_tag
2471                                 tok_state = tok_state_data
2472                                 return tok_cur_tag
2473                         # else fall through to "Anything else"
2474                 if uc_alpha.indexOf(c) > -1
2475                         tok_cur_tag.name += c.toLowerCase()
2476                         temporary_buffer += c
2477                         return null
2478                 if lc_alpha.indexOf(c) > -1
2479                         tok_cur_tag.name += c
2480                         temporary_buffer += c
2481                         return null
2482                 # Anything else
2483                 tok_state = tok_state_rawtext
2484                 cur -= 1 # reconsume the input character
2485                 return new_character_token '</' + temporary_buffer # fixfull separate these
2486
2487         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2488
2489         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2490         tok_state_before_attribute_name = ->
2491                 attr_name = null
2492                 switch c = txt.charAt(cur++)
2493                         when "\t", "\n", "\u000c", ' '
2494                                 return null
2495                         when '/'
2496                                 tok_state = tok_state_self_closing_start_tag
2497                                 return null
2498                         when '>'
2499                                 tok_state = tok_state_data
2500                                 tmp = tok_cur_tag
2501                                 tok_cur_tag = null
2502                                 return tmp
2503                         when "\u0000"
2504                                 parse_error()
2505                                 attr_name = "\ufffd"
2506                         when '"', "'", '<', '='
2507                                 parse_error()
2508                                 attr_name = c
2509                         when '' # EOF
2510                                 parse_error()
2511                                 tok_state = tok_state_data
2512                         else
2513                                 if uc_alpha.indexOf(c) > -1
2514                                         attr_name = c.toLowerCase()
2515                                 else
2516                                         attr_name = c
2517                 if attr_name?
2518                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2519                         tok_state = tok_state_attribute_name
2520                 return null
2521
2522         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2523         tok_state_attribute_name = ->
2524                 switch c = txt.charAt(cur++)
2525                         when "\t", "\n", "\u000c", ' '
2526                                 tok_state = tok_state_after_attribute_name
2527                         when '/'
2528                                 tok_state = tok_state_self_closing_start_tag
2529                         when '='
2530                                 tok_state = tok_state_before_attribute_value
2531                         when '>'
2532                                 tok_state = tok_state_data
2533                                 tmp = tok_cur_tag
2534                                 tok_cur_tag = null
2535                                 return tmp
2536                         when "\u0000"
2537                                 parse_error()
2538                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2539                         when '"', "'", '<'
2540                                 parse_error()
2541                                 tok_cur_tag.attrs_a[0][0] = c
2542                         when '' # EOF
2543                                 parse_error()
2544                                 tok_state = tok_state_data
2545                         else
2546                                 if uc_alpha.indexOf(c) > -1
2547                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2548                                 else
2549                                         tok_cur_tag.attrs_a[0][0] += c
2550                 return null
2551
2552         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2553         tok_state_after_attribute_name = ->
2554                 c = txt.charAt(cur++)
2555                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2556                         return
2557                 if c is '/'
2558                         tok_state = tok_state_self_closing_start_tag
2559                         return
2560                 if c is '='
2561                         tok_state = tok_state_before_attribute_value
2562                         return
2563                 if c is '>'
2564                         tok_state = tok_state_data
2565                         return
2566                 if uc_alpha.indexOf(c) > -1
2567                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2568                         tok_state = tok_state_attribute_name
2569                         return
2570                 if c is "\u0000"
2571                         parse_error()
2572                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2573                         tok_state = tok_state_attribute_name
2574                         return
2575                 if c is '' # EOF
2576                         parse_error()
2577                         tok_state = tok_state_data
2578                         cur -= 1 # reconsume
2579                         return
2580                 if c is '"' or c is "'" or c is '<'
2581                         parse_error()
2582                         # fall through to Anything else
2583                 # Anything else
2584                 tok_cur_tag.attrs_a.unshift [c, '']
2585                 tok_state = tok_state_attribute_name
2586
2587         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2588         tok_state_before_attribute_value = ->
2589                 switch c = txt.charAt(cur++)
2590                         when "\t", "\n", "\u000c", ' '
2591                                 return null
2592                         when '"'
2593                                 tok_state = tok_state_attribute_value_double_quoted
2594                         when '&'
2595                                 tok_state = tok_state_attribute_value_unquoted
2596                                 cur -= 1
2597                         when "'"
2598                                 tok_state = tok_state_attribute_value_single_quoted
2599                         when "\u0000"
2600                                 # Parse error
2601                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2602                                 tok_state = tok_state_attribute_value_unquoted
2603                         when '>'
2604                                 # Parse error
2605                                 tok_state = tok_state_data
2606                                 tmp = tok_cur_tag
2607                                 tok_cur_tag = null
2608                                 return tmp
2609                         when '' # EOF
2610                                 parse_error()
2611                                 tok_state = tok_state_data
2612                         else
2613                                 tok_cur_tag.attrs_a[0][1] += c
2614                                 tok_state = tok_state_attribute_value_unquoted
2615                 return null
2616
2617         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2618         tok_state_attribute_value_double_quoted = ->
2619                 switch c = txt.charAt(cur++)
2620                         when '"'
2621                                 tok_state = tok_state_after_attribute_value_quoted
2622                         when '&'
2623                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2624                         when "\u0000"
2625                                 # Parse error
2626                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2627                         when '' # EOF
2628                                 parse_error()
2629                                 tok_state = tok_state_data
2630                         else
2631                                 tok_cur_tag.attrs_a[0][1] += c
2632                 return null
2633
2634         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2635         tok_state_attribute_value_single_quoted = ->
2636                 switch c = txt.charAt(cur++)
2637                         when "'"
2638                                 tok_state = tok_state_after_attribute_value_quoted
2639                         when '&'
2640                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2641                         when "\u0000"
2642                                 # Parse error
2643                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2644                         when '' # EOF
2645                                 parse_error()
2646                                 tok_state = tok_state_data
2647                         else
2648                                 tok_cur_tag.attrs_a[0][1] += c
2649                 return null
2650
2651         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2652         tok_state_attribute_value_unquoted = ->
2653                 switch c = txt.charAt(cur++)
2654                         when "\t", "\n", "\u000c", ' '
2655                                 tok_state = tok_state_before_attribute_name
2656                         when '&'
2657                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2658                         when '>'
2659                                 tok_state = tok_state_data
2660                                 tmp = tok_cur_tag
2661                                 tok_cur_tag = null
2662                                 return tmp
2663                         when "\u0000"
2664                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2665                         when '' # EOF
2666                                 parse_error()
2667                                 tok_state = tok_state_data
2668                         else
2669                                 # Parse Error if ', <, = or ` (backtick)
2670                                 tok_cur_tag.attrs_a[0][1] += c
2671                 return null
2672
2673         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2674         tok_state_after_attribute_value_quoted = ->
2675                 switch c = txt.charAt(cur++)
2676                         when "\t", "\n", "\u000c", ' '
2677                                 tok_state = tok_state_before_attribute_name
2678                         when '/'
2679                                 tok_state = tok_state_self_closing_start_tag
2680                         when '>'
2681                                 tok_state = tok_state_data
2682                                 tmp = tok_cur_tag
2683                                 tok_cur_tag = null
2684                                 return tmp
2685                         when '' # EOF
2686                                 parse_error()
2687                                 tok_state = tok_state_data
2688                         else
2689                                 # Parse Error
2690                                 tok_state = tok_state_before_attribute_name
2691                                 cur -= 1 # we didn't handle that char
2692                 return null
2693
2694         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
2695         # WARNING: put a comment token in tok_cur_tag before setting this state
2696         tok_state_bogus_comment = ->
2697                 next_gt = txt.indexOf '>', cur
2698                 if next_gt is -1
2699                         val = txt.substr cur
2700                         cur = txt.length
2701                 else
2702                         val = txt.substr cur, (next_gt - cur)
2703                         cur = next_gt + 1
2704                 val = val.replace "\u0000", "\ufffd"
2705                 tok_cur_tag.text += val
2706                 tok_state = tok_state_data
2707                 return tok_cur_tag
2708
2709         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
2710         tok_state_markup_declaration_open = ->
2711                 if txt.substr(cur, 2) is '--'
2712                         cur += 2
2713                         tok_cur_tag = new_comment_token ''
2714                         tok_state = tok_state_comment_start
2715                         return
2716                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
2717                         cur += 7
2718                         tok_state = tok_state_doctype
2719                         return
2720                 acn = adjusted_current_node()
2721                 if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
2722                         cur += 7
2723                         tok_state = tok_state_cdata_section
2724                         return
2725                 # Otherwise
2726                 parse_error()
2727                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
2728                 tok_state = tok_state_bogus_comment
2729                 return
2730
2731         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
2732         tok_state_comment_start = ->
2733                 switch c = txt.charAt(cur++)
2734                         when '-'
2735                                 tok_state = tok_state_comment_start_dash
2736                         when "\u0000"
2737                                 parse_error()
2738                                 return new_character_token "\ufffd"
2739                         when '>'
2740                                 parse_error()
2741                                 tok_state = tok_state_data
2742                                 return tok_cur_tag
2743                         when '' # EOF
2744                                 parse_error()
2745                                 tok_state = tok_state_data
2746                                 cur -= 1 # Reconsume
2747                                 return tok_cur_tag
2748                         else
2749                                 tok_cur_tag.text += c
2750                 return null
2751
2752         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
2753         tok_state_comment_start_dash = ->
2754                 switch c = txt.charAt(cur++)
2755                         when '-'
2756                                 tok_state = tok_state_comment_end
2757                         when "\u0000"
2758                                 parse_error()
2759                                 tok_cur_tag.text += "-\ufffd"
2760                                 tok_state = tok_state_comment
2761                         when '>'
2762                                 parse_error()
2763                                 tok_state = tok_state_data
2764                                 return tok_cur_tag
2765                         when '' # EOF
2766                                 parse_error()
2767                                 tok_state = tok_state_data
2768                                 cur -= 1 # Reconsume
2769                                 return tok_cur_tag
2770                         else
2771                                 tok_cur_tag.text += "-#{c}"
2772                                 tok_state = tok_state_comment
2773                 return null
2774
2775         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
2776         tok_state_comment = ->
2777                 switch c = txt.charAt(cur++)
2778                         when '-'
2779                                 tok_state = tok_state_comment_end_dash
2780                         when "\u0000"
2781                                 parse_error()
2782                                 tok_cur_tag.text += "\ufffd"
2783                         when '' # EOF
2784                                 parse_error()
2785                                 tok_state = tok_state_data
2786                                 cur -= 1 # Reconsume
2787                                 return tok_cur_tag
2788                         else
2789                                 tok_cur_tag.text += c
2790                 return null
2791
2792         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
2793         tok_state_comment_end_dash = ->
2794                 switch c = txt.charAt(cur++)
2795                         when '-'
2796                                 tok_state = tok_state_comment_end
2797                         when "\u0000"
2798                                 parse_error()
2799                                 tok_cur_tag.text += "-\ufffd"
2800                                 tok_state = tok_state_comment
2801                         when '' # EOF
2802                                 parse_error()
2803                                 tok_state = tok_state_data
2804                                 cur -= 1 # Reconsume
2805                                 return tok_cur_tag
2806                         else
2807                                 tok_cur_tag.text += "-#{c}"
2808                                 tok_state = tok_state_comment
2809                 return null
2810
2811         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
2812         tok_state_comment_end = ->
2813                 switch c = txt.charAt(cur++)
2814                         when '>'
2815                                 tok_state = tok_state_data
2816                                 return tok_cur_tag
2817                         when "\u0000"
2818                                 parse_error()
2819                                 tok_cur_tag.text += "--\ufffd"
2820                                 tok_state = tok_state_comment
2821                         when '!'
2822                                 parse_error()
2823                                 tok_state = tok_state_comment_end_bang
2824                         when '-'
2825                                 parse_error()
2826                                 tok_cur_tag.text += '-'
2827                         when '' # EOF
2828                                 parse_error()
2829                                 tok_state = tok_state_data
2830                                 cur -= 1 # Reconsume
2831                                 return tok_cur_tag
2832                         else
2833                                 parse_error()
2834                                 tok_cur_tag.text += "--#{c}"
2835                                 tok_state = tok_state_comment
2836                 return null
2837
2838         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
2839         tok_state_comment_end_bang = ->
2840                 switch c = txt.charAt(cur++)
2841                         when '-'
2842                                 tok_cur_tag.text += "--!#{c}"
2843                                 tok_state = tok_state_comment_end_dash
2844                         when '>'
2845                                 tok_state = tok_state_data
2846                                 return tok_cur_tag
2847                         when "\u0000"
2848                                 parse_error()
2849                                 tok_cur_tag.text += "--!\ufffd"
2850                                 tok_state = tok_state_comment
2851                         when '' # EOF
2852                                 parse_error()
2853                                 tok_state = tok_state_data
2854                                 cur -= 1 # Reconsume
2855                                 return tok_cur_tag
2856                         else
2857                                 tok_cur_tag.text += "--!#{c}"
2858                                 tok_state = tok_state_comment
2859                 return null
2860
2861         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
2862         tok_state_doctype = ->
2863                 switch c = txt.charAt(cur++)
2864                         when "\t", "\u000a", "\u000c", ' '
2865                                 tok_state = tok_state_before_doctype_name
2866                         when '' # EOF
2867                                 parse_error()
2868                                 tok_state = tok_state_data
2869                                 el = new_doctype_token ''
2870                                 el.flag 'force-quirks', true
2871                                 cur -= 1 # Reconsume
2872                                 return el
2873                         else
2874                                 parse_error()
2875                                 tok_state = tok_state_before_doctype_name
2876                                 cur -= 1 # Reconsume
2877                 return null
2878
        # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
2880         tok_state_before_doctype_name = ->
2881                 c = txt.charAt(cur++)
2882                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2883                         return
2884                 if uc_alpha.indexOf(c) > -1
2885                         tok_cur_tag = new_doctype_token c.toLowerCase()
2886                         tok_state = tok_state_doctype_name
2887                         return
2888                 if c is "\u0000"
2889                         parse_error()
2890                         tok_cur_tag = new_doctype_token "\ufffd"
2891                         tok_state = tok_state_doctype_name
2892                         return
2893                 if c is '>'
2894                         parse_error()
2895                         el = new_doctype_token ''
2896                         el.flag 'force-quirks', true
2897                         tok_state = tok_state_data
2898                         return el
2899                 if c is '' # EOF
2900                         parse_error()
2901                         tok_state = tok_state_data
2902                         el = new_doctype_token ''
2903                         el.flag 'force-quirks', true
2904                         cur -= 1 # Reconsume
2905                         return el
2906                 # Anything else
2907                 tok_cur_tag = new_doctype_token c
2908                 tok_state = tok_state_doctype_name
2909                 return null
2910
2911         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
2912         tok_state_doctype_name = ->
2913                 c = txt.charAt(cur++)
2914                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2915                         tok_state = tok_state_after_doctype_name
2916                         return
2917                 if c is '>'
2918                         tok_state = tok_state_data
2919                         return tok_cur_tag
2920                 if uc_alpha.indexOf(c) > -1
2921                         tok_cur_tag.name += c.toLowerCase()
2922                         return
2923                 if c is "\u0000"
2924                         parse_error()
2925                         tok_cur_tag.name += "\ufffd"
2926                         return
2927                 if c is '' # EOF
2928                         parse_error()
2929                         tok_state = tok_state_data
2930                         tok_cur_tag.flag 'force-quirks', true
2931                         cur -= 1 # Reconsume
2932                         return tok_cur_tag
2933                 # Anything else
2934                 tok_cur_tag.name += c
2935                 return null
2936
2937         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
2938         tok_state_after_doctype_name = ->
2939                 c = txt.charAt(cur++)
2940                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2941                         return
2942                 if c is '>'
2943                         tok_state = tok_state_data
2944                         return tok_cur_tag
2945                 if c is '' # EOF
2946                         parse_error()
2947                         tok_state = tok_state_data
2948                         tok_cur_tag.flag 'force-quirks', true
2949                         cur -= 1 # Reconsume
2950                         return tok_cur_tag
2951                 # Anything else
2952                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
2953                         cur += 5
2954                         tok_state = tok_state_after_doctype_public_keyword
2955                         return
2956                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
2957                         cur += 5
2958                         tok_state = tok_state_after_doctype_system_keyword
2959                         return
2960                 parse_error()
2961                 tok_cur_tag.flag 'force-quirks', true
2962                 tok_state = tok_state_bogus_doctype
2963                 return null
2964
2965         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
2966         tok_state_after_doctype_public_keyword = ->
2967                 c = txt.charAt(cur++)
2968                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2969                         tok_state = tok_state_before_doctype_public_identifier
2970                         return
2971                 if c is '"'
2972                         parse_error()
2973                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2974                         tok_state = tok_state_doctype_public_identifier_double_quoted
2975                         return
2976                 if c is "'"
2977                         parse_error()
2978                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2979                         tok_state = tok_state_doctype_public_identifier_single_quoted
2980                         return
2981                 if c is '>'
2982                         parse_error()
2983                         tok_cur_tag.flag 'force-quirks', true
2984                         tok_state = tok_state_data
2985                         return tok_cur_tag
2986                 if c is '' # EOF
2987                         parse_error()
2988                         tok_state = tok_state_data
2989                         tok_cur_tag.flag 'force-quirks', true
2990                         cur -= 1 # Reconsume
2991                         return tok_cur_tag
2992                 # Anything else
2993                 parse_error()
2994                 tok_cur_tag.flag 'force-quirks', true
2995                 tok_state = tok_state_bogus_doctype
2996                 return null
2997
2998         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
2999         tok_state_before_doctype_public_identifier = ->
3000                 c = txt.charAt(cur++)
3001                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3002                         return
3003                 if c is '"'
3004                         parse_error()
3005                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3006                         tok_state = tok_state_doctype_public_identifier_double_quoted
3007                         return
3008                 if c is "'"
3009                         parse_error()
3010                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3011                         tok_state = tok_state_doctype_public_identifier_single_quoted
3012                         return
3013                 if c is '>'
3014                         parse_error()
3015                         tok_cur_tag.flag 'force-quirks', true
3016                         tok_state = tok_state_data
3017                         return tok_cur_tag
3018                 if c is '' # EOF
3019                         parse_error()
3020                         tok_state = tok_state_data
3021                         tok_cur_tag.flag 'force-quirks', true
3022                         cur -= 1 # Reconsume
3023                         return tok_cur_tag
3024                 # Anything else
3025                 parse_error()
3026                 tok_cur_tag.flag 'force-quirks', true
3027                 tok_state = tok_state_bogus_doctype
3028                 return null
3029
3030
3031         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3032         tok_state_doctype_public_identifier_double_quoted = ->
3033                 c = txt.charAt(cur++)
3034                 if c is '"'
3035                         tok_state = tok_state_after_doctype_public_identifier
3036                         return
3037                 if c is "\u0000"
3038                         parse_error()
3039                         tok_cur_tag.public_identifier += "\ufffd"
3040                         return
3041                 if c is '>'
3042                         parse_error()
3043                         tok_cur_tag.flag 'force-quirks', true
3044                         tok_state = tok_state_data
3045                         return tok_cur_tag
3046                 if c is '' # EOF
3047                         parse_error()
3048                         tok_state = tok_state_data
3049                         tok_cur_tag.flag 'force-quirks', true
3050                         cur -= 1 # Reconsume
3051                         return tok_cur_tag
3052                 # Anything else
3053                 tok_cur_tag.public_identifier += c
3054                 return null
3055
3056         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3057         tok_state_doctype_public_identifier_single_quoted = ->
3058                 c = txt.charAt(cur++)
3059                 if c is "'"
3060                         tok_state = tok_state_after_doctype_public_identifier
3061                         return
3062                 if c is "\u0000"
3063                         parse_error()
3064                         tok_cur_tag.public_identifier += "\ufffd"
3065                         return
3066                 if c is '>'
3067                         parse_error()
3068                         tok_cur_tag.flag 'force-quirks', true
3069                         tok_state = tok_state_data
3070                         return tok_cur_tag
3071                 if c is '' # EOF
3072                         parse_error()
3073                         tok_state = tok_state_data
3074                         tok_cur_tag.flag 'force-quirks', true
3075                         cur -= 1 # Reconsume
3076                         return tok_cur_tag
3077                 # Anything else
3078                 tok_cur_tag.public_identifier += c
3079                 return null
3080
3081         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3082         tok_state_after_doctype_public_identifier = ->
3083                 c = txt.charAt(cur++)
3084                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3085                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3086                         return
3087                 if c is '>'
3088                         tok_state = tok_state_data
3089                         return tok_cur_tag
3090                 if c is '"'
3091                         parse_error()
3092                         tok_cur_tag.system_identifier = ''
3093                         tok_state = tok_state_doctype_system_identifier_double_quoted
3094                         return
3095                 if c is "'"
3096                         parse_error()
3097                         tok_cur_tag.system_identifier = ''
3098                         tok_state = tok_state_doctype_system_identifier_single_quoted
3099                         return
3100                 if c is '' # EOF
3101                         parse_error()
3102                         tok_state = tok_state_data
3103                         tok_cur_tag.flag 'force-quirks', true
3104                         cur -= 1 # Reconsume
3105                         return tok_cur_tag
3106                 # Anything else
3107                 parse_error()
3108                 tok_cur_tag.flag 'force-quirks', true
3109                 tok_state = tok_state_bogus_doctype
3110                 return null
3111
3112         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3113         tok_state_between_doctype_public_and_system_identifiers = ->
3114                 c = txt.charAt(cur++)
3115                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3116                         return
3117                 if c is '>'
3118                         tok_state = tok_state_data
3119                         return tok_cur_tag
3120                 if c is '"'
3121                         parse_error()
3122                         tok_cur_tag.system_identifier = ''
3123                         tok_state = tok_state_doctype_system_identifier_double_quoted
3124                         return
3125                 if c is "'"
3126                         parse_error()
3127                         tok_cur_tag.system_identifier = ''
3128                         tok_state = tok_state_doctype_system_identifier_single_quoted
3129                         return
3130                 if c is '' # EOF
3131                         parse_error()
3132                         tok_state = tok_state_data
3133                         tok_cur_tag.flag 'force-quirks', true
3134                         cur -= 1 # Reconsume
3135                         return tok_cur_tag
3136                 # Anything else
3137                 parse_error()
3138                 tok_cur_tag.flag 'force-quirks', true
3139                 tok_state = tok_state_bogus_doctype
3140                 return null
3141
3142         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3143         tok_state_after_doctype_system_keyword = ->
3144                 c = txt.charAt(cur++)
3145                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3146                         tok_state = tok_state_before_doctype_system_identifier
3147                         return
3148                 if c is '"'
3149                         parse_error()
3150                         tok_cur_tag.system_identifier = ''
3151                         tok_state = tok_state_doctype_system_identifier_double_quoted
3152                         return
3153                 if c is "'"
3154                         parse_error()
3155                         tok_cur_tag.system_identifier = ''
3156                         tok_state = tok_state_doctype_system_identifier_single_quoted
3157                         return
3158                 if c is '>'
3159                         parse_error()
3160                         tok_cur_tag.flag 'force-quirks', true
3161                         tok_state = tok_state_data
3162                         return tok_cur_tag
3163                 if c is '' # EOF
3164                         parse_error()
3165                         tok_state = tok_state_data
3166                         tok_cur_tag.flag 'force-quirks', true
3167                         cur -= 1 # Reconsume
3168                         return tok_cur_tag
3169                 # Anything else
3170                 parse_error()
3171                 tok_cur_tag.flag 'force-quirks', true
3172                 tok_state = tok_state_bogus_doctype
3173                 return null
3174
3175         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3176         tok_state_before_doctype_system_identifier = ->
3177                 c = txt.charAt(cur++)
3178                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3179                         return
3180                 if c is '"'
3181                         tok_cur_tag.system_identifier = ''
3182                         tok_state = tok_state_doctype_system_identifier_double_quoted
3183                         return
3184                 if c is "'"
3185                         tok_cur_tag.system_identifier = ''
3186                         tok_state = tok_state_doctype_system_identifier_single_quoted
3187                         return
3188                 if c is '>'
3189                         parse_error()
3190                         tok_cur_tag.flag 'force-quirks', true
3191                         tok_state = tok_state_data
3192                         return tok_cur_tag
3193                 if c is '' # EOF
3194                         parse_error()
3195                         tok_state = tok_state_data
3196                         tok_cur_tag.flag 'force-quirks', true
3197                         cur -= 1 # Reconsume
3198                         return tok_cur_tag
3199                 # Anything else
3200                 parse_error()
3201                 tok_cur_tag.flag 'force-quirks', true
3202                 tok_state = tok_state_bogus_doctype
3203                 return null
3204
3205         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3206         tok_state_doctype_system_identifier_double_quoted = ->
3207                 c = txt.charAt(cur++)
3208                 if c is '"'
3209                         tok_state = tok_state_after_doctype_system_identifier
3210                         return
3211                 if c is "\u0000"
3212                         parse_error()
3213                         tok_cur_tag.system_identifier += "\ufffd"
3214                         return
3215                 if c is '>'
3216                         parse_error()
3217                         tok_cur_tag.flag 'force-quirks', true
3218                         tok_state = tok_state_data
3219                         return tok_cur_tag
3220                 if c is '' # EOF
3221                         parse_error()
3222                         tok_state = tok_state_data
3223                         tok_cur_tag.flag 'force-quirks', true
3224                         cur -= 1 # Reconsume
3225                         return tok_cur_tag
3226                 # Anything else
3227                 tok_cur_tag.system_identifier += c
3228                 return null
3229
3230         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3231         tok_state_doctype_system_identifier_single_quoted = ->
3232                 c = txt.charAt(cur++)
3233                 if c is "'"
3234                         tok_state = tok_state_after_doctype_system_identifier
3235                         return
3236                 if c is "\u0000"
3237                         parse_error()
3238                         tok_cur_tag.system_identifier += "\ufffd"
3239                         return
3240                 if c is '>'
3241                         parse_error()
3242                         tok_cur_tag.flag 'force-quirks', true
3243                         tok_state = tok_state_data
3244                         return tok_cur_tag
3245                 if c is '' # EOF
3246                         parse_error()
3247                         tok_state = tok_state_data
3248                         tok_cur_tag.flag 'force-quirks', true
3249                         cur -= 1 # Reconsume
3250                         return tok_cur_tag
3251                 # Anything else
3252                 tok_cur_tag.system_identifier += c
3253                 return null
3254
3255         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3256         tok_state_after_doctype_system_identifier = ->
3257                 c = txt.charAt(cur++)
3258                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3259                         return
3260                 if c is '>'
3261                         tok_state = tok_state_data
3262                         return tok_cur_tag
3263                 if c is '' # EOF
3264                         parse_error()
3265                         tok_state = tok_state_data
3266                         tok_cur_tag.flag 'force-quirks', true
3267                         cur -= 1 # Reconsume
3268                         return tok_cur_tag
3269                 # Anything else
3270                 parse_error()
3271                 # do _not_ tok_cur_tag.flag 'force-quirks', true
3272                 tok_state = tok_state_bogus_doctype
3273                 return null
3274
3275         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3276         tok_state_bogus_doctype = ->
3277                 c = txt.charAt(cur++)
3278                 if c is '>'
3279                         tok_state = tok_state_data
3280                         return tok_cur_tag
3281                 if c is '' # EOF
3282                         tok_state = tok_state_data
3283                         cur -= 1 # Reconsume
3284                         return tok_cur_tag
3285                 # Anything else
3286                 return null
3287
3288
3289         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3290         # Don't set this as a state, just call it
3291         # returns a string (NOT a text node)
3292         parse_character_reference = (allowed_char = null, in_attr = false) ->
3293                 if cur >= txt.length
3294                         return '&'
3295                 switch c = txt.charAt(cur)
3296                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3297                                 # explicitly not a parse error
3298                                 return '&'
3299                         when ';'
3300                                 # there has to be "one or more" alnums between & and ; to be a parse error
3301                                 return '&'
3302                         when '#'
3303                                 if cur + 1 >= txt.length
3304                                         return '&'
3305                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
3306                                         prefix = '#x'
3307                                         charset = hex_chars
3308                                         start = cur + 2
3309                                 else
3310                                         charset = digits
3311                                         start = cur + 1
3312                                         prefix = '#'
3313                                 i = 0
3314                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3315                                         i += 1
3316                                 if i is 0
3317                                         return '&'
3318                                 if txt.charAt(start + i) is ';'
3319                                         i += 1
3320                                 # FIXME This is supposed to generate parse errors for some chars
3321                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3322                                 if decoded?
3323                                         cur = start + i
3324                                         return decoded
3325                                 return '&'
3326                         else
3327                                 for i in [0...31]
3328                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
3329                                                 break
3330                                 if i is 0
3331                                         # exit early, because parse_error() below needs at least one alnum
3332                                         return '&'
3333                                 if txt.charAt(cur + i) is ';'
3334                                         i += 1 # include ';' terminator in value
3335                                         decoded = decode_named_char_ref txt.substr(cur, i)
3336                                         if decoded?
3337                                                 cur += i
3338                                                 return decoded
3339                                         parse_error()
3340                                         return '&'
3341                                 else
3342                                         # no ';' terminator (only legacy char refs)
3343                                         max = i
3344                                         for i in [2..max] # no prefix matches, so ok to check shortest first
3345                                                 c = legacy_char_refs[txt.substr(cur, i)]
3346                                                 if c?
3347                                                         if in_attr
3348                                                                 if txt.charAt(cur + i) is '='
3349                                                                         # "because some legacy user agents will
3350                                                                         # misinterpret the markup in those cases"
3351                                                                         parse_error()
3352                                                                         return '&'
3353                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
3354                                                                         # this makes attributes forgiving about url args
3355                                                                         return '&'
3356                                                         # ok, and besides the weird exceptions for attributes...
3357                                                         # return the matching char
3358                                                         cur += i # consume entity chars
3359                                                         parse_error() # because no terminating ";"
3360                                                         return c
3361                                         parse_error()
3362                                         return '&'
3363                 return # never reached
3364
3365         # tree constructor initialization
3366         # see comments on TYPE_TAG/etc for the structure of this data
3367         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3368         open_els = []
3369         afe = [] # active formatting elements
3370         template_insertion_modes = []
3371         insertion_mode = ins_mode_initial
3372         original_insertion_mode = insertion_mode # TODO check spec
3373         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3374         flag_frameset_ok = true
3375         flag_parsing = true
3376         flag_foster_parenting = false
3377         form_element_pointer = null
3378         temporary_buffer = null
3379         pending_table_character_tokens = []
3380         head_element_pointer = null
3381         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3382         context_element = null # FIXME initialize from args.fragment
3383
3384         # tokenizer initialization
3385         tok_state = tok_state_data
3386
3387         # proccess input
3388         while flag_parsing
3389                 t = tok_state()
3390                 if t?
3391                         insertion_mode t
3392                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
3393         return doc.children
3394
# Serializes each element of els (passing shallow and show_ids through to its
# serialize method) and joins the results with commas.
serialize_els = (els, shallow, show_ids) ->
        out = ''
        for el, idx in els
                out += ',' if idx > 0
                out += el.serialize shallow, show_ids
        return out
3403
# Public API: parser entry point, debug-log hooks, and node type constants
public_api = {parse_html, debug_log_reset, debug_log_each, TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE}
for own export_name, export_value of public_api
        module.exports[export_name] = export_value